From 9283b42e46c2646dff1bec47e2dd683add7f9972 Mon Sep 17 00:00:00 2001
From: Keith Busch <keith.busch@intel.com>
Date: Thu, 23 Apr 2015 10:24:45 -0600
Subject: NVMe: Fix VPD B0 max sectors translation

Use the namespace's block format for reporting the max transfer length.

Max unmap count is left as-is since NVMe doesn't provide a max, so the
value the driver provided the block layer is valid for any format.

Reported-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Keith Busch <keith.busch@intel.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/nvme-scsi.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/block/nvme-scsi.c b/drivers/block/nvme-scsi.c
index 6b736b00f63e..88f13c525712 100644
--- a/drivers/block/nvme-scsi.c
+++ b/drivers/block/nvme-scsi.c
@@ -944,7 +944,8 @@ static int nvme_trans_ext_inq_page(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 static int nvme_trans_bdev_limits_page(struct nvme_ns *ns, struct sg_io_hdr *hdr,
 					u8 *inq_response, int alloc_len)
 {
-	__be32 max_sectors = cpu_to_be32(queue_max_hw_sectors(ns->queue));
+	__be32 max_sectors = cpu_to_be32(
+		nvme_block_nr(ns, queue_max_hw_sectors(ns->queue)));
 	__be32 max_discard = cpu_to_be32(ns->queue->limits.max_discard_sectors);
 	__be32 discard_desc_count = cpu_to_be32(0x100);
 
-- 
cgit v1.2.3


From f054b56c951bf1731ba7314a4c7f1cc0b2977cc9 Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@canonical.com>
Date: Tue, 21 Apr 2015 10:00:19 +0800
Subject: blk-mq: fix race between timeout and CPU hotplug

Firstly during CPU hotplug, even queue is freezed, timeout
handler still may come and access hctx->tags, which may cause
use after free, so this patch deactivates timeout handler
inside CPU hotplug notifier.

Secondly, tags can be shared by more than one queues, so we
have to check if the hctx has been unmapped, otherwise
still use-after-free on tags can be triggered.

Cc: <stable@vger.kernel.org>
Reported-by: Dongsu Park <dongsu.park@profitbricks.com>
Tested-by: Dongsu Park <dongsu.park@profitbricks.com>
Signed-off-by: Ming Lei <ming.lei@canonical.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq.c | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index ade8a2d1b0aa..1fccb98aa28f 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -677,8 +677,11 @@ static void blk_mq_rq_timer(unsigned long priv)
 		data.next = blk_rq_timeout(round_jiffies_up(data.next));
 		mod_timer(&q->timeout, data.next);
 	} else {
-		queue_for_each_hw_ctx(q, hctx, i)
-			blk_mq_tag_idle(hctx);
+		queue_for_each_hw_ctx(q, hctx, i) {
+			/* the hctx may be unmapped, so check it here */
+			if (blk_mq_hw_queue_mapped(hctx))
+				blk_mq_tag_idle(hctx);
+		}
 	}
 }
 
@@ -2090,9 +2093,16 @@ static int blk_mq_queue_reinit_notify(struct notifier_block *nb,
 	 */
 	list_for_each_entry(q, &all_q_list, all_q_node)
 		blk_mq_freeze_queue_start(q);
-	list_for_each_entry(q, &all_q_list, all_q_node)
+	list_for_each_entry(q, &all_q_list, all_q_node) {
 		blk_mq_freeze_queue_wait(q);
 
+		/*
+		 * timeout handler can't touch hw queue during the
+		 * reinitialization
+		 */
+		del_timer_sync(&q->timeout);
+	}
+
 	list_for_each_entry(q, &all_q_list, all_q_node)
 		blk_mq_queue_reinit(q);
 
-- 
cgit v1.2.3


From 2a34c0872adf252f23a6fef2d051a169ac796cef Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@canonical.com>
Date: Tue, 21 Apr 2015 10:00:20 +0800
Subject: blk-mq: fix CPU hotplug handling

hctx->tags has to be set as NULL in case that it is to be unmapped
no matter if set->tags[hctx->queue_num] is NULL or not in blk_mq_map_swqueue()
because shared tags can be freed already from another request queue.

The same situation has to be considered during handling CPU online too.
Unmapped hw queue can be remapped after CPU topo is changed, so we need
to allocate tags for the hw queue in blk_mq_map_swqueue(). Then tags
allocation for hw queue can be removed in hctx cpu online notifier, and it
is reasonable to do that after mapping is updated.

Cc: <stable@vger.kernel.org>
Reported-by: Dongsu Park <dongsu.park@profitbricks.com>
Tested-by: Dongsu Park <dongsu.park@profitbricks.com>
Signed-off-by: Ming Lei <ming.lei@canonical.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq.c | 34 +++++++++++++---------------------
 1 file changed, 13 insertions(+), 21 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 1fccb98aa28f..76f460e36f1d 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1574,22 +1574,6 @@ static int blk_mq_hctx_cpu_offline(struct blk_mq_hw_ctx *hctx, int cpu)
 	return NOTIFY_OK;
 }
 
-static int blk_mq_hctx_cpu_online(struct blk_mq_hw_ctx *hctx, int cpu)
-{
-	struct request_queue *q = hctx->queue;
-	struct blk_mq_tag_set *set = q->tag_set;
-
-	if (set->tags[hctx->queue_num])
-		return NOTIFY_OK;
-
-	set->tags[hctx->queue_num] = blk_mq_init_rq_map(set, hctx->queue_num);
-	if (!set->tags[hctx->queue_num])
-		return NOTIFY_STOP;
-
-	hctx->tags = set->tags[hctx->queue_num];
-	return NOTIFY_OK;
-}
-
 static int blk_mq_hctx_notify(void *data, unsigned long action,
 			      unsigned int cpu)
 {
@@ -1597,8 +1581,11 @@ static int blk_mq_hctx_notify(void *data, unsigned long action,
 
 	if (action == CPU_DEAD || action == CPU_DEAD_FROZEN)
 		return blk_mq_hctx_cpu_offline(hctx, cpu);
-	else if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN)
-		return blk_mq_hctx_cpu_online(hctx, cpu);
+
+	/*
+	 * In case of CPU online, tags may be reallocated
+	 * in blk_mq_map_swqueue() after mapping is updated.
+	 */
 
 	return NOTIFY_OK;
 }
@@ -1778,6 +1765,7 @@ static void blk_mq_map_swqueue(struct request_queue *q)
 	unsigned int i;
 	struct blk_mq_hw_ctx *hctx;
 	struct blk_mq_ctx *ctx;
+	struct blk_mq_tag_set *set = q->tag_set;
 
 	queue_for_each_hw_ctx(q, hctx, i) {
 		cpumask_clear(hctx->cpumask);
@@ -1806,16 +1794,20 @@ static void blk_mq_map_swqueue(struct request_queue *q)
 		 * disable it and free the request entries.
 		 */
 		if (!hctx->nr_ctx) {
-			struct blk_mq_tag_set *set = q->tag_set;
-
 			if (set->tags[i]) {
 				blk_mq_free_rq_map(set, set->tags[i], i);
 				set->tags[i] = NULL;
-				hctx->tags = NULL;
 			}
+			hctx->tags = NULL;
 			continue;
 		}
 
+		/* unmapped hw queue can be remapped after CPU topo changed */
+		if (!set->tags[i])
+			set->tags[i] = blk_mq_init_rq_map(set, i);
+		hctx->tags = set->tags[i];
+		WARN_ON(!hctx->tags);
+
 		/*
 		 * Set the map size to the number of mapped software queues.
 		 * This is more accurate and more efficient than looping
-- 
cgit v1.2.3


From 464d1387acb94dc43ba772b35242345e3d2ead1b Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 21 Apr 2015 16:49:13 -0400
Subject: writeback: use |1 instead of +1 to protect against div by zero

mm/page-writeback.c has several places where 1 is added to the divisor
to prevent division by zero exceptions; however, if the original
divisor is equivalent to -1, adding 1 leads to division by zero.

There are three places where +1 is used for this purpose - one in
pos_ratio_polynom() and two in bdi_position_ratio().  The second one
in bdi_position_ratio() actually triggered div-by-zero oops on a
machine running a 3.10 kernel.  The divisor is

  x_intercept - bdi_setpoint + 1 == span + 1

span is confirmed to be (u32)-1.  It isn't clear how it ended up that
but it could be from write bandwidth calculation underflow fixed by
c72efb658f7c ("writeback: fix possible underflow in write bandwidth
calculation").

At any rate, +1 isn't a proper protection against div-by-zero.  This
patch converts all +1 protections to |1.  Note that
bdi_update_dirty_ratelimit() was already using |1 before this patch.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: stable@vger.kernel.org
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 mm/page-writeback.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 5daf5568b9e1..eb59f7eea508 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -580,7 +580,7 @@ static long long pos_ratio_polynom(unsigned long setpoint,
 	long x;
 
 	x = div64_s64(((s64)setpoint - (s64)dirty) << RATELIMIT_CALC_SHIFT,
-		    limit - setpoint + 1);
+		      (limit - setpoint) | 1);
 	pos_ratio = x;
 	pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT;
 	pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT;
@@ -807,7 +807,7 @@ static unsigned long bdi_position_ratio(struct backing_dev_info *bdi,
 	 * scale global setpoint to bdi's:
 	 *	bdi_setpoint = setpoint * bdi_thresh / thresh
 	 */
-	x = div_u64((u64)bdi_thresh << 16, thresh + 1);
+	x = div_u64((u64)bdi_thresh << 16, thresh | 1);
 	bdi_setpoint = setpoint * (u64)x >> 16;
 	/*
 	 * Use span=(8*write_bw) in single bdi case as indicated by
@@ -822,7 +822,7 @@ static unsigned long bdi_position_ratio(struct backing_dev_info *bdi,
 
 	if (bdi_dirty < x_intercept - span / 4) {
 		pos_ratio = div64_u64(pos_ratio * (x_intercept - bdi_dirty),
-				    x_intercept - bdi_setpoint + 1);
+				      (x_intercept - bdi_setpoint) | 1);
 	} else
 		pos_ratio /= 4;
 
-- 
cgit v1.2.3


From 8406a4d56ea94d1d91f62cab3bed15399bac73cb Mon Sep 17 00:00:00 2001
From: Chao Yu <chao2.yu@samsung.com>
Date: Thu, 23 Apr 2015 10:47:44 -0600
Subject: elevator: fix double release of elevator module

Our issue is descripted in below call path:
->elevator_init
 ->elevator_init_fn
  ->{cfq,deadline,noop}_init_queue
   ->elevator_alloc
    ->kzalloc_node
   fail to call kzalloc_node and then put module in elevator_alloc;
fail to call elevator_init_fn and then put module again in elevator_init.

Remove elevator_put invoking in error path of elevator_alloc to avoid
double release issue.

Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Reviewed-by: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/elevator.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/block/elevator.c b/block/elevator.c
index 59794d0d38e3..8985038f398c 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -157,7 +157,7 @@ struct elevator_queue *elevator_alloc(struct request_queue *q,
 
 	eq = kzalloc_node(sizeof(*eq), GFP_KERNEL, q->node);
 	if (unlikely(!eq))
-		goto err;
+		return NULL;
 
 	eq->type = e;
 	kobject_init(&eq->kobj, &elv_ktype);
@@ -165,10 +165,6 @@ struct elevator_queue *elevator_alloc(struct request_queue *q,
 	hash_init(eq->hash);
 
 	return eq;
-err:
-	kfree(eq);
-	elevator_put(e);
-	return NULL;
 }
 EXPORT_SYMBOL(elevator_alloc);
 
-- 
cgit v1.2.3


From 393a33970540ac6a2c894b0d6ef3f5d485860884 Mon Sep 17 00:00:00 2001
From: Wang YanQing <udknight@gmail.com>
Date: Sun, 26 Apr 2015 16:43:31 +0800
Subject: block:bounce: fix call inc_|dec_zone_page_state on different pages
 confuse value of NR_BOUNCE

Commit d2c5e30c9a1420902262aa923794d2ae4e0bc391
("[PATCH] zoned vm counters: conversion of nr_bounce to per zone counter")
convert statistic of nr_bounce to per zone and one global value in vm_stat,
but it call inc_|dec_zone_page_state on different pages, then different
zones, and cause us to get unexpected value of NR_BOUNCE.

Below is the result on my machine:
Mar  2 09:26:08 udknight kernel: [144766.778265] Mem-Info:
Mar  2 09:26:08 udknight kernel: [144766.778266] DMA per-cpu:
Mar  2 09:26:08 udknight kernel: [144766.778268] CPU    0: hi:    0, btch:   1 usd:   0
Mar  2 09:26:08 udknight kernel: [144766.778269] CPU    1: hi:    0, btch:   1 usd:   0
Mar  2 09:26:08 udknight kernel: [144766.778270] Normal per-cpu:
Mar  2 09:26:08 udknight kernel: [144766.778271] CPU    0: hi:  186, btch:  31 usd:   0
Mar  2 09:26:08 udknight kernel: [144766.778273] CPU    1: hi:  186, btch:  31 usd:   0
Mar  2 09:26:08 udknight kernel: [144766.778274] HighMem per-cpu:
Mar  2 09:26:08 udknight kernel: [144766.778275] CPU    0: hi:  186, btch:  31 usd:   0
Mar  2 09:26:08 udknight kernel: [144766.778276] CPU    1: hi:  186, btch:  31 usd:   0
Mar  2 09:26:08 udknight kernel: [144766.778279] active_anon:46926 inactive_anon:287406 isolated_anon:0
Mar  2 09:26:08 udknight kernel: [144766.778279]  active_file:105085 inactive_file:139432 isolated_file:0
Mar  2 09:26:08 udknight kernel: [144766.778279]  unevictable:653 dirty:0 writeback:0 unstable:0
Mar  2 09:26:08 udknight kernel: [144766.778279]  free:178957 slab_reclaimable:6419 slab_unreclaimable:9966
Mar  2 09:26:08 udknight kernel: [144766.778279]  mapped:4426 shmem:305277 pagetables:784 bounce:0
Mar  2 09:26:08 udknight kernel: [144766.778279]  free_cma:0
Mar  2 09:26:08 udknight kernel: [144766.778286] DMA free:3324kB min:68kB low:84kB high:100kB active_anon:0kB inactive_anon:0kB active_file:0kB inactive_file:0kB unevictable:0kB isolated(anon):0kB isolated(file):0kB present:15976kB managed:15900kB mlocked:0kB dirty:0kB writeback:0kB mapped:0kB shmem:0kB slab_reclaimable:0kB slab_unreclaimable:0kB kernel_stack:0kB pagetables:0kB unstable:0kB bounce:0kB free_cma:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? yes
Mar  2 09:26:08 udknight kernel: [144766.778287] lowmem_reserve[]: 0 822 3754 3754
Mar  2 09:26:08 udknight kernel: [144766.778293] Normal free:26828kB min:3632kB low:4540kB high:5448kB active_anon:4872kB inactive_anon:68kB active_file:1796kB inactive_file:1796kB unevictable:0kB isolated(anon):0kB isolated(file):0kB present:892920kB managed:842560kB mlocked:0kB dirty:0kB writeback:0kB mapped:0kB shmem:4144kB slab_reclaimable:25676kB slab_unreclaimable:39864kB kernel_stack:1944kB pagetables:3136kB unstable:0kB bounce:0kB free_cma:0kB writeback_tmp:0kB pages_scanned:2412612 all_unreclaimable? yes
Mar  2 09:26:08 udknight kernel: [144766.778294] lowmem_reserve[]: 0 0 23451 23451
Mar  2 09:26:08 udknight kernel: [144766.778299] HighMem free:685676kB min:512kB low:3748kB high:6984kB active_anon:182832kB inactive_anon:1149556kB active_file:418544kB inactive_file:555932kB unevictable:2612kB isolated(anon):0kB isolated(file):0kB present:3001732kB managed:3001732kB mlocked:0kB dirty:0kB writeback:0kB mapped:17704kB shmem:1216964kB slab_reclaimable:0kB slab_unreclaimable:0kB kernel_stack:0kB pagetables:0kB unstable:0kB bounce:75771152kB free_cma:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? no
Mar  2 09:26:08 udknight kernel: [144766.778300] lowmem_reserve[]: 0 0 0 0

You can see bounce:75771152kB for HighMem, but bounce:0 for lowmem and global.

This patch fix it.

Signed-off-by: Wang YanQing <udknight@gmail.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/bounce.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/bounce.c b/block/bounce.c
index ab21ba203d5c..ed9dd8067120 100644
--- a/block/bounce.c
+++ b/block/bounce.c
@@ -221,8 +221,8 @@ bounce:
 		if (page_to_pfn(page) <= queue_bounce_pfn(q) && !force)
 			continue;
 
-		inc_zone_page_state(to->bv_page, NR_BOUNCE);
 		to->bv_page = mempool_alloc(pool, q->bounce_gfp);
+		inc_zone_page_state(to->bv_page, NR_BOUNCE);
 
 		if (rw == WRITE) {
 			char *vto, *vfrom;
-- 
cgit v1.2.3


From 6cd18e711dd8075da9d78cfc1239f912ff28968a Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 27 Apr 2015 14:12:22 +1000
Subject: block: destroy bdi before blockdev is unregistered.

Because of the peculiar way that md devices are created (automatically
when the device node is opened), a new device can be created and
registered immediately after the
	blk_unregister_region(disk_devt(disk), disk->minors);
call in del_gendisk().

Therefore it is important that all visible artifacts of the previous
device are removed before this call.  In particular, the 'bdi'.

Since:
commit c4db59d31e39ea067c32163ac961e9c80198fd37
Author: Christoph Hellwig <hch@lst.de>
    fs: don't reassign dirty inodes to default_backing_dev_info

moved the
   device_unregister(bdi->dev);
call from bdi_unregister() to bdi_destroy() it has been quite easy to
lose a race and have a new (e.g.) "md127" be created after the
blk_unregister_region() call and before bdi_destroy() is ultimately
called by the final 'put_disk', which must come after del_gendisk().

The new device finds that the bdi name is already registered in sysfs
and complains

> [ 9627.630029] WARNING: CPU: 18 PID: 3330 at fs/sysfs/dir.c:31 sysfs_warn_dup+0x5a/0x70()
> [ 9627.630032] sysfs: cannot create duplicate filename '/devices/virtual/bdi/9:127'

We can fix this by moving the bdi_destroy() call out of
blk_release_queue() (which can happen very late when a refcount
reaches zero) and into blk_cleanup_queue() - which happens exactly when the md
device driver calls it.

Then it is only necessary for md to call blk_cleanup_queue() before
del_gendisk().  As loop.c devices are also created on demand by
opening the device node, we make the same change there.

Fixes: c4db59d31e39ea067c32163ac961e9c80198fd37
Reported-by: Azat Khuzhin <a3at.mail@gmail.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: stable@vger.kernel.org (v4.0)
Signed-off-by: NeilBrown <neilb@suse.de>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-core.c     | 2 ++
 block/blk-sysfs.c    | 2 --
 drivers/block/loop.c | 2 +-
 drivers/md/md.c      | 4 ++--
 4 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index fd154b94447a..7871603f0a29 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -552,6 +552,8 @@ void blk_cleanup_queue(struct request_queue *q)
 		q->queue_lock = &q->__queue_lock;
 	spin_unlock_irq(lock);
 
+	bdi_destroy(&q->backing_dev_info);
+
 	/* @q is and will stay empty, shutdown and put */
 	blk_put_queue(q);
 }
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index faaf36ade7eb..2b8fd302f677 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -522,8 +522,6 @@ static void blk_release_queue(struct kobject *kobj)
 
 	blk_trace_shutdown(q);
 
-	bdi_destroy(&q->backing_dev_info);
-
 	ida_simple_remove(&blk_queue_ida, q->id);
 	call_rcu(&q->rcu_head, blk_free_queue_rcu);
 }
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index ae3fcb4199e9..d7173cb1ea76 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1620,8 +1620,8 @@ out:
 
 static void loop_remove(struct loop_device *lo)
 {
-	del_gendisk(lo->lo_disk);
 	blk_cleanup_queue(lo->lo_queue);
+	del_gendisk(lo->lo_disk);
 	blk_mq_free_tag_set(&lo->tag_set);
 	put_disk(lo->lo_disk);
 	kfree(lo);
diff --git a/drivers/md/md.c b/drivers/md/md.c
index e6178787ce3d..e47d1dd046da 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -4754,12 +4754,12 @@ static void md_free(struct kobject *ko)
 	if (mddev->sysfs_state)
 		sysfs_put(mddev->sysfs_state);
 
+	if (mddev->queue)
+		blk_cleanup_queue(mddev->queue);
 	if (mddev->gendisk) {
 		del_gendisk(mddev->gendisk);
 		put_disk(mddev->gendisk);
 	}
-	if (mddev->queue)
-		blk_cleanup_queue(mddev->queue);
 
 	kfree(mddev);
 }
-- 
cgit v1.2.3


From b2387ddcced8de3e6471a2fb16a409577063016f Mon Sep 17 00:00:00 2001
From: Shaohua Li <shli@fb.com>
Date: Fri, 1 May 2015 09:59:44 -0700
Subject: blk-mq: fix FUA request hang

When a FUA request enters its DATA stage of flush pipeline, the
request is added to mq requeue list, the request will then be added to
ctx->rq_list. blk_mq_attempt_merge() might merge the request with a bio.
Later when the request is finished the flush pipeline, the
request->__data_len is 0. Then I only saw the bio gets endio called, the
original request never finish.

Adding REQ_FLUSH_SEQ into REQ_NOMERGE_FLAGS looks an easy fix.

stable: 3.15+

Signed-off-by: Shaohua Li <shli@fb.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 include/linux/blk_types.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index a1b25e35ea5f..b7299febc4b4 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -220,7 +220,7 @@ enum rq_flag_bits {
 
 /* This mask is used for both bio and request merge checking */
 #define REQ_NOMERGE_FLAGS \
-	(REQ_NOMERGE | REQ_STARTED | REQ_SOFTBARRIER | REQ_FLUSH | REQ_FUA)
+	(REQ_NOMERGE | REQ_STARTED | REQ_SOFTBARRIER | REQ_FLUSH | REQ_FUA | REQ_FLUSH_SEQ)
 
 #define REQ_RAHEAD		(1ULL << __REQ_RAHEAD)
 #define REQ_THROTTLED		(1ULL << __REQ_THROTTLED)
-- 
cgit v1.2.3


From 9ba52e5812e53f20f23600d79449a3ec05a0254f Mon Sep 17 00:00:00 2001
From: Shaohua Li <shli@fb.com>
Date: Mon, 4 May 2015 14:32:48 -0600
Subject: blk-mq: don't lose requests if a stopped queue restarts

Normally if driver is busy to dispatch a request the logic is like below:
block layer:					driver:
	__blk_mq_run_hw_queue
a.						blk_mq_stop_hw_queue
b.	rq add to ctx->dispatch

later:
1.						blk_mq_start_hw_queue
2.	__blk_mq_run_hw_queue

But it's possible step 1-2 runs between a and b. And since rq isn't in
ctx->dispatch yet, step 2 will not run rq. The rq might get lost if
there are no subsequent requests kick in.

Signed-off-by: Shaohua Li <shli@fb.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 76f460e36f1d..e68b71b85a7e 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -858,6 +858,16 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
 		spin_lock(&hctx->lock);
 		list_splice(&rq_list, &hctx->dispatch);
 		spin_unlock(&hctx->lock);
+		/*
+		 * the queue is expected stopped with BLK_MQ_RQ_QUEUE_BUSY, but
+		 * it's possible the queue is stopped and restarted again
+		 * before this. Queue restart will dispatch requests. And since
+		 * requests in rq_list aren't added into hctx->dispatch yet,
+		 * the requests in rq_list might get lost.
+		 *
+		 * blk_mq_run_hw_queue() already checks the STOPPED bit
+		 **/
+		blk_mq_run_hw_queue(hctx, true);
 	}
 }
 
-- 
cgit v1.2.3


From 0ff28d9f4674d781e492bcff6f32f0fe48cf0fed Mon Sep 17 00:00:00 2001
From: Christophe Leroy <christophe.leroy@c-s.fr>
Date: Wed, 6 May 2015 17:26:47 +0200
Subject: splice: sendfile() at once fails for big files

Using sendfile with below small program to get MD5 sums of some files,
it appear that big files (over 64kbytes with 4k pages system) get a
wrong MD5 sum while small files get the correct sum.
This program uses sendfile() to send a file to an AF_ALG socket
for hashing.

/* md5sum2.c */
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <string.h>
#include <fcntl.h>
#include <sys/socket.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <linux/if_alg.h>

int main(int argc, char **argv)
{
	int sk = socket(AF_ALG, SOCK_SEQPACKET, 0);
	struct stat st;
	struct sockaddr_alg sa = {
		.salg_family = AF_ALG,
		.salg_type = "hash",
		.salg_name = "md5",
	};
	int n;

	bind(sk, (struct sockaddr*)&sa, sizeof(sa));

	for (n = 1; n < argc; n++) {
		int size;
		int offset = 0;
		char buf[4096];
		int fd;
		int sko;
		int i;

		fd = open(argv[n], O_RDONLY);
		sko = accept(sk, NULL, 0);
		fstat(fd, &st);
		size = st.st_size;
		sendfile(sko, fd, &offset, size);
		size = read(sko, buf, sizeof(buf));
		for (i = 0; i < size; i++)
			printf("%2.2x", buf[i]);
		printf("  %s\n", argv[n]);
		close(fd);
		close(sko);
	}
	exit(0);
}

Test below is done using official linux patch files. First result is
with a software based md5sum. Second result is with the program above.

root@vgoip:~# ls -l patch-3.6.*
-rw-r--r--    1 root     root         64011 Aug 24 12:01 patch-3.6.2.gz
-rw-r--r--    1 root     root         94131 Aug 24 12:01 patch-3.6.3.gz

root@vgoip:~# md5sum patch-3.6.*
b3ffb9848196846f31b2ff133d2d6443  patch-3.6.2.gz
c5e8f687878457db77cb7158c38a7e43  patch-3.6.3.gz

root@vgoip:~# ./md5sum2 patch-3.6.*
b3ffb9848196846f31b2ff133d2d6443  patch-3.6.2.gz
5fd77b24e68bb24dcc72d6e57c64790e  patch-3.6.3.gz

After investivation, it appears that sendfile() sends the files by blocks
of 64kbytes (16 times PAGE_SIZE). The problem is that at the end of each
block, the SPLICE_F_MORE flag is missing, therefore the hashing operation
is reset as if it was the end of the file.

This patch adds SPLICE_F_MORE to the flags when more data is pending.

With the patch applied, we get the correct sums:

root@vgoip:~# md5sum patch-3.6.*
b3ffb9848196846f31b2ff133d2d6443  patch-3.6.2.gz
c5e8f687878457db77cb7158c38a7e43  patch-3.6.3.gz

root@vgoip:~# ./md5sum2 patch-3.6.*
b3ffb9848196846f31b2ff133d2d6443  patch-3.6.2.gz
c5e8f687878457db77cb7158c38a7e43  patch-3.6.3.gz

Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 fs/splice.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/fs/splice.c b/fs/splice.c
index 476024bb6546..bfe62ae40f40 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -1161,7 +1161,7 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
 	long ret, bytes;
 	umode_t i_mode;
 	size_t len;
-	int i, flags;
+	int i, flags, more;
 
 	/*
 	 * We require the input being a regular file, as we don't want to
@@ -1204,6 +1204,7 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
 	 * Don't block on output, we have to drain the direct pipe.
 	 */
 	sd->flags &= ~SPLICE_F_NONBLOCK;
+	more = sd->flags & SPLICE_F_MORE;
 
 	while (len) {
 		size_t read_len;
@@ -1216,6 +1217,15 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
 		read_len = ret;
 		sd->total_len = read_len;
 
+		/*
+		 * If more data is pending, set SPLICE_F_MORE
+		 * If this is the last data and SPLICE_F_MORE was not set
+		 * initially, clears it.
+		 */
+		if (read_len < len)
+			sd->flags |= SPLICE_F_MORE;
+		else if (!more)
+			sd->flags &= ~SPLICE_F_MORE;
 		/*
 		 * NOTE: nonblocking mode only applies to the input. We
 		 * must not do the output in nonblocking mode as then we
-- 
cgit v1.2.3