From ff06db1efb2ad6db06eb5b99b88a0c15a9cc9b0e Mon Sep 17 00:00:00 2001
From: Jiri Kosina <jkosina@suse.cz>
Date: Thu, 16 Jun 2016 09:53:58 +0200
Subject: floppy: fix open(O_ACCMODE) for ioctl-only open

Commit 09954bad4 ("floppy: refactor open() flags handling"), as a
side-effect, causes open(/dev/fdX, O_ACCMODE) to fail. It turns out that
this is being used setfdprm userspace for ioctl-only open().

Reintroduce back the original behavior wrt !(FMODE_READ|FMODE_WRITE)
modes, while still keeping the original O_NDELAY bug fixed.

Cc: stable@vger.kernel.org # v4.5+
Reported-by: Wim Osterholt <wim@djo.tudelft.nl>
Tested-by: Wim Osterholt <wim@djo.tudelft.nl>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/floppy.c | 21 +++++++++------------
 1 file changed, 9 insertions(+), 12 deletions(-)
diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index c557057fe8ae..b71a9c767009 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -3663,11 +3663,6 @@ static int floppy_open(struct block_device *bdev, fmode_t mode)
 
 	opened_bdev[drive] = bdev;
 
-	if (!(mode & (FMODE_READ|FMODE_WRITE))) {
-		res = -EINVAL;
-		goto out;
-	}
-
 	res = -ENXIO;
 
 	if (!floppy_track_buffer) {
@@ -3711,13 +3706,15 @@ static int floppy_open(struct block_device *bdev, fmode_t mode)
 	if (UFDCS->rawcmd == 1)
 		UFDCS->rawcmd = 2;
 
-	UDRS->last_checked = 0;
-	clear_bit(FD_OPEN_SHOULD_FAIL_BIT, &UDRS->flags);
-	check_disk_change(bdev);
-	if (test_bit(FD_DISK_CHANGED_BIT, &UDRS->flags))
-		goto out;
-	if (test_bit(FD_OPEN_SHOULD_FAIL_BIT, &UDRS->flags))
-		goto out;
+	if (mode & (FMODE_READ|FMODE_WRITE)) {
+		UDRS->last_checked = 0;
+		clear_bit(FD_OPEN_SHOULD_FAIL_BIT, &UDRS->flags);
+		check_disk_change(bdev);
+		if (test_bit(FD_DISK_CHANGED_BIT, &UDRS->flags))
+			goto out;
+		if (test_bit(FD_OPEN_SHOULD_FAIL_BIT, &UDRS->flags))
+			goto out;
+	}
 
 	res = -EROFS;
 
-- 
cgit v1.2.3


From dc5ff2b1d66f21c27a4c37236636dff6946437e4 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Tue, 26 Jul 2016 11:38:20 +0200
Subject: writeback: Write dirty times for WB_SYNC_ALL writeback

Currently we take care to handle I_DIRTY_TIME in vfs_fsync() and
queue_io() so that inodes which have only dirty timestamps are properly
written on fsync(2) and sync(2). However there are other call sites -
most notably going through write_inode_now() - which expect inode to be
clean after WB_SYNC_ALL writeback. This is not currently true as we do
not clear I_DIRTY_TIME in __writeback_single_inode() even for
WB_SYNC_ALL writeback in all the cases. This then resulted in the
following oops because bdev_write_inode() did not clean the inode and
writeback code later stumbled over a dirty inode with detached wb.

  general protection fault: 0000 [#1] SMP DEBUG_PAGEALLOC KASAN
  Modules linked in:
  CPU: 3 PID: 32 Comm: kworker/u10:1 Not tainted 4.6.0-rc3+ #349
  Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011
  Workqueue: writeback wb_workfn (flush-11:0)
  task: ffff88006ccf1840 ti: ffff88006cda8000 task.ti: ffff88006cda8000
  RIP: 0010:[<ffffffff818884d2>]  [<ffffffff818884d2>]
  locked_inode_to_wb_and_lock_list+0xa2/0x750
  RSP: 0018:ffff88006cdaf7d0  EFLAGS: 00010246
  RAX: 0000000000000000 RBX: 0000000000000000 RCX: ffff88006ccf2050
  RDX: 0000000000000000 RSI: 000000114c8a8484 RDI: 0000000000000286
  RBP: ffff88006cdaf820 R08: ffff88006ccf1840 R09: 0000000000000000
  R10: 000229915090805f R11: 0000000000000001 R12: ffff88006a72f5e0
  R13: dffffc0000000000 R14: ffffed000d4e5eed R15: ffffffff8830cf40
  FS:  0000000000000000(0000) GS:ffff88006d500000(0000) knlGS:0000000000000000
  CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
  CR2: 0000000003301bf8 CR3: 000000006368f000 CR4: 00000000000006e0
  DR0: 0000000000001ec9 DR1: 0000000000000000 DR2: 0000000000000000
  DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000600
  Stack:
   ffff88006a72f680 ffff88006a72f768 ffff8800671230d8 03ff88006cdaf948
   ffff88006a72f668 ffff88006a72f5e0 ffff8800671230d8 ffff88006cdaf948
   ffff880065b90cc8 ffff880067123100 ffff88006cdaf970 ffffffff8188e12e
  Call Trace:
   [<     inline     >] inode_to_wb_and_lock_list fs/fs-writeback.c:309
   [<ffffffff8188e12e>] writeback_sb_inodes+0x4de/0x1250 fs/fs-writeback.c:1554
   [<ffffffff8188efa4>] __writeback_inodes_wb+0x104/0x1e0 fs/fs-writeback.c:1600
   [<ffffffff8188f9ae>] wb_writeback+0x7ce/0xc90 fs/fs-writeback.c:1709
   [<     inline     >] wb_do_writeback fs/fs-writeback.c:1844
   [<ffffffff81891079>] wb_workfn+0x2f9/0x1000 fs/fs-writeback.c:1884
   [<ffffffff813bcd1e>] process_one_work+0x78e/0x15c0 kernel/workqueue.c:2094
   [<ffffffff813bdc2b>] worker_thread+0xdb/0xfc0 kernel/workqueue.c:2228
   [<ffffffff813cdeef>] kthread+0x23f/0x2d0 drivers/block/aoe/aoecmd.c:1303
   [<ffffffff867bc5d2>] ret_from_fork+0x22/0x50 arch/x86/entry/entry_64.S:392
  Code: 05 94 4a a8 06 85 c0 0f 85 03 03 00 00 e8 07 15 d0 ff 41 80 3e
  00 0f 85 64 06 00 00 49 8b 9c 24 88 01 00 00 48 89 d8 48 c1 e8 03 <42>
  80 3c 28 00 0f 85 17 06 00 00 48 8b 03 48 83 c0 50 48 39 c3
  RIP  [<     inline     >] wb_get include/linux/backing-dev-defs.h:212
  RIP  [<ffffffff818884d2>] locked_inode_to_wb_and_lock_list+0xa2/0x750
  fs/fs-writeback.c:281
   RSP <ffff88006cdaf7d0>
  ---[ end trace 986a4d314dcb2694 ]---

Fix the problem by making sure __writeback_single_inode() writes inode
only with dirty times in WB_SYNC_ALL mode.

Reported-by: Dmitry Vyukov <dvyukov@google.com>
Tested-by: Laurent Dufour <ldufour@linux.vnet.ibm.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 fs/fs-writeback.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 56c8fda436c0..4d09d4441e3e 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -1327,6 +1327,7 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 	dirty = inode->i_state & I_DIRTY;
 	if (inode->i_state & I_DIRTY_TIME) {
 		if ((dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) ||
+		    wbc->sync_mode == WB_SYNC_ALL ||
 		    unlikely(inode->i_state & I_DIRTY_TIME_EXPIRED) ||
 		    unlikely(time_after(jiffies,
 					(inode->dirtied_time_when +
-- 
cgit v1.2.3


From bfd279a868987e4bca7dc72a029e0fc50316d6df Mon Sep 17 00:00:00 2001
From: Hou Tao <houtao1@huawei.com>
Date: Tue, 26 Jul 2016 17:13:42 +0800
Subject: blkcg: kill unused field nr_undestroyed_grps

'nr_undestroyed_grps' in struct throtl_data was used to count
the number of throtl_grp related with throtl_data, but now
throtl_grp is tracked by blkcg_gq, so it is useless anymore.

Signed-off-by: Hou Tao <houtao1@huawei.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-throttle.c | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 47a3e540631a..c5494e403239 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -145,11 +145,6 @@ struct throtl_data
 	/* Total Number of queued bios on READ and WRITE lists */
 	unsigned int nr_queued[2];
 
-	/*
-	 * number of total undestroyed groups
-	 */
-	unsigned int nr_undestroyed_grps;
-
 	/* Work for dispatching throttled bios */
 	struct work_struct dispatch_work;
 };
-- 
cgit v1.2.3


From 20bd723ec6a3261df5e02250cd3a1fbb09a343f2 Mon Sep 17 00:00:00 2001
From: Paolo Valente <paolo.valente@linaro.org>
Date: Wed, 27 Jul 2016 07:22:05 +0200
Subject: block: add missing group association in bio-cloning functions

When a bio is cloned, the newly created bio must be associated with
the same blkcg as the original bio (if BLK_CGROUP is enabled). If
this operation is not performed, then the new bio is not associated
with any group, and the group of the current task is returned when
the group of the bio is requested.

Depending on the cloning frequency, this may cause a large
percentage of the bios belonging to a given group to be treated
as if belonging to other groups (in most cases as if belonging to
the root group). The expected group isolation may thereby be broken.

This commit adds the missing association in bio-cloning functions.

Fixes: da2f0f74cf7d ("Btrfs: add support for blkio controllers")
Cc: stable@vger.kernel.org # v4.3+

Signed-off-by: Paolo Valente <paolo.valente@linaro.org>
Reviewed-by: Nikolay Borisov <kernel@kyup.com>
Reviewed-by: Jeff Moyer <jmoyer@redhat.com>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/bio.c          | 15 +++++++++++++++
 fs/btrfs/extent_io.c |  6 ------
 include/linux/bio.h  |  3 +++
 3 files changed, 18 insertions(+), 6 deletions(-)

diff --git a/block/bio.c b/block/bio.c
index 54ee3846c3a5..3f76a38a5e2d 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -583,6 +583,8 @@ void __bio_clone_fast(struct bio *bio, struct bio *bio_src)
 	bio->bi_rw = bio_src->bi_rw;
 	bio->bi_iter = bio_src->bi_iter;
 	bio->bi_io_vec = bio_src->bi_io_vec;
+
+	bio_clone_blkcg_association(bio, bio_src);
 }
 EXPORT_SYMBOL(__bio_clone_fast);
 
@@ -687,6 +689,8 @@ integrity_clone:
 		}
 	}
 
+	bio_clone_blkcg_association(bio, bio_src);
+
 	return bio;
 }
 EXPORT_SYMBOL(bio_clone_bioset);
@@ -2004,6 +2008,17 @@ void bio_disassociate_task(struct bio *bio)
 	}
 }
 
+/**
+ * bio_clone_blkcg_association - clone blkcg association from src to dst bio
+ * @dst: destination bio
+ * @src: source bio
+ */
+void bio_clone_blkcg_association(struct bio *dst, struct bio *src)
+{
+	if (src->bi_css)
+		WARN_ON(bio_associate_blkcg(dst, src->bi_css));
+}
+
 #endif /* CONFIG_BLK_CGROUP */
 
 static void __init biovec_init_slabs(void)
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index cee4cb99b8ce..5850d79806f4 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2697,12 +2697,6 @@ struct bio *btrfs_bio_clone(struct bio *bio, gfp_t gfp_mask)
 		btrfs_bio->csum = NULL;
 		btrfs_bio->csum_allocated = NULL;
 		btrfs_bio->end_io = NULL;
-
-#ifdef CONFIG_BLK_CGROUP
-		/* FIXME, put this into bio_clone_bioset */
-		if (bio->bi_css)
-			bio_associate_blkcg(new, bio->bi_css);
-#endif
 	}
 	return new;
 }
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 583c10810e32..e09a8895fc31 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -470,11 +470,14 @@ extern unsigned int bvec_nr_vecs(unsigned short idx);
 int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css);
 int bio_associate_current(struct bio *bio);
 void bio_disassociate_task(struct bio *bio);
+void bio_clone_blkcg_association(struct bio *dst, struct bio *src);
 #else	/* CONFIG_BLK_CGROUP */
 static inline int bio_associate_blkcg(struct bio *bio,
 			struct cgroup_subsys_state *blkcg_css) { return 0; }
 static inline int bio_associate_current(struct bio *bio) { return -ENOENT; }
 static inline void bio_disassociate_task(struct bio *bio) { }
+static inline void bio_clone_blkcg_association(struct bio *dst,
+			struct bio *src) { }
 #endif	/* CONFIG_BLK_CGROUP */
 
 #ifdef CONFIG_HIGHMEM
-- 
cgit v1.2.3


From 1aee6b9a7d947d42ed84baa4cf461e9d943b80f0 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Wed, 27 Jul 2016 12:52:21 -0600
Subject: f2fs: drop bio->bi_rw manual assignment

Merge 4fc29c1aa375 included this extra line, but it's not needed (or
useful) since we'll bio_set_op_attrs() right after to properly set
the op and flags for the bio.

Signed-off-by: Jens Axboe <axboe@fb.com>
---
 fs/f2fs/data.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index e2624275d828..d64d2a515cb2 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -245,7 +245,6 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio)
 		bio_put(bio);
 		return -EFAULT;
 	}
-	bio->bi_rw = fio->op_flags;
 	bio_set_op_attrs(bio, fio->op, fio->op_flags);
 
 	__submit_bio(fio->sbi, bio, fio->type);
-- 
cgit v1.2.3


From 77da160530dd1dc94f6ae15a981f24e5f0021e84 Mon Sep 17 00:00:00 2001
From: Vegard Nossum <vegard.nossum@oracle.com>
Date: Fri, 29 Jul 2016 10:40:31 +0200
Subject: block: fix use-after-free in seq file

I got a KASAN report of use-after-free:

    ==================================================================
    BUG: KASAN: use-after-free in klist_iter_exit+0x61/0x70 at addr ffff8800b6581508
    Read of size 8 by task trinity-c1/315
    =============================================================================
    BUG kmalloc-32 (Not tainted): kasan: bad access detected
    -----------------------------------------------------------------------------

    Disabling lock debugging due to kernel taint
    INFO: Allocated in disk_seqf_start+0x66/0x110 age=144 cpu=1 pid=315
            ___slab_alloc+0x4f1/0x520
            __slab_alloc.isra.58+0x56/0x80
            kmem_cache_alloc_trace+0x260/0x2a0
            disk_seqf_start+0x66/0x110
            traverse+0x176/0x860
            seq_read+0x7e3/0x11a0
            proc_reg_read+0xbc/0x180
            do_loop_readv_writev+0x134/0x210
            do_readv_writev+0x565/0x660
            vfs_readv+0x67/0xa0
            do_preadv+0x126/0x170
            SyS_preadv+0xc/0x10
            do_syscall_64+0x1a1/0x460
            return_from_SYSCALL_64+0x0/0x6a
    INFO: Freed in disk_seqf_stop+0x42/0x50 age=160 cpu=1 pid=315
            __slab_free+0x17a/0x2c0
            kfree+0x20a/0x220
            disk_seqf_stop+0x42/0x50
            traverse+0x3b5/0x860
            seq_read+0x7e3/0x11a0
            proc_reg_read+0xbc/0x180
            do_loop_readv_writev+0x134/0x210
            do_readv_writev+0x565/0x660
            vfs_readv+0x67/0xa0
            do_preadv+0x126/0x170
            SyS_preadv+0xc/0x10
            do_syscall_64+0x1a1/0x460
            return_from_SYSCALL_64+0x0/0x6a

    CPU: 1 PID: 315 Comm: trinity-c1 Tainted: G    B           4.7.0+ #62
    Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014
     ffffea0002d96000 ffff880119b9f918 ffffffff81d6ce81 ffff88011a804480
     ffff8800b6581500 ffff880119b9f948 ffffffff8146c7bd ffff88011a804480
     ffffea0002d96000 ffff8800b6581500 fffffffffffffff4 ffff880119b9f970
    Call Trace:
     [<ffffffff81d6ce81>] dump_stack+0x65/0x84
     [<ffffffff8146c7bd>] print_trailer+0x10d/0x1a0
     [<ffffffff814704ff>] object_err+0x2f/0x40
     [<ffffffff814754d1>] kasan_report_error+0x221/0x520
     [<ffffffff8147590e>] __asan_report_load8_noabort+0x3e/0x40
     [<ffffffff83888161>] klist_iter_exit+0x61/0x70
     [<ffffffff82404389>] class_dev_iter_exit+0x9/0x10
     [<ffffffff81d2e8ea>] disk_seqf_stop+0x3a/0x50
     [<ffffffff8151f812>] seq_read+0x4b2/0x11a0
     [<ffffffff815f8fdc>] proc_reg_read+0xbc/0x180
     [<ffffffff814b24e4>] do_loop_readv_writev+0x134/0x210
     [<ffffffff814b4c45>] do_readv_writev+0x565/0x660
     [<ffffffff814b8a17>] vfs_readv+0x67/0xa0
     [<ffffffff814b8de6>] do_preadv+0x126/0x170
     [<ffffffff814b92ec>] SyS_preadv+0xc/0x10

This problem can occur in the following situation:

open()
 - pread()
    - .seq_start()
       - iter = kmalloc() // succeeds
       - seqf->private = iter
    - .seq_stop()
       - kfree(seqf->private)
 - pread()
    - .seq_start()
       - iter = kmalloc() // fails
    - .seq_stop()
       - class_dev_iter_exit(seqf->private) // boom! old pointer

As the comment in disk_seqf_stop() says, stop is called even if start
failed, so we need to reinitialise the private pointer to NULL when seq
iteration stops.

An alternative would be to set the private pointer to NULL when the
kmalloc() in disk_seqf_start() fails.

Cc: stable@vger.kernel.org
Signed-off-by: Vegard Nossum <vegard.nossum@oracle.com>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/genhd.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/block/genhd.c b/block/genhd.c
index 3c9dede4e04f..0ad879655f82 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -856,6 +856,7 @@ static void disk_seqf_stop(struct seq_file *seqf, void *v)
 	if (iter) {
 		class_dev_iter_exit(iter);
 		kfree(iter);
+		seqf->private = NULL;
 	}
 }
 
-- 
cgit v1.2.3


From 97240963eb308d8d21a89c0459822f7ea98463b4 Mon Sep 17 00:00:00 2001
From: Vegard Nossum <vegard.nossum@oracle.com>
Date: Fri, 27 May 2016 12:59:35 +0200
Subject: nbd: fix race in ioctl

Quentin ran into this bug:

WARNING: CPU: 64 PID: 10085 at fs/sysfs/dir.c:31 sysfs_warn_dup+0x65/0x80
sysfs: cannot create duplicate filename '/devices/virtual/block/nbd3/pid'
Modules linked in: nbd
CPU: 64 PID: 10085 Comm: qemu-nbd Tainted: G      D         4.6.0+ #7
 0000000000000000 ffff8820330bba68 ffffffff814b8791 ffff8820330bbac8
 0000000000000000 ffff8820330bbab8 ffffffff810d04ab ffff8820330bbaa8
 0000001f00000296 0000000000017681 ffff8810380bf000 ffffffffa0001790
Call Trace:
 [<ffffffff814b8791>] dump_stack+0x4d/0x6c
 [<ffffffff810d04ab>] __warn+0xdb/0x100
 [<ffffffff810d0574>] warn_slowpath_fmt+0x44/0x50
 [<ffffffff81218c65>] sysfs_warn_dup+0x65/0x80
 [<ffffffff81218a02>] sysfs_add_file_mode_ns+0x172/0x180
 [<ffffffff81218a35>] sysfs_create_file_ns+0x25/0x30
 [<ffffffff81594a76>] device_create_file+0x36/0x90
 [<ffffffffa0000e8d>] __nbd_ioctl+0x32d/0x9b0 [nbd]
 [<ffffffff814cc8e8>] ? find_next_bit+0x18/0x20
 [<ffffffff810f7c29>] ? select_idle_sibling+0xe9/0x120
 [<ffffffff810f6cd7>] ? __enqueue_entity+0x67/0x70
 [<ffffffff810f9bf0>] ? enqueue_task_fair+0x630/0xe20
 [<ffffffff810efa76>] ? resched_curr+0x36/0x70
 [<ffffffff810f0078>] ? check_preempt_curr+0x78/0x90
 [<ffffffff810f00a2>] ? ttwu_do_wakeup+0x12/0x80
 [<ffffffff810f01b1>] ? ttwu_do_activate.constprop.86+0x61/0x70
 [<ffffffff810f0c15>] ? try_to_wake_up+0x185/0x2d0
 [<ffffffff810f0d6d>] ? default_wake_function+0xd/0x10
 [<ffffffff81105471>] ? autoremove_wake_function+0x11/0x40
 [<ffffffffa0001577>] nbd_ioctl+0x67/0x94 [nbd]
 [<ffffffff814ac0fd>] blkdev_ioctl+0x14d/0x940
 [<ffffffff811b0da2>] ? put_pipe_info+0x22/0x60
 [<ffffffff811d96cc>] block_ioctl+0x3c/0x40
 [<ffffffff811ba08d>] do_vfs_ioctl+0x8d/0x5e0
 [<ffffffff811aa329>] ? ____fput+0x9/0x10
 [<ffffffff810e9092>] ? task_work_run+0x72/0x90
 [<ffffffff811ba627>] SyS_ioctl+0x47/0x80
 [<ffffffff8185f5df>] entry_SYSCALL_64_fastpath+0x17/0x93
---[ end trace 7899b295e4f850c8 ]---

It seems fairly obvious that device_create_file() is not being protected
from being run concurrently on the same nbd.

Quentin found the following relevant commits:

1a2ad21 nbd: add locking to nbd_ioctl
90b8f28 [PATCH] end of methods switch: remove the old ones
d4430d6 [PATCH] beginning of methods conversion
08f8585 [PATCH] move block_device_operations to blkdev.h

It would seem that the race was introduced in the process of moving nbd
from BKL to unlocked ioctls.

By setting nbd->task_recv while the mutex is held, we can prevent other
processes from running concurrently (since nbd->task_recv is also checked
while the mutex is held).

Reported-and-tested-by: Quentin Casasnovas <quentin.casasnovas@oracle.com>
Cc: Markus Pargmann <mpa@pengutronix.de>
Cc: Paul Clements <paul.clements@steeleye.com>
Cc: Pavel Machek <pavel@suse.cz>
Cc: Jens Axboe <axboe@fb.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Vegard Nossum <vegard.nossum@oracle.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/nbd.c | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 6f55b262b5ce..a9e398019f38 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -451,14 +451,9 @@ static int nbd_thread_recv(struct nbd_device *nbd, struct block_device *bdev)
 
 	sk_set_memalloc(nbd->sock->sk);
 
-	nbd->task_recv = current;
-
 	ret = device_create_file(disk_to_dev(nbd->disk), &pid_attr);
 	if (ret) {
 		dev_err(disk_to_dev(nbd->disk), "device_create_file failed!\n");
-
-		nbd->task_recv = NULL;
-
 		return ret;
 	}
 
@@ -477,9 +472,6 @@ static int nbd_thread_recv(struct nbd_device *nbd, struct block_device *bdev)
 	nbd_size_clear(nbd, bdev);
 
 	device_remove_file(disk_to_dev(nbd->disk), &pid_attr);
-
-	nbd->task_recv = NULL;
-
 	return ret;
 }
 
@@ -788,6 +780,8 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
 		if (!nbd->sock)
 			return -EINVAL;
 
+		/* We have to claim the device under the lock */
+		nbd->task_recv = current;
 		mutex_unlock(&nbd->tx_lock);
 
 		nbd_parse_flags(nbd, bdev);
@@ -796,6 +790,7 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
 				     nbd_name(nbd));
 		if (IS_ERR(thread)) {
 			mutex_lock(&nbd->tx_lock);
+			nbd->task_recv = NULL;
 			return PTR_ERR(thread);
 		}
 
@@ -805,6 +800,7 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
 		kthread_stop(thread);
 
 		mutex_lock(&nbd->tx_lock);
+		nbd->task_recv = NULL;
 
 		sock_shutdown(nbd);
 		nbd_clear_que(nbd);
-- 
cgit v1.2.3


From 71f79fb3179e69b0c1448a2101a866d871c66e7f Mon Sep 17 00:00:00 2001
From: Gabriel Krisman Bertazi <krisman@linux.vnet.ibm.com>
Date: Mon, 1 Aug 2016 08:23:39 -0600
Subject: blk-mq: Allow timeouts to run while queue is freezing

In case a submitted request gets stuck for some reason, the block layer
can prevent the request starvation by starting the scheduled timeout work.
If this stuck request occurs at the same time another thread has started
a queue freeze, the blk_mq_timeout_work will not be able to acquire the
queue reference and will return silently, thus not issuing the timeout.
But since the request is already holding a q_usage_counter reference and
is unable to complete, it will never release its reference, preventing
the queue from completing the freeze started by first thread.  This puts
the request_queue in a hung state, forever waiting for the freeze
completion.

This was observed while running IO to a NVMe device at the same time we
toggled the CPU hotplug code. Eventually, once a request got stuck
requiring a timeout during a queue freeze, we saw the CPU Hotplug
notification code get stuck inside blk_mq_freeze_queue_wait, as shown in
the trace below.

[c000000deaf13690] [c000000deaf13738] 0xc000000deaf13738 (unreliable)
[c000000deaf13860] [c000000000015ce8] __switch_to+0x1f8/0x350
[c000000deaf138b0] [c000000000ade0e4] __schedule+0x314/0x990
[c000000deaf13940] [c000000000ade7a8] schedule+0x48/0xc0
[c000000deaf13970] [c0000000005492a4] blk_mq_freeze_queue_wait+0x74/0x110
[c000000deaf139e0] [c00000000054b6a8] blk_mq_queue_reinit_notify+0x1a8/0x2e0
[c000000deaf13a40] [c0000000000e7878] notifier_call_chain+0x98/0x100
[c000000deaf13a90] [c0000000000b8e08] cpu_notify_nofail+0x48/0xa0
[c000000deaf13ac0] [c0000000000b92f0] _cpu_down+0x2a0/0x400
[c000000deaf13b90] [c0000000000b94a8] cpu_down+0x58/0xa0
[c000000deaf13bc0] [c0000000006d5dcc] cpu_subsys_offline+0x2c/0x50
[c000000deaf13bf0] [c0000000006cd244] device_offline+0x104/0x140
[c000000deaf13c30] [c0000000006cd40c] online_store+0x6c/0xc0
[c000000deaf13c80] [c0000000006c8c78] dev_attr_store+0x68/0xa0
[c000000deaf13cc0] [c0000000003974d0] sysfs_kf_write+0x80/0xb0
[c000000deaf13d00] [c0000000003963e8] kernfs_fop_write+0x188/0x200
[c000000deaf13d50] [c0000000002e0f6c] __vfs_write+0x6c/0xe0
[c000000deaf13d90] [c0000000002e1ca0] vfs_write+0xc0/0x230
[c000000deaf13de0] [c0000000002e2cdc] SyS_write+0x6c/0x110
[c000000deaf13e30] [c000000000009204] system_call+0x38/0xb4

The fix is to allow the timeout work to execute in the window between
dropping the initial refcount reference and the release of the last
reference, which actually marks the freeze completion.  This can be
achieved with percpu_refcount_tryget, which does not require the counter
to be alive.  This way the timeout work can do it's job and terminate a
stuck request even during a freeze, returning its reference and avoiding
the deadlock.

Allowing the timeout to run is just a part of the fix, since for some
devices, we might get stuck again inside the device driver's timeout
handler, should it attempt to allocate a new request in that path -
which is a quite common action for Abort commands, which need to be sent
after a timeout.  In NVMe, for instance, we call blk_mq_alloc_request
from inside the timeout handler, which will fail during a freeze, since
it also tries to acquire a queue reference.

I considered a similar change to blk_mq_alloc_request as a generic
solution for further device driver hangs, but we can't do that, since it
would allow new requests to disturb the freeze process.  I thought about
creating a new function in the block layer to support unfreezable
requests for these occasions, but after working on it for a while, I
feel like this should be handled in a per-driver basis.  I'm now
experimenting with changes to the NVMe timeout path, but I'm open to
suggestions of ways to make this generic.

Signed-off-by: Gabriel Krisman Bertazi <krisman@linux.vnet.ibm.com>
Cc: Brian King <brking@linux.vnet.ibm.com>
Cc: Keith Busch <keith.busch@intel.com>
Cc: linux-nvme@lists.infradead.org
Cc: linux-block@vger.kernel.org
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq.c | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 576e7112f807..6a63da101bc4 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -672,7 +672,20 @@ static void blk_mq_timeout_work(struct work_struct *work)
 	};
 	int i;
 
-	if (blk_queue_enter(q, true))
+	/* A deadlock might occur if a request is stuck requiring a
+	 * timeout at the same time a queue freeze is waiting
+	 * completion, since the timeout code would not be able to
+	 * acquire the queue reference here.
+	 *
+	 * That's why we don't use blk_queue_enter here; instead, we use
+	 * percpu_ref_tryget directly, because we need to be able to
+	 * obtain a reference even in the short window between the queue
+	 * starting to freeze, by dropping the first reference in
+	 * blk_mq_freeze_queue_start, and the moment the last request is
+	 * consumed, marked by the instant q_usage_counter reaches
+	 * zero.
+	 */
+	if (!percpu_ref_tryget(&q->q_usage_counter))
 		return;
 
 	blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &data);
-- 
cgit v1.2.3


From df08c32ce3be5be138c1dbfcba203314a3a7cd6f Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Sun, 31 Jul 2016 11:15:13 -0700
Subject: block: fix bdi vs gendisk lifetime mismatch

The name for a bdi of a gendisk is derived from the gendisk's devt.
However, since the gendisk is destroyed before the bdi it leaves a
window where a new gendisk could dynamically reuse the same devt while a
bdi with the same name is still live.  Arrange for the bdi to hold a
reference against its "owner" disk device while it is registered.
Otherwise we can hit sysfs duplicate name collisions like the following:

 WARNING: CPU: 10 PID: 2078 at fs/sysfs/dir.c:31 sysfs_warn_dup+0x64/0x80
 sysfs: cannot create duplicate filename '/devices/virtual/bdi/259:1'

 Hardware name: HP ProLiant DL580 Gen8, BIOS P79 05/06/2015
  0000000000000286 0000000002c04ad5 ffff88006f24f970 ffffffff8134caec
  ffff88006f24f9c0 0000000000000000 ffff88006f24f9b0 ffffffff8108c351
  0000001f0000000c ffff88105d236000 ffff88105d1031e0 ffff8800357427f8
 Call Trace:
  [<ffffffff8134caec>] dump_stack+0x63/0x87
  [<ffffffff8108c351>] __warn+0xd1/0xf0
  [<ffffffff8108c3cf>] warn_slowpath_fmt+0x5f/0x80
  [<ffffffff812a0d34>] sysfs_warn_dup+0x64/0x80
  [<ffffffff812a0e1e>] sysfs_create_dir_ns+0x7e/0x90
  [<ffffffff8134faaa>] kobject_add_internal+0xaa/0x320
  [<ffffffff81358d4e>] ? vsnprintf+0x34e/0x4d0
  [<ffffffff8134ff55>] kobject_add+0x75/0xd0
  [<ffffffff816e66b2>] ? mutex_lock+0x12/0x2f
  [<ffffffff8148b0a5>] device_add+0x125/0x610
  [<ffffffff8148b788>] device_create_groups_vargs+0xd8/0x100
  [<ffffffff8148b7cc>] device_create_vargs+0x1c/0x20
  [<ffffffff811b775c>] bdi_register+0x8c/0x180
  [<ffffffff811b7877>] bdi_register_dev+0x27/0x30
  [<ffffffff813317f5>] add_disk+0x175/0x4a0

Cc: <stable@vger.kernel.org>
Reported-by: Yi Zhang <yizhan@redhat.com>
Tested-by: Yi Zhang <yizhan@redhat.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>

Fixed up missing 0 return in bdi_register_owner().

Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/genhd.c                    |  2 +-
 include/linux/backing-dev-defs.h |  1 +
 include/linux/backing-dev.h      |  1 +
 mm/backing-dev.c                 | 19 +++++++++++++++++++
 4 files changed, 22 insertions(+), 1 deletion(-)

diff --git a/block/genhd.c b/block/genhd.c
index 0ad879655f82..fcd6d4fae657 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -614,7 +614,7 @@ void device_add_disk(struct device *parent, struct gendisk *disk)
 
 	/* Register BDI before referencing it from bdev */
 	bdi = &disk->queue->backing_dev_info;
-	bdi_register_dev(bdi, disk_devt(disk));
+	bdi_register_owner(bdi, disk_to_dev(disk));
 
 	blk_register_region(disk_devt(disk), disk->minors, NULL,
 			    exact_match, exact_lock, disk);
diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
index 3f103076d0bf..c357f27d5483 100644
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -163,6 +163,7 @@ struct backing_dev_info {
 	wait_queue_head_t wb_waitq;
 
 	struct device *dev;
+	struct device *owner;
 
 	struct timer_list laptop_mode_wb_timer;
 
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 491a91717788..43b93a947e61 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -24,6 +24,7 @@ __printf(3, 4)
 int bdi_register(struct backing_dev_info *bdi, struct device *parent,
 		const char *fmt, ...);
 int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev);
+int bdi_register_owner(struct backing_dev_info *bdi, struct device *owner);
 void bdi_unregister(struct backing_dev_info *bdi);
 
 int __must_check bdi_setup_and_register(struct backing_dev_info *, char *);
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index efe237742074..8fde443f36d7 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -825,6 +825,20 @@ int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev)
 }
 EXPORT_SYMBOL(bdi_register_dev);
 
+int bdi_register_owner(struct backing_dev_info *bdi, struct device *owner)
+{
+	int rc;
+
+	rc = bdi_register(bdi, NULL, "%u:%u", MAJOR(owner->devt),
+			MINOR(owner->devt));
+	if (rc)
+		return rc;
+	bdi->owner = owner;
+	get_device(owner);
+	return 0;
+}
+EXPORT_SYMBOL(bdi_register_owner);
+
 /*
  * Remove bdi from bdi_list, and ensure that it is no longer visible
  */
@@ -849,6 +863,11 @@ void bdi_unregister(struct backing_dev_info *bdi)
 		device_unregister(bdi->dev);
 		bdi->dev = NULL;
 	}
+
+	if (bdi->owner) {
+		put_device(bdi->owner);
+		bdi->owner = NULL;
+	}
 }
 
 void bdi_exit(struct backing_dev_info *bdi)
-- 
cgit v1.2.3


From b571bc606e714e448b00920987d77b384a6a9570 Mon Sep 17 00:00:00 2001
From: Shaun Tancheff <shaun@tancheff.com>
Date: Sat, 30 Jul 2016 16:45:48 -0500
Subject: Fixup direct bi_rw modifiers

bi_rw should be using bio_set_op_attrs to set bi_rw.

Signed-off-by: Shaun Tancheff <shaun@tancheff.com>
Cc: Chris Mason <clm@fb.com>
Cc: Josef Bacik <jbacik@fb.com>
Cc: David Sterba <dsterba@suse.com>
Cc: Mike Christie <mchristi@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 fs/btrfs/extent_io.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 5850d79806f4..881eb4667051 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2049,7 +2049,7 @@ int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical,
 		return -EIO;
 	}
 	bio->bi_bdev = dev->bdev;
-	bio->bi_rw = WRITE_SYNC;
+	bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_SYNC);
 	bio_add_page(bio, page, length, pg_offset);
 
 	if (btrfsic_submit_bio_wait(bio)) {
-- 
cgit v1.2.3


From 6d25ec147e3a71858bed5439c92accd7f739a0a3 Mon Sep 17 00:00:00 2001
From: John Pittman <jpittman@redhat.com>
Date: Mon, 1 Aug 2016 16:35:53 -0400
Subject: Include: blkdev: Removed duplicate 'struct request;' declaration.

In include/linux/blkdev.h duplicate declarations of the request
struct exist.  Cleaned up by removing the second, unneeded
declaration.

Signed-off-by: John Pittman <jpittman@redhat.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 include/linux/blkdev.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index adf33079771e..de7935961c27 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -47,7 +47,6 @@ struct pr_ops;
  */
 #define BLKCG_MAX_POLS		2
 
-struct request;
 typedef void (rq_end_io_fn)(struct request *, int);
 
 #define BLK_RL_SYNCFULL		(1U << 0)
-- 
cgit v1.2.3


From c0f3fd2b387448d67ae83c4ce1cc69da375b9186 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Tue, 2 Aug 2016 08:45:44 -0600
Subject: blk-mq: fix deadlock in blk_mq_register_disk() error path

If we fail registering any of the hardware queues, we call
into blk_mq_unregister_disk() with the hotplug mutex already
held. Since blk_mq_unregister_disk() attempts to acquire the
same mutex, we end up in a less than happy place.

Reported-by: Jinpu Wang <jinpu.wang@profitbricks.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq-sysfs.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
index 4ea4dd8a1eed..fe822aa5b8e4 100644
--- a/block/blk-mq-sysfs.c
+++ b/block/blk-mq-sysfs.c
@@ -380,15 +380,13 @@ static int blk_mq_register_hctx(struct blk_mq_hw_ctx *hctx)
 	return ret;
 }
 
-void blk_mq_unregister_disk(struct gendisk *disk)
+static void __blk_mq_unregister_disk(struct gendisk *disk)
 {
 	struct request_queue *q = disk->queue;
 	struct blk_mq_hw_ctx *hctx;
 	struct blk_mq_ctx *ctx;
 	int i, j;
 
-	blk_mq_disable_hotplug();
-
 	queue_for_each_hw_ctx(q, hctx, i) {
 		blk_mq_unregister_hctx(hctx);
 
@@ -405,6 +403,12 @@ void blk_mq_unregister_disk(struct gendisk *disk)
 	kobject_put(&disk_to_dev(disk)->kobj);
 
 	q->mq_sysfs_init_done = false;
+}
+
+void blk_mq_unregister_disk(struct gendisk *disk)
+{
+	blk_mq_disable_hotplug();
+	__blk_mq_unregister_disk(disk);
 	blk_mq_enable_hotplug();
 }
 
@@ -450,7 +454,7 @@ int blk_mq_register_disk(struct gendisk *disk)
 	}
 
 	if (ret)
-		blk_mq_unregister_disk(disk);
+		__blk_mq_unregister_disk(disk);
 	else
 		q->mq_sysfs_init_done = true;
 out:
-- 
cgit v1.2.3


From f0225cacfe7e69ff3234a125aeb0f3d65077835c Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 4 Aug 2016 16:10:00 +0200
Subject: loop: don't try to use AIO for discards

Fix a fat-fingered conversion to the req_op accessors, and also
use a switch statement to make it more obvious what is being checked.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reported-by: Dave Chinner <david@fromorbit.com>
Fixes: c2df40 ("drivers: use req op accessor");
Reviewed-by: Ming Lei <ming.lei@canonical.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/loop.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 075377eee0c0..91c2c881cb49 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1659,11 +1659,15 @@ static int loop_queue_rq(struct blk_mq_hw_ctx *hctx,
 	if (lo->lo_state != Lo_bound)
 		return -EIO;
 
-	if (lo->use_dio && (req_op(cmd->rq) != REQ_OP_FLUSH ||
-	    req_op(cmd->rq) == REQ_OP_DISCARD))
-		cmd->use_aio = true;
-	else
+	switch (req_op(cmd->rq)) {
+	case REQ_OP_FLUSH:
+	case REQ_OP_DISCARD:
 		cmd->use_aio = false;
+		break;
+	default:
+		cmd->use_aio = lo->use_dio;
+		break;
+	}
 
 	queue_kthread_work(&lo->worker, &cmd->work);
 
-- 
cgit v1.2.3


From c1c87c2ba9ec06d8ba9e8a26c18c67a2ba9cd9c1 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 4 Aug 2016 16:10:01 +0200
Subject: loop: make do_req_filebacked more robust

Use a switch statement to iterate over the possible operations and
error out if it's an incorrect one.

Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/loop.c | 55 +++++++++++++++++++++-------------------------------
 1 file changed, 22 insertions(+), 33 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 91c2c881cb49..c9f2107f7095 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -510,14 +510,10 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
 	return 0;
 }
 
-
-static inline int lo_rw_simple(struct loop_device *lo,
-		struct request *rq, loff_t pos, bool rw)
+static int do_req_filebacked(struct loop_device *lo, struct request *rq)
 {
 	struct loop_cmd *cmd = blk_mq_rq_to_pdu(rq);
-
-	if (cmd->use_aio)
-		return lo_rw_aio(lo, cmd, pos, rw);
+	loff_t pos = ((loff_t) blk_rq_pos(rq) << 9) + lo->lo_offset;
 
 	/*
 	 * lo_write_simple and lo_read_simple should have been covered
@@ -528,37 +524,30 @@ static inline int lo_rw_simple(struct loop_device *lo,
 	 * of the req at one time. And direct read IO doesn't need to
 	 * run flush_dcache_page().
 	 */
-	if (rw == WRITE)
-		return lo_write_simple(lo, rq, pos);
-	else
-		return lo_read_simple(lo, rq, pos);
-}
-
-static int do_req_filebacked(struct loop_device *lo, struct request *rq)
-{
-	loff_t pos;
-	int ret;
-
-	pos = ((loff_t) blk_rq_pos(rq) << 9) + lo->lo_offset;
-
-	if (op_is_write(req_op(rq))) {
-		if (req_op(rq) == REQ_OP_FLUSH)
-			ret = lo_req_flush(lo, rq);
-		else if (req_op(rq) == REQ_OP_DISCARD)
-			ret = lo_discard(lo, rq, pos);
-		else if (lo->transfer)
-			ret = lo_write_transfer(lo, rq, pos);
+	switch (req_op(rq)) {
+	case REQ_OP_FLUSH:
+		return lo_req_flush(lo, rq);
+	case REQ_OP_DISCARD:
+		return lo_discard(lo, rq, pos);
+	case REQ_OP_WRITE:
+		if (lo->transfer)
+			return lo_write_transfer(lo, rq, pos);
+		else if (cmd->use_aio)
+			return lo_rw_aio(lo, cmd, pos, WRITE);
 		else
-			ret = lo_rw_simple(lo, rq, pos, WRITE);
-
-	} else {
+			return lo_write_simple(lo, rq, pos);
+	case REQ_OP_READ:
 		if (lo->transfer)
-			ret = lo_read_transfer(lo, rq, pos);
+			return lo_read_transfer(lo, rq, pos);
+		else if (cmd->use_aio)
+			return lo_rw_aio(lo, cmd, pos, READ);
 		else
-			ret = lo_rw_simple(lo, rq, pos, READ);
+			return lo_read_simple(lo, rq, pos);
+	default:
+		WARN_ON_ONCE(1);
+		return -EIO;
+		break;
 	}
-
-	return ret;
 }
 
 struct switch_request {
-- 
cgit v1.2.3


From abf545484d31b68777a85c5c8f5b4bcde08283eb Mon Sep 17 00:00:00 2001
From: Mike Christie <mchristi@redhat.com>
Date: Thu, 4 Aug 2016 14:23:34 -0600
Subject: mm/block: convert rw_page users to bio op use

The rw_page users were not converted to use bio/req ops. As a result
bdev_write_page is not passing down REQ_OP_WRITE and the IOs will
be sent down as reads.

Signed-off-by: Mike Christie <mchristi@redhat.com>
Fixes: 4e1b2d52a80d ("block, fs, drivers: remove REQ_OP compat defs and related code")

Modified by me to:

1) Drop op_flags passing into ->rw_page(), as we don't use it.
2) Make op_is_write() and friends safe to use for !CONFIG_BLOCK

Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/brd.c           | 17 +++++++----------
 drivers/block/zram/zram_drv.c | 28 +++++++++++++++-------------
 drivers/nvdimm/btt.c          | 18 +++++++++---------
 drivers/nvdimm/pmem.c         | 12 ++++++------
 fs/block_dev.c                |  7 ++++---
 fs/mpage.c                    |  2 +-
 include/linux/blk_types.h     | 22 +++++++++++-----------
 include/linux/blkdev.h        |  2 +-
 include/linux/fs.h            |  3 ++-
 include/linux/pagemap.h       |  2 +-
 mm/filemap.c                  |  6 +++---
 11 files changed, 60 insertions(+), 59 deletions(-)

diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index 3022dad24071..3439b28cce8b 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -300,20 +300,20 @@ static void copy_from_brd(void *dst, struct brd_device *brd,
  * Process a single bvec of a bio.
  */
 static int brd_do_bvec(struct brd_device *brd, struct page *page,
-			unsigned int len, unsigned int off, int rw,
+			unsigned int len, unsigned int off, int op,
 			sector_t sector)
 {
 	void *mem;
 	int err = 0;
 
-	if (rw != READ) {
+	if (op_is_write(op)) {
 		err = copy_to_brd_setup(brd, sector, len);
 		if (err)
 			goto out;
 	}
 
 	mem = kmap_atomic(page);
-	if (rw == READ) {
+	if (!op_is_write(op)) {
 		copy_from_brd(mem + off, brd, sector, len);
 		flush_dcache_page(page);
 	} else {
@@ -330,7 +330,6 @@ static blk_qc_t brd_make_request(struct request_queue *q, struct bio *bio)
 {
 	struct block_device *bdev = bio->bi_bdev;
 	struct brd_device *brd = bdev->bd_disk->private_data;
-	int rw;
 	struct bio_vec bvec;
 	sector_t sector;
 	struct bvec_iter iter;
@@ -347,14 +346,12 @@ static blk_qc_t brd_make_request(struct request_queue *q, struct bio *bio)
 		goto out;
 	}
 
-	rw = bio_data_dir(bio);
-
 	bio_for_each_segment(bvec, bio, iter) {
 		unsigned int len = bvec.bv_len;
 		int err;
 
 		err = brd_do_bvec(brd, bvec.bv_page, len,
-					bvec.bv_offset, rw, sector);
+					bvec.bv_offset, bio_op(bio), sector);
 		if (err)
 			goto io_error;
 		sector += len >> SECTOR_SHIFT;
@@ -369,11 +366,11 @@ io_error:
 }
 
 static int brd_rw_page(struct block_device *bdev, sector_t sector,
-		       struct page *page, int rw)
+		       struct page *page, int op)
 {
 	struct brd_device *brd = bdev->bd_disk->private_data;
-	int err = brd_do_bvec(brd, page, PAGE_SIZE, 0, rw, sector);
-	page_endio(page, rw & WRITE, err);
+	int err = brd_do_bvec(brd, page, PAGE_SIZE, 0, op, sector);
+	page_endio(page, op, err);
 	return err;
 }
 
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 7454cf188c8e..ca29649c4b08 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -843,15 +843,15 @@ static void zram_bio_discard(struct zram *zram, u32 index,
 }
 
 static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index,
-			int offset, int rw)
+			int offset, int op)
 {
 	unsigned long start_time = jiffies;
 	int ret;
 
-	generic_start_io_acct(rw, bvec->bv_len >> SECTOR_SHIFT,
+	generic_start_io_acct(op, bvec->bv_len >> SECTOR_SHIFT,
 			&zram->disk->part0);
 
-	if (rw == READ) {
+	if (!op_is_write(op)) {
 		atomic64_inc(&zram->stats.num_reads);
 		ret = zram_bvec_read(zram, bvec, index, offset);
 	} else {
@@ -859,10 +859,10 @@ static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index,
 		ret = zram_bvec_write(zram, bvec, index, offset);
 	}
 
-	generic_end_io_acct(rw, &zram->disk->part0, start_time);
+	generic_end_io_acct(op, &zram->disk->part0, start_time);
 
 	if (unlikely(ret)) {
-		if (rw == READ)
+		if (!op_is_write(op))
 			atomic64_inc(&zram->stats.failed_reads);
 		else
 			atomic64_inc(&zram->stats.failed_writes);
@@ -873,7 +873,7 @@ static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index,
 
 static void __zram_make_request(struct zram *zram, struct bio *bio)
 {
-	int offset, rw;
+	int offset;
 	u32 index;
 	struct bio_vec bvec;
 	struct bvec_iter iter;
@@ -888,7 +888,6 @@ static void __zram_make_request(struct zram *zram, struct bio *bio)
 		return;
 	}
 
-	rw = bio_data_dir(bio);
 	bio_for_each_segment(bvec, bio, iter) {
 		int max_transfer_size = PAGE_SIZE - offset;
 
@@ -903,15 +902,18 @@ static void __zram_make_request(struct zram *zram, struct bio *bio)
 			bv.bv_len = max_transfer_size;
 			bv.bv_offset = bvec.bv_offset;
 
-			if (zram_bvec_rw(zram, &bv, index, offset, rw) < 0)
+			if (zram_bvec_rw(zram, &bv, index, offset,
+					 bio_op(bio)) < 0)
 				goto out;
 
 			bv.bv_len = bvec.bv_len - max_transfer_size;
 			bv.bv_offset += max_transfer_size;
-			if (zram_bvec_rw(zram, &bv, index + 1, 0, rw) < 0)
+			if (zram_bvec_rw(zram, &bv, index + 1, 0,
+					 bio_op(bio)) < 0)
 				goto out;
 		} else
-			if (zram_bvec_rw(zram, &bvec, index, offset, rw) < 0)
+			if (zram_bvec_rw(zram, &bvec, index, offset,
+					 bio_op(bio)) < 0)
 				goto out;
 
 		update_position(&index, &offset, &bvec);
@@ -968,7 +970,7 @@ static void zram_slot_free_notify(struct block_device *bdev,
 }
 
 static int zram_rw_page(struct block_device *bdev, sector_t sector,
-		       struct page *page, int rw)
+		       struct page *page, int op)
 {
 	int offset, err = -EIO;
 	u32 index;
@@ -992,7 +994,7 @@ static int zram_rw_page(struct block_device *bdev, sector_t sector,
 	bv.bv_len = PAGE_SIZE;
 	bv.bv_offset = 0;
 
-	err = zram_bvec_rw(zram, &bv, index, offset, rw);
+	err = zram_bvec_rw(zram, &bv, index, offset, op);
 put_zram:
 	zram_meta_put(zram);
 out:
@@ -1005,7 +1007,7 @@ out:
 	 * (e.g., SetPageError, set_page_dirty and extra works).
 	 */
 	if (err == 0)
-		page_endio(page, rw, 0);
+		page_endio(page, op, 0);
 	return err;
 }
 
diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c
index 9dce03f420eb..7cf3bdfaf809 100644
--- a/drivers/nvdimm/btt.c
+++ b/drivers/nvdimm/btt.c
@@ -1133,11 +1133,11 @@ static int btt_write_pg(struct btt *btt, struct bio_integrity_payload *bip,
 
 static int btt_do_bvec(struct btt *btt, struct bio_integrity_payload *bip,
 			struct page *page, unsigned int len, unsigned int off,
-			int rw, sector_t sector)
+			int op, sector_t sector)
 {
 	int ret;
 
-	if (rw == READ) {
+	if (!op_is_write(op)) {
 		ret = btt_read_pg(btt, bip, page, off, sector, len);
 		flush_dcache_page(page);
 	} else {
@@ -1155,7 +1155,7 @@ static blk_qc_t btt_make_request(struct request_queue *q, struct bio *bio)
 	struct bvec_iter iter;
 	unsigned long start;
 	struct bio_vec bvec;
-	int err = 0, rw;
+	int err = 0;
 	bool do_acct;
 
 	/*
@@ -1170,7 +1170,6 @@ static blk_qc_t btt_make_request(struct request_queue *q, struct bio *bio)
 	}
 
 	do_acct = nd_iostat_start(bio, &start);
-	rw = bio_data_dir(bio);
 	bio_for_each_segment(bvec, bio, iter) {
 		unsigned int len = bvec.bv_len;
 
@@ -1181,11 +1180,12 @@ static blk_qc_t btt_make_request(struct request_queue *q, struct bio *bio)
 		BUG_ON(len % btt->sector_size);
 
 		err = btt_do_bvec(btt, bip, bvec.bv_page, len, bvec.bv_offset,
-				rw, iter.bi_sector);
+				  bio_op(bio), iter.bi_sector);
 		if (err) {
 			dev_info(&btt->nd_btt->dev,
 					"io error in %s sector %lld, len %d,\n",
-					(rw == READ) ? "READ" : "WRITE",
+					(op_is_write(bio_op(bio))) ? "WRITE" :
+					"READ",
 					(unsigned long long) iter.bi_sector, len);
 			bio->bi_error = err;
 			break;
@@ -1200,12 +1200,12 @@ out:
 }
 
 static int btt_rw_page(struct block_device *bdev, sector_t sector,
-		struct page *page, int rw)
+		struct page *page, int op)
 {
 	struct btt *btt = bdev->bd_disk->private_data;
 
-	btt_do_bvec(btt, NULL, page, PAGE_SIZE, 0, rw, sector);
-	page_endio(page, rw & WRITE, 0);
+	btt_do_bvec(btt, NULL, page, PAGE_SIZE, 0, op, sector);
+	page_endio(page, op, 0);
 	return 0;
 }
 
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index b511099457db..d64d92481c1d 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -67,7 +67,7 @@ static void pmem_clear_poison(struct pmem_device *pmem, phys_addr_t offset,
 }
 
 static int pmem_do_bvec(struct pmem_device *pmem, struct page *page,
-			unsigned int len, unsigned int off, int rw,
+			unsigned int len, unsigned int off, int op,
 			sector_t sector)
 {
 	int rc = 0;
@@ -79,7 +79,7 @@ static int pmem_do_bvec(struct pmem_device *pmem, struct page *page,
 	if (unlikely(is_bad_pmem(&pmem->bb, sector, len)))
 		bad_pmem = true;
 
-	if (rw == READ) {
+	if (!op_is_write(op)) {
 		if (unlikely(bad_pmem))
 			rc = -EIO;
 		else {
@@ -134,7 +134,7 @@ static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio)
 	do_acct = nd_iostat_start(bio, &start);
 	bio_for_each_segment(bvec, bio, iter) {
 		rc = pmem_do_bvec(pmem, bvec.bv_page, bvec.bv_len,
-				bvec.bv_offset, bio_data_dir(bio),
+				bvec.bv_offset, bio_op(bio),
 				iter.bi_sector);
 		if (rc) {
 			bio->bi_error = rc;
@@ -152,12 +152,12 @@ static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio)
 }
 
 static int pmem_rw_page(struct block_device *bdev, sector_t sector,
-		       struct page *page, int rw)
+		       struct page *page, int op)
 {
 	struct pmem_device *pmem = bdev->bd_queue->queuedata;
 	int rc;
 
-	rc = pmem_do_bvec(pmem, page, PAGE_SIZE, 0, rw, sector);
+	rc = pmem_do_bvec(pmem, page, PAGE_SIZE, 0, op, sector);
 
 	/*
 	 * The ->rw_page interface is subtle and tricky.  The core
@@ -166,7 +166,7 @@ static int pmem_rw_page(struct block_device *bdev, sector_t sector,
 	 * caused by double completion.
 	 */
 	if (rc == 0)
-		page_endio(page, rw & WRITE, 0);
+		page_endio(page, op, 0);
 
 	return rc;
 }
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 2033a3f91d58..d402899ba135 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -416,7 +416,8 @@ int bdev_read_page(struct block_device *bdev, sector_t sector,
 	result = blk_queue_enter(bdev->bd_queue, false);
 	if (result)
 		return result;
-	result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, READ);
+	result = ops->rw_page(bdev, sector + get_start_sect(bdev), page,
+			      REQ_OP_READ);
 	blk_queue_exit(bdev->bd_queue);
 	return result;
 }
@@ -445,7 +446,6 @@ int bdev_write_page(struct block_device *bdev, sector_t sector,
 			struct page *page, struct writeback_control *wbc)
 {
 	int result;
-	int rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE;
 	const struct block_device_operations *ops = bdev->bd_disk->fops;
 
 	if (!ops->rw_page || bdev_get_integrity(bdev))
@@ -455,7 +455,8 @@ int bdev_write_page(struct block_device *bdev, sector_t sector,
 		return result;
 
 	set_page_writeback(page);
-	result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, rw);
+	result = ops->rw_page(bdev, sector + get_start_sect(bdev), page,
+			      REQ_OP_WRITE);
 	if (result)
 		end_page_writeback(page);
 	else
diff --git a/fs/mpage.c b/fs/mpage.c
index 2ca1f39c8cba..7a09c55b4bd0 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -50,7 +50,7 @@ static void mpage_end_io(struct bio *bio)
 
 	bio_for_each_segment_all(bv, bio, i) {
 		struct page *page = bv->bv_page;
-		page_endio(page, bio_data_dir(bio), bio->bi_error);
+		page_endio(page, bio_op(bio), bio->bi_error);
 	}
 
 	bio_put(bio);
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index f254eb264924..14b28ff2caf8 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -18,6 +18,17 @@ struct cgroup_subsys_state;
 typedef void (bio_end_io_t) (struct bio *);
 typedef void (bio_destructor_t) (struct bio *);
 
+enum req_op {
+	REQ_OP_READ,
+	REQ_OP_WRITE,
+	REQ_OP_DISCARD,		/* request to discard sectors */
+	REQ_OP_SECURE_ERASE,	/* request to securely erase sectors */
+	REQ_OP_WRITE_SAME,	/* write same block many times */
+	REQ_OP_FLUSH,		/* request for cache flush */
+};
+
+#define REQ_OP_BITS 3
+
 #ifdef CONFIG_BLOCK
 /*
  * main unit of I/O for the block layer and lower layers (ie drivers and
@@ -228,17 +239,6 @@ enum rq_flag_bits {
 #define REQ_HASHED		(1ULL << __REQ_HASHED)
 #define REQ_MQ_INFLIGHT		(1ULL << __REQ_MQ_INFLIGHT)
 
-enum req_op {
-	REQ_OP_READ,
-	REQ_OP_WRITE,
-	REQ_OP_DISCARD,		/* request to discard sectors */
-	REQ_OP_SECURE_ERASE,	/* request to securely erase sectors */
-	REQ_OP_WRITE_SAME,	/* write same block many times */
-	REQ_OP_FLUSH,		/* request for cache flush */
-};
-
-#define REQ_OP_BITS 3
-
 typedef unsigned int blk_qc_t;
 #define BLK_QC_T_NONE	-1U
 #define BLK_QC_T_SHIFT	16
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index de7935961c27..ccd68c0d01de 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1672,7 +1672,7 @@ struct blk_dax_ctl {
 struct block_device_operations {
 	int (*open) (struct block_device *, fmode_t);
 	void (*release) (struct gendisk *, fmode_t);
-	int (*rw_page)(struct block_device *, sector_t, struct page *, int rw);
+	int (*rw_page)(struct block_device *, sector_t, struct page *, int op);
 	int (*ioctl) (struct block_device *, fmode_t, unsigned, unsigned long);
 	int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long);
 	long (*direct_access)(struct block_device *, sector_t, void **, pfn_t *,
diff --git a/include/linux/fs.h b/include/linux/fs.h
index f3f0b4c8e8ac..498255e6914e 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2480,12 +2480,13 @@ extern void init_special_inode(struct inode *, umode_t, dev_t);
 extern void make_bad_inode(struct inode *);
 extern bool is_bad_inode(struct inode *);
 
-#ifdef CONFIG_BLOCK
 static inline bool op_is_write(unsigned int op)
 {
 	return op == REQ_OP_READ ? false : true;
 }
 
+#ifdef CONFIG_BLOCK
+
 /*
  * return data direction, READ or WRITE
  */
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 81363b834900..45786374abbd 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -510,7 +510,7 @@ static inline void wait_on_page_writeback(struct page *page)
 extern void end_page_writeback(struct page *page);
 void wait_for_stable_page(struct page *page);
 
-void page_endio(struct page *page, int rw, int err);
+void page_endio(struct page *page, int op, int err);
 
 /*
  * Add an arbitrary waiter to a page's wait queue
diff --git a/mm/filemap.c b/mm/filemap.c
index 3083ded98b15..daef091d4c50 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -887,9 +887,9 @@ EXPORT_SYMBOL(end_page_writeback);
  * After completing I/O on a page, call this routine to update the page
  * flags appropriately
  */
-void page_endio(struct page *page, int rw, int err)
+void page_endio(struct page *page, int op, int err)
 {
-	if (rw == READ) {
+	if (!op_is_write(op)) {
 		if (!err) {
 			SetPageUptodate(page);
 		} else {
@@ -897,7 +897,7 @@ void page_endio(struct page *page, int rw, int err)
 			SetPageError(page);
 		}
 		unlock_page(page);
-	} else { /* rw == WRITE */
+	} else {
 		if (err) {
 			SetPageError(page);
 			if (page->mapping)
-- 
cgit v1.2.3