From ff06db1efb2ad6db06eb5b99b88a0c15a9cc9b0e Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Thu, 16 Jun 2016 09:53:58 +0200 Subject: floppy: fix open(O_ACCMODE) for ioctl-only open Commit 09954bad4 ("floppy: refactor open() flags handling"), as a side-effect, causes open(/dev/fdX, O_ACCMODE) to fail. It turns out that this is being used setfdprm userspace for ioctl-only open(). Reintroduce back the original behavior wrt !(FMODE_READ|FMODE_WRITE) modes, while still keeping the original O_NDELAY bug fixed. Cc: stable@vger.kernel.org # v4.5+ Reported-by: Wim Osterholt Tested-by: Wim Osterholt Signed-off-by: Jiri Kosina Signed-off-by: Jens Axboe --- drivers/block/floppy.c | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index c557057fe8ae..b71a9c767009 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -3663,11 +3663,6 @@ static int floppy_open(struct block_device *bdev, fmode_t mode) opened_bdev[drive] = bdev; - if (!(mode & (FMODE_READ|FMODE_WRITE))) { - res = -EINVAL; - goto out; - } - res = -ENXIO; if (!floppy_track_buffer) { @@ -3711,13 +3706,15 @@ static int floppy_open(struct block_device *bdev, fmode_t mode) if (UFDCS->rawcmd == 1) UFDCS->rawcmd = 2; - UDRS->last_checked = 0; - clear_bit(FD_OPEN_SHOULD_FAIL_BIT, &UDRS->flags); - check_disk_change(bdev); - if (test_bit(FD_DISK_CHANGED_BIT, &UDRS->flags)) - goto out; - if (test_bit(FD_OPEN_SHOULD_FAIL_BIT, &UDRS->flags)) - goto out; + if (mode & (FMODE_READ|FMODE_WRITE)) { + UDRS->last_checked = 0; + clear_bit(FD_OPEN_SHOULD_FAIL_BIT, &UDRS->flags); + check_disk_change(bdev); + if (test_bit(FD_DISK_CHANGED_BIT, &UDRS->flags)) + goto out; + if (test_bit(FD_OPEN_SHOULD_FAIL_BIT, &UDRS->flags)) + goto out; + } res = -EROFS; -- cgit v1.2.3 From dc5ff2b1d66f21c27a4c37236636dff6946437e4 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 26 Jul 2016 11:38:20 +0200 Subject: writeback: Write dirty times for WB_SYNC_ALL writeback Currently we take care to handle I_DIRTY_TIME in vfs_fsync() and queue_io() so that inodes which have only dirty timestamps are properly written on fsync(2) and sync(2). However there are other call sites - most notably going through write_inode_now() - which expect inode to be clean after WB_SYNC_ALL writeback. This is not currently true as we do not clear I_DIRTY_TIME in __writeback_single_inode() even for WB_SYNC_ALL writeback in all the cases. This then resulted in the following oops because bdev_write_inode() did not clean the inode and writeback code later stumbled over a dirty inode with detached wb. general protection fault: 0000 [#1] SMP DEBUG_PAGEALLOC KASAN Modules linked in: CPU: 3 PID: 32 Comm: kworker/u10:1 Not tainted 4.6.0-rc3+ #349 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 Workqueue: writeback wb_workfn (flush-11:0) task: ffff88006ccf1840 ti: ffff88006cda8000 task.ti: ffff88006cda8000 RIP: 0010:[] [] locked_inode_to_wb_and_lock_list+0xa2/0x750 RSP: 0018:ffff88006cdaf7d0 EFLAGS: 00010246 RAX: 0000000000000000 RBX: 0000000000000000 RCX: ffff88006ccf2050 RDX: 0000000000000000 RSI: 000000114c8a8484 RDI: 0000000000000286 RBP: ffff88006cdaf820 R08: ffff88006ccf1840 R09: 0000000000000000 R10: 000229915090805f R11: 0000000000000001 R12: ffff88006a72f5e0 R13: dffffc0000000000 R14: ffffed000d4e5eed R15: ffffffff8830cf40 FS: 0000000000000000(0000) GS:ffff88006d500000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000003301bf8 CR3: 000000006368f000 CR4: 00000000000006e0 DR0: 0000000000001ec9 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000600 Stack: ffff88006a72f680 ffff88006a72f768 ffff8800671230d8 03ff88006cdaf948 ffff88006a72f668 ffff88006a72f5e0 ffff8800671230d8 ffff88006cdaf948 ffff880065b90cc8 ffff880067123100 ffff88006cdaf970 ffffffff8188e12e Call Trace: [< inline >] inode_to_wb_and_lock_list fs/fs-writeback.c:309 [] writeback_sb_inodes+0x4de/0x1250 fs/fs-writeback.c:1554 [] __writeback_inodes_wb+0x104/0x1e0 fs/fs-writeback.c:1600 [] wb_writeback+0x7ce/0xc90 fs/fs-writeback.c:1709 [< inline >] wb_do_writeback fs/fs-writeback.c:1844 [] wb_workfn+0x2f9/0x1000 fs/fs-writeback.c:1884 [] process_one_work+0x78e/0x15c0 kernel/workqueue.c:2094 [] worker_thread+0xdb/0xfc0 kernel/workqueue.c:2228 [] kthread+0x23f/0x2d0 drivers/block/aoe/aoecmd.c:1303 [] ret_from_fork+0x22/0x50 arch/x86/entry/entry_64.S:392 Code: 05 94 4a a8 06 85 c0 0f 85 03 03 00 00 e8 07 15 d0 ff 41 80 3e 00 0f 85 64 06 00 00 49 8b 9c 24 88 01 00 00 48 89 d8 48 c1 e8 03 <42> 80 3c 28 00 0f 85 17 06 00 00 48 8b 03 48 83 c0 50 48 39 c3 RIP [< inline >] wb_get include/linux/backing-dev-defs.h:212 RIP [] locked_inode_to_wb_and_lock_list+0xa2/0x750 fs/fs-writeback.c:281 RSP ---[ end trace 986a4d314dcb2694 ]--- Fix the problem by making sure __writeback_single_inode() writes inode only with dirty times in WB_SYNC_ALL mode. Reported-by: Dmitry Vyukov Tested-by: Laurent Dufour Signed-off-by: Jan Kara Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 56c8fda436c0..4d09d4441e3e 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -1327,6 +1327,7 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc) dirty = inode->i_state & I_DIRTY; if (inode->i_state & I_DIRTY_TIME) { if ((dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) || + wbc->sync_mode == WB_SYNC_ALL || unlikely(inode->i_state & I_DIRTY_TIME_EXPIRED) || unlikely(time_after(jiffies, (inode->dirtied_time_when + -- cgit v1.2.3 From bfd279a868987e4bca7dc72a029e0fc50316d6df Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Tue, 26 Jul 2016 17:13:42 +0800 Subject: blkcg: kill unused field nr_undestroyed_grps 'nr_undestroyed_grps' in struct throtl_data was used to count the number of throtl_grp related with throtl_data, but now throtl_grp is tracked by blkcg_gq, so it is useless anymore. Signed-off-by: Hou Tao Signed-off-by: Jens Axboe --- block/blk-throttle.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 47a3e540631a..c5494e403239 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -145,11 +145,6 @@ struct throtl_data /* Total Number of queued bios on READ and WRITE lists */ unsigned int nr_queued[2]; - /* - * number of total undestroyed groups - */ - unsigned int nr_undestroyed_grps; - /* Work for dispatching throttled bios */ struct work_struct dispatch_work; }; -- cgit v1.2.3 From 20bd723ec6a3261df5e02250cd3a1fbb09a343f2 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Wed, 27 Jul 2016 07:22:05 +0200 Subject: block: add missing group association in bio-cloning functions When a bio is cloned, the newly created bio must be associated with the same blkcg as the original bio (if BLK_CGROUP is enabled). If this operation is not performed, then the new bio is not associated with any group, and the group of the current task is returned when the group of the bio is requested. Depending on the cloning frequency, this may cause a large percentage of the bios belonging to a given group to be treated as if belonging to other groups (in most cases as if belonging to the root group). The expected group isolation may thereby be broken. This commit adds the missing association in bio-cloning functions. Fixes: da2f0f74cf7d ("Btrfs: add support for blkio controllers") Cc: stable@vger.kernel.org # v4.3+ Signed-off-by: Paolo Valente Reviewed-by: Nikolay Borisov Reviewed-by: Jeff Moyer Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- block/bio.c | 15 +++++++++++++++ fs/btrfs/extent_io.c | 6 ------ include/linux/bio.h | 3 +++ 3 files changed, 18 insertions(+), 6 deletions(-) diff --git a/block/bio.c b/block/bio.c index 54ee3846c3a5..3f76a38a5e2d 100644 --- a/block/bio.c +++ b/block/bio.c @@ -583,6 +583,8 @@ void __bio_clone_fast(struct bio *bio, struct bio *bio_src) bio->bi_rw = bio_src->bi_rw; bio->bi_iter = bio_src->bi_iter; bio->bi_io_vec = bio_src->bi_io_vec; + + bio_clone_blkcg_association(bio, bio_src); } EXPORT_SYMBOL(__bio_clone_fast); @@ -687,6 +689,8 @@ integrity_clone: } } + bio_clone_blkcg_association(bio, bio_src); + return bio; } EXPORT_SYMBOL(bio_clone_bioset); @@ -2004,6 +2008,17 @@ void bio_disassociate_task(struct bio *bio) } } +/** + * bio_clone_blkcg_association - clone blkcg association from src to dst bio + * @dst: destination bio + * @src: source bio + */ +void bio_clone_blkcg_association(struct bio *dst, struct bio *src) +{ + if (src->bi_css) + WARN_ON(bio_associate_blkcg(dst, src->bi_css)); +} + #endif /* CONFIG_BLK_CGROUP */ static void __init biovec_init_slabs(void) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index cee4cb99b8ce..5850d79806f4 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2697,12 +2697,6 @@ struct bio *btrfs_bio_clone(struct bio *bio, gfp_t gfp_mask) btrfs_bio->csum = NULL; btrfs_bio->csum_allocated = NULL; btrfs_bio->end_io = NULL; - -#ifdef CONFIG_BLK_CGROUP - /* FIXME, put this into bio_clone_bioset */ - if (bio->bi_css) - bio_associate_blkcg(new, bio->bi_css); -#endif } return new; } diff --git a/include/linux/bio.h b/include/linux/bio.h index 583c10810e32..e09a8895fc31 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -470,11 +470,14 @@ extern unsigned int bvec_nr_vecs(unsigned short idx); int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css); int bio_associate_current(struct bio *bio); void bio_disassociate_task(struct bio *bio); +void bio_clone_blkcg_association(struct bio *dst, struct bio *src); #else /* CONFIG_BLK_CGROUP */ static inline int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css) { return 0; } static inline int bio_associate_current(struct bio *bio) { return -ENOENT; } static inline void bio_disassociate_task(struct bio *bio) { } +static inline void bio_clone_blkcg_association(struct bio *dst, + struct bio *src) { } #endif /* CONFIG_BLK_CGROUP */ #ifdef CONFIG_HIGHMEM -- cgit v1.2.3 From 1aee6b9a7d947d42ed84baa4cf461e9d943b80f0 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 27 Jul 2016 12:52:21 -0600 Subject: f2fs: drop bio->bi_rw manual assignment Merge 4fc29c1aa375 included this extra line, but it's not needed (or useful) since we'll bio_set_op_attrs() right after to properly set the op and flags for the bio. Signed-off-by: Jens Axboe --- fs/f2fs/data.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index e2624275d828..d64d2a515cb2 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -245,7 +245,6 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio) bio_put(bio); return -EFAULT; } - bio->bi_rw = fio->op_flags; bio_set_op_attrs(bio, fio->op, fio->op_flags); __submit_bio(fio->sbi, bio, fio->type); -- cgit v1.2.3 From 77da160530dd1dc94f6ae15a981f24e5f0021e84 Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Fri, 29 Jul 2016 10:40:31 +0200 Subject: block: fix use-after-free in seq file I got a KASAN report of use-after-free: ================================================================== BUG: KASAN: use-after-free in klist_iter_exit+0x61/0x70 at addr ffff8800b6581508 Read of size 8 by task trinity-c1/315 ============================================================================= BUG kmalloc-32 (Not tainted): kasan: bad access detected ----------------------------------------------------------------------------- Disabling lock debugging due to kernel taint INFO: Allocated in disk_seqf_start+0x66/0x110 age=144 cpu=1 pid=315 ___slab_alloc+0x4f1/0x520 __slab_alloc.isra.58+0x56/0x80 kmem_cache_alloc_trace+0x260/0x2a0 disk_seqf_start+0x66/0x110 traverse+0x176/0x860 seq_read+0x7e3/0x11a0 proc_reg_read+0xbc/0x180 do_loop_readv_writev+0x134/0x210 do_readv_writev+0x565/0x660 vfs_readv+0x67/0xa0 do_preadv+0x126/0x170 SyS_preadv+0xc/0x10 do_syscall_64+0x1a1/0x460 return_from_SYSCALL_64+0x0/0x6a INFO: Freed in disk_seqf_stop+0x42/0x50 age=160 cpu=1 pid=315 __slab_free+0x17a/0x2c0 kfree+0x20a/0x220 disk_seqf_stop+0x42/0x50 traverse+0x3b5/0x860 seq_read+0x7e3/0x11a0 proc_reg_read+0xbc/0x180 do_loop_readv_writev+0x134/0x210 do_readv_writev+0x565/0x660 vfs_readv+0x67/0xa0 do_preadv+0x126/0x170 SyS_preadv+0xc/0x10 do_syscall_64+0x1a1/0x460 return_from_SYSCALL_64+0x0/0x6a CPU: 1 PID: 315 Comm: trinity-c1 Tainted: G B 4.7.0+ #62 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 ffffea0002d96000 ffff880119b9f918 ffffffff81d6ce81 ffff88011a804480 ffff8800b6581500 ffff880119b9f948 ffffffff8146c7bd ffff88011a804480 ffffea0002d96000 ffff8800b6581500 fffffffffffffff4 ffff880119b9f970 Call Trace: [] dump_stack+0x65/0x84 [] print_trailer+0x10d/0x1a0 [] object_err+0x2f/0x40 [] kasan_report_error+0x221/0x520 [] __asan_report_load8_noabort+0x3e/0x40 [] klist_iter_exit+0x61/0x70 [] class_dev_iter_exit+0x9/0x10 [] disk_seqf_stop+0x3a/0x50 [] seq_read+0x4b2/0x11a0 [] proc_reg_read+0xbc/0x180 [] do_loop_readv_writev+0x134/0x210 [] do_readv_writev+0x565/0x660 [] vfs_readv+0x67/0xa0 [] do_preadv+0x126/0x170 [] SyS_preadv+0xc/0x10 This problem can occur in the following situation: open() - pread() - .seq_start() - iter = kmalloc() // succeeds - seqf->private = iter - .seq_stop() - kfree(seqf->private) - pread() - .seq_start() - iter = kmalloc() // fails - .seq_stop() - class_dev_iter_exit(seqf->private) // boom! old pointer As the comment in disk_seqf_stop() says, stop is called even if start failed, so we need to reinitialise the private pointer to NULL when seq iteration stops. An alternative would be to set the private pointer to NULL when the kmalloc() in disk_seqf_start() fails. Cc: stable@vger.kernel.org Signed-off-by: Vegard Nossum Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- block/genhd.c | 1 + 1 file changed, 1 insertion(+) diff --git a/block/genhd.c b/block/genhd.c index 3c9dede4e04f..0ad879655f82 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -856,6 +856,7 @@ static void disk_seqf_stop(struct seq_file *seqf, void *v) if (iter) { class_dev_iter_exit(iter); kfree(iter); + seqf->private = NULL; } } -- cgit v1.2.3 From 97240963eb308d8d21a89c0459822f7ea98463b4 Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Fri, 27 May 2016 12:59:35 +0200 Subject: nbd: fix race in ioctl Quentin ran into this bug: WARNING: CPU: 64 PID: 10085 at fs/sysfs/dir.c:31 sysfs_warn_dup+0x65/0x80 sysfs: cannot create duplicate filename '/devices/virtual/block/nbd3/pid' Modules linked in: nbd CPU: 64 PID: 10085 Comm: qemu-nbd Tainted: G D 4.6.0+ #7 0000000000000000 ffff8820330bba68 ffffffff814b8791 ffff8820330bbac8 0000000000000000 ffff8820330bbab8 ffffffff810d04ab ffff8820330bbaa8 0000001f00000296 0000000000017681 ffff8810380bf000 ffffffffa0001790 Call Trace: [] dump_stack+0x4d/0x6c [] __warn+0xdb/0x100 [] warn_slowpath_fmt+0x44/0x50 [] sysfs_warn_dup+0x65/0x80 [] sysfs_add_file_mode_ns+0x172/0x180 [] sysfs_create_file_ns+0x25/0x30 [] device_create_file+0x36/0x90 [] __nbd_ioctl+0x32d/0x9b0 [nbd] [] ? find_next_bit+0x18/0x20 [] ? select_idle_sibling+0xe9/0x120 [] ? __enqueue_entity+0x67/0x70 [] ? enqueue_task_fair+0x630/0xe20 [] ? resched_curr+0x36/0x70 [] ? check_preempt_curr+0x78/0x90 [] ? ttwu_do_wakeup+0x12/0x80 [] ? ttwu_do_activate.constprop.86+0x61/0x70 [] ? try_to_wake_up+0x185/0x2d0 [] ? default_wake_function+0xd/0x10 [] ? autoremove_wake_function+0x11/0x40 [] nbd_ioctl+0x67/0x94 [nbd] [] blkdev_ioctl+0x14d/0x940 [] ? put_pipe_info+0x22/0x60 [] block_ioctl+0x3c/0x40 [] do_vfs_ioctl+0x8d/0x5e0 [] ? ____fput+0x9/0x10 [] ? task_work_run+0x72/0x90 [] SyS_ioctl+0x47/0x80 [] entry_SYSCALL_64_fastpath+0x17/0x93 ---[ end trace 7899b295e4f850c8 ]--- It seems fairly obvious that device_create_file() is not being protected from being run concurrently on the same nbd. Quentin found the following relevant commits: 1a2ad21 nbd: add locking to nbd_ioctl 90b8f28 [PATCH] end of methods switch: remove the old ones d4430d6 [PATCH] beginning of methods conversion 08f8585 [PATCH] move block_device_operations to blkdev.h It would seem that the race was introduced in the process of moving nbd from BKL to unlocked ioctls. By setting nbd->task_recv while the mutex is held, we can prevent other processes from running concurrently (since nbd->task_recv is also checked while the mutex is held). Reported-and-tested-by: Quentin Casasnovas Cc: Markus Pargmann Cc: Paul Clements Cc: Pavel Machek Cc: Jens Axboe Cc: Al Viro Signed-off-by: Vegard Nossum Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 6f55b262b5ce..a9e398019f38 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -451,14 +451,9 @@ static int nbd_thread_recv(struct nbd_device *nbd, struct block_device *bdev) sk_set_memalloc(nbd->sock->sk); - nbd->task_recv = current; - ret = device_create_file(disk_to_dev(nbd->disk), &pid_attr); if (ret) { dev_err(disk_to_dev(nbd->disk), "device_create_file failed!\n"); - - nbd->task_recv = NULL; - return ret; } @@ -477,9 +472,6 @@ static int nbd_thread_recv(struct nbd_device *nbd, struct block_device *bdev) nbd_size_clear(nbd, bdev); device_remove_file(disk_to_dev(nbd->disk), &pid_attr); - - nbd->task_recv = NULL; - return ret; } @@ -788,6 +780,8 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, if (!nbd->sock) return -EINVAL; + /* We have to claim the device under the lock */ + nbd->task_recv = current; mutex_unlock(&nbd->tx_lock); nbd_parse_flags(nbd, bdev); @@ -796,6 +790,7 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, nbd_name(nbd)); if (IS_ERR(thread)) { mutex_lock(&nbd->tx_lock); + nbd->task_recv = NULL; return PTR_ERR(thread); } @@ -805,6 +800,7 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, kthread_stop(thread); mutex_lock(&nbd->tx_lock); + nbd->task_recv = NULL; sock_shutdown(nbd); nbd_clear_que(nbd); -- cgit v1.2.3 From 71f79fb3179e69b0c1448a2101a866d871c66e7f Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Mon, 1 Aug 2016 08:23:39 -0600 Subject: blk-mq: Allow timeouts to run while queue is freezing In case a submitted request gets stuck for some reason, the block layer can prevent the request starvation by starting the scheduled timeout work. If this stuck request occurs at the same time another thread has started a queue freeze, the blk_mq_timeout_work will not be able to acquire the queue reference and will return silently, thus not issuing the timeout. But since the request is already holding a q_usage_counter reference and is unable to complete, it will never release its reference, preventing the queue from completing the freeze started by first thread. This puts the request_queue in a hung state, forever waiting for the freeze completion. This was observed while running IO to a NVMe device at the same time we toggled the CPU hotplug code. Eventually, once a request got stuck requiring a timeout during a queue freeze, we saw the CPU Hotplug notification code get stuck inside blk_mq_freeze_queue_wait, as shown in the trace below. [c000000deaf13690] [c000000deaf13738] 0xc000000deaf13738 (unreliable) [c000000deaf13860] [c000000000015ce8] __switch_to+0x1f8/0x350 [c000000deaf138b0] [c000000000ade0e4] __schedule+0x314/0x990 [c000000deaf13940] [c000000000ade7a8] schedule+0x48/0xc0 [c000000deaf13970] [c0000000005492a4] blk_mq_freeze_queue_wait+0x74/0x110 [c000000deaf139e0] [c00000000054b6a8] blk_mq_queue_reinit_notify+0x1a8/0x2e0 [c000000deaf13a40] [c0000000000e7878] notifier_call_chain+0x98/0x100 [c000000deaf13a90] [c0000000000b8e08] cpu_notify_nofail+0x48/0xa0 [c000000deaf13ac0] [c0000000000b92f0] _cpu_down+0x2a0/0x400 [c000000deaf13b90] [c0000000000b94a8] cpu_down+0x58/0xa0 [c000000deaf13bc0] [c0000000006d5dcc] cpu_subsys_offline+0x2c/0x50 [c000000deaf13bf0] [c0000000006cd244] device_offline+0x104/0x140 [c000000deaf13c30] [c0000000006cd40c] online_store+0x6c/0xc0 [c000000deaf13c80] [c0000000006c8c78] dev_attr_store+0x68/0xa0 [c000000deaf13cc0] [c0000000003974d0] sysfs_kf_write+0x80/0xb0 [c000000deaf13d00] [c0000000003963e8] kernfs_fop_write+0x188/0x200 [c000000deaf13d50] [c0000000002e0f6c] __vfs_write+0x6c/0xe0 [c000000deaf13d90] [c0000000002e1ca0] vfs_write+0xc0/0x230 [c000000deaf13de0] [c0000000002e2cdc] SyS_write+0x6c/0x110 [c000000deaf13e30] [c000000000009204] system_call+0x38/0xb4 The fix is to allow the timeout work to execute in the window between dropping the initial refcount reference and the release of the last reference, which actually marks the freeze completion. This can be achieved with percpu_refcount_tryget, which does not require the counter to be alive. This way the timeout work can do it's job and terminate a stuck request even during a freeze, returning its reference and avoiding the deadlock. Allowing the timeout to run is just a part of the fix, since for some devices, we might get stuck again inside the device driver's timeout handler, should it attempt to allocate a new request in that path - which is a quite common action for Abort commands, which need to be sent after a timeout. In NVMe, for instance, we call blk_mq_alloc_request from inside the timeout handler, which will fail during a freeze, since it also tries to acquire a queue reference. I considered a similar change to blk_mq_alloc_request as a generic solution for further device driver hangs, but we can't do that, since it would allow new requests to disturb the freeze process. I thought about creating a new function in the block layer to support unfreezable requests for these occasions, but after working on it for a while, I feel like this should be handled in a per-driver basis. I'm now experimenting with changes to the NVMe timeout path, but I'm open to suggestions of ways to make this generic. Signed-off-by: Gabriel Krisman Bertazi Cc: Brian King Cc: Keith Busch Cc: linux-nvme@lists.infradead.org Cc: linux-block@vger.kernel.org Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-mq.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 576e7112f807..6a63da101bc4 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -672,7 +672,20 @@ static void blk_mq_timeout_work(struct work_struct *work) }; int i; - if (blk_queue_enter(q, true)) + /* A deadlock might occur if a request is stuck requiring a + * timeout at the same time a queue freeze is waiting + * completion, since the timeout code would not be able to + * acquire the queue reference here. + * + * That's why we don't use blk_queue_enter here; instead, we use + * percpu_ref_tryget directly, because we need to be able to + * obtain a reference even in the short window between the queue + * starting to freeze, by dropping the first reference in + * blk_mq_freeze_queue_start, and the moment the last request is + * consumed, marked by the instant q_usage_counter reaches + * zero. + */ + if (!percpu_ref_tryget(&q->q_usage_counter)) return; blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &data); -- cgit v1.2.3 From df08c32ce3be5be138c1dbfcba203314a3a7cd6f Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sun, 31 Jul 2016 11:15:13 -0700 Subject: block: fix bdi vs gendisk lifetime mismatch The name for a bdi of a gendisk is derived from the gendisk's devt. However, since the gendisk is destroyed before the bdi it leaves a window where a new gendisk could dynamically reuse the same devt while a bdi with the same name is still live. Arrange for the bdi to hold a reference against its "owner" disk device while it is registered. Otherwise we can hit sysfs duplicate name collisions like the following: WARNING: CPU: 10 PID: 2078 at fs/sysfs/dir.c:31 sysfs_warn_dup+0x64/0x80 sysfs: cannot create duplicate filename '/devices/virtual/bdi/259:1' Hardware name: HP ProLiant DL580 Gen8, BIOS P79 05/06/2015 0000000000000286 0000000002c04ad5 ffff88006f24f970 ffffffff8134caec ffff88006f24f9c0 0000000000000000 ffff88006f24f9b0 ffffffff8108c351 0000001f0000000c ffff88105d236000 ffff88105d1031e0 ffff8800357427f8 Call Trace: [] dump_stack+0x63/0x87 [] __warn+0xd1/0xf0 [] warn_slowpath_fmt+0x5f/0x80 [] sysfs_warn_dup+0x64/0x80 [] sysfs_create_dir_ns+0x7e/0x90 [] kobject_add_internal+0xaa/0x320 [] ? vsnprintf+0x34e/0x4d0 [] kobject_add+0x75/0xd0 [] ? mutex_lock+0x12/0x2f [] device_add+0x125/0x610 [] device_create_groups_vargs+0xd8/0x100 [] device_create_vargs+0x1c/0x20 [] bdi_register+0x8c/0x180 [] bdi_register_dev+0x27/0x30 [] add_disk+0x175/0x4a0 Cc: Reported-by: Yi Zhang Tested-by: Yi Zhang Signed-off-by: Dan Williams Fixed up missing 0 return in bdi_register_owner(). Signed-off-by: Jens Axboe --- block/genhd.c | 2 +- include/linux/backing-dev-defs.h | 1 + include/linux/backing-dev.h | 1 + mm/backing-dev.c | 19 +++++++++++++++++++ 4 files changed, 22 insertions(+), 1 deletion(-) diff --git a/block/genhd.c b/block/genhd.c index 0ad879655f82..fcd6d4fae657 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -614,7 +614,7 @@ void device_add_disk(struct device *parent, struct gendisk *disk) /* Register BDI before referencing it from bdev */ bdi = &disk->queue->backing_dev_info; - bdi_register_dev(bdi, disk_devt(disk)); + bdi_register_owner(bdi, disk_to_dev(disk)); blk_register_region(disk_devt(disk), disk->minors, NULL, exact_match, exact_lock, disk); diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index 3f103076d0bf..c357f27d5483 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -163,6 +163,7 @@ struct backing_dev_info { wait_queue_head_t wb_waitq; struct device *dev; + struct device *owner; struct timer_list laptop_mode_wb_timer; diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 491a91717788..43b93a947e61 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -24,6 +24,7 @@ __printf(3, 4) int bdi_register(struct backing_dev_info *bdi, struct device *parent, const char *fmt, ...); int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev); +int bdi_register_owner(struct backing_dev_info *bdi, struct device *owner); void bdi_unregister(struct backing_dev_info *bdi); int __must_check bdi_setup_and_register(struct backing_dev_info *, char *); diff --git a/mm/backing-dev.c b/mm/backing-dev.c index efe237742074..8fde443f36d7 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -825,6 +825,20 @@ int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev) } EXPORT_SYMBOL(bdi_register_dev); +int bdi_register_owner(struct backing_dev_info *bdi, struct device *owner) +{ + int rc; + + rc = bdi_register(bdi, NULL, "%u:%u", MAJOR(owner->devt), + MINOR(owner->devt)); + if (rc) + return rc; + bdi->owner = owner; + get_device(owner); + return 0; +} +EXPORT_SYMBOL(bdi_register_owner); + /* * Remove bdi from bdi_list, and ensure that it is no longer visible */ @@ -849,6 +863,11 @@ void bdi_unregister(struct backing_dev_info *bdi) device_unregister(bdi->dev); bdi->dev = NULL; } + + if (bdi->owner) { + put_device(bdi->owner); + bdi->owner = NULL; + } } void bdi_exit(struct backing_dev_info *bdi) -- cgit v1.2.3 From b571bc606e714e448b00920987d77b384a6a9570 Mon Sep 17 00:00:00 2001 From: Shaun Tancheff Date: Sat, 30 Jul 2016 16:45:48 -0500 Subject: Fixup direct bi_rw modifiers bi_rw should be using bio_set_op_attrs to set bi_rw. Signed-off-by: Shaun Tancheff Cc: Chris Mason Cc: Josef Bacik Cc: David Sterba Cc: Mike Christie Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- fs/btrfs/extent_io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 5850d79806f4..881eb4667051 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2049,7 +2049,7 @@ int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical, return -EIO; } bio->bi_bdev = dev->bdev; - bio->bi_rw = WRITE_SYNC; + bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_SYNC); bio_add_page(bio, page, length, pg_offset); if (btrfsic_submit_bio_wait(bio)) { -- cgit v1.2.3 From 6d25ec147e3a71858bed5439c92accd7f739a0a3 Mon Sep 17 00:00:00 2001 From: John Pittman Date: Mon, 1 Aug 2016 16:35:53 -0400 Subject: Include: blkdev: Removed duplicate 'struct request;' declaration. In include/linux/blkdev.h duplicate declarations of the request struct exist. Cleaned up by removing the second, unneeded declaration. Signed-off-by: John Pittman Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index adf33079771e..de7935961c27 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -47,7 +47,6 @@ struct pr_ops; */ #define BLKCG_MAX_POLS 2 -struct request; typedef void (rq_end_io_fn)(struct request *, int); #define BLK_RL_SYNCFULL (1U << 0) -- cgit v1.2.3 From c0f3fd2b387448d67ae83c4ce1cc69da375b9186 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 2 Aug 2016 08:45:44 -0600 Subject: blk-mq: fix deadlock in blk_mq_register_disk() error path If we fail registering any of the hardware queues, we call into blk_mq_unregister_disk() with the hotplug mutex already held. Since blk_mq_unregister_disk() attempts to acquire the same mutex, we end up in a less than happy place. Reported-by: Jinpu Wang Signed-off-by: Jens Axboe --- block/blk-mq-sysfs.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c index 4ea4dd8a1eed..fe822aa5b8e4 100644 --- a/block/blk-mq-sysfs.c +++ b/block/blk-mq-sysfs.c @@ -380,15 +380,13 @@ static int blk_mq_register_hctx(struct blk_mq_hw_ctx *hctx) return ret; } -void blk_mq_unregister_disk(struct gendisk *disk) +static void __blk_mq_unregister_disk(struct gendisk *disk) { struct request_queue *q = disk->queue; struct blk_mq_hw_ctx *hctx; struct blk_mq_ctx *ctx; int i, j; - blk_mq_disable_hotplug(); - queue_for_each_hw_ctx(q, hctx, i) { blk_mq_unregister_hctx(hctx); @@ -405,6 +403,12 @@ void blk_mq_unregister_disk(struct gendisk *disk) kobject_put(&disk_to_dev(disk)->kobj); q->mq_sysfs_init_done = false; +} + +void blk_mq_unregister_disk(struct gendisk *disk) +{ + blk_mq_disable_hotplug(); + __blk_mq_unregister_disk(disk); blk_mq_enable_hotplug(); } @@ -450,7 +454,7 @@ int blk_mq_register_disk(struct gendisk *disk) } if (ret) - blk_mq_unregister_disk(disk); + __blk_mq_unregister_disk(disk); else q->mq_sysfs_init_done = true; out: -- cgit v1.2.3 From f0225cacfe7e69ff3234a125aeb0f3d65077835c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 4 Aug 2016 16:10:00 +0200 Subject: loop: don't try to use AIO for discards Fix a fat-fingered conversion to the req_op accessors, and also use a switch statement to make it more obvious what is being checked. Signed-off-by: Christoph Hellwig Reported-by: Dave Chinner Fixes: c2df40 ("drivers: use req op accessor"); Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- drivers/block/loop.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 075377eee0c0..91c2c881cb49 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1659,11 +1659,15 @@ static int loop_queue_rq(struct blk_mq_hw_ctx *hctx, if (lo->lo_state != Lo_bound) return -EIO; - if (lo->use_dio && (req_op(cmd->rq) != REQ_OP_FLUSH || - req_op(cmd->rq) == REQ_OP_DISCARD)) - cmd->use_aio = true; - else + switch (req_op(cmd->rq)) { + case REQ_OP_FLUSH: + case REQ_OP_DISCARD: cmd->use_aio = false; + break; + default: + cmd->use_aio = lo->use_dio; + break; + } queue_kthread_work(&lo->worker, &cmd->work); -- cgit v1.2.3 From c1c87c2ba9ec06d8ba9e8a26c18c67a2ba9cd9c1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 4 Aug 2016 16:10:01 +0200 Subject: loop: make do_req_filebacked more robust Use a switch statement to iterate over the possible operations and error out if it's an incorrect one. Signed-off-by: Jens Axboe --- drivers/block/loop.c | 55 +++++++++++++++++++++------------------------------- 1 file changed, 22 insertions(+), 33 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 91c2c881cb49..c9f2107f7095 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -510,14 +510,10 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd, return 0; } - -static inline int lo_rw_simple(struct loop_device *lo, - struct request *rq, loff_t pos, bool rw) +static int do_req_filebacked(struct loop_device *lo, struct request *rq) { struct loop_cmd *cmd = blk_mq_rq_to_pdu(rq); - - if (cmd->use_aio) - return lo_rw_aio(lo, cmd, pos, rw); + loff_t pos = ((loff_t) blk_rq_pos(rq) << 9) + lo->lo_offset; /* * lo_write_simple and lo_read_simple should have been covered @@ -528,37 +524,30 @@ static inline int lo_rw_simple(struct loop_device *lo, * of the req at one time. And direct read IO doesn't need to * run flush_dcache_page(). */ - if (rw == WRITE) - return lo_write_simple(lo, rq, pos); - else - return lo_read_simple(lo, rq, pos); -} - -static int do_req_filebacked(struct loop_device *lo, struct request *rq) -{ - loff_t pos; - int ret; - - pos = ((loff_t) blk_rq_pos(rq) << 9) + lo->lo_offset; - - if (op_is_write(req_op(rq))) { - if (req_op(rq) == REQ_OP_FLUSH) - ret = lo_req_flush(lo, rq); - else if (req_op(rq) == REQ_OP_DISCARD) - ret = lo_discard(lo, rq, pos); - else if (lo->transfer) - ret = lo_write_transfer(lo, rq, pos); + switch (req_op(rq)) { + case REQ_OP_FLUSH: + return lo_req_flush(lo, rq); + case REQ_OP_DISCARD: + return lo_discard(lo, rq, pos); + case REQ_OP_WRITE: + if (lo->transfer) + return lo_write_transfer(lo, rq, pos); + else if (cmd->use_aio) + return lo_rw_aio(lo, cmd, pos, WRITE); else - ret = lo_rw_simple(lo, rq, pos, WRITE); - - } else { + return lo_write_simple(lo, rq, pos); + case REQ_OP_READ: if (lo->transfer) - ret = lo_read_transfer(lo, rq, pos); + return lo_read_transfer(lo, rq, pos); + else if (cmd->use_aio) + return lo_rw_aio(lo, cmd, pos, READ); else - ret = lo_rw_simple(lo, rq, pos, READ); + return lo_read_simple(lo, rq, pos); + default: + WARN_ON_ONCE(1); + return -EIO; + break; } - - return ret; } struct switch_request { -- cgit v1.2.3 From abf545484d31b68777a85c5c8f5b4bcde08283eb Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Thu, 4 Aug 2016 14:23:34 -0600 Subject: mm/block: convert rw_page users to bio op use The rw_page users were not converted to use bio/req ops. As a result bdev_write_page is not passing down REQ_OP_WRITE and the IOs will be sent down as reads. Signed-off-by: Mike Christie Fixes: 4e1b2d52a80d ("block, fs, drivers: remove REQ_OP compat defs and related code") Modified by me to: 1) Drop op_flags passing into ->rw_page(), as we don't use it. 2) Make op_is_write() and friends safe to use for !CONFIG_BLOCK Signed-off-by: Jens Axboe --- drivers/block/brd.c | 17 +++++++---------- drivers/block/zram/zram_drv.c | 28 +++++++++++++++------------- drivers/nvdimm/btt.c | 18 +++++++++--------- drivers/nvdimm/pmem.c | 12 ++++++------ fs/block_dev.c | 7 ++++--- fs/mpage.c | 2 +- include/linux/blk_types.h | 22 +++++++++++----------- include/linux/blkdev.h | 2 +- include/linux/fs.h | 3 ++- include/linux/pagemap.h | 2 +- mm/filemap.c | 6 +++--- 11 files changed, 60 insertions(+), 59 deletions(-) diff --git a/drivers/block/brd.c b/drivers/block/brd.c index 3022dad24071..3439b28cce8b 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -300,20 +300,20 @@ static void copy_from_brd(void *dst, struct brd_device *brd, * Process a single bvec of a bio. */ static int brd_do_bvec(struct brd_device *brd, struct page *page, - unsigned int len, unsigned int off, int rw, + unsigned int len, unsigned int off, int op, sector_t sector) { void *mem; int err = 0; - if (rw != READ) { + if (op_is_write(op)) { err = copy_to_brd_setup(brd, sector, len); if (err) goto out; } mem = kmap_atomic(page); - if (rw == READ) { + if (!op_is_write(op)) { copy_from_brd(mem + off, brd, sector, len); flush_dcache_page(page); } else { @@ -330,7 +330,6 @@ static blk_qc_t brd_make_request(struct request_queue *q, struct bio *bio) { struct block_device *bdev = bio->bi_bdev; struct brd_device *brd = bdev->bd_disk->private_data; - int rw; struct bio_vec bvec; sector_t sector; struct bvec_iter iter; @@ -347,14 +346,12 @@ static blk_qc_t brd_make_request(struct request_queue *q, struct bio *bio) goto out; } - rw = bio_data_dir(bio); - bio_for_each_segment(bvec, bio, iter) { unsigned int len = bvec.bv_len; int err; err = brd_do_bvec(brd, bvec.bv_page, len, - bvec.bv_offset, rw, sector); + bvec.bv_offset, bio_op(bio), sector); if (err) goto io_error; sector += len >> SECTOR_SHIFT; @@ -369,11 +366,11 @@ io_error: } static int brd_rw_page(struct block_device *bdev, sector_t sector, - struct page *page, int rw) + struct page *page, int op) { struct brd_device *brd = bdev->bd_disk->private_data; - int err = brd_do_bvec(brd, page, PAGE_SIZE, 0, rw, sector); - page_endio(page, rw & WRITE, err); + int err = brd_do_bvec(brd, page, PAGE_SIZE, 0, op, sector); + page_endio(page, op, err); return err; } diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 7454cf188c8e..ca29649c4b08 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -843,15 +843,15 @@ static void zram_bio_discard(struct zram *zram, u32 index, } static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index, - int offset, int rw) + int offset, int op) { unsigned long start_time = jiffies; int ret; - generic_start_io_acct(rw, bvec->bv_len >> SECTOR_SHIFT, + generic_start_io_acct(op, bvec->bv_len >> SECTOR_SHIFT, &zram->disk->part0); - if (rw == READ) { + if (!op_is_write(op)) { atomic64_inc(&zram->stats.num_reads); ret = zram_bvec_read(zram, bvec, index, offset); } else { @@ -859,10 +859,10 @@ static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index, ret = zram_bvec_write(zram, bvec, index, offset); } - generic_end_io_acct(rw, &zram->disk->part0, start_time); + generic_end_io_acct(op, &zram->disk->part0, start_time); if (unlikely(ret)) { - if (rw == READ) + if (!op_is_write(op)) atomic64_inc(&zram->stats.failed_reads); else atomic64_inc(&zram->stats.failed_writes); @@ -873,7 +873,7 @@ static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index, static void __zram_make_request(struct zram *zram, struct bio *bio) { - int offset, rw; + int offset; u32 index; struct bio_vec bvec; struct bvec_iter iter; @@ -888,7 +888,6 @@ static void __zram_make_request(struct zram *zram, struct bio *bio) return; } - rw = bio_data_dir(bio); bio_for_each_segment(bvec, bio, iter) { int max_transfer_size = PAGE_SIZE - offset; @@ -903,15 +902,18 @@ static void __zram_make_request(struct zram *zram, struct bio *bio) bv.bv_len = max_transfer_size; bv.bv_offset = bvec.bv_offset; - if (zram_bvec_rw(zram, &bv, index, offset, rw) < 0) + if (zram_bvec_rw(zram, &bv, index, offset, + bio_op(bio)) < 0) goto out; bv.bv_len = bvec.bv_len - max_transfer_size; bv.bv_offset += max_transfer_size; - if (zram_bvec_rw(zram, &bv, index + 1, 0, rw) < 0) + if (zram_bvec_rw(zram, &bv, index + 1, 0, + bio_op(bio)) < 0) goto out; } else - if (zram_bvec_rw(zram, &bvec, index, offset, rw) < 0) + if (zram_bvec_rw(zram, &bvec, index, offset, + bio_op(bio)) < 0) goto out; update_position(&index, &offset, &bvec); @@ -968,7 +970,7 @@ static void zram_slot_free_notify(struct block_device *bdev, } static int zram_rw_page(struct block_device *bdev, sector_t sector, - struct page *page, int rw) + struct page *page, int op) { int offset, err = -EIO; u32 index; @@ -992,7 +994,7 @@ static int zram_rw_page(struct block_device *bdev, sector_t sector, bv.bv_len = PAGE_SIZE; bv.bv_offset = 0; - err = zram_bvec_rw(zram, &bv, index, offset, rw); + err = zram_bvec_rw(zram, &bv, index, offset, op); put_zram: zram_meta_put(zram); out: @@ -1005,7 +1007,7 @@ out: * (e.g., SetPageError, set_page_dirty and extra works). */ if (err == 0) - page_endio(page, rw, 0); + page_endio(page, op, 0); return err; } diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c index 9dce03f420eb..7cf3bdfaf809 100644 --- a/drivers/nvdimm/btt.c +++ b/drivers/nvdimm/btt.c @@ -1133,11 +1133,11 @@ static int btt_write_pg(struct btt *btt, struct bio_integrity_payload *bip, static int btt_do_bvec(struct btt *btt, struct bio_integrity_payload *bip, struct page *page, unsigned int len, unsigned int off, - int rw, sector_t sector) + int op, sector_t sector) { int ret; - if (rw == READ) { + if (!op_is_write(op)) { ret = btt_read_pg(btt, bip, page, off, sector, len); flush_dcache_page(page); } else { @@ -1155,7 +1155,7 @@ static blk_qc_t btt_make_request(struct request_queue *q, struct bio *bio) struct bvec_iter iter; unsigned long start; struct bio_vec bvec; - int err = 0, rw; + int err = 0; bool do_acct; /* @@ -1170,7 +1170,6 @@ static blk_qc_t btt_make_request(struct request_queue *q, struct bio *bio) } do_acct = nd_iostat_start(bio, &start); - rw = bio_data_dir(bio); bio_for_each_segment(bvec, bio, iter) { unsigned int len = bvec.bv_len; @@ -1181,11 +1180,12 @@ static blk_qc_t btt_make_request(struct request_queue *q, struct bio *bio) BUG_ON(len % btt->sector_size); err = btt_do_bvec(btt, bip, bvec.bv_page, len, bvec.bv_offset, - rw, iter.bi_sector); + bio_op(bio), iter.bi_sector); if (err) { dev_info(&btt->nd_btt->dev, "io error in %s sector %lld, len %d,\n", - (rw == READ) ? "READ" : "WRITE", + (op_is_write(bio_op(bio))) ? "WRITE" : + "READ", (unsigned long long) iter.bi_sector, len); bio->bi_error = err; break; @@ -1200,12 +1200,12 @@ out: } static int btt_rw_page(struct block_device *bdev, sector_t sector, - struct page *page, int rw) + struct page *page, int op) { struct btt *btt = bdev->bd_disk->private_data; - btt_do_bvec(btt, NULL, page, PAGE_SIZE, 0, rw, sector); - page_endio(page, rw & WRITE, 0); + btt_do_bvec(btt, NULL, page, PAGE_SIZE, 0, op, sector); + page_endio(page, op, 0); return 0; } diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index b511099457db..d64d92481c1d 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -67,7 +67,7 @@ static void pmem_clear_poison(struct pmem_device *pmem, phys_addr_t offset, } static int pmem_do_bvec(struct pmem_device *pmem, struct page *page, - unsigned int len, unsigned int off, int rw, + unsigned int len, unsigned int off, int op, sector_t sector) { int rc = 0; @@ -79,7 +79,7 @@ static int pmem_do_bvec(struct pmem_device *pmem, struct page *page, if (unlikely(is_bad_pmem(&pmem->bb, sector, len))) bad_pmem = true; - if (rw == READ) { + if (!op_is_write(op)) { if (unlikely(bad_pmem)) rc = -EIO; else { @@ -134,7 +134,7 @@ static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio) do_acct = nd_iostat_start(bio, &start); bio_for_each_segment(bvec, bio, iter) { rc = pmem_do_bvec(pmem, bvec.bv_page, bvec.bv_len, - bvec.bv_offset, bio_data_dir(bio), + bvec.bv_offset, bio_op(bio), iter.bi_sector); if (rc) { bio->bi_error = rc; @@ -152,12 +152,12 @@ static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio) } static int pmem_rw_page(struct block_device *bdev, sector_t sector, - struct page *page, int rw) + struct page *page, int op) { struct pmem_device *pmem = bdev->bd_queue->queuedata; int rc; - rc = pmem_do_bvec(pmem, page, PAGE_SIZE, 0, rw, sector); + rc = pmem_do_bvec(pmem, page, PAGE_SIZE, 0, op, sector); /* * The ->rw_page interface is subtle and tricky. The core @@ -166,7 +166,7 @@ static int pmem_rw_page(struct block_device *bdev, sector_t sector, * caused by double completion. */ if (rc == 0) - page_endio(page, rw & WRITE, 0); + page_endio(page, op, 0); return rc; } diff --git a/fs/block_dev.c b/fs/block_dev.c index 2033a3f91d58..d402899ba135 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -416,7 +416,8 @@ int bdev_read_page(struct block_device *bdev, sector_t sector, result = blk_queue_enter(bdev->bd_queue, false); if (result) return result; - result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, READ); + result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, + REQ_OP_READ); blk_queue_exit(bdev->bd_queue); return result; } @@ -445,7 +446,6 @@ int bdev_write_page(struct block_device *bdev, sector_t sector, struct page *page, struct writeback_control *wbc) { int result; - int rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE; const struct block_device_operations *ops = bdev->bd_disk->fops; if (!ops->rw_page || bdev_get_integrity(bdev)) @@ -455,7 +455,8 @@ int bdev_write_page(struct block_device *bdev, sector_t sector, return result; set_page_writeback(page); - result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, rw); + result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, + REQ_OP_WRITE); if (result) end_page_writeback(page); else diff --git a/fs/mpage.c b/fs/mpage.c index 2ca1f39c8cba..7a09c55b4bd0 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -50,7 +50,7 @@ static void mpage_end_io(struct bio *bio) bio_for_each_segment_all(bv, bio, i) { struct page *page = bv->bv_page; - page_endio(page, bio_data_dir(bio), bio->bi_error); + page_endio(page, bio_op(bio), bio->bi_error); } bio_put(bio); diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index f254eb264924..14b28ff2caf8 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -18,6 +18,17 @@ struct cgroup_subsys_state; typedef void (bio_end_io_t) (struct bio *); typedef void (bio_destructor_t) (struct bio *); +enum req_op { + REQ_OP_READ, + REQ_OP_WRITE, + REQ_OP_DISCARD, /* request to discard sectors */ + REQ_OP_SECURE_ERASE, /* request to securely erase sectors */ + REQ_OP_WRITE_SAME, /* write same block many times */ + REQ_OP_FLUSH, /* request for cache flush */ +}; + +#define REQ_OP_BITS 3 + #ifdef CONFIG_BLOCK /* * main unit of I/O for the block layer and lower layers (ie drivers and @@ -228,17 +239,6 @@ enum rq_flag_bits { #define REQ_HASHED (1ULL << __REQ_HASHED) #define REQ_MQ_INFLIGHT (1ULL << __REQ_MQ_INFLIGHT) -enum req_op { - REQ_OP_READ, - REQ_OP_WRITE, - REQ_OP_DISCARD, /* request to discard sectors */ - REQ_OP_SECURE_ERASE, /* request to securely erase sectors */ - REQ_OP_WRITE_SAME, /* write same block many times */ - REQ_OP_FLUSH, /* request for cache flush */ -}; - -#define REQ_OP_BITS 3 - typedef unsigned int blk_qc_t; #define BLK_QC_T_NONE -1U #define BLK_QC_T_SHIFT 16 diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index de7935961c27..ccd68c0d01de 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1672,7 +1672,7 @@ struct blk_dax_ctl { struct block_device_operations { int (*open) (struct block_device *, fmode_t); void (*release) (struct gendisk *, fmode_t); - int (*rw_page)(struct block_device *, sector_t, struct page *, int rw); + int (*rw_page)(struct block_device *, sector_t, struct page *, int op); int (*ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); long (*direct_access)(struct block_device *, sector_t, void **, pfn_t *, diff --git a/include/linux/fs.h b/include/linux/fs.h index f3f0b4c8e8ac..498255e6914e 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2480,12 +2480,13 @@ extern void init_special_inode(struct inode *, umode_t, dev_t); extern void make_bad_inode(struct inode *); extern bool is_bad_inode(struct inode *); -#ifdef CONFIG_BLOCK static inline bool op_is_write(unsigned int op) { return op == REQ_OP_READ ? false : true; } +#ifdef CONFIG_BLOCK + /* * return data direction, READ or WRITE */ diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 81363b834900..45786374abbd 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -510,7 +510,7 @@ static inline void wait_on_page_writeback(struct page *page) extern void end_page_writeback(struct page *page); void wait_for_stable_page(struct page *page); -void page_endio(struct page *page, int rw, int err); +void page_endio(struct page *page, int op, int err); /* * Add an arbitrary waiter to a page's wait queue diff --git a/mm/filemap.c b/mm/filemap.c index 3083ded98b15..daef091d4c50 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -887,9 +887,9 @@ EXPORT_SYMBOL(end_page_writeback); * After completing I/O on a page, call this routine to update the page * flags appropriately */ -void page_endio(struct page *page, int rw, int err) +void page_endio(struct page *page, int op, int err) { - if (rw == READ) { + if (!op_is_write(op)) { if (!err) { SetPageUptodate(page); } else { @@ -897,7 +897,7 @@ void page_endio(struct page *page, int rw, int err) SetPageError(page); } unlock_page(page); - } else { /* rw == WRITE */ + } else { if (err) { SetPageError(page); if (page->mapping) -- cgit v1.2.3