From b817525a4a80c04e4ca44192d97a1ffa9f2be572 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 2 Oct 2015 14:47:05 -0400 Subject: writeback: bdi_writeback iteration must not skip dying ones bdi_for_each_wb() is used in several places to wake up or issue writeback work items to all wb's (bdi_writeback's) on a given bdi. The iteration is performed by walking bdi->cgwb_tree; however, the tree only indexes wb's which are currently active. For example, when a memcg gets associated with a different blkcg, the old wb is removed from the tree so that the new one can be indexed. The old wb starts dying from then on but will linger till all its inodes are drained. As these dying wb's may still host dirty inodes, writeback operations which affect all wb's must include them. bdi_for_each_wb() skipping dying wb's led to sync(2) missing and failing to sync the inodes belonging to those wb's. This patch adds a RCU protected @bdi->wb_list which lists all wb's beloinging to that bdi. wb's are added on creation and removed on release rather than on the start of destruction. bdi_for_each_wb() usages are replaced with list_for_each[_continue]_rcu() iterations over @bdi->wb_list and bdi_for_each_wb() and its helpers are removed. v2: Updated as per Jan. last_wb ref leak in bdi_split_work_to_wbs() fixed and unnecessary list head severing in cgwb_bdi_destroy() removed. Signed-off-by: Tejun Heo Reported-and-tested-by: Artem Bityutskiy Fixes: ebe41ab0c79d ("writeback: implement bdi_for_each_wb()") Link: http://lkml.kernel.org/g/1443012552.19983.209.camel@gmail.com Cc: Jan Kara Signed-off-by: Jens Axboe --- mm/backing-dev.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'mm/backing-dev.c') diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 2df8ddcb0ca0..e92d77937fd3 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -480,6 +480,10 @@ static void cgwb_release_workfn(struct work_struct *work) release_work); struct backing_dev_info *bdi = wb->bdi; + spin_lock_irq(&cgwb_lock); + list_del_rcu(&wb->bdi_node); + spin_unlock_irq(&cgwb_lock); + wb_shutdown(wb); css_put(wb->memcg_css); @@ -575,6 +579,7 @@ static int cgwb_create(struct backing_dev_info *bdi, ret = radix_tree_insert(&bdi->cgwb_tree, memcg_css->id, wb); if (!ret) { atomic_inc(&bdi->usage_cnt); + list_add_tail_rcu(&wb->bdi_node, &bdi->wb_list); list_add(&wb->memcg_node, memcg_cgwb_list); list_add(&wb->blkcg_node, blkcg_cgwb_list); css_get(memcg_css); @@ -764,15 +769,22 @@ static void cgwb_bdi_destroy(struct backing_dev_info *bdi) { } int bdi_init(struct backing_dev_info *bdi) { + int ret; + bdi->dev = NULL; bdi->min_ratio = 0; bdi->max_ratio = 100; bdi->max_prop_frac = FPROP_FRAC_BASE; INIT_LIST_HEAD(&bdi->bdi_list); + INIT_LIST_HEAD(&bdi->wb_list); init_waitqueue_head(&bdi->wb_waitq); - return cgwb_bdi_init(bdi); + ret = cgwb_bdi_init(bdi); + + list_add_tail_rcu(&bdi->wb.bdi_node, &bdi->wb_list); + + return ret; } EXPORT_SYMBOL(bdi_init); -- cgit v1.2.3 From b02176f30cd30acccd3b633ab7d9aed8b5da52ff Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 8 Sep 2015 12:20:22 -0400 Subject: block: don't release bdi while request_queue has live references bdi's are initialized in two steps, bdi_init() and bdi_register(), but destroyed in a single step by bdi_destroy() which, for a bdi embedded in a request_queue, is called during blk_cleanup_queue() which makes the queue invisible and starts the draining of remaining usages. A request_queue's user can access the congestion state of the embedded bdi as long as it holds a reference to the queue. As such, it may access the congested state of a queue which finished blk_cleanup_queue() but hasn't reached blk_release_queue() yet. Because the congested state was embedded in backing_dev_info which in turn is embedded in request_queue, accessing the congested state after bdi_destroy() was called was fine. The bdi was destroyed but the memory region for the congested state remained accessible till the queue got released. a13f35e87140 ("writeback: don't embed root bdi_writeback_congested in bdi_writeback") changed the situation. Now, the root congested state which is expected to be pinned while request_queue remains accessible is separately reference counted and the base ref is put during bdi_destroy(). This means that the root congested state may go away prematurely while the queue is between bdi_dstroy() and blk_cleanup_queue(), which was detected by Andrey's KASAN tests. The root cause of this problem is that bdi doesn't distinguish the two steps of destruction, unregistration and release, and now the root congested state actually requires a separate release step. To fix the issue, this patch separates out bdi_unregister() and bdi_exit() from bdi_destroy(). bdi_unregister() is called from blk_cleanup_queue() and bdi_exit() from blk_release_queue(). bdi_destroy() is now just a simple wrapper calling the two steps back-to-back. While at it, the prototype of bdi_destroy() is moved right below bdi_setup_and_register() so that the counterpart operations are located together. Signed-off-by: Tejun Heo Fixes: a13f35e87140 ("writeback: don't embed root bdi_writeback_congested in bdi_writeback") Cc: stable@vger.kernel.org # v4.2+ Reported-and-tested-by: Andrey Konovalov Link: http://lkml.kernel.org/g/CAAeHK+zUJ74Zn17=rOyxacHU18SgCfC6bsYW=6kCY5GXJBwGfQ@mail.gmail.com Reviewed-by: Jan Kara Reviewed-by: Jeff Moyer Signed-off-by: Jens Axboe --- block/blk-core.c | 2 +- block/blk-sysfs.c | 1 + include/linux/backing-dev.h | 6 +++++- mm/backing-dev.c | 12 +++++++++++- 4 files changed, 18 insertions(+), 3 deletions(-) (limited to 'mm/backing-dev.c') diff --git a/block/blk-core.c b/block/blk-core.c index 2eb722d48773..18e92a6645e2 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -576,7 +576,7 @@ void blk_cleanup_queue(struct request_queue *q) q->queue_lock = &q->__queue_lock; spin_unlock_irq(lock); - bdi_destroy(&q->backing_dev_info); + bdi_unregister(&q->backing_dev_info); /* @q is and will stay empty, shutdown and put */ blk_put_queue(q); diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 3e44a9da2a13..07b42f5ad797 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -540,6 +540,7 @@ static void blk_release_queue(struct kobject *kobj) struct request_queue *q = container_of(kobj, struct request_queue, kobj); + bdi_exit(&q->backing_dev_info); blkcg_exit_queue(q); if (q->elevator) { diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 78677e5a65bf..c85f74946a8b 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -19,13 +19,17 @@ #include int __must_check bdi_init(struct backing_dev_info *bdi); -void bdi_destroy(struct backing_dev_info *bdi); +void bdi_exit(struct backing_dev_info *bdi); __printf(3, 4) int bdi_register(struct backing_dev_info *bdi, struct device *parent, const char *fmt, ...); int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev); +void bdi_unregister(struct backing_dev_info *bdi); + int __must_check bdi_setup_and_register(struct backing_dev_info *, char *); +void bdi_destroy(struct backing_dev_info *bdi); + void wb_start_writeback(struct bdi_writeback *wb, long nr_pages, bool range_cyclic, enum wb_reason reason); void wb_start_background_writeback(struct bdi_writeback *wb); diff --git a/mm/backing-dev.c b/mm/backing-dev.c index e92d77937fd3..9e841399041a 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -835,7 +835,7 @@ static void bdi_remove_from_list(struct backing_dev_info *bdi) synchronize_rcu_expedited(); } -void bdi_destroy(struct backing_dev_info *bdi) +void bdi_unregister(struct backing_dev_info *bdi) { /* make sure nobody finds us on the bdi_list anymore */ bdi_remove_from_list(bdi); @@ -847,9 +847,19 @@ void bdi_destroy(struct backing_dev_info *bdi) device_unregister(bdi->dev); bdi->dev = NULL; } +} +void bdi_exit(struct backing_dev_info *bdi) +{ + WARN_ON_ONCE(bdi->dev); wb_exit(&bdi->wb); } + +void bdi_destroy(struct backing_dev_info *bdi) +{ + bdi_unregister(bdi); + bdi_exit(bdi); +} EXPORT_SYMBOL(bdi_destroy); /* -- cgit v1.2.3 From e27c5b9d23168cc2cb8fec147ae7ed1f7a2005c3 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 Oct 2015 18:14:19 -0400 Subject: writeback: remove broken rbtree_postorder_for_each_entry_safe() usage in cgwb_bdi_destroy() a20135ffbc44 ("writeback: don't drain bdi_writeback_congested on bdi destruction") added rbtree_postorder_for_each_entry_safe() which is used to remove all entries; however, according to Cody, the iterator isn't safe against operations which may rebalance the tree. Fix it by switching to repeatedly removing rb_first() until empty. Signed-off-by: Tejun Heo Reported-by: Cody P Schafer Fixes: a20135ffbc44 ("writeback: don't drain bdi_writeback_congested on bdi destruction") Link: http://lkml.kernel.org/g/1443997973-1700-1-git-send-email-dev@codyps.com Signed-off-by: Jens Axboe --- mm/backing-dev.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'mm/backing-dev.c') diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 9e841399041a..619984fc07ec 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -681,7 +681,7 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi) static void cgwb_bdi_destroy(struct backing_dev_info *bdi) { struct radix_tree_iter iter; - struct bdi_writeback_congested *congested, *congested_n; + struct rb_node *rbn; void **slot; WARN_ON(test_bit(WB_registered, &bdi->wb.state)); @@ -691,9 +691,11 @@ static void cgwb_bdi_destroy(struct backing_dev_info *bdi) radix_tree_for_each_slot(slot, &bdi->cgwb_tree, &iter, 0) cgwb_kill(*slot); - rbtree_postorder_for_each_entry_safe(congested, congested_n, - &bdi->cgwb_congested_tree, rb_node) { - rb_erase(&congested->rb_node, &bdi->cgwb_congested_tree); + while ((rbn = rb_first(&bdi->cgwb_congested_tree))) { + struct bdi_writeback_congested *congested = + rb_entry(rbn, struct bdi_writeback_congested, rb_node); + + rb_erase(rbn, &bdi->cgwb_congested_tree); congested->bdi = NULL; /* mark @congested unlinked */ } -- cgit v1.2.3