From b9a1ff504b9492ad6beb7d5606e0e3365d4d8499 Mon Sep 17 00:00:00 2001 From: Shenghui Wang Date: Mon, 1 Apr 2019 21:40:36 +0800 Subject: block: use blk_free_flush_queue() to free hctx->fq in blk_mq_init_hctx kfree() can leak the hctx->fq->flush_rq field. Reviewed-by: Ming Lei Signed-off-by: Shenghui Wang Signed-off-by: Jens Axboe --- block/blk-mq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 3ff3d7b49969..f3b0d33bdf88 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2332,7 +2332,7 @@ static int blk_mq_init_hctx(struct request_queue *q, return 0; free_fq: - kfree(hctx->fq); + blk_free_flush_queue(hctx->fq); exit_hctx: if (set->ops->exit_hctx) set->ops->exit_hctx(hctx, hctx_idx); -- cgit v1.2.3 From ff3b74b8e1675c802e09157a56c97ca38a659b9d Mon Sep 17 00:00:00 2001 From: Yufen Yu Date: Tue, 26 Mar 2019 21:19:25 +0800 Subject: blk-mq: add trace block plug and unplug for multiple queues For now, we only trace plug for single-queue devices or for drivers that provide .commit_rqs, and do not trace plug for multiple-queue devices. However, unplug events are still recorded when blk_mq_flush_plug_list() is called, so the trace events are asymmetrical: an unplug is logged without a matching plug. This patch adds plug and unplug tracing for multiple-queue devices in blk_mq_make_request(). After that, we can accurately trace plug and unplug for multiple queues. Reviewed-by: Christoph Hellwig Signed-off-by: Yufen Yu Signed-off-by: Jens Axboe --- block/blk-mq.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index f3b0d33bdf88..22074a1e37cd 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2003,11 +2003,13 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) plug->rq_count--; } blk_add_rq_to_plug(plug, rq); + trace_block_plug(q); blk_mq_put_ctx(data.ctx); if (same_queue_rq) { data.hctx = same_queue_rq->mq_hctx; + trace_block_unplug(q, 1, true); blk_mq_try_issue_directly(data.hctx, same_queue_rq, &cookie, false, true); } -- cgit v1.2.3 From bcc816dfe51ab86ca94663c7b225f2d6eb0fddb9 Mon Sep 17 00:00:00 2001 From: Dongli Zhang Date: Thu, 4 Apr 2019 10:57:44 +0800 Subject: blk-mq: do not reset plug->rq_count before the list is sorted We would never be able to sort the list if we first reset plug->rq_count, which is used in the conditional check later. Fixes: ce5b009cff19 ("block: improve logic around when to sort a plug list") Reviewed-by: Ming Lei Signed-off-by: Dongli Zhang Signed-off-by: Jens Axboe --- block/blk-mq.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 22074a1e37cd..ac61bcc67a89 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1711,11 +1711,12 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) unsigned int depth; list_splice_init(&plug->mq_list, &list); - plug->rq_count = 0; if (plug->rq_count > 2 && plug->multiple_queues) list_sort(NULL, &list, plug_rq_cmp); + plug->rq_count = 0; + this_q = NULL; this_hctx = NULL; this_ctx = NULL; -- cgit v1.2.3 From fd9c40f64c514bdc585a21e2e33fa5f83ca8811b Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 4 Apr 2019 10:08:43 -0700 Subject: block: Revert v5.0 blk_mq_request_issue_directly() changes blk_mq_try_issue_directly() can return BLK_STS*_RESOURCE for requests that have been queued.
If that happens when blk_mq_try_issue_directly() is called by the dm-mpath driver then dm-mpath will try to resubmit a request that is already queued and a kernel crash follows. Since it is nontrivial to fix blk_mq_request_issue_directly(), revert the blk_mq_request_issue_directly() changes that went into kernel v5.0. This patch reverts the following commits: * d6a51a97c0b2 ("blk-mq: replace and kill blk_mq_request_issue_directly") # v5.0. * 5b7a6f128aad ("blk-mq: issue directly with bypass 'false' in blk_mq_sched_insert_requests") # v5.0. * 7f556a44e61d ("blk-mq: refactor the code of issue request directly") # v5.0. Cc: Christoph Hellwig Cc: Ming Lei Cc: Jianchao Wang Cc: Hannes Reinecke Cc: Johannes Thumshirn Cc: James Smart Cc: Dongli Zhang Cc: Laurence Oberman Cc: Reported-by: Laurence Oberman Tested-by: Laurence Oberman Fixes: 7f556a44e61d ("blk-mq: refactor the code of issue request directly") # v5.0. Signed-off-by: Bart Van Assche Signed-off-by: Jens Axboe --- block/blk-core.c | 4 +- block/blk-mq-sched.c | 8 ++-- block/blk-mq.c | 122 ++++++++++++++++++++++++++------------------------- block/blk-mq.h | 6 +-- 4 files changed, 71 insertions(+), 69 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 4673ebe42255..a55389ba8779 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1245,8 +1245,6 @@ static int blk_cloned_rq_check_limits(struct request_queue *q, */ blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *rq) { - blk_qc_t unused; - if (blk_cloned_rq_check_limits(q, rq)) return BLK_STS_IOERR; @@ -1262,7 +1260,7 @@ blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request * * bypass a potential scheduler on the bottom device for * insert. */ - return blk_mq_try_issue_directly(rq->mq_hctx, rq, &unused, true, true); + return blk_mq_request_issue_directly(rq, true); } EXPORT_SYMBOL_GPL(blk_insert_cloned_request); diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 40905539afed..aa6bc5c02643 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -423,10 +423,12 @@ void blk_mq_sched_insert_requests(struct blk_mq_hw_ctx *hctx, * busy in case of 'none' scheduler, and this way may save * us one extra enqueue & dequeue to sw queue. */ - if (!hctx->dispatch_busy && !e && !run_queue_async) + if (!hctx->dispatch_busy && !e && !run_queue_async) { blk_mq_try_issue_list_directly(hctx, list); - else - blk_mq_insert_requests(hctx, ctx, list); + if (list_empty(list)) + return; + } + blk_mq_insert_requests(hctx, ctx, list); } blk_mq_run_hw_queue(hctx, run_queue_async); diff --git a/block/blk-mq.c b/block/blk-mq.c index ac61bcc67a89..a9354835cf51 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1801,74 +1801,76 @@ static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx, return ret; } -blk_status_t blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, +static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, struct request *rq, blk_qc_t *cookie, - bool bypass, bool last) + bool bypass_insert, bool last) { struct request_queue *q = rq->q; bool run_queue = true; - blk_status_t ret = BLK_STS_RESOURCE; - int srcu_idx; - bool force = false; - hctx_lock(hctx, &srcu_idx); /* - * hctx_lock is needed before checking quiesced flag. + * RCU or SRCU read lock is needed before checking quiesced flag. * - * When queue is stopped or quiesced, ignore 'bypass', insert - * and return BLK_STS_OK to caller, and avoid driver to try to - * dispatch again. 
+ * When queue is stopped or quiesced, ignore 'bypass_insert' from + * blk_mq_request_issue_directly(), and return BLK_STS_OK to caller, + * and avoid driver to try to dispatch again. */ - if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q))) { + if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)) { run_queue = false; - bypass = false; - goto out_unlock; + bypass_insert = false; + goto insert; } - if (unlikely(q->elevator && !bypass)) - goto out_unlock; + if (q->elevator && !bypass_insert) + goto insert; if (!blk_mq_get_dispatch_budget(hctx)) - goto out_unlock; + goto insert; if (!blk_mq_get_driver_tag(rq)) { blk_mq_put_dispatch_budget(hctx); - goto out_unlock; + goto insert; } - /* - * Always add a request that has been through - *.queue_rq() to the hardware dispatch list. - */ - force = true; - ret = __blk_mq_issue_directly(hctx, rq, cookie, last); -out_unlock: + return __blk_mq_issue_directly(hctx, rq, cookie, last); +insert: + if (bypass_insert) + return BLK_STS_RESOURCE; + + blk_mq_request_bypass_insert(rq, run_queue); + return BLK_STS_OK; +} + +static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, + struct request *rq, blk_qc_t *cookie) +{ + blk_status_t ret; + int srcu_idx; + + might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING); + + hctx_lock(hctx, &srcu_idx); + + ret = __blk_mq_try_issue_directly(hctx, rq, cookie, false, true); + if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) + blk_mq_request_bypass_insert(rq, true); + else if (ret != BLK_STS_OK) + blk_mq_end_request(rq, ret); + + hctx_unlock(hctx, srcu_idx); +} + +blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last) +{ + blk_status_t ret; + int srcu_idx; + blk_qc_t unused_cookie; + struct blk_mq_hw_ctx *hctx = rq->mq_hctx; + + hctx_lock(hctx, &srcu_idx); + ret = __blk_mq_try_issue_directly(hctx, rq, &unused_cookie, true, last); hctx_unlock(hctx, srcu_idx); - switch (ret) { - case BLK_STS_OK: - break; - case BLK_STS_DEV_RESOURCE: - case BLK_STS_RESOURCE: - if (force) { - blk_mq_request_bypass_insert(rq, run_queue); - /* - * We have to return BLK_STS_OK for the DM - * to avoid livelock. Otherwise, we return - * the real result to indicate whether the - * request is direct-issued successfully. - */ - ret = bypass ? BLK_STS_OK : ret; - } else if (!bypass) { - blk_mq_sched_insert_request(rq, false, - run_queue, false); - } - break; - default: - if (!bypass) - blk_mq_end_request(rq, ret); - break; - } return ret; } @@ -1876,20 +1878,22 @@ out_unlock: void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx, struct list_head *list) { - blk_qc_t unused; - blk_status_t ret = BLK_STS_OK; - while (!list_empty(list)) { + blk_status_t ret; struct request *rq = list_first_entry(list, struct request, queuelist); list_del_init(&rq->queuelist); - if (ret == BLK_STS_OK) - ret = blk_mq_try_issue_directly(hctx, rq, &unused, - false, + ret = blk_mq_request_issue_directly(rq, list_empty(list)); + if (ret != BLK_STS_OK) { + if (ret == BLK_STS_RESOURCE || + ret == BLK_STS_DEV_RESOURCE) { + blk_mq_request_bypass_insert(rq, list_empty(list)); - else - blk_mq_sched_insert_request(rq, false, true, false); + break; + } + blk_mq_end_request(rq, ret); + } } /* @@ -1897,7 +1901,7 @@ void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx, * the driver there was more coming, but that turned out to * be a lie. 
*/ - if (ret != BLK_STS_OK && hctx->queue->mq_ops->commit_rqs) + if (!list_empty(list) && hctx->queue->mq_ops->commit_rqs) hctx->queue->mq_ops->commit_rqs(hctx); } @@ -2012,13 +2016,13 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) data.hctx = same_queue_rq->mq_hctx; trace_block_unplug(q, 1, true); blk_mq_try_issue_directly(data.hctx, same_queue_rq, - &cookie, false, true); + &cookie); } } else if ((q->nr_hw_queues > 1 && is_sync) || (!q->elevator && !data.hctx->dispatch_busy)) { blk_mq_put_ctx(data.ctx); blk_mq_bio_to_request(rq, bio); - blk_mq_try_issue_directly(data.hctx, rq, &cookie, false, true); + blk_mq_try_issue_directly(data.hctx, rq, &cookie); } else { blk_mq_put_ctx(data.ctx); blk_mq_bio_to_request(rq, bio); diff --git a/block/blk-mq.h b/block/blk-mq.h index d704fc7766f4..423ea88ab6fb 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -70,10 +70,8 @@ void blk_mq_request_bypass_insert(struct request *rq, bool run_queue); void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, struct list_head *list); -blk_status_t blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, - struct request *rq, - blk_qc_t *cookie, - bool bypass, bool last); +/* Used by blk_insert_cloned_request() to issue request directly */ +blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last); void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx, struct list_head *list); -- cgit v1.2.3 From eed47d19d9362bdd958e4ab56af480b9dbf6b2b6 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Wed, 10 Apr 2019 10:38:33 +0200 Subject: block, bfq: fix use after free in bfq_bfqq_expire The function bfq_bfqq_expire() invokes the function __bfq_bfqq_expire(), and the latter may free the in-service bfq-queue. If this happens, then no other instruction of bfq_bfqq_expire() must be executed, or a use-after-free will occur. Basing on the assumption that __bfq_bfqq_expire() invokes bfq_put_queue() on the in-service bfq-queue exactly once, the queue is assumed to be freed if its refcounter is equal to one right before invoking __bfq_bfqq_expire(). But, since commit 9dee8b3b057e ("block, bfq: fix queue removal from weights tree") this assumption is false. __bfq_bfqq_expire() may also invoke bfq_weights_tree_remove() and, since commit 9dee8b3b057e ("block, bfq: fix queue removal from weights tree"), also the latter function may invoke bfq_put_queue(). So __bfq_bfqq_expire() may invoke bfq_put_queue() twice, and this is the actual case where the in-service queue may happen to be freed. To address this issue, this commit moves the check on the refcounter of the queue right around the last bfq_put_queue() that may be invoked on the queue. 
Fixes: 9dee8b3b057e ("block, bfq: fix queue removal from weights tree") Reported-by: Dmitrii Tcvetkov Reported-by: Douglas Anderson Tested-by: Dmitrii Tcvetkov Tested-by: Douglas Anderson Signed-off-by: Paolo Valente Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 15 +++++++-------- block/bfq-iosched.h | 2 +- block/bfq-wf2q.c | 17 +++++++++++++++-- 3 files changed, 23 insertions(+), 11 deletions(-) (limited to 'block') diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index fac188dd78fa..dfb8cb0af13a 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -2822,7 +2822,7 @@ static void bfq_dispatch_remove(struct request_queue *q, struct request *rq) bfq_remove_request(q, rq); } -static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq) +static bool __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq) { /* * If this bfqq is shared between multiple processes, check @@ -2855,9 +2855,11 @@ static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq) /* * All in-service entities must have been properly deactivated * or requeued before executing the next function, which - * resets all in-service entites as no more in service. + * resets all in-service entities as no more in service. This + * may cause bfqq to be freed. If this happens, the next + * function returns true. */ - __bfq_bfqd_reset_in_service(bfqd); + return __bfq_bfqd_reset_in_service(bfqd); } /** @@ -3262,7 +3264,6 @@ void bfq_bfqq_expire(struct bfq_data *bfqd, bool slow; unsigned long delta = 0; struct bfq_entity *entity = &bfqq->entity; - int ref; /* * Check whether the process is slow (see bfq_bfqq_is_slow). @@ -3347,10 +3348,8 @@ void bfq_bfqq_expire(struct bfq_data *bfqd, * reason. */ __bfq_bfqq_recalc_budget(bfqd, bfqq, reason); - ref = bfqq->ref; - __bfq_bfqq_expire(bfqd, bfqq); - - if (ref == 1) /* bfqq is gone, no more actions on it */ + if (__bfq_bfqq_expire(bfqd, bfqq)) + /* bfqq is gone, no more actions on it */ return; bfqq->injected_service = 0; diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index 062e1c4787f4..86394e503ca9 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -995,7 +995,7 @@ bool __bfq_deactivate_entity(struct bfq_entity *entity, bool ins_into_idle_tree); bool next_queue_may_preempt(struct bfq_data *bfqd); struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd); -void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd); +bool __bfq_bfqd_reset_in_service(struct bfq_data *bfqd); void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, bool ins_into_idle_tree, bool expiration); void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq); diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c index a11bef75483d..ae4d000ac0af 100644 --- a/block/bfq-wf2q.c +++ b/block/bfq-wf2q.c @@ -1605,7 +1605,8 @@ struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd) return bfqq; } -void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd) +/* returns true if the in-service queue gets freed */ +bool __bfq_bfqd_reset_in_service(struct bfq_data *bfqd) { struct bfq_queue *in_serv_bfqq = bfqd->in_service_queue; struct bfq_entity *in_serv_entity = &in_serv_bfqq->entity; @@ -1629,8 +1630,20 @@ void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd) * service tree either, then release the service reference to * the queue it represents (taken with bfq_get_entity). 
*/ - if (!in_serv_entity->on_st) + if (!in_serv_entity->on_st) { + /* + * If no process is referencing in_serv_bfqq any + * longer, then the service reference may be the only + * reference to the queue. If this is the case, then + * bfqq gets freed here. + */ + int ref = in_serv_bfqq->ref; bfq_put_queue(in_serv_bfqq); + if (ref == 1) + return true; + } + + return false; } void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, -- cgit v1.2.3 From 1b8f21b74c3c9c82fce5a751d7aefb7cc0b8d33d Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 9 Apr 2019 06:31:21 +0800 Subject: blk-mq: introduce blk_mq_complete_request_sync() NVMe's error handler follows the typical steps for tearing down the hardware to recover the controller: 1) stop blk_mq hw queues 2) stop the real hw queues 3) cancel in-flight requests via blk_mq_tagset_busy_iter(tags, cancel_request, ...) cancel_request(): mark the request as aborted blk_mq_complete_request(req); 4) destroy real hw queues However, there may be a race between #3 and #4, because blk_mq_complete_request() may run q->mq_ops->complete(rq) remotely and asynchronously, and ->complete(rq) may run after #4. This patch introduces blk_mq_complete_request_sync() to fix the above race. Cc: Sagi Grimberg Cc: Bart Van Assche Cc: James Smart Cc: linux-nvme@lists.infradead.org Reviewed-by: Keith Busch Reviewed-by: Christoph Hellwig Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq.c | 7 +++++++ include/linux/blk-mq.h | 1 + 2 files changed, 8 insertions(+) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index a9354835cf51..9516304a38ee 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -654,6 +654,13 @@ bool blk_mq_complete_request(struct request *rq) } EXPORT_SYMBOL(blk_mq_complete_request); +void blk_mq_complete_request_sync(struct request *rq) +{ + WRITE_ONCE(rq->state, MQ_RQ_COMPLETE); + rq->q->mq_ops->complete(rq); +} +EXPORT_SYMBOL_GPL(blk_mq_complete_request_sync); + int blk_mq_request_started(struct request *rq) { return blk_mq_rq_state(rq) != MQ_RQ_IDLE; } diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index cb2aa7ecafff..db29928de467 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -302,6 +302,7 @@ void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list); void blk_mq_kick_requeue_list(struct request_queue *q); void blk_mq_delay_kick_requeue_list(struct request_queue *q, unsigned long msecs); bool blk_mq_complete_request(struct request *rq); +void blk_mq_complete_request_sync(struct request *rq); bool blk_mq_bio_list_merge(struct request_queue *q, struct list_head *list, struct bio *bio); bool blk_mq_queue_stopped(struct request_queue *q); -- cgit v1.2.3 From a3761c3c91209b58b6f33bf69dd8bb8ec0c9d925 Mon Sep 17 00:00:00 2001 From: Jérôme Glisse Date: Wed, 10 Apr 2019 16:27:51 -0400 Subject: block: do not leak memory in bio_copy_user_iov() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When bio_add_pc_page() fails in bio_copy_user_iov(), we should free the page we just allocated; otherwise we are leaking it.
Cc: linux-block@vger.kernel.org Cc: Linus Torvalds Cc: stable@vger.kernel.org Reviewed-by: Chaitanya Kulkarni Signed-off-by: Jérôme Glisse Signed-off-by: Jens Axboe --- block/bio.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/bio.c b/block/bio.c index b64cedc7f87c..716510ecd7ff 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1298,8 +1298,11 @@ struct bio *bio_copy_user_iov(struct request_queue *q, } } - if (bio_add_pc_page(q, bio, page, bytes, offset) < bytes) + if (bio_add_pc_page(q, bio, page, bytes, offset) < bytes) { + if (!map_data) + __free_page(page); break; + } len -= bytes; offset = 0; -- cgit v1.2.3 From 77f1e0a52d26242b6c2dba019f6ebebfb9ff701e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 18 Jan 2019 10:34:16 -0700 Subject: bfq: update internal depth state when queue depth changes A previous commit moved the shallow depth and BFQ depth map calculations to be done at init time, moving them outside of the hotter IO path. This potentially causes hangs if the user changes the depth of the scheduler map, by writing to the 'nr_requests' sysfs file for that device. Add a blk-mq-sched hook that allows blk-mq to inform the scheduler if the depth changes, so that the scheduler can update its internal state. Tested-by: Kai Krakow Reported-by: Paolo Valente Fixes: f0635b8a416e ("bfq: calculate shallow depths at init time") Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 8 +++++++- block/blk-mq.c | 2 ++ include/linux/elevator.h | 1 + 3 files changed, 10 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index dfb8cb0af13a..5ba1e0d841b4 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -5396,7 +5396,7 @@ static unsigned int bfq_update_depths(struct bfq_data *bfqd, return min_shallow; } -static int bfq_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int index) +static void bfq_depth_updated(struct blk_mq_hw_ctx *hctx) { struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; struct blk_mq_tags *tags = hctx->sched_tags; @@ -5404,6 +5404,11 @@ static int bfq_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int index) min_shallow = bfq_update_depths(bfqd, &tags->bitmap_tags); sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, min_shallow); +} + +static int bfq_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int index) +{ + bfq_depth_updated(hctx); return 0; } @@ -5826,6 +5831,7 @@ static struct elevator_type iosched_bfq_mq = { .requests_merged = bfq_requests_merged, .request_merged = bfq_request_merged, .has_work = bfq_has_work, + .depth_updated = bfq_depth_updated, .init_hctx = bfq_init_hctx, .init_sched = bfq_init_queue, .exit_sched = bfq_exit_queue, diff --git a/block/blk-mq.c b/block/blk-mq.c index 9516304a38ee..fc60ed7e940e 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -3135,6 +3135,8 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) } if (ret) break; + if (q->elevator && q->elevator->type->ops.depth_updated) + q->elevator->type->ops.depth_updated(hctx); } if (!ret) diff --git a/include/linux/elevator.h b/include/linux/elevator.h index 2e9e2763bf47..6e8bc53740f0 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -31,6 +31,7 @@ struct elevator_mq_ops { void (*exit_sched)(struct elevator_queue *); int (*init_hctx)(struct blk_mq_hw_ctx *, unsigned int); void (*exit_hctx)(struct blk_mq_hw_ctx *, unsigned int); + void (*depth_updated)(struct blk_mq_hw_ctx *); bool (*allow_merge)(struct request_queue *, struct request
*, struct bio *); bool (*bio_merge)(struct blk_mq_hw_ctx *, struct bio *); -- cgit v1.2.3
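
A note on the blk_mq_complete_request_sync() commit above: the sketch below illustrates the kind of driver teardown sequence its commit message describes, with the new helper used in step 3. This is only an illustrative sketch under stated assumptions, not code from this series: the demo_* names and the demo_ctrl layout are hypothetical, and the busy_tag_iter_fn callback signature shown here should be checked against the target kernel version; blk_mq_quiesce_queue(), blk_mq_tagset_busy_iter(), and blk_mq_complete_request_sync() are the in-tree APIs this log refers to.

/* Illustrative sketch only -- not part of the patch series above. */
#include <linux/blk-mq.h>

/* Hypothetical driver context; the field names are assumptions. */
struct demo_ctrl {
	struct blk_mq_tag_set	tag_set;
	struct request_queue	*queue;
};

/* Step 3 callback: abort one in-flight request and complete it in place. */
static bool demo_cancel_request(struct request *rq, void *data, bool reserved)
{
	/* ... mark the request as aborted in driver-private state ... */

	/*
	 * Complete synchronously in the current context, so no remote or
	 * deferred ->complete(rq) can still be pending after teardown.
	 */
	blk_mq_complete_request_sync(rq);
	return true;	/* keep iterating over the remaining busy requests */
}

void demo_teardown(struct demo_ctrl *ctrl)
{
	/* 1) stop the blk-mq hw queues */
	blk_mq_quiesce_queue(ctrl->queue);

	/* 2) stop the real hardware queues (driver specific, omitted) */

	/* 3) cancel in-flight requests; completions all run before this returns */
	blk_mq_tagset_busy_iter(&ctrl->tag_set, demo_cancel_request, ctrl);

	/* 4) it is now safe to destroy the real hw queues */
}

Because every in-flight request is completed synchronously inside the iteration, no ->complete() callback can still be running or queued when step 4 destroys the real hardware queues, which is the race the new helper is meant to close.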