From 8796acbc9a0eceeddd99eaef833bdda1241d39b9 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Wed, 12 Oct 2022 17:40:32 +0800 Subject: blk-iocost: disable writeback throttling Commit b5dc5d4d1f4f ("block,bfq: Disable writeback throttling") disabled wbt for bfq, because different write-throttling heuristics should not work together. For the same reason, wbt and iocost should not work together either, unless the admin really wants them to, despite the performance impact. Signed-off-by: Yu Kuai Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20221012094035.390056-2-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe --- block/blk-iocost.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'block') diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 495396425bad..08036476e6fa 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -3264,9 +3264,11 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, blk_stat_enable_accounting(disk->queue); blk_queue_flag_set(QUEUE_FLAG_RQ_ALLOC_TIME, disk->queue); ioc->enabled = true; + wbt_disable_default(disk->queue); } else { blk_queue_flag_clear(QUEUE_FLAG_RQ_ALLOC_TIME, disk->queue); ioc->enabled = false; + wbt_enable_default(disk->queue); } if (user) { -- cgit v1.2.3 From 2c0647988433c3d7120542a3c42e12c152dd7afc Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Wed, 12 Oct 2022 17:40:33 +0800 Subject: blk-iocost: don't release 'ioc->lock' while updating params ioc_qos_write() and ioc_cost_model_write() follow the same pattern: 1) hold the lock to read 'ioc->params' into a local variable; 2) update the params in the local variable without the lock; 3) hold the lock to write the local variable back to 'ioc->params'. In theory, if users update the params concurrently, an update might be lost:

t1: update params a                     t2: update params b
spin_lock_irq(&ioc->lock);
memcpy(qos, ioc->params.qos, sizeof(qos))
spin_unlock_irq(&ioc->lock);

qos[a] = xxx;

                                        spin_lock_irq(&ioc->lock);
                                        memcpy(qos, ioc->params.qos, sizeof(qos))
                                        spin_unlock_irq(&ioc->lock);

                                        qos[b] = xxx;

spin_lock_irq(&ioc->lock);
memcpy(ioc->params.qos, qos, sizeof(qos));
ioc_refresh_params(ioc, true);
spin_unlock_irq(&ioc->lock);

                                        spin_lock_irq(&ioc->lock);
                                        // updates of a will be lost
                                        memcpy(ioc->params.qos, qos, sizeof(qos));
                                        ioc_refresh_params(ioc, true);
                                        spin_unlock_irq(&ioc->lock);

Although this is not a common case, the problem can be fixed easily by holding the lock through the whole read, update, write sequence.
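The effect of the fix is easiest to see as the resulting locking pattern; a condensed sketch of ioc_qos_write() after this patch (not the literal kernel code, error paths omitted):

	spin_lock_irq(&ioc->lock);
	memcpy(qos, ioc->params.qos, sizeof(qos));	/* 1) read a snapshot */
	/* 2) parse the input and update qos[] locally, lock still held */
	memcpy(ioc->params.qos, qos, sizeof(qos));	/* 3) publish the result */
	ioc_refresh_params(ioc, true);
	spin_unlock_irq(&ioc->lock);

With the lock held across all three steps, concurrent writers serialize and neither update can be overwritten by a stale snapshot.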
Signed-off-by: Yu Kuai Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20221012094035.390056-3-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe --- block/blk-iocost.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'block') diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 08036476e6fa..6d36a4bd4382 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -3191,7 +3191,6 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, memcpy(qos, ioc->params.qos, sizeof(qos)); enable = ioc->enabled; user = ioc->user_qos_params; - spin_unlock_irq(&ioc->lock); while ((p = strsep(&input, " \t\n"))) { substring_t args[MAX_OPT_ARGS]; @@ -3258,8 +3257,6 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, if (qos[QOS_MIN] > qos[QOS_MAX]) goto einval; - spin_lock_irq(&ioc->lock); - if (enable) { blk_stat_enable_accounting(disk->queue); blk_queue_flag_set(QUEUE_FLAG_RQ_ALLOC_TIME, disk->queue); @@ -3284,6 +3281,7 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, blkdev_put_no_open(bdev); return nbytes; einval: + spin_unlock_irq(&ioc->lock); ret = -EINVAL; err: blkdev_put_no_open(bdev); @@ -3359,7 +3357,6 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input, spin_lock_irq(&ioc->lock); memcpy(u, ioc->params.i_lcoefs, sizeof(u)); user = ioc->user_cost_model; - spin_unlock_irq(&ioc->lock); while ((p = strsep(&input, " \t\n"))) { substring_t args[MAX_OPT_ARGS]; @@ -3396,7 +3393,6 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input, user = true; } - spin_lock_irq(&ioc->lock); if (user) { memcpy(ioc->params.i_lcoefs, u, sizeof(u)); ioc->user_cost_model = true; @@ -3410,6 +3406,7 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input, return nbytes; einval: + spin_unlock_irq(&ioc->lock); ret = -EINVAL; err: blkdev_put_no_open(bdev); -- cgit v1.2.3 From 2b2da2f6dc80e1299586f7e046a071809d5dee7f Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Wed, 12 Oct 2022 17:40:34 +0800 Subject: blk-iocost: prevent configuration update concurrent with io throttling This won't cause any severe problem currently; however, it doesn't seem appropriate: 1) 'ioc->params' is read from multiple places without holding 'ioc->lock'; unexpected values might be read if it is written concurrently. 2) If the configuration is changed while io is being throttled, the functionality might be affected. For example, if the params are updated and the cost becomes smaller, waiting for a timer that was calculated under the old configuration is not appropriate.
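The approach, condensed from the patch below: stop all I/O against the queue before touching the parameters, so that no request is costed or throttled against a half-updated configuration (a sketch, not the literal code):

	blk_mq_freeze_queue(disk->queue);	/* wait out in-flight requests */
	blk_mq_quiesce_queue(disk->queue);	/* drain running ->queue_rq() calls */

	spin_lock_irq(&ioc->lock);
	/* read, modify and publish ioc->params */
	spin_unlock_irq(&ioc->lock);

	blk_mq_unquiesce_queue(disk->queue);
	blk_mq_unfreeze_queue(disk->queue);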
Signed-off-by: Yu Kuai Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20221012094035.390056-4-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe --- block/blk-iocost.c | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 6d36a4bd4382..5acc5f13bbd6 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -3187,6 +3187,9 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, ioc = q_to_ioc(disk->queue); } + blk_mq_freeze_queue(disk->queue); + blk_mq_quiesce_queue(disk->queue); + spin_lock_irq(&ioc->lock); memcpy(qos, ioc->params.qos, sizeof(qos)); enable = ioc->enabled; @@ -3278,10 +3281,17 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, ioc_refresh_params(ioc, true); spin_unlock_irq(&ioc->lock); + blk_mq_unquiesce_queue(disk->queue); + blk_mq_unfreeze_queue(disk->queue); + blkdev_put_no_open(bdev); return nbytes; einval: spin_unlock_irq(&ioc->lock); + + blk_mq_unquiesce_queue(disk->queue); + blk_mq_unfreeze_queue(disk->queue); + ret = -EINVAL; err: blkdev_put_no_open(bdev); @@ -3336,6 +3346,7 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input, size_t nbytes, loff_t off) { struct block_device *bdev; + struct request_queue *q; struct ioc *ioc; u64 u[NR_I_LCOEFS]; bool user; @@ -3346,14 +3357,18 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input, if (IS_ERR(bdev)) return PTR_ERR(bdev); - ioc = q_to_ioc(bdev_get_queue(bdev)); + q = bdev_get_queue(bdev); + ioc = q_to_ioc(q); if (!ioc) { ret = blk_iocost_init(bdev->bd_disk); if (ret) goto err; - ioc = q_to_ioc(bdev_get_queue(bdev)); + ioc = q_to_ioc(q); } + blk_mq_freeze_queue(q); + blk_mq_quiesce_queue(q); + spin_lock_irq(&ioc->lock); memcpy(u, ioc->params.i_lcoefs, sizeof(u)); user = ioc->user_cost_model; @@ -3402,11 +3417,18 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input, ioc_refresh_params(ioc, true); spin_unlock_irq(&ioc->lock); + blk_mq_unquiesce_queue(q); + blk_mq_unfreeze_queue(q); + blkdev_put_no_open(bdev); return nbytes; einval: spin_unlock_irq(&ioc->lock); + + blk_mq_unquiesce_queue(q); + blk_mq_unfreeze_queue(q); + ret = -EINVAL; err: blkdev_put_no_open(bdev); -- cgit v1.2.3 From 074501bce3c5b4200a94b9937d189b3e3e2dbc98 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Wed, 12 Oct 2022 17:40:35 +0800 Subject: blk-iocost: read 'ioc->params' inside 'ioc->lock' in ioc_timer_fn() 'ioc->params' is updated in ioc_refresh_params(), which is protected by 'ioc->lock'; however, ioc_timer_fn() reads the params outside the lock.
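The hazard here is not a torn load of a single word but an inconsistent pair of values: without the lock, the timer could combine an old QOS_RPPM with a new QOS_WPPM from a concurrent update. Taking the snapshot under the lock, as the patch below does, guarantees both thresholds come from the same generation of 'ioc->params':

	spin_lock_irq(&ioc->lock);
	/* both thresholds now come from one consistent configuration */
	ppm_rthr = MILLION - ioc->params.qos[QOS_RPPM];
	ppm_wthr = MILLION - ioc->params.qos[QOS_WPPM];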
Signed-off-by: Yu Kuai Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20221012094035.390056-5-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe --- block/blk-iocost.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 5acc5f13bbd6..f01359906c83 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -2203,8 +2203,8 @@ static void ioc_timer_fn(struct timer_list *timer) LIST_HEAD(surpluses); int nr_debtors, nr_shortages = 0, nr_lagging = 0; u64 usage_us_sum = 0; - u32 ppm_rthr = MILLION - ioc->params.qos[QOS_RPPM]; - u32 ppm_wthr = MILLION - ioc->params.qos[QOS_WPPM]; + u32 ppm_rthr; + u32 ppm_wthr; u32 missed_ppm[2], rq_wait_pct; u64 period_vtime; int prev_busy_level; @@ -2215,6 +2215,8 @@ static void ioc_timer_fn(struct timer_list *timer) /* take care of active iocgs */ spin_lock_irq(&ioc->lock); + ppm_rthr = MILLION - ioc->params.qos[QOS_RPPM]; + ppm_wthr = MILLION - ioc->params.qos[QOS_WPPM]; ioc_now(ioc, &now); period_vtime = now.vnow - ioc->period_at_vtime; -- cgit v1.2.3 From 6d9f4cf125585ebf0718abcf5ce9ca898877c6d2 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Wed, 19 Oct 2022 20:15:13 +0800 Subject: elevator: remove redundant code in elv_unregister_queue() "elevator_queue *e" is already declared and initialized at the beginning of elv_unregister_queue(). Signed-off-by: Yu Kuai Reviewed-by: Christoph Hellwig Reviewed-by: Eric Biggers Link: https://lore.kernel.org/r/20221019121518.3865235-2-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe --- block/elevator.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'block') diff --git a/block/elevator.c b/block/elevator.c index bd71f0fc4e4b..20e70fd3f77f 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -524,8 +524,6 @@ void elv_unregister_queue(struct request_queue *q) lockdep_assert_held(&q->sysfs_lock); if (e && e->registered) { - struct elevator_queue *e = q->elevator; - kobject_uevent(&e->kobj, KOBJ_REMOVE); kobject_del(&e->kobj); -- cgit v1.2.3 From b11d31ae01e6b0762b28e645ad6718a12faa8d14 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Wed, 19 Oct 2022 20:15:14 +0800 Subject: blk-wbt: remove unnecessary check in wbt_enable_default() If CONFIG_BLK_WBT_MQ is disabled, wbt_init() won't do anything. Signed-off-by: Yu Kuai Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20221019121518.3865235-3-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe --- block/blk-wbt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-wbt.c b/block/blk-wbt.c index c293e08b301f..c5a8c10028a0 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -651,7 +651,7 @@ void wbt_enable_default(struct request_queue *q) if (!blk_queue_registered(q)) return; - if (queue_is_mq(q) && IS_ENABLED(CONFIG_BLK_WBT_MQ)) + if (queue_is_mq(q)) wbt_init(q); } EXPORT_SYMBOL_GPL(wbt_enable_default); -- cgit v1.2.3 From a9a236d238a5e8ab2e74ca62c2c7ba5dd435af77 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Wed, 19 Oct 2022 20:15:15 +0800 Subject: blk-wbt: make enable_state more accurate Currently, if a user disables wbt through sysfs, 'enable_state' stays 'WBT_STATE_ON_MANUAL', which is confusing. Add a new state 'WBT_STATE_OFF_MANUAL' to cover that case.
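With four states, the rule is: the *_DEFAULT states may be changed by the kernel itself, while the *_MANUAL states can only be changed by another sysfs write. wbt_enable_default() already embodies this; roughly (a sketch of the existing check):

	if (RQWB(rqos)->enable_state == WBT_STATE_OFF_DEFAULT)
		RQWB(rqos)->enable_state = WBT_STATE_ON_DEFAULT;
	/* WBT_STATE_OFF_MANUAL is deliberately left untouched */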
Signed-off-by: Yu Kuai Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20221019121518.3865235-4-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe --- block/blk-wbt.c | 7 ++++++- block/blk-wbt.h | 12 +++++++----- 2 files changed, 13 insertions(+), 6 deletions(-) (limited to 'block') diff --git a/block/blk-wbt.c b/block/blk-wbt.c index c5a8c10028a0..4680691a96bc 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -435,8 +435,13 @@ void wbt_set_min_lat(struct request_queue *q, u64 val) struct rq_qos *rqos = wbt_rq_qos(q); if (!rqos) return; + RQWB(rqos)->min_lat_nsec = val; - RQWB(rqos)->enable_state = WBT_STATE_ON_MANUAL; + if (val) + RQWB(rqos)->enable_state = WBT_STATE_ON_MANUAL; + else + RQWB(rqos)->enable_state = WBT_STATE_OFF_MANUAL; + wbt_update_limits(RQWB(rqos)); } diff --git a/block/blk-wbt.h b/block/blk-wbt.h index 7e44eccc676d..7fe98638fff5 100644 --- a/block/blk-wbt.h +++ b/block/blk-wbt.h @@ -28,13 +28,15 @@ enum { }; /* - * Enable states. Either off, or on by default (done at init time), - * or on through manual setup in sysfs. + * If current state is WBT_STATE_ON/OFF_DEFAULT, it can be covered to any other + * state, if current state is WBT_STATE_ON/OFF_MANUAL, it can only be covered + * to WBT_STATE_OFF/ON_MANUAL. */ enum { - WBT_STATE_ON_DEFAULT = 1, - WBT_STATE_ON_MANUAL = 2, - WBT_STATE_OFF_DEFAULT + WBT_STATE_ON_DEFAULT = 1, /* on by default */ + WBT_STATE_ON_MANUAL = 2, /* on manually by sysfs */ + WBT_STATE_OFF_DEFAULT = 3, /* off by default */ + WBT_STATE_OFF_MANUAL = 4, /* off manually by sysfs */ }; struct rq_wb { -- cgit v1.2.3 From 3642ef4d95699193c4a461862382e643ae3720f0 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Wed, 19 Oct 2022 20:15:16 +0800 Subject: blk-wbt: don't show valid wbt_lat_usec in sysfs while wbt is disabled Currently, if wbt is initialized and then disabled by wbt_disable_default(), sysfs will still show a valid wbt_lat_usec, which may mislead users into thinking that wbt is still enabled. This patch shows wbt_lat_usec as zero if wbt is disabled.
Signed-off-by: Yu Kuai Reported-and-tested-by: Holger Hoffstätte Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20221019121518.3865235-5-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe --- block/blk-sysfs.c | 3 +++ block/blk-wbt.c | 8 ++++++++ block/blk-wbt.h | 5 +++++ 3 files changed, 16 insertions(+) (limited to 'block') diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index e71b3b43927c..7b98c7074771 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -470,6 +470,9 @@ static ssize_t queue_wb_lat_show(struct request_queue *q, char *page) if (!wbt_rq_qos(q)) return -EINVAL; + if (wbt_disabled(q)) + return sprintf(page, "0\n"); + return sprintf(page, "%llu\n", div_u64(wbt_get_min_lat(q), 1000)); } diff --git a/block/blk-wbt.c b/block/blk-wbt.c index 4680691a96bc..07ed0b0aee1f 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -422,6 +422,14 @@ static void wbt_update_limits(struct rq_wb *rwb) rwb_wake_all(rwb); } +bool wbt_disabled(struct request_queue *q) +{ + struct rq_qos *rqos = wbt_rq_qos(q); + + return !rqos || RQWB(rqos)->enable_state == WBT_STATE_OFF_DEFAULT || + RQWB(rqos)->enable_state == WBT_STATE_OFF_MANUAL; +} + u64 wbt_get_min_lat(struct request_queue *q) { struct rq_qos *rqos = wbt_rq_qos(q); diff --git a/block/blk-wbt.h b/block/blk-wbt.h index 7fe98638fff5..e3ea6e7e2900 100644 --- a/block/blk-wbt.h +++ b/block/blk-wbt.h @@ -96,6 +96,7 @@ void wbt_enable_default(struct request_queue *); u64 wbt_get_min_lat(struct request_queue *q); void wbt_set_min_lat(struct request_queue *q, u64 val); +bool wbt_disabled(struct request_queue *); void wbt_set_write_cache(struct request_queue *, bool); @@ -127,6 +128,10 @@ static inline u64 wbt_default_latency_nsec(struct request_queue *q) { return 0; } +static inline bool wbt_disabled(struct request_queue *q) +{ + return true; +} #endif /* CONFIG_BLK_WBT */ -- cgit v1.2.3 From 181d06637451b5348d746039478e71fa53dfbff6 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Wed, 19 Oct 2022 20:15:17 +0800 Subject: elevator: add new field flags in struct elevator_queue There is currently only one flag, indicating whether the elevator is registered; prepare to add a flag to disable wbt when the default elevator is bfq.
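Switching from the 'registered:1' bitfield to an unsigned long lets the flags be manipulated with the kernel's atomic bit helpers, as the patch below does; a minimal sketch of the pattern:

	set_bit(ELEVATOR_FLAG_REGISTERED, &e->flags);
	...
	if (test_and_clear_bit(ELEVATOR_FLAG_REGISTERED, &e->flags)) {
		/* only one caller can observe the registered -> unregistered edge */
		kobject_del(&e->kobj);
	}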
Signed-off-by: Yu Kuai Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20221019121518.3865235-6-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe --- block/elevator.c | 6 ++---- block/elevator.h | 4 +++- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'block') diff --git a/block/elevator.c b/block/elevator.c index 20e70fd3f77f..9e12706e8d8c 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -512,7 +512,7 @@ int elv_register_queue(struct request_queue *q, bool uevent) if (uevent) kobject_uevent(&e->kobj, KOBJ_ADD); - e->registered = 1; + set_bit(ELEVATOR_FLAG_REGISTERED, &e->flags); } return error; } @@ -523,11 +523,9 @@ void elv_unregister_queue(struct request_queue *q) lockdep_assert_held(&q->sysfs_lock); - if (e && e->registered) { + if (e && test_and_clear_bit(ELEVATOR_FLAG_REGISTERED, &e->flags)) { kobject_uevent(&e->kobj, KOBJ_REMOVE); kobject_del(&e->kobj); - - e->registered = 0; } } diff --git a/block/elevator.h b/block/elevator.h index 3f0593b3bf9d..ed574bf3e629 100644 --- a/block/elevator.h +++ b/block/elevator.h @@ -100,10 +100,12 @@ struct elevator_queue void *elevator_data; struct kobject kobj; struct mutex sysfs_lock; - unsigned int registered:1; + unsigned long flags; DECLARE_HASHTABLE(hash, ELV_HASH_BITS); }; +#define ELEVATOR_FLAG_REGISTERED 0 + /* * block elevator interface */ -- cgit v1.2.3 From 671fae5e51297fc76b3758ca2edd514858734a6a Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Wed, 19 Oct 2022 20:15:18 +0800 Subject: blk-wbt: don't enable throttling if default elevator is bfq Commit b5dc5d4d1f4f ("block,bfq: Disable writeback throttling") tries to disable wbt for bfq; it does so by calling wbt_disable_default() in bfq_init_queue(). However, wbt is still enabled if the default elevator is bfq:

device_add_disk
  elevator_init_mq
    bfq_init_queue
      wbt_disable_default -> does nothing, wbt is not initialized yet
  blk_register_queue
    wbt_enable_default -> wbt is enabled

Fix the problem by adding a new flag, ELEVATOR_FLAG_DISABLE_WBT: bfq sets the flag in bfq_init_queue(), and a subsequent wbt_enable_default() won't enable wbt while the flag is set.
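Condensed, the resulting gate in wbt_enable_default() (see the patch below) consults the elevator's flag before either re-arming an existing rq_qos instance or creating a new one:

	bool disable_flag = q->elevator &&
		test_bit(ELEVATOR_FLAG_DISABLE_WBT, &q->elevator->flags);
	...
	if (queue_is_mq(q) && !disable_flag)
		wbt_init(q);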
Signed-off-by: Yu Kuai Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20221019121518.3865235-7-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 2 ++ block/blk-wbt.c | 11 ++++++++--- block/elevator.h | 3 ++- 3 files changed, 12 insertions(+), 4 deletions(-) (limited to 'block') diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 7ea427817f7f..ccfadc6f71bf 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -7045,6 +7045,7 @@ static void bfq_exit_queue(struct elevator_queue *e) #endif blk_stat_disable_accounting(bfqd->queue); + clear_bit(ELEVATOR_FLAG_DISABLE_WBT, &e->flags); wbt_enable_default(bfqd->queue); kfree(bfqd); @@ -7190,6 +7191,7 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) /* We dispatch from request queue wide instead of hw queue */ blk_queue_flag_set(QUEUE_FLAG_SQ_SCHED, q); + set_bit(ELEVATOR_FLAG_DISABLE_WBT, &eq->flags); wbt_disable_default(q); blk_stat_enable_accounting(q); diff --git a/block/blk-wbt.c b/block/blk-wbt.c index 07ed0b0aee1f..68a774d7a7c9 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -27,6 +27,7 @@ #include "blk-wbt.h" #include "blk-rq-qos.h" +#include "elevator.h" #define CREATE_TRACE_POINTS #include @@ -651,11 +652,15 @@ void wbt_set_write_cache(struct request_queue *q, bool write_cache_on) */ void wbt_enable_default(struct request_queue *q) { - struct rq_qos *rqos = wbt_rq_qos(q); + struct rq_qos *rqos; + bool disable_flag = q->elevator && + test_bit(ELEVATOR_FLAG_DISABLE_WBT, &q->elevator->flags); /* Throttling already enabled? */ + rqos = wbt_rq_qos(q); if (rqos) { - if (RQWB(rqos)->enable_state == WBT_STATE_OFF_DEFAULT) + if (!disable_flag && + RQWB(rqos)->enable_state == WBT_STATE_OFF_DEFAULT) RQWB(rqos)->enable_state = WBT_STATE_ON_DEFAULT; return; } @@ -664,7 +669,7 @@ void wbt_enable_default(struct request_queue *q) if (!blk_queue_registered(q)) return; - if (queue_is_mq(q)) + if (queue_is_mq(q) && !disable_flag) wbt_init(q); } EXPORT_SYMBOL_GPL(wbt_enable_default); diff --git a/block/elevator.h b/block/elevator.h index ed574bf3e629..75382471222d 100644 --- a/block/elevator.h +++ b/block/elevator.h @@ -104,7 +104,8 @@ struct elevator_queue DECLARE_HASHTABLE(hash, ELV_HASH_BITS); }; -#define ELEVATOR_FLAG_REGISTERED 0 +#define ELEVATOR_FLAG_REGISTERED 0 +#define ELEVATOR_FLAG_DISABLE_WBT 1 /* * block elevator interface -- cgit v1.2.3 From dd6f7f17bf5831ac94d2b8ffe3d67dac201a9b97 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 20 Oct 2022 08:48:16 +0200 Subject: block: add proper helpers for elevator_type module refcount management Make sure we have helpers for all relevant module refcount operations on the elevator_type in elevator.h, and use them. Move the call to the get helper in blk_mq_elv_switch_none slightly, so that its purpose is obvious and it needs only a brief comment.
Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20221020064819.1469928-2-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-mq.c | 11 ++--------- block/elevator.c | 9 ++------- block/elevator.h | 15 +++++++++++++++ 3 files changed, 19 insertions(+), 16 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 33292c01875d..9db8814cdd02 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -4554,17 +4554,10 @@ static bool blk_mq_elv_switch_none(struct list_head *head, INIT_LIST_HEAD(&qe->node); qe->q = q; + /* keep a reference to the elevator module as we'll switch back */ + __elevator_get(qe->type); qe->type = q->elevator->type; list_add(&qe->node, head); - - /* - * After elevator_switch, the previous elevator_queue will be - * released by elevator_release. The reference of the io scheduler - * module get by elevator_get will also be put. So we need to get - * a reference of the io scheduler module here to prevent it to be - * removed. - */ - __module_get(qe->type->elevator_owner); elevator_switch(q, NULL); mutex_unlock(&q->sysfs_lock); diff --git a/block/elevator.c b/block/elevator.c index 9e12706e8d8c..8fb404a89f2e 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -132,11 +132,6 @@ static struct elevator_type *elevator_find(const char *name, return NULL; } -static void elevator_put(struct elevator_type *e) -{ - module_put(e->elevator_owner); -} - static struct elevator_type *elevator_get(struct request_queue *q, const char *name, bool try_loading) { @@ -152,7 +147,7 @@ static struct elevator_type *elevator_get(struct request_queue *q, e = elevator_find(name, q->required_elevator_features); } - if (e && !try_module_get(e->elevator_owner)) + if (e && !elevator_tryget(e)) e = NULL; spin_unlock(&elv_list_lock); @@ -659,7 +654,7 @@ static struct elevator_type *elevator_get_by_features(struct request_queue *q) } } - if (found && !try_module_get(found->elevator_owner)) + if (found && !elevator_tryget(found)) found = NULL; spin_unlock(&elv_list_lock); diff --git a/block/elevator.h b/block/elevator.h index 75382471222d..774a8f6b99e6 100644 --- a/block/elevator.h +++ b/block/elevator.h @@ -84,6 +84,21 @@ struct elevator_type struct list_head list; }; +static inline bool elevator_tryget(struct elevator_type *e) +{ + return try_module_get(e->elevator_owner); +} + +static inline void __elevator_get(struct elevator_type *e) +{ + __module_get(e->elevator_owner); +} + +static inline void elevator_put(struct elevator_type *e) +{ + module_put(e->elevator_owner); +} + #define ELV_HASH_BITS 6 void elv_rqhash_del(struct request_queue *q, struct request *rq); -- cgit v1.2.3 From 58367c8a5f845498fd341e5687bab513925e4ac0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 20 Oct 2022 08:48:17 +0200 Subject: block: sanitize the elevator name before passing it to __elevator_change The stripped name should also be used for the none check. To do so strip it in the caller and pass in the sanitized name. Drop the pointless __ prefix in the function name while we're at it. Based on a patch from Jinlong Chen . 
Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20221020064819.1469928-3-hch@lst.de Signed-off-by: Jens Axboe --- block/elevator.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'block') diff --git a/block/elevator.c b/block/elevator.c index 8fb404a89f2e..5b8fb8745e9a 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -734,9 +734,8 @@ int elevator_switch(struct request_queue *q, struct elevator_type *new_e) /* * Switch this queue to the given IO scheduler. */ -static int __elevator_change(struct request_queue *q, const char *name) +static int elevator_change(struct request_queue *q, const char *elevator_name) { - char elevator_name[ELV_NAME_MAX]; struct elevator_type *e; /* Make sure queue is not in the middle of being removed */ @@ -746,14 +745,13 @@ static int __elevator_change(struct request_queue *q, const char *name) /* * Special case for mq, turn off scheduling */ - if (!strncmp(name, "none", 4)) { + if (!strncmp(elevator_name, "none", 4)) { if (!q->elevator) return 0; return elevator_switch(q, NULL); } - strlcpy(elevator_name, name, sizeof(elevator_name)); - e = elevator_get(q, strstrip(elevator_name), true); + e = elevator_get(q, elevator_name, true); if (!e) return -EINVAL; @@ -766,18 +764,19 @@ static int __elevator_change(struct request_queue *q, const char *name) return elevator_switch(q, e); } -ssize_t elv_iosched_store(struct request_queue *q, const char *name, +ssize_t elv_iosched_store(struct request_queue *q, const char *buf, size_t count) { + char elevator_name[ELV_NAME_MAX]; int ret; if (!elv_support_iosched(q)) return count; - ret = __elevator_change(q, name); + strlcpy(elevator_name, buf, sizeof(elevator_name)); + ret = elevator_change(q, strstrip(elevator_name)); if (!ret) return count; - return ret; } -- cgit v1.2.3 From b54c2ad9b77de846e42104ecf94eb8329d2f03a3 Mon Sep 17 00:00:00 2001 From: Jinlong Chen Date: Thu, 20 Oct 2022 08:48:18 +0200 Subject: block: check for an unchanged elevator earlier in __elevator_change No need to find the actual elevator_type struct for this comparison; the name is all that is needed. Signed-off-by: Jinlong Chen [hch: split from a larger patch] Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20221020064819.1469928-4-hch@lst.de Signed-off-by: Jens Axboe --- block/elevator.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'block') diff --git a/block/elevator.c b/block/elevator.c index 5b8fb8745e9a..61d5655a3819 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -751,16 +751,13 @@ static int elevator_change(struct request_queue *q, const char *elevator_name) return elevator_switch(q, NULL); } + if (q->elevator && elevator_match(q->elevator->type, elevator_name, 0)) + return 0; + e = elevator_get(q, elevator_name, true); if (!e) return -EINVAL; - if (q->elevator && - elevator_match(q->elevator->type, elevator_name, 0)) { - elevator_put(e); - return 0; - } - return elevator_switch(q, e); } -- cgit v1.2.3 From 8ed40ee35d94df2fdb56bbbc07e17dffd2383625 Mon Sep 17 00:00:00 2001 From: Jinlong Chen Date: Thu, 20 Oct 2022 08:48:19 +0200 Subject: block: fix up elevator_type refcounting The current reference management logic of io scheduler modules contains refcnt problems. For example, blk_mq_init_sched may fail before or after calling e->ops.init_sched. If it fails before the call, it does nothing to the reference to the io scheduler module. But if it fails after the call, it releases the reference by calling kobject_put(&eq->kobj).
As the callers of blk_mq_init_sched can't know exactly where the failure happens, they can't handle the reference to the io scheduler module properly: releasing the reference on failure results in a double release if blk_mq_init_sched has already released it, and not releasing the reference results in a ghost reference if blk_mq_init_sched did not release it either. The same problem also exists in io schedulers' init_sched implementations. We could address the problem by adding releasing statements to the error handling procedures of blk_mq_init_sched and the init_sched implementations, but that is counterintuitive and requires modifications to existing io schedulers. Instead, we make elevator_alloc get the io scheduler module references that will be released by elevator_release, and then match each elevator_get with an elevator_put. Therefore, each reference to an io scheduler module explicitly has its own getter and releaser, and we no longer need to worry about the refcnt problems. The bugs and the patch can be validated with tools here: https://github.com/nickyc975/linux_elv_refcnt_bug.git [hch: split out a few bits into separate patches, use a non-try module_get in elevator_alloc] Signed-off-by: Jinlong Chen Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20221020064819.1469928-5-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-mq-sched.c | 1 + block/blk-mq.c | 2 ++ block/elevator.c | 10 +++++++--- 3 files changed, 10 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index a4f7c101b53b..68227240fdea 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -555,6 +555,7 @@ static int blk_mq_init_sched_shared_tags(struct request_queue *queue) return 0; } +/* caller must have a reference to @e, will grab another one if successful */ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e) { unsigned int flags = q->tag_set->flags; diff --git a/block/blk-mq.c b/block/blk-mq.c index 9db8814cdd02..098432d3caf1 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -4591,6 +4591,8 @@ static void blk_mq_elv_switch_back(struct list_head *head, mutex_lock(&q->sysfs_lock); elevator_switch(q, t); + /* drop the reference acquired in blk_mq_elv_switch_none */ + elevator_put(t); mutex_unlock(&q->sysfs_lock); } diff --git a/block/elevator.c b/block/elevator.c index 61d5655a3819..d26aa787e29f 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -165,6 +165,7 @@ struct elevator_queue *elevator_alloc(struct request_queue *q, if (unlikely(!eq)) return NULL; + __elevator_get(e); eq->type = e; kobject_init(&eq->kobj, &elv_ktype); mutex_init(&eq->sysfs_lock); @@ -704,8 +705,9 @@ void elevator_init_mq(struct request_queue *q) if (err) { pr_warn("\"%s\" elevator initialization failed, " "falling back to \"none\"\n", e->elevator_name); - elevator_put(e); } + + elevator_put(e); } /* @@ -737,6 +739,7 @@ int elevator_switch(struct request_queue *q, struct elevator_type *new_e) static int elevator_change(struct request_queue *q, const char *elevator_name) { struct elevator_type *e; + int ret; /* Make sure queue is not in the middle of being removed */ if (!blk_queue_registered(q)) @@ -757,8 +760,9 @@ static int elevator_change(struct request_queue *q, const char *elevator_name) e = elevator_get(q, elevator_name, true); if (!e) return -EINVAL; - - return elevator_switch(q, e); + ret = elevator_switch(q, e); + elevator_put(e); + return ret; } ssize_t elv_iosched_store(struct request_queue *q, const char *buf, -- cgit v1.2.3 From
2b3f056f72e56fa07df69b4705e0b46a6c08e77c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 18 Oct 2022 15:57:17 +0200 Subject: blk-mq: move the call to blk_put_queue out of blk_mq_destroy_queue The fact that blk_mq_destroy_queue also drops a queue reference leads to various places having to grab an extra reference. Move the call to blk_put_queue into the callers to allow removing the extra references. Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Reviewed-by: Keith Busch Link: https://lore.kernel.org/r/20221018135720.670094-2-hch@lst.de [axboe: fix fabrics_q vs admin_q conflict in nvme core.c] Signed-off-by: Jens Axboe --- block/blk-mq.c | 4 +--- block/bsg-lib.c | 2 ++ drivers/nvme/host/apple.c | 1 + drivers/nvme/host/core.c | 10 ++++++++-- drivers/nvme/host/pci.c | 1 + drivers/scsi/scsi_sysfs.c | 1 + drivers/ufs/core/ufshcd.c | 2 ++ 7 files changed, 16 insertions(+), 5 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 098432d3caf1..4cecf281123f 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -4010,9 +4010,6 @@ void blk_mq_destroy_queue(struct request_queue *q) blk_sync_queue(q); blk_mq_cancel_work_sync(q); blk_mq_exit_queue(q); - - /* @q is and will stay empty, shutdown and put */ - blk_put_queue(q); } EXPORT_SYMBOL(blk_mq_destroy_queue); @@ -4029,6 +4026,7 @@ struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, void *queuedata, disk = __alloc_disk_node(q, set->numa_node, lkclass); if (!disk) { blk_mq_destroy_queue(q); + blk_put_queue(q); return ERR_PTR(-ENOMEM); } set_bit(GD_OWNS_QUEUE, &disk->state); diff --git a/block/bsg-lib.c b/block/bsg-lib.c index d6f5dcdce748..435c32373cd6 100644 --- a/block/bsg-lib.c +++ b/block/bsg-lib.c @@ -325,6 +325,7 @@ void bsg_remove_queue(struct request_queue *q) bsg_unregister_queue(bset->bd); blk_mq_destroy_queue(q); + blk_put_queue(q); blk_mq_free_tag_set(&bset->tag_set); kfree(bset); } @@ -400,6 +401,7 @@ struct request_queue *bsg_setup_queue(struct device *dev, const char *name, return q; out_cleanup_queue: blk_mq_destroy_queue(q); + blk_put_queue(q); out_queue: blk_mq_free_tag_set(set); out_tag_set: diff --git a/drivers/nvme/host/apple.c b/drivers/nvme/host/apple.c index ff8b083dc5c6..7b8068aab7fc 100644 --- a/drivers/nvme/host/apple.c +++ b/drivers/nvme/host/apple.c @@ -1510,6 +1510,7 @@ static int apple_nvme_probe(struct platform_device *pdev) if (!blk_get_queue(anv->ctrl.admin_q)) { nvme_start_admin_queue(&anv->ctrl); blk_mq_destroy_queue(anv->ctrl.admin_q); + blk_put_queue(anv->ctrl.admin_q); anv->ctrl.admin_q = NULL; ret = -ENODEV; goto put_dev; diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index dc4220600585..0090dc0b3ae6 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -4851,6 +4851,7 @@ int nvme_alloc_admin_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set, out_cleanup_admin_q: blk_mq_destroy_queue(ctrl->admin_q); + blk_put_queue(ctrl->admin_q); out_free_tagset: blk_mq_free_tag_set(ctrl->admin_tagset); return ret; @@ -4860,8 +4861,11 @@ EXPORT_SYMBOL_GPL(nvme_alloc_admin_tag_set); void nvme_remove_admin_tag_set(struct nvme_ctrl *ctrl) { blk_mq_destroy_queue(ctrl->admin_q); - if (ctrl->ops->flags & NVME_F_FABRICS) + blk_put_queue(ctrl->admin_q); + if (ctrl->ops->flags & NVME_F_FABRICS) { blk_mq_destroy_queue(ctrl->fabrics_q); + blk_put_queue(ctrl->fabrics_q); + } blk_mq_free_tag_set(ctrl->admin_tagset); } EXPORT_SYMBOL_GPL(nvme_remove_admin_tag_set); @@ -4907,8 +4911,10 @@ 
EXPORT_SYMBOL_GPL(nvme_alloc_io_tag_set); void nvme_remove_io_tag_set(struct nvme_ctrl *ctrl) { - if (ctrl->ops->flags & NVME_F_FABRICS) + if (ctrl->ops->flags & NVME_F_FABRICS) { blk_mq_destroy_queue(ctrl->connect_q); + blk_put_queue(ctrl->connect_q); + } blk_mq_free_tag_set(ctrl->tagset); } EXPORT_SYMBOL_GPL(nvme_remove_io_tag_set); diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 31e577b01257..7d37c81fbe70 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1749,6 +1749,7 @@ static void nvme_dev_remove_admin(struct nvme_dev *dev) */ nvme_start_admin_queue(&dev->ctrl); blk_mq_destroy_queue(dev->ctrl.admin_q); + blk_put_queue(dev->ctrl.admin_q); blk_mq_free_tag_set(&dev->admin_tagset); } } diff --git a/drivers/scsi/scsi_sysfs.c b/drivers/scsi/scsi_sysfs.c index cac7c902cf70..579be9ffb10f 100644 --- a/drivers/scsi/scsi_sysfs.c +++ b/drivers/scsi/scsi_sysfs.c @@ -1486,6 +1486,7 @@ void __scsi_remove_device(struct scsi_device *sdev) mutex_unlock(&sdev->state_mutex); blk_mq_destroy_queue(sdev->request_queue); + blk_put_queue(sdev->request_queue); kref_put(&sdev->host->tagset_refcnt, scsi_mq_free_tags); cancel_work_sync(&sdev->requeue_work); diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c index 7256e6c43ca6..8ee0ac168ff2 100644 --- a/drivers/ufs/core/ufshcd.c +++ b/drivers/ufs/core/ufshcd.c @@ -9544,6 +9544,7 @@ void ufshcd_remove(struct ufs_hba *hba) ufshpb_remove(hba); ufs_sysfs_remove_nodes(hba->dev); blk_mq_destroy_queue(hba->tmf_queue); + blk_put_queue(hba->tmf_queue); blk_mq_free_tag_set(&hba->tmf_tag_set); scsi_remove_host(hba->host); /* disable interrupts */ @@ -9840,6 +9841,7 @@ int ufshcd_init(struct ufs_hba *hba, void __iomem *mmio_base, unsigned int irq) free_tmf_queue: blk_mq_destroy_queue(hba->tmf_queue); + blk_put_queue(hba->tmf_queue); free_tmf_tag_set: blk_mq_free_tag_set(&hba->tmf_tag_set); out_remove_scsi_host: -- cgit v1.2.3 From a55b70f1273a54b33482db8b2568da435fefd6c2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 25 Oct 2022 08:59:16 -0700 Subject: block: remove bio_start_io_acct_time bio_start_io_acct_time is not actually used anywhere, so remove it. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20221025155916.270303-1-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-core.c | 12 ------------ include/linux/blkdev.h | 1 - 2 files changed, 13 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 17667159482e..5d50dd16e2a5 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -945,18 +945,6 @@ unsigned long bdev_start_io_acct(struct block_device *bdev, } EXPORT_SYMBOL(bdev_start_io_acct); -/** - * bio_start_io_acct_time - start I/O accounting for bio based drivers - * @bio: bio to start account for - * @start_time: start time that should be passed back to bio_end_io_acct(). 
- */ -void bio_start_io_acct_time(struct bio *bio, unsigned long start_time) -{ - bdev_start_io_acct(bio->bi_bdev, bio_sectors(bio), - bio_op(bio), start_time); -} -EXPORT_SYMBOL_GPL(bio_start_io_acct_time); - /** * bio_start_io_acct - start I/O accounting for bio based drivers * @bio: bio to start account for diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 50e358a19d98..57ed49f20d2e 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1458,7 +1458,6 @@ unsigned long bdev_start_io_acct(struct block_device *bdev, void bdev_end_io_acct(struct block_device *bdev, enum req_op op, unsigned long start_time); -void bio_start_io_acct_time(struct bio *bio, unsigned long start_time); unsigned long bio_start_io_acct(struct bio *bio); void bio_end_io_acct_remapped(struct bio *bio, unsigned long start_time, struct block_device *orig_bdev); -- cgit v1.2.3 From aa261f20589d894eb08b9a2b11c9672c548387cd Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 25 Oct 2022 12:17:54 -0700 Subject: block: Constify most queue limits pointers Document which functions do not modify the queue limits. Reviewed-by: Ming Lei Cc: Christoph Hellwig Cc: Keith Busch Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20221025191755.1711437-3-bvanassche@acm.org Reviewed-by: Keith Busch Signed-off-by: Jens Axboe --- block/blk-map.c | 2 +- block/blk-merge.c | 29 ++++++++++++++++------------- block/blk-settings.c | 6 +++--- block/blk.h | 11 ++++++----- 4 files changed, 26 insertions(+), 22 deletions(-) (limited to 'block') diff --git a/block/blk-map.c b/block/blk-map.c index 34735626b00f..46688e70b141 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -555,7 +555,7 @@ static int blk_rq_map_user_bvec(struct request *rq, const struct iov_iter *iter) size_t nr_iter = iov_iter_count(iter); size_t nr_segs = iter->nr_segs; struct bio_vec *bvecs, *bvprvp = NULL; - struct queue_limits *lim = &q->limits; + const struct queue_limits *lim = &q->limits; unsigned int nsegs = 0, bytes = 0; struct bio *bio; size_t i; diff --git a/block/blk-merge.c b/block/blk-merge.c index ff04e9290715..58fdc3f8905b 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -100,13 +100,14 @@ static inline bool req_gap_front_merge(struct request *req, struct bio *bio) * is defined as 'unsigned int', meantime it has to be aligned to with the * logical block size, which is the minimum accepted unit by hardware. */ -static unsigned int bio_allowed_max_sectors(struct queue_limits *lim) +static unsigned int bio_allowed_max_sectors(const struct queue_limits *lim) { return round_down(UINT_MAX, lim->logical_block_size) >> SECTOR_SHIFT; } -static struct bio *bio_split_discard(struct bio *bio, struct queue_limits *lim, - unsigned *nsegs, struct bio_set *bs) +static struct bio *bio_split_discard(struct bio *bio, + const struct queue_limits *lim, + unsigned *nsegs, struct bio_set *bs) { unsigned int max_discard_sectors, granularity; sector_t tmp; @@ -146,7 +147,8 @@ static struct bio *bio_split_discard(struct bio *bio, struct queue_limits *lim, } static struct bio *bio_split_write_zeroes(struct bio *bio, - struct queue_limits *lim, unsigned *nsegs, struct bio_set *bs) + const struct queue_limits *lim, + unsigned *nsegs, struct bio_set *bs) { *nsegs = 0; if (!lim->max_write_zeroes_sectors) @@ -165,7 +167,7 @@ static struct bio *bio_split_write_zeroes(struct bio *bio, * aligned to a physical block boundary. 
*/ static inline unsigned get_max_io_size(struct bio *bio, - struct queue_limits *lim) + const struct queue_limits *lim) { unsigned pbs = lim->physical_block_size >> SECTOR_SHIFT; unsigned lbs = lim->logical_block_size >> SECTOR_SHIFT; @@ -184,7 +186,7 @@ static inline unsigned get_max_io_size(struct bio *bio, return max_sectors & ~(lbs - 1); } -static inline unsigned get_max_segment_size(struct queue_limits *lim, +static inline unsigned get_max_segment_size(const struct queue_limits *lim, struct page *start_page, unsigned long offset) { unsigned long mask = lim->seg_boundary_mask; @@ -219,9 +221,9 @@ static inline unsigned get_max_segment_size(struct queue_limits *lim, * *@nsegs segments and *@sectors sectors would make that bio unacceptable for * the block driver. */ -static bool bvec_split_segs(struct queue_limits *lim, const struct bio_vec *bv, - unsigned *nsegs, unsigned *bytes, unsigned max_segs, - unsigned max_bytes) +static bool bvec_split_segs(const struct queue_limits *lim, + const struct bio_vec *bv, unsigned *nsegs, unsigned *bytes, + unsigned max_segs, unsigned max_bytes) { unsigned max_len = min(max_bytes, UINT_MAX) - *bytes; unsigned len = min(bv->bv_len, max_len); @@ -267,7 +269,7 @@ static bool bvec_split_segs(struct queue_limits *lim, const struct bio_vec *bv, * responsible for ensuring that @bs is only destroyed after processing of the * split bio has finished. */ -static struct bio *bio_split_rw(struct bio *bio, struct queue_limits *lim, +static struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim, unsigned *segs, struct bio_set *bs, unsigned max_bytes) { struct bio_vec bv, bvprv, *bvprvp = NULL; @@ -331,8 +333,9 @@ split: * The split bio is allocated from @q->bio_split, which is provided by the * block layer. 
*/ -struct bio *__bio_split_to_limits(struct bio *bio, struct queue_limits *lim, - unsigned int *nr_segs) +struct bio *__bio_split_to_limits(struct bio *bio, + const struct queue_limits *lim, + unsigned int *nr_segs) { struct bio_set *bs = &bio->bi_bdev->bd_disk->bio_split; struct bio *split; @@ -377,7 +380,7 @@ struct bio *__bio_split_to_limits(struct bio *bio, struct queue_limits *lim, */ struct bio *bio_split_to_limits(struct bio *bio) { - struct queue_limits *lim = &bdev_get_queue(bio->bi_bdev)->limits; + const struct queue_limits *lim = &bdev_get_queue(bio->bi_bdev)->limits; unsigned int nr_segs; if (bio_may_exceed_limits(bio, lim)) diff --git a/block/blk-settings.c b/block/blk-settings.c index 8bb9eef5310e..1cba5c2a2796 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -481,7 +481,7 @@ void blk_queue_io_opt(struct request_queue *q, unsigned int opt) } EXPORT_SYMBOL(blk_queue_io_opt); -static int queue_limit_alignment_offset(struct queue_limits *lim, +static int queue_limit_alignment_offset(const struct queue_limits *lim, sector_t sector) { unsigned int granularity = max(lim->physical_block_size, lim->io_min); @@ -491,8 +491,8 @@ static int queue_limit_alignment_offset(struct queue_limits *lim, return (granularity + lim->alignment_offset - alignment) % granularity; } -static unsigned int queue_limit_discard_alignment(struct queue_limits *lim, - sector_t sector) +static unsigned int queue_limit_discard_alignment( + const struct queue_limits *lim, sector_t sector) { unsigned int alignment, granularity, offset; diff --git a/block/blk.h b/block/blk.h index d6ea0d1a6db0..7f9e089ab1f7 100644 --- a/block/blk.h +++ b/block/blk.h @@ -104,7 +104,7 @@ static inline bool biovec_phys_mergeable(struct request_queue *q, return true; } -static inline bool __bvec_gap_to_prev(struct queue_limits *lim, +static inline bool __bvec_gap_to_prev(const struct queue_limits *lim, struct bio_vec *bprv, unsigned int offset) { return (offset & lim->virt_boundary_mask) || @@ -115,7 +115,7 @@ static inline bool __bvec_gap_to_prev(struct queue_limits *lim, * Check if adding a bio_vec after bprv with offset would create a gap in * the SG list. Most drivers don't care about this, but some do. */ -static inline bool bvec_gap_to_prev(struct queue_limits *lim, +static inline bool bvec_gap_to_prev(const struct queue_limits *lim, struct bio_vec *bprv, unsigned int offset) { if (!lim->virt_boundary_mask) @@ -297,7 +297,7 @@ ssize_t part_timeout_store(struct device *, struct device_attribute *, const char *, size_t); static inline bool bio_may_exceed_limits(struct bio *bio, - struct queue_limits *lim) + const struct queue_limits *lim) { switch (bio_op(bio)) { case REQ_OP_DISCARD: @@ -320,8 +320,9 @@ static inline bool bio_may_exceed_limits(struct bio *bio, bio->bi_io_vec->bv_len + bio->bi_io_vec->bv_offset > PAGE_SIZE; } -struct bio *__bio_split_to_limits(struct bio *bio, struct queue_limits *lim, - unsigned int *nr_segs); +struct bio *__bio_split_to_limits(struct bio *bio, + const struct queue_limits *lim, + unsigned int *nr_segs); int ll_back_merge_fn(struct request *req, struct bio *bio, unsigned int nr_segs); bool blk_attempt_req_merge(struct request_queue *q, struct request *rq, -- cgit v1.2.3 From 95465318849f7525f4dce1b720e4627f48963327 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 25 Oct 2022 12:17:55 -0700 Subject: block: Micro-optimize get_max_segment_size() This patch removes a conditional jump from get_max_segment_size(). 
The x86-64 assembler code for this function without this patch is as follows:

206	return min_not_zero(mask - offset + 1,
0x0000000000000118 <+72>:	not    %rax
0x000000000000011b <+75>:	and    0x8(%r10),%rax
0x000000000000011f <+79>:	add    $0x1,%rax
0x0000000000000123 <+83>:	je     0x138
0x0000000000000125 <+85>:	cmp    %rdx,%rax
0x0000000000000128 <+88>:	mov    %rdx,%r12
0x000000000000012b <+91>:	cmovbe %rax,%r12
0x000000000000012f <+95>:	test   %rdx,%rdx
0x0000000000000132 <+98>:	mov    %eax,%edx
0x0000000000000134 <+100>:	cmovne %r12d,%edx

With this patch applied:

206	return min(mask - offset, (unsigned long)lim->max_segment_size - 1) + 1;
0x000000000000003f <+63>:	mov    0x28(%rdi),%ebp
0x0000000000000042 <+66>:	not    %rax
0x0000000000000045 <+69>:	and    0x8(%rdi),%rax
0x0000000000000049 <+73>:	sub    $0x1,%rbp
0x000000000000004d <+77>:	cmp    %rbp,%rax
0x0000000000000050 <+80>:	cmova  %rbp,%rax
0x0000000000000054 <+84>:	add    $0x1,%eax

Reviewed-by: Ming Lei Cc: Christoph Hellwig Cc: Keith Busch Cc: Steven Rostedt Cc: Guenter Roeck Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20221025191755.1711437-4-bvanassche@acm.org Reviewed-by: Keith Busch Signed-off-by: Jens Axboe --- block/blk-merge.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'block') diff --git a/block/blk-merge.c b/block/blk-merge.c index 58fdc3f8905b..35a8f75cc45d 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -186,6 +186,14 @@ static inline unsigned get_max_io_size(struct bio *bio, return max_sectors & ~(lbs - 1); } +/** + * get_max_segment_size() - maximum number of bytes to add as a single segment + * @lim: Request queue limits. + * @start_page: See below. + * @offset: Offset from @start_page where to add a segment. + * + * Returns the maximum number of bytes that can be added as a single segment. + */ static inline unsigned get_max_segment_size(const struct queue_limits *lim, struct page *start_page, unsigned long offset) { @@ -194,11 +202,10 @@ static inline unsigned get_max_segment_size(const struct queue_limits *lim, offset = mask & (page_to_phys(start_page) + offset); /* - * overflow may be triggered in case of zero page physical address - * on 32bit arch, use queue's max segment size when that happens. + * Prevent an overflow if mask = ULONG_MAX and offset = 0 by adding 1 + * after having calculated the minimum. */ - return min_not_zero(mask - offset + 1, - (unsigned long)lim->max_segment_size); + return min(mask - offset, (unsigned long)lim->max_segment_size - 1) + 1; } /** -- cgit v1.2.3 From 82c229476b8f6afd7e09bc4dc77d89dc19ff7688 Mon Sep 17 00:00:00 2001 From: David Jeffery Date: Wed, 26 Oct 2022 13:19:57 +0800 Subject: blk-mq: avoid double ->queue_rq() because of early timeout David Jeffery found a double ->queue_rq() issue. So far it can be triggered in VM use cases, because of long vmexit latency, preempt latency of the vCPU pthread, or a long page fault in the vCPU pthread: a block IO request can time out before being queued to hardware but after blk_mq_start_request() has been called during ->queue_rq(). The timeout handler may then handle the request by requeueing it, which causes a double ->queue_rq() and a kernel panic. So far it is the driver's responsibility to cover the race between timeout and completion, so in theory this is supposed to be solved in the driver, given that the driver has enough knowledge. But it is really a common problem: lots of drivers could have a similar issue, it could be hard to fix all affected drivers, and it isn't easy for a driver to handle the race anyway.
So David suggested solving this issue by draining in-progress ->queue_rq() calls, which is what this patch does. Cc: Stefan Hajnoczi Cc: Keith Busch Cc: virtualization@lists.linux-foundation.org Cc: Bart Van Assche Signed-off-by: David Jeffery Signed-off-by: Ming Lei Reviewed-by: Bart Van Assche Link: https://lore.kernel.org/r/20221026051957.358818-1-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-mq.c | 56 ++++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 44 insertions(+), 12 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 4cecf281123f..060c8cca4b24 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1523,7 +1523,13 @@ static void blk_mq_rq_timed_out(struct request *req) blk_add_timer(req); } -static bool blk_mq_req_expired(struct request *rq, unsigned long *next) +struct blk_expired_data { + bool has_timedout_rq; + unsigned long next; + unsigned long timeout_start; +}; + +static bool blk_mq_req_expired(struct request *rq, struct blk_expired_data *expired) { unsigned long deadline; @@ -1533,13 +1539,13 @@ static bool blk_mq_req_expired(struct request *rq, unsigned long *next) return false; deadline = READ_ONCE(rq->deadline); - if (time_after_eq(jiffies, deadline)) + if (time_after_eq(expired->timeout_start, deadline)) return true; - if (*next == 0) - *next = deadline; - else if (time_after(*next, deadline)) - *next = deadline; + if (expired->next == 0) + expired->next = deadline; + else if (time_after(expired->next, deadline)) + expired->next = deadline; return false; } @@ -1555,7 +1561,7 @@ void blk_mq_put_rq_ref(struct request *rq) static bool blk_mq_check_expired(struct request *rq, void *priv) { - unsigned long *next = priv; + struct blk_expired_data *expired = priv; /* * blk_mq_queue_tag_busy_iter() has locked the request, so it cannot @@ -1564,7 +1570,18 @@ static bool blk_mq_check_expired(struct request *rq, void *priv) * it was completed and reallocated as a new request after returning * from blk_mq_check_expired(). */ - if (blk_mq_req_expired(rq, next)) + if (blk_mq_req_expired(rq, expired)) { + expired->has_timedout_rq = true; + return false; + } + return true; +} + +static bool blk_mq_handle_expired(struct request *rq, void *priv) +{ + struct blk_expired_data *expired = priv; + + if (blk_mq_req_expired(rq, expired)) blk_mq_rq_timed_out(rq); return true; } @@ -1573,7 +1590,9 @@ static void blk_mq_timeout_work(struct work_struct *work) { struct request_queue *q = container_of(work, struct request_queue, timeout_work); - unsigned long next = 0; + struct blk_expired_data expired = { + .timeout_start = jiffies, + }; struct blk_mq_hw_ctx *hctx; unsigned long i; @@ -1593,10 +1612,23 @@ static void blk_mq_timeout_work(struct work_struct *work) if (!percpu_ref_tryget(&q->q_usage_counter)) return; - blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &next); + /* check if there is any timed-out request */ + blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &expired); + if (expired.has_timedout_rq) { + /* + * Before walking tags, we must ensure any submit started + * before the current time has finished. Since the submit + * uses srcu or rcu, wait for a synchronization point to + * ensure all running submits have finished + */ + blk_mq_wait_quiesce_done(q); + + expired.next = 0; + blk_mq_queue_tag_busy_iter(q, blk_mq_handle_expired, &expired); + } - if (next != 0) { - mod_timer(&q->timeout, next); + if (expired.next != 0) { + mod_timer(&q->timeout, expired.next); } else { /* * Request timeouts are handled as a forward rolling timer.
If -- cgit v1.2.3 From 219cf43c552a49a7710b7b341bf616682a2643f0 Mon Sep 17 00:00:00 2001 From: Jinlong Chen Date: Sun, 30 Oct 2022 17:47:30 +0800 Subject: blk-mq: move queue_is_mq out of blk_mq_cancel_work_sync The only caller that needs the queue_is_mq check is del_gendisk, so move the check into it. Signed-off-by: Jinlong Chen Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20221030094730.1275463-1-nickyc975@zju.edu.cn Signed-off-by: Jens Axboe --- block/blk-mq.c | 12 +++++------- block/genhd.c | 4 +++- 2 files changed, 8 insertions(+), 8 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 060c8cca4b24..bcb402f9bff6 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -4883,15 +4883,13 @@ EXPORT_SYMBOL(blk_mq_rq_cpu); void blk_mq_cancel_work_sync(struct request_queue *q) { - if (queue_is_mq(q)) { - struct blk_mq_hw_ctx *hctx; - unsigned long i; + struct blk_mq_hw_ctx *hctx; + unsigned long i; - cancel_delayed_work_sync(&q->requeue_work); + cancel_delayed_work_sync(&q->requeue_work); - queue_for_each_hw_ctx(q, hctx, i) - cancel_delayed_work_sync(&hctx->run_work); - } + queue_for_each_hw_ctx(q, hctx, i) + cancel_delayed_work_sync(&hctx->run_work); } static int __init blk_mq_init(void) diff --git a/block/genhd.c b/block/genhd.c index 17b33c62423d..493b93faee9c 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -638,7 +638,9 @@ void del_gendisk(struct gendisk *disk) blk_sync_queue(q); blk_flush_integrity(); - blk_mq_cancel_work_sync(q); + + if (queue_is_mq(q)) + blk_mq_cancel_work_sync(q); blk_mq_quiesce_queue(q); if (q->elevator) { -- cgit v1.2.3 From 56c1ee92246a5099a626b955dd7f6636cdce6f93 Mon Sep 17 00:00:00 2001 From: Jinlong Chen Date: Sun, 30 Oct 2022 16:32:12 +0800 Subject: blk-mq: remove redundant call to blk_freeze_queue_start in blk_mq_destroy_queue The calling relationship in blk_mq_destroy_queue() is as follows:

blk_mq_destroy_queue()
 ...
 -> blk_queue_start_drain()
    -> blk_freeze_queue_start()	<- called
 ...
 -> blk_freeze_queue()
    -> blk_freeze_queue_start()	<- called again
    -> blk_mq_freeze_queue_wait()
 ...

So there is a redundant call to blk_freeze_queue_start(). Replace blk_freeze_queue() with blk_mq_freeze_queue_wait() to avoid the redundant call. Signed-off-by: Jinlong Chen Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20221030083212.1251255-1-nickyc975@zju.edu.cn Signed-off-by: Jens Axboe --- block/blk-mq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index bcb402f9bff6..623e8a506539 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -4037,7 +4037,7 @@ void blk_mq_destroy_queue(struct request_queue *q) blk_queue_flag_set(QUEUE_FLAG_DYING, q); blk_queue_start_drain(q); - blk_freeze_queue(q); + blk_mq_freeze_queue_wait(q); blk_sync_queue(q); blk_mq_cancel_work_sync(q); -- cgit v1.2.3 From 3d89bd12d352e20f4f7c8f11a0f1a712b95a5295 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Fri, 16 Sep 2022 15:19:37 +0800 Subject: block, bfq: support to track if bfqq has pending requests If an entity belongs to a bfqq, entity->in_groups_with_pending_reqs is currently unused. This patch uses it to track whether the bfqq has pending requests, via the callers of weights_tree insertion and removal.
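In short, the flag now mirrors "this bfqq has requests pending completion": it is set when a queue with nothing dispatched becomes busy, and cleared when the queue leaves the busy list (or completes its last request) with nothing dispatched. Condensed from the patch below:

	/* in bfq_add_bfqq_busy() */
	if (!bfqq->dispatched)
		bfq_add_bfqq_in_groups_with_pending_reqs(bfqq);

	/* in bfq_del_bfqq_busy() */
	if (!bfqq->dispatched)
		bfq_del_bfqq_in_groups_with_pending_reqs(bfqq);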
Signed-off-by: Yu Kuai Reviewed-by: Jan Kara Acked-by: Paolo Valente Link: https://lore.kernel.org/r/20220916071942.214222-2-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 1 + block/bfq-iosched.h | 2 ++ block/bfq-wf2q.c | 24 ++++++++++++++++++++++-- 3 files changed, 25 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index ccfadc6f71bf..71fadc7ebe0d 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -6261,6 +6261,7 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) */ bfqq->budget_timeout = jiffies; + bfq_del_bfqq_in_groups_with_pending_reqs(bfqq); bfq_weights_tree_remove(bfqd, bfqq); } diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index 71f721670ab6..42c910f2d50d 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -1078,6 +1078,8 @@ void bfq_requeue_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, bool expiration); void bfq_del_bfqq_busy(struct bfq_queue *bfqq, bool expiration); void bfq_add_bfqq_busy(struct bfq_queue *bfqq); +void bfq_add_bfqq_in_groups_with_pending_reqs(struct bfq_queue *bfqq); +void bfq_del_bfqq_in_groups_with_pending_reqs(struct bfq_queue *bfqq); /* --------------- end of interface of B-WF2Q+ ---------------- */ diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c index 8fc3da4c23bb..bd8f4ed84848 100644 --- a/block/bfq-wf2q.c +++ b/block/bfq-wf2q.c @@ -1646,6 +1646,22 @@ void bfq_requeue_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, bfqq == bfqd->in_service_queue, expiration); } +void bfq_add_bfqq_in_groups_with_pending_reqs(struct bfq_queue *bfqq) +{ + struct bfq_entity *entity = &bfqq->entity; + + if (!entity->in_groups_with_pending_reqs) + entity->in_groups_with_pending_reqs = true; +} + +void bfq_del_bfqq_in_groups_with_pending_reqs(struct bfq_queue *bfqq) +{ + struct bfq_entity *entity = &bfqq->entity; + + if (entity->in_groups_with_pending_reqs) + entity->in_groups_with_pending_reqs = false; +} + /* * Called when the bfqq no longer has requests pending, remove it from * the service tree. As a special case, it can be invoked during an @@ -1668,8 +1684,10 @@ void bfq_del_bfqq_busy(struct bfq_queue *bfqq, bool expiration) bfq_deactivate_bfqq(bfqd, bfqq, true, expiration); - if (!bfqq->dispatched) + if (!bfqq->dispatched) { + bfq_del_bfqq_in_groups_with_pending_reqs(bfqq); bfq_weights_tree_remove(bfqd, bfqq); + } } /* @@ -1686,10 +1704,12 @@ void bfq_add_bfqq_busy(struct bfq_queue *bfqq) bfq_mark_bfqq_busy(bfqq); bfqd->busy_queues[bfqq->ioprio_class - 1]++; - if (!bfqq->dispatched) + if (!bfqq->dispatched) { + bfq_add_bfqq_in_groups_with_pending_reqs(bfqq); if (bfqq->wr_coeff == 1) bfq_weights_tree_add(bfqd, bfqq, &bfqd->queue_weights_tree); + } if (bfqq->wr_coeff > 1) bfqd->wr_busy_queues++; -- cgit v1.2.3 From 60a6e10c537a7459dd53882186bd16fff257fb03 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Fri, 16 Sep 2022 15:19:38 +0800 Subject: block, bfq: record how many queues have pending requests Prepare to refactor the counting of 'num_groups_with_pending_reqs'. Add a counter in bfq_group; update it while tracking whether a bfqq has pending requests, and when bfq_bfqq_move() is called.
Signed-off-by: Yu Kuai Reviewed-by: Jan Kara Acked-by: Paolo Valente Link: https://lore.kernel.org/r/20220916071942.214222-3-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe --- block/bfq-cgroup.c | 10 ++++++++++ block/bfq-iosched.h | 1 + block/bfq-wf2q.c | 12 ++++++++++-- 3 files changed, 21 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index 144bca006463..4c37398e0b99 100644 --- a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c @@ -552,6 +552,7 @@ static void bfq_pd_init(struct blkg_policy_data *pd) */ bfqg->bfqd = bfqd; bfqg->active_entities = 0; + bfqg->num_queues_with_pending_reqs = 0; bfqg->online = true; bfqg->rq_pos_tree = RB_ROOT; } @@ -641,6 +642,7 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, { struct bfq_entity *entity = &bfqq->entity; struct bfq_group *old_parent = bfqq_group(bfqq); + bool has_pending_reqs = false; /* * No point to move bfqq to the same group, which can happen when @@ -661,6 +663,11 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, */ bfqq->ref++; + if (entity->in_groups_with_pending_reqs) { + has_pending_reqs = true; + bfq_del_bfqq_in_groups_with_pending_reqs(bfqq); + } + /* If bfqq is empty, then bfq_bfqq_expire also invokes * bfq_del_bfqq_busy, thereby removing bfqq and its entity * from data structures related to current group. Otherwise we @@ -688,6 +695,9 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, /* pin down bfqg and its associated blkg */ bfqg_and_blkg_get(bfqg); + if (has_pending_reqs) + bfq_add_bfqq_in_groups_with_pending_reqs(bfqq); + if (bfq_bfqq_busy(bfqq)) { if (unlikely(!bfqd->nonrot_with_queueing)) bfq_pos_tree_add_move(bfqd, bfqq); diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index 42c910f2d50d..35b366eb702a 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -939,6 +939,7 @@ struct bfq_group { struct bfq_entity *my_entity; int active_entities; + int num_queues_with_pending_reqs; struct rb_root rq_pos_tree; diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c index bd8f4ed84848..5549ccf09cd2 100644 --- a/block/bfq-wf2q.c +++ b/block/bfq-wf2q.c @@ -1650,16 +1650,24 @@ void bfq_add_bfqq_in_groups_with_pending_reqs(struct bfq_queue *bfqq) { struct bfq_entity *entity = &bfqq->entity; - if (!entity->in_groups_with_pending_reqs) + if (!entity->in_groups_with_pending_reqs) { entity->in_groups_with_pending_reqs = true; +#ifdef CONFIG_BFQ_GROUP_IOSCHED + bfqq_group(bfqq)->num_queues_with_pending_reqs++; +#endif + } } void bfq_del_bfqq_in_groups_with_pending_reqs(struct bfq_queue *bfqq) { struct bfq_entity *entity = &bfqq->entity; - if (entity->in_groups_with_pending_reqs) + if (entity->in_groups_with_pending_reqs) { entity->in_groups_with_pending_reqs = false; +#ifdef CONFIG_BFQ_GROUP_IOSCHED + bfqq_group(bfqq)->num_queues_with_pending_reqs--; +#endif + } } /* -- cgit v1.2.3 From 71f8ca77cb8764d46f656b725999e8b8b1aec215 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Fri, 16 Sep 2022 15:19:39 +0800 Subject: block, bfq: refactor the counting of 'num_groups_with_pending_reqs' Currently, bfq can't handle sync io concurrently as long as it is not issued from the root group. This is because 'bfqd->num_groups_with_pending_reqs > 0' is always true in bfq_asymmetric_scenario(). The way that a bfqg is counted into 'num_groups_with_pending_reqs': Before this patch: 1) the root group will never be counted. 2) Count if bfqg or its child bfqgs have pending requests. 3) Don't count if bfqg and its child bfqgs have completed all the requests.
After this patch: 1) the root group is counted. 2) Count if bfqg has pending requests. 3) Don't count if bfqg has completed all the requests. With this change, the case where only one group is active can be detected, and the next patch will support concurrent sync io in that case. Signed-off-by: Yu Kuai Reviewed-by: Jan Kara Acked-by: Paolo Valente Link: https://lore.kernel.org/r/20220916071942.214222-4-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 42 ------------------------------------------ block/bfq-iosched.h | 18 +++++++++--------- block/bfq-wf2q.c | 23 ++++++++--------------- 3 files changed, 17 insertions(+), 66 deletions(-) (limited to 'block') diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 71fadc7ebe0d..d5febdb0a6c0 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -970,48 +970,6 @@ reset_entity_pointer: void bfq_weights_tree_remove(struct bfq_data *bfqd, struct bfq_queue *bfqq) { - struct bfq_entity *entity = bfqq->entity.parent; - - for_each_entity(entity) { - struct bfq_sched_data *sd = entity->my_sched_data; - - if (sd->next_in_service || sd->in_service_entity) { - /* - * entity is still active, because either - * next_in_service or in_service_entity is not - * NULL (see the comments on the definition of - * next_in_service for details on why - * in_service_entity must be checked too). - * - * As a consequence, its parent entities are - * active as well, and thus this loop must - * stop here. - */ - break; - } - - /* - * The decrement of num_groups_with_pending_reqs is - * not performed immediately upon the deactivation of - * entity, but it is delayed to when it also happens - * that the first leaf descendant bfqq of entity gets - * all its pending requests completed. The following - * instructions perform this delayed decrement, if - * needed. See the comments on - * num_groups_with_pending_reqs for details. - */ - if (entity->in_groups_with_pending_reqs) { - entity->in_groups_with_pending_reqs = false; - bfqd->num_groups_with_pending_reqs--; - } - } - - /* - * Next function is invoked last, because it causes bfqq to be - * freed if the following holds: bfqq is not in service and - * has no dispatched request. DO NOT use bfqq after the next - * function invocation. - */ __bfq_weights_tree_remove(bfqd, bfqq, &bfqd->queue_weights_tree); } diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index 35b366eb702a..5bfb7fb3dc2b 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -492,27 +492,27 @@ struct bfq_data { struct rb_root_cached queue_weights_tree; /* - * Number of groups with at least one descendant process that + * Number of groups with at least one process that * has at least one request waiting for completion. Note that * this accounts for also requests already dispatched, but not * yet completed. Therefore this number of groups may differ * (be larger) than the number of active groups, as a group is * considered active only if its corresponding entity has - * descendant queues with at least one request queued. This + * queues with at least one request queued. This * number is used to decide whether a scenario is symmetric. * For a detailed explanation see comments on the computation * of the variable asymmetric_scenario in the function * bfq_better_to_idle(). * * However, it is hard to compute this number exactly, for - * groups with multiple descendant processes. Consider a group - * that is inactive, i.e., that has no descendant process with + * groups with multiple processes.
Consider a group + * that is inactive, i.e., that has no process with * pending I/O inside BFQ queues. Then suppose that * num_groups_with_pending_reqs is still accounting for this - * group, because the group has descendant processes with some + * group, because the group has processes with some * I/O request still in flight. num_groups_with_pending_reqs * should be decremented when the in-flight request of the - * last descendant process is finally completed (assuming that + * last process is finally completed (assuming that * nothing else has changed for the group in the meantime, in * terms of composition of the group and active/inactive state of child * groups and processes). To accomplish this, an additional @@ -521,7 +521,7 @@ struct bfq_data { * we resort to the following tradeoff between simplicity and * accuracy: for an inactive group that is still counted in * num_groups_with_pending_reqs, we decrement - * num_groups_with_pending_reqs when the first descendant + * num_groups_with_pending_reqs when the first * process of the group remains with no request waiting for * completion. * @@ -529,12 +529,12 @@ struct bfq_data { * carefulness: to avoid multiple decrements, we flag a group, * more precisely an entity representing a group, as still * counted in num_groups_with_pending_reqs when it becomes - * inactive. Then, when the first descendant queue of the + * inactive. Then, when the first queue of the * entity remains with no request waiting for completion, * num_groups_with_pending_reqs is decremented, and this flag * is reset. After this flag is reset for the entity, * num_groups_with_pending_reqs won't be decremented any - * longer in case a new descendant queue of the entity remains + * longer in case a new queue of the entity remains * with no request waiting for completion. */ unsigned int num_groups_with_pending_reqs; diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c index 5549ccf09cd2..5e8224c96921 100644 --- a/block/bfq-wf2q.c +++ b/block/bfq-wf2q.c @@ -984,19 +984,6 @@ static void __bfq_activate_entity(struct bfq_entity *entity, entity->on_st_or_in_serv = true; } -#ifdef CONFIG_BFQ_GROUP_IOSCHED - if (!bfq_entity_to_bfqq(entity)) { /* bfq_group */ - struct bfq_group *bfqg = - container_of(entity, struct bfq_group, entity); - struct bfq_data *bfqd = bfqg->bfqd; - - if (!entity->in_groups_with_pending_reqs) { - entity->in_groups_with_pending_reqs = true; - bfqd->num_groups_with_pending_reqs++; - } - } -#endif - bfq_update_fin_time_enqueue(entity, st, backshifted); } @@ -1653,7 +1640,8 @@ void bfq_add_bfqq_in_groups_with_pending_reqs(struct bfq_queue *bfqq) if (!entity->in_groups_with_pending_reqs) { entity->in_groups_with_pending_reqs = true; #ifdef CONFIG_BFQ_GROUP_IOSCHED - bfqq_group(bfqq)->num_queues_with_pending_reqs++; + if (!(bfqq_group(bfqq)->num_queues_with_pending_reqs++)) + bfqq->bfqd->num_groups_with_pending_reqs++; #endif } } @@ -1665,7 +1653,8 @@ void bfq_del_bfqq_in_groups_with_pending_reqs(struct bfq_queue *bfqq) if (entity->in_groups_with_pending_reqs) { entity->in_groups_with_pending_reqs = false; #ifdef CONFIG_BFQ_GROUP_IOSCHED - bfqq_group(bfqq)->num_queues_with_pending_reqs--; + if (!(--bfqq_group(bfqq)->num_queues_with_pending_reqs)) + bfqq->bfqd->num_groups_with_pending_reqs--; #endif } } @@ -1694,6 +1683,10 @@ void bfq_del_bfqq_busy(struct bfq_queue *bfqq, bool expiration) if (!bfqq->dispatched) { bfq_del_bfqq_in_groups_with_pending_reqs(bfqq); + /* + * Next function is invoked last, because it causes bfqq to be + * freed. 
DO NOT use bfqq after the next function invocation. + */ bfq_weights_tree_remove(bfqd, bfqq); } } -- cgit v1.2.3 From eed3ecc991c90a4a0ce32ea2b35378dc351f012b Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Fri, 16 Sep 2022 15:19:40 +0800 Subject: block, bfq: do not idle if only one group is activated Now that the root group is counted in 'num_groups_with_pending_reqs', 'num_groups_with_pending_reqs > 0' is always true in bfq_asymmetric_scenario(). Thus change the condition to '> 1'. On the other hand, this change can enable concurrent sync io if only one group is activated. Signed-off-by: Yu Kuai Reviewed-by: Jan Kara Acked-by: Paolo Valente Link: https://lore.kernel.org/r/20220916071942.214222-5-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index d5febdb0a6c0..850becc2a522 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -820,7 +820,7 @@ bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq) * much easier to maintain the needed state: * 1) all active queues have the same weight, * 2) all active queues belong to the same I/O-priority class, - * 3) there are no active groups. + * 3) there is at most one active group. * In particular, the last condition is always true if hierarchical * support or the cgroups interface are not enabled, thus no state * needs to be maintained in this case. @@ -852,7 +852,7 @@ static bool bfq_asymmetric_scenario(struct bfq_data *bfqd, return varied_queue_weights || multiple_classes_busy #ifdef CONFIG_BFQ_GROUP_IOSCHED - || bfqd->num_groups_with_pending_reqs > 0 + || bfqd->num_groups_with_pending_reqs > 1 #endif ; } -- cgit v1.2.3 From afdba14612622ec75896e5646950b3562a9aadd3 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Fri, 16 Sep 2022 15:19:41 +0800 Subject: block, bfq: cleanup bfq_weights_tree add/remove apis The 'bfq_data' and 'rb_root_cached' can both be accessed through 'bfq_queue', thus only pass 'bfq_queue' as a parameter. Signed-off-by: Yu Kuai Reviewed-by: Jan Kara Acked-by: Paolo Valente Link: https://lore.kernel.org/r/20220916071942.214222-6-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 19 +++++++++---------- block/bfq-iosched.h | 10 +++------- block/bfq-wf2q.c | 18 ++++++------------ 3 files changed, 18 insertions(+), 29 deletions(-) (limited to 'block') diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 850becc2a522..11703f92c7fb 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -870,9 +870,9 @@ static bool bfq_asymmetric_scenario(struct bfq_data *bfqd, * In most scenarios, the rate at which nodes are created/destroyed * should be low too. */ -void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq, - struct rb_root_cached *root) +void bfq_weights_tree_add(struct bfq_queue *bfqq) { + struct rb_root_cached *root = &bfqq->bfqd->queue_weights_tree; struct bfq_entity *entity = &bfqq->entity; struct rb_node **new = &(root->rb_root.rb_node), *parent = NULL; bool leftmost = true; @@ -944,13 +944,14 @@ inc_counter: * See the comments to the function bfq_weights_tree_add() for considerations * about overhead.
*/ -void __bfq_weights_tree_remove(struct bfq_data *bfqd, - struct bfq_queue *bfqq, - struct rb_root_cached *root) +void __bfq_weights_tree_remove(struct bfq_queue *bfqq) { + struct rb_root_cached *root; + if (!bfqq->weight_counter) return; + root = &bfqq->bfqd->queue_weights_tree; bfqq->weight_counter->num_active--; if (bfqq->weight_counter->num_active > 0) goto reset_entity_pointer; @@ -967,11 +968,9 @@ reset_entity_pointer: * Invoke __bfq_weights_tree_remove on bfqq and decrement the number * of active groups for each queue's inactive parent entity. */ -void bfq_weights_tree_remove(struct bfq_data *bfqd, - struct bfq_queue *bfqq) +void bfq_weights_tree_remove(struct bfq_queue *bfqq) { - __bfq_weights_tree_remove(bfqd, bfqq, - &bfqd->queue_weights_tree); + __bfq_weights_tree_remove(bfqq); } /* @@ -6220,7 +6219,7 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) bfqq->budget_timeout = jiffies; bfq_del_bfqq_in_groups_with_pending_reqs(bfqq); - bfq_weights_tree_remove(bfqd, bfqq); + bfq_weights_tree_remove(bfqq); } now_ns = ktime_get_ns(); diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index 5bfb7fb3dc2b..f56147e12501 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -969,13 +969,9 @@ struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync); void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, bool is_sync); struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic); void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq); -void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq, - struct rb_root_cached *root); -void __bfq_weights_tree_remove(struct bfq_data *bfqd, - struct bfq_queue *bfqq, - struct rb_root_cached *root); -void bfq_weights_tree_remove(struct bfq_data *bfqd, - struct bfq_queue *bfqq); +void bfq_weights_tree_add(struct bfq_queue *bfqq); +void __bfq_weights_tree_remove(struct bfq_queue *bfqq); +void bfq_weights_tree_remove(struct bfq_queue *bfqq); void bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq, bool compensate, enum bfqq_expiration reason); void bfq_put_queue(struct bfq_queue *bfqq); diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c index 5e8224c96921..124aaea6196e 100644 --- a/block/bfq-wf2q.c +++ b/block/bfq-wf2q.c @@ -707,7 +707,6 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); unsigned int prev_weight, new_weight; struct bfq_data *bfqd = NULL; - struct rb_root_cached *root; #ifdef CONFIG_BFQ_GROUP_IOSCHED struct bfq_sched_data *sd; struct bfq_group *bfqg; @@ -770,19 +769,15 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, * queue, remove the entity from its old weight counter (if * there is a counter associated with the entity). */ - if (prev_weight != new_weight && bfqq) { - root = &bfqd->queue_weights_tree; - __bfq_weights_tree_remove(bfqd, bfqq, root); - } + if (prev_weight != new_weight && bfqq) + __bfq_weights_tree_remove(bfqq); entity->weight = new_weight; /* * Add the entity, if it is not a weight-raised queue, * to the counter associated with its new weight. */ - if (prev_weight != new_weight && bfqq && bfqq->wr_coeff == 1) { - /* If we get here, root has been initialized. 
*/ - bfq_weights_tree_add(bfqd, bfqq, root); - } + if (prev_weight != new_weight && bfqq && bfqq->wr_coeff == 1) + bfq_weights_tree_add(bfqq); new_st->wsum += entity->weight; @@ -1687,7 +1682,7 @@ void bfq_del_bfqq_busy(struct bfq_queue *bfqq, bool expiration) * Next function is invoked last, because it causes bfqq to be * freed. DO NOT use bfqq after the next function invocation. */ - bfq_weights_tree_remove(bfqd, bfqq); + bfq_weights_tree_remove(bfqq); } } @@ -1708,8 +1703,7 @@ void bfq_add_bfqq_busy(struct bfq_queue *bfqq) if (!bfqq->dispatched) { bfq_add_bfqq_in_groups_with_pending_reqs(bfqq); if (bfqq->wr_coeff == 1) - bfq_weights_tree_add(bfqd, bfqq, - &bfqd->queue_weights_tree); + bfq_weights_tree_add(bfqq); } if (bfqq->wr_coeff > 1) -- cgit v1.2.3 From eb5bca73655cb6aa3bb608253e1e47283240c933 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Fri, 16 Sep 2022 15:19:42 +0800 Subject: block, bfq: cleanup __bfq_weights_tree_remove() It's now the same as bfq_weights_tree_remove(). Signed-off-by: Yu Kuai Reviewed-by: Jan Kara Acked-by: Paolo Valente Link: https://lore.kernel.org/r/20220916071942.214222-7-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 11 +---------- block/bfq-iosched.h | 1 - block/bfq-wf2q.c | 2 +- 3 files changed, 2 insertions(+), 12 deletions(-) (limited to 'block') diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 11703f92c7fb..42aa5fc7f17b 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -944,7 +944,7 @@ inc_counter: * See the comments to the function bfq_weights_tree_add() for considerations * about overhead. */ -void __bfq_weights_tree_remove(struct bfq_queue *bfqq) +void bfq_weights_tree_remove(struct bfq_queue *bfqq) { struct rb_root_cached *root; @@ -964,15 +964,6 @@ reset_entity_pointer: bfq_put_queue(bfqq); } -/* - * Invoke __bfq_weights_tree_remove on bfqq and decrement the number - * of active groups for each queue's inactive parent entity. - */ -void bfq_weights_tree_remove(struct bfq_queue *bfqq) -{ - __bfq_weights_tree_remove(bfqq); -} - /* * Return expired entry, or NULL to just start from scratch in rbtree. */ diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index f56147e12501..76363841d8ff 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -970,7 +970,6 @@ void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, bool is_sync); struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic); void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq); void bfq_weights_tree_add(struct bfq_queue *bfqq); -void __bfq_weights_tree_remove(struct bfq_queue *bfqq); void bfq_weights_tree_remove(struct bfq_queue *bfqq); void bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq, bool compensate, enum bfqq_expiration reason); diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c index 124aaea6196e..5a02cb94d86e 100644 --- a/block/bfq-wf2q.c +++ b/block/bfq-wf2q.c @@ -770,7 +770,7 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, * there is a counter associated with the entity).
*/ if (prev_weight != new_weight && bfqq) - __bfq_weights_tree_remove(bfqq); + bfq_weights_tree_remove(bfqq); entity->weight = new_weight; /* * Add the entity, if it is not a weight-raised queue, -- cgit v1.2.3 From 81eaca442ea962c43bdb1e9cbb9eddb41b97491d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 30 Oct 2022 11:07:09 +0100 Subject: block: cleanup elevator_get Do the request_module and repeated lookup in the only caller that cares, pick a saner name that explains that we are actually doing a lookup, and use a saner calling convention that passes the queue first. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20221030100714.876891-3-hch@lst.de Signed-off-by: Jens Axboe --- block/elevator.c | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) (limited to 'block') diff --git a/block/elevator.c b/block/elevator.c index d26aa787e29f..9793af0a75a7 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -132,24 +132,15 @@ static struct elevator_type *elevator_find(const char *name, return NULL; } -static struct elevator_type *elevator_get(struct request_queue *q, - const char *name, bool try_loading) +static struct elevator_type *elevator_find_get(struct request_queue *q, + const char *name) { struct elevator_type *e; spin_lock(&elv_list_lock); - e = elevator_find(name, q->required_elevator_features); - if (!e && try_loading) { - spin_unlock(&elv_list_lock); - request_module("%s-iosched", name); - spin_lock(&elv_list_lock); - e = elevator_find(name, q->required_elevator_features); - } - if (e && !elevator_tryget(e)) e = NULL; - spin_unlock(&elv_list_lock); return e; } @@ -634,7 +625,7 @@ static struct elevator_type *elevator_get_default(struct request_queue *q) !blk_mq_is_shared_tags(q->tag_set->flags)) return NULL; - return elevator_get(q, "mq-deadline", false); + return elevator_find_get(q, "mq-deadline"); } /* @@ -757,9 +748,13 @@ static int elevator_change(struct request_queue *q, const char *elevator_name) if (q->elevator && elevator_match(q->elevator->type, elevator_name, 0)) return 0; - e = elevator_get(q, elevator_name, true); - if (!e) - return -EINVAL; + e = elevator_find_get(q, elevator_name); + if (!e) { + request_module("%s-iosched", elevator_name); + e = elevator_find_get(q, elevator_name); + if (!e) + return -EINVAL; + } ret = elevator_switch(q, e); elevator_put(e); return ret; -- cgit v1.2.3 From aae2a643f508d768b65e59da447f3b11688db3cf Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 30 Oct 2022 11:07:10 +0100 Subject: block: exit elv_iosched_show early when I/O schedulers are not supported If the tag_set has the BLK_MQ_F_NO_SCHED flag set, we will never show any scheduler, so exit early.
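The early exit relies on the same predicate the store side uses; roughly (a paraphrase of elv_support_iosched() as it stands at this point in the series, shown for orientation rather than as new code):

static inline bool elv_support_iosched(struct request_queue *q)
{
	/* No scheduler for bio-based queues, or for tag sets that opt out. */
	if (!queue_is_mq(q) ||
	    (q->tag_set && (q->tag_set->flags & BLK_MQ_F_NO_SCHED)))
		return false;
	return true;
}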
Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20221030100714.876891-4-hch@lst.de Signed-off-by: Jens Axboe --- block/elevator.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/elevator.c b/block/elevator.c index 9793af0a75a7..fa58905e6b3e 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -783,7 +783,7 @@ ssize_t elv_iosched_show(struct request_queue *q, char *name) struct elevator_type *__e; int len = 0; - if (!queue_is_mq(q)) + if (!elv_support_iosched(q)) return sprintf(name, "none\n"); if (!q->elevator) @@ -797,8 +797,7 @@ ssize_t elv_iosched_show(struct request_queue *q, char *name) len += sprintf(name+len, "[%s] ", elv->elevator_name); continue; } - if (elv_support_iosched(q) && - elevator_match(__e, __e->elevator_name, + if (elevator_match(__e, __e->elevator_name, q->required_elevator_features)) len += sprintf(name+len, "%s ", __e->elevator_name); } -- cgit v1.2.3 From 16095af2fa2c3089ff1162e677d6596772f6f478 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 30 Oct 2022 11:07:11 +0100 Subject: block: cleanup the variable naming in elv_iosched_store Use eq for the elevator_queue as done elsewhere. This frees e to be used for the loop iterator instead of the odd __ prefix. In addition rename elv to cur to make it more clear it is the currently selected elevator. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20221030100714.876891-5-hch@lst.de Signed-off-by: Jens Axboe --- block/elevator.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'block') diff --git a/block/elevator.c b/block/elevator.c index fa58905e6b3e..72346bf2f997 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -778,9 +778,8 @@ ssize_t elv_iosched_store(struct request_queue *q, const char *buf, ssize_t elv_iosched_show(struct request_queue *q, char *name) { - struct elevator_queue *e = q->elevator; - struct elevator_type *elv = NULL; - struct elevator_type *__e; + struct elevator_queue *eq = q->elevator; + struct elevator_type *cur = NULL, *e; int len = 0; if (!elv_support_iosched(q)) @@ -789,17 +788,17 @@ ssize_t elv_iosched_show(struct request_queue *q, char *name) if (!q->elevator) len += sprintf(name+len, "[none] "); else - elv = e->type; + cur = eq->type; spin_lock(&elv_list_lock); - list_for_each_entry(__e, &elv_list, list) { - if (elv && elevator_match(elv, __e->elevator_name, 0)) { - len += sprintf(name+len, "[%s] ", elv->elevator_name); + list_for_each_entry(e, &elv_list, list) { + if (cur && elevator_match(cur, e->elevator_name, 0)) { + len += sprintf(name+len, "[%s] ", cur->elevator_name); continue; } - if (elevator_match(__e, __e->elevator_name, + if (elevator_match(e, e->elevator_name, q->required_elevator_features)) - len += sprintf(name+len, "%s ", __e->elevator_name); + len += sprintf(name+len, "%s ", e->elevator_name); } spin_unlock(&elv_list_lock); -- cgit v1.2.3 From 2eef17a209ab4d77923222045d462d379d6ef692 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 30 Oct 2022 11:07:12 +0100 Subject: block: simplify the check for the current elevator in elv_iosched_show Just compare the pointers instead of using the string based elevator_match. 
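Pointer equality is sufficient here because registered elevator types are singletons: elv_register() refuses a second type with an existing name, as the duplicate-name guard visible in a later hunk of this series shows. Schematically (a sketch; the list_add_tail() line is assumed from the surrounding function and does not appear in this document):

/* Registration sketch: a duplicate name is rejected with -EBUSY, so each
 * elevator_type appears on elv_list at most once and "e == cur" identifies
 * the current scheduler exactly. */
spin_lock(&elv_list_lock);
if (__elevator_find(e->elevator_name)) {
	spin_unlock(&elv_list_lock);
	kmem_cache_destroy(e->icq_cache);
	return -EBUSY;
}
list_add_tail(&e->list, &elv_list);
spin_unlock(&elv_list_lock);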
Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20221030100714.876891-6-hch@lst.de Signed-off-by: Jens Axboe --- block/elevator.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/elevator.c b/block/elevator.c index 72346bf2f997..9a13d1959b76 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -792,7 +792,7 @@ ssize_t elv_iosched_show(struct request_queue *q, char *name) spin_lock(&elv_list_lock); list_for_each_entry(e, &elv_list, list) { - if (cur && elevator_match(cur, e->elevator_name, 0)) { + if (e == cur) { len += sprintf(name+len, "[%s] ", cur->elevator_name); continue; } -- cgit v1.2.3 From ffb86425ee2cadbe573c483b789aab2dd57aeb7b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 30 Oct 2022 11:07:13 +0100 Subject: block: don't check for required features in elevator_match Checking for the required features in the callers simplifies the code quite a bit, so do that. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20221030100714.876891-7-hch@lst.de [axboe: adjust for dropping patch 1, use __elevator_find()] Signed-off-by: Jens Axboe --- block/elevator.c | 51 ++++++++++++++++----------------------------------- 1 file changed, 16 insertions(+), 35 deletions(-) (limited to 'block') diff --git a/block/elevator.c b/block/elevator.c index 9a13d1959b76..5781f5d50bb8 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -83,10 +83,11 @@ bool elv_bio_merge_ok(struct request *rq, struct bio *bio) } EXPORT_SYMBOL(elv_bio_merge_ok); -static inline bool elv_support_features(unsigned int elv_features, - unsigned int required_features) +static inline bool elv_support_features(struct request_queue *q, + const struct elevator_type *e) { - return (required_features & elv_features) == required_features; + return (q->required_elevator_features & e->elevator_features) == + q->required_elevator_features; } /** @@ -98,37 +99,19 @@ static inline bool elv_support_features(unsigned int elv_features, * Return true if the elevator @e name matches @name and if @e provides all * the features specified by @required_features. */ -static bool elevator_match(const struct elevator_type *e, const char *name, - unsigned int required_features) +static bool elevator_match(const struct elevator_type *e, const char *name) { - if (!elv_support_features(e->elevator_features, required_features)) - return false; - if (!strcmp(e->elevator_name, name)) - return true; - if (e->elevator_alias && !strcmp(e->elevator_alias, name)) - return true; - - return false; + return !strcmp(e->elevator_name, name) || + (e->elevator_alias && !strcmp(e->elevator_alias, name)); } -/** - * elevator_find - Find an elevator - * @name: Name of the elevator to find - * @required_features: Features that the elevator must provide - * - * Return the first registered scheduler with name @name and supporting the - * features @required_features and NULL otherwise. 
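The feature check that moves into the callers is the standard mask-subset idiom; a small self-contained worked example (bit values invented for illustration):

/* Every feature bit the queue requires must be present in the
 * scheduler's advertised feature mask. */
static bool supports(unsigned int elv_features, unsigned int required)
{
	return (required & elv_features) == required;
}

/*
 * supports(0x3, 0x1) -> true:  the single required bit is advertised.
 * supports(0x1, 0x3) -> false: bit 0x2 is required but missing.
 */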
- */ -static struct elevator_type *elevator_find(const char *name, - unsigned int required_features) +static struct elevator_type *__elevator_find(const char *name) { struct elevator_type *e; - list_for_each_entry(e, &elv_list, list) { - if (elevator_match(e, name, required_features)) + list_for_each_entry(e, &elv_list, list) + if (elevator_match(e, name)) return e; - } - return NULL; } @@ -138,8 +121,8 @@ static struct elevator_type *elevator_find_get(struct request_queue *q, struct elevator_type *e; spin_lock(&elv_list_lock); - e = elevator_find(name, q->required_elevator_features); - if (e && !elevator_tryget(e)) + e = __elevator_find(name); + if (e && (!elv_support_features(q, e) || !elevator_tryget(e))) e = NULL; spin_unlock(&elv_list_lock); return e; @@ -538,7 +521,7 @@ int elv_register(struct elevator_type *e) /* register, don't allow duplicate names */ spin_lock(&elv_list_lock); - if (elevator_find(e->elevator_name, 0)) { + if (__elevator_find(e->elevator_name)) { spin_unlock(&elv_list_lock); kmem_cache_destroy(e->icq_cache); return -EBUSY; @@ -639,8 +622,7 @@ static struct elevator_type *elevator_get_by_features(struct request_queue *q) spin_lock(&elv_list_lock); list_for_each_entry(e, &elv_list, list) { - if (elv_support_features(e->elevator_features, - q->required_elevator_features)) { + if (elv_support_features(q, e)) { found = e; break; } @@ -745,7 +727,7 @@ static int elevator_change(struct request_queue *q, const char *elevator_name) return elevator_switch(q, NULL); } - if (q->elevator && elevator_match(q->elevator->type, elevator_name, 0)) + if (q->elevator && elevator_match(q->elevator->type, elevator_name)) return 0; e = elevator_find_get(q, elevator_name); @@ -796,8 +778,7 @@ ssize_t elv_iosched_show(struct request_queue *q, char *name) len += sprintf(name+len, "[%s] ", cur->elevator_name); continue; } - if (elevator_match(e, e->elevator_name, - q->required_elevator_features)) + if (elv_support_features(q, e)) len += sprintf(name+len, "%s ", e->elevator_name); } spin_unlock(&elv_list_lock); -- cgit v1.2.3 From 64b36075eb0e50af6f59047b5f698a9f2bb2b4fd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 30 Oct 2022 11:07:14 +0100 Subject: block: split elevator_switch Split an elevator_disable helper from elevator_switch for the case where we want to switch to no scheduler at all. This includes removing the pointless elevator_switch_mq helper and removing the switch to no schedule logic from blk_mq_init_sched. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20221030100714.876891-8-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-mq-sched.c | 7 ----- block/blk-mq.c | 2 +- block/blk.h | 1 + block/elevator.c | 77 ++++++++++++++++++++++++++-------------------------- 4 files changed, 40 insertions(+), 47 deletions(-) (limited to 'block') diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 68227240fdea..23d1a90fec42 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -564,13 +564,6 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e) unsigned long i; int ret; - if (!e) { - blk_queue_flag_clear(QUEUE_FLAG_SQ_SCHED, q); - q->elevator = NULL; - q->nr_requests = q->tag_set->queue_depth; - return 0; - } - /* * Default to double of smaller one between hw queue_depth and 128, * since we don't split into sync/async like the old code did. 
diff --git a/block/blk-mq.c b/block/blk-mq.c index 623e8a506539..a78538586a40 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -4588,7 +4588,7 @@ static bool blk_mq_elv_switch_none(struct list_head *head, __elevator_get(qe->type); qe->type = q->elevator->type; list_add(&qe->node, head); - elevator_switch(q, NULL); + elevator_disable(q); mutex_unlock(&q->sysfs_lock); return true; diff --git a/block/blk.h b/block/blk.h index 7f9e089ab1f7..f1398fb96cec 100644 --- a/block/blk.h +++ b/block/blk.h @@ -278,6 +278,7 @@ bool blk_bio_list_merge(struct request_queue *q, struct list_head *list, void blk_insert_flush(struct request *rq); int elevator_switch(struct request_queue *q, struct elevator_type *new_e); +void elevator_disable(struct request_queue *q); void elevator_exit(struct request_queue *q); int elv_register_queue(struct request_queue *q, bool uevent); void elv_unregister_queue(struct request_queue *q); diff --git a/block/elevator.c b/block/elevator.c index 5781f5d50bb8..800e0038be0d 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -554,39 +554,6 @@ void elv_unregister(struct elevator_type *e) } EXPORT_SYMBOL_GPL(elv_unregister); -static int elevator_switch_mq(struct request_queue *q, - struct elevator_type *new_e) -{ - int ret; - - lockdep_assert_held(&q->sysfs_lock); - - if (q->elevator) { - elv_unregister_queue(q); - elevator_exit(q); - } - - ret = blk_mq_init_sched(q, new_e); - if (ret) - goto out; - - if (new_e) { - ret = elv_register_queue(q, true); - if (ret) { - elevator_exit(q); - goto out; - } - } - - if (new_e) - blk_add_trace_msg(q, "elv switch: %s", new_e->elevator_name); - else - blk_add_trace_msg(q, "elv switch: none"); - -out: - return ret; -} - static inline bool elv_support_iosched(struct request_queue *q) { if (!queue_is_mq(q) || @@ -691,19 +658,51 @@ void elevator_init_mq(struct request_queue *q) */ int elevator_switch(struct request_queue *q, struct elevator_type *new_e) { - int err; + int ret; lockdep_assert_held(&q->sysfs_lock); blk_mq_freeze_queue(q); blk_mq_quiesce_queue(q); - err = elevator_switch_mq(q, new_e); + if (q->elevator) { + elv_unregister_queue(q); + elevator_exit(q); + } + ret = blk_mq_init_sched(q, new_e); + if (ret) + goto out_unfreeze; + + ret = elv_register_queue(q, true); + if (ret) { + elevator_exit(q); + goto out_unfreeze; + } + blk_add_trace_msg(q, "elv switch: %s", new_e->elevator_name); + +out_unfreeze: blk_mq_unquiesce_queue(q); blk_mq_unfreeze_queue(q); + return ret; +} + +void elevator_disable(struct request_queue *q) +{ + lockdep_assert_held(&q->sysfs_lock); - return err; + blk_mq_freeze_queue(q); + blk_mq_quiesce_queue(q); + + elv_unregister_queue(q); + elevator_exit(q); + blk_queue_flag_clear(QUEUE_FLAG_SQ_SCHED, q); + q->elevator = NULL; + q->nr_requests = q->tag_set->queue_depth; + blk_add_trace_msg(q, "elv switch: none"); + + blk_mq_unquiesce_queue(q); + blk_mq_unfreeze_queue(q); } /* @@ -722,9 +721,9 @@ static int elevator_change(struct request_queue *q, const char *elevator_name) * Special case for mq, turn off scheduling */ if (!strncmp(elevator_name, "none", 4)) { - if (!q->elevator) - return 0; - return elevator_switch(q, NULL); + if (q->elevator) + elevator_disable(q); + return 0; } if (q->elevator && elevator_match(q->elevator->type, elevator_name)) -- cgit v1.2.3 From db5896e9cf93f119c0b181c5e6b473d8bf0302e5 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Tue, 18 Oct 2022 19:12:38 +0800 Subject: block: Remove redundant parent blkcg_gp check in check_scale_change Function blkcg_iolatency_throttle will make sure 
blkg->parent is not NULL before it calls check_scale_change. And check_scale_change is only called in blkcg_iolatency_throttle. Signed-off-by: Kemeng Shi Reviewed-by: Josef Bacik Link: https://lore.kernel.org/r/20221018111240.22612-2-shikemeng@huawei.com Signed-off-by: Jens Axboe --- block/blk-iolatency.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'block') diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c index 571fa95aafe9..b24d7b788ba3 100644 --- a/block/blk-iolatency.c +++ b/block/blk-iolatency.c @@ -403,9 +403,6 @@ static void check_scale_change(struct iolatency_grp *iolat) u64 scale_lat; int direction = 0; - if (lat_to_blkg(iolat)->parent == NULL) - return; - parent = blkg_to_lat(lat_to_blkg(iolat)->parent); if (!parent) return; -- cgit v1.2.3 From 6891f968985f0569eaf56430518895b3c2885da1 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Tue, 18 Oct 2022 19:12:39 +0800 Subject: block: Correct comment for scale_cookie_change Default queue depth of iolatency_grp is unlimited, so we scale down quickly (once by half) in scale_cookie_change. Remove the "subtract 1/16th" part, which is not accurate, and add the actual way we scale down. Signed-off-by: Kemeng Shi Link: https://lore.kernel.org/r/20221018111240.22612-3-shikemeng@huawei.com Signed-off-by: Jens Axboe --- block/blk-iolatency.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c index b24d7b788ba3..2c574f98c8d1 100644 --- a/block/blk-iolatency.c +++ b/block/blk-iolatency.c @@ -364,9 +364,11 @@ static void scale_cookie_change(struct blk_iolatency *blkiolat, } /* - * Change the queue depth of the iolatency_grp. We add/subtract 1/16th of the + * Change the queue depth of the iolatency_grp. We add 1/16th of the * queue depth at a time so we don't get wild swings and hopefully dial in to - * fairer distribution of the overall queue depth. + * fairer distribution of the overall queue depth. We halve the queue depth + * at a time so we can scale down queue depth quickly from default unlimited + * to target. */ static void scale_change(struct iolatency_grp *iolat, bool up) { -- cgit v1.2.3 From dc572f418a14b18270676c4a7d23a3c8a5544abc Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Tue, 18 Oct 2022 19:12:40 +0800 Subject: block: Replace struct rq_depth with unsigned int in struct iolatency_grp We only need a max queue depth for every iolatency_grp to limit the inflight io number. Replace struct rq_depth with unsigned int to simplify "struct iolatency_grp" and save memory.
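All that blk-iolatency ever consulted in rq_depth is a single inflight ceiling, which is what the throttle path checks; a reduced sketch of that relationship (illustrative names, not the kernel structures):

#include <limits.h>

/* Sketch: one ceiling per group is all the throttle path needs. */
struct iolatency_sketch {
	unsigned int max_depth;		/* UINT_MAX means "unlimited" */
};

/* Mirrors the rq_wait_inc_below() check on the acquire path: admit a
 * new request only while the inflight count stays below the ceiling. */
static int may_issue(const struct iolatency_sketch *iolat,
		     unsigned int inflight)
{
	return inflight < iolat->max_depth;
}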
Signed-off-by: Kemeng Shi Reviewed-by: Josef Bacik Link: https://lore.kernel.org/r/20221018111240.22612-4-shikemeng@huawei.com Signed-off-by: Jens Axboe --- block/blk-iolatency.c | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) (limited to 'block') diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c index 2c574f98c8d1..778a0057193e 100644 --- a/block/blk-iolatency.c +++ b/block/blk-iolatency.c @@ -141,7 +141,7 @@ struct iolatency_grp { struct latency_stat __percpu *stats; struct latency_stat cur_stat; struct blk_iolatency *blkiolat; - struct rq_depth rq_depth; + unsigned int max_depth; struct rq_wait rq_wait; atomic64_t window_start; atomic_t scale_cookie; @@ -280,7 +280,7 @@ static void iolat_cleanup_cb(struct rq_wait *rqw, void *private_data) static bool iolat_acquire_inflight(struct rq_wait *rqw, void *private_data) { struct iolatency_grp *iolat = private_data; - return rq_wait_inc_below(rqw, iolat->rq_depth.max_depth); + return rq_wait_inc_below(rqw, iolat->max_depth); } static void __blkcg_iolatency_throttle(struct rq_qos *rqos, @@ -374,7 +374,7 @@ static void scale_change(struct iolatency_grp *iolat, bool up) { unsigned long qd = iolat->blkiolat->rqos.q->nr_requests; unsigned long scale = scale_amount(qd, up); - unsigned long old = iolat->rq_depth.max_depth; + unsigned long old = iolat->max_depth; if (old > qd) old = qd; @@ -386,12 +386,12 @@ static void scale_change(struct iolatency_grp *iolat, bool up) if (old < qd) { old += scale; old = min(old, qd); - iolat->rq_depth.max_depth = old; + iolat->max_depth = old; wake_up_all(&iolat->rq_wait.wait); } } else { old >>= 1; - iolat->rq_depth.max_depth = max(old, 1UL); + iolat->max_depth = max(old, 1UL); } } @@ -444,7 +444,7 @@ static void check_scale_change(struct iolatency_grp *iolat) } /* We're as low as we can go. */ - if (iolat->rq_depth.max_depth == 1 && direction < 0) { + if (iolat->max_depth == 1 && direction < 0) { blkcg_use_delay(lat_to_blkg(iolat)); return; } @@ -452,7 +452,7 @@ static void check_scale_change(struct iolatency_grp *iolat) /* We're back to the default cookie, unthrottle all the things. */ if (cur_cookie == DEFAULT_SCALE_COOKIE) { blkcg_clear_delay(lat_to_blkg(iolat)); - iolat->rq_depth.max_depth = UINT_MAX; + iolat->max_depth = UINT_MAX; wake_up_all(&iolat->rq_wait.wait); return; } @@ -507,7 +507,7 @@ static void iolatency_record_time(struct iolatency_grp *iolat, * We don't want to count issue_as_root bio's in the cgroups latency * statistics as it could skew the numbers downwards. 
*/ - if (unlikely(issue_as_root && iolat->rq_depth.max_depth != UINT_MAX)) { + if (unlikely(issue_as_root && iolat->max_depth != UINT_MAX)) { u64 sub = iolat->min_lat_nsec; if (req_time < sub) blkcg_add_delay(lat_to_blkg(iolat), now, sub - req_time); @@ -919,7 +919,7 @@ static void iolatency_ssd_stat(struct iolatency_grp *iolat, struct seq_file *s) } preempt_enable(); - if (iolat->rq_depth.max_depth == UINT_MAX) + if (iolat->max_depth == UINT_MAX) seq_printf(s, " missed=%llu total=%llu depth=max", (unsigned long long)stat.ps.missed, (unsigned long long)stat.ps.total); @@ -927,7 +927,7 @@ static void iolatency_ssd_stat(struct iolatency_grp *iolat, struct seq_file *s) seq_printf(s, " missed=%llu total=%llu depth=%u", (unsigned long long)stat.ps.missed, (unsigned long long)stat.ps.total, - iolat->rq_depth.max_depth); + iolat->max_depth); } static void iolatency_pd_stat(struct blkg_policy_data *pd, struct seq_file *s) @@ -944,12 +944,12 @@ static void iolatency_pd_stat(struct blkg_policy_data *pd, struct seq_file *s) avg_lat = div64_u64(iolat->lat_avg, NSEC_PER_USEC); cur_win = div64_u64(iolat->cur_win_nsec, NSEC_PER_MSEC); - if (iolat->rq_depth.max_depth == UINT_MAX) + if (iolat->max_depth == UINT_MAX) seq_printf(s, " depth=max avg_lat=%llu win=%llu", avg_lat, cur_win); else seq_printf(s, " depth=%u avg_lat=%llu win=%llu", - iolat->rq_depth.max_depth, avg_lat, cur_win); + iolat->max_depth, avg_lat, cur_win); } static struct blkg_policy_data *iolatency_pd_alloc(gfp_t gfp, @@ -993,9 +993,7 @@ static void iolatency_pd_init(struct blkg_policy_data *pd) latency_stat_init(iolat, &iolat->cur_stat); rq_wait_init(&iolat->rq_wait); spin_lock_init(&iolat->child_lat.lock); - iolat->rq_depth.queue_depth = blkg->q->nr_requests; - iolat->rq_depth.max_depth = UINT_MAX; - iolat->rq_depth.default_depth = iolat->rq_depth.queue_depth; + iolat->max_depth = UINT_MAX; iolat->blkiolat = blkiolat; iolat->cur_win_nsec = 100 * NSEC_PER_MSEC; atomic64_set(&iolat->window_start, now); -- cgit v1.2.3 From 060d9217d356a28e1bcfd2df0c8bf59aa24a12ce Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Wed, 2 Nov 2022 10:25:38 +0800 Subject: block, bfq: remove set but not used variable in __bfq_entity_update_weight_prio After the patch "block, bfq: cleanup bfq_weights_tree add/remove apis", the local variable 'bfqd' is not used anymore, so remove it. Signed-off-by: Yu Kuai Link: https://lore.kernel.org/r/20221102022542.3621219-2-yukuai1@huaweicloud.com Fixes: afdba1461262 ("block, bfq: cleanup bfq_weights_tree add/remove apis") Signed-off-by: Jens Axboe --- block/bfq-wf2q.c | 15 --------------- 1 file changed, 15 deletions(-) (limited to 'block') diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c index 5a02cb94d86e..2faa29fcb1e9 100644 --- a/block/bfq-wf2q.c +++ b/block/bfq-wf2q.c @@ -706,21 +706,6 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, if (entity->prio_changed) { struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); unsigned int prev_weight, new_weight; - struct bfq_data *bfqd = NULL; -#ifdef CONFIG_BFQ_GROUP_IOSCHED - struct bfq_sched_data *sd; - struct bfq_group *bfqg; -#endif - - if (bfqq) - bfqd = bfqq->bfqd; -#ifdef CONFIG_BFQ_GROUP_IOSCHED - else { - sd = entity->my_sched_data; - bfqg = container_of(sd, struct bfq_group, sched_data); - bfqd = (struct bfq_data *)bfqg->bfqd; - } -#endif /* Matches the smp_wmb() in bfq_group_set_weight.
*/ smp_rmb(); -- cgit v1.2.3 From e5c63eb4b59f9fb9b28e29d605a4dabbeff7772e Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Wed, 2 Nov 2022 10:25:39 +0800 Subject: block, bfq: factor out code to update 'active_entities' Current code is a bit ugly and hard to read. Signed-off-by: Yu Kuai Reviewed-by: Jan Kara Acked-by: Paolo Valente Link: https://lore.kernel.org/r/20221102022542.3621219-3-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe --- block/bfq-wf2q.c | 61 +++++++++++++++++++++++++++++--------------------------- 1 file changed, 32 insertions(+), 29 deletions(-) (limited to 'block') diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c index 2faa29fcb1e9..a908aa3eb0ab 100644 --- a/block/bfq-wf2q.c +++ b/block/bfq-wf2q.c @@ -218,6 +218,26 @@ static bool bfq_no_longer_next_in_service(struct bfq_entity *entity) return false; } +static void bfq_inc_active_entities(struct bfq_entity *entity) +{ + struct bfq_sched_data *sd = entity->sched_data; + struct bfq_group *bfqg = container_of(sd, struct bfq_group, sched_data); + struct bfq_data *bfqd = (struct bfq_data *)bfqg->bfqd; + + if (bfqg != bfqd->root_group) + bfqg->active_entities++; +} + +static void bfq_dec_active_entities(struct bfq_entity *entity) +{ + struct bfq_sched_data *sd = entity->sched_data; + struct bfq_group *bfqg = container_of(sd, struct bfq_group, sched_data); + struct bfq_data *bfqd = (struct bfq_data *)bfqg->bfqd; + + if (bfqg != bfqd->root_group) + bfqg->active_entities--; +} + #else /* CONFIG_BFQ_GROUP_IOSCHED */ static bool bfq_update_parent_budget(struct bfq_entity *next_in_service) @@ -230,6 +250,14 @@ static bool bfq_no_longer_next_in_service(struct bfq_entity *entity) return true; } +static void bfq_inc_active_entities(struct bfq_entity *entity) +{ +} + +static void bfq_dec_active_entities(struct bfq_entity *entity) +{ +} + #endif /* CONFIG_BFQ_GROUP_IOSCHED */ /* @@ -456,11 +484,6 @@ static void bfq_active_insert(struct bfq_service_tree *st, { struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); struct rb_node *node = &entity->rb_node; -#ifdef CONFIG_BFQ_GROUP_IOSCHED - struct bfq_sched_data *sd = NULL; - struct bfq_group *bfqg = NULL; - struct bfq_data *bfqd = NULL; -#endif bfq_insert(&st->active, entity); @@ -471,17 +494,10 @@ static void bfq_active_insert(struct bfq_service_tree *st, bfq_update_active_tree(node); -#ifdef CONFIG_BFQ_GROUP_IOSCHED - sd = entity->sched_data; - bfqg = container_of(sd, struct bfq_group, sched_data); - bfqd = (struct bfq_data *)bfqg->bfqd; -#endif if (bfqq) list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list); -#ifdef CONFIG_BFQ_GROUP_IOSCHED - if (bfqg != bfqd->root_group) - bfqg->active_entities++; -#endif + + bfq_inc_active_entities(entity); } /** @@ -558,29 +574,16 @@ static void bfq_active_extract(struct bfq_service_tree *st, { struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); struct rb_node *node; -#ifdef CONFIG_BFQ_GROUP_IOSCHED - struct bfq_sched_data *sd = NULL; - struct bfq_group *bfqg = NULL; - struct bfq_data *bfqd = NULL; -#endif node = bfq_find_deepest(&entity->rb_node); bfq_extract(&st->active, entity); if (node) bfq_update_active_tree(node); - -#ifdef CONFIG_BFQ_GROUP_IOSCHED - sd = entity->sched_data; - bfqg = container_of(sd, struct bfq_group, sched_data); - bfqd = (struct bfq_data *)bfqg->bfqd; -#endif if (bfqq) list_del(&bfqq->bfqq_list); -#ifdef CONFIG_BFQ_GROUP_IOSCHED - if (bfqg != bfqd->root_group) - bfqg->active_entities--; -#endif + + bfq_dec_active_entities(entity); } /** -- cgit v1.2.3 From f6fd119b1ae2c4f794dffc87421cf4ce2414401e Mon Sep 17 00:00:00 2001 From: 
Yu Kuai Date: Wed, 2 Nov 2022 10:25:40 +0800 Subject: block, bfq: cleanup bfq_activate_requeue_entity() Just make the code a little cleaner by removing the unnecessary variable 'sd'. Signed-off-by: Yu Kuai Reviewed-by: Jan Kara Acked-by: Paolo Valente Link: https://lore.kernel.org/r/20221102022542.3621219-4-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe --- block/bfq-wf2q.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) (limited to 'block') diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c index a908aa3eb0ab..4d4b84e7bf3e 100644 --- a/block/bfq-wf2q.c +++ b/block/bfq-wf2q.c @@ -1052,12 +1052,12 @@ static void __bfq_requeue_entity(struct bfq_entity *entity) } static void __bfq_activate_requeue_entity(struct bfq_entity *entity, - struct bfq_sched_data *sd, bool non_blocking_wait_rq) { struct bfq_service_tree *st = bfq_entity_service_tree(entity); - if (sd->in_service_entity == entity || entity->tree == &st->active) + if (entity->sched_data->in_service_entity == entity || + entity->tree == &st->active) /* * in service or already queued on the active tree, * requeue or reposition @@ -1089,14 +1089,10 @@ static void bfq_activate_requeue_entity(struct bfq_entity *entity, bool non_blocking_wait_rq, bool requeue, bool expiration) { - struct bfq_sched_data *sd; - for_each_entity(entity) { - sd = entity->sched_data; - __bfq_activate_requeue_entity(entity, sd, non_blocking_wait_rq); - - if (!bfq_update_next_in_service(sd, entity, expiration) && - !requeue) + __bfq_activate_requeue_entity(entity, non_blocking_wait_rq); + if (!bfq_update_next_in_service(entity->sched_data, entity, + expiration) && !requeue) break; } } -- cgit v1.2.3 From 918fdea3884ca8de93bd0e8ad02545eb8e3695d6 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Wed, 2 Nov 2022 10:25:41 +0800 Subject: block, bfq: remove dead code for updating 'rq_in_driver' Such code is not even compiled, since it is inside an "#if 0" block. Signed-off-by: Yu Kuai Reviewed-by: Jan Kara Acked-by: Paolo Valente Link: https://lore.kernel.org/r/20221102022542.3621219-5-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 16 ---------------- 1 file changed, 16 deletions(-) (limited to 'block') diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 42aa5fc7f17b..2381cf220ba2 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -2321,22 +2321,6 @@ static sector_t get_sdist(sector_t last_pos, struct request *rq) return 0; } -#if 0 /* Still not clear if we can do without next two functions */ -static void bfq_activate_request(struct request_queue *q, struct request *rq) -{ - struct bfq_data *bfqd = q->elevator->elevator_data; - - bfqd->rq_in_driver++; -} - -static void bfq_deactivate_request(struct request_queue *q, struct request *rq) -{ - struct bfq_data *bfqd = q->elevator->elevator_data; - - bfqd->rq_in_driver--; -} -#endif - static void bfq_remove_request(struct request_queue *q, struct request *rq) { -- cgit v1.2.3 From aa625117d6f67e33fab280358855fdd332bb20ab Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Wed, 2 Nov 2022 10:25:42 +0800 Subject: block, bfq: don't declare 'bfqd' as type 'void *' in bfq_group Prevent unnecessary pointer casts for bfqg->bfqd in multiple places.
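The effect of the typed back-pointer is easiest to see side by side; a minimal sketch (reduced structs, with the before/after field access shown in a comment):

struct bfq_data;			/* a forward declaration suffices */

struct bfq_group_before {
	void *bfqd;			/* every dereference needs a cast */
};

struct bfq_group_after {
	struct bfq_data *bfqd;		/* direct, type-checked access */
};

/*
 * before: ((struct bfq_data *)bfqg->bfqd)->in_service_queue
 * after:  bfqg->bfqd->in_service_queue
 */

Since the header only ever refers to struct bfq_data through a pointer, the typed declaration costs nothing in include dependencies.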
Signed-off-by: Yu Kuai Reviewed-by: Jan Kara Acked-by: Paolo Valente Link: https://lore.kernel.org/r/20221102022542.3621219-6-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe --- block/bfq-cgroup.c | 2 +- block/bfq-iosched.h | 2 +- block/bfq-wf2q.c | 6 ++---- 3 files changed, 4 insertions(+), 6 deletions(-) (limited to 'block') diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index 4c37398e0b99..d57872cc05ed 100644 --- a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c @@ -224,7 +224,7 @@ void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq, { blkg_rwstat_add(&bfqg->stats.queued, opf, 1); bfqg_stats_end_empty_time(&bfqg->stats); - if (!(bfqq == ((struct bfq_data *)bfqg->bfqd)->in_service_queue)) + if (!(bfqq == bfqg->bfqd->in_service_queue)) bfqg_stats_set_start_group_wait_time(bfqg, bfqq_group(bfqq)); } diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index 76363841d8ff..9fa89577322d 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -931,7 +931,7 @@ struct bfq_group { struct bfq_entity entity; struct bfq_sched_data sched_data; - void *bfqd; + struct bfq_data *bfqd; struct bfq_queue *async_bfqq[2][IOPRIO_NR_LEVELS]; struct bfq_queue *async_idle_bfqq; diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c index 4d4b84e7bf3e..b02b53658ed4 100644 --- a/block/bfq-wf2q.c +++ b/block/bfq-wf2q.c @@ -222,9 +222,8 @@ static void bfq_inc_active_entities(struct bfq_entity *entity) { struct bfq_sched_data *sd = entity->sched_data; struct bfq_group *bfqg = container_of(sd, struct bfq_group, sched_data); - struct bfq_data *bfqd = (struct bfq_data *)bfqg->bfqd; - if (bfqg != bfqd->root_group) + if (bfqg != bfqg->bfqd->root_group) bfqg->active_entities++; } @@ -232,9 +231,8 @@ static void bfq_dec_active_entities(struct bfq_entity *entity) { struct bfq_sched_data *sd = entity->sched_data; struct bfq_group *bfqg = container_of(sd, struct bfq_group, sched_data); - struct bfq_data *bfqd = (struct bfq_data *)bfqg->bfqd; - if (bfqg != bfqd->root_group) + if (bfqg != bfqg->bfqd->root_group) bfqg->active_entities--; } -- cgit v1.2.3 From 71b26083d59cd4ab22489829ffe7d4ead93f5546 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 1 Nov 2022 16:00:37 +0100 Subject: block: set the disk capacity to 0 in blk_mark_disk_dead nvme and xen-blkfront are already doing this to stop buffered writes from creating dirty pages that can't be written out later. Move it to the common code. This also removes the comment about the ordering from nvme, as bd_mutex not only is gone entirely, but also hasn't been used for locking updates to the disk size long before that, and thus the ordering requirement documented there doesn't apply any more. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Reviewed-by: Ming Lei Reviewed-by: Chao Leng Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20221101150050.3510-2-hch@lst.de Signed-off-by: Jens Axboe --- block/genhd.c | 5 +++++ drivers/block/xen-blkfront.c | 1 - drivers/nvme/host/core.c | 7 +------ 3 files changed, 6 insertions(+), 7 deletions(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index 493b93faee9c..e7bd036024fa 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -555,6 +555,11 @@ void blk_mark_disk_dead(struct gendisk *disk) { set_bit(GD_DEAD, &disk->state); blk_queue_start_drain(disk->queue); + + /* + * Stop buffered writers from dirtying pages that can't be written out. 
+ */ + set_capacity_and_notify(disk, 0); } EXPORT_SYMBOL_GPL(blk_mark_disk_dead); diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 35b9bcad9db9..b28489290323 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -2129,7 +2129,6 @@ static void blkfront_closing(struct blkfront_info *info) if (info->rq && info->gd) { blk_mq_stop_hw_queues(info->rq); blk_mark_disk_dead(info->gd); - set_capacity(info->gd, 0); } for_each_rinfo(info, rinfo, i) { diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 0090dc0b3ae6..aea0f89acf40 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -5116,10 +5116,7 @@ static void nvme_stop_ns_queue(struct nvme_ns *ns) /* * Prepare a queue for teardown. * - * This must forcibly unquiesce queues to avoid blocking dispatch, and only set - * the capacity to 0 after that to avoid blocking dispatchers that may be - * holding bd_butex. This will end buffered writers dirtying pages that can't - * be synced. + * This must forcibly unquiesce queues to avoid blocking dispatch. */ static void nvme_set_queue_dying(struct nvme_ns *ns) { @@ -5128,8 +5125,6 @@ static void nvme_set_queue_dying(struct nvme_ns *ns) blk_mark_disk_dead(ns->disk); nvme_start_ns_queue(ns); - - set_capacity_and_notify(ns->disk, 0); } /** -- cgit v1.2.3 From 8537380bb9882c201db60a1eb201aac6e74083e8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 1 Nov 2022 16:00:46 +0100 Subject: blk-mq: skip non-mq queues in blk_mq_quiesce_queue For submit_bio based queues there is no (S)RCU critical section during I/O submission and thus nothing to wait for in blk_mq_wait_quiesce_done, so skip doing any synchronization. No non-mq driver should be calling this, but for now we have core callers that unconditionally call into it. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Reviewed-by: Ming Lei Reviewed-by: Hannes Reinecke Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20221101150050.3510-11-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-mq.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index a78538586a40..a03abadfe4c6 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -280,7 +280,9 @@ EXPORT_SYMBOL_GPL(blk_mq_wait_quiesce_done); void blk_mq_quiesce_queue(struct request_queue *q) { blk_mq_quiesce_queue_nowait(q); - blk_mq_wait_quiesce_done(q); + /* nothing to wait for non-mq queues */ + if (queue_is_mq(q)) + blk_mq_wait_quiesce_done(q); } EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue); -- cgit v1.2.3 From 80bd4a7aab4c9ce59bf5e35fdf52aa23d8a3c9f5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 1 Nov 2022 16:00:47 +0100 Subject: blk-mq: move the srcu_struct used for quiescing to the tagset All I/O submissions have fairly similar latencies, and a tagset-wide quiesce is a fairly common operation. 
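The resulting shape, condensed from the __blk_mq_run_dispatch_ops and quiesce hunks that follow (a sketch, with dispatch_ops(q) standing in for the dispatched block):

/* Dispatch side: BLK_MQ_F_BLOCKING drivers may sleep, so their dispatch
 * runs under the tag_set's SRCU domain instead of plain RCU. */
if (q->tag_set->flags & BLK_MQ_F_BLOCKING) {
	int srcu_idx = srcu_read_lock(q->tag_set->srcu);

	dispatch_ops(q);
	srcu_read_unlock(q->tag_set->srcu, srcu_idx);
} else {
	rcu_read_lock();
	dispatch_ops(q);
	rcu_read_unlock();
}

/* Quiesce side: a single synchronize_srcu(set->srcu) now waits for
 * dispatch on every queue sharing the tag_set at once. */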
Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Ming Lei Reviewed-by: Chao Leng Reviewed-by: Sagi Grimberg Reviewed-by: Hannes Reinecke Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20221101150050.3510-12-hch@lst.de [axboe: fix whitespace] Signed-off-by: Jens Axboe --- block/blk-core.c | 27 +++++---------------------- block/blk-mq.c | 33 +++++++++++++++++++++++++-------- block/blk-mq.h | 14 +++++++------- block/blk-sysfs.c | 9 ++------- block/blk.h | 9 +-------- block/genhd.c | 2 +- include/linux/blk-mq.h | 4 ++++ include/linux/blkdev.h | 9 --------- 8 files changed, 45 insertions(+), 62 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 5d50dd16e2a5..e9e2bf15cd90 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -65,7 +65,6 @@ DEFINE_IDA(blk_queue_ida); * For queue allocation */ struct kmem_cache *blk_requestq_cachep; -struct kmem_cache *blk_requestq_srcu_cachep; /* * Controlling structure to kblockd @@ -373,26 +372,20 @@ static void blk_timeout_work(struct work_struct *work) { } -struct request_queue *blk_alloc_queue(int node_id, bool alloc_srcu) +struct request_queue *blk_alloc_queue(int node_id) { struct request_queue *q; - q = kmem_cache_alloc_node(blk_get_queue_kmem_cache(alloc_srcu), - GFP_KERNEL | __GFP_ZERO, node_id); + q = kmem_cache_alloc_node(blk_requestq_cachep, GFP_KERNEL | __GFP_ZERO, + node_id); if (!q) return NULL; - if (alloc_srcu) { - blk_queue_flag_set(QUEUE_FLAG_HAS_SRCU, q); - if (init_srcu_struct(q->srcu) != 0) - goto fail_q; - } - q->last_merge = NULL; q->id = ida_alloc(&blk_queue_ida, GFP_KERNEL); if (q->id < 0) - goto fail_srcu; + goto fail_q; q->stats = blk_alloc_queue_stats(); if (!q->stats) @@ -435,11 +428,8 @@ fail_stats: blk_free_queue_stats(q->stats); fail_id: ida_free(&blk_queue_ida, q->id); -fail_srcu: - if (alloc_srcu) - cleanup_srcu_struct(q->srcu); fail_q: - kmem_cache_free(blk_get_queue_kmem_cache(alloc_srcu), q); + kmem_cache_free(blk_requestq_cachep, q); return NULL; } @@ -1172,9 +1162,6 @@ int __init blk_dev_init(void) sizeof_field(struct request, cmd_flags)); BUILD_BUG_ON(REQ_OP_BITS + REQ_FLAG_BITS > 8 * sizeof_field(struct bio, bi_opf)); - BUILD_BUG_ON(ALIGN(offsetof(struct request_queue, srcu), - __alignof__(struct request_queue)) != - sizeof(struct request_queue)); /* used for unplugging and affects IO latency/throughput - HIGHPRI */ kblockd_workqueue = alloc_workqueue("kblockd", @@ -1185,10 +1172,6 @@ int __init blk_dev_init(void) blk_requestq_cachep = kmem_cache_create("request_queue", sizeof(struct request_queue), 0, SLAB_PANIC, NULL); - blk_requestq_srcu_cachep = kmem_cache_create("request_queue_srcu", - sizeof(struct request_queue) + - sizeof(struct srcu_struct), 0, SLAB_PANIC, NULL); - blk_debugfs_root = debugfs_create_dir("block", NULL); return 0; diff --git a/block/blk-mq.c b/block/blk-mq.c index a03abadfe4c6..bee728dac9cd 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -261,8 +261,8 @@ EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_nowait); */ void blk_mq_wait_quiesce_done(struct request_queue *q) { - if (blk_queue_has_srcu(q)) - synchronize_srcu(q->srcu); + if (q->tag_set->flags & BLK_MQ_F_BLOCKING) + synchronize_srcu(q->tag_set->srcu); else synchronize_rcu(); } @@ -4003,7 +4003,7 @@ static struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set, struct request_queue *q; int ret; - q = blk_alloc_queue(set->numa_node, set->flags & BLK_MQ_F_BLOCKING); + q = blk_alloc_queue(set->numa_node); if (!q) return ERR_PTR(-ENOMEM); q->queuedata = 
queuedata; @@ -4168,9 +4168,6 @@ static void blk_mq_update_poll_flag(struct request_queue *q) int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, struct request_queue *q) { - WARN_ON_ONCE(blk_queue_has_srcu(q) != - !!(set->flags & BLK_MQ_F_BLOCKING)); - /* mark the queue as mq asap */ q->mq_ops = set->ops; @@ -4429,8 +4426,18 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) if (set->nr_maps == 1 && set->nr_hw_queues > nr_cpu_ids) set->nr_hw_queues = nr_cpu_ids; - if (blk_mq_alloc_tag_set_tags(set, set->nr_hw_queues) < 0) - return -ENOMEM; + if (set->flags & BLK_MQ_F_BLOCKING) { + set->srcu = kmalloc(sizeof(*set->srcu), GFP_KERNEL); + if (!set->srcu) + return -ENOMEM; + ret = init_srcu_struct(set->srcu); + if (ret) + goto out_free_srcu; + } + + ret = blk_mq_alloc_tag_set_tags(set, set->nr_hw_queues); + if (ret) + goto out_cleanup_srcu; ret = -ENOMEM; for (i = 0; i < set->nr_maps; i++) { @@ -4460,6 +4467,12 @@ out_free_mq_map: } kfree(set->tags); set->tags = NULL; +out_cleanup_srcu: + if (set->flags & BLK_MQ_F_BLOCKING) + cleanup_srcu_struct(set->srcu); +out_free_srcu: + if (set->flags & BLK_MQ_F_BLOCKING) + kfree(set->srcu); return ret; } EXPORT_SYMBOL(blk_mq_alloc_tag_set); @@ -4499,6 +4512,10 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set) kfree(set->tags); set->tags = NULL; + if (set->flags & BLK_MQ_F_BLOCKING) { + cleanup_srcu_struct(set->srcu); + kfree(set->srcu); + } } EXPORT_SYMBOL(blk_mq_free_tag_set); diff --git a/block/blk-mq.h b/block/blk-mq.h index 0b2870839cdd..ef59fee62780 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -377,17 +377,17 @@ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx, /* run the code block in @dispatch_ops with rcu/srcu read lock held */ #define __blk_mq_run_dispatch_ops(q, check_sleep, dispatch_ops) \ do { \ - if (!blk_queue_has_srcu(q)) { \ - rcu_read_lock(); \ - (dispatch_ops); \ - rcu_read_unlock(); \ - } else { \ + if ((q)->tag_set->flags & BLK_MQ_F_BLOCKING) { \ int srcu_idx; \ \ might_sleep_if(check_sleep); \ - srcu_idx = srcu_read_lock((q)->srcu); \ + srcu_idx = srcu_read_lock((q)->tag_set->srcu); \ (dispatch_ops); \ - srcu_read_unlock((q)->srcu, srcu_idx); \ + srcu_read_unlock((q)->tag_set->srcu, srcu_idx); \ + } else { \ + rcu_read_lock(); \ + (dispatch_ops); \ + rcu_read_unlock(); \ } \ } while (0) diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 7b98c7074771..02e94c4beff1 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -742,10 +742,8 @@ queue_attr_store(struct kobject *kobj, struct attribute *attr, static void blk_free_queue_rcu(struct rcu_head *rcu_head) { - struct request_queue *q = container_of(rcu_head, struct request_queue, - rcu_head); - - kmem_cache_free(blk_get_queue_kmem_cache(blk_queue_has_srcu(q)), q); + kmem_cache_free(blk_requestq_cachep, + container_of(rcu_head, struct request_queue, rcu_head)); } /** @@ -782,9 +780,6 @@ static void blk_release_queue(struct kobject *kobj) if (queue_is_mq(q)) blk_mq_release(q); - if (blk_queue_has_srcu(q)) - cleanup_srcu_struct(q->srcu); - ida_free(&blk_queue_ida, q->id); call_rcu(&q->rcu_head, blk_free_queue_rcu); } diff --git a/block/blk.h b/block/blk.h index f1398fb96cec..e85703ae81dd 100644 --- a/block/blk.h +++ b/block/blk.h @@ -27,7 +27,6 @@ struct blk_flush_queue { }; extern struct kmem_cache *blk_requestq_cachep; -extern struct kmem_cache *blk_requestq_srcu_cachep; extern struct kobj_type blk_queue_ktype; extern struct ida blk_queue_ida; @@ -429,13 +428,7 @@ int bio_add_hw_page(struct request_queue *q, struct bio *bio, struct page 
*page, unsigned int len, unsigned int offset, unsigned int max_sectors, bool *same_page); -static inline struct kmem_cache *blk_get_queue_kmem_cache(bool srcu) -{ - if (srcu) - return blk_requestq_srcu_cachep; - return blk_requestq_cachep; -} -struct request_queue *blk_alloc_queue(int node_id, bool alloc_srcu); +struct request_queue *blk_alloc_queue(int node_id); int disk_scan_partitions(struct gendisk *disk, fmode_t mode); diff --git a/block/genhd.c b/block/genhd.c index e7bd036024fa..09cde914e054 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1414,7 +1414,7 @@ struct gendisk *__blk_alloc_disk(int node, struct lock_class_key *lkclass) struct request_queue *q; struct gendisk *disk; - q = blk_alloc_queue(node, false); + q = blk_alloc_queue(node); if (!q) return NULL; diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 569053ed959d..f059edebb11d 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -7,6 +7,7 @@ #include #include #include +#include struct blk_mq_tags; struct blk_flush_queue; @@ -500,6 +501,8 @@ enum hctx_type { * @tag_list_lock: Serializes tag_list accesses. * @tag_list: List of the request queues that use this tag set. See also * request_queue.tag_set_list. + * @srcu: Use as lock when type of the request queue is blocking + * (BLK_MQ_F_BLOCKING). */ struct blk_mq_tag_set { struct blk_mq_queue_map map[HCTX_MAX_TYPES]; @@ -520,6 +523,7 @@ struct blk_mq_tag_set { struct mutex tag_list_lock; struct list_head tag_list; + struct srcu_struct *srcu; }; /** diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 32137d85c9ad..6a6fa167fc82 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -22,7 +22,6 @@ #include #include #include -#include #include #include @@ -543,18 +542,11 @@ struct request_queue { struct mutex debugfs_mutex; bool mq_sysfs_init_done; - - /** - * @srcu: Sleepable RCU. Use as lock when type of the request queue - * is blocking (BLK_MQ_F_BLOCKING). Must be the last member - */ - struct srcu_struct srcu[]; }; /* Keep blk_queue_flag_name[] in sync with the definitions below */ #define QUEUE_FLAG_STOPPED 0 /* queue is stopped */ #define QUEUE_FLAG_DYING 1 /* queue being torn down */ -#define QUEUE_FLAG_HAS_SRCU 2 /* SRCU is allocated */ #define QUEUE_FLAG_NOMERGES 3 /* disable merge attempts */ #define QUEUE_FLAG_SAME_COMP 4 /* complete on same CPU-group */ #define QUEUE_FLAG_FAIL_IO 5 /* fake timeout */ @@ -590,7 +582,6 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q); #define blk_queue_stopped(q) test_bit(QUEUE_FLAG_STOPPED, &(q)->queue_flags) #define blk_queue_dying(q) test_bit(QUEUE_FLAG_DYING, &(q)->queue_flags) -#define blk_queue_has_srcu(q) test_bit(QUEUE_FLAG_HAS_SRCU, &(q)->queue_flags) #define blk_queue_init_done(q) test_bit(QUEUE_FLAG_INIT_DONE, &(q)->queue_flags) #define blk_queue_nomerges(q) test_bit(QUEUE_FLAG_NOMERGES, &(q)->queue_flags) #define blk_queue_noxmerges(q) \ -- cgit v1.2.3 From 483239c75ba768e0e2c0e0c503e5fc13c3d5773a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 1 Nov 2022 16:00:48 +0100 Subject: blk-mq: pass a tagset to blk_mq_wait_quiesce_done Nothing in blk_mq_wait_quiesce_done needs the request_queue now, so just pass the tagset, and move the non-mq check into the only caller that needs it. 
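For drivers the conversion is mechanical; a hedged sketch of the resulting call pattern (struct mydrv_ns and MYDRV_STOPPED are hypothetical names, mirroring the nvme change in the diff below):

struct mydrv_ns {
	struct request_queue *queue;
	unsigned long flags;
};
#define MYDRV_STOPPED	0

static void mydrv_stop_queue(struct mydrv_ns *ns)
{
	if (!test_and_set_bit(MYDRV_STOPPED, &ns->flags))
		blk_mq_quiesce_queue(ns->queue);
	else
		/* quiesce already started; only wait for the grace period */
		blk_mq_wait_quiesce_done(ns->queue->tag_set);
}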
Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Reviewed-by: Chao Leng Reviewed-by: Hannes Reinecke Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20221101150050.3510-13-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-mq.c | 16 +++++++++------- drivers/nvme/host/core.c | 4 ++-- drivers/scsi/scsi_lib.c | 2 +- include/linux/blk-mq.h | 2 +- 4 files changed, 13 insertions(+), 11 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index bee728dac9cd..b7abfda1ea69 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -254,15 +254,17 @@ EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_nowait); /** * blk_mq_wait_quiesce_done() - wait until in-progress quiesce is done - * @q: request queue. + * @set: tag_set to wait on * * Note: it is driver's responsibility for making sure that quiesce has - * been started. + * been started on or more of the request_queues of the tag_set. This + * function only waits for the quiesce on those request_queues that had + * the quiesce flag set using blk_mq_quiesce_queue_nowait. */ -void blk_mq_wait_quiesce_done(struct request_queue *q) +void blk_mq_wait_quiesce_done(struct blk_mq_tag_set *set) { - if (q->tag_set->flags & BLK_MQ_F_BLOCKING) - synchronize_srcu(q->tag_set->srcu); + if (set->flags & BLK_MQ_F_BLOCKING) + synchronize_srcu(set->srcu); else synchronize_rcu(); } @@ -282,7 +284,7 @@ void blk_mq_quiesce_queue(struct request_queue *q) blk_mq_quiesce_queue_nowait(q); /* nothing to wait for non-mq queues */ if (queue_is_mq(q)) - blk_mq_wait_quiesce_done(q); + blk_mq_wait_quiesce_done(q->tag_set); } EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue); @@ -1623,7 +1625,7 @@ static void blk_mq_timeout_work(struct work_struct *work) * uses srcu or rcu, wait for a synchronization point to * ensure all running submits have finished */ - blk_mq_wait_quiesce_done(q); + blk_mq_wait_quiesce_done(q->tag_set); expired.next = 0; blk_mq_queue_tag_busy_iter(q, blk_mq_handle_expired, &expired); diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index ed06fcb87f93..66b0b6e11002 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -5107,7 +5107,7 @@ static void nvme_stop_ns_queue(struct nvme_ns *ns) if (!test_and_set_bit(NVME_NS_STOPPED, &ns->flags)) blk_mq_quiesce_queue(ns->queue); else - blk_mq_wait_quiesce_done(ns->queue); + blk_mq_wait_quiesce_done(ns->queue->tag_set); } /* let I/O to all namespaces fail in preparation for surprise removal */ @@ -5197,7 +5197,7 @@ void nvme_stop_admin_queue(struct nvme_ctrl *ctrl) if (!test_and_set_bit(NVME_CTRL_ADMIN_Q_STOPPED, &ctrl->flags)) blk_mq_quiesce_queue(ctrl->admin_q); else - blk_mq_wait_quiesce_done(ctrl->admin_q); + blk_mq_wait_quiesce_done(ctrl->admin_q->tag_set); } EXPORT_SYMBOL_GPL(nvme_stop_admin_queue); diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 8b89fab7c420..249757ddd8fe 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -2735,7 +2735,7 @@ static void scsi_stop_queue(struct scsi_device *sdev, bool nowait) blk_mq_quiesce_queue(sdev->request_queue); } else { if (!nowait) - blk_mq_wait_quiesce_done(sdev->request_queue); + blk_mq_wait_quiesce_done(sdev->request_queue->tag_set); } } diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index f059edebb11d..061ea6e7af01 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -880,7 +880,7 @@ void blk_mq_start_hw_queues(struct request_queue *q); void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); void 
blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async); void blk_mq_quiesce_queue(struct request_queue *q); -void blk_mq_wait_quiesce_done(struct request_queue *q); +void blk_mq_wait_quiesce_done(struct blk_mq_tag_set *set); void blk_mq_unquiesce_queue(struct request_queue *q); void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs); void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); -- cgit v1.2.3 From 414dd48e882c5a39e7bd01b096ee6497eb3314b0 Mon Sep 17 00:00:00 2001 From: Chao Leng Date: Tue, 1 Nov 2022 16:00:49 +0100 Subject: blk-mq: add tagset quiesce interface Drivers that have shared tagsets may need to quiesce potentially a lot of request queues that all share a single tagset (e.g. nvme). Add an interface to quiesce all the queues on a given tagset. This interface is useful because it can speed up the quiesce by doing it in parallel. Because some queues should not need to be quiesced (e.g. the nvme connect_q) when quiescing the tagset, introduce a QUEUE_FLAG_SKIP_TAGSET_QUIESCE flag to allow this new interface to skip quiescing a particular queue. Signed-off-by: Chao Leng [hch: simplify for the per-tag_set srcu_struct] Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Reviewed-by: Ming Lei Reviewed-by: Chao Leng Reviewed-by: Hannes Reinecke Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20221101150050.3510-14-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-mq.c | 27 +++++++++++++++++++++++++++ include/linux/blk-mq.h | 2 ++ include/linux/blkdev.h | 3 +++ 3 files changed, 32 insertions(+) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index b7abfda1ea69..bae6f81c39b3 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -315,6 +315,33 @@ void blk_mq_unquiesce_queue(struct request_queue *q) } EXPORT_SYMBOL_GPL(blk_mq_unquiesce_queue); +void blk_mq_quiesce_tagset(struct blk_mq_tag_set *set) +{ + struct request_queue *q; + + mutex_lock(&set->tag_list_lock); + list_for_each_entry(q, &set->tag_list, tag_set_list) { + if (!blk_queue_skip_tagset_quiesce(q)) + blk_mq_quiesce_queue_nowait(q); + } + blk_mq_wait_quiesce_done(set); + mutex_unlock(&set->tag_list_lock); +} +EXPORT_SYMBOL_GPL(blk_mq_quiesce_tagset); + +void blk_mq_unquiesce_tagset(struct blk_mq_tag_set *set) +{ + struct request_queue *q; + + mutex_lock(&set->tag_list_lock); + list_for_each_entry(q, &set->tag_list, tag_set_list) { + if (!blk_queue_skip_tagset_quiesce(q)) + blk_mq_unquiesce_queue(q); + } + mutex_unlock(&set->tag_list_lock); +} +EXPORT_SYMBOL_GPL(blk_mq_unquiesce_tagset); + void blk_mq_wake_waiters(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 061ea6e7af01..109a0e30c470 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -881,6 +881,8 @@ void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async); void blk_mq_quiesce_queue(struct request_queue *q); void blk_mq_wait_quiesce_done(struct blk_mq_tag_set *set); +void blk_mq_quiesce_tagset(struct blk_mq_tag_set *set); +void blk_mq_unquiesce_tagset(struct blk_mq_tag_set *set); void blk_mq_unquiesce_queue(struct request_queue *q); void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs); void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 6a6fa167fc82..9188aa3f6259 100644 ---
a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -571,6 +571,7 @@ struct request_queue { #define QUEUE_FLAG_HCTX_ACTIVE 28 /* at least one blk-mq hctx is active */ #define QUEUE_FLAG_NOWAIT 29 /* device supports NOWAIT */ #define QUEUE_FLAG_SQ_SCHED 30 /* single queue style io dispatch */ +#define QUEUE_FLAG_SKIP_TAGSET_QUIESCE 31 /* quiesce_tagset skip the queue*/ #define QUEUE_FLAG_MQ_DEFAULT ((1UL << QUEUE_FLAG_IO_STAT) | \ (1UL << QUEUE_FLAG_SAME_COMP) | \ @@ -610,6 +611,8 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q); #define blk_queue_pm_only(q) atomic_read(&(q)->pm_only) #define blk_queue_registered(q) test_bit(QUEUE_FLAG_REGISTERED, &(q)->queue_flags) #define blk_queue_sq_sched(q) test_bit(QUEUE_FLAG_SQ_SCHED, &(q)->queue_flags) +#define blk_queue_skip_tagset_quiesce(q) \ + test_bit(QUEUE_FLAG_SKIP_TAGSET_QUIESCE, &(q)->queue_flags) extern void blk_set_pm_only(struct request_queue *q); extern void blk_clear_pm_only(struct request_queue *q); -- cgit v1.2.3 From 7edfd68165b8dab8cde231728ff092a625469eb7 Mon Sep 17 00:00:00 2001 From: Jinlong Chen Date: Wed, 2 Nov 2022 10:52:29 +0800 Subject: blk-mq: improve error handling in blk_mq_alloc_rq_map() Use goto-style error handling like we do elsewhere in the kernel. Signed-off-by: Jinlong Chen Link: https://lore.kernel.org/r/bbbc2d9b17b137798c7fb92042141ca4cbbc58cc.1667356813.git.nickyc975@zju.edu.cn Signed-off-by: Jens Axboe --- block/blk-mq.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index bae6f81c39b3..d4824b53f6b2 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -3305,21 +3305,22 @@ static struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, tags->rqs = kcalloc_node(nr_tags, sizeof(struct request *), GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, node); - if (!tags->rqs) { - blk_mq_free_tags(tags); - return NULL; - } + if (!tags->rqs) + goto err_free_tags; tags->static_rqs = kcalloc_node(nr_tags, sizeof(struct request *), GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, node); - if (!tags->static_rqs) { - kfree(tags->rqs); - blk_mq_free_tags(tags); - return NULL; - } + if (!tags->static_rqs) + goto err_free_rqs; return tags; + +err_free_rqs: + kfree(tags->rqs); +err_free_tags: + blk_mq_free_tags(tags); + return NULL; } static int blk_mq_init_request(struct blk_mq_tag_set *set, struct request *rq, -- cgit v1.2.3 From 4046728253751adb41b05e85ebd686210efde1ad Mon Sep 17 00:00:00 2001 From: Jinlong Chen Date: Wed, 2 Nov 2022 10:52:30 +0800 Subject: blk-mq: use if-else instead of goto in blk_mq_alloc_cached_request() if-else is more readable than goto here. 
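As a generic illustration of when each style wins (struct foo, struct bar and the alloc/free helpers are hypothetical): goto unwinding, as used in blk_mq_alloc_rq_map() above, pays off when resources must be released in reverse order, while flat if-else reads better when there is nothing to unwind:

static int setup_pair(struct foo **a, struct bar **b)
{
	*a = alloc_foo();
	if (!*a)
		return -ENOMEM;

	*b = alloc_bar();
	if (!*b)
		goto err_free_foo;	/* unwind in reverse order */
	return 0;

err_free_foo:
	free_foo(*a);
	*a = NULL;
	return -ENOMEM;
}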
Signed-off-by: Jinlong Chen Link: https://lore.kernel.org/r/d3306fa4e92dc9cc614edc8f1802686096bafef2.1667356813.git.nickyc975@zju.edu.cn Signed-off-by: Jens Axboe --- block/blk-mq.c | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index d4824b53f6b2..fc9c400adf92 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -575,25 +575,26 @@ static struct request *blk_mq_alloc_cached_request(struct request_queue *q, if (!plug) return NULL; + if (rq_list_empty(plug->cached_rq)) { if (plug->nr_ios == 1) return NULL; rq = blk_mq_rq_cache_fill(q, plug, opf, flags); - if (rq) - goto got_it; - return NULL; - } - rq = rq_list_peek(&plug->cached_rq); - if (!rq || rq->q != q) - return NULL; + if (!rq) + return NULL; + } else { + rq = rq_list_peek(&plug->cached_rq); + if (!rq || rq->q != q) + return NULL; - if (blk_mq_get_hctx_type(opf) != rq->mq_hctx->type) - return NULL; - if (op_is_flush(rq->cmd_flags) != op_is_flush(opf)) - return NULL; + if (blk_mq_get_hctx_type(opf) != rq->mq_hctx->type) + return NULL; + if (op_is_flush(rq->cmd_flags) != op_is_flush(opf)) + return NULL; + + plug->cached_rq = rq_list_next(rq); + } - plug->cached_rq = rq_list_next(rq); -got_it: rq->cmd_flags = opf; INIT_LIST_HEAD(&rq->queuelist); return rq; -- cgit v1.2.3 From 5b2560c4c20e6d6933625b4b56f6843d6c7faf0f Mon Sep 17 00:00:00 2001 From: Yang Li Date: Mon, 7 Nov 2022 14:22:55 +0800 Subject: block: Fix some kernel-doc comments Remove the description of @required_features in elevator_match() to clear the below warning: block/elevator.c:103: warning: Excess function parameter 'required_features' description in 'elevator_match' Link: https://bugzilla.openanolis.cn/show_bug.cgi?id=2734 Fixes: ffb86425ee2c ("block: don't check for required features in elevator_match") Reported-by: Abaci Robot Signed-off-by: Yang Li Link: https://lore.kernel.org/r/20221107062255.2685-1-yang.lee@linux.alibaba.com Signed-off-by: Jens Axboe --- block/elevator.c | 1 - 1 file changed, 1 deletion(-) (limited to 'block') diff --git a/block/elevator.c b/block/elevator.c index 800e0038be0d..a5bdc3b1e7e5 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -94,7 +94,6 @@ static inline bool elv_support_features(struct request_queue *q, * elevator_match - Test an elevator name and features * @e: Scheduler to test * @name: Elevator name to test - * @required_features: Features that the elevator must provide * * Return true if the elevator @e name matches @name and if @e provides all * the features specified by @required_features. -- cgit v1.2.3 From 49580e690755d0e51ed7aa2c33225dd884fa738a Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Fri, 21 Oct 2022 11:41:11 -0600 Subject: block: add check when merging zone device pages Consecutive zone device pages should not be merged into the same sgl or bvec segment with other types of pages or if they belong to different pgmaps. Otherwise getting the pgmap of a given segment is not possible without scanning the entire segment. Add a helper to determine whether zone device pages are mergeable and use it in page_is_mergeable(); the helper returns true either if both pages are not zone device pages or if both are zone device pages with the same pgmap.
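The invariant this buys is that a bvec segment never mixes pgmaps, so the first page can speak for the whole segment; a hedged sketch of code that may rely on it (bvec_pgmap() is a hypothetical helper, not part of the patch; the real helper follows in the diff below):

static struct dev_pagemap *bvec_pgmap(const struct bio_vec *bv)
{
	/*
	 * Only safe because page_is_mergeable() now refuses to merge
	 * pages with different pgmaps into a single segment.
	 */
	if (!is_zone_device_page(bv->bv_page))
		return NULL;
	return bv->bv_page->pgmap;
}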
Signed-off-by: Logan Gunthorpe Reviewed-by: Christoph Hellwig Reviewed-by: John Hubbard Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20221021174116.7200-5-logang@deltatee.com Signed-off-by: Jens Axboe --- block/bio.c | 2 ++ include/linux/mmzone.h | 24 ++++++++++++++++++++++++ 2 files changed, 26 insertions(+) (limited to 'block') diff --git a/block/bio.c b/block/bio.c index 57c2f327225b..c7a124294828 100644 --- a/block/bio.c +++ b/block/bio.c @@ -863,6 +863,8 @@ static inline bool page_is_mergeable(const struct bio_vec *bv, return false; if (xen_domain() && !xen_biovec_phys_mergeable(bv, page)) return false; + if (!zone_device_pages_have_same_pgmap(bv->bv_page, page)) + return false; *same_page = ((vec_end_addr & PAGE_MASK) == page_addr); if (*same_page) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 5f74891556f3..9c49ec5d0e25 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -986,6 +986,25 @@ static inline bool is_zone_device_page(const struct page *page) { return page_zonenum(page) == ZONE_DEVICE; } + +/* + * Consecutive zone device pages should not be merged into the same sgl + * or bvec segment with other types of pages or if they belong to different + * pgmaps. Otherwise getting the pgmap of a given segment is not possible + * without scanning the entire segment. This helper returns true either if + * both pages are not zone device pages or both pages are zone device pages + * with the same pgmap. + */ +static inline bool zone_device_pages_have_same_pgmap(const struct page *a, + const struct page *b) +{ + if (is_zone_device_page(a) != is_zone_device_page(b)) + return false; + if (!is_zone_device_page(a)) + return true; + return a->pgmap == b->pgmap; +} + extern void memmap_init_zone_device(struct zone *, unsigned long, unsigned long, struct dev_pagemap *); #else @@ -993,6 +1012,11 @@ static inline bool is_zone_device_page(const struct page *page) { return false; } +static inline bool zone_device_pages_have_same_pgmap(const struct page *a, + const struct page *b) +{ + return true; +} #endif static inline bool folio_is_zone_device(const struct folio *folio) -- cgit v1.2.3 From 5e3e3f2e15df46abcab1959f93f214f778b6ec49 Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Fri, 21 Oct 2022 11:41:13 -0600 Subject: block: set FOLL_PCI_P2PDMA in __bio_iov_iter_get_pages() When a bio's queue supports PCI P2PDMA, set FOLL_PCI_P2PDMA for iov_iter_get_pages_flags(). This allows PCI P2PDMA pages to be passed from userspace and enables the O_DIRECT path in iomap based filesystems and direct to block devices. 
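Roughly what this enables from userspace, as a hedged sketch (it assumes a buffer backed by PCI P2PDMA memory that some peer device exported and the process mapped; the device path is illustrative):

int fd = open("/dev/nvme0n1", O_RDWR | O_DIRECT);

/* buf points at mmap()ed P2PDMA pages; with FOLL_PCI_P2PDMA set on the
 * GUP call, the O_DIRECT path now accepts them instead of failing. */
ssize_t ret = pwrite(fd, buf, 4096, 0);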
Signed-off-by: Logan Gunthorpe Reviewed-by: Christoph Hellwig Reviewed-by: John Hubbard Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20221021174116.7200-7-logang@deltatee.com Signed-off-by: Jens Axboe --- block/bio.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/bio.c b/block/bio.c index c7a124294828..aa1de6a367f9 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1197,6 +1197,7 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt; struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt; struct page **pages = (struct page **)bv; + unsigned int gup_flags = 0; ssize_t size, left; unsigned len, i = 0; size_t offset, trim; @@ -1210,6 +1211,9 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) BUILD_BUG_ON(PAGE_PTRS_PER_BVEC < 2); pages += entries_left * (PAGE_PTRS_PER_BVEC - 1); + if (bio->bi_bdev && blk_queue_pci_p2pdma(bio->bi_bdev->bd_disk->queue)) + gup_flags |= FOLL_PCI_P2PDMA; + /* * Each segment in the iov is required to be a block size multiple. * However, we may not be able to get the entire segment if it spans @@ -1217,8 +1221,9 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) * result to ensure the bio's total size is correct. The remainder of * the iov data will be picked up in the next bio iteration. */ - size = iov_iter_get_pages2(iter, pages, UINT_MAX - bio->bi_iter.bi_size, - nr_pages, &offset); + size = iov_iter_get_pages(iter, pages, + UINT_MAX - bio->bi_iter.bi_size, + nr_pages, &offset, gup_flags); if (unlikely(size <= 0)) return size ? size : -EFAULT; -- cgit v1.2.3 From 7ee4ccf57484d260c37b29f9a48b65c4101403e8 Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Fri, 21 Oct 2022 11:41:14 -0600 Subject: block: set FOLL_PCI_P2PDMA in bio_map_user_iov() When a bio's queue supports PCI P2PDMA, set FOLL_PCI_P2PDMA for iov_iter_get_pages_flags(). This allows PCI P2PDMA pages to be passed from userspace and enables the NVMe passthru requests to use P2PDMA pages. 
Signed-off-by: Logan Gunthorpe Reviewed-by: Christoph Hellwig Reviewed-by: John Hubbard Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20221021174116.7200-8-logang@deltatee.com Signed-off-by: Jens Axboe --- block/blk-map.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'block') diff --git a/block/blk-map.c b/block/blk-map.c index 46688e70b141..19940c978c73 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -267,6 +267,7 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter, { unsigned int max_sectors = queue_max_hw_sectors(rq->q); unsigned int nr_vecs = iov_iter_npages(iter, BIO_MAX_VECS); + unsigned int gup_flags = 0; struct bio *bio; int ret; int j; @@ -278,6 +279,9 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter, if (bio == NULL) return -ENOMEM; + if (blk_queue_pci_p2pdma(rq->q)) + gup_flags |= FOLL_PCI_P2PDMA; + while (iov_iter_count(iter)) { struct page **pages, *stack_pages[UIO_FASTIOV]; ssize_t bytes; @@ -286,11 +290,11 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter, if (nr_vecs <= ARRAY_SIZE(stack_pages)) { pages = stack_pages; - bytes = iov_iter_get_pages2(iter, pages, LONG_MAX, - nr_vecs, &offs); + bytes = iov_iter_get_pages(iter, pages, LONG_MAX, + nr_vecs, &offs, gup_flags); } else { - bytes = iov_iter_get_pages_alloc2(iter, &pages, - LONG_MAX, &offs); + bytes = iov_iter_get_pages_alloc(iter, &pages, + LONG_MAX, &offs, gup_flags); } if (unlikely(bytes <= 0)) { ret = bytes ? bytes : -EFAULT; -- cgit v1.2.3 From a1795c2ccb1e4c49220d2a0d381540024d71647c Mon Sep 17 00:00:00 2001 From: Khazhismel Kumykov Date: Tue, 8 Nov 2022 10:10:29 -0800 Subject: bfq: fix waker_bfqq inconsistency crash This fixes crashes in bfq_add_bfqq_busy due to waker_bfqq being NULL, but woken_list_node still being hashed. This would happen when bfq_init_rq() expects a brand new allocated queue to be returned from bfq_get_bfqq_handle_split() and unconditionally updates waker_bfqq without resetting woken_list_node. Since we can always return oom_bfqq when attempting to allocate, we cannot assume waker_bfqq starts as NULL. Avoid setting woken_bfqq for oom_bfqq entirely, as it's not useful. Crashes would have a stacktrace like: [160595.656560] bfq_add_bfqq_busy+0x110/0x1ec [160595.661142] bfq_add_request+0x6bc/0x980 [160595.666602] bfq_insert_request+0x8ec/0x1240 [160595.671762] bfq_insert_requests+0x58/0x9c [160595.676420] blk_mq_sched_insert_request+0x11c/0x198 [160595.682107] blk_mq_submit_bio+0x270/0x62c [160595.686759] __submit_bio_noacct_mq+0xec/0x178 [160595.691926] submit_bio+0x120/0x184 [160595.695990] ext4_mpage_readpages+0x77c/0x7c8 [160595.701026] ext4_readpage+0x60/0xb0 [160595.705158] filemap_read_page+0x54/0x114 [160595.711961] filemap_fault+0x228/0x5f4 [160595.716272] do_read_fault+0xe0/0x1f0 [160595.720487] do_fault+0x40/0x1c8 Tested by injecting random failures into bfq_get_queue, crashes go away completely. 
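The invariant the fix restores can be stated as a small check (illustrative only, not code from the patch): a queue hashed on some waker's woken_list must have waker_bfqq set, and the shared oom_bfqq fallback never participates:

static bool waker_state_consistent(struct bfq_queue *bfqq)
{
	if (hlist_unhashed(&bfqq->woken_list_node))
		return true;
	/* hashed on a woken_list implies a waker is recorded */
	return bfqq->waker_bfqq != NULL;
}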
Fixes: 8ef3fc3a043c ("block, bfq: make shared queues inherit wakers") Signed-off-by: Khazhismel Kumykov Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20221108181030.1611703-1-khazhy@google.com Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 2381cf220ba2..7b610e91127e 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -6717,6 +6717,12 @@ static struct bfq_queue *bfq_init_rq(struct request *rq) bfqq = bfq_get_bfqq_handle_split(bfqd, bic, bio, true, is_sync, NULL); + if (unlikely(bfqq == &bfqd->oom_bfqq)) + bfqq_already_existing = true; + } else + bfqq_already_existing = true; + + if (!bfqq_already_existing) { bfqq->waker_bfqq = old_bfqq->waker_bfqq; bfqq->tentative_waker_bfqq = NULL; @@ -6730,8 +6736,7 @@ static struct bfq_queue *bfq_init_rq(struct request *rq) if (bfqq->waker_bfqq) hlist_add_head(&bfqq->woken_list_node, &bfqq->waker_bfqq->woken_list); - } else - bfqq_already_existing = true; + } } } -- cgit v1.2.3 From 99771d73ff4539f2337b84917f4792abf0d8931b Mon Sep 17 00:00:00 2001 From: Khazhismel Kumykov Date: Tue, 8 Nov 2022 10:10:30 -0800 Subject: bfq: ignore oom_bfqq in bfq_check_waker oom_bfqq is just a fallback bfqq, so shouldn't be used with waker detection. Suggested-by: Jan Kara Signed-off-by: Khazhismel Kumykov Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20221108181030.1611703-2-khazhy@google.com Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 7b610e91127e..a72304c728fc 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -2083,7 +2083,9 @@ static void bfq_check_waker(struct bfq_data *bfqd, struct bfq_queue *bfqq, if (!bfqd->last_completed_rq_bfqq || bfqd->last_completed_rq_bfqq == bfqq || bfq_bfqq_has_short_ttime(bfqq) || - now_ns - bfqd->last_completion >= 4 * NSEC_PER_MSEC) + now_ns - bfqd->last_completion >= 4 * NSEC_PER_MSEC || + bfqd->last_completed_rq_bfqq == &bfqd->oom_bfqq || + bfqq == &bfqd->oom_bfqq) return; /* -- cgit v1.2.3 From 5ee20298ff25e883d0668507b3216992a2e9e6cd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 9 Nov 2022 11:08:10 +0100 Subject: blk-mq: remove blk_mq_alloc_tag_set_tags There is no point in trying to share any code with the realloc case when all that is needed by the initial tagset allocation is a simple kcalloc_node. Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20221109100811.2413423-1-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-mq.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index fc9c400adf92..0134a1dd854f 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -4398,12 +4398,6 @@ static int blk_mq_realloc_tag_set_tags(struct blk_mq_tag_set *set, return 0; } -static int blk_mq_alloc_tag_set_tags(struct blk_mq_tag_set *set, - int new_nr_hw_queues) -{ - return blk_mq_realloc_tag_set_tags(set, 0, new_nr_hw_queues); -} - /* * Alloc a tag set to be associated with one or more request queues. * May fail with EINVAL for various error conditions. 
May adjust the @@ -4466,11 +4460,13 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) goto out_free_srcu; } - ret = blk_mq_alloc_tag_set_tags(set, set->nr_hw_queues); - if (ret) + ret = -ENOMEM; + set->tags = kcalloc_node(set->nr_hw_queues, + sizeof(struct blk_mq_tags *), GFP_KERNEL, + set->numa_node); + if (!set->tags) goto out_cleanup_srcu; - ret = -ENOMEM; for (i = 0; i < set->nr_maps; i++) { set->map[i].mq_map = kcalloc_node(nr_cpu_ids, sizeof(set->map[i].mq_map[0]), -- cgit v1.2.3 From ee9d55210c2fe40ab6600b8009de2243b2ad1a4a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 9 Nov 2022 11:08:11 +0100 Subject: blk-mq: simplify blk_mq_realloc_tag_set_tags Use set->nr_hw_queues for the current number of tags, and remove the duplicate set->nr_hw_queues update in the caller. Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20221109100811.2413423-2-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-mq.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 0134a1dd854f..ee16b4c34c6a 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -4376,11 +4376,11 @@ static void blk_mq_update_queue_map(struct blk_mq_tag_set *set) } static int blk_mq_realloc_tag_set_tags(struct blk_mq_tag_set *set, - int cur_nr_hw_queues, int new_nr_hw_queues) + int new_nr_hw_queues) { struct blk_mq_tags **new_tags; - if (cur_nr_hw_queues >= new_nr_hw_queues) + if (set->nr_hw_queues >= new_nr_hw_queues) return 0; new_tags = kcalloc_node(new_nr_hw_queues, sizeof(struct blk_mq_tags *), @@ -4389,7 +4389,7 @@ static int blk_mq_realloc_tag_set_tags(struct blk_mq_tag_set *set, return -ENOMEM; if (set->tags) - memcpy(new_tags, set->tags, cur_nr_hw_queues * + memcpy(new_tags, set->tags, set->nr_hw_queues * sizeof(*set->tags)); kfree(set->tags); set->tags = new_tags; @@ -4705,11 +4705,9 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, } prev_nr_hw_queues = set->nr_hw_queues; - if (blk_mq_realloc_tag_set_tags(set, set->nr_hw_queues, nr_hw_queues) < - 0) + if (blk_mq_realloc_tag_set_tags(set, nr_hw_queues) < 0) goto reregister; - set->nr_hw_queues = nr_hw_queues; fallback: blk_mq_update_queue_map(set); list_for_each_entry(q, &set->tag_list, tag_set_list) { -- cgit v1.2.3 From 759aa12f19155fe4e4fb4740450b4aa4233b7d9f Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 2 Nov 2022 15:18:20 +0000 Subject: bio: don't rob starving biosets of bios Biosets keep a mempool, so as long as requests complete we can always allocate and have forward progress. Percpu bio caches break that assumption, as we may complete into the cache of one CPU and later try, and fail, to allocate from another CPU. We also can't grab from another CPU's cache without tricky synchronisation. If we're allocating a bio while the mempool is undersaturated, remove the REQ_ALLOC_CACHE flag, so on put the bio will go straight to the mempool. This might free into the mempool more requests than required, but assuming that there is no memory starvation in the system it'll stabilise and never hit that path.
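mempool_is_saturated() is added by a separate mempool patch in this series; conceptually it is just a fill-level check, roughly (simplified sketch):

static inline bool mempool_is_saturated(mempool_t *pool)
{
	/* saturated: the pool already holds its configured minimum */
	return READ_ONCE(pool->curr_nr) >= pool->min_nr;
}

So while the bioset's mempool is below min_nr, a freshly allocated bio drops REQ_ALLOC_CACHE and its eventual bio_put() refills the mempool instead of a per-cpu cache.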
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/aa150caf9c263fa92269e86d7826cc8fa65f38de.1667384020.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- block/bio.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'block') diff --git a/block/bio.c b/block/bio.c index aa1de6a367f9..91f4a6926a1b 100644 --- a/block/bio.c +++ b/block/bio.c @@ -526,6 +526,8 @@ struct bio *bio_alloc_bioset(struct block_device *bdev, unsigned short nr_vecs, } if (unlikely(!p)) return NULL; + if (!mempool_is_saturated(&bs->bio_pool)) + opf &= ~REQ_ALLOC_CACHE; bio = p + bs->front_pad; if (nr_vecs > BIO_INLINE_VECS) { -- cgit v1.2.3 From f25cf75a452150c243f74ab1f1836822137d5d2c Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 2 Nov 2022 15:18:21 +0000 Subject: bio: split pcpu cache part of bio_put into a helper Extract a helper out of bio_put for recycling into percpu caches. It's a preparation patch without functional changes. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/e97ab2026a89098ee1bfdd09bcb9451fced95f87.1667384020.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- block/bio.c | 38 +++++++++++++++++++++++++------------- 1 file changed, 25 insertions(+), 13 deletions(-) (limited to 'block') diff --git a/block/bio.c b/block/bio.c index 91f4a6926a1b..0d03b8d36447 100644 --- a/block/bio.c +++ b/block/bio.c @@ -727,6 +727,28 @@ static void bio_alloc_cache_destroy(struct bio_set *bs) bs->cache = NULL; } +static inline void bio_put_percpu_cache(struct bio *bio) +{ + struct bio_alloc_cache *cache; + + cache = per_cpu_ptr(bio->bi_pool->cache, get_cpu()); + bio_uninit(bio); + + if ((bio->bi_opf & REQ_POLLED) && !WARN_ON_ONCE(in_interrupt())) { + bio->bi_next = cache->free_list; + cache->free_list = bio; + cache->nr++; + } else { + put_cpu(); + bio_free(bio); + return; + } + + if (cache->nr > ALLOC_CACHE_MAX + ALLOC_CACHE_SLACK) + bio_alloc_cache_prune(cache, ALLOC_CACHE_SLACK); + put_cpu(); +} + /** * bio_put - release a reference to a bio * @bio: bio to release reference to @@ -742,20 +764,10 @@ void bio_put(struct bio *bio) if (!atomic_dec_and_test(&bio->__bi_cnt)) return; } - - if ((bio->bi_opf & REQ_ALLOC_CACHE) && !WARN_ON_ONCE(in_interrupt())) { - struct bio_alloc_cache *cache; - - bio_uninit(bio); - cache = per_cpu_ptr(bio->bi_pool->cache, get_cpu()); - bio->bi_next = cache->free_list; - cache->free_list = bio; - if (++cache->nr > ALLOC_CACHE_MAX + ALLOC_CACHE_SLACK) - bio_alloc_cache_prune(cache, ALLOC_CACHE_SLACK); - put_cpu(); - } else { + if (bio->bi_opf & REQ_ALLOC_CACHE) + bio_put_percpu_cache(bio); + else bio_free(bio); - } } EXPORT_SYMBOL(bio_put); -- cgit v1.2.3 From b99182c501c3dedeba4c9e6c92a60df1a2fee119 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 2 Nov 2022 15:18:22 +0000 Subject: bio: add pcpu caching for non-polling bio_put This patch extends REQ_ALLOC_CACHE to IRQ completions, whereas previously it was limited to iopoll. Instead of guarding the list with irq toggling on alloc, which is expensive, it keeps an additional irq-safe list from which bios are spliced in batches to amortise overhead. On the put side it toggles irqs, but in many cases they're already disabled and so the toggle is cheap.
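The batching idea, isolated from the bio details (generic sketch; struct obj_cache is made up): IRQ-context producers push onto an irq-safe list, and the process-context consumer splices the whole list over in one irqsave section, so the interrupt toggle is paid once per batch rather than once per object:

struct obj_cache {
	struct obj *free_list;		/* process context only */
	struct obj *free_list_irq;	/* filled from IRQ context */
	unsigned int nr, nr_irq;
};

static void obj_cache_splice_irq(struct obj_cache *c)
{
	unsigned long flags;

	/* caller ensures c->free_list is empty, as in the patch below */
	local_irq_save(flags);
	c->free_list = c->free_list_irq;
	c->free_list_irq = NULL;
	c->nr += c->nr_irq;
	c->nr_irq = 0;
	local_irq_restore(flags);
}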
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/c2306de96b900ab9264f4428ec37768ddcf0da36.1667384020.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- block/bio.c | 70 +++++++++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 54 insertions(+), 16 deletions(-) (limited to 'block') diff --git a/block/bio.c b/block/bio.c index 0d03b8d36447..14a7105231d2 100644 --- a/block/bio.c +++ b/block/bio.c @@ -25,9 +25,15 @@ #include "blk-rq-qos.h" #include "blk-cgroup.h" +#define ALLOC_CACHE_THRESHOLD 16 +#define ALLOC_CACHE_SLACK 64 +#define ALLOC_CACHE_MAX 512 + struct bio_alloc_cache { struct bio *free_list; + struct bio *free_list_irq; unsigned int nr; + unsigned int nr_irq; }; static struct biovec_slab { @@ -408,6 +414,22 @@ static void punt_bios_to_rescuer(struct bio_set *bs) queue_work(bs->rescue_workqueue, &bs->rescue_work); } +static void bio_alloc_irq_cache_splice(struct bio_alloc_cache *cache) +{ + unsigned long flags; + + /* cache->free_list must be empty */ + if (WARN_ON_ONCE(cache->free_list)) + return; + + local_irq_save(flags); + cache->free_list = cache->free_list_irq; + cache->free_list_irq = NULL; + cache->nr += cache->nr_irq; + cache->nr_irq = 0; + local_irq_restore(flags); +} + static struct bio *bio_alloc_percpu_cache(struct block_device *bdev, unsigned short nr_vecs, blk_opf_t opf, gfp_t gfp, struct bio_set *bs) @@ -417,8 +439,12 @@ static struct bio *bio_alloc_percpu_cache(struct block_device *bdev, cache = per_cpu_ptr(bs->cache, get_cpu()); if (!cache->free_list) { - put_cpu(); - return NULL; + if (READ_ONCE(cache->nr_irq) >= ALLOC_CACHE_THRESHOLD) + bio_alloc_irq_cache_splice(cache); + if (!cache->free_list) { + put_cpu(); + return NULL; + } } bio = cache->free_list; cache->free_list = bio->bi_next; @@ -462,9 +488,6 @@ static struct bio *bio_alloc_percpu_cache(struct block_device *bdev, * submit_bio_noacct() should be avoided - instead, use bio_set's front_pad * for per bio allocations. * - * If REQ_ALLOC_CACHE is set, the final put of the bio MUST be done from process - * context, not hard/soft IRQ. - * * Returns: Pointer to new bio on success, NULL on failure. 
*/ struct bio *bio_alloc_bioset(struct block_device *bdev, unsigned short nr_vecs, @@ -678,11 +701,8 @@ void guard_bio_eod(struct bio *bio) bio_truncate(bio, maxsector << 9); } -#define ALLOC_CACHE_MAX 512 -#define ALLOC_CACHE_SLACK 64 - -static void bio_alloc_cache_prune(struct bio_alloc_cache *cache, - unsigned int nr) +static int __bio_alloc_cache_prune(struct bio_alloc_cache *cache, + unsigned int nr) { unsigned int i = 0; struct bio *bio; @@ -694,6 +714,17 @@ static void bio_alloc_cache_prune(struct bio_alloc_cache *cache, if (++i == nr) break; } + return i; +} + +static void bio_alloc_cache_prune(struct bio_alloc_cache *cache, + unsigned int nr) +{ + nr -= __bio_alloc_cache_prune(cache, nr); + if (!READ_ONCE(cache->free_list)) { + bio_alloc_irq_cache_splice(cache); + __bio_alloc_cache_prune(cache, nr); + } } static int bio_cpu_dead(unsigned int cpu, struct hlist_node *node) @@ -732,6 +763,12 @@ static inline void bio_put_percpu_cache(struct bio *bio) struct bio_alloc_cache *cache; cache = per_cpu_ptr(bio->bi_pool->cache, get_cpu()); + if (READ_ONCE(cache->nr_irq) + cache->nr > ALLOC_CACHE_MAX) { + put_cpu(); + bio_free(bio); + return; + } + bio_uninit(bio); if ((bio->bi_opf & REQ_POLLED) && !WARN_ON_ONCE(in_interrupt())) { @@ -739,13 +776,14 @@ static inline void bio_put_percpu_cache(struct bio *bio) cache->free_list = bio; cache->nr++; } else { - put_cpu(); - bio_free(bio); - return; - } + unsigned long flags; - if (cache->nr > ALLOC_CACHE_MAX + ALLOC_CACHE_SLACK) - bio_alloc_cache_prune(cache, ALLOC_CACHE_SLACK); + local_irq_save(flags); + bio->bi_next = cache->free_list_irq; + cache->free_list_irq = bio; + cache->nr_irq++; + local_irq_restore(flags); + } put_cpu(); } -- cgit v1.2.3 From 42b2b2fb6ecf1cc11eb7e75782dd7a7a17e6d958 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 2 Nov 2022 15:18:23 +0000 Subject: bio: shrink max number of pcpu cached bios The downside of the bio pcpu cache is that bios cached on a cpu will never be freed unless there is new I/O issued from that cpu. We currently keep at most 512 bios, which feels like too much; halve it. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/bc198e8efb27d8c740d80c8ce477432729075096.1667384020.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- block/bio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/bio.c b/block/bio.c index 14a7105231d2..ab59a491a883 100644 --- a/block/bio.c +++ b/block/bio.c @@ -27,7 +27,7 @@ #define ALLOC_CACHE_THRESHOLD 16 #define ALLOC_CACHE_SLACK 64 -#define ALLOC_CACHE_MAX 512 +#define ALLOC_CACHE_MAX 256 struct bio_alloc_cache { struct bio *free_list; -- cgit v1.2.3 From 470373e888f494a52f9916bf3eeea41fe819d031 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 16 Nov 2022 14:20:35 +0100 Subject: block: remove blkdev_writepages While the block device code should switch to implementing ->writepages instead of ->writepage eventually, the current implementation is entirely pointless as it does the same looping over ->writepage as the generic code if no ->writepages is present. Remove blkdev_writepages so that we can eventually unexport generic_writepages.
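For context, the generic fallback that makes the removed method redundant lives in do_writepages(); roughly (a sketch of the mm/page-writeback.c logic of that era):

if (mapping->a_ops->writepages)
	ret = mapping->a_ops->writepages(mapping, wbc);
else
	ret = generic_writepages(mapping, wbc);

An aops entry that merely forwards to generic_writepages() therefore adds nothing over leaving ->writepages unset.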
Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20221116132035.2192924-1-hch@lst.de Signed-off-by: Jens Axboe --- block/fops.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'block') diff --git a/block/fops.c b/block/fops.c index b90742595317..50d245e8c913 100644 --- a/block/fops.c +++ b/block/fops.c @@ -405,12 +405,6 @@ static int blkdev_write_end(struct file *file, struct address_space *mapping, return ret; } -static int blkdev_writepages(struct address_space *mapping, - struct writeback_control *wbc) -{ - return generic_writepages(mapping, wbc); -} - const struct address_space_operations def_blk_aops = { .dirty_folio = block_dirty_folio, .invalidate_folio = block_invalidate_folio, @@ -419,7 +413,6 @@ const struct address_space_operations def_blk_aops = { .writepage = blkdev_writepage, .write_begin = blkdev_write_begin, .write_end = blkdev_write_end, - .writepages = blkdev_writepages, .direct_IO = blkdev_direct_IO, .migrate_folio = buffer_migrate_folio_norefs, .is_dirty_writeback = buffer_check_dirty_writeback, -- cgit v1.2.3 From d90db3b1c8676bc88b4309c5a571333de2263b8e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 15 Nov 2022 22:10:45 +0800 Subject: block: clear ->slave_dir when dropping the main slave_dir reference Zero out the pointer to ->slave_dir so that the holder code doesn't incorrectly treat the object as alive when add_disk failed or after del_gendisk was called. Fixes: 89f871af1b26 ("dm: delay registering the gendisk") Reported-by: Yu Kuai Signed-off-by: Christoph Hellwig Signed-off-by: Yu Kuai Reviewed-by: Mike Snitzer Link: https://lore.kernel.org/r/20221115141054.1051801-2-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe --- block/genhd.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index 09cde914e054..6271ad06ed07 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -528,6 +528,7 @@ out_unregister_queue: blk_unregister_queue(disk); out_put_slave_dir: kobject_put(disk->slave_dir); + disk->slave_dir = NULL; out_put_holder_dir: kobject_put(disk->part0->bd_holder_dir); out_del_integrity: @@ -629,6 +630,7 @@ void del_gendisk(struct gendisk *disk) kobject_put(disk->part0->bd_holder_dir); kobject_put(disk->slave_dir); + disk->slave_dir = NULL; part_stat_set_all(disk->part0, 0); disk->part0->bd_stamp = 0; -- cgit v1.2.3 From 7abc077788363ac7194aefd355306f8e974feff7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 15 Nov 2022 22:10:51 +0800 Subject: block: remove delayed holder registration Now that dm has been fixed to keep track of holder registrations before add_disk, the somewhat buggy block layer code can be safely removed.
Signed-off-by: Christoph Hellwig Signed-off-by: Yu Kuai Link: https://lore.kernel.org/r/20221115141054.1051801-8-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe --- block/genhd.c | 4 --- block/holder.c | 72 +++++++++++++++----------------------------------- include/linux/blkdev.h | 5 ---- 3 files changed, 21 insertions(+), 60 deletions(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index 6271ad06ed07..075d8da284f5 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -478,10 +478,6 @@ int __must_check device_add_disk(struct device *parent, struct gendisk *disk, goto out_put_holder_dir; } - ret = bd_register_pending_holders(disk); - if (ret < 0) - goto out_put_slave_dir; - ret = blk_register_queue(disk); if (ret) goto out_put_slave_dir; diff --git a/block/holder.c b/block/holder.c index 5283bc804cc1..dd9327b43ce0 100644 --- a/block/holder.c +++ b/block/holder.c @@ -29,19 +29,6 @@ static void del_symlink(struct kobject *from, struct kobject *to) sysfs_remove_link(from, kobject_name(to)); } -static int __link_disk_holder(struct block_device *bdev, struct gendisk *disk) -{ - int ret; - - ret = add_symlink(disk->slave_dir, bdev_kobj(bdev)); - if (ret) - return ret; - ret = add_symlink(bdev->bd_holder_dir, &disk_to_dev(disk)->kobj); - if (ret) - del_symlink(disk->slave_dir, bdev_kobj(bdev)); - return ret; -} - /** * bd_link_disk_holder - create symlinks between holding disk and slave bdev * @bdev: the claimed slave bdev @@ -75,6 +62,9 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) struct bd_holder_disk *holder; int ret = 0; + if (WARN_ON_ONCE(!disk->slave_dir)) + return -EINVAL; + mutex_lock(&disk->open_mutex); WARN_ON_ONCE(!bdev->bd_holder); @@ -94,34 +84,32 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) INIT_LIST_HEAD(&holder->list); holder->bdev = bdev; holder->refcnt = 1; - if (disk->slave_dir) { - ret = __link_disk_holder(bdev, disk); - if (ret) { - kfree(holder); - goto out_unlock; - } - } - + ret = add_symlink(disk->slave_dir, bdev_kobj(bdev)); + if (ret) + goto out_free_holder; + ret = add_symlink(bdev->bd_holder_dir, &disk_to_dev(disk)->kobj); + if (ret) + goto out_del_symlink; list_add(&holder->list, &disk->slave_bdevs); + /* * del_gendisk drops the initial reference to bd_holder_dir, so we need * to keep our own here to allow for cleanup past that point. 
*/ kobject_get(bdev->bd_holder_dir); + mutex_unlock(&disk->open_mutex); + return 0; +out_del_symlink: + del_symlink(disk->slave_dir, bdev_kobj(bdev)); +out_free_holder: + kfree(holder); out_unlock: mutex_unlock(&disk->open_mutex); return ret; } EXPORT_SYMBOL_GPL(bd_link_disk_holder); -static void __unlink_disk_holder(struct block_device *bdev, - struct gendisk *disk) -{ - del_symlink(disk->slave_dir, bdev_kobj(bdev)); - del_symlink(bdev->bd_holder_dir, &disk_to_dev(disk)->kobj); -} - /** * bd_unlink_disk_holder - destroy symlinks created by bd_link_disk_holder() * @bdev: the calimed slave bdev @@ -136,11 +124,14 @@ void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk) { struct bd_holder_disk *holder; + if (WARN_ON_ONCE(!disk->slave_dir)) + return; + mutex_lock(&disk->open_mutex); holder = bd_find_holder_disk(bdev, disk); if (!WARN_ON_ONCE(holder == NULL) && !--holder->refcnt) { - if (disk->slave_dir) - __unlink_disk_holder(bdev, disk); + del_symlink(disk->slave_dir, bdev_kobj(bdev)); + del_symlink(bdev->bd_holder_dir, &disk_to_dev(disk)->kobj); kobject_put(bdev->bd_holder_dir); list_del_init(&holder->list); kfree(holder); @@ -148,24 +139,3 @@ void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk) mutex_unlock(&disk->open_mutex); } EXPORT_SYMBOL_GPL(bd_unlink_disk_holder); - -int bd_register_pending_holders(struct gendisk *disk) -{ - struct bd_holder_disk *holder; - int ret; - - mutex_lock(&disk->open_mutex); - list_for_each_entry(holder, &disk->slave_bdevs, list) { - ret = __link_disk_holder(holder->bdev, disk); - if (ret) - goto out_undo; - } - mutex_unlock(&disk->open_mutex); - return 0; - -out_undo: - list_for_each_entry_continue_reverse(holder, &disk->slave_bdevs, list) - __unlink_disk_holder(holder->bdev, disk); - mutex_unlock(&disk->open_mutex); - return ret; -} diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 9188aa3f6259..516e45246868 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -833,7 +833,6 @@ void set_capacity(struct gendisk *disk, sector_t size); #ifdef CONFIG_BLOCK_HOLDER_DEPRECATED int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk); void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk); -int bd_register_pending_holders(struct gendisk *disk); #else static inline int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) { @@ -844,10 +843,6 @@ static inline void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk) { } -static inline int bd_register_pending_holders(struct gendisk *disk) -{ - return 0; -} #endif /* CONFIG_BLOCK_HOLDER_DEPRECATED */ dev_t part_devt(struct gendisk *disk, u8 partno); -- cgit v1.2.3 From 62f535e1f061b4c2cc76061b6b59af9f9335ee34 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Tue, 15 Nov 2022 22:10:52 +0800 Subject: block: fix use after free for bd_holder_dir Currently, the caller of bd_link_disk_holder() gets 'bdev' by blkdev_get_by_dev(), which looks up 'bdev' by inode number 'dev'. However, it's possible that del_gendisk() is called concurrently, and 'bd_holder_dir' can be freed before bd_link_disk_holder() accesses it, thus triggering a use after free. t1: t2: bdev = blkdev_get_by_dev del_gendisk kobject_put(bd_holder_dir) kobject_free() bd_link_disk_holder Fix the problem by checking that the disk is still live and grabbing a reference to 'bd_holder_dir' first in bd_link_disk_holder().
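The shape of the fix, isolated from the full diff that follows (simplified; the real code locks bdev->bd_disk->open_mutex): revalidate under the lock that del_gendisk() also takes, and only then take the reference:

mutex_lock(&disk->open_mutex);
if (!disk_live(disk)) {
	mutex_unlock(&disk->open_mutex);
	return -ENODEV;			/* raced with del_gendisk() */
}
kobject_get(bdev->bd_holder_dir);	/* safe to use past this point */
mutex_unlock(&disk->open_mutex);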
Signed-off-by: Yu Kuai Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20221115141054.1051801-9-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe --- block/holder.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) (limited to 'block') diff --git a/block/holder.c b/block/holder.c index dd9327b43ce0..c8e462053f49 100644 --- a/block/holder.c +++ b/block/holder.c @@ -65,12 +65,24 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) if (WARN_ON_ONCE(!disk->slave_dir)) return -EINVAL; - mutex_lock(&disk->open_mutex); + /* + * del_gendisk drops the initial reference to bd_holder_dir, so we + * need to keep our own here to allow for cleanup past that point. + */ + mutex_lock(&bdev->bd_disk->open_mutex); + if (!disk_live(bdev->bd_disk)) { + mutex_unlock(&bdev->bd_disk->open_mutex); + return -ENODEV; + } + kobject_get(bdev->bd_holder_dir); + mutex_unlock(&bdev->bd_disk->open_mutex); + mutex_lock(&disk->open_mutex); WARN_ON_ONCE(!bdev->bd_holder); holder = bd_find_holder_disk(bdev, disk); if (holder) { + kobject_put(bdev->bd_holder_dir); holder->refcnt++; goto out_unlock; } @@ -92,11 +104,6 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) goto out_del_symlink; list_add(&holder->list, &disk->slave_bdevs); - /* - * del_gendisk drops the initial reference to bd_holder_dir, so we need - * to keep our own here to allow for cleanup past that point. - */ - kobject_get(bdev->bd_holder_dir); mutex_unlock(&disk->open_mutex); return 0; @@ -106,6 +113,8 @@ out_free_holder: kfree(holder); out_unlock: mutex_unlock(&disk->open_mutex); + if (ret) + kobject_put(bdev->bd_holder_dir); return ret; } EXPORT_SYMBOL_GPL(bd_link_disk_holder); -- cgit v1.2.3 From 3b3449c1e6c3fe19f62607ff4f353f8bb82d5c4e Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Tue, 15 Nov 2022 22:10:53 +0800 Subject: block: store the holder kobject in bd_holder_disk We hold a reference to the holder kobject for each bd_holder_disk, so to make the code a bit more robust, use a reference to it instead of the block_device. As long as no one clears ->bd_holder_dir before freeing the disk, this isn't strictly required, but it does make the code clearer and more robust.
Originally-From: Christoph Hellwig Signed-off-by: Yu Kuai Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20221115141054.1051801-10-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe --- block/holder.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'block') diff --git a/block/holder.c b/block/holder.c index c8e462053f49..3332142bb867 100644 --- a/block/holder.c +++ b/block/holder.c @@ -4,7 +4,7 @@ struct bd_holder_disk { struct list_head list; - struct block_device *bdev; + struct kobject *holder_dir; int refcnt; }; @@ -14,7 +14,7 @@ static struct bd_holder_disk *bd_find_holder_disk(struct block_device *bdev, struct bd_holder_disk *holder; list_for_each_entry(holder, &disk->slave_bdevs, list) - if (holder->bdev == bdev) + if (holder->holder_dir == bdev->bd_holder_dir) return holder; return NULL; } @@ -94,8 +94,9 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) } INIT_LIST_HEAD(&holder->list); - holder->bdev = bdev; holder->refcnt = 1; + holder->holder_dir = bdev->bd_holder_dir; + ret = add_symlink(disk->slave_dir, bdev_kobj(bdev)); if (ret) goto out_free_holder; @@ -140,8 +141,8 @@ void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk) holder = bd_find_holder_disk(bdev, disk); if (!WARN_ON_ONCE(holder == NULL) && !--holder->refcnt) { del_symlink(disk->slave_dir, bdev_kobj(bdev)); - del_symlink(bdev->bd_holder_dir, &disk_to_dev(disk)->kobj); - kobject_put(bdev->bd_holder_dir); + del_symlink(holder->holder_dir, &disk_to_dev(disk)->kobj); + kobject_put(holder->holder_dir); list_del_init(&holder->list); kfree(holder); } -- cgit v1.2.3 From 077a4033541fc96fb0a955985aab7d1f353da831 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Tue, 15 Nov 2022 22:10:54 +0800 Subject: block: don't allow a disk link holder to itself After creating a dm device, the user can then reload the dm device with itself as a target, and an endless loop is triggered because dm keeps looking up itself. Test procedures: 1) dmsetup create test --table "xxx sda", assume dm-0 is created 2) dmsetup suspend test 3) dmsetup reload test --table "xxx dm-0" 4) dmsetup resume test Test result: BUG: TASK stack guard page was hit at 00000000736a261f (stack is 000000008d12c88d..00000000c8dd82d5) stack guard page: 0000 [#1] PREEMPT SMP CPU: 29 PID: 946 Comm: systemd-udevd Not tainted 6.1.0-rc3-next-20221101-00006-g17640ca3b0ee #1295 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS ?-20190727_073836-buildvm-ppc64le-16.ppc.fedoraproject.org-3.fc31 04/01/2014 RIP: 0010:dm_prepare_ioctl+0xf/0x1e0 Code: da 48 83 05 4a 7c 99 0b 01 41 89 c4 eb cd e8 b8 1f 40 00 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 41 57 48 83 05 a1 5a 99 0b 01 <41> 56 49 89 d6 41 55 4c 8d af 90 02 00 00 9 RSP: 0018:ffffc90002090000 EFLAGS: 00010206 RAX: ffff8881049d6800 RBX: ffff88817e589000 RCX: 0000000000000000 RDX: ffffc90002090010 RSI: ffffc9000209001c RDI: ffff88817e589000 RBP: 00000000484a101d R08: 0000000000000000 R09: 0000000000000007 R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000005331 R13: 0000000000005331 R14: 0000000000000000 R15: 0000000000000000 FS: 00007fddf9609200(0000) GS:ffff889fbfd40000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: ffffc9000208fff8 CR3: 0000000179043000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: dm_blk_ioctl+0x50/0x1c0 ?
dm_prepare_ioctl+0xe0/0x1e0 dm_blk_ioctl+0x88/0x1c0 dm_blk_ioctl+0x88/0x1c0 ......(a lot of same lines) dm_blk_ioctl+0x88/0x1c0 dm_blk_ioctl+0x88/0x1c0 blkdev_ioctl+0x184/0x3e0 __x64_sys_ioctl+0xa3/0x110 do_syscall_64+0x35/0x80 entry_SYSCALL_64_after_hwframe+0x63/0xcd RIP: 0033:0x7fddf7306577 Code: b3 66 90 48 8b 05 11 89 2c 00 64 c7 00 26 00 00 00 48 c7 c0 ff ff ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 b8 10 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d e1 88 8 RSP: 002b:00007ffd0b2ec318 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 RAX: ffffffffffffffda RBX: 00005634ef478320 RCX: 00007fddf7306577 RDX: 0000000000000000 RSI: 0000000000005331 RDI: 0000000000000007 RBP: 0000000000000007 R08: 00005634ef4843e0 R09: 0000000000000080 R10: 00007fddf75cfb38 R11: 0000000000000246 R12: 00000000030d4000 R13: 0000000000000000 R14: 0000000000000000 R15: 00005634ef48b800 Modules linked in: ---[ end trace 0000000000000000 ]--- RIP: 0010:dm_prepare_ioctl+0xf/0x1e0 Code: da 48 83 05 4a 7c 99 0b 01 41 89 c4 eb cd e8 b8 1f 40 00 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 41 57 48 83 05 a1 5a 99 0b 01 <41> 56 49 89 d6 41 55 4c 8d af 90 02 00 00 9 RSP: 0018:ffffc90002090000 EFLAGS: 00010206 RAX: ffff8881049d6800 RBX: ffff88817e589000 RCX: 0000000000000000 RDX: ffffc90002090010 RSI: ffffc9000209001c RDI: ffff88817e589000 RBP: 00000000484a101d R08: 0000000000000000 R09: 0000000000000007 R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000005331 R13: 0000000000005331 R14: 0000000000000000 R15: 0000000000000000 FS: 00007fddf9609200(0000) GS:ffff889fbfd40000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: ffffc9000208fff8 CR3: 0000000179043000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Kernel panic - not syncing: Fatal exception in interrupt Kernel Offset: disabled ---[ end Kernel panic - not syncing: Fatal exception in interrupt ]--- Fix the problem by forbidding a disk from creating a link to itself. Signed-off-by: Yu Kuai Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20221115141054.1051801-11-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe --- block/holder.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'block') diff --git a/block/holder.c b/block/holder.c index 3332142bb867..37d18c13d958 100644 --- a/block/holder.c +++ b/block/holder.c @@ -65,6 +65,9 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) if (WARN_ON_ONCE(!disk->slave_dir)) return -EINVAL; + if (bdev->bd_disk == disk) + return -EINVAL; + /* * del_gendisk drops the initial reference to bd_holder_dir, so we * need to keep our own here to allow for cleanup past that point. -- cgit v1.2.3 From b5a9adcbd5dc95d34d1f5fc84eff9af6fc60d284 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Fri, 4 Nov 2022 20:59:00 -0400 Subject: blk-cgroup: Return -ENOMEM directly in blkcg_css_alloc() error path For blkcg_css_alloc(), the only error that will be returned is -ENOMEM. Simplify the error handling code by returning this error directly instead of setting an intermediate "ret" variable.
Signed-off-by: Waiman Long Reviewed-by: Ming Lei Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20221105005902.407297-2-longman@redhat.com Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 6a5c849ee061..af8a4d2d1fd1 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1139,7 +1139,6 @@ static struct cgroup_subsys_state * blkcg_css_alloc(struct cgroup_subsys_state *parent_css) { struct blkcg *blkcg; - struct cgroup_subsys_state *ret; int i; mutex_lock(&blkcg_pol_mutex); @@ -1148,10 +1147,8 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css) blkcg = &blkcg_root; } else { blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL); - if (!blkcg) { - ret = ERR_PTR(-ENOMEM); + if (!blkcg) goto unlock; - } } for (i = 0; i < BLKCG_MAX_POLS ; i++) { @@ -1168,10 +1165,9 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css) continue; cpd = pol->cpd_alloc_fn(GFP_KERNEL); - if (!cpd) { - ret = ERR_PTR(-ENOMEM); + if (!cpd) goto free_pd_blkcg; - } + blkcg->cpd[i] = cpd; cpd->blkcg = blkcg; cpd->plid = i; @@ -1200,7 +1196,7 @@ free_pd_blkcg: kfree(blkcg); unlock: mutex_unlock(&blkcg_pol_mutex); - return ret; + return ERR_PTR(-ENOMEM); } static int blkcg_css_online(struct cgroup_subsys_state *css) -- cgit v1.2.3 From 3b8cc6298724021da845f2f9fd7dd4b6829a6817 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Fri, 4 Nov 2022 20:59:01 -0400 Subject: blk-cgroup: Optimize blkcg_rstat_flush() For a system with many CPUs and block devices, the time to do blkcg_rstat_flush() from cgroup_rstat_flush() can be rather long. It can be especially problematic as interrupts are disabled during the flush. It was reported that it might take seconds to complete in some extreme cases, leading to hard lockup messages. As it is likely that not all the percpu blkg_iostat_set's have been updated since the last flush, those stale blkg_iostat_set's don't need to be flushed in this case. This patch optimizes blkcg_rstat_flush() by keeping a lockless list of recently updated blkg_iostat_set's in a newly added percpu blkcg->lhead pointer. The blkg_iostat_set is added to a lockless list on the update side in blk_cgroup_bio_start(). It is removed from the lockless list when flushed in blkcg_rstat_flush(). Due to racing, it is possible that blkg_iostat_set's in the lockless list may have no new IO stats to be flushed, but that is OK. To protect against destruction of the blkg, a percpu reference is taken when the blkg_iostat_set is put onto the lockless list and released when it is removed. When booting up an instrumented test kernel with this patch on a 2-socket 96-thread system with cgroup v2, out of the 2051 calls to cgroup_rstat_flush() after bootup, 1788 of the calls returned immediately because the lockless lists were empty. After an all-cpu kernel build, the ratio became 6295424/6340513. That was more than 99%.
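The update/flush protocol can be condensed as follows; this is a simplified reading of the diff below using the kernel's <linux/llist.h> primitives, with the percpu blkg reference counting and RCU details elided:

	#include <linux/llist.h>

	struct stat_set {
		struct llist_node lnode;
		int lqueued;		/* true while queued on a lockless list */
		/* ... counters protected by a u64_stats_sync ... */
	};

	/* Update side (per IO, preemption disabled): queue the stat set at
	 * most once between flushes. */
	static void stat_mark_updated(struct stat_set *s, struct llist_head *lhead)
	{
		if (!READ_ONCE(s->lqueued)) {
			llist_add(&s->lnode, lhead);	/* lock-free atomic push */
			WRITE_ONCE(s->lqueued, true);
			/* the real code also takes a percpu ref on the blkg here */
		}
	}

	/* Flush side: detach the whole per-cpu list atomically, then walk
	 * only the entries that were actually updated since the last flush. */
	static void stat_flush(struct llist_head *lhead)
	{
		struct llist_node *lnode = llist_del_all(lhead);
		struct stat_set *s, *next;

		llist_for_each_entry_safe(s, next, lnode, lnode) {
			WRITE_ONCE(s->lqueued, false);
			/* fold s into the aggregate, then drop the blkg ref */
		}
	}

An empty llist_del_all() result is what lets most flushes on an idle system return immediately, which is where the >99% figure above comes from.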
Signed-off-by: Waiman Long Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20221105005902.407297-3-longman@redhat.com Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 76 +++++++++++++++++++++++++++++++++++++++++++++++++----- block/blk-cgroup.h | 10 +++++++ 2 files changed, 80 insertions(+), 6 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index af8a4d2d1fd1..3e03c0d13253 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -59,6 +59,37 @@ static struct workqueue_struct *blkcg_punt_bio_wq; #define BLKG_DESTROY_BATCH_SIZE 64 +/* + * Lockless lists for tracking IO stats update + * + * New IO stats are stored in the percpu iostat_cpu within blkcg_gq (blkg). + * There are multiple blkg's (one for each block device) attached to each + * blkcg. The rstat code keeps track of which cpu has IO stats updated, + * but it doesn't know which blkg has the updated stats. If there are many + * block devices in a system, the cost of iterating all the blkg's to flush + * out the IO stats can be high. To reduce such overhead, a set of percpu + * lockless lists (lhead) per blkcg are used to track the set of recently + * updated iostat_cpu's since the last flush. An iostat_cpu will be put + * onto the lockless list on the update side [blk_cgroup_bio_start()] if + * not there yet and then removed when being flushed [blkcg_rstat_flush()]. + * References to blkg are gotten and then put back in the process to + * protect against blkg removal. + * + * Return: 0 if successful or -ENOMEM if allocation fails. + */ +static int init_blkcg_llists(struct blkcg *blkcg) +{ + int cpu; + + blkcg->lhead = alloc_percpu_gfp(struct llist_head, GFP_KERNEL); + if (!blkcg->lhead) + return -ENOMEM; + + for_each_possible_cpu(cpu) + init_llist_head(per_cpu_ptr(blkcg->lhead, cpu)); + return 0; +} + /** * blkcg_css - find the current css * @@ -236,8 +267,10 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct gendisk *disk, blkg->blkcg = blkcg; u64_stats_init(&blkg->iostat.sync); - for_each_possible_cpu(cpu) + for_each_possible_cpu(cpu) { u64_stats_init(&per_cpu_ptr(blkg->iostat_cpu, cpu)->sync); + per_cpu_ptr(blkg->iostat_cpu, cpu)->blkg = blkg; + } for (i = 0; i < BLKCG_MAX_POLS; i++) { struct blkcg_policy *pol = blkcg_policy[i]; @@ -827,7 +860,9 @@ static void blkcg_iostat_update(struct blkcg_gq *blkg, struct blkg_iostat *cur, static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu) { struct blkcg *blkcg = css_to_blkcg(css); - struct blkcg_gq *blkg; + struct llist_head *lhead = per_cpu_ptr(blkcg->lhead, cpu); + struct llist_node *lnode; + struct blkg_iostat_set *bisc, *next_bisc; /* Root-level stats are sourced from system-wide IO stats */ if (!cgroup_parent(css->cgroup)) @@ -835,12 +870,21 @@ static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu) rcu_read_lock(); - hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) { + lnode = llist_del_all(lhead); + if (!lnode) + goto out; + + /* + * Iterate only the iostat_cpu's queued in the lockless list. 
+ */ + llist_for_each_entry_safe(bisc, next_bisc, lnode, lnode) { + struct blkcg_gq *blkg = bisc->blkg; struct blkcg_gq *parent = blkg->parent; - struct blkg_iostat_set *bisc = per_cpu_ptr(blkg->iostat_cpu, cpu); struct blkg_iostat cur; unsigned int seq; + WRITE_ONCE(bisc->lqueued, false); + /* fetch the current per-cpu values */ do { seq = u64_stats_fetch_begin(&bisc->sync); @@ -853,8 +897,10 @@ static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu) if (parent && parent->parent) blkcg_iostat_update(parent, &blkg->iostat.cur, &blkg->iostat.last); + percpu_ref_put(&blkg->refcnt); } +out: rcu_read_unlock(); } @@ -1132,6 +1178,7 @@ static void blkcg_css_free(struct cgroup_subsys_state *css) mutex_unlock(&blkcg_pol_mutex); + free_percpu(blkcg->lhead); kfree(blkcg); } @@ -1151,6 +1198,9 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css) goto unlock; } + if (init_blkcg_llists(blkcg)) + goto free_blkcg; + for (i = 0; i < BLKCG_MAX_POLS ; i++) { struct blkcg_policy *pol = blkcg_policy[i]; struct blkcg_policy_data *cpd; @@ -1191,7 +1241,8 @@ free_pd_blkcg: for (i--; i >= 0; i--) if (blkcg->cpd[i]) blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]); - + free_percpu(blkcg->lhead); +free_blkcg: if (blkcg != &blkcg_root) kfree(blkcg); unlock: @@ -1939,6 +1990,7 @@ static int blk_cgroup_io_type(struct bio *bio) void blk_cgroup_bio_start(struct bio *bio) { + struct blkcg *blkcg = bio->bi_blkg->blkcg; int rwd = blk_cgroup_io_type(bio), cpu; struct blkg_iostat_set *bis; unsigned long flags; @@ -1957,9 +2009,21 @@ void blk_cgroup_bio_start(struct bio *bio) } bis->cur.ios[rwd]++; + /* + * If the iostat_cpu isn't in a lockless list, put it into the + * list to indicate that a stat update is pending. + */ + if (!READ_ONCE(bis->lqueued)) { + struct llist_head *lhead = this_cpu_ptr(blkcg->lhead); + + llist_add(&bis->lnode, lhead); + WRITE_ONCE(bis->lqueued, true); + percpu_ref_get(&bis->blkg->refcnt); + } + u64_stats_update_end_irqrestore(&bis->sync, flags); if (cgroup_subsys_on_dfl(io_cgrp_subsys)) - cgroup_rstat_updated(bio->bi_blkg->blkcg->css.cgroup, cpu); + cgroup_rstat_updated(blkcg->css.cgroup, cpu); put_cpu(); } diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index aa2b286bc825..1e94e404eaa8 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -18,6 +18,7 @@ #include #include #include +#include struct blkcg_gq; struct blkg_policy_data; @@ -43,6 +44,9 @@ struct blkg_iostat { struct blkg_iostat_set { struct u64_stats_sync sync; + struct blkcg_gq *blkg; + struct llist_node lnode; + int lqueued; /* queued in llist */ struct blkg_iostat cur; struct blkg_iostat last; }; @@ -97,6 +101,12 @@ struct blkcg { struct blkcg_policy_data *cpd[BLKCG_MAX_POLS]; struct list_head all_blkcgs_node; + + /* + * List of updated percpu blkg_iostat_set's since the last flush. + */ + struct llist_head __percpu *lhead; + #ifdef CONFIG_BLK_CGROUP_FC_APPID char fc_app_id[FC_APPID_LEN]; #endif -- cgit v1.2.3 From dae590a6c96c799434e0ff8156ef29b88c257e60 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Fri, 4 Nov 2022 20:59:02 -0400 Subject: blk-cgroup: Flush stats at blkgs destruction path As noted by Michal, the blkg_iostat_set's in the lockless list hold reference to blkg's to protect against their removal. Those blkg's hold reference to blkcg. When a cgroup is being destroyed, cgroup_rstat_flush() is only called at css_release_work_fn() which is called when the blkcg reference count reaches 0. 
This circular dependency will prevent blkcg from being freed until some other events cause cgroup_rstat_flush() to be called to flush out the pending blkcg stats. To prevent this delayed blkcg removal, add a new cgroup_rstat_css_cpu_flush() function to flush stats for a given css and cpu, and call it at the blkgs destruction path, blkcg_destroy_blkgs(), whenever there are still some pending stats to be flushed. This will ensure that the blkcg reference count can reach 0 ASAP. Signed-off-by: Waiman Long Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20221105005902.407297-4-longman@redhat.com Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 15 ++++++++++++++- include/linux/cgroup.h | 1 + kernel/cgroup/rstat.c | 20 ++++++++++++++++++++ 3 files changed, 35 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 3e03c0d13253..57941d2a8ba3 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1084,10 +1084,12 @@ struct list_head *blkcg_get_cgwb_list(struct cgroup_subsys_state *css) */ static void blkcg_destroy_blkgs(struct blkcg *blkcg) { + int cpu; + might_sleep(); + css_get(&blkcg->css); spin_lock_irq(&blkcg->lock); - while (!hlist_empty(&blkcg->blkg_list)) { struct blkcg_gq *blkg = hlist_entry(blkcg->blkg_list.first, struct blkcg_gq, blkcg_node); @@ -1110,6 +1112,17 @@ static void blkcg_destroy_blkgs(struct blkcg *blkcg) } spin_unlock_irq(&blkcg->lock); + + /* + * Flush all the non-empty percpu lockless lists. + */ + for_each_possible_cpu(cpu) { + struct llist_head *lhead = per_cpu_ptr(blkcg->lhead, cpu); + + if (!llist_empty(lhead)) + cgroup_rstat_css_cpu_flush(&blkcg->css, cpu); + } + css_put(&blkcg->css); } /** diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 528bd44b59e2..6c4e66b3fa84 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -766,6 +766,7 @@ void cgroup_rstat_flush(struct cgroup *cgrp); void cgroup_rstat_flush_irqsafe(struct cgroup *cgrp); void cgroup_rstat_flush_hold(struct cgroup *cgrp); void cgroup_rstat_flush_release(void); +void cgroup_rstat_css_cpu_flush(struct cgroup_subsys_state *css, int cpu); /* * Basic resource stats. */ diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index 793ecff29038..910e633869b0 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -281,6 +281,26 @@ void cgroup_rstat_flush_release(void) spin_unlock_irq(&cgroup_rstat_lock); } +/** + * cgroup_rstat_css_cpu_flush - flush stats for the given css and cpu + * @css: target css to be flushed + * @cpu: the cpu that holds the stats to be flushed + * + * A lightweight rstat flush operation for a given css and cpu. + * Only the cpu_lock is being held for mutual exclusion, the cgroup_rstat_lock + * isn't used. + */ +void cgroup_rstat_css_cpu_flush(struct cgroup_subsys_state *css, int cpu) +{ + raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu); + + raw_spin_lock_irq(cpu_lock); + rcu_read_lock(); + css->ss->css_rstat_flush(css, cpu); + rcu_read_unlock(); + raw_spin_unlock_irq(cpu_lock); +} + int cgroup_rstat_init(struct cgroup *cgrp) { int cpu; -- cgit v1.2.3 From fce3caea0f241f5d34855c82c399d5e0e2d91f07 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 14 Nov 2022 05:29:42 +0100 Subject: blk-crypto: don't use struct request_queue for public interfaces Switch all public blk-crypto interfaces to use struct block_device arguments to specify the device they operate on instead of the request_queue, which is a block layer implementation detail.
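The effect on callers is uniform: every public entry point now takes the block_device. A hypothetical caller, sketched against the signatures in the diff below (error handling shortened; example_crypto_io is not a real kernel function):

	/* Assumes 'key' was already set up with blk_crypto_init_key(). */
	static int example_crypto_io(struct block_device *bdev,
				     struct blk_crypto_key *key)
	{
		int err;

		/* was: blk_crypto_start_using_key(key, bdev_get_queue(bdev)) */
		err = blk_crypto_start_using_key(bdev, key);
		if (err)
			return err;

		/* ... attach the key to bios with bio_crypt_set_ctx() and
		 * submit them as usual ... */

		/* was: blk_crypto_evict_key(bdev_get_queue(bdev), key) */
		return blk_crypto_evict_key(bdev, key);
	}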
Signed-off-by: Christoph Hellwig Reviewed-by: Eric Biggers Link: https://lore.kernel.org/r/20221114042944.1009870-2-hch@lst.de Signed-off-by: Jens Axboe --- Documentation/block/inline-encryption.rst | 12 ++++++------ block/blk-crypto.c | 24 ++++++++++++++---------- drivers/md/dm-table.c | 2 +- fs/crypto/inline_crypt.c | 8 +++----- include/linux/blk-crypto.h | 11 ++++------- 5 files changed, 28 insertions(+), 29 deletions(-) (limited to 'block') diff --git a/Documentation/block/inline-encryption.rst b/Documentation/block/inline-encryption.rst index 4d151fbe2058..f9bf18ea6509 100644 --- a/Documentation/block/inline-encryption.rst +++ b/Documentation/block/inline-encryption.rst @@ -142,7 +142,7 @@ Therefore, we also introduce *blk-crypto-fallback*, which is an implementation of inline encryption using the kernel crypto API. blk-crypto-fallback is built into the block layer, so it works on any block device without any special setup. Essentially, when a bio with an encryption context is submitted to a -request_queue that doesn't support that encryption context, the block layer will +block_device that doesn't support that encryption context, the block layer will handle en/decryption of the bio using blk-crypto-fallback. For encryption, the data cannot be encrypted in-place, as callers usually rely @@ -187,7 +187,7 @@ API presented to users of the block layer ``blk_crypto_config_supported()`` allows users to check ahead of time whether inline encryption with particular crypto settings will work on a particular -request_queue -- either via hardware or via blk-crypto-fallback. This function +block_device -- either via hardware or via blk-crypto-fallback. This function takes in a ``struct blk_crypto_config`` which is like blk_crypto_key, but omits the actual bytes of the key and instead just contains the algorithm, data unit size, etc. This function can be useful if blk-crypto-fallback is disabled. @@ -195,7 +195,7 @@ size, etc. This function can be useful if blk-crypto-fallback is disabled. ``blk_crypto_init_key()`` allows users to initialize a blk_crypto_key. Users must call ``blk_crypto_start_using_key()`` before actually starting to use -a blk_crypto_key on a request_queue (even if ``blk_crypto_config_supported()`` +a blk_crypto_key on a block_device (even if ``blk_crypto_config_supported()`` was called earlier). This is needed to initialize blk-crypto-fallback if it will be needed. This must not be called from the data path, as this may have to allocate resources, which may deadlock in that case. @@ -207,7 +207,7 @@ for en/decryption. Users don't need to worry about freeing the bio_crypt_ctx later, as that happens automatically when the bio is freed or reset. Finally, when done using inline encryption with a blk_crypto_key on a -request_queue, users must call ``blk_crypto_evict_key()``. This ensures that +block_device, users must call ``blk_crypto_evict_key()``. This ensures that the key is evicted from all keyslots it may be programmed into and unlinked from any kernel data structures it may be linked into. @@ -221,9 +221,9 @@ as follows: 5. ``blk_crypto_evict_key()`` (after all I/O has completed) 6. Zeroize the blk_crypto_key (this has no dedicated function) -If a blk_crypto_key is being used on multiple request_queues, then +If a blk_crypto_key is being used on multiple block_devices, then ``blk_crypto_config_supported()`` (if used), ``blk_crypto_start_using_key()``, -and ``blk_crypto_evict_key()`` must be called on each request_queue. 
+and ``blk_crypto_evict_key()`` must be called on each block_device. API presented to device drivers =============================== diff --git a/block/blk-crypto.c b/block/blk-crypto.c index a496aaef85ba..0047436b6337 100644 --- a/block/blk-crypto.c +++ b/block/blk-crypto.c @@ -354,20 +354,21 @@ int blk_crypto_init_key(struct blk_crypto_key *blk_key, const u8 *raw_key, /* * Check if bios with @cfg can be en/decrypted by blk-crypto (i.e. either the - * request queue it's submitted to supports inline crypto, or the + * block_device it's submitted to supports inline crypto, or the * blk-crypto-fallback is enabled and supports the cfg). */ -bool blk_crypto_config_supported(struct request_queue *q, +bool blk_crypto_config_supported(struct block_device *bdev, const struct blk_crypto_config *cfg) { return IS_ENABLED(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK) || - __blk_crypto_cfg_supported(q->crypto_profile, cfg); + __blk_crypto_cfg_supported(bdev_get_queue(bdev)->crypto_profile, + cfg); } /** * blk_crypto_start_using_key() - Start using a blk_crypto_key on a device + * @bdev: block device to operate on * @key: A key to use on the device - * @q: the request queue for the device * * Upper layers must call this function to ensure that either the hardware * supports the key's crypto settings, or the crypto API fallback has transforms @@ -379,10 +380,11 @@ bool blk_crypto_config_supported(struct request_queue *q, * blk-crypto-fallback is either disabled or the needed algorithm * is disabled in the crypto API; or another -errno code. */ -int blk_crypto_start_using_key(const struct blk_crypto_key *key, - struct request_queue *q) +int blk_crypto_start_using_key(struct block_device *bdev, + const struct blk_crypto_key *key) { - if (__blk_crypto_cfg_supported(q->crypto_profile, &key->crypto_cfg)) + if (__blk_crypto_cfg_supported(bdev_get_queue(bdev)->crypto_profile, + &key->crypto_cfg)) return 0; return blk_crypto_fallback_start_using_mode(key->crypto_cfg.crypto_mode); } @@ -390,7 +392,7 @@ int blk_crypto_start_using_key(const struct blk_crypto_key *key, /** * blk_crypto_evict_key() - Evict a key from any inline encryption hardware * it may have been programmed into - * @q: The request queue who's associated inline encryption hardware this key + * @bdev: The block_device who's associated inline encryption hardware this key * might have been programmed into * @key: The key to evict * @@ -400,14 +402,16 @@ int blk_crypto_start_using_key(const struct blk_crypto_key *key, * * Return: 0 on success or if the key wasn't in any keyslot; -errno on error. */ -int blk_crypto_evict_key(struct request_queue *q, +int blk_crypto_evict_key(struct block_device *bdev, const struct blk_crypto_key *key) { + struct request_queue *q = bdev_get_queue(bdev); + if (__blk_crypto_cfg_supported(q->crypto_profile, &key->crypto_cfg)) return __blk_crypto_evict_key(q->crypto_profile, key); /* - * If the request_queue didn't support the key, then blk-crypto-fallback + * If the block_device didn't support the key, then blk-crypto-fallback * may have been used, so try to evict the key from blk-crypto-fallback. 
*/ return blk_crypto_fallback_evict_key(key); diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 078da18bb86d..8541d5688f3a 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -1215,7 +1215,7 @@ static int dm_keyslot_evict_callback(struct dm_target *ti, struct dm_dev *dev, struct dm_keyslot_evict_args *args = data; int err; - err = blk_crypto_evict_key(bdev_get_queue(dev->bdev), args->key); + err = blk_crypto_evict_key(dev->bdev, args->key); if (!args->err) args->err = err; /* Always try to evict the key from all devices. */ diff --git a/fs/crypto/inline_crypt.c b/fs/crypto/inline_crypt.c index cea8b14007e6..55c4d8c23d30 100644 --- a/fs/crypto/inline_crypt.c +++ b/fs/crypto/inline_crypt.c @@ -139,8 +139,7 @@ int fscrypt_select_encryption_impl(struct fscrypt_info *ci) return PTR_ERR(devs); for (i = 0; i < num_devs; i++) { - if (!blk_crypto_config_supported(bdev_get_queue(devs[i]), - &crypto_cfg)) + if (!blk_crypto_config_supported(devs[i], &crypto_cfg)) goto out_free_devs; } @@ -184,8 +183,7 @@ int fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key, goto fail; } for (i = 0; i < num_devs; i++) { - err = blk_crypto_start_using_key(blk_key, - bdev_get_queue(devs[i])); + err = blk_crypto_start_using_key(devs[i], blk_key); if (err) break; } @@ -224,7 +222,7 @@ void fscrypt_destroy_inline_crypt_key(struct super_block *sb, devs = fscrypt_get_devices(sb, &num_devs); if (!IS_ERR(devs)) { for (i = 0; i < num_devs; i++) - blk_crypto_evict_key(bdev_get_queue(devs[i]), blk_key); + blk_crypto_evict_key(devs[i], blk_key); kfree(devs); } kfree_sensitive(blk_key); diff --git a/include/linux/blk-crypto.h b/include/linux/blk-crypto.h index 69b24fe92cbf..561ca92e204d 100644 --- a/include/linux/blk-crypto.h +++ b/include/linux/blk-crypto.h @@ -71,9 +71,6 @@ struct bio_crypt_ctx { #include #include -struct request; -struct request_queue; - #ifdef CONFIG_BLK_INLINE_ENCRYPTION static inline bool bio_has_crypt_ctx(struct bio *bio) @@ -94,13 +91,13 @@ int blk_crypto_init_key(struct blk_crypto_key *blk_key, const u8 *raw_key, unsigned int dun_bytes, unsigned int data_unit_size); -int blk_crypto_start_using_key(const struct blk_crypto_key *key, - struct request_queue *q); +int blk_crypto_start_using_key(struct block_device *bdev, + const struct blk_crypto_key *key); -int blk_crypto_evict_key(struct request_queue *q, +int blk_crypto_evict_key(struct block_device *bdev, const struct blk_crypto_key *key); -bool blk_crypto_config_supported(struct request_queue *q, +bool blk_crypto_config_supported(struct block_device *bdev, const struct blk_crypto_config *cfg); #else /* CONFIG_BLK_INLINE_ENCRYPTION */ -- cgit v1.2.3 From 6715c98b6cf003f26b1b2f655393134e9d999a05 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 14 Nov 2022 05:29:43 +0100 Subject: blk-crypto: add a blk_crypto_config_supported_natively helper Add a blk_crypto_config_supported_natively helper that wraps __blk_crypto_cfg_supported to retrieve the crypto_profile from the request queue. With this fscrypt can stop including blk-crypto-profile.h and rely on the public consumer interface in blk-crypto.h. 
Signed-off-by: Christoph Hellwig Reviewed-by: Eric Biggers Link: https://lore.kernel.org/r/20221114042944.1009870-3-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-crypto.c | 21 ++++++++++++--------- fs/crypto/inline_crypt.c | 6 ++---- include/linux/blk-crypto.h | 2 ++ 3 files changed, 16 insertions(+), 13 deletions(-) (limited to 'block') diff --git a/block/blk-crypto.c b/block/blk-crypto.c index 0047436b6337..6a461f4d676a 100644 --- a/block/blk-crypto.c +++ b/block/blk-crypto.c @@ -267,7 +267,6 @@ bool __blk_crypto_bio_prep(struct bio **bio_ptr) { struct bio *bio = *bio_ptr; const struct blk_crypto_key *bc_key = bio->bi_crypt_context->bc_key; - struct blk_crypto_profile *profile; /* Error if bio has no data. */ if (WARN_ON_ONCE(!bio_has_data(bio))) { @@ -284,10 +283,9 @@ bool __blk_crypto_bio_prep(struct bio **bio_ptr) * Success if device supports the encryption context, or if we succeeded * in falling back to the crypto API. */ - profile = bdev_get_queue(bio->bi_bdev)->crypto_profile; - if (__blk_crypto_cfg_supported(profile, &bc_key->crypto_cfg)) + if (blk_crypto_config_supported_natively(bio->bi_bdev, + &bc_key->crypto_cfg)) return true; - if (blk_crypto_fallback_bio_prep(bio_ptr)) return true; fail: @@ -352,6 +350,13 @@ int blk_crypto_init_key(struct blk_crypto_key *blk_key, const u8 *raw_key, return 0; } +bool blk_crypto_config_supported_natively(struct block_device *bdev, + const struct blk_crypto_config *cfg) +{ + return __blk_crypto_cfg_supported(bdev_get_queue(bdev)->crypto_profile, + cfg); +} + /* * Check if bios with @cfg can be en/decrypted by blk-crypto (i.e. either the * block_device it's submitted to supports inline crypto, or the @@ -361,8 +366,7 @@ bool blk_crypto_config_supported(struct block_device *bdev, const struct blk_crypto_config *cfg) { return IS_ENABLED(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK) || - __blk_crypto_cfg_supported(bdev_get_queue(bdev)->crypto_profile, - cfg); + blk_crypto_config_supported_natively(bdev, cfg); } /** @@ -383,8 +387,7 @@ bool blk_crypto_config_supported(struct block_device *bdev, int blk_crypto_start_using_key(struct block_device *bdev, const struct blk_crypto_key *key) { - if (__blk_crypto_cfg_supported(bdev_get_queue(bdev)->crypto_profile, - &key->crypto_cfg)) + if (blk_crypto_config_supported_natively(bdev, &key->crypto_cfg)) return 0; return blk_crypto_fallback_start_using_mode(key->crypto_cfg.crypto_mode); } @@ -407,7 +410,7 @@ int blk_crypto_evict_key(struct block_device *bdev, { struct request_queue *q = bdev_get_queue(bdev); - if (__blk_crypto_cfg_supported(q->crypto_profile, &key->crypto_cfg)) + if (blk_crypto_config_supported_natively(bdev, &key->crypto_cfg)) return __blk_crypto_evict_key(q->crypto_profile, key); /* diff --git a/fs/crypto/inline_crypt.c b/fs/crypto/inline_crypt.c index 55c4d8c23d30..8bfb3ce86476 100644 --- a/fs/crypto/inline_crypt.c +++ b/fs/crypto/inline_crypt.c @@ -12,7 +12,7 @@ * provides the key and IV to use. 
*/ -#include +#include #include #include @@ -77,10 +77,8 @@ static void fscrypt_log_blk_crypto_impl(struct fscrypt_mode *mode, unsigned int i; for (i = 0; i < num_devs; i++) { - struct request_queue *q = bdev_get_queue(devs[i]); - if (!IS_ENABLED(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK) || - __blk_crypto_cfg_supported(q->crypto_profile, cfg)) { + blk_crypto_config_supported_natively(devs[i], cfg)) { if (!xchg(&mode->logged_blk_crypto_native, 1)) pr_info("fscrypt: %s using blk-crypto (native)\n", mode->friendly_name); diff --git a/include/linux/blk-crypto.h b/include/linux/blk-crypto.h index 561ca92e204d..a33d32f5c268 100644 --- a/include/linux/blk-crypto.h +++ b/include/linux/blk-crypto.h @@ -97,6 +97,8 @@ int blk_crypto_start_using_key(struct block_device *bdev, int blk_crypto_evict_key(struct block_device *bdev, const struct blk_crypto_key *key); +bool blk_crypto_config_supported_natively(struct block_device *bdev, + const struct blk_crypto_config *cfg); bool blk_crypto_config_supported(struct block_device *bdev, const struct blk_crypto_config *cfg); -- cgit v1.2.3 From 3569788c08235c6f3e9e6ca724b2df44787ff487 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 14 Nov 2022 05:29:44 +0100 Subject: blk-crypto: move internal only declarations to blk-crypto-internal.h blk_crypto_get_keyslot, blk_crypto_put_keyslot, __blk_crypto_evict_key and __blk_crypto_cfg_supported are only used internally by the blk-crypto code, so move them out of blk-crypto-profile.h, which is included by drivers that supply blk-crypto functionality. Signed-off-by: Christoph Hellwig Reviewed-by: Eric Biggers Link: https://lore.kernel.org/r/20221114042944.1009870-4-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-crypto-internal.h | 12 ++++++++++++ include/linux/blk-crypto-profile.h | 12 ------------ 2 files changed, 12 insertions(+), 12 deletions(-) (limited to 'block') diff --git a/block/blk-crypto-internal.h b/block/blk-crypto-internal.h index e6818ffaddbf..d31fa80454e4 100644 --- a/block/blk-crypto-internal.h +++ b/block/blk-crypto-internal.h @@ -65,6 +65,18 @@ static inline bool blk_crypto_rq_is_encrypted(struct request *rq) return rq->crypt_ctx; } +blk_status_t blk_crypto_get_keyslot(struct blk_crypto_profile *profile, + const struct blk_crypto_key *key, + struct blk_crypto_keyslot **slot_ptr); + +void blk_crypto_put_keyslot(struct blk_crypto_keyslot *slot); + +int __blk_crypto_evict_key(struct blk_crypto_profile *profile, + const struct blk_crypto_key *key); + +bool __blk_crypto_cfg_supported(struct blk_crypto_profile *profile, + const struct blk_crypto_config *cfg); + #else /* CONFIG_BLK_INLINE_ENCRYPTION */ static inline int blk_crypto_sysfs_register(struct request_queue *q) diff --git a/include/linux/blk-crypto-profile.h b/include/linux/blk-crypto-profile.h index bbab65bd5428..e6802b69cdd6 100644 --- a/include/linux/blk-crypto-profile.h +++ b/include/linux/blk-crypto-profile.h @@ -138,18 +138,6 @@ int devm_blk_crypto_profile_init(struct device *dev, unsigned int blk_crypto_keyslot_index(struct blk_crypto_keyslot *slot); -blk_status_t blk_crypto_get_keyslot(struct blk_crypto_profile *profile, - const struct blk_crypto_key *key, - struct blk_crypto_keyslot **slot_ptr); - -void blk_crypto_put_keyslot(struct blk_crypto_keyslot *slot); - -bool __blk_crypto_cfg_supported(struct blk_crypto_profile *profile, - const struct blk_crypto_config *cfg); - -int __blk_crypto_evict_key(struct blk_crypto_profile *profile, - const struct blk_crypto_key *key); - void blk_crypto_reprogram_all_keys(struct
blk_crypto_profile *profile); void blk_crypto_profile_destroy(struct blk_crypto_profile *profile); -- cgit v1.2.3 From d4b2e0d433769cb7c87e4c93dc048733388d1c46 Mon Sep 17 00:00:00 2001 From: Shin'ichiro Kawasaki Date: Tue, 22 Nov 2022 17:49:17 +0900 Subject: block: fix missing nr_hw_queues update in blk_mq_realloc_tag_set_tags The commit ee9d55210c2f ("blk-mq: simplify blk_mq_realloc_tag_set_tags") cleaned up the function blk_mq_realloc_tag_set_tags. After this change, the function does not update nr_hw_queues of struct blk_mq_tag_set when the new nr_hw_queues value is smaller than the original. This results in failures when changing the number of hardware queues of block devices. To avoid these failures, add the missing nr_hw_queues update. Fixes: ee9d55210c2f ("blk-mq: simplify blk_mq_realloc_tag_set_tags") Reported-by: Chaitanya Kulkarni Signed-off-by: Shin'ichiro Kawasaki Link: https://lore.kernel.org/linux-block/20221118140640.featvt3fxktfquwh@shindev/ Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20221122084917.2034220-1-shinichiro.kawasaki@wdc.com Signed-off-by: Jens Axboe --- block/blk-mq.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index ee16b4c34c6a..cdd8efcec191 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -4381,7 +4381,7 @@ static int blk_mq_realloc_tag_set_tags(struct blk_mq_tag_set *set, struct blk_mq_tags **new_tags; if (set->nr_hw_queues >= new_nr_hw_queues) - return 0; + goto done; new_tags = kcalloc_node(new_nr_hw_queues, sizeof(struct blk_mq_tags *), GFP_KERNEL, set->numa_node); @@ -4393,8 +4393,8 @@ static int blk_mq_realloc_tag_set_tags(struct blk_mq_tag_set *set, sizeof(*set->tags)); kfree(set->tags); set->tags = new_tags; +done: set->nr_hw_queues = new_nr_hw_queues; - return 0; } -- cgit v1.2.3 From ac1171bd2c7a3a32dfbdd3c347919fee32b745a1 Mon Sep 17 00:00:00 2001 From: Jinlong Chen Date: Tue, 22 Nov 2022 22:21:23 +0800 Subject: elevator: update the document of elevator_switch We no longer support falling back to the old io scheduler if switching to the new one fails. Update the documentation to reflect that. Fixes: a1ce35fa4985 ("block: remove dead elevator code") Signed-off-by: Jinlong Chen Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/94250961689ba7d2e67a7d9e7995a11166fedb31.1669126766.git.nickyc975@zju.edu.cn Signed-off-by: Jens Axboe --- block/elevator.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'block') diff --git a/block/elevator.c b/block/elevator.c index a5bdc3b1e7e5..01aa9f38f22e 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -650,10 +650,10 @@ void elevator_init_mq(struct request_queue *q) } /* - * switch to new_e io scheduler. be careful not to introduce deadlocks - - * we don't free the old io scheduler, before we have allocated what we - * need for the new one. this way we have a chance of going back to the old - * one, if the new one fails init for some reason. + * Switch to new_e io scheduler. + * + * If switching fails, we are most likely running out of memory and not able + * to restore the old io scheduler, so the io scheduler is left set to none.
*/ int elevator_switch(struct request_queue *q, struct elevator_type *new_e) { -- cgit v1.2.3 From e0cca8bc9cd8d6176921cb3f5f466d3ccfbc6b99 Mon Sep 17 00:00:00 2001 From: Jinlong Chen Date: Tue, 22 Nov 2022 22:21:24 +0800 Subject: elevator: printk a warning if switching to a new io scheduler fails printk a warning to indicate that the io scheduler has been set to none if switching to a new io scheduler fails. Suggested-by: Christoph Hellwig Signed-off-by: Jinlong Chen Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/d51ed0fb457db7a4f9cbb0dbce36d534e22be457.1669126766.git.nickyc975@zju.edu.cn Signed-off-by: Jens Axboe --- block/elevator.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'block') diff --git a/block/elevator.c b/block/elevator.c index 01aa9f38f22e..1fa45717b1d6 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -683,6 +683,12 @@ int elevator_switch(struct request_queue *q, struct elevator_type *new_e) out_unfreeze: blk_mq_unquiesce_queue(q); blk_mq_unfreeze_queue(q); + + if (ret) { + pr_warn("elv: switch to \"%s\" failed, falling back to \"none\"\n", + new_e->elevator_name); + } + return ret; } -- cgit v1.2.3 From f69b5e8f356e4e57e94b806ca1dcb9771933bb9c Mon Sep 17 00:00:00 2001 From: Jinlong Chen Date: Tue, 22 Nov 2022 22:21:25 +0800 Subject: elevator: update the document of elevator_match elevator_match does not care about elevator_features any more. Remove related descriptions from its document. Fixes: ffb86425ee2c ("block: don't check for required features in elevator_match") Signed-off-by: Jinlong Chen Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/a58424555202c07a9ccf7f60c3ad7e247da09e25.1669126766.git.nickyc975@zju.edu.cn Signed-off-by: Jens Axboe --- block/elevator.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/elevator.c b/block/elevator.c index 1fa45717b1d6..5287b39cd7a9 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -91,12 +91,11 @@ static inline bool elv_support_features(struct request_queue *q, } /** - * elevator_match - Test an elevator name and features + * elevator_match - Check whether @e's name or alias matches @name * @e: Scheduler to test * @name: Elevator name to test * - * Return true if the elevator @e name matches @name and if @e provides all - * the features specified by @required_features. + * Return true if the elevator @e's name or alias matches @name. */ static bool elevator_match(const struct elevator_type *e, const char *name) { -- cgit v1.2.3 From 4284354758d67cf77ab2a4494e28d4c05fb83074 Mon Sep 17 00:00:00 2001 From: Jinlong Chen Date: Tue, 22 Nov 2022 22:21:26 +0800 Subject: elevator: remove an outdated comment in elevator_change mq is no longer a special case. 
Signed-off-by: Jinlong Chen Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/cbf47824fc726440371e74c867bf635ae1b671a3.1669126766.git.nickyc975@zju.edu.cn Signed-off-by: Jens Axboe --- block/elevator.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'block') diff --git a/block/elevator.c b/block/elevator.c index 5287b39cd7a9..599413620558 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -721,9 +721,6 @@ static int elevator_change(struct request_queue *q, const char *elevator_name) if (!blk_queue_registered(q)) return -ENOENT; - /* - * Special case for mq, turn off scheduling - */ if (!strncmp(elevator_name, "none", 4)) { if (q->elevator) elevator_disable(q); -- cgit v1.2.3 From 85168d416e5d3184b77dbec8fee75c9439894afa Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 23 Nov 2022 09:29:23 -0800 Subject: blk-crypto: Add a missing include directive MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Allow the compiler to verify consistency of function declarations and function definitions. This patch fixes the following sparse errors: block/blk-crypto-profile.c:241:14: error: no previous prototype for ‘blk_crypto_get_keyslot’ [-Werror=missing-prototypes] 241 | blk_status_t blk_crypto_get_keyslot(struct blk_crypto_profile *profile, | ^~~~~~~~~~~~~~~~~~~~~~ block/blk-crypto-profile.c:318:6: error: no previous prototype for ‘blk_crypto_put_keyslot’ [-Werror=missing-prototypes] 318 | void blk_crypto_put_keyslot(struct blk_crypto_keyslot *slot) | ^~~~~~~~~~~~~~~~~~~~~~ block/blk-crypto-profile.c:344:6: error: no previous prototype for ‘__blk_crypto_cfg_supported’ [-Werror=missing-prototypes] 344 | bool __blk_crypto_cfg_supported(struct blk_crypto_profile *profile, | ^~~~~~~~~~~~~~~~~~~~~~~~~~ block/blk-crypto-profile.c:373:5: error: no previous prototype for ‘__blk_crypto_evict_key’ [-Werror=missing-prototypes] 373 | int __blk_crypto_evict_key(struct blk_crypto_profile *profile, | ^~~~~~~~~~~~~~~~~~~~~~ Cc: Eric Biggers Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20221123172923.434339-1-bvanassche@acm.org Signed-off-by: Jens Axboe --- block/blk-crypto-profile.c | 1 + 1 file changed, 1 insertion(+) (limited to 'block') diff --git a/block/blk-crypto-profile.c b/block/blk-crypto-profile.c index 96c511967386..0307fb0d95d3 100644 --- a/block/blk-crypto-profile.c +++ b/block/blk-crypto-profile.c @@ -32,6 +32,7 @@ #include #include #include +#include "blk-crypto-internal.h" struct blk_crypto_keyslot { atomic_t slot_refs; -- cgit v1.2.3 From 2820e5d0820ac4daedff1272616a53d9c7682fd2 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 24 Nov 2022 11:12:07 +0900 Subject: block: mq-deadline: Fix dd_finish_request() for zoned devices dd_finish_request() tests if the per-prio fifo_list is not empty to determine if request dispatching must be restarted for handling blocked write requests to zoned devices with a call to blk_mq_sched_mark_restart_hctx(). While simple, this implementation has 2 problems: 1) Only the priority level of the completed request is considered. However, writes to a zone may be blocked due to other writes to the same zone using a different priority level. While this is unlikely to happen in practice, as writing a zone with different IO priorities does not make sense, nothing in the code prevents this from happening. 2) The use of list_empty() is dangerous as dd_finish_request() does not take dd->lock and may run concurrently with the insert and dispatch code.
Fix these 2 problems by testing the write fifo list of all priority levels using the new helper dd_has_write_work(), and by testing each fifo list using list_empty_careful(). Fixes: c807ab520fc3 ("block/mq-deadline: Add I/O priority support") Cc: Signed-off-by: Damien Le Moal Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20221124021208.242541-2-damien.lemoal@opensource.wdc.com Signed-off-by: Jens Axboe --- block/mq-deadline.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/mq-deadline.c b/block/mq-deadline.c index 5639921dfa92..36374481cb87 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -789,6 +789,18 @@ static void dd_prepare_request(struct request *rq) rq->elv.priv[0] = NULL; } +static bool dd_has_write_work(struct blk_mq_hw_ctx *hctx) +{ + struct deadline_data *dd = hctx->queue->elevator->elevator_data; + enum dd_prio p; + + for (p = 0; p <= DD_PRIO_MAX; p++) + if (!list_empty_careful(&dd->per_prio[p].fifo_list[DD_WRITE])) + return true; + + return false; +} + /* * Callback from inside blk_mq_free_request(). * @@ -828,9 +840,10 @@ static void dd_finish_request(struct request *rq) spin_lock_irqsave(&dd->zone_lock, flags); blk_req_zone_write_unlock(rq); - if (!list_empty(&per_prio->fifo_list[DD_WRITE])) - blk_mq_sched_mark_restart_hctx(rq->mq_hctx); spin_unlock_irqrestore(&dd->zone_lock, flags); + + if (dd_has_write_work(rq->mq_hctx)) + blk_mq_sched_mark_restart_hctx(rq->mq_hctx); } } -- cgit v1.2.3 From 015d02f48537cf2d1a65eeac50717566f9db6eec Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 24 Nov 2022 11:12:08 +0900 Subject: block: mq-deadline: Do not break sequential write streams to zoned HDDs mq-deadline ensures in-order dispatching of write requests to zoned block devices using a per-zone lock (a bit). This implies that for any purely sequential write workload, the drive is exercised most of the time at a maximum queue depth of one. However, when such a sequential write workload crosses a zone boundary (when sequentially writing multiple contiguous zones), zone write locking may prevent the last write to one zone from being issued (as the previous write is still being executed) but allow the first write to the following zone to be issued (as that zone is not yet being written and not locked). This results in an out-of-order delivery of the sequential write commands to the device every time a zone boundary is crossed. While such behavior does not break the sequential write constraint of zoned block devices (and does not generate any write error), some zoned hard-disks react badly to seeing these out of order writes, resulting in lower write throughput. This problem can be addressed by always dispatching the first request of a stream of sequential write requests, regardless of the zones targeted by these sequential writes. To do so, the function deadline_skip_seq_writes() is introduced and used in deadline_next_request() to select the next write command to issue if the target device is an HDD (blk_queue_nonrot() being false). deadline_fifo_request() is modified using the new deadline_earlier_request() and deadline_is_seq_writes() helpers to ignore requests in the fifo list that have a preceding request in lba order that is sequential.
With this fix, a sequential write workload executed with the following fio command: fio --name=seq-write --filename=/dev/sda --zonemode=zbd --direct=1 \ --size=68719476736 --ioengine=libaio --iodepth=32 --rw=write \ --bs=65536 results in an increase from 225 MB/s to 250 MB/s of the write throughput of an SMR HDD (11% increase). Cc: Signed-off-by: Damien Le Moal Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20221124021208.242541-3-damien.lemoal@opensource.wdc.com Signed-off-by: Jens Axboe --- block/mq-deadline.c | 66 +++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 62 insertions(+), 4 deletions(-) (limited to 'block') diff --git a/block/mq-deadline.c b/block/mq-deadline.c index 36374481cb87..6672f1bce379 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -130,6 +130,20 @@ static u8 dd_rq_ioclass(struct request *rq) return IOPRIO_PRIO_CLASS(req_get_ioprio(rq)); } +/* + * get the request before `rq' in sector-sorted order + */ +static inline struct request * +deadline_earlier_request(struct request *rq) +{ + struct rb_node *node = rb_prev(&rq->rb_node); + + if (node) + return rb_entry_rq(node); + + return NULL; +} + /* * get the request after `rq' in sector-sorted order */ @@ -277,6 +291,39 @@ static inline int deadline_check_fifo(struct dd_per_prio *per_prio, return 0; } +/* + * Check if rq has a sequential request preceding it. + */ +static bool deadline_is_seq_writes(struct deadline_data *dd, struct request *rq) +{ + struct request *prev = deadline_earlier_request(rq); + + if (!prev) + return false; + + return blk_rq_pos(prev) + blk_rq_sectors(prev) == blk_rq_pos(rq); +} + +/* + * Skip all write requests that are sequential from @rq, even if we cross + * a zone boundary. + */ +static struct request *deadline_skip_seq_writes(struct deadline_data *dd, + struct request *rq) +{ + sector_t pos = blk_rq_pos(rq); + sector_t skipped_sectors = 0; + + while (rq) { + if (blk_rq_pos(rq) != pos + skipped_sectors) + break; + skipped_sectors += blk_rq_sectors(rq); + rq = deadline_latter_request(rq); + } + + return rq; +} + /* * For the specified data direction, return the next request to * dispatch using arrival ordered lists. @@ -297,11 +344,16 @@ deadline_fifo_request(struct deadline_data *dd, struct dd_per_prio *per_prio, /* * Look for a write request that can be dispatched, that is one with - * an unlocked target zone. + * an unlocked target zone. For some HDDs, breaking a sequential + * write stream can lead to lower throughput, so make sure to preserve + * sequential write streams, even if that stream crosses into the next + * zones and these zones are unlocked. */ spin_lock_irqsave(&dd->zone_lock, flags); list_for_each_entry(rq, &per_prio->fifo_list[DD_WRITE], queuelist) { - if (blk_req_can_dispatch_to_zone(rq)) + if (blk_req_can_dispatch_to_zone(rq) && + (blk_queue_nonrot(rq->q) || + !deadline_is_seq_writes(dd, rq))) goto out; } rq = NULL; @@ -331,13 +383,19 @@ deadline_next_request(struct deadline_data *dd, struct dd_per_prio *per_prio, /* * Look for a write request that can be dispatched, that is one with - * an unlocked target zone. + * an unlocked target zone. For some HDDs, breaking a sequential + * write stream can lead to lower throughput, so make sure to preserve + * sequential write streams, even if that stream crosses into the next + * zones and these zones are unlocked. 
*/ spin_lock_irqsave(&dd->zone_lock, flags); while (rq) { if (blk_req_can_dispatch_to_zone(rq)) break; - rq = deadline_latter_request(rq); + if (blk_queue_nonrot(rq->q)) + rq = deadline_latter_request(rq); + else + rq = deadline_skip_seq_writes(dd, rq); } spin_unlock_irqrestore(&dd->zone_lock, flags); -- cgit v1.2.3 From 90b0296ece4bd8f70419f0addc1171be6feee195 Mon Sep 17 00:00:00 2001 From: Ye Bin Date: Mon, 7 Nov 2022 11:39:56 +0800 Subject: block: fix crash in 'blk_mq_elv_switch_none' Syzbot found the following issue: general protection fault, probably for non-canonical address 0xdffffc000000001d: 0000 [#1] PREEMPT SMP KASAN KASAN: null-ptr-deref in range [0x00000000000000e8-0x00000000000000ef] CPU: 0 PID: 5234 Comm: syz-executor931 Not tainted 6.1.0-rc3-next-20221102-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 10/11/2022 RIP: 0010:__elevator_get block/elevator.h:94 [inline] RIP: 0010:blk_mq_elv_switch_none block/blk-mq.c:4593 [inline] RIP: 0010:__blk_mq_update_nr_hw_queues block/blk-mq.c:4658 [inline] RIP: 0010:blk_mq_update_nr_hw_queues+0x304/0xe40 block/blk-mq.c:4709 RSP: 0018:ffffc90003cdfc08 EFLAGS: 00010206 RAX: 0000000000000000 RBX: dffffc0000000000 RCX: 0000000000000000 RDX: 000000000000001d RSI: 0000000000000002 RDI: 00000000000000e8 RBP: ffff88801dbd0000 R08: ffff888027c89398 R09: ffffffff8de2e517 R10: fffffbfff1bc5ca2 R11: 0000000000000000 R12: ffffc90003cdfc70 R13: ffff88801dbd0008 R14: ffff88801dbd03f8 R15: ffff888027c89380 FS: 0000555557259300(0000) GS:ffff8880b9a00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00000000005d84c8 CR3: 000000007a7cb000 CR4: 00000000003506f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: nbd_start_device+0x153/0xc30 drivers/block/nbd.c:1355 nbd_start_device_ioctl drivers/block/nbd.c:1405 [inline] __nbd_ioctl drivers/block/nbd.c:1481 [inline] nbd_ioctl+0x5a1/0xbd0 drivers/block/nbd.c:1521 blkdev_ioctl+0x36e/0x800 block/ioctl.c:614 vfs_ioctl fs/ioctl.c:51 [inline] __do_sys_ioctl fs/ioctl.c:870 [inline] __se_sys_ioctl fs/ioctl.c:856 [inline] __x64_sys_ioctl+0x193/0x200 fs/ioctl.c:856 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd Commit dd6f7f17bf58 moved '__elevator_get(qe->type)' before 'qe->type' is set, so the reference helper dereferences a wild pointer. To solve the issue, set 'qe->type' first and only then take the module reference with '__elevator_get(qe->type)'.
Reported-by: syzbot+746a4eece09f86bc39d7@syzkaller.appspotmail.com Fixes: dd6f7f17bf58 ("block: add proper helpers for elevator_type module refcount management") Signed-off-by: Ye Bin Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20221107033956.3276891-1-yebin@huaweicloud.com Signed-off-by: Jens Axboe --- block/blk-mq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index cdd8efcec191..4e6b3ccd4989 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -4630,9 +4630,9 @@ static bool blk_mq_elv_switch_none(struct list_head *head, INIT_LIST_HEAD(&qe->node); qe->q = q; + qe->type = q->elevator->type; /* keep a reference to the elevator module as we'll switch back */ __elevator_get(qe->type); - qe->type = q->elevator->type; list_add(&qe->node, head); elevator_disable(q); mutex_unlock(&q->sysfs_lock); -- cgit v1.2.3 From 4b7a21c57b14fbcd0e1729150189e5933f5088e9 Mon Sep 17 00:00:00 2001 From: Ye Bin Date: Thu, 17 Nov 2022 10:29:40 +0800 Subject: blk-mq: fix possible memleak when register 'hctx' failed The following issue shows up during fault injection testing: unreferenced object 0xffff888132a9f400 (size 512): comm "insmod", pid 308021, jiffies 4324277909 (age 509.733s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 08 f4 a9 32 81 88 ff ff ...........2.... 08 f4 a9 32 81 88 ff ff 00 00 00 00 00 00 00 00 ...2............ backtrace: [<00000000e8952bb4>] kmalloc_node_trace+0x22/0xa0 [<00000000f9980e0f>] blk_mq_alloc_and_init_hctx+0x3f1/0x7e0 [<000000002e719efa>] blk_mq_realloc_hw_ctxs+0x1e6/0x230 [<000000004f1fda40>] blk_mq_init_allocated_queue+0x27e/0x910 [<00000000287123ec>] __blk_mq_alloc_disk+0x67/0xf0 [<00000000a2a34657>] 0xffffffffa2ad310f [<00000000b173f718>] 0xffffffffa2af824a [<0000000095a1dabb>] do_one_initcall+0x87/0x2a0 [<00000000f32fdf93>] do_init_module+0xdf/0x320 [<00000000cbe8541e>] load_module+0x3006/0x3390 [<0000000069ed1bdb>] __do_sys_finit_module+0x113/0x1b0 [<00000000a1a29ae8>] do_syscall_64+0x35/0x80 [<000000009cd878b0>] entry_SYSCALL_64_after_hwframe+0x46/0xb0 The fault injection context is as follows: kobject_add blk_mq_register_hctx blk_mq_sysfs_register blk_register_queue device_add_disk null_add_dev.part.0 [null_blk] 'blk_mq_register_hctx' may have already added some objects when it fails halfway, but it does not unwind them, so the caller cannot know which objects were added successfully. To solve the issue, unwind the objects that were already added when 'blk_mq_register_hctx' fails halfway.
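The fix is an instance of the usual unwind idiom for partial registration failures; reduced to its shape (register_one/unregister_one are placeholders, not kernel functions):

	int i, ret;

	for (i = 0; i < n; i++) {
		ret = register_one(i);		/* e.g. kobject_add() */
		if (ret)
			goto unwind;
	}
	return 0;

unwind:
	while (i-- > 0)
		unregister_one(i);		/* e.g. kobject_del() */
	return ret;

The diff below implements the same idea with hctx_for_each_ctx(), deleting the ctx kobjects that were added before the failing one and then the hctx kobject itself.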
Signed-off-by: Ye Bin Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20221117022940.873959-1-yebin@huaweicloud.com Signed-off-by: Jens Axboe --- block/blk-mq-sysfs.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c index 93997d297d42..4515288fbe35 100644 --- a/block/blk-mq-sysfs.c +++ b/block/blk-mq-sysfs.c @@ -185,7 +185,7 @@ static int blk_mq_register_hctx(struct blk_mq_hw_ctx *hctx) { struct request_queue *q = hctx->queue; struct blk_mq_ctx *ctx; - int i, ret; + int i, j, ret; if (!hctx->nr_ctx) return 0; @@ -197,9 +197,16 @@ static int blk_mq_register_hctx(struct blk_mq_hw_ctx *hctx) hctx_for_each_ctx(hctx, ctx, i) { ret = kobject_add(&ctx->kobj, &hctx->kobj, "cpu%u", ctx->cpu); if (ret) - break; + goto out; } + return 0; +out: + hctx_for_each_ctx(hctx, ctx, j) { + if (j < i) + kobject_del(&ctx->kobj); + } + kobject_del(&hctx->kobj); return ret; } -- cgit v1.2.3 From 3692fec8bb476e8583e559ff5783a6adef306cf2 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Sat, 26 Nov 2022 11:55:49 +0900 Subject: block: mq-deadline: Rename deadline_is_seq_writes() Rename deadline_is_seq_writes() to deadline_is_seq_write() (remove the "s" plural) to more correctly reflect the fact that this function tests a single request, not multiple requests. Fixes: 015d02f48537 ("block: mq-deadline: Do not break sequential write streams to zoned HDDs") Signed-off-by: Damien Le Moal Link: https://lore.kernel.org/r/20221126025550.967914-2-damien.lemoal@opensource.wdc.com Signed-off-by: Jens Axboe --- block/mq-deadline.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/mq-deadline.c b/block/mq-deadline.c index 6672f1bce379..f10c2a0d18d4 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -294,7 +294,7 @@ static inline int deadline_check_fifo(struct dd_per_prio *per_prio, /* * Check if rq has a sequential request preceding it. */ -static bool deadline_is_seq_writes(struct deadline_data *dd, struct request *rq) +static bool deadline_is_seq_write(struct deadline_data *dd, struct request *rq) { struct request *prev = deadline_earlier_request(rq); @@ -353,7 +353,7 @@ deadline_fifo_request(struct deadline_data *dd, struct dd_per_prio *per_prio, list_for_each_entry(rq, &per_prio->fifo_list[DD_WRITE], queuelist) { if (blk_req_can_dispatch_to_zone(rq) && (blk_queue_nonrot(rq->q) || - !deadline_is_seq_writes(dd, rq))) + !deadline_is_seq_write(dd, rq))) goto out; } rq = NULL; -- cgit v1.2.3 From 7919d679ae09c0dc30dfecb7cbc02306cf95cdd7 Mon Sep 17 00:00:00 2001 From: Jinlong Chen Date: Tue, 29 Nov 2022 23:46:34 +0800 Subject: block: include 'none' for initial elv_iosched_show call This makes the printing order of the io schedulers consistent, and removes a redundant q->elevator check. 
Signed-off-by: Jinlong Chen Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/bdd7083ed4f232e3285f39081e3c5f30b20b8da2.1669736350.git.nickyc975@zju.edu.cn Signed-off-by: Jens Axboe --- block/elevator.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'block') diff --git a/block/elevator.c b/block/elevator.c index 599413620558..308bee253564 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -767,10 +767,12 @@ ssize_t elv_iosched_show(struct request_queue *q, char *name) if (!elv_support_iosched(q)) return sprintf(name, "none\n"); - if (!q->elevator) + if (!q->elevator) { len += sprintf(name+len, "[none] "); - else + } else { + len += sprintf(name+len, "none "); cur = eq->type; + } spin_lock(&elv_list_lock); list_for_each_entry(e, &elv_list, list) { @@ -783,9 +785,6 @@ ssize_t elv_iosched_show(struct request_queue *q, char *name) } spin_unlock(&elv_list_lock); - if (q->elevator) - len += sprintf(name+len, "none"); - len += sprintf(len+name, "\n"); return len; } -- cgit v1.2.3 From 5998249e3238428156b09911f1606b41113443c5 Mon Sep 17 00:00:00 2001 From: Jinlong Chen Date: Tue, 29 Nov 2022 23:46:35 +0800 Subject: block: replace continue with else-if in elv_iosched_show else-if is more readable than continue here. Signed-off-by: Jinlong Chen Link: https://lore.kernel.org/r/77ac19ba556efd2c8639a6396eb4203c59bc13d6.1669736350.git.nickyc975@zju.edu.cn Signed-off-by: Jens Axboe --- block/elevator.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'block') diff --git a/block/elevator.c b/block/elevator.c index 308bee253564..ffa750976d25 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -776,11 +776,9 @@ ssize_t elv_iosched_show(struct request_queue *q, char *name) spin_lock(&elv_list_lock); list_for_each_entry(e, &elv_list, list) { - if (e == cur) { + if (e == cur) len += sprintf(name+len, "[%s] ", cur->elevator_name); - continue; - } - if (elv_support_features(q, e)) + else if (elv_support_features(q, e)) len += sprintf(name+len, "%s ", e->elevator_name); } spin_unlock(&elv_list_lock); -- cgit v1.2.3 From 7a3b3660fd30c028e7ae1cd82697933789962406 Mon Sep 17 00:00:00 2001 From: Jinlong Chen Date: Tue, 29 Nov 2022 23:46:36 +0800 Subject: block: always use 'e' when printing scheduler name Printing e->elevator_name in all cases improves the readability, and 'e' and 'cur' are identical in this branch. Suggested-by: Christoph Hellwig Signed-off-by: Jinlong Chen Link: https://lore.kernel.org/r/4bae180ffbac608ea0cf46ffa9739ce0973b60aa.1669736350.git.nickyc975@zju.edu.cn Signed-off-by: Jens Axboe --- block/elevator.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/elevator.c b/block/elevator.c index ffa750976d25..b2f2252f29e7 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -777,7 +777,7 @@ ssize_t elv_iosched_show(struct request_queue *q, char *name) spin_lock(&elv_list_lock); list_for_each_entry(e, &elv_list, list) { if (e == cur) - len += sprintf(name+len, "[%s] ", cur->elevator_name); + len += sprintf(name+len, "[%s] ", e->elevator_name); else if (elv_support_features(q, e)) len += sprintf(name+len, "%s ", e->elevator_name); } -- cgit v1.2.3 From c6451ede406b9f57fcd61d48433a6b8b2be862e3 Mon Sep 17 00:00:00 2001 From: Jinlong Chen Date: Tue, 29 Nov 2022 23:46:37 +0800 Subject: block: replace "len+name" with "name+len" in elv_iosched_show The "pointer + offset" pattern is more reasonable.
Signed-off-by: Jinlong Chen Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/d9beaee71b14f7b2a39ab0db6458dc0f7d961ceb.1669736350.git.nickyc975@zju.edu.cn Signed-off-by: Jens Axboe --- block/elevator.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/elevator.c b/block/elevator.c index b2f2252f29e7..8a5c171306f1 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -783,7 +783,7 @@ ssize_t elv_iosched_show(struct request_queue *q, char *name) } spin_unlock(&elv_list_lock); - len += sprintf(len+name, "\n"); + len += sprintf(name+len, "\n"); return len; } -- cgit v1.2.3 From 8d283ee62b077968e218531b24260e1cc51bd484 Mon Sep 17 00:00:00 2001 From: Jinlong Chen Date: Tue, 29 Nov 2022 23:46:38 +0800 Subject: block: use bool as the return type of elv_iosched_allow_bio_merge We have bool type now, update the old signature. Signed-off-by: Jinlong Chen Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/0db0a0298758d60d0f4df8b7126ac6a381e5a5bb.1669736350.git.nickyc975@zju.edu.cn Signed-off-by: Jens Axboe --- block/elevator.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/elevator.c b/block/elevator.c index 8a5c171306f1..14e03632b5b5 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -57,7 +57,7 @@ static LIST_HEAD(elv_list); * Query io scheduler to see if the current process issuing bio may be * merged with rq. */ -static int elv_iosched_allow_bio_merge(struct request *rq, struct bio *bio) +static bool elv_iosched_allow_bio_merge(struct request *rq, struct bio *bio) { struct request_queue *q = rq->q; struct elevator_queue *e = q->elevator; @@ -65,7 +65,7 @@ static int elv_iosched_allow_bio_merge(struct request *rq, struct bio *bio) if (e->type->ops.allow_merge) return e->type->ops.allow_merge(q, rq, bio); - return 1; + return true; } /* -- cgit v1.2.3 From c62256dda37133a48d56cecc15e4a4d527d4cc46 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 30 Nov 2022 08:25:46 -0700 Subject: Revert "blk-cgroup: Flush stats at blkgs destruction path" This reverts commit dae590a6c96c799434e0ff8156ef29b88c257e60. We've had a few reports on this causing a crash at boot time, because of a reference issue. While this problem seemingly did exist before the patch and needs solving separately, this patch makes it a lot easier to trigger. Link: https://lore.kernel.org/linux-block/CA+QYu4oxiRKC6hJ7F27whXy-PRBx=Tvb+-7TQTONN8qTtV3aDA@mail.gmail.com/ Link: https://lore.kernel.org/linux-block/69af7ccb-6901-c84c-0e95-5682ccfb750c@acm.org/ Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 15 +-------------- include/linux/cgroup.h | 1 - kernel/cgroup/rstat.c | 20 -------------------- 3 files changed, 1 insertion(+), 35 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 57941d2a8ba3..3e03c0d13253 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1084,12 +1084,10 @@ struct list_head *blkcg_get_cgwb_list(struct cgroup_subsys_state *css) */ static void blkcg_destroy_blkgs(struct blkcg *blkcg) { - int cpu; - might_sleep(); - css_get(&blkcg->css); spin_lock_irq(&blkcg->lock); + while (!hlist_empty(&blkcg->blkg_list)) { struct blkcg_gq *blkg = hlist_entry(blkcg->blkg_list.first, struct blkcg_gq, blkcg_node); @@ -1112,17 +1110,6 @@ static void blkcg_destroy_blkgs(struct blkcg *blkcg) } spin_unlock_irq(&blkcg->lock); - - /* - * Flush all the non-empty percpu lockless lists.
- */ - for_each_possible_cpu(cpu) { - struct llist_head *lhead = per_cpu_ptr(blkcg->lhead, cpu); - - if (!llist_empty(lhead)) - cgroup_rstat_css_cpu_flush(&blkcg->css, cpu); - } - css_put(&blkcg->css); } /** diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 6c4e66b3fa84..528bd44b59e2 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -766,7 +766,6 @@ void cgroup_rstat_flush(struct cgroup *cgrp); void cgroup_rstat_flush_irqsafe(struct cgroup *cgrp); void cgroup_rstat_flush_hold(struct cgroup *cgrp); void cgroup_rstat_flush_release(void); -void cgroup_rstat_css_cpu_flush(struct cgroup_subsys_state *css, int cpu); /* * Basic resource stats. diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index 910e633869b0..793ecff29038 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -281,26 +281,6 @@ void cgroup_rstat_flush_release(void) spin_unlock_irq(&cgroup_rstat_lock); } -/** - * cgroup_rstat_css_cpu_flush - flush stats for the given css and cpu - * @css: target css to be flush - * @cpu: the cpu that holds the stats to be flush - * - * A lightweight rstat flush operation for a given css and cpu. - * Only the cpu_lock is being held for mutual exclusion, the cgroup_rstat_lock - * isn't used. - */ -void cgroup_rstat_css_cpu_flush(struct cgroup_subsys_state *css, int cpu) -{ - raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu); - - raw_spin_lock_irq(cpu_lock); - rcu_read_lock(); - css->ss->css_rstat_flush(css, cpu); - rcu_read_unlock(); - raw_spin_unlock_irq(cpu_lock); -} - int cgroup_rstat_init(struct cgroup *cgrp) { int cpu; -- cgit v1.2.3 From 450deb93df7d457cdd93594a1987f9650c749b96 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 14 Nov 2022 05:26:33 +0100 Subject: blk-crypto: pass a gendisk to blk_crypto_sysfs_{,un}register Prepare for changes to the block layer sysfs handling by passing the readily available gendisk to blk_crypto_sysfs_{,un}register. 
Signed-off-by: Christoph Hellwig Reviewed-by: Eric Biggers Link: https://lore.kernel.org/r/20221114042637.1009333-2-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-crypto-internal.h | 10 ++++++---- block/blk-crypto-sysfs.c | 7 ++++--- block/blk-sysfs.c | 4 ++-- 3 files changed, 12 insertions(+), 9 deletions(-) (limited to 'block') diff --git a/block/blk-crypto-internal.h b/block/blk-crypto-internal.h index d31fa80454e4..a8cdaf26851e 100644 --- a/block/blk-crypto-internal.h +++ b/block/blk-crypto-internal.h @@ -21,9 +21,9 @@ extern const struct blk_crypto_mode blk_crypto_modes[]; #ifdef CONFIG_BLK_INLINE_ENCRYPTION -int blk_crypto_sysfs_register(struct request_queue *q); +int blk_crypto_sysfs_register(struct gendisk *disk); -void blk_crypto_sysfs_unregister(struct request_queue *q); +void blk_crypto_sysfs_unregister(struct gendisk *disk); void bio_crypt_dun_increment(u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE], unsigned int inc); @@ -79,12 +79,14 @@ bool __blk_crypto_cfg_supported(struct blk_crypto_profile *profile, #else /* CONFIG_BLK_INLINE_ENCRYPTION */ -static inline int blk_crypto_sysfs_register(struct request_queue *q) +static inline int blk_crypto_sysfs_register(struct gendisk *disk) { return 0; } -static inline void blk_crypto_sysfs_unregister(struct request_queue *q) { } +static inline void blk_crypto_sysfs_unregister(struct gendisk *disk) +{ +} static inline bool bio_crypt_rq_ctx_compatible(struct request *rq, struct bio *bio) diff --git a/block/blk-crypto-sysfs.c b/block/blk-crypto-sysfs.c index fd93bd2f33b7..e05f145cd797 100644 --- a/block/blk-crypto-sysfs.c +++ b/block/blk-crypto-sysfs.c @@ -126,8 +126,9 @@ static struct kobj_type blk_crypto_ktype = { * If the request_queue has a blk_crypto_profile, create the "crypto" * subdirectory in sysfs (/sys/block/$disk/queue/crypto/). */ -int blk_crypto_sysfs_register(struct request_queue *q) +int blk_crypto_sysfs_register(struct gendisk *disk) { + struct request_queue *q = disk->queue; struct blk_crypto_kobj *obj; int err; @@ -149,9 +150,9 @@ int blk_crypto_sysfs_register(struct request_queue *q) return 0; } -void blk_crypto_sysfs_unregister(struct request_queue *q) +void blk_crypto_sysfs_unregister(struct gendisk *disk) { - kobject_put(q->crypto_kobject); + kobject_put(disk->queue->crypto_kobject); } static int __init blk_crypto_sysfs_init(void) diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 02e94c4beff1..bd223a3bef47 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -836,7 +836,7 @@ int blk_register_queue(struct gendisk *disk) goto put_dev; } - ret = blk_crypto_sysfs_register(q); + ret = blk_crypto_sysfs_register(disk); if (ret) goto put_dev; @@ -913,7 +913,7 @@ void blk_unregister_queue(struct gendisk *disk) */ if (queue_is_mq(q)) blk_mq_sysfs_unregister(disk); - blk_crypto_sysfs_unregister(q); + blk_crypto_sysfs_unregister(disk); mutex_lock(&q->sysfs_lock); elv_unregister_queue(q); -- cgit v1.2.3 From 6fc75f309d291d328b4ea2f91bef0ff56e4bc7c2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 14 Nov 2022 05:26:34 +0100 Subject: block: factor out a blk_debugfs_remove helper Split the debugfs removal from blk_unregister_queue into a helper so that it can be reused for blk_register_queue error handling.
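The point of the split, sketched with hypothetical names (not the actual block layer code): once teardown lives in one helper, the registration error path introduced in the following commit can call the very same code instead of duplicating it.

    struct thing;
    int create_debugfs_entries(struct thing *t);
    void remove_debugfs_entries(struct thing *t);
    int publish(struct thing *t);
    void unpublish(struct thing *t);

    /* Shared cleanup helper. */
    static void thing_debugfs_remove(struct thing *t)
    {
            remove_debugfs_entries(t);
    }

    void thing_unregister(struct thing *t)
    {
            unpublish(t);
            thing_debugfs_remove(t);
    }

    int thing_register(struct thing *t)
    {
            int ret = create_debugfs_entries(t);

            if (ret)
                    return ret;
            ret = publish(t);
            if (ret)
                    thing_debugfs_remove(t);        /* reuse the helper on error */
            return ret;
    }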
Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20221114042637.1009333-3-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-sysfs.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) (limited to 'block') diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index bd223a3bef47..197646d479b4 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -800,6 +800,19 @@ struct kobj_type blk_queue_ktype = { .release = blk_release_queue, }; +static void blk_debugfs_remove(struct gendisk *disk) +{ + struct request_queue *q = disk->queue; + + mutex_lock(&q->debugfs_mutex); + blk_trace_shutdown(q); + debugfs_remove_recursive(q->debugfs_dir); + q->debugfs_dir = NULL; + q->sched_debugfs_dir = NULL; + q->rqos_debugfs_dir = NULL; + mutex_unlock(&q->debugfs_mutex); +} + /** * blk_register_queue - register a block layer queue with sysfs * @disk: Disk of which the request queue should be registered with sysfs. @@ -925,11 +938,5 @@ void blk_unregister_queue(struct gendisk *disk) kobject_del(&q->kobj); mutex_unlock(&q->sysfs_dir_lock); - mutex_lock(&q->debugfs_mutex); - blk_trace_shutdown(q); - debugfs_remove_recursive(q->debugfs_dir); - q->debugfs_dir = NULL; - q->sched_debugfs_dir = NULL; - q->rqos_debugfs_dir = NULL; - mutex_unlock(&q->debugfs_mutex); + blk_debugfs_remove(disk); } -- cgit v1.2.3 From 40602997be26887bdfa3d58659c3acb4579099e9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 14 Nov 2022 05:26:35 +0100 Subject: block: fix error unwinding in blk_register_queue blk_register_queue fails to handle errors from blk_mq_sysfs_register, leaks various resources on errors and accidentally sets queue refs percpu refcount to percpu mode on kobject_add failure. Fix all that by properly unwinding on errors. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20221114042637.1009333-4-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-sysfs.c | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) (limited to 'block') diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 197646d479b4..abd1784ff05e 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -823,13 +823,15 @@ int blk_register_queue(struct gendisk *disk) int ret; mutex_lock(&q->sysfs_dir_lock); - ret = kobject_add(&q->kobj, &disk_to_dev(disk)->kobj, "queue"); if (ret < 0) - goto unlock; + goto out_unlock_dir; - if (queue_is_mq(q)) - blk_mq_sysfs_register(disk); + if (queue_is_mq(q)) { + ret = blk_mq_sysfs_register(disk); + if (ret) + goto out_del_queue_kobj; + } mutex_lock(&q->sysfs_lock); mutex_lock(&q->debugfs_mutex); @@ -841,17 +843,17 @@ int blk_register_queue(struct gendisk *disk) ret = disk_register_independent_access_ranges(disk); if (ret) - goto put_dev; + goto out_debugfs_remove; if (q->elevator) { ret = elv_register_queue(q, false); if (ret) - goto put_dev; + goto out_unregister_ia_ranges; } ret = blk_crypto_sysfs_register(disk); if (ret) - goto put_dev; + goto out_elv_unregister; blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q); wbt_enable_default(q); @@ -862,8 +864,6 @@ int blk_register_queue(struct gendisk *disk) if (q->elevator) kobject_uevent(&q->elevator->kobj, KOBJ_ADD); mutex_unlock(&q->sysfs_lock); - -unlock: mutex_unlock(&q->sysfs_dir_lock); /* @@ -882,13 +882,17 @@ unlock: return ret; -put_dev: +out_elv_unregister: elv_unregister_queue(q); +out_unregister_ia_ranges: disk_unregister_independent_access_ranges(disk); +out_debugfs_remove: + blk_debugfs_remove(disk); mutex_unlock(&q->sysfs_lock); - mutex_unlock(&q->sysfs_dir_lock); 
+out_del_queue_kobj: kobject_del(&q->kobj); - +out_unlock_dir: + mutex_unlock(&q->sysfs_dir_lock); return ret; } -- cgit v1.2.3 From 2bd85221a625b316114bafaab527770b607095d3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 14 Nov 2022 05:26:36 +0100 Subject: block: untangle request_queue refcounting from sysfs The kobject embedded into the request_queue is used for the queue directory in sysfs, but that is a child of the gendisks directory and is intimately tied to it. Move this kobject to the gendisk and use a refcount_t in the request_queue for the actual request_queue refcounting that is completely unrelated to the device model. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20221114042637.1009333-5-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-core.c | 42 ++++++++++++++++++----- block/blk-crypto-sysfs.c | 4 +-- block/blk-ia-ranges.c | 3 +- block/blk-sysfs.c | 86 +++++++++++++----------------------------------- block/blk.h | 4 --- block/bsg.c | 11 ++++--- block/elevator.c | 2 +- include/linux/blkdev.h | 6 ++-- 8 files changed, 71 insertions(+), 87 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index e9e2bf15cd90..d14317bfdf65 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -59,12 +59,12 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(block_split); EXPORT_TRACEPOINT_SYMBOL_GPL(block_unplug); EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_insert); -DEFINE_IDA(blk_queue_ida); +static DEFINE_IDA(blk_queue_ida); /* * For queue allocation */ -struct kmem_cache *blk_requestq_cachep; +static struct kmem_cache *blk_requestq_cachep; /* * Controlling structure to kblockd @@ -252,19 +252,46 @@ void blk_clear_pm_only(struct request_queue *q) } EXPORT_SYMBOL_GPL(blk_clear_pm_only); +static void blk_free_queue_rcu(struct rcu_head *rcu_head) +{ + kmem_cache_free(blk_requestq_cachep, + container_of(rcu_head, struct request_queue, rcu_head)); +} + +static void blk_free_queue(struct request_queue *q) +{ + might_sleep(); + + percpu_ref_exit(&q->q_usage_counter); + + if (q->poll_stat) + blk_stat_remove_callback(q, q->poll_cb); + blk_stat_free_callback(q->poll_cb); + + blk_free_queue_stats(q->stats); + kfree(q->poll_stat); + + if (queue_is_mq(q)) + blk_mq_release(q); + + ida_free(&blk_queue_ida, q->id); + call_rcu(&q->rcu_head, blk_free_queue_rcu); +} + /** * blk_put_queue - decrement the request_queue refcount * @q: the request_queue structure to decrement the refcount for * - * Decrements the refcount of the request_queue kobject. When this reaches 0 - * we'll have blk_release_queue() called. + * Decrements the refcount of the request_queue and free it when the refcount + * reaches 0. * * Context: Any context, but the last reference must not be dropped from * atomic context. 
*/ void blk_put_queue(struct request_queue *q) { - kobject_put(&q->kobj); + if (refcount_dec_and_test(&q->refs)) + blk_free_queue(q); } EXPORT_SYMBOL(blk_put_queue); @@ -399,8 +426,7 @@ struct request_queue *blk_alloc_queue(int node_id) INIT_WORK(&q->timeout_work, blk_timeout_work); INIT_LIST_HEAD(&q->icq_list); - kobject_init(&q->kobj, &blk_queue_ktype); - + refcount_set(&q->refs, 1); mutex_init(&q->debugfs_mutex); mutex_init(&q->sysfs_lock); mutex_init(&q->sysfs_dir_lock); @@ -445,7 +471,7 @@ bool blk_get_queue(struct request_queue *q) { if (unlikely(blk_queue_dying(q))) return false; - kobject_get(&q->kobj); + refcount_inc(&q->refs); return true; } EXPORT_SYMBOL(blk_get_queue); diff --git a/block/blk-crypto-sysfs.c b/block/blk-crypto-sysfs.c index e05f145cd797..55268edc0625 100644 --- a/block/blk-crypto-sysfs.c +++ b/block/blk-crypto-sysfs.c @@ -140,8 +140,8 @@ int blk_crypto_sysfs_register(struct gendisk *disk) return -ENOMEM; obj->profile = q->crypto_profile; - err = kobject_init_and_add(&obj->kobj, &blk_crypto_ktype, &q->kobj, - "crypto"); + err = kobject_init_and_add(&obj->kobj, &blk_crypto_ktype, + &disk->queue_kobj, "crypto"); if (err) { kobject_put(&obj->kobj); return err; diff --git a/block/blk-ia-ranges.c b/block/blk-ia-ranges.c index 2bd1d311033b..2141931ddd37 100644 --- a/block/blk-ia-ranges.c +++ b/block/blk-ia-ranges.c @@ -123,7 +123,8 @@ int disk_register_independent_access_ranges(struct gendisk *disk) */ WARN_ON(iars->sysfs_registered); ret = kobject_init_and_add(&iars->kobj, &blk_ia_ranges_ktype, - &q->kobj, "%s", "independent_access_ranges"); + &disk->queue_kobj, "%s", + "independent_access_ranges"); if (ret) { disk->ia_ranges = NULL; kobject_put(&iars->kobj); diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index abd1784ff05e..93d9e9c9a6ea 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -683,8 +683,8 @@ static struct attribute *queue_attrs[] = { static umode_t queue_attr_visible(struct kobject *kobj, struct attribute *attr, int n) { - struct request_queue *q = - container_of(kobj, struct request_queue, kobj); + struct gendisk *disk = container_of(kobj, struct gendisk, queue_kobj); + struct request_queue *q = disk->queue; if (attr == &queue_io_timeout_entry.attr && (!q->mq_ops || !q->mq_ops->timeout)) @@ -710,8 +710,8 @@ static ssize_t queue_attr_show(struct kobject *kobj, struct attribute *attr, char *page) { struct queue_sysfs_entry *entry = to_queue(attr); - struct request_queue *q = - container_of(kobj, struct request_queue, kobj); + struct gendisk *disk = container_of(kobj, struct gendisk, queue_kobj); + struct request_queue *q = disk->queue; ssize_t res; if (!entry->show) @@ -727,63 +727,19 @@ queue_attr_store(struct kobject *kobj, struct attribute *attr, const char *page, size_t length) { struct queue_sysfs_entry *entry = to_queue(attr); - struct request_queue *q; + struct gendisk *disk = container_of(kobj, struct gendisk, queue_kobj); + struct request_queue *q = disk->queue; ssize_t res; if (!entry->store) return -EIO; - q = container_of(kobj, struct request_queue, kobj); mutex_lock(&q->sysfs_lock); res = entry->store(q, page, length); mutex_unlock(&q->sysfs_lock); return res; } -static void blk_free_queue_rcu(struct rcu_head *rcu_head) -{ - kmem_cache_free(blk_requestq_cachep, - container_of(rcu_head, struct request_queue, rcu_head)); -} - -/** - * blk_release_queue - releases all allocated resources of the request_queue - * @kobj: pointer to a kobject, whose container is a request_queue - * - * This function releases all allocated resources of the 
request queue. - * - * The struct request_queue refcount is incremented with blk_get_queue() and - * decremented with blk_put_queue(). Once the refcount reaches 0 this function - * is called. - * - * Drivers exist which depend on the release of the request_queue to be - * synchronous, it should not be deferred. - * - * Context: can sleep - */ -static void blk_release_queue(struct kobject *kobj) -{ - struct request_queue *q = - container_of(kobj, struct request_queue, kobj); - - might_sleep(); - - percpu_ref_exit(&q->q_usage_counter); - - if (q->poll_stat) - blk_stat_remove_callback(q, q->poll_cb); - blk_stat_free_callback(q->poll_cb); - - blk_free_queue_stats(q->stats); - kfree(q->poll_stat); - - if (queue_is_mq(q)) - blk_mq_release(q); - - ida_free(&blk_queue_ida, q->id); - call_rcu(&q->rcu_head, blk_free_queue_rcu); -} - static const struct sysfs_ops queue_sysfs_ops = { .show = queue_attr_show, .store = queue_attr_store, @@ -794,10 +750,15 @@ static const struct attribute_group *blk_queue_attr_groups[] = { NULL }; -struct kobj_type blk_queue_ktype = { +static void blk_queue_release(struct kobject *kobj) +{ + /* nothing to do here, all data is associated with the parent gendisk */ +} + +static struct kobj_type blk_queue_ktype = { .default_groups = blk_queue_attr_groups, .sysfs_ops = &queue_sysfs_ops, - .release = blk_release_queue, + .release = blk_queue_release, }; static void blk_debugfs_remove(struct gendisk *disk) @@ -823,20 +784,20 @@ int blk_register_queue(struct gendisk *disk) int ret; mutex_lock(&q->sysfs_dir_lock); - ret = kobject_add(&q->kobj, &disk_to_dev(disk)->kobj, "queue"); + kobject_init(&disk->queue_kobj, &blk_queue_ktype); + ret = kobject_add(&disk->queue_kobj, &disk_to_dev(disk)->kobj, "queue"); if (ret < 0) - goto out_unlock_dir; + goto out_put_queue_kobj; if (queue_is_mq(q)) { ret = blk_mq_sysfs_register(disk); if (ret) - goto out_del_queue_kobj; + goto out_put_queue_kobj; } mutex_lock(&q->sysfs_lock); mutex_lock(&q->debugfs_mutex); - q->debugfs_dir = debugfs_create_dir(kobject_name(q->kobj.parent), - blk_debugfs_root); + q->debugfs_dir = debugfs_create_dir(disk->disk_name, blk_debugfs_root); if (queue_is_mq(q)) blk_mq_debugfs_register(q); mutex_unlock(&q->debugfs_mutex); @@ -860,7 +821,7 @@ int blk_register_queue(struct gendisk *disk) blk_throtl_register(disk); /* Now everything is ready and send out KOBJ_ADD uevent */ - kobject_uevent(&q->kobj, KOBJ_ADD); + kobject_uevent(&disk->queue_kobj, KOBJ_ADD); if (q->elevator) kobject_uevent(&q->elevator->kobj, KOBJ_ADD); mutex_unlock(&q->sysfs_lock); @@ -889,9 +850,8 @@ out_unregister_ia_ranges: out_debugfs_remove: blk_debugfs_remove(disk); mutex_unlock(&q->sysfs_lock); -out_del_queue_kobj: - kobject_del(&q->kobj); -out_unlock_dir: +out_put_queue_kobj: + kobject_put(&disk->queue_kobj); mutex_unlock(&q->sysfs_dir_lock); return ret; } @@ -938,8 +898,8 @@ void blk_unregister_queue(struct gendisk *disk) mutex_unlock(&q->sysfs_lock); /* Now that we've deleted all child objects, we can delete the queue. 
*/ - kobject_uevent(&q->kobj, KOBJ_REMOVE); - kobject_del(&q->kobj); + kobject_uevent(&disk->queue_kobj, KOBJ_REMOVE); + kobject_del(&disk->queue_kobj); mutex_unlock(&q->sysfs_dir_lock); blk_debugfs_remove(disk); diff --git a/block/blk.h b/block/blk.h index e85703ae81dd..a8ac9803fcb3 100644 --- a/block/blk.h +++ b/block/blk.h @@ -26,10 +26,6 @@ struct blk_flush_queue { spinlock_t mq_flush_lock; }; -extern struct kmem_cache *blk_requestq_cachep; -extern struct kobj_type blk_queue_ktype; -extern struct ida blk_queue_ida; - bool is_flush_rq(struct request *req); struct blk_flush_queue *blk_alloc_flush_queue(int node, int cmd_size, diff --git a/block/bsg.c b/block/bsg.c index 2ab1351eb082..8eba57b9bb46 100644 --- a/block/bsg.c +++ b/block/bsg.c @@ -175,8 +175,10 @@ static void bsg_device_release(struct device *dev) void bsg_unregister_queue(struct bsg_device *bd) { - if (bd->queue->kobj.sd) - sysfs_remove_link(&bd->queue->kobj, "bsg"); + struct gendisk *disk = bd->queue->disk; + + if (disk && disk->queue_kobj.sd) + sysfs_remove_link(&disk->queue_kobj, "bsg"); cdev_device_del(&bd->cdev, &bd->device); put_device(&bd->device); } @@ -216,8 +218,9 @@ struct bsg_device *bsg_register_queue(struct request_queue *q, if (ret) goto out_put_device; - if (q->kobj.sd) { - ret = sysfs_create_link(&q->kobj, &bd->device.kobj, "bsg"); + if (q->disk && q->disk->queue_kobj.sd) { + ret = sysfs_create_link(&q->disk->queue_kobj, &bd->device.kobj, + "bsg"); if (ret) goto out_device_del; } diff --git a/block/elevator.c b/block/elevator.c index 14e03632b5b5..adee58e48e2d 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -467,7 +467,7 @@ int elv_register_queue(struct request_queue *q, bool uevent) lockdep_assert_held(&q->sysfs_lock); - error = kobject_add(&e->kobj, &q->kobj, "%s", "iosched"); + error = kobject_add(&e->kobj, &q->disk->queue_kobj, "iosched"); if (!error) { struct elv_fs_entry *attr = e->type->elevator_attrs; if (attr) { diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 516e45246868..469299ea0660 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -155,6 +155,7 @@ struct gendisk { unsigned open_partitions; /* number of open partitions */ struct backing_dev_info *bdi; + struct kobject queue_kobj; /* the queue/ directory */ struct kobject *slave_dir; #ifdef CONFIG_BLOCK_HOLDER_DEPRECATED struct list_head slave_bdevs; @@ -430,10 +431,7 @@ struct request_queue { struct gendisk *disk; - /* - * queue kobject - */ - struct kobject kobj; + refcount_t refs; /* * mq queue kobject -- cgit v1.2.3 From 63f93fd6fa5717769a78d6d7bea6f7f9a1ccca8e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 14 Nov 2022 05:26:37 +0100 Subject: block: mark blk_put_queue as potentially blocking We can't just say that the last reference release may block, as any reference dropped could be the last one. So move the might_sleep() from blk_free_queue to blk_put_queue and update the documentation. 
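The reasoning condensed into a sketch (hypothetical refcounted object, not the kernel code): with the check in the put path itself, every caller is verified against atomic context on every drop, not only the rare caller that happens to drop the final reference and reach the sleeping free.

    struct obj { refcount_t refs; };
    void obj_free(struct obj *o);       /* may sleep */

    void obj_put(struct obj *o)
    {
            might_sleep();  /* warns for any atomic-context caller, last ref or not */
            if (refcount_dec_and_test(&o->refs))
                    obj_free(o);
    }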
Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20221114042637.1009333-6-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-core.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index d14317bfdf65..8ab21dd01cd1 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -260,8 +260,6 @@ static void blk_free_queue_rcu(struct rcu_head *rcu_head) static void blk_free_queue(struct request_queue *q) { - might_sleep(); - percpu_ref_exit(&q->q_usage_counter); @@ -285,11 +283,11 @@ static void blk_free_queue(struct request_queue *q) * Decrements the refcount of the request_queue and free it when the refcount * reaches 0. * - * Context: Any context, but the last reference must not be dropped from - * atomic context. + * Context: Can sleep. */ void blk_put_queue(struct request_queue *q) { + might_sleep(); if (refcount_dec_and_test(&q->refs)) blk_free_queue(q); } -- cgit v1.2.3 From 36369f46e91785688a5f39d7a5590e3f07981316 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 30 Nov 2022 18:56:53 +0100 Subject: block: Do not reread partition table on exclusively open device Since commit 10c70d95c0f2 ("block: remove the bd_openers checks in blk_drop_partitions") we allow rereading of partition table although there are users of the block device. This has an undesirable consequence that e.g. if sda and sdb are assembled to a RAID1 device md0 with partitions, BLKRRPART ioctl on sda will rescan partition table and create sda1 device. This partition device under a raid device confuses some programs (such as libstorage-ng used for initial partitioning for distribution installation) leading to failures. Fix the problem by refusing to rescan partitions if there is another user that has the block device exclusively open. Cc: stable@vger.kernel.org Link: https://lore.kernel.org/all/20221130135344.2ul4cyfstfs3znxg@quack3 Fixes: 10c70d95c0f2 ("block: remove the bd_openers checks in blk_drop_partitions") Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20221130175653.24299-1-jack@suse.cz [axboe: fold in followup fix] Signed-off-by: Jens Axboe --- block/blk.h | 2 +- block/genhd.c | 7 +++++-- block/ioctl.c | 12 +++++++----- 3 files changed, 13 insertions(+), 8 deletions(-) (limited to 'block') diff --git a/block/blk.h b/block/blk.h index a8ac9803fcb3..8900001946c7 100644 --- a/block/blk.h +++ b/block/blk.h @@ -426,7 +426,7 @@ int bio_add_hw_page(struct request_queue *q, struct bio *bio, struct request_queue *blk_alloc_queue(int node_id); -int disk_scan_partitions(struct gendisk *disk, fmode_t mode); +int disk_scan_partitions(struct gendisk *disk, fmode_t mode, void *owner); int disk_alloc_events(struct gendisk *disk); void disk_add_events(struct gendisk *disk); diff --git a/block/genhd.c b/block/genhd.c index 075d8da284f5..52d71a94a809 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -356,7 +356,7 @@ void disk_uevent(struct gendisk *disk, enum kobject_action action) } EXPORT_SYMBOL_GPL(disk_uevent); -int disk_scan_partitions(struct gendisk *disk, fmode_t mode) +int disk_scan_partitions(struct gendisk *disk, fmode_t mode, void *owner) { struct block_device *bdev; @@ -366,6 +366,9 @@ int disk_scan_partitions(struct gendisk *disk, fmode_t mode) return -EINVAL; if (disk->open_partitions) return -EBUSY; + /* Someone else has bdev exclusively open?
*/ + if (disk->part0->bd_holder && disk->part0->bd_holder != owner) + return -EBUSY; set_bit(GD_NEED_PART_SCAN, &disk->state); bdev = blkdev_get_by_dev(disk_devt(disk), mode, NULL); @@ -495,7 +498,7 @@ int __must_check device_add_disk(struct device *parent, struct gendisk *disk, bdev_add(disk->part0, ddev->devt); if (get_capacity(disk)) - disk_scan_partitions(disk, FMODE_READ); + disk_scan_partitions(disk, FMODE_READ, NULL); /* * Announce the disk and partitions after all partitions are diff --git a/block/ioctl.c b/block/ioctl.c index 60121e89052b..96617512982e 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -467,9 +467,10 @@ static int blkdev_bszset(struct block_device *bdev, fmode_t mode, * user space. Note the separate arg/argp parameters that are needed * to deal with the compat_ptr() conversion. */ -static int blkdev_common_ioctl(struct block_device *bdev, fmode_t mode, - unsigned cmd, unsigned long arg, void __user *argp) +static int blkdev_common_ioctl(struct file *file, fmode_t mode, unsigned cmd, + unsigned long arg, void __user *argp) { + struct block_device *bdev = I_BDEV(file->f_mapping->host); unsigned int max_sectors; switch (cmd) { @@ -527,7 +528,8 @@ static int blkdev_common_ioctl(struct block_device *bdev, fmode_t mode, return -EACCES; if (bdev_is_partition(bdev)) return -EINVAL; - return disk_scan_partitions(bdev->bd_disk, mode & ~FMODE_EXCL); + return disk_scan_partitions(bdev->bd_disk, mode & ~FMODE_EXCL, + file); case BLKTRACESTART: case BLKTRACESTOP: case BLKTRACETEARDOWN: @@ -605,7 +607,7 @@ long blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg) break; } - ret = blkdev_common_ioctl(bdev, mode, cmd, arg, argp); + ret = blkdev_common_ioctl(file, mode, cmd, arg, argp); if (ret != -ENOIOCTLCMD) return ret; @@ -674,7 +676,7 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg) break; } - ret = blkdev_common_ioctl(bdev, mode, cmd, arg, argp); + ret = blkdev_common_ioctl(file, mode, cmd, arg, argp); if (ret == -ENOIOCTLCMD && disk->fops->compat_ioctl) ret = disk->fops->compat_ioctl(bdev, mode, cmd, arg); -- cgit v1.2.3 From ecaaaabeead5329aa4c697c484bfc21cc129ab10 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Tue, 18 Oct 2022 20:19:28 +0800 Subject: blk-iocost: Fix typo in comment soley -> solely Signed-off-by: Kemeng Shi Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20221018121932.10792-2-shikemeng@huawei.com Signed-off-by: Jens Axboe --- block/blk-iocost.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-iocost.c b/block/blk-iocost.c index f01359906c83..2b8abc499b18 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -111,7 +111,7 @@ * busy signal. * * As devices can have deep queues and be unfair in how the queued commands - * are executed, soley depending on rq wait may not result in satisfactory + * are executed, solely depending on rq wait may not result in satisfactory * control quality. For a better control quality, completion latency QoS * parameters can be configured so that the device is considered saturated * if N'th percentile completion latency rises above the set point. 
-- cgit v1.2.3 From c6d2efdd38b4e45741e103b6338f08e5ff2aae03 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Tue, 18 Oct 2022 20:19:29 +0800 Subject: blk-iocost: Reset vtime_base_rate in ioc_refresh_params Since commit ac33e91e2daca ("blk-iocost: implement vtime loss compensation") split vtime_rate into vtime_rate and vtime_base_rate, we need to reset both vtime_base_rate and vtime_rate when device parameters are refreshed. If vtime_base_rate is not reset here, vtime_rate will be overwritten with the old vtime_base_rate soon in ioc_refresh_vrate. Signed-off-by: Kemeng Shi Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20221018121932.10792-3-shikemeng@huawei.com Signed-off-by: Jens Axboe --- block/blk-iocost.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 2b8abc499b18..c3945d470779 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -906,8 +906,10 @@ static bool ioc_refresh_params(struct ioc *ioc, bool force) if (idx == ioc->autop_idx && !force) return false; - if (idx != ioc->autop_idx) + if (idx != ioc->autop_idx) { atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC); + ioc->vtime_base_rate = VTIME_PER_USEC; + } ioc->autop_idx = idx; ioc->autop_too_fast_at = 0; -- cgit v1.2.3 From 63c9eac4b6d75859703f5820414986edefb01210 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Tue, 18 Oct 2022 20:19:30 +0800 Subject: blk-iocost: Trace vtime_base_rate instead of vtime_rate Since commit ac33e91e2daca ("blk-iocost: implement vtime loss compensation") renamed the original vtime_rate to vtime_base_rate, the current vtime_rate is the original vtime_rate with compensation applied. The rate shown in the tracepoints is a mix of vtime_rate and vtime_base_rate: 1) In function ioc_adjust_base_vrate, the first trace_iocost_ioc_vrate_adj shows vtime_rate, the second trace_iocost_ioc_vrate_adj shows vtime_base_rate. 2) Function iocg_activate shows vtime_rate by calling TRACE_IOCG_PATH(iocg_activate... 3) Function ioc_check_iocgs shows vtime_rate by calling TRACE_IOCG_PATH(iocg_idle... Trace vtime_base_rate instead of vtime_rate because: 1) Before commit ac33e91e2daca ("blk-iocost: implement vtime loss compensation"), the traced rate was without compensation, so keep showing the rate without compensation. 2) vtime_base_rate is more stable, while vtime_rate heavily depends on the excess budget of the current period, which may change abruptly in the next period.
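A rough sketch of the distinction (conceptual only; blk-iocost's actual compensation math differs): vtime_base_rate is the steady, parameter-driven rate, while vtime_rate layers per-period loss compensation on top and can therefore jump between periods.

    /* Conceptual illustration, not the kernel's formula: comp_pct is an
     * assumed per-period compensation percentage. */
    static unsigned long long effective_vrate(unsigned long long base_rate,
                                              unsigned int comp_pct)
    {
            return base_rate * comp_pct / 100ULL;
    }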
Signed-off-by: Kemeng Shi Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20221018121932.10792-4-shikemeng@huawei.com Signed-off-by: Jens Axboe --- block/blk-iocost.c | 2 +- include/trace/events/iocost.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/blk-iocost.c b/block/blk-iocost.c index c3945d470779..f57a84ff0244 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -977,7 +977,7 @@ static void ioc_adjust_base_vrate(struct ioc *ioc, u32 rq_wait_pct, if (!ioc->busy_level || (ioc->busy_level < 0 && nr_lagging)) { if (ioc->busy_level != prev_busy_level || nr_lagging) - trace_iocost_ioc_vrate_adj(ioc, atomic64_read(&ioc->vtime_rate), + trace_iocost_ioc_vrate_adj(ioc, vrate, missed_ppm, rq_wait_pct, nr_lagging, nr_shortages); diff --git a/include/trace/events/iocost.h b/include/trace/events/iocost.h index 6d1626e7a4ce..af8bfed528fc 100644 --- a/include/trace/events/iocost.h +++ b/include/trace/events/iocost.h @@ -38,7 +38,7 @@ DECLARE_EVENT_CLASS(iocost_iocg_state, __assign_str(cgroup, path); __entry->now = now->now; __entry->vnow = now->vnow; - __entry->vrate = now->vrate; + __entry->vrate = iocg->ioc->vtime_base_rate; __entry->last_period = last_period; __entry->cur_period = cur_period; __entry->vtime = vtime; @@ -160,7 +160,7 @@ TRACE_EVENT(iocost_ioc_vrate_adj, TP_fast_assign( __assign_str(devname, ioc_name(ioc)); - __entry->old_vrate = atomic64_read(&ioc->vtime_rate); + __entry->old_vrate = ioc->vtime_base_rate; __entry->new_vrate = new_vrate; __entry->busy_level = ioc->busy_level; __entry->read_missed_ppm = missed_ppm[READ]; -- cgit v1.2.3 From 6c31be320c52bad07e0317d2565ae8a4e5dab03b Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Tue, 18 Oct 2022 20:19:31 +0800 Subject: blk-iocost: Remove vrate member in struct ioc_now If we trace vtime_base_rate instead of vtime_rate, nothing accesses now->vrate except ioc_now(), which only uses it locally. Just remove it. Signed-off-by: Kemeng Shi Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20221018121932.10792-5-shikemeng@huawei.com Signed-off-by: Jens Axboe --- block/blk-iocost.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/blk-iocost.c b/block/blk-iocost.c index f57a84ff0244..1b1a2d0f5564 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -556,7 +556,6 @@ struct ioc_now { u64 now_ns; u64 now; u64 vnow; - u64 vrate; }; struct iocg_wait { @@ -1020,10 +1019,11 @@ static void ioc_adjust_base_vrate(struct ioc *ioc, u32 rq_wait_pct, static void ioc_now(struct ioc *ioc, struct ioc_now *now) { unsigned seq; + u64 vrate; now->now_ns = ktime_get(); now->now = ktime_to_us(now->now_ns); - now->vrate = atomic64_read(&ioc->vtime_rate); + vrate = atomic64_read(&ioc->vtime_rate); /* * The current vtime is @@ -1036,7 +1036,7 @@ static void ioc_now(struct ioc *ioc, struct ioc_now *now) do { seq = read_seqcount_begin(&ioc->period_seqcount); now->vnow = ioc->period_at_vtime + - (now->now - ioc->period_at) * now->vrate; + (now->now - ioc->period_at) * vrate; } while (read_seqcount_retry(&ioc->period_seqcount, seq)); } -- cgit v1.2.3 From 7a88b1a8263a8a0797a5e528b9cd3607e2462a3b Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Tue, 18 Oct 2022 20:19:32 +0800 Subject: blk-iocost: Correct comment in blk_iocost_init There is no iocg_pd_init function. The pd_alloc_fn function pointer of the iocost policy is set to ioc_pd_init. Just correct it.
Signed-off-by: Kemeng Shi Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20221018121932.10792-6-shikemeng@huawei.com Signed-off-by: Jens Axboe --- block/blk-iocost.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 1b1a2d0f5564..d1bdc12deaa7 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -2882,7 +2882,7 @@ static int blk_iocost_init(struct gendisk *disk) spin_unlock_irq(&ioc->lock); /* - * rqos must be added before activation to allow iocg_pd_init() to + * rqos must be added before activation to allow ioc_pd_init() to * lookup the ioc from q. This means that the rqos methods may get * called before policy activation completion, can't assume that the * target bio has an iocg associated and need to test for NULL iocg. -- cgit v1.2.3 From 2e833c8c8c42a3b6e22d6b3a9d2d18e425551261 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 30 Nov 2022 23:03:31 -0800 Subject: block: bdev & blktrace: use consistent function doc. notation Use only one hyphen in kernel-doc notation between the function name and its short description. This is the documented kernel-doc format. It also fixes the HTML presentation to be consistent with other functions. Signed-off-by: Randy Dunlap Cc: Jens Axboe Cc: linux-block@vger.kernel.org Link: https://lore.kernel.org/r/20221201070331.25685-1-rdunlap@infradead.org Signed-off-by: Jens Axboe --- block/bdev.c | 4 ++-- kernel/trace/blktrace.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'block') diff --git a/block/bdev.c b/block/bdev.c index d699ecdb3260..edc110d90df4 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -224,7 +224,7 @@ int fsync_bdev(struct block_device *bdev) EXPORT_SYMBOL(fsync_bdev); /** - * freeze_bdev -- lock a filesystem and force it into a consistent state + * freeze_bdev - lock a filesystem and force it into a consistent state * @bdev: blockdevice to lock * * If a superblock is found on this device, we take the s_umount semaphore @@ -268,7 +268,7 @@ done: EXPORT_SYMBOL(freeze_bdev); /** - * thaw_bdev -- unlock filesystem + * thaw_bdev - unlock filesystem * @bdev: blockdevice to unlock * * Unlocks the filesystem and marks it writeable again after freeze_bdev().
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index a995ea1ef849..94fde027bfc0 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -721,7 +721,7 @@ EXPORT_SYMBOL_GPL(blk_trace_startstop); */ /** - * blk_trace_ioctl: - handle the ioctls associated with tracing + * blk_trace_ioctl - handle the ioctls associated with tracing * @bdev: the block device * @cmd: the ioctl cmd * @arg: the argument data, if any @@ -769,7 +769,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) } /** - * blk_trace_shutdown: - stop and cleanup trace structures + * blk_trace_shutdown - stop and cleanup trace structures * @q: the request queue associated with the device * **/ -- cgit v1.2.3 From 1d6df9d352bb2a3c2ddb32851dfcafb417c47762 Mon Sep 17 00:00:00 2001 From: Yang Li Date: Fri, 2 Dec 2022 09:17:13 +0800 Subject: blk-cgroup: Fix some kernel-doc comments Change the parameter description from @gendisk to @disk in blkcg_schedule_throttle() to clear the warnings below: block/blk-cgroup.c:1850: warning: Function parameter or member 'disk' not described in 'blkcg_schedule_throttle' block/blk-cgroup.c:1850: warning: Excess function parameter 'gendisk' description in 'blkcg_schedule_throttle' Fixes: de185b56e8a6 ("blk-cgroup: pass a gendisk to blkcg_schedule_throttle") Link: https://bugzilla.openanolis.cn/show_bug.cgi?id=3338 Reported-by: Abaci Robot Signed-off-by: Yang Li Link: https://lore.kernel.org/r/20221202011713.14834-1-yang.lee@linux.alibaba.com Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 3e03c0d13253..0e225ca7bd9a 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1831,7 +1831,7 @@ out: /** * blkcg_schedule_throttle - this task needs to check for throttling - * @gendisk: disk to throttle + * @disk: disk to throttle * @use_memdelay: do we charge this to memory delay for PSI * * This is called by the IO controller when we know there's delay accumulated -- cgit v1.2.3 From 85d6ce58e493ac8b7122e2fbe3f41b94d6ebdc11 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Sat, 3 Dec 2022 15:07:47 +0100 Subject: block: remove devnode callback from struct block_device_operations With the removal of the pktcdvd driver, there are no in-kernel users of the devnode callback in struct block_device_operations, so it can be safely removed. If it is needed for new block drivers in the future, it can be brought back.
Cc: Jens Axboe Cc: linux-block@vger.kernel.org Signed-off-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20221203140747.1942969-1-gregkh@linuxfoundation.org Signed-off-by: Jens Axboe --- block/genhd.c | 11 ----------- include/linux/blkdev.h | 1 - 2 files changed, 12 deletions(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index 52d71a94a809..03a96d6473e1 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1196,21 +1196,10 @@ struct class block_class = { .dev_uevent = block_uevent, }; -static char *block_devnode(struct device *dev, umode_t *mode, - kuid_t *uid, kgid_t *gid) -{ - struct gendisk *disk = dev_to_disk(dev); - - if (disk->fops->devnode) - return disk->fops->devnode(disk, mode); - return NULL; -} - const struct device_type disk_type = { .name = "disk", .groups = disk_attr_groups, .release = disk_release, - .devnode = block_devnode, }; #ifdef CONFIG_PROC_FS diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 469299ea0660..2db2ad72af0f 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1395,7 +1395,6 @@ struct block_device_operations { void (*swap_slot_free_notify) (struct block_device *, unsigned long); int (*report_zones)(struct gendisk *, sector_t sector, unsigned int nr_zones, report_zones_cb cb, void *data); - char *(*devnode)(struct gendisk *disk, umode_t *mode); /* returns the length of the identifier or a negative errno: */ int (*get_unique_id)(struct gendisk *disk, u8 id[16], enum blk_unique_id id_type); -- cgit v1.2.3 From f56019aef353576f43f945fdd065858145090582 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Mon, 5 Dec 2022 19:57:01 +0800 Subject: blk-throttle: correct stale comment in throtl_pd_init On the default hierarchy (cgroup2), the throttle interface files don't exist in the root cgroup, so the ability to limit the whole system by configuring the root group no longer exists. In general, cgroup doesn't want to be in the business of restricting resources at the system level, so correct the stale comment from saying that we can limit the whole system to saying that we can only limit a subtree. Acked-by: Tejun Heo Signed-off-by: Kemeng Shi Link: https://lore.kernel.org/r/20221205115709.251489-2-shikemeng@huaweicloud.com Signed-off-by: Jens Axboe --- block/blk-throttle.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 847721dc2b2b..8e2349b17936 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -395,8 +395,9 @@ static void throtl_pd_init(struct blkg_policy_data *pd) * If on the default hierarchy, we switch to properly hierarchical * behavior where limits on a given throtl_grp are applied to the * whole subtree rather than just the group itself. e.g. If 16M - * read_bps limit is set on the root group, the whole system can't - * exceed 16M for the device. + * read_bps limit is set on a parent group, summary bps of + * parent group and its subtree groups can't exceed 16M for the + * device. * * If not on the default hierarchy, the broken flat hierarchy * behavior is retained where all throtl_grps are treated as if -- cgit v1.2.3 From 84aca0a7e039c8735abc0f89f3f48e9006c0dfc7 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Mon, 5 Dec 2022 19:57:02 +0800 Subject: blk-throttle: Fix that bps of child could exceed bps limited in parent Consider the following situation (on the default hierarchy): HDD | root (bps limit: 4k) | child (bps limit: 8k) | fio bs=8k The rate of fio is supposed to be 4k, but the result is 8k.
The reason is as follows: the size of a single IO from fio is larger than the bytes allowed in one throtl_slice in the child, so IOs are always queued in the child group first. When queued IOs in the child are dispatched to the parent group, BIO_BPS_THROTTLED is set and these IOs will not be limited by tg_within_bps_limit anymore. Fix this by only setting BIO_BPS_THROTTLED when the bio has traversed the entire tree. This patch has no effect on configurations that are not on the default hierarchy, as each group is then a single root group without a parent. Acked-by: Tejun Heo Signed-off-by: Kemeng Shi Link: https://lore.kernel.org/r/20221205115709.251489-3-shikemeng@huaweicloud.com Signed-off-by: Jens Axboe --- block/blk-throttle.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 8e2349b17936..2444ebf5f11d 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -1067,7 +1067,6 @@ static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw) sq->nr_queued[rw]--; throtl_charge_bio(tg, bio); - bio_set_flag(bio, BIO_BPS_THROTTLED); /* * If our parent is another tg, we just need to transfer @bio to @@ -1080,6 +1079,7 @@ static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw) throtl_add_bio_tg(bio, &tg->qnode_on_parent[rw], parent_tg); start_parent_slice_with_credit(tg, parent_tg, rw); } else { + bio_set_flag(bio, BIO_BPS_THROTTLED); throtl_qnode_add_bio(bio, &tg->qnode_on_parent[rw], &parent_sq->queued[rw]); BUG_ON(tg->td->nr_queued[rw] <= 0); -- cgit v1.2.3 From eb184791821409c37fef4f67638bb56bdaa82900 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Mon, 5 Dec 2022 19:57:03 +0800 Subject: blk-throttle: ignore cgroup without io queued in blk_throtl_cancel_bios Ignore a cgroup without io queued in blk_throtl_cancel_bios for two reasons: 1. Save CPU cycles that would be spent trying to dispatch a cgroup with no io queued. 2. Avoid the inconsistent state in which a cgroup is inserted into the service queue without THROTL_TG_PENDING set, as tg_update_disptime will unconditionally re-insert the cgroup into the service queue. On the default hierarchy, IO dispatched from a child in tg_dispatch_one_bio would trigger insertion into the service queue without erasing first and ruin the tree. Acked-by: Tejun Heo Signed-off-by: Kemeng Shi Link: https://lore.kernel.org/r/20221205115709.251489-4-shikemeng@huaweicloud.com Signed-off-by: Jens Axboe --- block/blk-throttle.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 2444ebf5f11d..75010110d481 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -1738,7 +1738,18 @@ void blk_throtl_cancel_bios(struct gendisk *disk) * Set the flag to make sure throtl_pending_timer_fn() won't * stop until all throttled bios are dispatched. */ - blkg_to_tg(blkg)->flags |= THROTL_TG_CANCELING; + tg->flags |= THROTL_TG_CANCELING; + + /* + * Do not dispatch cgroup without THROTL_TG_PENDING or cgroup + * will be inserted to service queue without THROTL_TG_PENDING + * set in tg_update_disptime below. Then IO dispatched from + * child in tg_dispatch_one_bio will trigger double insertion + * and corrupt the tree. + */ + if (!(tg->flags & THROTL_TG_PENDING)) + continue; + /* * Update disptime after setting the above flag to make sure * throtl_select_dispatch() won't exit without dispatching.
-- cgit v1.2.3 From 183daeb11de871b073515d14ec1e3bc0da79e038 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Mon, 5 Dec 2022 19:57:04 +0800 Subject: blk-throttle: correct calculation of wait time in tg_may_dispatch In C, when "if (expression1 && expression2)" is evaluated and expression1 returns false, expression2 is not evaluated (short-circuit evaluation). For "tg_within_bps_limit(tg, bio, bps_limit, &bps_wait) && tg_within_iops_limit(tg, bio, iops_limit, &iops_wait))", if bps is limited, tg_within_bps_limit returns false and tg_within_iops_limit is not called. So even when bps and iops are both limited, iops_wait is never calculated and is always zero, which means the iops wait time is always ignored. Fix this by always calling tg_within_bps_limit and tg_within_iops_limit to get the wait time for both bps and iops. Observed that: 1. The wait time in tg_within_iops_limit/tg_within_bps_limit always needs to be stored, as the wait argument is always passed. 2. The stored wait time is zero if the bio fits within the iops/bps limit and non-zero otherwise. Simplify tg_within_iops_limit/tg_within_bps_limit by removing the wait argument and returning the wait time directly. The caller tg_may_dispatch checks whether the wait time is zero to see whether the iops/bps limit is exceeded. Acked-by: Tejun Heo Signed-off-by: Kemeng Shi Link: https://lore.kernel.org/r/20221205115709.251489-5-shikemeng@huaweicloud.com Signed-off-by: Jens Axboe --- block/blk-throttle.c | 38 +++++++++++++------------------------- 1 file changed, 13 insertions(+), 25 deletions(-) (limited to 'block') diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 75010110d481..d5b7a2354ad7 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -822,17 +822,15 @@ static void tg_update_carryover(struct throtl_grp *tg) tg->carryover_ios[READ], tg->carryover_ios[WRITE]); } -static bool tg_within_iops_limit(struct throtl_grp *tg, struct bio *bio, - u32 iops_limit, unsigned long *wait) +static unsigned long tg_within_iops_limit(struct throtl_grp *tg, struct bio *bio, + u32 iops_limit) { bool rw = bio_data_dir(bio); unsigned int io_allowed; unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd; if (iops_limit == UINT_MAX) { - if (wait) - *wait = 0; - return true; + return 0; } jiffy_elapsed = jiffies - tg->slice_start[rw]; @@ -842,21 +840,16 @@ static bool tg_within_iops_limit(struct throtl_grp *tg, struct bio *bio, io_allowed = calculate_io_allowed(iops_limit, jiffy_elapsed_rnd) + tg->carryover_ios[rw]; if (tg->io_disp[rw] + 1 <= io_allowed) { - if (wait) - *wait = 0; - return true; + return 0; } /* Calc approx time to dispatch */ jiffy_wait = jiffy_elapsed_rnd - jiffy_elapsed; - - if (wait) - *wait = jiffy_wait; - return false; + return jiffy_wait; } -static bool tg_within_bps_limit(struct throtl_grp *tg, struct bio *bio, - u64 bps_limit, unsigned long *wait) +static unsigned long tg_within_bps_limit(struct throtl_grp *tg, struct bio *bio, + u64 bps_limit) { bool rw = bio_data_dir(bio); u64 bytes_allowed, extra_bytes; @@ -865,9 +858,7 @@ static bool tg_within_bps_limit(struct throtl_grp *tg, struct bio *bio, /* no need to throttle if this bio's bytes have been accounted */ if (bps_limit == U64_MAX || bio_flagged(bio, BIO_BPS_THROTTLED)) { - if (wait) - *wait = 0; - return true; + return 0; } jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw]; @@ -880,9 +871,7 @@ static bool tg_within_bps_limit(struct throtl_grp *tg, struct bio *bio, bytes_allowed = calculate_bytes_allowed(bps_limit, jiffy_elapsed_rnd) + tg->carryover_bytes[rw]; if (tg->bytes_disp[rw] + bio_size <= bytes_allowed) { - if (wait)
(wait) - *wait = 0; - return true; + return 0; } /* Calc approx time to dispatch */ @@ -897,9 +886,7 @@ static bool tg_within_bps_limit(struct throtl_grp *tg, struct bio *bio, * up we did. Add that time also. */ jiffy_wait = jiffy_wait + (jiffy_elapsed_rnd - jiffy_elapsed); - if (wait) - *wait = jiffy_wait; - return false; + return jiffy_wait; } /* @@ -947,8 +934,9 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio, jiffies + tg->td->throtl_slice); } - if (tg_within_bps_limit(tg, bio, bps_limit, &bps_wait) && - tg_within_iops_limit(tg, bio, iops_limit, &iops_wait)) { + bps_wait = tg_within_bps_limit(tg, bio, bps_limit); + iops_wait = tg_within_iops_limit(tg, bio, iops_limit); + if (bps_wait + iops_wait == 0) { if (wait) { *wait = 0; return true; -- cgit v1.2.3 From a4d508e333829a8394e59efa06ce56e51f3e2b29 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Mon, 5 Dec 2022 19:57:05 +0800 Subject: blk-throttle: simplify low limit reached check in throtl_tg_can_upgrade Commit c79892c557616 ("blk-throttle: add upgrade logic for LIMIT_LOW state") added upgrade logic for the low limit and mentioned that 1. "To determine if a cgroup exceeds its limitation, we check if the cgroup has pending request. Since cgroup is throttled according to the limit, pending request means the cgroup reaches the limit." 2. "If a cgroup has limit set for both read and write, we consider the combination of them for upgrade. The reason is read IO and write IO can interfere with each other. If we do the upgrade based on one direction IO, the other direction IO could be severely harmed." Besides, we also consider the low limit reached when the low limit is 0, see the comment in throtl_tg_can_upgrade. Collecting the information above, the design of the upgrade check is as follows: 1. The low limit is reached if the limit is zero or io is already queued. 2. A cgroup passes the upgrade check if the low limits of both READ and WRITE are reached. Simplify the check code described above to remove the repeated check and improve readability. There is no functional change. As a quick sanity check, the sketch below enumerates all cases; a detailed proof follows it.
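The sanity-check sketch: a small standalone C program (names local to this example) that enumerates all 16 combinations of the four predicates and asserts that the replaced conditions and the new packed condition agree:

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

/* rl/wl model "read/write low limit set"; qr/qw model
 * "read/write io queued" (sq->nr_queued[...] != 0). */
static bool old_check(bool rl, bool wl, bool qr, bool qw)
{
	if (!rl && !wl)
		return true;
	if (rl && qr && (!wl || qw))
		return true;
	if (wl && qw && (!rl || qr))
		return true;
	return false;
}

static bool new_check(bool rl, bool wl, bool qr, bool qw)
{
	/* throtl_low_limit_reached() per direction: !limit || queued */
	return (!rl || qr) && (!wl || qw);
}

int main(void)
{
	for (int i = 0; i < 16; i++) {
		bool rl = i & 1, wl = i & 2, qr = i & 4, qw = i & 8;

		assert(old_check(rl, wl, qr, qw) == new_check(rl, wl, qr, qw));
	}
	printf("old and new checks agree on all 16 cases\n");
	return 0;
}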
A detailed equivalence proof is as follows. All conditions under which the replaced code returns true are: condition 1 (!read_limit && !write_limit) condition 2 read_limit && sq->nr_queued[READ] && (!write_limit || sq->nr_queued[WRITE]) condition 3 write_limit && sq->nr_queued[WRITE] && (!read_limit || sq->nr_queued[READ]) Transforming condition 2: (read_limit && sq->nr_queued[READ]) && (!write_limit || sq->nr_queued[WRITE]) is equivalent to (read_limit && sq->nr_queued[READ]) && (!write_limit || (write_limit && sq->nr_queued[WRITE])) is equivalent to condition 2.1 (read_limit && sq->nr_queued[READ] && !write_limit) || condition 2.2 (read_limit && sq->nr_queued[READ] && (write_limit && sq->nr_queued[WRITE])) Transforming condition 3: write_limit && sq->nr_queued[WRITE] && (!read_limit || sq->nr_queued[READ]) is equivalent to (write_limit && sq->nr_queued[WRITE]) && (!read_limit || (read_limit && sq->nr_queued[READ])) is equivalent to condition 3.1 ((write_limit && sq->nr_queued[WRITE]) && !read_limit) || condition 3.2 ((write_limit && sq->nr_queued[WRITE]) && (read_limit && sq->nr_queued[READ])) Condition 3.2 is the same as condition 2.2, so all conditions under which we return true are: (!read_limit && !write_limit) (1) (!read_limit && (write_limit && sq->nr_queued[WRITE])) (3.1) ((read_limit && sq->nr_queued[READ]) && !write_limit) (2.1) ((write_limit && sq->nr_queued[WRITE]) && (read_limit && sq->nr_queued[READ])) (2.2) Since "(a1 || a2) && (b1 || b2)" expands to: a1 && b1 a1 && b2 a2 && b1 a2 && b2 and considering that: a1 = !read_limit a2 = read_limit && sq->nr_queued[READ] b1 = !write_limit b2 = write_limit && sq->nr_queued[WRITE] we can pack the replaced conditions into (!read_limit || (read_limit && sq->nr_queued[READ])) && (!write_limit || (write_limit && sq->nr_queued[WRITE])) which is equivalent to (!read_limit || sq->nr_queued[READ]) && (!write_limit || sq->nr_queued[WRITE]) Reported-by: kernel test robot Acked-by: Tejun Heo Signed-off-by: Kemeng Shi Link: https://lore.kernel.org/r/20221205115709.251489-6-shikemeng@huaweicloud.com Signed-off-by: Jens Axboe --- block/blk-throttle.c | 31 ++++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-) (limited to 'block') diff --git a/block/blk-throttle.c b/block/blk-throttle.c index d5b7a2354ad7..1623507ed56e 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -1816,24 +1816,29 @@ static bool throtl_tg_is_idle(struct throtl_grp *tg) return ret; } -static bool throtl_tg_can_upgrade(struct throtl_grp *tg) +static bool throtl_low_limit_reached(struct throtl_grp *tg, int rw) { struct throtl_service_queue *sq = &tg->service_queue; - bool read_limit, write_limit; + bool limit = tg->bps[rw][LIMIT_LOW] || tg->iops[rw][LIMIT_LOW]; /* - * if cgroup reaches low limit (if low limit is 0, the cgroup always - * reaches), it's ok to upgrade to next limit + * if low limit is zero, low limit is always reached. + * if low limit is non-zero, we can check if there is any request + * is queued to determine if low limit is reached as we throttle + * request according to limit.
*/ - read_limit = tg->bps[READ][LIMIT_LOW] || tg->iops[READ][LIMIT_LOW]; - write_limit = tg->bps[WRITE][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW]; - if (!read_limit && !write_limit) - return true; - if (read_limit && sq->nr_queued[READ] && - (!write_limit || sq->nr_queued[WRITE])) - return true; - if (write_limit && sq->nr_queued[WRITE] && - (!read_limit || sq->nr_queued[READ])) + return !limit || sq->nr_queued[rw]; +} + +static bool throtl_tg_can_upgrade(struct throtl_grp *tg) +{ + /* + * cgroup reaches low limit when low limit of READ and WRITE are + * both reached, it's ok to upgrade to next limit if cgroup reaches + * low limit + */ + if (throtl_low_limit_reached(tg, READ) && + throtl_low_limit_reached(tg, WRITE)) return true; if (time_after_eq(jiffies, -- cgit v1.2.3 From 009df341714c6c20a44dd9268681a8bff10bb050 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Mon, 5 Dec 2022 19:57:06 +0800 Subject: blk-throttle: fix typo in comment of throtl_adjusted_limit lapsed time -> elapsed time Acked-by: Tejun Heo Signed-off-by: Kemeng Shi Link: https://lore.kernel.org/r/20221205115709.251489-7-shikemeng@huaweicloud.com Signed-off-by: Jens Axboe --- block/blk-throttle.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 1623507ed56e..7db8592dae38 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -129,7 +129,7 @@ static struct throtl_data *sq_to_td(struct throtl_service_queue *sq) /* * cgroup's limit in LIMIT_MAX is scaled if low limit is set. This scale is to * make the IO dispatch more smooth. - * Scale up: linearly scale up according to lapsed time since upgrade. For + * Scale up: linearly scale up according to elapsed time since upgrade. For * every throtl_slice, the limit scales up 1/2 .low limit till the * limit hits .max limit * Scale down: exponentially scale down if a cgroup doesn't hit its .low limit -- cgit v1.2.3 From e3031d4c7d2c5bff6b5944d61d4e31319739d216 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Mon, 5 Dec 2022 19:57:07 +0800 Subject: blk-throttle: remove incorrect comment for tg_last_low_overflow_time Function tg_last_low_overflow_time is called with intermediate nodes as follows: throtl_hierarchy_can_downgrade throtl_tg_can_downgrade tg_last_low_overflow_time throtl_hierarchy_can_upgrade throtl_tg_can_upgrade tg_last_low_overflow_time throtl_hierarchy_can_downgrade/throtl_hierarchy_can_upgrade traverse from the leaf node to the sub-root node and pass each traversed intermediate node to tg_last_low_overflow_time. No such restriction can be found in the context or implementation of tg_last_low_overflow_time, so remove the restriction from the comment. A sketch of the traversal pattern follows.
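For reference, a minimal sketch of that leaf-to-root walk. toy_tg and the helpers are hypothetical simplified stand-ins; the kernel walks blkcg parents and stops at the sub-root rather than at NULL:

#include <stdbool.h>
#include <stddef.h>

struct toy_tg {
	struct toy_tg *parent;			/* NULL at the (sub-)root */
	unsigned long last_low_overflow_time;
};

/* called for leaf and intermediate nodes alike */
static unsigned long toy_last_low_overflow_time(struct toy_tg *tg)
{
	return tg->last_low_overflow_time;
}

static bool toy_tg_can_downgrade(struct toy_tg *tg, unsigned long now,
				 unsigned long slice)
{
	return now >= toy_last_low_overflow_time(tg) + slice;
}

/* mirrors the shape of throtl_hierarchy_can_downgrade(): every node on
 * the way up, including intermediate ones, is passed to the helper */
static bool toy_hierarchy_can_downgrade(struct toy_tg *tg, unsigned long now,
					unsigned long slice)
{
	for (; tg; tg = tg->parent)
		if (!toy_tg_can_downgrade(tg, now, slice))
			return false;
	return true;
}

int main(void)
{
	struct toy_tg root = { .parent = NULL, .last_low_overflow_time = 0 };
	struct toy_tg leaf = { .parent = &root, .last_low_overflow_time = 100 };

	/* leaf overflowed at t=100; with slice 50, downgrade is allowed at t=200 */
	return toy_hierarchy_can_downgrade(&leaf, 200, 50) ? 0 : 1;
}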
Acked-by: Tejun Heo Signed-off-by: Kemeng Shi Link: https://lore.kernel.org/r/20221205115709.251489-8-shikemeng@huaweicloud.com Signed-off-by: Jens Axboe --- block/blk-throttle.c | 1 - 1 file changed, 1 deletion(-) (limited to 'block') diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 7db8592dae38..e6a087de414d 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -1762,7 +1762,6 @@ static unsigned long __tg_last_low_overflow_time(struct throtl_grp *tg) return min(rtime, wtime); } -/* tg should not be an intermediate node */ static unsigned long tg_last_low_overflow_time(struct throtl_grp *tg) { struct throtl_service_queue *parent_sq; -- cgit v1.2.3 From 9c9f209d9d81ea67cd84f53f470a592c252d845d Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Mon, 5 Dec 2022 19:57:08 +0800 Subject: blk-throttle: remove repeat check of elapsed time There is no need to check the elapsed time since the last upgrade for each node in the hierarchy. Move this check before the traversal, as throtl_can_upgrade does, to remove the repeated check. Reported-by: kernel test robot Acked-by: Tejun Heo Signed-off-by: Kemeng Shi Link: https://lore.kernel.org/r/20221205115709.251489-9-shikemeng@huaweicloud.com Signed-off-by: Jens Axboe --- block/blk-throttle.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/blk-throttle.c b/block/blk-throttle.c index e6a087de414d..413e668249cf 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -1955,8 +1955,7 @@ static bool throtl_tg_can_downgrade(struct throtl_grp *tg) * If cgroup is below low limit, consider downgrade and throttle other * cgroups */ - if (time_after_eq(now, td->low_upgrade_time + td->throtl_slice) && - time_after_eq(now, tg_last_low_overflow_time(tg) + + if (time_after_eq(now, tg_last_low_overflow_time(tg) + td->throtl_slice) && (!throtl_tg_is_idle(tg) || !list_empty(&tg_to_blkg(tg)->blkcg->css.children))) @@ -1966,6 +1965,11 @@ static bool throtl_tg_can_downgrade(struct throtl_grp *tg) static bool throtl_hierarchy_can_downgrade(struct throtl_grp *tg) { + struct throtl_data *td = tg->td; + + if (time_before(jiffies, td->low_upgrade_time + td->throtl_slice)) + return false; + while (true) { if (!throtl_tg_can_downgrade(tg)) return false; -- cgit v1.2.3 From eea3e8b74aa1648fc96b739458d067a6e498c302 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Mon, 5 Dec 2022 19:57:09 +0800 Subject: blk-throttle: Use more suitable time_after check for update of slice_start There is no need to update tg->slice_start[rw] to start when they are already equal. So remove the "eq" part of the check before updating slice_start. Acked-by: Tejun Heo Signed-off-by: Kemeng Shi Link: https://lore.kernel.org/r/20221205115709.251489-10-shikemeng@huaweicloud.com Signed-off-by: Jens Axboe --- block/blk-throttle.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 413e668249cf..6fb5a2f9e1ee 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -645,7 +645,7 @@ static inline void throtl_start_new_slice_with_credit(struct throtl_grp *tg, * that bandwidth. Do try to make use of that bandwidth while giving * credit.
*/ - if (time_after_eq(start, tg->slice_start[rw])) + if (time_after(start, tg->slice_start[rw])) tg->slice_start[rw] = start; tg->slice_end[rw] = jiffies + tg->td->throtl_slice; -- cgit v1.2.3 From db1c7d77976775483a8ef240b4c705f113e13ea1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 6 Dec 2022 15:44:07 +0100 Subject: block: remove bio_copy_data_iter With the pktcdvd removal, bio_copy_data_iter is unused now. Fold the logic into bio_copy_data and remove the separate lower-level function. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20221206144407.722049-1-hch@lst.de Signed-off-by: Jens Axboe --- block/bio.c | 37 +++++++++++++++---------------------- include/linux/bio.h | 2 -- 2 files changed, 15 insertions(+), 24 deletions(-) (limited to 'block') diff --git a/block/bio.c b/block/bio.c index ab59a491a883..5f96fcae3f75 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1401,27 +1401,6 @@ void __bio_advance(struct bio *bio, unsigned bytes) } EXPORT_SYMBOL(__bio_advance); -void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter, - struct bio *src, struct bvec_iter *src_iter) -{ - while (src_iter->bi_size && dst_iter->bi_size) { - struct bio_vec src_bv = bio_iter_iovec(src, *src_iter); - struct bio_vec dst_bv = bio_iter_iovec(dst, *dst_iter); - unsigned int bytes = min(src_bv.bv_len, dst_bv.bv_len); - void *src_buf = bvec_kmap_local(&src_bv); - void *dst_buf = bvec_kmap_local(&dst_bv); - - memcpy(dst_buf, src_buf, bytes); - - kunmap_local(dst_buf); - kunmap_local(src_buf); - - bio_advance_iter_single(src, src_iter, bytes); - bio_advance_iter_single(dst, dst_iter, bytes); - } -} -EXPORT_SYMBOL(bio_copy_data_iter); - /** * bio_copy_data - copy contents of data buffers from one bio to another * @src: source bio @@ -1435,7 +1414,21 @@ void bio_copy_data(struct bio *dst, struct bio *src) struct bvec_iter src_iter = src->bi_iter; struct bvec_iter dst_iter = dst->bi_iter; - bio_copy_data_iter(dst, &dst_iter, src, &src_iter); + while (src_iter.bi_size && dst_iter.bi_size) { + struct bio_vec src_bv = bio_iter_iovec(src, src_iter); + struct bio_vec dst_bv = bio_iter_iovec(dst, dst_iter); + unsigned int bytes = min(src_bv.bv_len, dst_bv.bv_len); + void *src_buf = bvec_kmap_local(&src_bv); + void *dst_buf = bvec_kmap_local(&dst_bv); + + memcpy(dst_buf, src_buf, bytes); + + kunmap_local(dst_buf); + kunmap_local(src_buf); + + bio_advance_iter_single(src, &src_iter, bytes); + bio_advance_iter_single(dst, &dst_iter, bytes); + } } EXPORT_SYMBOL(bio_copy_data); diff --git a/include/linux/bio.h b/include/linux/bio.h index 2c5806997bbf..b231a665682a 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -475,8 +475,6 @@ void __bio_release_pages(struct bio *bio, bool mark_dirty); extern void bio_set_pages_dirty(struct bio *bio); extern void bio_check_pages_dirty(struct bio *bio); -extern void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter, - struct bio *src, struct bvec_iter *src_iter); extern void bio_copy_data(struct bio *dst, struct bio *src); extern void bio_free_pages(struct bio *bio); void guard_bio_eod(struct bio *bio); -- cgit v1.2.3 From 37754595e94779db869e6ef803f038fa956d08ff Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Tue, 6 Dec 2022 17:33:07 +0800 Subject: blk-cgroup: Fix typo in comment Replace assocating with associating. Replace intiailized with initialized.
Signed-off-by: Kemeng Shi Acked-by: Tejun Heo Reviewed-by: Mukesh Ojha Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20221206093307.378249-1-shikemeng@huaweicloud.com Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 0e225ca7bd9a..ce09e56c35a8 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -610,7 +610,7 @@ EXPORT_SYMBOL_GPL(blkcg_print_blkgs); * @pd: policy private data of interest * @v: value to print * - * Print @v to @sf for the device assocaited with @pd. + * Print @v to @sf for the device associated with @pd. */ u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v) { @@ -798,7 +798,7 @@ EXPORT_SYMBOL_GPL(blkg_conf_prep); /** * blkg_conf_finish - finish up per-blkg config update - * @ctx: blkg_conf_ctx intiailized by blkg_conf_prep() + * @ctx: blkg_conf_ctx initialized by blkg_conf_prep() * * Finish up after per-blkg config update. This function must be paired * with blkg_conf_prep(). -- cgit v1.2.3 From c1f480b2d092960ecf8bb0bd1f27982c33ada42a Mon Sep 17 00:00:00 2001 From: Luca Boccassi Date: Tue, 6 Dec 2022 09:29:13 +0000 Subject: sed-opal: allow using IOC_OPAL_SAVE for locking too MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Usually when closing a crypto device (e.g. dm-crypt with LUKS) the volume key is not required, as it requires root privileges anyway, and root can deny access to a disk in many ways regardless. Requiring the volume key to lock the device is a peculiarity of the OPAL specification. Given that we might already have saved the key if the user requested it via the 'IOC_OPAL_SAVE' ioctl, we can use that key to lock the device when no key is provided here, the locking range matches, and the user set the appropriate flag with 'IOC_OPAL_SAVE'. This allows integrating OPAL with tools and libraries that are used to the common behaviour and do not ask for the volume key when closing a device. Callers can always pass a non-zero key and it will be used regardless, as before. Suggested-by: Štěpán Horáček Signed-off-by: Luca Boccassi Reviewed-by: Christoph Hellwig Reviewed-by: Christian Brauner Link: https://lore.kernel.org/r/20221206092913.4625-1-luca.boccassi@gmail.com Signed-off-by: Jens Axboe --- block/sed-opal.c | 39 +++++++++++++++++++++++++++++++++++++++ include/uapi/linux/sed-opal.h | 8 +++++++- 2 files changed, 46 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/sed-opal.c b/block/sed-opal.c index 2c5327a0543a..1f926c0973f9 100644 --- a/block/sed-opal.c +++ b/block/sed-opal.c @@ -2437,6 +2437,44 @@ static int __opal_set_mbr_done(struct opal_dev *dev, struct opal_key *key) return execute_steps(dev, mbrdone_step, ARRAY_SIZE(mbrdone_step)); } +static void opal_lock_check_for_saved_key(struct opal_dev *dev, + struct opal_lock_unlock *lk_unlk) +{ + struct opal_suspend_data *iter; + + if (lk_unlk->l_state != OPAL_LK || + lk_unlk->session.opal_key.key_len > 0) + return; + + /* + * Usually when closing a crypto device (eg: dm-crypt with LUKS) the + * volume key is not required, as it requires root privileges anyway, + * and root can deny access to a disk in many ways regardless. + * Requiring the volume key to lock the device is a peculiarity of the + * OPAL specification.
Given we might already have saved the key if + * the user requested it via the 'IOC_OPAL_SAVE' ioctl, we can use + * that key to lock the device if no key was provided here, the + * locking range matches and the appropriate flag was passed with + * 'IOC_OPAL_SAVE'. + * This allows integrating OPAL with tools and libraries that are used + * to the common behaviour and do not ask for the volume key when + * closing a device. + */ + setup_opal_dev(dev); + list_for_each_entry(iter, &dev->unlk_lst, node) { + if ((iter->unlk.flags & OPAL_SAVE_FOR_LOCK) && + iter->lr == lk_unlk->session.opal_key.lr && + iter->unlk.session.opal_key.key_len > 0) { + lk_unlk->session.opal_key.key_len = + iter->unlk.session.opal_key.key_len; + memcpy(lk_unlk->session.opal_key.key, + iter->unlk.session.opal_key.key, + iter->unlk.session.opal_key.key_len); + break; + } + } +} + static int opal_lock_unlock(struct opal_dev *dev, struct opal_lock_unlock *lk_unlk) { @@ -2446,6 +2484,7 @@ static int opal_lock_unlock(struct opal_dev *dev, return -EINVAL; mutex_lock(&dev->dev_lock); + opal_lock_check_for_saved_key(dev, lk_unlk); ret = __opal_lock_unlock(dev, lk_unlk); mutex_unlock(&dev->dev_lock); diff --git a/include/uapi/linux/sed-opal.h b/include/uapi/linux/sed-opal.h index 2573772e2fb3..1fed3c9294fc 100644 --- a/include/uapi/linux/sed-opal.h +++ b/include/uapi/linux/sed-opal.h @@ -44,6 +44,11 @@ enum opal_lock_state { OPAL_LK = 0x04, /* 0100 */ }; +enum opal_lock_flags { + /* IOC_OPAL_SAVE will also store the provided key for locking */ + OPAL_SAVE_FOR_LOCK = 0x01, +}; + struct opal_key { __u8 lr; __u8 key_len; @@ -76,7 +81,8 @@ struct opal_user_lr_setup { struct opal_lock_unlock { struct opal_session_info session; __u32 l_state; - __u8 __align[4]; + __u16 flags; + __u8 __align[2]; }; struct opal_new_pw { -- cgit v1.2.3
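To show the intended userspace flow end to end, here is a hedged sketch. The device path and key bytes are placeholders, error handling is trimmed, and it assumes a uapi sed-opal.h recent enough to carry the new flags field and OPAL_SAVE_FOR_LOCK:

#include <fcntl.h>
#include <linux/sed-opal.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(void)
{
	struct opal_lock_unlock lk;
	int fd = open("/dev/nvme0n1", O_RDWR);	/* placeholder device */

	if (fd < 0)
		return 1;

	/* Unlock range 0 and save the key, opting in to reuse at lock
	 * time via the new OPAL_SAVE_FOR_LOCK flag. */
	memset(&lk, 0, sizeof(lk));
	lk.session.who = OPAL_ADMIN1;
	lk.session.opal_key.lr = 0;
	lk.session.opal_key.key_len = 8;	/* placeholder key */
	memcpy(lk.session.opal_key.key, "passw0rd", 8);
	lk.l_state = OPAL_RW;
	lk.flags = OPAL_SAVE_FOR_LOCK;
	if (ioctl(fd, IOC_OPAL_SAVE, &lk))
		return 1;

	/* Later: lock the same range with a zero-length key; the kernel
	 * falls back to the key saved above. */
	memset(&lk, 0, sizeof(lk));
	lk.session.who = OPAL_ADMIN1;
	lk.session.opal_key.lr = 0;
	lk.session.opal_key.key_len = 0;	/* triggers saved-key reuse */
	lk.l_state = OPAL_LK;
	if (ioctl(fd, IOC_OPAL_LOCK_UNLOCK, &lk))
		return 1;

	close(fd);
	return 0;
}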