From 64dc8c732f5c2b406cc752e6aaa1bd5471159cab Mon Sep 17 00:00:00 2001
From: Yu Kuai
Date: Wed, 14 Dec 2022 11:04:30 +0800
Subject: block, bfq: fix possible uaf for 'bfqq->bic'
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Our test reports a use-after-free (UAF) for 'bfqq->bic' in 5.10:

==================================================================
BUG: KASAN: use-after-free in bfq_select_queue+0x378/0xa30

CPU: 6 PID: 2318352 Comm: fsstress Kdump: loaded Not tainted 5.10.0-60.18.0.50.h602.kasan.eulerosv2r11.x86_64 #1
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.1-0-ga5cab58-20220320_160524-szxrtosci10000 04/01/2014
Call Trace:
 bfq_select_queue+0x378/0xa30
 bfq_dispatch_request+0xe8/0x130
 blk_mq_do_dispatch_sched+0x62/0xb0
 __blk_mq_sched_dispatch_requests+0x215/0x2a0
 blk_mq_sched_dispatch_requests+0x8f/0xd0
 __blk_mq_run_hw_queue+0x98/0x180
 __blk_mq_delay_run_hw_queue+0x22b/0x240
 blk_mq_run_hw_queue+0xe3/0x190
 blk_mq_sched_insert_requests+0x107/0x200
 blk_mq_flush_plug_list+0x26e/0x3c0
 blk_finish_plug+0x63/0x90
 __iomap_dio_rw+0x7b5/0x910
 iomap_dio_rw+0x36/0x80
 ext4_dio_read_iter+0x146/0x190 [ext4]
 ext4_file_read_iter+0x1e2/0x230 [ext4]
 new_sync_read+0x29f/0x400
 vfs_read+0x24e/0x2d0
 ksys_read+0xd5/0x1b0
 do_syscall_64+0x33/0x40
 entry_SYSCALL_64_after_hwframe+0x61/0xc6

Commit 3bc5e683c67d ("bfq: Split shared queues on move between cgroups")
changed things so that moving a process to a new cgroup allocates a new
bfqq for it to use. However, the old bfqq and the new bfqq can end up
pointing to the same bic:

1) Initial state: two processes with io in the same cgroup.

Process 1       Process 2
 (BIC1)          (BIC2)
  |  Λ            |  Λ
  |  |            |  |
  V  |            V  |
  bfqq1           bfqq2

2) bfqq1 is merged to bfqq2.

Process 1       Process 2
 (BIC1)          (BIC2)
  |               |
   \-------------\|
                  V
  bfqq1           bfqq2(coop)

3) Process 1 exits, then new io (denote it IOA) is issued from Process 2.

 (BIC2)
  |  Λ
  |  |
  V  |
  bfqq2(coop)

4) Before IOA is completed, Process 2 is moved to another cgroup and
   issues io.

Process 2
 (BIC2)
   Λ
   |\--------------\
   |                V
  bfqq2           bfqq3

Now BIC2 points to bfqq3, while both bfqq2 and bfqq3 point to BIC2. If
all the requests are completed and Process 2 exits, BIC2 will be freed,
and there is no guarantee that bfqq2 will be freed before BIC2.

Fix the problem by clearing bfqq->bic when bfqq is detached from the bic.

Fixes: 3bc5e683c67d ("bfq: Split shared queues on move between cgroups")
Suggested-by: Jan Kara
Signed-off-by: Yu Kuai
Reviewed-by: Jan Kara
Link: https://lore.kernel.org/r/20221214030430.3304151-1-yukuai1@huaweicloud.com
Signed-off-by: Jens Axboe
---
 block/bfq-iosched.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'block')

diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index a72304c728fc..b111a7b8dca6 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -386,6 +386,12 @@ static void bfq_put_stable_ref(struct bfq_queue *bfqq);
 
 void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, bool is_sync)
 {
+	struct bfq_queue *old_bfqq = bic->bfqq[is_sync];
+
+	/* Clear bic pointer if bfqq is detached from this bic */
+	if (old_bfqq && old_bfqq->bic == bic)
+		old_bfqq->bic = NULL;
+
 	/*
 	 * If bfqq != NULL, then a non-stable queue merge between
 	 * bic->bfqq and bfqq is happening here. This causes troubles
@@ -5311,7 +5317,6 @@ static void bfq_exit_icq_bfqq(struct bfq_io_cq *bic, bool is_sync)
 		unsigned long flags;
 
 		spin_lock_irqsave(&bfqd->lock, flags);
-		bfqq->bic = NULL;
 		bfq_exit_bfqq(bfqd, bfqq);
 		bic_set_bfqq(bic, NULL, is_sync);
 		spin_unlock_irqrestore(&bfqd->lock, flags);
--
cgit v1.2.3
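
The stale back-pointer pattern this fix guards against can be reduced to a
small standalone sketch (plain userspace C, for illustration only; the types
and the helper below merely mirror the bic/bfqq relationship and are not the
real BFQ definitions):

  #include <stdio.h>
  #include <stdlib.h>

  struct bfqq;

  struct bic {
          struct bfqq *bfqq;      /* forward pointer, redirected on cgroup move */
  };

  struct bfqq {
          struct bic *bic;        /* back-pointer that can outlive its bic */
  };

  /* Mirrors the fix: clear the old queue's back-pointer when detaching. */
  static void bic_set_bfqq(struct bic *bic, struct bfqq *new_bfqq)
  {
          struct bfqq *old_bfqq = bic->bfqq;

          if (old_bfqq && old_bfqq->bic == bic)
                  old_bfqq->bic = NULL;   /* without this, old_bfqq dangles */
          bic->bfqq = new_bfqq;
          if (new_bfqq)
                  new_bfqq->bic = bic;
  }

  int main(void)
  {
          struct bic *bic2 = calloc(1, sizeof(*bic2));
          struct bfqq *bfqq2 = calloc(1, sizeof(*bfqq2));
          struct bfqq *bfqq3 = calloc(1, sizeof(*bfqq3));

          bic_set_bfqq(bic2, bfqq2);      /* state 3): BIC2 <-> bfqq2 */
          bic_set_bfqq(bic2, bfqq3);      /* state 4): BIC2 redirected to bfqq3 */

          free(bic2);                     /* Process 2 exits, BIC2 is freed first */

          /* bfqq2 may still be dispatching; its back-pointer is already NULL */
          printf("bfqq2->bic = %p\n", (void *)bfqq2->bic);

          free(bfqq2);
          free(bfqq3);
          return 0;
  }
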

From 452af7dc59033a76372d51a24682503377872b11 Mon Sep 17 00:00:00 2001
From: Yu Kuai
Date: Wed, 14 Dec 2022 11:31:54 +0800
Subject: block, bfq: don't return bfqg from __bfq_bic_change_cgroup()

The return value is not used, hence remove it.

Signed-off-by: Yu Kuai
Reviewed-by: Jan Kara
Link: https://lore.kernel.org/r/20221214033155.3455754-2-yukuai1@huaweicloud.com
Signed-off-by: Jens Axboe
---
 block/bfq-cgroup.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

(limited to 'block')

diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
index 627476bc6495..23dc355a106d 100644
--- a/block/bfq-cgroup.c
+++ b/block/bfq-cgroup.c
@@ -724,9 +724,9 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
  * sure that the reference to cgroup is valid across the call (see
  * comments in bfq_bic_update_cgroup on this issue)
  */
-static void *__bfq_bic_change_cgroup(struct bfq_data *bfqd,
-				     struct bfq_io_cq *bic,
-				     struct bfq_group *bfqg)
+static void __bfq_bic_change_cgroup(struct bfq_data *bfqd,
+				    struct bfq_io_cq *bic,
+				    struct bfq_group *bfqg)
 {
 	struct bfq_queue *async_bfqq = bic_to_bfqq(bic, 0);
 	struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, 1);
@@ -776,8 +776,6 @@ static void *__bfq_bic_change_cgroup(struct bfq_data *bfqd,
 			}
 		}
 	}
-
-	return bfqg;
 }
 
 void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio)
--
cgit v1.2.3

From 337366e02b370d2800110fbc99940f6ddddcbdfa Mon Sep 17 00:00:00 2001
From: Yu Kuai
Date: Wed, 14 Dec 2022 11:31:55 +0800
Subject: block, bfq: replace 0/1 with false/true in bic apis

Just to make the code a little cleaner; there are no functional changes.

Signed-off-by: Yu Kuai
Reviewed-by: Jan Kara
Link: https://lore.kernel.org/r/20221214033155.3455754-3-yukuai1@huaweicloud.com
Signed-off-by: Jens Axboe
---
 block/bfq-cgroup.c  | 8 ++++----
 block/bfq-iosched.c | 4 ++--
 2 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'block')

diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
index 23dc355a106d..1b2829e99dad 100644
--- a/block/bfq-cgroup.c
+++ b/block/bfq-cgroup.c
@@ -728,15 +728,15 @@ static void __bfq_bic_change_cgroup(struct bfq_data *bfqd,
 				    struct bfq_io_cq *bic,
 				    struct bfq_group *bfqg)
 {
-	struct bfq_queue *async_bfqq = bic_to_bfqq(bic, 0);
-	struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, 1);
+	struct bfq_queue *async_bfqq = bic_to_bfqq(bic, false);
+	struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, true);
 	struct bfq_entity *entity;
 
 	if (async_bfqq) {
 		entity = &async_bfqq->entity;
 
 		if (entity->sched_data != &bfqg->sched_data) {
-			bic_set_bfqq(bic, NULL, 0);
+			bic_set_bfqq(bic, NULL, false);
 			bfq_release_process_ref(bfqd, async_bfqq);
 		}
 	}
@@ -772,7 +772,7 @@ static void __bfq_bic_change_cgroup(struct bfq_data *bfqd,
 			 */
 			bfq_put_cooperator(sync_bfqq);
 			bfq_release_process_ref(bfqd, sync_bfqq);
-			bic_set_bfqq(bic, NULL, 1);
+			bic_set_bfqq(bic, NULL, true);
 		}
 	}
 }
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index b111a7b8dca6..dc576b90ddfe 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -3114,7 +3114,7 @@ bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
 	/*
 	 * Merge queues (that is, let bic redirect its requests to new_bfqq)
 	 */
-	bic_set_bfqq(bic, new_bfqq, 1);
+	bic_set_bfqq(bic, new_bfqq, true);
 	bfq_mark_bfqq_coop(new_bfqq);
 	/*
 	 * new_bfqq now belongs to at least two bics (it is a shared queue):
@@ -6562,7 +6562,7 @@ bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq)
 		return bfqq;
 	}
 
-	bic_set_bfqq(bic, NULL, 1);
+	bic_set_bfqq(bic, NULL, true);
 
 	bfq_put_cooperator(bfqq);
 
--
cgit v1.2.3

From ff1cc97b1f4c10db224f276d9615b22835b8c424 Mon Sep 17 00:00:00 2001
From: "Jiri Slaby (SUSE)"
Date: Tue, 13 Dec 2022 13:08:26 +0100
Subject: block/blk-iocost (gcc13): keep large values in a new enum

Since gcc13, each member of an enum has the same type as the enum [1],
and that type is inherited from the enum's members. Given:

  VTIME_PER_SEC_SHIFT = 37,
  VTIME_PER_SEC       = 1LLU << VTIME_PER_SEC_SHIFT,
  ...
  AUTOP_CYCLE_NSEC    = 10LLU * NSEC_PER_SEC,

the resulting type is unsigned long. This generates warnings with gcc-13:

  block/blk-iocost.c: In function 'ioc_weight_prfill':
  block/blk-iocost.c:3037:37: error: format '%u' expects argument of type 'unsigned int', but argument 4 has type 'long unsigned int'

  block/blk-iocost.c: In function 'ioc_weight_show':
  block/blk-iocost.c:3047:34: error: format '%u' expects argument of type 'unsigned int', but argument 3 has type 'long unsigned int'

So split the large values out of the anonymous enum into a separate enum,
so that they don't affect the other members.

[1] https://gcc.gnu.org/bugzilla/show_bug.cgi?id=36113

Cc: Martin Liska
Cc: Tejun Heo
Cc: Josef Bacik
Cc: Jens Axboe
Cc: cgroups@vger.kernel.org
Cc: linux-block@vger.kernel.org
Signed-off-by: Jiri Slaby (SUSE)
Link: https://lore.kernel.org/r/20221213120826.17446-1-jirislaby@kernel.org
Signed-off-by: Jens Axboe
---
 block/blk-iocost.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'block')

diff --git a/block/blk-iocost.c b/block/blk-iocost.c
index d1bdc12deaa7..549ddc9e0c6f 100644
--- a/block/blk-iocost.c
+++ b/block/blk-iocost.c
@@ -232,7 +232,9 @@ enum {
 
 	/* 1/64k is granular enough and can easily be handled w/ u32 */
 	WEIGHT_ONE = 1 << 16,
+};
 
+enum {
 	/*
 	 * As vtime is used to calculate the cost of each IO, it needs to
 	 * be fairly high precision. For example, it should be able to
--
cgit v1.2.3
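
The type-widening behaviour being worked around can be seen in a small
standalone program (a sketch, not kernel code; the constant names here are
made up):

  #include <stdio.h>

  /* One anonymous enum mixing small and 64-bit values (the old layout). */
  enum {
          SMALL_MIXED = 1 << 16,
          HUGE_MIXED  = 1LLU << 37,       /* forces a 64-bit enum type */
  };

  /* Large values split into their own enum (the new layout). */
  enum {
          SMALL_SPLIT = 1 << 16,
  };
  enum {
          HUGE_SPLIT = 1LLU << 37,
  };

  int main(void)
  {
          /*
           * With gcc >= 13, SMALL_MIXED takes the enum's 64-bit type, so
           * sizeof reports 8 and passing it to a "%u" format triggers the
           * -Wformat warnings quoted above; SMALL_SPLIT stays int-sized.
           */
          printf("mixed: %zu, split: %zu\n",
                 sizeof(SMALL_MIXED), sizeof(SMALL_SPLIT));
          return 0;
  }
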

From 813e693023ba10da9e75067780f8378465bf27cc Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Sat, 10 Dec 2022 08:33:10 -1000
Subject: blk-iolatency: Fix memory leak on add_disk() failures

When a gendisk is successfully initialized but add_disk() fails - for
example, when a loop device is given an invalid number of minor device
numbers - blkcg_init_disk() is called during init and then
blkcg_exit_disk() during error handling. Unfortunately, iolatency gets
initialized in the former but doesn't get cleaned up in the latter.

This is because, in non-error cases, the cleanup is performed by
del_gendisk() calling rq_qos_exit(), the assumption being that rq_qos
policies, iolatency being one of them, can only be activated once the
disk is fully registered and visible. That assumption is true for wbt
and iocost, but not for iolatency, which gets initialized before
add_disk() is called.

It is desirable to lazy-init rq_qos policies because they are optional
features and add to hot path overhead once initialized - each IO has to
walk all the registered rq_qos policies. So, we want to switch iolatency
to lazy init too. However, that's a bigger change. As a fix for the
immediate problem, let's just add an extra call to rq_qos_exit() in
blkcg_exit_disk(). This is safe because duplicate calls to rq_qos_exit()
become no-ops.

Signed-off-by: Tejun Heo
Reported-by: darklight2357@icloud.com
Cc: Josef Bacik
Cc: Linus Torvalds
Fixes: d70675121546 ("block: introduce blk-iolatency io controller")
Cc: stable@vger.kernel.org # v4.19+
Reviewed-by: Christoph Hellwig
Link: https://lore.kernel.org/r/Y5TQ5gm3O4HXrXR3@slm.duckdns.org
Signed-off-by: Jens Axboe
---
 block/blk-cgroup.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'block')

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 50ac0dce95b8..ce6a2b7d3dfb 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -33,6 +33,7 @@
 #include "blk-cgroup.h"
 #include "blk-ioprio.h"
 #include "blk-throttle.h"
+#include "blk-rq-qos.h"
 
 /*
  * blkcg_pol_mutex protects blkcg_policy[] and policy [de]activation.
@@ -1322,6 +1323,7 @@ err_unlock:
 void blkcg_exit_disk(struct gendisk *disk)
 {
 	blkg_destroy_all(disk);
+	rq_qos_exit(disk->queue);
 	blk_throtl_exit(disk);
 }
 
--
cgit v1.2.3
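
The safety argument rests on the idempotence noted above: rq_qos_exit()
pops every policy still attached to the queue, so a second call finds
nothing left to do. A simplified sketch of that pattern (hypothetical
stand-in types, not the real blk-rq-qos implementation):

  #include <stdio.h>
  #include <stdlib.h>

  struct rq_qos {
          struct rq_qos *next;
          const char *name;
  };

  struct request_queue {
          struct rq_qos *rq_qos;          /* list of active policies */
  };

  /* Pops and tears down every policy; calling it again is a no-op. */
  static void rq_qos_exit(struct request_queue *q)
  {
          while (q->rq_qos) {
                  struct rq_qos *rqos = q->rq_qos;

                  q->rq_qos = rqos->next;
                  printf("exiting %s\n", rqos->name);
                  free(rqos);
          }
  }

  int main(void)
  {
          struct request_queue q = { .rq_qos = NULL };
          struct rq_qos *iolat = malloc(sizeof(*iolat));

          iolat->name = "iolatency";      /* activated before add_disk() */
          iolat->next = q.rq_qos;
          q.rq_qos = iolat;

          rq_qos_exit(&q);        /* e.g. from blkcg_exit_disk() on add_disk() failure */
          rq_qos_exit(&q);        /* duplicate call: list already empty, nothing happens */
          return 0;
  }
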

From 1eb206208b0f3f707c67134ef6ba394410effb67 Mon Sep 17 00:00:00 2001
From: Yuwei Guan
Date: Thu, 10 Nov 2022 19:26:22 +0800
Subject: block, bfq: only do counting of pending-request for BFQ_GROUP_IOSCHED

The 'bfqd->num_groups_with_pending_reqs' counter is only used when
CONFIG_BFQ_GROUP_IOSCHED is enabled, so only define the related fields
and compile the code that maintains them when CONFIG_BFQ_GROUP_IOSCHED
is enabled.

Cc: Yu Kuai
Signed-off-by: Yuwei Guan
Reviewed-by: Jan Kara
Reviewed-by: Yu Kuai
Link: https://lore.kernel.org/r/20221110112622.389332-1-Yuwei.Guan@zeekrlife.com
Signed-off-by: Jens Axboe
---
 block/bfq-iosched.c | 2 ++
 block/bfq-iosched.h | 4 ++++
 block/bfq-wf2q.c    | 8 ++++----
 3 files changed, 10 insertions(+), 4 deletions(-)

(limited to 'block')

diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index dc576b90ddfe..16f43bbc575a 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -7063,7 +7063,9 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
 	bfqd->idle_slice_timer.function = bfq_idle_slice_timer;
 
 	bfqd->queue_weights_tree = RB_ROOT_CACHED;
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
 	bfqd->num_groups_with_pending_reqs = 0;
+#endif
 
 	INIT_LIST_HEAD(&bfqd->active_list);
 	INIT_LIST_HEAD(&bfqd->idle_list);
diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h
index 9fa89577322d..41aa151ccc22 100644
--- a/block/bfq-iosched.h
+++ b/block/bfq-iosched.h
@@ -197,8 +197,10 @@ struct bfq_entity {
 	/* flag, set to request a weight, ioprio or ioprio_class change */
 	int prio_changed;
 
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
 	/* flag, set if the entity is counted in groups_with_pending_reqs */
 	bool in_groups_with_pending_reqs;
+#endif
 
 	/* last child queue of entity created (for non-leaf entities) */
 	struct bfq_queue *last_bfqq_created;
@@ -491,6 +493,7 @@ struct bfq_data {
 	 */
 	struct rb_root_cached queue_weights_tree;
 
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
 	/*
 	 * Number of groups with at least one process that
 	 * has at least one request waiting for completion. Note that
@@ -538,6 +541,7 @@ struct bfq_data {
 	 * with no request waiting for completion.
 	 */
 	unsigned int num_groups_with_pending_reqs;
+#endif
 
 	/*
 	 * Per-class (RT, BE, IDLE) number of bfq_queues containing
diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c
index b02b53658ed4..ea4c3d757fdd 100644
--- a/block/bfq-wf2q.c
+++ b/block/bfq-wf2q.c
@@ -1612,28 +1612,28 @@ void bfq_requeue_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
 
 void bfq_add_bfqq_in_groups_with_pending_reqs(struct bfq_queue *bfqq)
 {
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
 	struct bfq_entity *entity = &bfqq->entity;
 
 	if (!entity->in_groups_with_pending_reqs) {
 		entity->in_groups_with_pending_reqs = true;
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
 		if (!(bfqq_group(bfqq)->num_queues_with_pending_reqs++))
 			bfqq->bfqd->num_groups_with_pending_reqs++;
-#endif
 	}
+#endif
 }
 
 void bfq_del_bfqq_in_groups_with_pending_reqs(struct bfq_queue *bfqq)
 {
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
 	struct bfq_entity *entity = &bfqq->entity;
 
 	if (entity->in_groups_with_pending_reqs) {
 		entity->in_groups_with_pending_reqs = false;
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
 		if (!(--bfqq_group(bfqq)->num_queues_with_pending_reqs))
 			bfqq->bfqd->num_groups_with_pending_reqs--;
-#endif
 	}
+#endif
 }
 
 /*
--
cgit v1.2.3

From d36a9ea5e7766961e753ee38d4c331bbe6ef659b Mon Sep 17 00:00:00 2001
From: Ming Lei
Date: Thu, 15 Dec 2022 10:16:29 +0800
Subject: block: fix use-after-free of q->q_usage_counter

For blk-mq, the queue release handler is usually called after
blk_mq_freeze_queue_wait() returns. However, the q_usage_counter->release()
handler may not have run yet at that point, which can cause a
use-after-free.

Fix the issue by moving percpu_ref_exit() into blk_free_queue_rcu(). Since
->release() is called with the RCU read lock held, it is agreed that the
race should be covered in the caller, per the discussion in the two links
below.

Reported-by: Zhang Wensheng
Reported-by: Zhong Jinghua
Link: https://lore.kernel.org/linux-block/Y5prfOjyyjQKUrtH@T590/T/#u
Link: https://lore.kernel.org/lkml/Y4%2FmzMd4evRg9yDi@fedora/
Cc: Hillf Danton
Cc: Yu Kuai
Cc: Dennis Zhou
Fixes: 2b0d3d3e4fcf ("percpu_ref: reduce memory footprint of percpu_ref in fast path")
Signed-off-by: Ming Lei
Link: https://lore.kernel.org/r/20221215021629.74870-1-ming.lei@redhat.com
Signed-off-by: Jens Axboe
---
 block/blk-core.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'block')

diff --git a/block/blk-core.c b/block/blk-core.c
index 3866b6c4cd88..9321767470dc 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -254,14 +254,15 @@ EXPORT_SYMBOL_GPL(blk_clear_pm_only);
 
 static void blk_free_queue_rcu(struct rcu_head *rcu_head)
 {
-	kmem_cache_free(blk_requestq_cachep,
-			container_of(rcu_head, struct request_queue, rcu_head));
+	struct request_queue *q = container_of(rcu_head,
+			struct request_queue, rcu_head);
+
+	percpu_ref_exit(&q->q_usage_counter);
+	kmem_cache_free(blk_requestq_cachep, q);
 }
 
 static void blk_free_queue(struct request_queue *q)
 {
-	percpu_ref_exit(&q->q_usage_counter);
-
 	if (q->poll_stat)
 		blk_stat_remove_callback(q, q->poll_cb);
 	blk_stat_free_callback(q->poll_cb);
--
cgit v1.2.3
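
The ordering the patch enforces can be modelled with a toy program
(illustrative only; the deferred call stands in for the RCU grace period
and all names here are hypothetical, not the blk-core implementation):

  #include <assert.h>
  #include <stdio.h>
  #include <stdlib.h>

  struct queue {
          int counter_valid;      /* stands in for percpu_ref internals */
  };

  /*
   * Models blk_free_queue_rcu(): the final teardown happens here, after
   * the grace period, so nothing that ran before it sees exited state.
   */
  static void free_queue_deferred(struct queue *q)
  {
          q->counter_valid = 0;   /* models percpu_ref_exit() */
          free(q);                /* models kmem_cache_free() */
  }

  /*
   * Models a late q_usage_counter ->release() running just before the
   * grace period ends: it must still find the counter intact.
   */
  static void late_release(struct queue *q)
  {
          assert(q->counter_valid);
          printf("release handler saw a valid counter\n");
  }

  int main(void)
  {
          struct queue *q = malloc(sizeof(*q));

          q->counter_valid = 1;

          /*
           * With the old ordering, percpu_ref_exit() ran at this point
           * (in blk_free_queue()), so late_release() could observe
           * torn-down state: the use-after-free described above.
           */
          late_release(q);                /* may still run here */
          free_queue_deferred(q);         /* deferred free, counter exits last */
          return 0;
  }
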