diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2022-03-21 16:48:55 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2022-03-21 16:48:55 -0700 |
commit | 616355cc818c6ddadc393fdfd4491f94458cb715 (patch) | |
tree | a96907b179ccbeb84cd1df5515cd10c90eb5bd59 /block/blk-throttle.c | |
parent | b080cee72ef355669cbc52ff55dc513d37433600 (diff) | |
parent | 8f9e7b65f833cb9a4b2e2f54a049d74df394d906 (diff) | |
download | linux-616355cc818c6ddadc393fdfd4491f94458cb715.tar.bz2 |
Merge tag 'for-5.18/block-2022-03-18' of git://git.kernel.dk/linux-block
Pull block updates from Jens Axboe:
- BFQ cleanups and fixes (Yu, Zhang, Yahu, Paolo)
- blk-rq-qos completion fix (Tejun)
- blk-cgroup merge fix (Tejun)
- Add offline error return value to distinguish it from an IO error on
the device (Song)
- IO stats fixes (Zhang, Christoph)
- blkcg refcount fixes (Ming, Yu)
- Fix for indefinite dispatch loop softlockup (Shin'ichiro)
- blk-mq hardware queue management improvements (Ming)
- sbitmap dead code removal (Ming, John)
- Plugging merge improvements (me)
- Show blk-crypto capabilities in sysfs (Eric)
- Multiple delayed queue run improvement (David)
- Block throttling fixes (Ming)
- Start deprecating auto module loading based on dev_t (Christoph)
- bio allocation improvements (Christoph, Chaitanya)
- Get rid of bio_devname (Christoph)
- bio clone improvements (Christoph)
- Block plugging improvements (Christoph)
- Get rid of genhd.h header (Christoph)
- Ensure drivers use appropriate flush helpers (Christoph)
- Refcounting improvements (Christoph)
- Queue initialization and teardown improvements (Ming, Christoph)
- Misc fixes/improvements (Barry, Chaitanya, Colin, Dan, Jiapeng,
Lukas, Nian, Yang, Eric, Chengming)
* tag 'for-5.18/block-2022-03-18' of git://git.kernel.dk/linux-block: (127 commits)
block: cancel all throttled bios in del_gendisk()
block: let blkcg_gq grab request queue's refcnt
block: avoid use-after-free on throttle data
block: limit request dispatch loop duration
block/bfq-iosched: Fix spelling mistake "tenative" -> "tentative"
sr: simplify the local variable initialization in sr_block_open()
block: don't merge across cgroup boundaries if blkcg is enabled
block: fix rq-qos breakage from skipping rq_qos_done_bio()
block: flush plug based on hardware and software queue order
block: ensure plug merging checks the correct queue at least once
block: move rq_qos_exit() into disk_release()
block: do more work in elevator_exit
block: move blk_exit_queue into disk_release
block: move q_usage_counter release into blk_queue_release
block: don't remove hctx debugfs dir from blk_mq_exit_queue
block: move blkcg initialization/destroy into disk allocation/release handler
sr: implement ->free_disk to simplify refcounting
sd: implement ->free_disk to simplify refcounting
sd: delay calling free_opal_dev
sd: call sd_zbc_release_disk before releasing the scsi_device reference
...
Diffstat (limited to 'block/blk-throttle.c')
-rw-r--r-- | block/blk-throttle.c | 110 |
1 file changed, 68 insertions, 42 deletions
diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 7c462c006b26..469c483719be 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -10,7 +10,6 @@ #include <linux/blkdev.h> #include <linux/bio.h> #include <linux/blktrace_api.h> -#include <linux/blk-cgroup.h> #include "blk.h" #include "blk-cgroup-rwstat.h" #include "blk-stat.h" @@ -42,11 +41,6 @@ /* A workqueue to queue throttle related work */ static struct workqueue_struct *kthrotld_workqueue; -enum tg_state_flags { - THROTL_TG_PENDING = 1 << 0, /* on parent's pending tree */ - THROTL_TG_WAS_EMPTY = 1 << 1, /* bio_lists[] became non-empty */ -}; - #define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node) /* We measure latency for request size from <= 4k to >= 1M */ @@ -426,12 +420,24 @@ static void tg_update_has_rules(struct throtl_grp *tg) struct throtl_grp *parent_tg = sq_to_tg(tg->service_queue.parent_sq); struct throtl_data *td = tg->td; int rw; + int has_iops_limit = 0; + + for (rw = READ; rw <= WRITE; rw++) { + unsigned int iops_limit = tg_iops_limit(tg, rw); - for (rw = READ; rw <= WRITE; rw++) tg->has_rules[rw] = (parent_tg && parent_tg->has_rules[rw]) || (td->limit_valid[td->limit_index] && (tg_bps_limit(tg, rw) != U64_MAX || - tg_iops_limit(tg, rw) != UINT_MAX)); + iops_limit != UINT_MAX)); + + if (iops_limit != UINT_MAX) + has_iops_limit = 1; + } + + if (has_iops_limit) + tg->flags |= THROTL_TG_HAS_IOPS_LIMIT; + else + tg->flags &= ~THROTL_TG_HAS_IOPS_LIMIT; } static void throtl_pd_online(struct blkg_policy_data *pd) @@ -634,8 +640,6 @@ static inline void throtl_start_new_slice_with_credit(struct throtl_grp *tg, tg->bytes_disp[rw] = 0; tg->io_disp[rw] = 0; - atomic_set(&tg->io_split_cnt[rw], 0); - /* * Previous slice has expired. We must have trimmed it after last * bio dispatch. 
That means since start of last slice, we never used @@ -659,8 +663,6 @@ static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw) tg->slice_start[rw] = jiffies; tg->slice_end[rw] = jiffies + tg->td->throtl_slice; - atomic_set(&tg->io_split_cnt[rw], 0); - throtl_log(&tg->service_queue, "[%c] new slice start=%lu end=%lu jiffies=%lu", rw == READ ? 'R' : 'W', tg->slice_start[rw], @@ -808,7 +810,8 @@ static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio, unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd; unsigned int bio_size = throtl_bio_data_size(bio); - if (bps_limit == U64_MAX) { + /* no need to throttle if this bio's bytes have been accounted */ + if (bps_limit == U64_MAX || bio_flagged(bio, BIO_THROTTLED)) { if (wait) *wait = 0; return true; @@ -871,7 +874,8 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio, bio != throtl_peek_queued(&tg->service_queue.queued[rw])); /* If tg->bps = -1, then BW is unlimited */ - if (bps_limit == U64_MAX && iops_limit == UINT_MAX) { + if ((bps_limit == U64_MAX && iops_limit == UINT_MAX) || + tg->flags & THROTL_TG_CANCELING) { if (wait) *wait = 0; return true; @@ -893,9 +897,6 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio, jiffies + tg->td->throtl_slice); } - if (iops_limit != UINT_MAX) - tg->io_disp[rw] += atomic_xchg(&tg->io_split_cnt[rw], 0); - if (tg_with_in_bps_limit(tg, bio, bps_limit, &bps_wait) && tg_with_in_iops_limit(tg, bio, iops_limit, &iops_wait)) { if (wait) @@ -920,9 +921,12 @@ static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio) unsigned int bio_size = throtl_bio_data_size(bio); /* Charge the bio to the group */ - tg->bytes_disp[rw] += bio_size; + if (!bio_flagged(bio, BIO_THROTTLED)) { + tg->bytes_disp[rw] += bio_size; + tg->last_bytes_disp[rw] += bio_size; + } + tg->io_disp[rw]++; - tg->last_bytes_disp[rw] += bio_size; tg->last_io_disp[rw]++; /* @@ -1134,12 +1138,22 @@ static void throtl_pending_timer_fn(struct 
timer_list *t) struct throtl_service_queue *sq = from_timer(sq, t, pending_timer); struct throtl_grp *tg = sq_to_tg(sq); struct throtl_data *td = sq_to_td(sq); - struct request_queue *q = td->queue; struct throtl_service_queue *parent_sq; + struct request_queue *q; bool dispatched; int ret; + /* throtl_data may be gone, so figure out request queue by blkg */ + if (tg) + q = tg->pd.blkg->q; + else + q = td->queue; + spin_lock_irq(&q->queue_lock); + + if (!q->root_blkg) + goto out_unlock; + if (throtl_can_upgrade(td, NULL)) throtl_upgrade_state(td); @@ -1219,7 +1233,7 @@ static void blk_throtl_dispatch_work_fn(struct work_struct *work) if (!bio_list_empty(&bio_list_on_stack)) { blk_start_plug(&plug); while ((bio = bio_list_pop(&bio_list_on_stack))) - submit_bio_noacct(bio); + submit_bio_noacct_nocheck(bio); blk_finish_plug(&plug); } } @@ -1763,6 +1777,39 @@ static bool throtl_hierarchy_can_upgrade(struct throtl_grp *tg) return false; } +void blk_throtl_cancel_bios(struct request_queue *q) +{ + struct cgroup_subsys_state *pos_css; + struct blkcg_gq *blkg; + + spin_lock_irq(&q->queue_lock); + /* + * queue_lock is held, rcu lock is not needed here technically. + * However, rcu lock is still held to emphasize that following + * path need RCU protection and to prevent warning from lockdep. + */ + rcu_read_lock(); + blkg_for_each_descendant_post(blkg, pos_css, q->root_blkg) { + struct throtl_grp *tg = blkg_to_tg(blkg); + struct throtl_service_queue *sq = &tg->service_queue; + + /* + * Set the flag to make sure throtl_pending_timer_fn() won't + * stop until all throttled bios are dispatched. + */ + blkg_to_tg(blkg)->flags |= THROTL_TG_CANCELING; + /* + * Update disptime after setting the above flag to make sure + * throtl_select_dispatch() won't exit without dispatching. 
+ */ + tg_update_disptime(tg); + + throtl_schedule_pending_timer(sq, jiffies + 1); + } + rcu_read_unlock(); + spin_unlock_irq(&q->queue_lock); +} + static bool throtl_can_upgrade(struct throtl_data *td, struct throtl_grp *this_tg) { @@ -1917,14 +1964,12 @@ static void throtl_downgrade_check(struct throtl_grp *tg) } if (tg->iops[READ][LIMIT_LOW]) { - tg->last_io_disp[READ] += atomic_xchg(&tg->last_io_split_cnt[READ], 0); iops = tg->last_io_disp[READ] * HZ / elapsed_time; if (iops >= tg->iops[READ][LIMIT_LOW]) tg->last_low_overflow_time[READ] = now; } if (tg->iops[WRITE][LIMIT_LOW]) { - tg->last_io_disp[WRITE] += atomic_xchg(&tg->last_io_split_cnt[WRITE], 0); iops = tg->last_io_disp[WRITE] * HZ / elapsed_time; if (iops >= tg->iops[WRITE][LIMIT_LOW]) tg->last_low_overflow_time[WRITE] = now; @@ -2043,25 +2088,6 @@ static inline void throtl_update_latency_buckets(struct throtl_data *td) } #endif -void blk_throtl_charge_bio_split(struct bio *bio) -{ - struct blkcg_gq *blkg = bio->bi_blkg; - struct throtl_grp *parent = blkg_to_tg(blkg); - struct throtl_service_queue *parent_sq; - bool rw = bio_data_dir(bio); - - do { - if (!parent->has_rules[rw]) - break; - - atomic_inc(&parent->io_split_cnt[rw]); - atomic_inc(&parent->last_io_split_cnt[rw]); - - parent_sq = parent->service_queue.parent_sq; - parent = sq_to_tg(parent_sq); - } while (parent); -} - bool __blk_throtl_bio(struct bio *bio) { struct request_queue *q = bdev_get_queue(bio->bi_bdev); |