From cfc97abcbe0b1ffac441202fe330e697370ecbcc Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Thu, 24 Mar 2022 14:36:47 -0400 Subject: dm: conditionally enable BIOSET_PERCPU_CACHE for dm_io bioset A bioset's per-cpu alloc cache may have broader utility in the future but for now constrain it to being tightly coupled to QUEUE_FLAG_POLL. Also change dm_io_complete() to use bio_clear_polled() so that it properly clears all associated bio state on requeue. This commit improves DM's hipri bio polling (REQ_POLLED) perf by 7 - 20% depending on the system. Signed-off-by: Mike Snitzer --- drivers/md/dm-table.c | 11 ++++++++--- drivers/md/dm.c | 8 ++++---- drivers/md/dm.h | 4 ++-- 3 files changed, 14 insertions(+), 9 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index e7d42f6335a2..73ed15d8cfc6 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -1002,6 +1002,8 @@ bool dm_table_request_based(struct dm_table *t) return __table_type_request_based(dm_table_get_type(t)); } +static int dm_table_supports_poll(struct dm_table *t); + static int dm_table_alloc_md_mempools(struct dm_table *t, struct mapped_device *md) { enum dm_queue_mode type = dm_table_get_type(t); @@ -1009,21 +1011,24 @@ static int dm_table_alloc_md_mempools(struct dm_table *t, struct mapped_device * unsigned min_pool_size = 0; struct dm_target *ti; unsigned i; + bool poll_supported = false; if (unlikely(type == DM_TYPE_NONE)) { DMWARN("no table type is set, can't allocate mempools"); return -EINVAL; } - if (__table_type_bio_based(type)) + if (__table_type_bio_based(type)) { for (i = 0; i < t->num_targets; i++) { ti = t->targets + i; per_io_data_size = max(per_io_data_size, ti->per_io_data_size); min_pool_size = max(min_pool_size, ti->num_flush_bios); } + poll_supported = !!dm_table_supports_poll(t); + } - t->mempools = dm_alloc_md_mempools(md, type, t->integrity_supported, - per_io_data_size, min_pool_size); + t->mempools = dm_alloc_md_mempools(md, type, per_io_data_size, min_pool_size, + t->integrity_supported, poll_supported); if (!t->mempools) return -ENOMEM; diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 39081338ca61..cdf35c6f8b0b 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -899,7 +899,7 @@ static void dm_io_complete(struct dm_io *io) * may only reflect a subset of the pre-split original) * so clear REQ_POLLED in case of requeue. */ - bio->bi_opf &= ~REQ_POLLED; + bio_clear_polled(bio); if (io_error == BLK_STS_AGAIN) { /* io_uring doesn't handle BLK_STS_AGAIN (yet) */ queue_io(md, bio); @@ -2901,8 +2901,8 @@ int dm_noflush_suspending(struct dm_target *ti) EXPORT_SYMBOL_GPL(dm_noflush_suspending); struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, enum dm_queue_mode type, - unsigned integrity, unsigned per_io_data_size, - unsigned min_pool_size) + unsigned per_io_data_size, unsigned min_pool_size, + bool integrity, bool poll) { struct dm_md_mempools *pools = kzalloc_node(sizeof(*pools), GFP_KERNEL, md->numa_node_id); unsigned int pool_size = 0; @@ -2918,7 +2918,7 @@ struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, enum dm_qu pool_size = max(dm_get_reserved_bio_based_ios(), min_pool_size); front_pad = roundup(per_io_data_size, __alignof__(struct dm_target_io)) + DM_TARGET_IO_BIO_OFFSET; io_front_pad = roundup(per_io_data_size, __alignof__(struct dm_io)) + DM_IO_BIO_OFFSET; - ret = bioset_init(&pools->io_bs, pool_size, io_front_pad, 0); + ret = bioset_init(&pools->io_bs, pool_size, io_front_pad, poll ? 
BIOSET_PERCPU_CACHE : 0); if (ret) goto out; if (integrity && bioset_integrity_create(&pools->io_bs, pool_size)) diff --git a/drivers/md/dm.h b/drivers/md/dm.h index 9013dc1a7b00..3f89664fea01 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -221,8 +221,8 @@ void dm_kcopyd_exit(void); * Mempool operations */ struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, enum dm_queue_mode type, - unsigned integrity, unsigned per_bio_data_size, - unsigned min_pool_size); + unsigned per_io_data_size, unsigned min_pool_size, + bool integrity, bool poll); void dm_free_md_mempools(struct dm_md_mempools *pools); /* -- cgit v1.2.3 From 84b98f4ce4d1d1f811d3e0658ec76c0349d8023a Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Thu, 17 Mar 2022 13:52:06 -0400 Subject: dm: factor out dm_io_set_error and __dm_io_dec_pending Also eliminate need to use errno_to_blk_status(). Signed-off-by: Mike Snitzer --- drivers/md/dm.c | 64 ++++++++++++++++++++++++++++++++------------------------- 1 file changed, 36 insertions(+), 28 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm.c b/drivers/md/dm.c index cdf35c6f8b0b..82c1636f8591 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -578,7 +578,7 @@ static struct dm_io *alloc_io(struct mapped_device *md, struct bio *bio) io = container_of(tio, struct dm_io, tio); io->magic = DM_IO_MAGIC; - io->status = 0; + io->status = BLK_STS_OK; atomic_set(&io->io_count, 1); this_cpu_inc(*md->pending_io); io->orig_bio = NULL; @@ -933,20 +933,31 @@ static inline bool dm_tio_is_normal(struct dm_target_io *tio) * Decrements the number of outstanding ios that a bio has been * cloned into, completing the original io if necc. */ -void dm_io_dec_pending(struct dm_io *io, blk_status_t error) +static inline void __dm_io_dec_pending(struct dm_io *io) +{ + if (atomic_dec_and_test(&io->io_count)) + dm_io_complete(io); +} + +static void dm_io_set_error(struct dm_io *io, blk_status_t error) { + unsigned long flags; + /* Push-back supersedes any I/O errors */ - if (unlikely(error)) { - unsigned long flags; - spin_lock_irqsave(&io->lock, flags); - if (!(io->status == BLK_STS_DM_REQUEUE && - __noflush_suspending(io->md))) - io->status = error; - spin_unlock_irqrestore(&io->lock, flags); + spin_lock_irqsave(&io->lock, flags); + if (!(io->status == BLK_STS_DM_REQUEUE && + __noflush_suspending(io->md))) { + io->status = error; } + spin_unlock_irqrestore(&io->lock, flags); +} - if (atomic_dec_and_test(&io->io_count)) - dm_io_complete(io); +void dm_io_dec_pending(struct dm_io *io, blk_status_t error) +{ + if (unlikely(error)) + dm_io_set_error(io, error); + + __dm_io_dec_pending(io); } void disable_discard(struct mapped_device *md) @@ -1428,7 +1439,7 @@ static bool is_abnormal_io(struct bio *bio) } static bool __process_abnormal_io(struct clone_info *ci, struct dm_target *ti, - int *result) + blk_status_t *status) { unsigned num_bios = 0; @@ -1452,11 +1463,11 @@ static bool __process_abnormal_io(struct clone_info *ci, struct dm_target *ti, * reconfiguration might also have changed that since the * check was performed. */ - if (!num_bios) - *result = -EOPNOTSUPP; + if (unlikely(!num_bios)) + *status = BLK_STS_NOTSUPP; else { __send_changing_extent_only(ci, ti, num_bios); - *result = 0; + *status = BLK_STS_OK; } return true; } @@ -1505,19 +1516,16 @@ static void dm_queue_poll_io(struct bio *bio, struct dm_io *io) /* * Select the correct strategy for processing a non-flush bio. 
*/ -static int __split_and_process_bio(struct clone_info *ci) +static blk_status_t __split_and_process_bio(struct clone_info *ci) { struct bio *clone; struct dm_target *ti; unsigned len; - int r; + blk_status_t error = BLK_STS_IOERR; ti = dm_table_find_target(ci->map, ci->sector); - if (!ti) - return -EIO; - - if (__process_abnormal_io(ci, ti, &r)) - return r; + if (unlikely(!ti || __process_abnormal_io(ci, ti, &error))) + return error; /* * Only support bio polling for normal IO, and the target io is @@ -1532,7 +1540,7 @@ static int __split_and_process_bio(struct clone_info *ci) ci->sector += len; ci->sector_count -= len; - return 0; + return BLK_STS_OK; } static void init_clone_info(struct clone_info *ci, struct mapped_device *md, @@ -1558,7 +1566,7 @@ static void dm_split_and_process_bio(struct mapped_device *md, { struct clone_info ci; struct bio *orig_bio = NULL; - int error = 0; + blk_status_t error = BLK_STS_OK; init_clone_info(&ci, md, map, bio); @@ -1600,7 +1608,7 @@ out: * bio->bi_private, so that dm_poll_bio can poll them all. */ if (error || !ci.submit_as_polled) - dm_io_dec_pending(ci.io, errno_to_blk_status(error)); + dm_io_dec_pending(ci.io, error); else dm_queue_poll_io(bio, ci.io); } @@ -1681,10 +1689,10 @@ static int dm_poll_bio(struct bio *bio, struct io_comp_batch *iob, if (dm_poll_dm_io(io, iob, flags)) { hlist_del_init(&io->node); /* - * clone_endio() has already occurred, so passing - * error as 0 here doesn't override io->status + * clone_endio() has already occurred, so no + * error handling is needed here. */ - dm_io_dec_pending(io, 0); + __dm_io_dec_pending(io); } } -- cgit v1.2.3 From 4857abf66434d96dec1a9209bbfbf55e2ee359fa Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Fri, 25 Mar 2022 13:53:23 -0400 Subject: dm: simplify dm_io access in dm_split_and_process_bio Use a local variable instead of redundant access via ci.io Signed-off-by: Mike Snitzer --- drivers/md/dm.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 82c1636f8591..71357592646a 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1565,10 +1565,12 @@ static void dm_split_and_process_bio(struct mapped_device *md, struct dm_table *map, struct bio *bio) { struct clone_info ci; + struct dm_io *io; struct bio *orig_bio = NULL; blk_status_t error = BLK_STS_OK; init_clone_info(&ci, md, map, bio); + io = ci.io; if (bio->bi_opf & REQ_PREFLUSH) { __send_empty_flush(&ci); @@ -1577,14 +1579,14 @@ static void dm_split_and_process_bio(struct mapped_device *md, } error = __split_and_process_bio(&ci); - ci.io->map_task = NULL; + io->map_task = NULL; if (error || !ci.sector_count) goto out; /* * Remainder must be passed to submit_bio_noacct() so it gets handled * *after* bios already submitted have been completely processed. - * We take a clone of the original to store in ci.io->orig_bio to be + * We take a clone of the original to store in io->orig_bio to be * used by dm_end_io_acct() and for dm_io_complete() to use for * completion handling. 
*/ @@ -1596,9 +1598,9 @@ static void dm_split_and_process_bio(struct mapped_device *md, out: if (!orig_bio) orig_bio = bio; - smp_store_release(&ci.io->orig_bio, orig_bio); - if (dm_io_flagged(ci.io, DM_IO_START_ACCT)) - dm_start_io_acct(ci.io, NULL); + smp_store_release(&io->orig_bio, orig_bio); + if (dm_io_flagged(io, DM_IO_START_ACCT)) + dm_start_io_acct(io, NULL); /* * Drop the extra reference count for non-POLLED bio, and hold one @@ -1610,7 +1612,7 @@ out: if (error || !ci.submit_as_polled) dm_io_dec_pending(ci.io, error); else - dm_queue_poll_io(bio, ci.io); + dm_queue_poll_io(bio, io); } static void dm_submit_bio(struct bio *bio) -- cgit v1.2.3 From 3b03f7c1242c754f0c474b37eec7d79107b9f375 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Fri, 25 Mar 2022 14:12:47 -0400 Subject: dm: simplify dm_start_io_acct Pull common DM_IO_ACCOUNTED check out to beginning of dm_start_io_acct. Also, use dm_tio_is_normal (and move it to dm-core.h). Signed-off-by: Mike Snitzer --- drivers/md/dm-core.h | 6 ++++++ drivers/md/dm.c | 18 +++++------------- 2 files changed, 11 insertions(+), 13 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h index 4277853c7535..db069fa9cee5 100644 --- a/drivers/md/dm-core.h +++ b/drivers/md/dm-core.h @@ -237,6 +237,12 @@ static inline void dm_tio_set_flag(struct dm_target_io *tio, unsigned int bit) tio->flags |= (1U << bit); } +static inline bool dm_tio_is_normal(struct dm_target_io *tio) +{ + return (dm_tio_flagged(tio, DM_TIO_INSIDE_DM_IO) && + !dm_tio_flagged(tio, DM_TIO_IS_DUPLICATE_BIO)); +} + /* * One of these is allocated per original bio. * It contains the first clone used for that original. diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 71357592646a..e6b6fe03dbcf 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -538,17 +538,15 @@ static void dm_start_io_acct(struct dm_io *io, struct bio *clone) /* * Ensure IO accounting is only ever started once. - * Expect no possibility for race unless DM_TIO_IS_DUPLICATE_BIO. */ - if (!clone || - likely(!dm_tio_flagged(clone_to_tio(clone), DM_TIO_IS_DUPLICATE_BIO))) { - if (WARN_ON_ONCE(dm_io_flagged(io, DM_IO_ACCOUNTED))) - return; + if (dm_io_flagged(io, DM_IO_ACCOUNTED)) + return; + + /* Expect no possibility for race unless DM_TIO_IS_DUPLICATE_BIO. */ + if (!clone || likely(dm_tio_is_normal(clone_to_tio(clone)))) { dm_io_set_flag(io, DM_IO_ACCOUNTED); } else { unsigned long flags; - if (dm_io_flagged(io, DM_IO_ACCOUNTED)) - return; /* Can afford locking given DM_TIO_IS_DUPLICATE_BIO */ spin_lock_irqsave(&io->lock, flags); dm_io_set_flag(io, DM_IO_ACCOUNTED); @@ -923,12 +921,6 @@ static void dm_io_complete(struct dm_io *io) } } -static inline bool dm_tio_is_normal(struct dm_target_io *tio) -{ - return (dm_tio_flagged(tio, DM_TIO_INSIDE_DM_IO) && - !dm_tio_flagged(tio, DM_TIO_IS_DUPLICATE_BIO)); -} - /* * Decrements the number of outstanding ios that a bio has been * cloned into, completing the original io if necc. 
-- cgit v1.2.3 From fe221db4192cf6c5f6b0be59e09025b05418e226 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Fri, 25 Mar 2022 17:20:45 -0400 Subject: dm: mark various branches unlikely Signed-off-by: Mike Snitzer --- drivers/md/dm.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm.c b/drivers/md/dm.c index e6b6fe03dbcf..34d457cfcb36 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -991,7 +991,7 @@ static void clone_endio(struct bio *bio) disable_write_zeroes(md); } - if (blk_queue_is_zoned(q)) + if (unlikely(blk_queue_is_zoned(q))) dm_zone_endio(io, bio); if (endio) { @@ -1288,7 +1288,7 @@ static void __map_bio(struct bio *clone) * on zoned target. In this case, dm_zone_map_bio() calls the target * map operation. */ - if (dm_emulate_zone_append(io->md)) + if (unlikely(dm_emulate_zone_append(io->md))) r = dm_zone_map_bio(tio); else r = ti->type->map(ti, clone); @@ -1631,7 +1631,7 @@ static void dm_submit_bio(struct bio *bio) * Use blk_queue_split() for abnormal IO (e.g. discard, writesame, etc) * otherwise associated queue_limits won't be imposed. */ - if (is_abnormal_io(bio)) + if (unlikely(is_abnormal_io(bio))) blk_queue_split(&bio); dm_split_and_process_bio(md, map, bio); -- cgit v1.2.3 From 6cbce280fc741c2057d574366318eafbeabbcfda Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Sat, 26 Mar 2022 13:46:06 -0400 Subject: dm: add local variables to clone_endio and __map_bio Avoid redundant dereferences in both functions. Signed-off-by: Mike Snitzer --- drivers/md/dm.c | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 34d457cfcb36..152545ef00fa 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -976,11 +976,12 @@ static bool swap_bios_limit(struct dm_target *ti, struct bio *bio) static void clone_endio(struct bio *bio) { blk_status_t error = bio->bi_status; + struct request_queue *q = bio->bi_bdev->bd_disk->queue; struct dm_target_io *tio = clone_to_tio(bio); + struct dm_target *ti = tio->ti; + dm_endio_fn endio = ti->type->end_io; struct dm_io *io = tio->io; - struct mapped_device *md = tio->io->md; - dm_endio_fn endio = tio->ti->type->end_io; - struct request_queue *q = bio->bi_bdev->bd_disk->queue; + struct mapped_device *md = io->md; if (unlikely(error == BLK_STS_TARGET)) { if (bio_op(bio) == REQ_OP_DISCARD && @@ -995,7 +996,7 @@ static void clone_endio(struct bio *bio) dm_zone_endio(io, bio); if (endio) { - int r = endio(tio->ti, bio, &error); + int r = endio(ti, bio, &error); switch (r) { case DM_ENDIO_REQUEUE: /* @@ -1019,10 +1020,8 @@ static void clone_endio(struct bio *bio) } } - if (unlikely(swap_bios_limit(tio->ti, bio))) { - struct mapped_device *md = io->md; + if (unlikely(swap_bios_limit(ti, bio))) up(&md->swap_bios_semaphore); - } free_tio(bio); dm_io_dec_pending(io, error); @@ -1263,9 +1262,10 @@ static noinline void __set_swap_bios_limit(struct mapped_device *md, int latch) static void __map_bio(struct bio *clone) { struct dm_target_io *tio = clone_to_tio(clone); - int r; - struct dm_io *io = tio->io; struct dm_target *ti = tio->ti; + struct dm_io *io = tio->io; + struct mapped_device *md = io->md; + int r; clone->bi_end_io = clone_endio; @@ -1276,7 +1276,6 @@ static void __map_bio(struct bio *clone) tio->old_sector = clone->bi_iter.bi_sector; if (unlikely(swap_bios_limit(ti, clone))) { - struct mapped_device *md = io->md; int latch = get_swap_bios(); if (unlikely(latch != md->swap_bios)) 
__set_swap_bios_limit(md, latch); @@ -1288,7 +1287,7 @@ static void __map_bio(struct bio *clone) * on zoned target. In this case, dm_zone_map_bio() calls the target * map operation. */ - if (unlikely(dm_emulate_zone_append(io->md))) + if (unlikely(dm_emulate_zone_append(md))) r = dm_zone_map_bio(tio); else r = ti->type->map(ti, clone); @@ -1304,14 +1303,14 @@ static void __map_bio(struct bio *clone) * the bio has been remapped so dispatch it, but defer * dm_start_io_acct() until after possible bio_split(). */ - __dm_submit_bio_remap(clone, disk_devt(io->md->disk), + __dm_submit_bio_remap(clone, disk_devt(md->disk), tio->old_sector); dm_io_set_flag(io, DM_IO_START_ACCT); break; case DM_MAPIO_KILL: case DM_MAPIO_REQUEUE: if (unlikely(swap_bios_limit(ti, clone))) - up(&io->md->swap_bios_semaphore); + up(&md->swap_bios_semaphore); free_tio(clone); if (r == DM_MAPIO_KILL) dm_io_dec_pending(io, BLK_STS_IOERR); -- cgit v1.2.3 From 982b48ae25d9aabd69c1f012a94f63766181f0b2 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Sat, 26 Mar 2022 14:38:07 -0400 Subject: dm: move hot dm_io members to same cacheline as dm_target_io Just saves some cacheline bouncing for members accessed during cloned bio submission and completion. Signed-off-by: Mike Snitzer --- drivers/md/dm-core.h | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h index db069fa9cee5..41d6511dc7cf 100644 --- a/drivers/md/dm-core.h +++ b/drivers/md/dm-core.h @@ -250,17 +250,19 @@ static inline bool dm_tio_is_normal(struct dm_target_io *tio) #define DM_IO_MAGIC 19577 struct dm_io { unsigned short magic; - blk_short_t flags; - atomic_t io_count; - struct mapped_device *md; - struct bio *orig_bio; - blk_status_t status; + spinlock_t lock; unsigned long start_time; void *data; struct hlist_node node; struct task_struct *map_task; struct dm_stats_aux stats_aux; + + blk_short_t flags; + blk_status_t status; + atomic_t io_count; + struct mapped_device *md; + struct bio *orig_bio; /* last member of dm_target_io is 'struct bio' */ struct dm_target_io tio; }; -- cgit v1.2.3 From 563a225c9fd207326c2a2af9d59b4097cb31ce70 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Sat, 26 Mar 2022 21:08:36 -0400 Subject: dm: introduce dm_{get,put}_live_table_bio called from dm_submit_bio If a bio is marked REQ_NOWAIT optimize dm_submit_bio()'s dm_table RCU usage to dm_{get,put}_live_table_fast. DM core offers protection against blocking (via suspend) if REQ_NOWAIT. Signed-off-by: Mike Snitzer --- drivers/md/dm.c | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 152545ef00fa..d7b7154edffd 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -661,14 +661,16 @@ static void queue_io(struct mapped_device *md, struct bio *bio) * function to access the md->map field, and make sure they call * dm_put_live_table() when finished. 
*/ -struct dm_table *dm_get_live_table(struct mapped_device *md, int *srcu_idx) __acquires(md->io_barrier) +struct dm_table *dm_get_live_table(struct mapped_device *md, + int *srcu_idx) __acquires(md->io_barrier) { *srcu_idx = srcu_read_lock(&md->io_barrier); return srcu_dereference(md->map, &md->io_barrier); } -void dm_put_live_table(struct mapped_device *md, int srcu_idx) __releases(md->io_barrier) +void dm_put_live_table(struct mapped_device *md, + int srcu_idx) __releases(md->io_barrier) { srcu_read_unlock(&md->io_barrier, srcu_idx); } @@ -694,6 +696,24 @@ static void dm_put_live_table_fast(struct mapped_device *md) __releases(RCU) rcu_read_unlock(); } +static inline struct dm_table *dm_get_live_table_bio(struct mapped_device *md, + int *srcu_idx, struct bio *bio) +{ + if (bio->bi_opf & REQ_NOWAIT) + return dm_get_live_table_fast(md); + else + return dm_get_live_table(md, srcu_idx); +} + +static inline void dm_put_live_table_bio(struct mapped_device *md, int srcu_idx, + struct bio *bio) +{ + if (bio->bi_opf & REQ_NOWAIT) + dm_put_live_table_fast(md); + else + dm_put_live_table(md, srcu_idx); +} + static char *_dm_claim_ptr = "I belong to device-mapper"; /* @@ -1612,7 +1632,7 @@ static void dm_submit_bio(struct bio *bio) int srcu_idx; struct dm_table *map; - map = dm_get_live_table(md, &srcu_idx); + map = dm_get_live_table_bio(md, &srcu_idx, bio); /* If suspended, or map not yet available, queue this IO for later */ if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) || @@ -1635,7 +1655,7 @@ static void dm_submit_bio(struct bio *bio) dm_split_and_process_bio(md, map, bio); out: - dm_put_live_table(md, srcu_idx); + dm_put_live_table_bio(md, srcu_idx, bio); } static bool dm_poll_dm_io(struct dm_io *io, struct io_comp_batch *iob, -- cgit v1.2.3 From 442761fd2b297d65d1cb5786249e1e07a19e9122 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Sat, 26 Mar 2022 14:14:00 -0400 Subject: dm: conditionally enable branching for less used features Use jump_labels to further reduce cost of unlikely branches for zoned block devices, dm-stats and swap_bios throttling. 
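For context, a minimal sketch of the jump-label pattern this patch applies; the feature key and the functions around it are hypothetical, while DEFINE_STATIC_KEY_FALSE(), static_branch_unlikely(), static_key_enabled() and static_branch_enable() are the real <linux/jump_label.h> interfaces used in the diff below:

#include <linux/jump_label.h>

DEFINE_STATIC_KEY_FALSE(feature_enabled);	/* hypothetical key */

void hot_path(void)
{
	/*
	 * Compiles to a patched no-op until the key is enabled, so a
	 * disabled feature costs no conditional branch on the hot path.
	 */
	if (static_branch_unlikely(&feature_enabled))
		do_feature_work();		/* hypothetical slow path */
}

void configure_feature(void)
{
	/* Flip the key once, when the feature is first configured. */
	if (!static_key_enabled(&feature_enabled.key))
		static_branch_enable(&feature_enabled);
}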
Signed-off-by: Mike Snitzer --- drivers/md/dm-core.h | 5 ++++ drivers/md/dm-stats.c | 3 +++ drivers/md/dm-table.c | 5 ++++ drivers/md/dm.c | 63 ++++++++++++++++++++++++++++++++------------------- 4 files changed, 53 insertions(+), 23 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h index 41d6511dc7cf..8ba99eaa0872 100644 --- a/drivers/md/dm-core.h +++ b/drivers/md/dm-core.h @@ -13,6 +13,7 @@ #include #include #include +#include #include @@ -154,6 +155,10 @@ static inline struct dm_stats *dm_get_stats(struct mapped_device *md) return &md->stats; } +DECLARE_STATIC_KEY_FALSE(stats_enabled); +DECLARE_STATIC_KEY_FALSE(swap_bios_enabled); +DECLARE_STATIC_KEY_FALSE(zoned_enabled); + static inline bool dm_emulate_zone_append(struct mapped_device *md) { if (blk_queue_is_zoned(md->queue)) diff --git a/drivers/md/dm-stats.c b/drivers/md/dm-stats.c index 0e039a8c0bf2..86e0697330e8 100644 --- a/drivers/md/dm-stats.c +++ b/drivers/md/dm-stats.c @@ -396,6 +396,9 @@ static int dm_stats_create(struct dm_stats *stats, sector_t start, sector_t end, dm_stats_recalc_precise_timestamps(stats); + if (!static_key_enabled(&stats_enabled.key)) + static_branch_enable(&stats_enabled); + mutex_unlock(&stats->mutex); resume_callback(md); diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 73ed15d8cfc6..a37c7b763643 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -719,6 +719,9 @@ int dm_table_add_target(struct dm_table *t, const char *type, DMWARN("%s: %s: ignoring discards_supported because num_discard_bios is zero.", dm_device_name(t->md), type); + if (tgt->limit_swap_bios && !static_key_enabled(&swap_bios_enabled.key)) + static_branch_enable(&swap_bios_enabled); + return 0; bad: @@ -2040,6 +2043,8 @@ int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, r = dm_set_zones_restrictions(t, q); if (r) return r; + if (!static_key_enabled(&zoned_enabled.key)) + static_branch_enable(&zoned_enabled); } dm_update_crypto_profile(q, t); diff --git a/drivers/md/dm.c b/drivers/md/dm.c index d7b7154edffd..6304322a48f0 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -71,6 +71,10 @@ void dm_issue_global_event(void) wake_up(&dm_global_eventq); } +DEFINE_STATIC_KEY_FALSE(stats_enabled); +DEFINE_STATIC_KEY_FALSE(swap_bios_enabled); +DEFINE_STATIC_KEY_FALSE(zoned_enabled); + /* * One of these is allocated (on-stack) per original bio. 
*/ @@ -516,7 +520,8 @@ static void dm_io_acct(bool end, struct mapped_device *md, struct bio *bio, else bio_end_io_acct(bio, start_time); - if (unlikely(dm_stats_used(&md->stats))) + if (static_branch_unlikely(&stats_enabled) && + unlikely(dm_stats_used(&md->stats))) dm_stats_account_io(&md->stats, bio_data_dir(bio), bio->bi_iter.bi_sector, bio_sectors(bio), end, start_time, stats_aux); @@ -586,7 +591,8 @@ static struct dm_io *alloc_io(struct mapped_device *md, struct bio *bio) io->start_time = jiffies; io->flags = 0; - dm_stats_record_start(&md->stats, &io->stats_aux); + if (static_branch_unlikely(&stats_enabled)) + dm_stats_record_start(&md->stats, &io->stats_aux); return io; } @@ -1012,21 +1018,25 @@ static void clone_endio(struct bio *bio) disable_write_zeroes(md); } - if (unlikely(blk_queue_is_zoned(q))) + if (static_branch_unlikely(&zoned_enabled) && + unlikely(blk_queue_is_zoned(q))) dm_zone_endio(io, bio); if (endio) { int r = endio(ti, bio, &error); switch (r) { case DM_ENDIO_REQUEUE: - /* - * Requeuing writes to a sequential zone of a zoned - * target will break the sequential write pattern: - * fail such IO. - */ - if (WARN_ON_ONCE(dm_is_zone_write(md, bio))) - error = BLK_STS_IOERR; - else + if (static_branch_unlikely(&zoned_enabled)) { + /* + * Requeuing writes to a sequential zone of a zoned + * target will break the sequential write pattern: + * fail such IO. + */ + if (WARN_ON_ONCE(dm_is_zone_write(md, bio))) + error = BLK_STS_IOERR; + else + error = BLK_STS_DM_REQUEUE; + } else error = BLK_STS_DM_REQUEUE; fallthrough; case DM_ENDIO_DONE: @@ -1040,7 +1050,8 @@ static void clone_endio(struct bio *bio) } } - if (unlikely(swap_bios_limit(ti, bio))) + if (static_branch_unlikely(&swap_bios_enabled) && + unlikely(swap_bios_limit(ti, bio))) up(&md->swap_bios_semaphore); free_tio(bio); @@ -1295,21 +1306,25 @@ static void __map_bio(struct bio *clone) dm_io_inc_pending(io); tio->old_sector = clone->bi_iter.bi_sector; - if (unlikely(swap_bios_limit(ti, clone))) { + if (static_branch_unlikely(&swap_bios_enabled) && + unlikely(swap_bios_limit(ti, clone))) { int latch = get_swap_bios(); if (unlikely(latch != md->swap_bios)) __set_swap_bios_limit(md, latch); down(&md->swap_bios_semaphore); } - /* - * Check if the IO needs a special mapping due to zone append emulation - * on zoned target. In this case, dm_zone_map_bio() calls the target - * map operation. - */ - if (unlikely(dm_emulate_zone_append(md))) - r = dm_zone_map_bio(tio); - else + if (static_branch_unlikely(&zoned_enabled)) { + /* + * Check if the IO needs a special mapping due to zone append + * emulation on zoned target. In this case, dm_zone_map_bio() + * calls the target map operation. + */ + if (unlikely(dm_emulate_zone_append(md))) + r = dm_zone_map_bio(tio); + else + r = ti->type->map(ti, clone); + } else r = ti->type->map(ti, clone); switch (r) { @@ -1329,7 +1344,8 @@ static void __map_bio(struct bio *clone) break; case DM_MAPIO_KILL: case DM_MAPIO_REQUEUE: - if (unlikely(swap_bios_limit(ti, clone))) + if (static_branch_unlikely(&swap_bios_enabled) && + unlikely(swap_bios_limit(ti, clone))) up(&md->swap_bios_semaphore); free_tio(clone); if (r == DM_MAPIO_KILL) @@ -1565,7 +1581,8 @@ static void init_clone_info(struct clone_info *ci, struct mapped_device *md, ci->sector_count = bio_sectors(bio); /* Shouldn't happen but sector_count was being set to 0 so... 
*/ - if (WARN_ON_ONCE(op_is_zone_mgmt(bio_op(bio)) && ci->sector_count)) + if (static_branch_unlikely(&zoned_enabled) && + WARN_ON_ONCE(op_is_zone_mgmt(bio_op(bio)) && ci->sector_count)) ci->sector_count = 0; } -- cgit v1.2.3 From e86f2b005a51437d2887eec5ee659d0287d370ad Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Wed, 30 Mar 2022 13:52:10 -0400 Subject: dm: simplify basic targets Remove needless factoring and remap bi_sector regardless of bio_sectors() being non-zero. Signed-off-by: Mike Snitzer --- drivers/md/dm-delay.c | 3 +-- drivers/md/dm-flakey.c | 4 +--- drivers/md/dm-linear.c | 11 ++--------- 3 files changed, 4 insertions(+), 14 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c index 9a51bf51a859..869afef5654a 100644 --- a/drivers/md/dm-delay.c +++ b/drivers/md/dm-delay.c @@ -296,8 +296,7 @@ static int delay_map(struct dm_target *ti, struct bio *bio) } delayed->class = c; bio_set_dev(bio, c->dev->bdev); - if (bio_sectors(bio)) - bio->bi_iter.bi_sector = c->start + dm_target_offset(ti, bio->bi_iter.bi_sector); + bio->bi_iter.bi_sector = c->start + dm_target_offset(ti, bio->bi_iter.bi_sector); return delay_bio(dc, c, bio); } diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c index 345229d7e59c..f2305eb758a2 100644 --- a/drivers/md/dm-flakey.c +++ b/drivers/md/dm-flakey.c @@ -280,9 +280,7 @@ static void flakey_map_bio(struct dm_target *ti, struct bio *bio) struct flakey_c *fc = ti->private; bio_set_dev(bio, fc->dev->bdev); - if (bio_sectors(bio) || op_is_zone_mgmt(bio_op(bio))) - bio->bi_iter.bi_sector = - flakey_map_sector(ti, bio->bi_iter.bi_sector); + bio->bi_iter.bi_sector = flakey_map_sector(ti, bio->bi_iter.bi_sector); } static void corrupt_bio_data(struct bio *bio, struct flakey_c *fc) diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c index 76b486e4d2be..0a6abbbe3745 100644 --- a/drivers/md/dm-linear.c +++ b/drivers/md/dm-linear.c @@ -84,19 +84,12 @@ static sector_t linear_map_sector(struct dm_target *ti, sector_t bi_sector) return lc->start + dm_target_offset(ti, bi_sector); } -static void linear_map_bio(struct dm_target *ti, struct bio *bio) +static int linear_map(struct dm_target *ti, struct bio *bio) { struct linear_c *lc = ti->private; bio_set_dev(bio, lc->dev->bdev); - if (bio_sectors(bio) || op_is_zone_mgmt(bio_op(bio))) - bio->bi_iter.bi_sector = - linear_map_sector(ti, bio->bi_iter.bi_sector); -} - -static int linear_map(struct dm_target *ti, struct bio *bio) -{ - linear_map_bio(ti, bio); + bio->bi_iter.bi_sector = linear_map_sector(ti, bio->bi_iter.bi_sector); return DM_MAPIO_REMAPPED; } -- cgit v1.2.3 From bdb34759a0dbc89c134c60d282b5f100bed1365f Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Fri, 15 Apr 2022 03:04:59 -0400 Subject: dm: use bio_sectors in dm_accept_partial_bio Rename 'bi_size' to 'bio_sectors' given bi_size is being stored in sectors. Also, use bio_sectors() rather than open-coding it. 
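For reference, a simplified equivalent of the bio_sectors() helper being adopted here; the real macro lives in include/linux/bio.h, and the arithmetic matches the open-coded shift this patch removes:

static inline unsigned int bio_sectors_equiv(struct bio *bio)
{
	/* byte count in the iterator expressed as 512-byte sectors */
	return bio->bi_iter.bi_size >> SECTOR_SHIFT;
}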
Signed-off-by: Mike Snitzer --- drivers/md/dm.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 6304322a48f0..69535e021c92 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1194,7 +1194,7 @@ static int dm_dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff, * +--------------------+---------------+-------+ * * <-------------- *tio->len_ptr ---------------> - * <------- bi_size -------> + * <----- bio_sectors -----> * <-- n_sectors --> * * Region 1 was already iterated over with bio_advance or similar function. @@ -1211,15 +1211,15 @@ static int dm_dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff, void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors) { struct dm_target_io *tio = clone_to_tio(bio); - unsigned bi_size = bio->bi_iter.bi_size >> SECTOR_SHIFT; + unsigned bio_sectors = bio_sectors(bio); BUG_ON(dm_tio_flagged(tio, DM_TIO_IS_DUPLICATE_BIO)); BUG_ON(op_is_zone_mgmt(bio_op(bio))); BUG_ON(bio_op(bio) == REQ_OP_ZONE_APPEND); - BUG_ON(bi_size > *tio->len_ptr); - BUG_ON(n_sectors > bi_size); + BUG_ON(bio_sectors > *tio->len_ptr); + BUG_ON(n_sectors > bio_sectors); - *tio->len_ptr -= bi_size - n_sectors; + *tio->len_ptr -= bio_sectors - n_sectors; bio->bi_iter.bi_size = n_sectors << SECTOR_SHIFT; } EXPORT_SYMBOL_GPL(dm_accept_partial_bio); -- cgit v1.2.3 From b992b40dfcc1d904e5040c34c5edde3bbc83f60b Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 12 Apr 2022 16:56:10 +0800 Subject: dm: don't pass bio to __dm_start_io_acct and dm_end_io_acct dm->orig_bio is always passed to __dm_start_io_acct and dm_end_io_acct, so it isn't necessary to take one bio parameter for the two helpers. Signed-off-by: Ming Lei Signed-off-by: Mike Snitzer --- drivers/md/dm.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 69535e021c92..19d0d6192ea6 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -531,16 +531,13 @@ static void dm_io_acct(bool end, struct mapped_device *md, struct bio *bio, bio->bi_iter.bi_size = bi_size; } -static void __dm_start_io_acct(struct dm_io *io, struct bio *bio) +static void __dm_start_io_acct(struct dm_io *io) { - dm_io_acct(false, io->md, bio, io->start_time, &io->stats_aux); + dm_io_acct(false, io->md, io->orig_bio, io->start_time, &io->stats_aux); } static void dm_start_io_acct(struct dm_io *io, struct bio *clone) { - /* Must account IO to DM device in terms of orig_bio */ - struct bio *bio = io->orig_bio; - /* * Ensure IO accounting is only ever started once. 
*/ @@ -558,12 +555,12 @@ static void dm_start_io_acct(struct dm_io *io, struct bio *clone) spin_unlock_irqrestore(&io->lock, flags); } - __dm_start_io_acct(io, bio); + __dm_start_io_acct(io); } -static void dm_end_io_acct(struct dm_io *io, struct bio *bio) +static void dm_end_io_acct(struct dm_io *io) { - dm_io_acct(true, io->md, bio, io->start_time, &io->stats_aux); + dm_io_acct(true, io->md, io->orig_bio, io->start_time, &io->stats_aux); } static struct dm_io *alloc_io(struct mapped_device *md, struct bio *bio) @@ -899,14 +896,14 @@ static void dm_io_complete(struct dm_io *io) io_error = io->status; if (dm_io_flagged(io, DM_IO_ACCOUNTED)) - dm_end_io_acct(io, bio); + dm_end_io_acct(io); else if (!io_error) { /* * Must handle target that DM_MAPIO_SUBMITTED only to * then bio_endio() rather than dm_submit_bio_remap() */ - __dm_start_io_acct(io, bio); - dm_end_io_acct(io, bio); + __dm_start_io_acct(io); + dm_end_io_acct(io); } free_io(io); smp_wmb(); -- cgit v1.2.3 From e6926ad0c988d4cf8d4f4ec77373572831149104 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 12 Apr 2022 16:56:11 +0800 Subject: dm: pass dm_io instance to dm_io_acct directly All the other 4 parameters are retrieved from the 'dm_io' instance, so it's not necessary to pass all four to dm_io_acct(). Signed-off-by: Ming Lei Signed-off-by: Mike Snitzer --- drivers/md/dm.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 19d0d6192ea6..2b99ee2a6131 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -502,9 +502,12 @@ static bool bio_is_flush_with_data(struct bio *bio) return ((bio->bi_opf & REQ_PREFLUSH) && bio->bi_iter.bi_size); } -static void dm_io_acct(bool end, struct mapped_device *md, struct bio *bio, - unsigned long start_time, struct dm_stats_aux *stats_aux) +static void dm_io_acct(struct dm_io *io, bool end) { + struct dm_stats_aux *stats_aux = &io->stats_aux; + unsigned long start_time = io->start_time; + struct mapped_device *md = io->md; + struct bio *bio = io->orig_bio; bool is_flush_with_data; unsigned int bi_size; @@ -533,7 +536,7 @@ static void dm_io_acct(bool end, struct mapped_device *md, struct bio *bio, static void __dm_start_io_acct(struct dm_io *io) { - dm_io_acct(false, io->md, io->orig_bio, io->start_time, &io->stats_aux); + dm_io_acct(io, false); } static void dm_start_io_acct(struct dm_io *io, struct bio *clone) @@ -560,7 +563,7 @@ static void dm_start_io_acct(struct dm_io *io, struct bio *clone) static void dm_end_io_acct(struct dm_io *io) { - dm_io_acct(true, io->md, io->orig_bio, io->start_time, &io->stats_aux); + dm_io_acct(io, true); } static struct dm_io *alloc_io(struct mapped_device *md, struct bio *bio) -- cgit v1.2.3 From d3de6d12694de45dfb4d7821d09224ed43dde3d7 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 12 Apr 2022 16:56:12 +0800 Subject: dm: switch to bdev based IO accounting interfaces DM splits a flush with data into an empty flush followed by a bio with the data payload; switch dm_io_acct() to use bdev_{start,end}_io_acct() to do this accounting more naturally (rather than temporarily changing the bio's bi_size). This will allow DM to more easily account bios that are split (in a following commit). 
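A minimal sketch of the bdev-based accounting pattern being adopted, using the same bdev_{start,end}_io_acct() signatures that appear in the diff below (submission and completion details elided):

unsigned long start_time = jiffies;
unsigned int sectors = bio_sectors(bio);

/* open the accounting window against the underlying block device */
bdev_start_io_acct(bio->bi_bdev, sectors, bio_op(bio), start_time);

/* ... the bio is submitted and completes ... */

/* close the window; the duration is computed from start_time */
bdev_end_io_acct(bio->bi_bdev, bio_op(bio), start_time);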
Signed-off-by: Ming Lei Signed-off-by: Mike Snitzer --- drivers/md/dm.c | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 2b99ee2a6131..30578ee0cbe3 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -508,30 +508,28 @@ static void dm_io_acct(struct dm_io *io, bool end) unsigned long start_time = io->start_time; struct mapped_device *md = io->md; struct bio *bio = io->orig_bio; - bool is_flush_with_data; - unsigned int bi_size; + unsigned int sectors; - /* If REQ_PREFLUSH set save any payload but do not account it */ - is_flush_with_data = bio_is_flush_with_data(bio); - if (is_flush_with_data) { - bi_size = bio->bi_iter.bi_size; - bio->bi_iter.bi_size = 0; - } + /* + * If REQ_PREFLUSH set, don't account payload, it will be + * submitted (and accounted) after this flush completes. + */ + if (bio_is_flush_with_data(bio)) + sectors = 0; + else + sectors = bio_sectors(bio); if (!end) - bio_start_io_acct_time(bio, start_time); + bdev_start_io_acct(bio->bi_bdev, sectors, bio_op(bio), + start_time); else - bio_end_io_acct(bio, start_time); + bdev_end_io_acct(bio->bi_bdev, bio_op(bio), start_time); if (static_branch_unlikely(&stats_enabled) && unlikely(dm_stats_used(&md->stats))) dm_stats_account_io(&md->stats, bio_data_dir(bio), - bio->bi_iter.bi_sector, bio_sectors(bio), + bio->bi_iter.bi_sector, sectors, end, start_time, stats_aux); - - /* Restore bio's payload so it does get accounted upon requeue */ - if (is_flush_with_data) - bio->bi_iter.bi_size = bi_size; } static void __dm_start_io_acct(struct dm_io *io) -- cgit v1.2.3 From 7dd76d1feec70a23e1de2b55c0c2a9c592fa8423 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 12 Apr 2022 16:56:13 +0800 Subject: dm: improve bio splitting and associated IO accounting The current DM code (ab)uses late assignment of dm_io->orig_bio (after __map_bio() returns and any bio splitting is complete) to indicate the FS bio has been processed and can be accounted. This results in awkward waiting until ->orig_bio is set in dm_submit_bio_remap(). Also the bio splitting was implemented using bio_split()+bio_chain() -- a well-worn pattern but it requires bio cloning purely for the benefit of more natural IO accounting. The bio_split() result was stored in ->orig_bio to represent the mapped part of the original FS bio. DM has switched to the bdev based IO accounting interface. DM's IO accounting can be implemented in terms of the original FS bio (now stored early in ->orig_bio) via access to its sectors/bio_op. And if/when splitting is needed, set a new DM_IO_WAS_SPLIT flag and use new dm_io fields of .sector_offset & .sectors to allow IO accounting for split bios _without_ needing to clone a new bio to store in ->orig_bio. 
Signed-off-by: Ming Lei Co-developed-by: Mike Snitzer Signed-off-by: Mike Snitzer --- drivers/md/dm-core.h | 8 +++++- drivers/md/dm.c | 75 ++++++++++++++++++++++++++++++++++++---------------- 2 files changed, 59 insertions(+), 24 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h index 8ba99eaa0872..37ddedf61249 100644 --- a/drivers/md/dm-core.h +++ b/drivers/md/dm-core.h @@ -267,7 +267,12 @@ struct dm_io { blk_status_t status; atomic_t io_count; struct mapped_device *md; + + /* The three fields represent mapped part of original bio */ struct bio *orig_bio; + unsigned int sector_offset; /* offset to end of orig_bio */ + unsigned int sectors; + /* last member of dm_target_io is 'struct bio' */ struct dm_target_io tio; }; @@ -277,7 +282,8 @@ struct dm_io { */ enum { DM_IO_START_ACCT, - DM_IO_ACCOUNTED + DM_IO_ACCOUNTED, + DM_IO_WAS_SPLIT }; static inline bool dm_io_flagged(struct dm_io *io, unsigned int bit) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 30578ee0cbe3..7a1a83b58677 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -516,8 +516,10 @@ static void dm_io_acct(struct dm_io *io, bool end) */ if (bio_is_flush_with_data(bio)) sectors = 0; - else + else if (likely(!(dm_io_flagged(io, DM_IO_WAS_SPLIT)))) sectors = bio_sectors(bio); + else + sectors = io->sectors; if (!end) bdev_start_io_acct(bio->bi_bdev, sectors, bio_op(bio), @@ -526,10 +528,18 @@ static void dm_io_acct(struct dm_io *io, bool end) bdev_end_io_acct(bio->bi_bdev, bio_op(bio), start_time); if (static_branch_unlikely(&stats_enabled) && - unlikely(dm_stats_used(&md->stats))) + unlikely(dm_stats_used(&md->stats))) { + sector_t sector; + + if (likely(!dm_io_flagged(io, DM_IO_WAS_SPLIT))) + sector = bio->bi_iter.bi_sector; + else + sector = bio_end_sector(bio) - io->sector_offset; + dm_stats_account_io(&md->stats, bio_data_dir(bio), - bio->bi_iter.bi_sector, sectors, + sector, sectors, end, start_time, stats_aux); + } } static void __dm_start_io_acct(struct dm_io *io) @@ -582,7 +592,7 @@ static struct dm_io *alloc_io(struct mapped_device *md, struct bio *bio) io->status = BLK_STS_OK; atomic_set(&io->io_count, 1); this_cpu_inc(*md->pending_io); - io->orig_bio = NULL; + io->orig_bio = bio; io->md = md; io->map_task = current; spin_lock_init(&io->lock); @@ -1219,6 +1229,13 @@ void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors) *tio->len_ptr -= bio_sectors - n_sectors; bio->bi_iter.bi_size = n_sectors << SECTOR_SHIFT; + + /* + * __split_and_process_bio() may have already saved mapped part + * for accounting but it is being reduced so update accordingly. + */ + dm_io_set_flag(tio->io, DM_IO_WAS_SPLIT); + tio->io->sectors = n_sectors; } EXPORT_SYMBOL_GPL(dm_accept_partial_bio); @@ -1257,13 +1274,6 @@ void dm_submit_bio_remap(struct bio *clone, struct bio *tgt_clone) /* Still in target's map function */ dm_io_set_flag(io, DM_IO_START_ACCT); } else { - /* - * Called by another thread, managed by DM target, - * wait for dm_split_and_process_bio() to store - * io->orig_bio - */ - while (unlikely(!smp_load_acquire(&io->orig_bio))) - msleep(1); dm_start_io_acct(io, clone); } @@ -1357,6 +1367,31 @@ static void __map_bio(struct bio *clone) } } +static void setup_split_accounting(struct clone_info *ci, unsigned len) +{ + struct dm_io *io = ci->io; + + if (ci->sector_count > len) { + /* + * Split needed, save the mapped part for accounting. + * NOTE: dm_accept_partial_bio() will update accordingly. 
+ */ + dm_io_set_flag(io, DM_IO_WAS_SPLIT); + io->sectors = len; + } + + if (static_branch_unlikely(&stats_enabled) && + unlikely(dm_stats_used(&io->md->stats))) { + /* + * Save bi_sector in terms of its offset from end of + * original bio, only needed for DM-stats' benefit. + * - saved regardless of whether split needed so that + * dm_accept_partial_bio() doesn't need to. + */ + io->sector_offset = bio_end_sector(ci->bio) - ci->sector; + } +} + static void alloc_multiple_bios(struct bio_list *blist, struct clone_info *ci, struct dm_target *ti, unsigned num_bios) { @@ -1396,6 +1431,8 @@ static void __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti, case 0: break; case 1: + if (len) + setup_split_accounting(ci, *len); clone = alloc_tio(ci, ti, 0, len, GFP_NOIO); __map_bio(clone); break; @@ -1559,6 +1596,7 @@ static blk_status_t __split_and_process_bio(struct clone_info *ci) ci->submit_as_polled = ci->bio->bi_opf & REQ_POLLED; len = min_t(sector_t, max_io_len(ti, ci->sector), ci->sector_count); + setup_split_accounting(ci, len); clone = alloc_tio(ci, ti, 0, &len, GFP_NOIO); __map_bio(clone); @@ -1592,7 +1630,6 @@ static void dm_split_and_process_bio(struct mapped_device *md, { struct clone_info ci; struct dm_io *io; - struct bio *orig_bio = NULL; blk_status_t error = BLK_STS_OK; init_clone_info(&ci, md, map, bio); @@ -1608,23 +1645,15 @@ static void dm_split_and_process_bio(struct mapped_device *md, io->map_task = NULL; if (error || !ci.sector_count) goto out; - /* * Remainder must be passed to submit_bio_noacct() so it gets handled * *after* bios already submitted have been completely processed. - * We take a clone of the original to store in io->orig_bio to be - * used by dm_end_io_acct() and for dm_io_complete() to use for - * completion handling. */ - orig_bio = bio_split(bio, bio_sectors(bio) - ci.sector_count, - GFP_NOIO, &md->queue->bio_split); - bio_chain(orig_bio, bio); - trace_block_split(orig_bio, bio->bi_iter.bi_sector); + bio_trim(bio, io->sectors, ci.sector_count); + trace_block_split(bio, bio->bi_iter.bi_sector); + bio_inc_remaining(bio); submit_bio_noacct(bio); out: - if (!orig_bio) - orig_bio = bio; - smp_store_release(&io->orig_bio, orig_bio); if (dm_io_flagged(io, DM_IO_START_ACCT)) dm_start_io_acct(io, NULL); -- cgit v1.2.3 From 2e803cd99ba8b7a84be155c1d5ee28d363fdbe44 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 12 Apr 2022 16:56:14 +0800 Subject: dm: don't grab target io reference in dm_zone_map_bio dm_zone_map_bio() is only called from __map_bio(), in which the io's reference is grabbed already, and the reference won't be released until the bio is submitted, so it is not necessary to take it again in dm_zone_map_bio() anymore. 
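For context, the now-redundant pattern removed from dm_zone_map_bio() reduces to the following; the caller, __map_bio(), already pins the dm_io across the ->map() call:

/* Redundant: the caller, __map_bio(), already holds a dm_io reference. */
dm_io_inc_pending(io);
r = ti->type->map(ti, clone);
/* ... zone append emulation bookkeeping ... */
dm_io_dec_pending(io, sts);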
Reviewed-by: Damien Le Moal Tested-by: Damien Le Moal Signed-off-by: Ming Lei Signed-off-by: Mike Snitzer --- drivers/md/dm-core.h | 7 ------- drivers/md/dm-zone.c | 10 ---------- drivers/md/dm.c | 7 ++++++- 3 files changed, 6 insertions(+), 18 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h index 37ddedf61249..d2d188c9b632 100644 --- a/drivers/md/dm-core.h +++ b/drivers/md/dm-core.h @@ -296,13 +296,6 @@ static inline void dm_io_set_flag(struct dm_io *io, unsigned int bit) io->flags |= (1U << bit); } -static inline void dm_io_inc_pending(struct dm_io *io) -{ - atomic_inc(&io->io_count); -} - -void dm_io_dec_pending(struct dm_io *io, blk_status_t error); - static inline struct completion *dm_get_completion_from_kobject(struct kobject *kobj) { return &container_of(kobj, struct dm_kobject_holder, kobj)->completion; } diff --git a/drivers/md/dm-zone.c b/drivers/md/dm-zone.c index 57daa86c19cf..3e7b1fe1580b 100644 --- a/drivers/md/dm-zone.c +++ b/drivers/md/dm-zone.c @@ -550,13 +550,6 @@ int dm_zone_map_bio(struct dm_target_io *tio) return DM_MAPIO_KILL; } - /* - * The target map function may issue and complete the IO quickly. - * Take an extra reference on the IO to make sure it does disappear - * until we run dm_zone_map_bio_end(). - */ - dm_io_inc_pending(io); - /* Let the target do its work */ r = ti->type->map(ti, clone); switch (r) { @@ -587,9 +580,6 @@ int dm_zone_map_bio(struct dm_target_io *tio) break; } - /* Drop the extra reference on the IO */ - dm_io_dec_pending(io, sts); - if (sts != BLK_STS_OK) return DM_MAPIO_KILL; diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 7a1a83b58677..7cae8235fbe1 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -955,6 +955,11 @@ static void dm_io_complete(struct dm_io *io) } } +static void dm_io_inc_pending(struct dm_io *io) +{ + atomic_inc(&io->io_count); +} + /* * Decrements the number of outstanding ios that a bio has been * cloned into, completing the original io if necc. @@ -978,7 +983,7 @@ static void dm_io_set_error(struct dm_io *io, blk_status_t error) spin_unlock_irqrestore(&io->lock, flags); } -void dm_io_dec_pending(struct dm_io *io, blk_status_t error) +static void dm_io_dec_pending(struct dm_io *io, blk_status_t error) { if (unlikely(error)) dm_io_set_error(io, error); -- cgit v1.2.3 From 0f14d60a023cc4fe68758e0fcdc0ce82a82dde7d Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 12 Apr 2022 16:56:15 +0800 Subject: dm: improve dm_io reference counting Currently each dm_io's reference counter is grabbed before calling __map_bio(); this isn't efficient, since the grab can be moved to initialization time inside alloc_io(). This also yields the typical async I/O reference counting model: one reference is for the submission side, the other is for the completion side, and the io won't be completed until both sides are done. 
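A minimal sketch of the two-reference model described above: one reference belongs to the submission side, one to the completion side, and the dm_io completes only after both have been dropped.

/* at allocation, in alloc_io(): one ref for submission, one for completion */
atomic_set(&io->io_count, 2);

/* submission side, once the last clone has been dispatched */
if (atomic_dec_and_test(&io->io_count))
	dm_io_complete(io);

/* completion side, e.g. clone_endio() for the final outstanding clone */
if (atomic_dec_and_test(&io->io_count))
	dm_io_complete(io);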
Signed-off-by: Ming Lei Signed-off-by: Mike Snitzer --- drivers/md/dm.c | 53 +++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 39 insertions(+), 14 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 7cae8235fbe1..6bc2dc8071fc 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -590,7 +590,9 @@ static struct dm_io *alloc_io(struct mapped_device *md, struct bio *bio) io = container_of(tio, struct dm_io, tio); io->magic = DM_IO_MAGIC; io->status = BLK_STS_OK; - atomic_set(&io->io_count, 1); + + /* one ref is for submission, the other is for completion */ + atomic_set(&io->io_count, 2); this_cpu_inc(*md->pending_io); io->orig_bio = bio; io->md = md; @@ -955,11 +957,6 @@ static void dm_io_complete(struct dm_io *io) } } -static void dm_io_inc_pending(struct dm_io *io) -{ - atomic_inc(&io->io_count); -} - /* * Decrements the number of outstanding ios that a bio has been * cloned into, completing the original io if necc. @@ -1316,7 +1313,6 @@ static void __map_bio(struct bio *clone) /* * Map the clone. */ - dm_io_inc_pending(io); tio->old_sector = clone->bi_iter.bi_sector; if (static_branch_unlikely(&swap_bios_enabled) && @@ -1426,11 +1422,12 @@ static void alloc_multiple_bios(struct bio_list *blist, struct clone_info *ci, } } -static void __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti, +static int __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti, unsigned num_bios, unsigned *len) { struct bio_list blist = BIO_EMPTY_LIST; struct bio *clone; + int ret = 0; switch (num_bios) { case 0: @@ -1440,6 +1437,7 @@ static void __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti, setup_split_accounting(ci, *len); clone = alloc_tio(ci, ti, 0, len, GFP_NOIO); __map_bio(clone); + ret = 1; break; default: /* dm_accept_partial_bio() is not supported with shared tio->len_ptr */ @@ -1447,9 +1445,12 @@ static void __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti, while ((clone = bio_list_pop(&blist))) { dm_tio_set_flag(clone_to_tio(clone), DM_TIO_IS_DUPLICATE_BIO); __map_bio(clone); + ret += 1; } break; } + + return ret; } static void __send_empty_flush(struct clone_info *ci) @@ -1470,8 +1471,19 @@ static void __send_empty_flush(struct clone_info *ci) ci->sector_count = 0; ci->io->tio.clone.bi_iter.bi_size = 0; - while ((ti = dm_table_get_target(ci->map, target_nr++))) - __send_duplicate_bios(ci, ti, ti->num_flush_bios, NULL); + while ((ti = dm_table_get_target(ci->map, target_nr++))) { + int bios; + + atomic_add(ti->num_flush_bios, &ci->io->io_count); + bios = __send_duplicate_bios(ci, ti, ti->num_flush_bios, NULL); + atomic_sub(ti->num_flush_bios - bios, &ci->io->io_count); + } + + /* + * alloc_io() takes one extra reference for submission, so the + * reference won't reach 0 without the following subtraction + */ + atomic_sub(1, &ci->io->io_count); bio_uninit(ci->bio); } @@ -1480,11 +1492,18 @@ static void __send_changing_extent_only(struct clone_info *ci, struct dm_target unsigned num_bios) { unsigned len; + int bios; len = min_t(sector_t, ci->sector_count, max_io_len_target_boundary(ti, dm_target_offset(ti, ci->sector))); - __send_duplicate_bios(ci, ti, num_bios, &len); + atomic_add(num_bios, &ci->io->io_count); + bios = __send_duplicate_bios(ci, ti, num_bios, &len); + /* + * alloc_io() takes one extra reference for submission, so the + * reference won't reach 0 without the following (+1) subtraction + */ + atomic_sub(num_bios - bios + 1, &ci->io->io_count); ci->sector += len; 
ci->sector_count -= len; @@ -1669,9 +1688,15 @@ out: * Add every dm_io instance into the hlist_head which is stored in * bio->bi_private, so that dm_poll_bio can poll them all. */ - if (error || !ci.submit_as_polled) - dm_io_dec_pending(ci.io, error); - else + if (error || !ci.submit_as_polled) { + /* + * In case of submission failure, the extra reference for + * submitting io isn't consumed yet + */ + if (error) + atomic_dec(&io->io_count); + dm_io_dec_pending(io, error); + } else dm_queue_poll_io(bio, io); } -- cgit v1.2.3 From ec211631ae24b1e700354f48c05fab5b3c617d83 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 12 Apr 2022 16:56:16 +0800 Subject: dm: put all polled dm_io instances into a single list Now that bio_split() isn't used by DM's bio splitting, it is a bit overkill to link dm_io into an hlist given there is only a single dm_io in the list. Convert to using a single list for holding all dm_io instances associated with this bio. Signed-off-by: Ming Lei Signed-off-by: Mike Snitzer --- drivers/md/dm-core.h | 2 +- drivers/md/dm.c | 52 +++++++++++++++++++++++++++------------------------- 2 files changed, 28 insertions(+), 26 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h index d2d188c9b632..f3cfc7affd12 100644 --- a/drivers/md/dm-core.h +++ b/drivers/md/dm-core.h @@ -259,7 +259,7 @@ struct dm_io { spinlock_t lock; unsigned long start_time; void *data; - struct hlist_node node; + struct dm_io *next; struct task_struct *map_task; struct dm_stats_aux stats_aux; diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 6bc2dc8071fc..cd084a74c65b 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1559,7 +1559,7 @@ static bool __process_abnormal_io(struct clone_info *ci, struct dm_target *ti, } /* - * Reuse ->bi_private as hlist head for storing all dm_io instances + * Reuse ->bi_private as dm_io list head for storing all dm_io instances * associated with this bio, and this bio's bi_private needs to be * stored in dm_io->data before the reuse. * * Any dm_io instance refered to this bio will be stored in this list, and we won't * touch it after splitting. Meantime it won't be changed by anyone after * bio is submitted. So this reuse is safe. */ -static inline struct hlist_head *dm_get_bio_hlist_head(struct bio *bio) +static inline struct dm_io **dm_poll_list_head(struct bio *bio) { - return (struct hlist_head *)&bio->bi_private; + return (struct dm_io **)&bio->bi_private; } static void dm_queue_poll_io(struct bio *bio, struct dm_io *io) { - struct hlist_head *head = dm_get_bio_hlist_head(bio); + struct dm_io **head = dm_poll_list_head(bio); if (!(bio->bi_opf & REQ_DM_POLL_LIST)) { bio->bi_opf |= REQ_DM_POLL_LIST; /* * Save .bi_private into dm_io, so that we can reuse * .bi_private as dm_io list head for storing dm_io list */ io->data = bio->bi_private; - INIT_HLIST_HEAD(head); - /* tell block layer to poll for completion */ bio->bi_cookie = ~BLK_QC_T_NONE; + + io->next = NULL; } else { /* * bio recursed due to split, reuse original poll list, * and save bio->bi_private too. 
*/ - io->data = hlist_entry(head->first, struct dm_io, node)->data; + io->data = (*head)->data; + io->next = *head; } - hlist_add_head(&io->node, head); + *head = io; } /* @@ -1685,8 +1686,8 @@ out: * Drop the extra reference count for non-POLLED bio, and hold one * reference for POLLED bio, which will be released in dm_poll_bio * - * Add every dm_io instance into the hlist_head which is stored in - * bio->bi_private, so that dm_poll_bio can poll them all. + * Add every dm_io instance into the dm_io list head which is stored + * in bio->bi_private, so that dm_poll_bio can poll them all. */ if (error || !ci.submit_as_polled) { /* @@ -1748,18 +1749,16 @@ static bool dm_poll_dm_io(struct dm_io *io, struct io_comp_batch *iob, static int dm_poll_bio(struct bio *bio, struct io_comp_batch *iob, unsigned int flags) { - struct hlist_head *head = dm_get_bio_hlist_head(bio); - struct hlist_head tmp = HLIST_HEAD_INIT; - struct hlist_node *next; - struct dm_io *io; + struct dm_io **head = dm_poll_list_head(bio); + struct dm_io *list = *head; + struct dm_io *tmp = NULL; + struct dm_io *curr, *next; /* Only poll normal bio which was marked as REQ_DM_POLL_LIST */ if (!(bio->bi_opf & REQ_DM_POLL_LIST)) return 0; - WARN_ON_ONCE(hlist_empty(head)); - - hlist_move_list(head, &tmp); + WARN_ON_ONCE(!list); /* * Restore .bi_private before possibly completing dm_io. * * bio_poll() is only possible once @bio has been completely * submitted via submit_bio_noacct(), so there is no chance of * clearing REQ_DM_POLL_LIST here. */ bio->bi_opf &= ~REQ_DM_POLL_LIST; - bio->bi_private = hlist_entry(tmp.first, struct dm_io, node)->data; + bio->bi_private = list->data; - hlist_for_each_entry_safe(io, next, &tmp, node) { - if (dm_poll_dm_io(io, iob, flags)) { - hlist_del_init(&io->node); + for (curr = list, next = curr->next; curr; curr = next, next = + curr ? curr->next : NULL) { + if (dm_poll_dm_io(curr, iob, flags)) { /* * clone_endio() has already occurred, so no * error handling is needed here. */ - __dm_io_dec_pending(io); + __dm_io_dec_pending(curr); + } else { + curr->next = tmp; + tmp = curr; } } /* Not done? */ - if (!hlist_empty(&tmp)) { + if (tmp) { bio->bi_opf |= REQ_DM_POLL_LIST; /* Reset bio->bi_private to dm_io list head */ - hlist_move_list(&tmp, head); + *head = tmp; return 0; } return 1; -- cgit v1.2.3 From 9d20653fe84ebd772c3af71808e6a727603e0b71 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Fri, 15 Apr 2022 20:08:23 -0400 Subject: dm: simplify bio-based IO accounting further Now that io splitting is recorded prior to, or during, ->map, IO accounting can happen immediately rather than being deferred until after bio splitting in dm_split_and_process_bio(). Remove the DM_IO_START_ACCT flag and also remove dm_io's map_task member because there is no longer any need to wait for splitting to occur before accounting. Also move dm_io struct's 'flags' member to consolidate struct holes. 
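To illustrate the struct-hole point with a hypothetical 64-bit layout: a 2-byte member followed by an 8-byte-aligned pointer leaves 6 bytes of padding, and grouping the small members together reclaims it.

struct before {			/* hypothetical example */
	unsigned short a;	/* 2 bytes, then 6 bytes padding */
	void *p;		/* 8 bytes */
	unsigned short b;	/* 2 bytes, then 6 bytes tail padding */
};				/* sizeof == 24 */

struct after {
	unsigned short a;	/* 2 bytes */
	unsigned short b;	/* 2 bytes, then 4 bytes padding */
	void *p;		/* 8 bytes */
};				/* sizeof == 16 */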
Signed-off-by: Mike Snitzer
---
 drivers/md/dm-core.h |  6 +-----
 drivers/md/dm.c      | 34 +++++-----------------------------
 2 files changed, 6 insertions(+), 34 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h
index f3cfc7affd12..d21648a923ea 100644
--- a/drivers/md/dm-core.h
+++ b/drivers/md/dm-core.h
@@ -255,15 +255,12 @@ static inline bool dm_tio_is_normal(struct dm_target_io *tio)
 #define DM_IO_MAGIC 19577
 struct dm_io {
 	unsigned short magic;
-
+	blk_short_t flags;
 	spinlock_t lock;
 	unsigned long start_time;
 	void *data;
 	struct dm_io *next;
-	struct task_struct *map_task;
 	struct dm_stats_aux stats_aux;
-
-	blk_short_t flags;
 	blk_status_t status;
 	atomic_t io_count;
 	struct mapped_device *md;
@@ -281,7 +278,6 @@ struct dm_io {
  * dm_io flags
  */
 enum {
-	DM_IO_START_ACCT,
 	DM_IO_ACCOUNTED,
 	DM_IO_WAS_SPLIT
 };
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index cd084a74c65b..50e081f68792 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -596,7 +596,6 @@ static struct dm_io *alloc_io(struct mapped_device *md, struct bio *bio)
 	this_cpu_inc(*md->pending_io);
 	io->orig_bio = bio;
 	io->md = md;
-	io->map_task = current;
 	spin_lock_init(&io->lock);
 	io->start_time = jiffies;
 	io->flags = 0;
@@ -1241,13 +1240,6 @@ void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors)
 }
 EXPORT_SYMBOL_GPL(dm_accept_partial_bio);
 
-static inline void __dm_submit_bio_remap(struct bio *clone,
-					 dev_t dev, sector_t old_sector)
-{
-	trace_block_bio_remap(clone, dev, old_sector);
-	submit_bio_noacct(clone);
-}
-
 /*
  * @clone: clone bio that DM core passed to target's .map function
  * @tgt_clone: clone of @clone bio that target needs submitted
@@ -1262,8 +1254,6 @@ void dm_submit_bio_remap(struct bio *clone, struct bio *tgt_clone)
 	struct dm_target_io *tio = clone_to_tio(clone);
 	struct dm_io *io = tio->io;
 
-	WARN_ON_ONCE(!tio->ti->accounts_remapped_io);
-
 	/* establish bio that will get submitted */
 	if (!tgt_clone)
 		tgt_clone = clone;
@@ -1272,15 +1262,11 @@ void dm_submit_bio_remap(struct bio *clone, struct bio *tgt_clone)
 	 * Account io->origin_bio to DM dev on behalf of target
 	 * that took ownership of IO with DM_MAPIO_SUBMITTED.
 	 */
-	if (io->map_task == current) {
-		/* Still in target's map function */
-		dm_io_set_flag(io, DM_IO_START_ACCT);
-	} else {
-		dm_start_io_acct(io, clone);
-	}
+	dm_start_io_acct(io, clone);
 
-	__dm_submit_bio_remap(tgt_clone, disk_devt(io->md->disk),
+	trace_block_bio_remap(tgt_clone, disk_devt(io->md->disk),
 			      tio->old_sector);
+	submit_bio_noacct(tgt_clone);
 }
 EXPORT_SYMBOL_GPL(dm_submit_bio_remap);
 
@@ -1340,16 +1326,10 @@ static void __map_bio(struct bio *clone)
 	case DM_MAPIO_SUBMITTED:
 		/* target has assumed ownership of this io */
 		if (!ti->accounts_remapped_io)
-			dm_io_set_flag(io, DM_IO_START_ACCT);
+			dm_start_io_acct(io, clone);
 		break;
 	case DM_MAPIO_REMAPPED:
-		/*
-		 * the bio has been remapped so dispatch it, but defer
-		 * dm_start_io_acct() until after possible bio_split().
-		 */
-		__dm_submit_bio_remap(clone, disk_devt(md->disk),
-				      tio->old_sector);
-		dm_io_set_flag(io, DM_IO_START_ACCT);
+		dm_submit_bio_remap(clone, NULL);
 		break;
 	case DM_MAPIO_KILL:
 	case DM_MAPIO_REQUEUE:
@@ -1667,7 +1647,6 @@ static void dm_split_and_process_bio(struct mapped_device *md,
 	}
 
 	error = __split_and_process_bio(&ci);
-	io->map_task = NULL;
 	if (error || !ci.sector_count)
 		goto out;
 	/*
@@ -1679,9 +1658,6 @@ static void dm_split_and_process_bio(struct mapped_device *md,
 	bio_inc_remaining(bio);
 	submit_bio_noacct(bio);
 out:
-	if (dm_io_flagged(io, DM_IO_START_ACCT))
-		dm_start_io_acct(io, NULL);
-
 	/*
 	 * Drop the extra reference count for non-POLLED bio, and hold one
 	 * reference for POLLED bio, which will be released in dm_poll_bio
--
cgit v1.2.3


From 4edadf6dcb54d2a86eeb424f27122dc0076d9267 Mon Sep 17 00:00:00 2001
From: Mike Snitzer
Date: Sun, 17 Apr 2022 13:00:15 -0400
Subject: dm: improve abnormal bio processing

Read/write/flush are the most common operations, so optimize the
switch in is_abnormal_io() for those cases. This follows the same
pattern established in the block perf-wip commit ("block: optimise
blk_may_split for normal rw").

Also, push the is_abnormal_io() check and blk_queue_split() down from
dm_submit_bio() to dm_split_and_process_bio() and set a new
'is_abnormal_io' flag in clone_info. Optimize __split_and_process_bio
and __process_abnormal_io by leveraging the ci.is_abnormal_io flag.

Signed-off-by: Mike Snitzer
---
 drivers/md/dm.c | 67 +++++++++++++++++++++++++++++++--------------------------
 1 file changed, 36 insertions(+), 31 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 50e081f68792..9650ba2075b8 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -84,7 +84,8 @@ struct clone_info {
 	struct dm_io *io;
 	sector_t sector;
 	unsigned sector_count;
-	bool submit_as_polled;
+	bool is_abnormal_io:1;
+	bool submit_as_polled:1;
 };
 
 #define DM_TARGET_IO_BIO_OFFSET (offsetof(struct dm_target_io, clone))
@@ -1491,21 +1492,24 @@ static void __send_changing_extent_only(struct clone_info *ci, struct dm_target
 
 static bool is_abnormal_io(struct bio *bio)
 {
-	bool r = false;
+	unsigned int op = bio_op(bio);
 
-	switch (bio_op(bio)) {
-	case REQ_OP_DISCARD:
-	case REQ_OP_SECURE_ERASE:
-	case REQ_OP_WRITE_ZEROES:
-		r = true;
-		break;
+	if (op != REQ_OP_READ && op != REQ_OP_WRITE && op != REQ_OP_FLUSH) {
+		switch (op) {
+		case REQ_OP_DISCARD:
+		case REQ_OP_SECURE_ERASE:
+		case REQ_OP_WRITE_ZEROES:
+			return true;
+		default:
+			break;
+		}
 	}
 
-	return r;
+	return false;
 }
 
-static bool __process_abnormal_io(struct clone_info *ci, struct dm_target *ti,
-				  blk_status_t *status)
+static blk_status_t __process_abnormal_io(struct clone_info *ci,
+					  struct dm_target *ti)
 {
 	unsigned num_bios = 0;
 
@@ -1519,8 +1523,6 @@ static bool __process_abnormal_io(struct clone_info *ci, struct dm_target *ti,
 	case REQ_OP_WRITE_ZEROES:
 		num_bios = ti->num_write_zeroes_bios;
 		break;
-	default:
-		return false;
 	}
 
 	/*
@@ -1530,12 +1532,10 @@ static bool __process_abnormal_io(struct clone_info *ci, struct dm_target *ti,
 	 * reconfiguration might also have changed that since the
 	 * check was performed.
 	 */
 	if (unlikely(!num_bios))
-		*status = BLK_STS_NOTSUPP;
-	else {
-		__send_changing_extent_only(ci, ti, num_bios);
-		*status = BLK_STS_OK;
-	}
-	return true;
+		return BLK_STS_NOTSUPP;
+
+	__send_changing_extent_only(ci, ti, num_bios);
+	return BLK_STS_OK;
 }
 
 /*
@@ -1588,11 +1588,12 @@ static blk_status_t __split_and_process_bio(struct clone_info *ci)
 	struct bio *clone;
 	struct dm_target *ti;
 	unsigned len;
-	blk_status_t error = BLK_STS_IOERR;
 
 	ti = dm_table_find_target(ci->map, ci->sector);
-	if (unlikely(!ti || __process_abnormal_io(ci, ti, &error)))
-		return error;
+	if (unlikely(!ti))
+		return BLK_STS_IOERR;
+	else if (unlikely(ci->is_abnormal_io))
+		return __process_abnormal_io(ci, ti);
 
 	/*
 	 * Only support bio polling for normal IO, and the target io is
@@ -1612,11 +1613,12 @@ static blk_status_t __split_and_process_bio(struct clone_info *ci)
 }
 
 static void init_clone_info(struct clone_info *ci, struct mapped_device *md,
-			    struct dm_table *map, struct bio *bio)
+			    struct dm_table *map, struct bio *bio, bool is_abnormal)
 {
 	ci->map = map;
 	ci->io = alloc_io(md, bio);
 	ci->bio = bio;
+	ci->is_abnormal_io = is_abnormal;
 	ci->submit_as_polled = false;
 	ci->sector = bio->bi_iter.bi_sector;
 	ci->sector_count = bio_sectors(bio);
@@ -1636,8 +1638,18 @@ static void dm_split_and_process_bio(struct mapped_device *md,
 	struct clone_info ci;
 	struct dm_io *io;
 	blk_status_t error = BLK_STS_OK;
+	bool is_abnormal;
 
-	init_clone_info(&ci, md, map, bio);
+	is_abnormal = is_abnormal_io(bio);
+	if (unlikely(is_abnormal)) {
+		/*
+		 * Use blk_queue_split() for abnormal IO (e.g. discard, etc)
+		 * otherwise associated queue_limits won't be imposed.
+		 */
+		blk_queue_split(&bio);
+	}
+
+	init_clone_info(&ci, md, map, bio, is_abnormal);
 	io = ci.io;
 
 	if (bio->bi_opf & REQ_PREFLUSH) {
@@ -1697,13 +1709,6 @@ static void dm_submit_bio(struct bio *bio)
 		goto out;
 	}
 
-	/*
-	 * Use blk_queue_split() for abnormal IO (e.g. discard, writesame, etc)
-	 * otherwise associated queue_limits won't be imposed.
-	 */
-	if (unlikely(is_abnormal_io(bio)))
-		blk_queue_split(&bio);
-
 	dm_split_and_process_bio(md, map, bio);
 out:
 	dm_put_live_table_bio(md, srcu_idx, bio);
--
cgit v1.2.3


From bfe2b0146c4d0230b68f5c71a64380ff8d361f8b Mon Sep 17 00:00:00 2001
From: Mikulas Patocka
Date: Sun, 24 Apr 2022 16:43:00 -0400
Subject: dm stats: add cond_resched when looping over entries

dm-stats can be used with a very large number of entries (it is only
limited by 1/4 of total system memory), so add rescheduling points to
the loops that iterate over the entries.
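
The shape of the fix is simply a voluntary preemption point per
iteration; a schematic sketch of the pattern (walk_entries(),
entry_do_work() and struct entry are hypothetical, not dm-stats code):

	#include <linux/sched.h>

	/* Schematic only: the names below are illustrative. */
	static void walk_entries(struct entry *entries, size_t n_entries)
	{
		size_t ni;

		for (ni = 0; ni < n_entries; ni++) {
			entry_do_work(&entries[ni]);

			/*
			 * The table may hold millions of entries; yield here
			 * so non-preemptible kernels don't stall other tasks
			 * or trip soft-lockup warnings.
			 */
			cond_resched();
		}
	}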
Cc: stable@vger.kernel.org
Signed-off-by: Mikulas Patocka
Signed-off-by: Mike Snitzer
---
 drivers/md/dm-stats.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-stats.c b/drivers/md/dm-stats.c
index 86e0697330e8..8326f9fe0e91 100644
--- a/drivers/md/dm-stats.c
+++ b/drivers/md/dm-stats.c
@@ -225,6 +225,7 @@ void dm_stats_cleanup(struct dm_stats *stats)
 					       atomic_read(&shared->in_flight[READ]),
 					       atomic_read(&shared->in_flight[WRITE]));
 			}
+			cond_resched();
 		}
 		dm_stat_free(&s->rcu_head);
 	}
@@ -330,6 +331,7 @@ static int dm_stats_create(struct dm_stats *stats, sector_t start, sector_t end,
 	for (ni = 0; ni < n_entries; ni++) {
 		atomic_set(&s->stat_shared[ni].in_flight[READ], 0);
 		atomic_set(&s->stat_shared[ni].in_flight[WRITE], 0);
+		cond_resched();
 	}
 
 	if (s->n_histogram_entries) {
@@ -342,6 +344,7 @@ static int dm_stats_create(struct dm_stats *stats, sector_t start, sector_t end,
 		for (ni = 0; ni < n_entries; ni++) {
 			s->stat_shared[ni].tmp.histogram = hi;
 			hi += s->n_histogram_entries + 1;
+			cond_resched();
 		}
 	}
 
@@ -362,6 +365,7 @@ static int dm_stats_create(struct dm_stats *stats, sector_t start, sector_t end,
 			for (ni = 0; ni < n_entries; ni++) {
 				p[ni].histogram = hi;
 				hi += s->n_histogram_entries + 1;
+				cond_resched();
 			}
 		}
 	}
@@ -500,6 +504,7 @@ static int dm_stats_list(struct dm_stats *stats, const char *program,
 			}
 			DMEMIT("\n");
 		}
+		cond_resched();
 	}
 	mutex_unlock(&stats->mutex);
 
@@ -777,6 +782,7 @@ static void __dm_stat_clear(struct dm_stat *s, size_t idx_start, size_t idx_end,
 				local_irq_enable();
 			}
 		}
+		cond_resched();
 	}
 }
 
@@ -892,6 +898,8 @@ static int dm_stats_print(struct dm_stats *stats, int id,
 
 		if (unlikely(sz + 1 >= maxlen))
 			goto buffer_overflow;
+
+		cond_resched();
 	}
 
 	if (clear)
--
cgit v1.2.3


From d3f2a14b8906df913cb04a706367b012db94a6e8 Mon Sep 17 00:00:00 2001
From: Dan Carpenter
Date: Mon, 25 Apr 2022 14:56:48 +0300
Subject: dm integrity: fix error code in dm_integrity_ctr()

The "r" variable shadows an earlier "r" that has function scope. This
means that we accidentally return success instead of an error code.
Smatch has a warning for this:

	drivers/md/dm-integrity.c:4503 dm_integrity_ctr()
	warn: missing error code 'r'

Fixes: 7eada909bfd7 ("dm: add integrity target")
Cc: stable@vger.kernel.org
Signed-off-by: Dan Carpenter
Reviewed-by: Mikulas Patocka
Signed-off-by: Mike Snitzer
---
 drivers/md/dm-integrity.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c
index 36ae30b73a6e..3d5a0ce123c9 100644
--- a/drivers/md/dm-integrity.c
+++ b/drivers/md/dm-integrity.c
@@ -4494,8 +4494,6 @@ try_smaller_buffer:
 	}
 
 	if (should_write_sb) {
-		int r;
-
 		init_journal(ic, 0, ic->journal_sections, 0);
 		r = dm_integrity_failed(ic);
 		if (unlikely(r)) {
--
cgit v1.2.3


From 567dd8f34560fa221a6343729474536aa7ede4fd Mon Sep 17 00:00:00 2001
From: Mikulas Patocka
Date: Mon, 25 Apr 2022 08:53:29 -0400
Subject: dm crypt: make printing of the key constant-time

The device mapper dm-crypt target is using scnprintf("%02x",
cc->key[i]) to report the current key to userspace. However, this is
not a constant-time operation and it may leak information about the
key via timing, via cache access patterns or via the branch predictor.

Change dm-crypt's key printing to use "%c" instead of "%02x". Also
introduce hex2asc() that carefully avoids any branching or memory
accesses when converting a number in the range 0 ... 15 to an ASCII
character.
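
To see why the hex2asc() added below is branch-free, work the
arithmetic: for c in 0..9 the term (unsigned)(9 - c) >> 4 is 0, so the
result is c + '0'; for c in 10..15 the subtraction wraps to a huge
unsigned value whose low bits survive the shift, and masking with 0x27
yields 39, which is exactly 'a' - '0' - 10. A standalone userspace
check of all 16 inputs:

	#include <assert.h>
	#include <stdio.h>

	/* Same expression the patch below adds to dm-crypt. */
	static char hex2asc(unsigned char c)
	{
		return c + '0' + ((unsigned)(9 - c) >> 4 & 0x27);
	}

	int main(void)
	{
		for (unsigned c = 0; c < 16; c++) {
			/* 0..9 -> '0'..'9', 10..15 -> 'a'..'f' */
			assert(hex2asc(c) == "0123456789abcdef"[c]);
			putchar(hex2asc(c));
		}
		putchar('\n');	/* prints: 0123456789abcdef */
		return 0;
	}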
Cc: stable@vger.kernel.org
Signed-off-by: Mikulas Patocka
Tested-by: Milan Broz
Signed-off-by: Mike Snitzer
---
 drivers/md/dm-crypt.c | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index fb80539865d7..159c6806c19b 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -3439,6 +3439,11 @@ static int crypt_map(struct dm_target *ti, struct bio *bio)
 	return DM_MAPIO_SUBMITTED;
 }
 
+static char hex2asc(unsigned char c)
+{
+	return c + '0' + ((unsigned)(9 - c) >> 4 & 0x27);
+}
+
 static void crypt_status(struct dm_target *ti, status_type_t type,
 			 unsigned status_flags, char *result, unsigned maxlen)
 {
@@ -3457,9 +3462,12 @@ static void crypt_status(struct dm_target *ti, status_type_t type,
 		if (cc->key_size > 0) {
 			if (cc->key_string)
 				DMEMIT(":%u:%s", cc->key_size, cc->key_string);
-			else
-				for (i = 0; i < cc->key_size; i++)
-					DMEMIT("%02x", cc->key[i]);
+			else {
+				for (i = 0; i < cc->key_size; i++) {
+					DMEMIT("%c%c", hex2asc(cc->key[i] >> 4),
+					       hex2asc(cc->key[i] & 0xf));
+				}
+			}
 		} else
 			DMEMIT("-");
 
--
cgit v1.2.3


From c06dfd124d46df9c482fbd1319b5fe19bcb1a110 Mon Sep 17 00:00:00 2001
From: Gabriel Krisman Bertazi
Date: Wed, 27 Apr 2022 12:57:10 -0400
Subject: dm mpath: provide high-resolution timer to HST for bio-based

The precision loss of reading the IO start_time with jiffies_to_nsecs
instead of using a high-resolution timer degrades HST path prediction
for BIO-based mpath on high-load workloads.

Below, I show the utilization percentage of a 10-disk multipath with
asymmetrical disk access cost, while being exercised by a randwrite
FIO benchmark with a high submission queue depth (depth=64). It is
possible to see that the HST path selection degrades heavily for
high-iops in BIO-mpath, underutilizing the slower paths far beyond
expected. This seems to be caused by the start_time truncation, which
makes some IO seem much slower than it actually is. In this scenario
ST outperforms HST for bio-mpath, but not for mq-mpath, which already
uses ktime_get_ns(). The third column shows utilization with this
patch applied. It is easy to see that now HST prediction is much
closer to the ideal distribution (calculated considering the real cost
of each path).

|     |   ST | HST (orig) | HST (ktime) | Best |
| sdd | 0.17 |       0.20 |        0.17 | 0.18 |
| sde | 0.17 |       0.20 |        0.17 | 0.18 |
| sdf | 0.17 |       0.20 |        0.17 | 0.18 |
| sdg | 0.06 |       0.00 |        0.06 | 0.04 |
| sdh | 0.03 |       0.00 |        0.03 | 0.02 |
| sdi | 0.03 |       0.00 |        0.03 | 0.02 |
| sdj | 0.02 |       0.00 |        0.01 | 0.01 |
| sdk | 0.02 |       0.00 |        0.01 | 0.01 |
| sdl | 0.17 |       0.20 |        0.17 | 0.18 |
| sdm | 0.17 |       0.20 |        0.17 | 0.18 |

This issue was originally discussed [1] when we first merged HST, and
this patch was left as low-hanging fruit to be solved later.

Regarding the implementation, as suggested by Mike in that mail
thread, in order to avoid the overhead of ktime_get_ns() for other
selectors, this patch adds a flag for the selector code to request the
high-resolution timer.

I tested this using the same benchmark used in the original HST
submission.
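
To put rough numbers on the precision loss (HZ=250 is an assumed,
typical config value): a jiffy is then 4 ms, and a jiffies-based start
time is rounded down to the previous tick while the completion time is
precise, so measured latency can inflate by up to a full jiffy:

	#include <stdio.h>

	#define HZ 250	/* assumed; the real value is a kernel config choice */

	int main(void)
	{
		unsigned long long jiffy_ns = 1000000000ULL / HZ; /* 4,000,000 ns */
		unsigned long long io_ns = 100000;	/* a 100 us IO on a fast path */

		/* Worst case: the IO starts just before the next tick. */
		printf("actual: %llu ns, worst measured: %llu ns (~%llux)\n",
		       io_ns, io_ns + jiffy_ns - 1, (io_ns + jiffy_ns) / io_ns);
		return 0;
	}

Noise of that magnitude swamps the real latency differences between
paths, which is what degrades HST's predictions in the table above.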
Full test and benchmark scripts are available here:

  https://people.collabora.com/~krisman/HST-BIO-MPATH/

[1] https://lore.kernel.org/lkml/85tv0am9de.fsf@collabora.com/T/

Signed-off-by: Gabriel Krisman Bertazi
[snitzer: cleaned up various implementation details]
Signed-off-by: Mike Snitzer
---
 drivers/md/dm-mpath.c                      |  8 +++++++-
 drivers/md/dm-path-selector.h              | 15 +++++++++++++++
 drivers/md/dm-ps-historical-service-time.c |  1 +
 3 files changed, 23 insertions(+), 1 deletion(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 6ed9d2731254..0e325469a252 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -105,6 +105,7 @@ struct multipath {
 struct dm_mpath_io {
 	struct pgpath *pgpath;
 	size_t nr_bytes;
+	u64 start_time_ns;
 };
 
 typedef int (*action_fn) (struct pgpath *pgpath);
@@ -295,6 +296,7 @@ static void multipath_init_per_bio_data(struct bio *bio, struct dm_mpath_io **mp
 
 	mpio->nr_bytes = bio->bi_iter.bi_size;
 	mpio->pgpath = NULL;
+	mpio->start_time_ns = 0;
 	*mpio_p = mpio;
 
 	dm_bio_record(bio_details, bio);
@@ -647,6 +649,9 @@ static int __multipath_map_bio(struct multipath *m, struct bio *bio,
 
 	mpio->pgpath = pgpath;
 
+	if (dm_ps_use_hr_timer(pgpath->pg->ps.type))
+		mpio->start_time_ns = ktime_get_ns();
+
 	bio->bi_status = 0;
 	bio_set_dev(bio, pgpath->path.dev->bdev);
 	bio->bi_opf |= REQ_FAILFAST_TRANSPORT;
@@ -1713,7 +1718,8 @@ done:
 		if (ps->type->end_io)
 			ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes,
-					 dm_start_time_ns_from_clone(clone));
+					 (mpio->start_time_ns ?:
+					  dm_start_time_ns_from_clone(clone)));
 	}
 
 	return r;
diff --git a/drivers/md/dm-path-selector.h b/drivers/md/dm-path-selector.h
index c47bc0e20275..83cac2b04b66 100644
--- a/drivers/md/dm-path-selector.h
+++ b/drivers/md/dm-path-selector.h
@@ -26,11 +26,26 @@ struct path_selector {
 	void *context;
 };
 
+/*
+ * If a path selector uses this flag, a high resolution timer is used
+ * (via ktime_get_ns) to account for IO start time in BIO-based mpath.
+ * This improves performance of some path selectors (i.e. HST), in
+ * exchange for slightly higher overhead when submitting the BIO.
+ * The extra cost is usually offset by improved path selection for
+ * some benchmarks.
+ *
+ * This has no effect for request-based mpath, since it already uses a
+ * higher precision timer by default.
+ */
+#define DM_PS_USE_HR_TIMER		0x00000001
+#define dm_ps_use_hr_timer(type)	((type)->features & DM_PS_USE_HR_TIMER)
+
 /* Information about a path selector type */
 struct path_selector_type {
 	char *name;
 	struct module *module;
 
+	unsigned int features;
 	unsigned int table_args;
 	unsigned int info_args;
 
diff --git a/drivers/md/dm-ps-historical-service-time.c b/drivers/md/dm-ps-historical-service-time.c
index 82f2a06153dc..1d82c95d323d 100644
--- a/drivers/md/dm-ps-historical-service-time.c
+++ b/drivers/md/dm-ps-historical-service-time.c
@@ -523,6 +523,7 @@ static int hst_end_io(struct path_selector *ps, struct dm_path *path,
 static struct path_selector_type hst_ps = {
 	.name		= "historical-service-time",
 	.module		= THIS_MODULE,
+	.features	= DM_PS_USE_HR_TIMER,
 	.table_args	= 1,
 	.info_args	= 3,
 	.create		= hst_create,
--
cgit v1.2.3


From d254c3699fddb671bc555f042c96ec033582ae72 Mon Sep 17 00:00:00 2001
From: Guo Zhengkui
Date: Thu, 5 May 2022 00:28:26 +0800
Subject: dm cache metadata: remove unnecessary variable in __dump_mapping

Fix the following coccicheck warning:

	drivers/md/dm-cache-metadata.c:1512:5-6: Unneeded variable: "r".
	Return "0" on line 1520.
Signed-off-by: Guo Zhengkui
Signed-off-by: Mike Snitzer
---
 drivers/md/dm-cache-metadata.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c
index 2874f222c313..ab13b7380265 100644
--- a/drivers/md/dm-cache-metadata.c
+++ b/drivers/md/dm-cache-metadata.c
@@ -1509,7 +1509,6 @@ int dm_cache_load_mappings(struct dm_cache_metadata *cmd,
 
 static int __dump_mapping(void *context, uint64_t cblock, void *leaf)
 {
-	int r = 0;
 	__le64 value;
 	dm_oblock_t oblock;
 	unsigned flags;
@@ -1517,7 +1516,7 @@ static int __dump_mapping(void *context, uint64_t cblock, void *leaf)
 	memcpy(&value, leaf, sizeof(value));
 	unpack_value(value, &oblock, &flags);
 
-	return r;
+	return 0;
}
 
 static int __dump_mappings(struct dm_cache_metadata *cmd)
--
cgit v1.2.3


From ca522482e3eafd005b8d4e8b1331c911505a58d5 Mon Sep 17 00:00:00 2001
From: Mike Snitzer
Date: Wed, 11 May 2022 09:38:38 -0400
Subject: dm: pass NULL bdev to bio_alloc_clone

Most DM targets will remap the clone bio passed to their ->map
function using bio_set_dev(). So this change to pass a NULL bdev to
bio_alloc_clone avoids clone-time work that sets up resources for a
bdev association that will not be used in practice (e.g. a clone
issued to the underlying device will not use the DM device's
blk-cgroup resources).

But clone->bi_bdev is still initialized following bio_alloc_clone to
preserve DM target expectations that clone->bi_bdev will be set.
Follow-up work is needed to audit DM targets to remove accesses to a
clone->bi_bdev that the target didn't initialize with bio_set_dev().

Depends-on: 7ecc56c62b27 ("block: allow passing a NULL bdev to bio_alloc_clone/bio_init_clone")
Signed-off-by: Mike Snitzer
---
 drivers/md/dm.c | 40 ++++++++++++++++++++++++----------------
 1 file changed, 24 insertions(+), 16 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 9650ba2075b8..d62f1354ecbf 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -581,7 +581,9 @@ static struct dm_io *alloc_io(struct mapped_device *md, struct bio *bio)
 	struct dm_target_io *tio;
 	struct bio *clone;
 
-	clone = bio_alloc_clone(bio->bi_bdev, bio, GFP_NOIO, &md->io_bs);
+	clone = bio_alloc_clone(NULL, bio, GFP_NOIO, &md->io_bs);
+	/* Set default bdev, but target must bio_set_dev() before issuing IO */
+	clone->bi_bdev = md->disk->part0;
 
 	tio = clone_to_tio(clone);
 	tio->flags = 0;
@@ -613,7 +615,7 @@ static void free_io(struct dm_io *io)
 }
 
 static struct bio *alloc_tio(struct clone_info *ci, struct dm_target *ti,
-		unsigned target_bio_nr, unsigned *len, gfp_t gfp_mask)
+			     unsigned target_bio_nr, unsigned *len, gfp_t gfp_mask)
 {
 	struct dm_target_io *tio;
 	struct bio *clone;
@@ -624,10 +626,13 @@ static struct bio *alloc_tio(struct clone_info *ci, struct dm_target *ti,
 		/* alloc_io() already initialized embedded clone */
 		clone = &tio->clone;
 	} else {
-		clone = bio_alloc_clone(ci->bio->bi_bdev, ci->bio,
-					gfp_mask, &ci->io->md->bs);
+		struct mapped_device *md = ci->io->md;
+
+		clone = bio_alloc_clone(NULL, ci->bio, gfp_mask, &md->bs);
 		if (!clone)
 			return NULL;
+		/* Set default bdev, but target must bio_set_dev() before issuing IO */
+		clone->bi_bdev = md->disk->part0;
 
 		/* REQ_DM_POLL_LIST shouldn't be inherited */
 		clone->bi_opf &= ~REQ_DM_POLL_LIST;
@@ -1012,25 +1017,28 @@ static bool swap_bios_limit(struct dm_target *ti, struct bio *bio)
 static void clone_endio(struct bio *bio)
 {
 	blk_status_t error = bio->bi_status;
-	struct request_queue *q = bio->bi_bdev->bd_disk->queue;
 	struct dm_target_io *tio = clone_to_tio(bio);
 	struct dm_target *ti = tio->ti;
 	dm_endio_fn endio = ti->type->end_io;
 	struct dm_io *io = tio->io;
 	struct mapped_device *md = io->md;
 
-	if (unlikely(error == BLK_STS_TARGET)) {
-		if (bio_op(bio) == REQ_OP_DISCARD &&
-		    !bdev_max_discard_sectors(bio->bi_bdev))
-			disable_discard(md);
-		else if (bio_op(bio) == REQ_OP_WRITE_ZEROES &&
-			 !q->limits.max_write_zeroes_sectors)
-			disable_write_zeroes(md);
-	}
+	if (likely(bio->bi_bdev != md->disk->part0)) {
+		struct request_queue *q = bdev_get_queue(bio->bi_bdev);
 
-	if (static_branch_unlikely(&zoned_enabled) &&
-	    unlikely(blk_queue_is_zoned(q)))
-		dm_zone_endio(io, bio);
+		if (unlikely(error == BLK_STS_TARGET)) {
+			if (bio_op(bio) == REQ_OP_DISCARD &&
+			    !bdev_max_discard_sectors(bio->bi_bdev))
+				disable_discard(md);
+			else if (bio_op(bio) == REQ_OP_WRITE_ZEROES &&
+				 !q->limits.max_write_zeroes_sectors)
+				disable_write_zeroes(md);
+		}
+
+		if (static_branch_unlikely(&zoned_enabled) &&
+		    unlikely(blk_queue_is_zoned(q)))
+			dm_zone_endio(io, bio);
+	}
 
 	if (endio) {
 		int r = endio(ti, bio, &error);
--
cgit v1.2.3
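
The clone lifecycle the last patch establishes, as a condensed sketch
drawn from the hunks above (my_target and its fields are hypothetical;
only the bio_alloc_clone/bio_set_dev calls are from the patch):

	/* DM core: allocate the clone with no bdev association. */
	clone = bio_alloc_clone(NULL, bio, GFP_NOIO, &md->io_bs);
	/* Stamp a default so clone->bi_bdev is never NULL ... */
	clone->bi_bdev = md->disk->part0;

	/* ... and a target's ->map remaps it to the real device: */
	static int my_target_map(struct dm_target *ti, struct bio *clone)
	{
		struct my_target *mt = ti->private;	/* hypothetical target */

		/* bio_set_dev() performs the real bdev (and blkcg) setup */
		bio_set_dev(clone, mt->dev->bdev);
		return DM_MAPIO_REMAPPED;
	}

clone_endio()'s new bi_bdev != md->disk->part0 check then cheaply skips
queue-limit and zone handling for clones that were never remapped.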