From 6ba01df72b4b63a26b4977790f58d8f775d2992c Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Tue, 5 Nov 2019 10:43:44 -0500 Subject: dm table: do not allow request-based DM to stack on partitions Partitioned request-based devices cannot be used as underlying devices for request-based DM because no partition offsets are added to each incoming request. As such, until now, stacking on partitioned devices would _always_ result in data corruption (e.g. wiping the partition table, writing to other partitions, etc). Fix this by disallowing request-based stacking on partitions. While at it, since all .request_fn support has been removed from block core, remove legacy dm-table code that differentiated between blk-mq and .request_fn request-based. Signed-off-by: Mike Snitzer --- drivers/md/dm-table.c | 27 ++++++++------------------- 1 file changed, 8 insertions(+), 19 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 52e049554f5c..2ae0c1913766 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -918,21 +918,15 @@ bool dm_table_supports_dax(struct dm_table *t, static bool dm_table_does_not_support_partial_completion(struct dm_table *t); -struct verify_rq_based_data { - unsigned sq_count; - unsigned mq_count; -}; - -static int device_is_rq_based(struct dm_target *ti, struct dm_dev *dev, - sector_t start, sector_t len, void *data) +static int device_is_rq_stackable(struct dm_target *ti, struct dm_dev *dev, + sector_t start, sector_t len, void *data) { - struct request_queue *q = bdev_get_queue(dev->bdev); - struct verify_rq_based_data *v = data; + struct block_device *bdev = dev->bdev; + struct request_queue *q = bdev_get_queue(bdev); - if (queue_is_mq(q)) - v->mq_count++; - else - v->sq_count++; + /* request-based cannot stack on partitions! */ + if (bdev != bdev->bd_contains) + return false; return queue_is_mq(q); } @@ -941,7 +935,6 @@ static int dm_table_determine_type(struct dm_table *t) { unsigned i; unsigned bio_based = 0, request_based = 0, hybrid = 0; - struct verify_rq_based_data v = {.sq_count = 0, .mq_count = 0}; struct dm_target *tgt; struct list_head *devices = dm_table_get_devices(t); enum dm_queue_mode live_md_type = dm_get_md_type(t->md); @@ -1045,14 +1038,10 @@ verify_rq_based: /* Non-request-stackable devices can't be used for request-based dm */ if (!tgt->type->iterate_devices || - !tgt->type->iterate_devices(tgt, device_is_rq_based, &v)) { + !tgt->type->iterate_devices(tgt, device_is_rq_stackable, NULL)) { DMERR("table load rejected: including non-request-stackable devices"); return -EINVAL; } - if (v.sq_count > 0) { - DMERR("table load rejected: not all devices are blk-mq request-stackable"); - return -EINVAL; - } return 0; } -- cgit v1.2.3 From 22c992e1a868478b9fe83701cdf6329103c2ac06 Mon Sep 17 00:00:00 2001 From: Heinz Mauelshagen Date: Tue, 1 Oct 2019 17:47:52 +0200 Subject: dm raid: change rs_set_dev_and_array_sectors API and callers Add a size argument to rs_set_dev_and_array_sectors as a prerequisite to fixing grown device resynchronization not occurring when new MD bitmap pages have to be allocated as a result of the extension in a follow-up patch. Also avoid code duplication by using rs_set_rdev_sectors in the aforementioned function.
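A minimal caller-side sketch of the changed API (the function names come from the patch below; the wrapper function itself is purely illustrative): the size a raid set should be set to is now passed in explicitly rather than always being derived from rs->ti->len inside the helper.

	/* Illustrative sketch only, not part of the patch. */
	static int example_set_sizes(struct raid_set *rs)
	{
		int r;

		/* Constructor path: size the raid set to the target length. */
		r = rs_set_dev_and_array_sectors(rs, rs->ti->len, false);
		if (r)
			return r;

		/* Reshape path: same explicit size, but honor mddev geometry. */
		return rs_set_dev_and_array_sectors(rs, rs->ti->len, true);
	}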
Signed-off-by: Heinz Mauelshagen Signed-off-by: Mike Snitzer --- drivers/md/dm-raid.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index b0aa595e4375..89f805e851cf 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -1615,13 +1615,12 @@ static int _check_data_dev_sectors(struct raid_set *rs) } /* Calculate the sectors per device and per array used for @rs */ -static int rs_set_dev_and_array_sectors(struct raid_set *rs, bool use_mddev) +static int rs_set_dev_and_array_sectors(struct raid_set *rs, sector_t sectors, bool use_mddev) { int delta_disks; unsigned int data_stripes; + sector_t array_sectors = sectors, dev_sectors = sectors; struct mddev *mddev = &rs->md; - struct md_rdev *rdev; - sector_t array_sectors = rs->ti->len, dev_sectors = rs->ti->len; if (use_mddev) { delta_disks = mddev->delta_disks; @@ -1656,12 +1655,9 @@ static int rs_set_dev_and_array_sectors(struct raid_set *rs, bool use_mddev) /* Striped layouts */ array_sectors = (data_stripes + delta_disks) * dev_sectors; - rdev_for_each(rdev, mddev) - if (!test_bit(Journal, &rdev->flags)) - rdev->sectors = dev_sectors; - mddev->array_sectors = array_sectors; mddev->dev_sectors = dev_sectors; + rs_set_rdev_sectors(rs); return _check_data_dev_sectors(rs); bad: @@ -2911,7 +2907,7 @@ static int rs_setup_reshape(struct raid_set *rs) /* Remove disk(s) */ } else if (rs->delta_disks < 0) { - r = rs_set_dev_and_array_sectors(rs, true); + r = rs_set_dev_and_array_sectors(rs, rs->ti->len, true); mddev->reshape_backwards = 1; /* removing disk(s) -> backward reshape */ /* Change layout and/or chunk size */ @@ -3067,7 +3063,7 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv) * * Any existing superblock will overwrite the array and device sizes */ - r = rs_set_dev_and_array_sectors(rs, false); + r = rs_set_dev_and_array_sectors(rs, rs->ti->len, false); if (r) goto bad; -- cgit v1.2.3 From 99273d9e6e19ce7bc393d294d60b8e0c0b971121 Mon Sep 17 00:00:00 2001 From: Heinz Mauelshagen Date: Tue, 1 Oct 2019 17:47:53 +0200 Subject: dm raid: to ensure resynchronization, perform raid set grow in preresume This fixes a flaw causing raid set extensions not to be synchronized in case the MD bitmap resize required additional pages to be allocated. Also share resize code in the raid constructor between new size changes and those occurring during recovery. Bump the target version to define the change and document it in Documentation/admin-guide/device-mapper/dm-raid.rst. Reported-by: Steve D Signed-off-by: Heinz Mauelshagen Signed-off-by: Mike Snitzer --- .../admin-guide/device-mapper/dm-raid.rst | 2 + drivers/md/dm-raid.c | 81 ++++++++++++++++------ 2 files changed, 62 insertions(+), 21 deletions(-) (limited to 'drivers') diff --git a/Documentation/admin-guide/device-mapper/dm-raid.rst b/Documentation/admin-guide/device-mapper/dm-raid.rst index 2fe255b130fb..f6344675e395 100644 --- a/Documentation/admin-guide/device-mapper/dm-raid.rst +++ b/Documentation/admin-guide/device-mapper/dm-raid.rst @@ -417,3 +417,5 @@ Version History deadlock/potential data corruption. Update superblock when specific devices are requested via rebuild. Fix RAID leg rebuild errors.
+ 1.15.0 Fix size extensions not being synchronized in case of new MD bitmap + pages allocated; also fix those not occurring after previous reductions diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 89f805e851cf..5c84215d9b62 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -209,6 +209,7 @@ struct raid_dev { #define RT_FLAG_RS_SUSPENDED 5 #define RT_FLAG_RS_IN_SYNC 6 #define RT_FLAG_RS_RESYNCING 7 +#define RT_FLAG_RS_GROW 8 /* Array elements of 64 bit needed for rebuild/failed disk bits */ #define DISKS_ARRAY_ELEMS ((MAX_RAID_DEVICES + (sizeof(uint64_t) * 8 - 1)) / sizeof(uint64_t) / 8) @@ -241,6 +242,9 @@ struct raid_set { struct raid_type *raid_type; struct dm_target_callbacks callbacks; + sector_t array_sectors; + sector_t dev_sectors; + /* Optional raid4/5/6 journal device */ struct journal_dev { struct dm_dev *dev; @@ -3004,7 +3008,7 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv) bool resize = false; struct raid_type *rt; unsigned int num_raid_params, num_raid_devs; - sector_t calculated_dev_sectors, rdev_sectors, reshape_sectors; + sector_t sb_array_sectors, rdev_sectors, reshape_sectors; struct raid_set *rs = NULL; const char *arg; struct rs_layout rs_layout; @@ -3067,7 +3071,9 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv) if (r) goto bad; - calculated_dev_sectors = rs->md.dev_sectors; + /* Memorize just calculated, potentially larger sizes to grow the raid set in preresume */ + rs->array_sectors = rs->md.array_sectors; + rs->dev_sectors = rs->md.dev_sectors; /* * Backup any new raid set level, layout, ... @@ -3080,6 +3086,8 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv) if (r) goto bad; + /* All in-core metadata now as of current superblocks after calling analyse_superblocks() */ + sb_array_sectors = rs->md.array_sectors; rdev_sectors = __rdev_sectors(rs); if (!rdev_sectors) { ti->error = "Invalid rdev size"; @@ -3089,8 +3097,11 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv) reshape_sectors = _get_reshape_sectors(rs); - if (calculated_dev_sectors != rdev_sectors) - resize = calculated_dev_sectors != (reshape_sectors ? rdev_sectors - reshape_sectors : rdev_sectors); + if (rs->dev_sectors != rdev_sectors) { + resize = (rs->dev_sectors != rdev_sectors - reshape_sectors); + if (rs->dev_sectors > rdev_sectors - reshape_sectors) + set_bit(RT_FLAG_RS_GROW, &rs->runtime_flags); + } INIT_WORK(&rs->md.event_work, do_table_event); ti->private = rs; @@ -3117,13 +3128,8 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv) set_bit(RT_FLAG_UPDATE_SBS, &rs->runtime_flags); rs_set_new(rs); } else if (rs_is_recovering(rs)) { - /* Rebuild particular devices */ - if (test_bit(__CTR_FLAG_REBUILD, &rs->ctr_flags)) { - set_bit(RT_FLAG_UPDATE_SBS, &rs->runtime_flags); - rs_setup_recovery(rs, MaxSector); - } /* A recovering raid set may be resized */ - ; /* skip setup rs */ + goto size_check; } else if (rs_is_reshaping(rs)) { /* Have to reject size change request during reshape */ if (resize) { @@ -3167,6 +3173,9 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv) rs_setup_recovery(rs, MaxSector); rs_set_new(rs); } else if (rs_reshape_requested(rs)) { + /* Only on size extensions, not on reshapes. */ + clear_bit(RT_FLAG_RS_GROW, &rs->runtime_flags); + /* * No need to check for 'ongoing' takeover here, because takeover * is an instant operation as opposed to an ongoing reshape.
@@ -3197,13 +3206,30 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv) } rs_set_cur(rs); } else { +size_check: /* May not set recovery when a device rebuild is requested */ if (test_bit(__CTR_FLAG_REBUILD, &rs->ctr_flags)) { - rs_setup_recovery(rs, MaxSector); + clear_bit(RT_FLAG_RS_GROW, &rs->runtime_flags); set_bit(RT_FLAG_UPDATE_SBS, &rs->runtime_flags); - } else - rs_setup_recovery(rs, test_bit(__CTR_FLAG_SYNC, &rs->ctr_flags) ? - 0 : (resize ? calculated_dev_sectors : MaxSector)); + } else if (test_bit(RT_FLAG_RS_GROW, &rs->runtime_flags)) { + /* + * Set raid set to current size, i.e. non-grown size + * as of superblocks to grow to new size in preresume. + */ + r = rs_set_dev_and_array_sectors(rs, sb_array_sectors, false); + if (r) + goto bad; + + rs_setup_recovery(rs, rs->md.recovery_cp < rs->md.dev_sectors ? rs->md.recovery_cp : rs->md.dev_sectors); + } else { + /* This is no size change or it is shrinking, update size and record in superblocks */ + r = rs_set_dev_and_array_sectors(rs, rs->ti->len, false); + if (r) + goto bad; + + if (sb_array_sectors > rs->array_sectors) + set_bit(RT_FLAG_UPDATE_SBS, &rs->runtime_flags); + } rs_set_cur(rs); } @@ -3951,11 +3977,22 @@ static int raid_preresume(struct dm_target *ti) if (r) return r; - /* Resize bitmap to adjust to changed region size (aka MD bitmap chunksize) */ - if (test_bit(RT_FLAG_RS_BITMAP_LOADED, &rs->runtime_flags) && mddev->bitmap && - mddev->bitmap_info.chunksize != to_bytes(rs->requested_bitmap_chunk_sectors)) { - r = md_bitmap_resize(mddev->bitmap, mddev->dev_sectors, - to_bytes(rs->requested_bitmap_chunk_sectors), 0); + /* We are extending the raid set size, adjust mddev/md_rdev sizes and set capacity. */ + if (test_bit(RT_FLAG_RS_GROW, &rs->runtime_flags)) { + mddev->array_sectors = rs->array_sectors; + mddev->dev_sectors = rs->dev_sectors; + rs_set_rdev_sectors(rs); + rs_set_capacity(rs); + } + + /* Resize bitmap to adjust to changed region size (aka MD bitmap chunksize) or grown device size */ + if (test_bit(RT_FLAG_RS_BITMAP_LOADED, &rs->runtime_flags) && mddev->bitmap && + (test_bit(RT_FLAG_RS_GROW, &rs->runtime_flags) || + (rs->requested_bitmap_chunk_sectors && + mddev->bitmap_info.chunksize != to_bytes(rs->requested_bitmap_chunk_sectors)))) { + int chunksize = to_bytes(rs->requested_bitmap_chunk_sectors) ?: mddev->bitmap_info.chunksize; + + r = md_bitmap_resize(mddev->bitmap, mddev->dev_sectors, chunksize, 0); if (r) DMERR("Failed to resize bitmap"); } @@ -3964,8 +4001,10 @@ static int raid_preresume(struct dm_target *ti) /* Be prepared for mddev_resume() in raid_resume() */ set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); if (mddev->recovery_cp && mddev->recovery_cp < MaxSector) { - set_bit(MD_RECOVERY_SYNC, &mddev->recovery); + set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); mddev->resync_min = mddev->recovery_cp; + if (test_bit(RT_FLAG_RS_GROW, &rs->runtime_flags)) + mddev->resync_max_sectors = mddev->dev_sectors; } /* Check for any reshape request unless new raid set */ @@ -4013,7 +4052,7 @@ static void raid_resume(struct dm_target *ti) static struct target_type raid_target = { .name = "raid", - .version = {1, 14, 0}, + .version = {1, 15, 0}, .module = THIS_MODULE, .ctr = raid_ctr, .dtr = raid_dtr, -- cgit v1.2.3 From f9f3ee9130eb588c75e4a145837d4e6214947c40 Mon Sep 17 00:00:00 2001 From: Heinz Mauelshagen Date: Tue, 1 Oct 2019 17:47:54 +0200 Subject: dm raid: simplify rs_setup_recovery call chain rs_setup_recovery() sets the starting recovery offset. 
Drop the superfluous rs_setup_recovery() wrapper and rename __rs_setup_recovery() to rs_setup_recovery(). Signed-off-by: Heinz Mauelshagen Signed-off-by: Mike Snitzer --- drivers/md/dm-raid.c | 27 ++++++--------------------- 1 file changed, 6 insertions(+), 21 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 5c84215d9b62..3b99ef79dbe3 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -1670,7 +1670,7 @@ bad: } /* Setup recovery on @rs */ -static void __rs_setup_recovery(struct raid_set *rs, sector_t dev_sectors) +static void rs_setup_recovery(struct raid_set *rs, sector_t dev_sectors) { /* raid0 does not recover */ if (rs_is_raid0(rs)) @@ -1691,22 +1691,6 @@ static void __rs_setup_recovery(struct raid_set *rs, sector_t dev_sectors) ? MaxSector : dev_sectors; } -/* Setup recovery on @rs based on raid type, device size and 'nosync' flag */ -static void rs_setup_recovery(struct raid_set *rs, sector_t dev_sectors) -{ - if (!dev_sectors) - /* New raid set or 'sync' flag provided */ - __rs_setup_recovery(rs, 0); - else if (dev_sectors == MaxSector) - /* Prevent recovery */ - __rs_setup_recovery(rs, MaxSector); - else if (__rdev_sectors(rs) < dev_sectors) - /* Grown raid set */ - __rs_setup_recovery(rs, __rdev_sectors(rs)); - else - __rs_setup_recovery(rs, MaxSector); -} - static void do_table_event(struct work_struct *ws) { struct raid_set *rs = container_of(ws, struct raid_set, md.event_work); @@ -2474,7 +2458,7 @@ static int super_validate(struct raid_set *rs, struct md_rdev *rdev) return -EINVAL; } - /* Enable bitmap creation for RAID levels != 0 */ + /* Enable bitmap creation on @rs unless no metadevs or raid0 or journaled raid4/5/6 set. */ mddev->bitmap_info.offset = (rt_is_raid0(rs->raid_type) || rs->journal_dev.dev) ? 0 : to_sector(4096); mddev->bitmap_info.default_offset = mddev->bitmap_info.offset; @@ -3173,7 +3157,7 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv) rs_setup_recovery(rs, MaxSector); rs_set_new(rs); } else if (rs_reshape_requested(rs)) { - /* Only on size extensions, not on reshapes. */ + /* Only request grow on raid set size extensions, not on reshapes. */ clear_bit(RT_FLAG_RS_GROW, &rs->runtime_flags); /* @@ -3211,10 +3195,11 @@ size_check: /* May not set recovery when a device rebuild is requested */ if (test_bit(__CTR_FLAG_REBUILD, &rs->ctr_flags)) { clear_bit(RT_FLAG_RS_GROW, &rs->runtime_flags); set_bit(RT_FLAG_UPDATE_SBS, &rs->runtime_flags); + rs_setup_recovery(rs, MaxSector); } else if (test_bit(RT_FLAG_RS_GROW, &rs->runtime_flags)) { /* - * Set raid set to current size, i.e. non-grown size - * as of superblocks to grow to new size in preresume. + * Set raid set to current size, i.e. size as of + * superblocks to grow to larger size in preresume. */ r = rs_set_dev_and_array_sectors(rs, sb_array_sectors, false); if (r) -- cgit v1.2.3 From 53be73a5d75f477e52c9275ed7aa9307a8b73e5c Mon Sep 17 00:00:00 2001 From: Heinz Mauelshagen Date: Tue, 1 Oct 2019 17:47:55 +0200 Subject: dm raid: streamline rs_get_progress() and its raid_status() caller side Pass already deciphered state into rs_get_progress, simplify the recovery offset definition, and combine the two st_resync and st_reshape conditionals into one, as is already the case with st_check and st_repair.
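Condensed from the raid_status() hunks in the diff that follows (all identifiers appear in the patch; surrounding code omitted), the caller now deciphers the sync state once and threads it through, instead of rs_get_progress() recomputing it internally:

	/* Sketch of the caller-side flow after the patch. */
	recovery = rs->md.recovery;
	state = decipher_sync_action(mddev, recovery);
	progress = rs_get_progress(rs, recovery, state, resync_max_sectors);
	/* ... */
	DMEMIT(" %s", sync_str(state));	/* the same state is reused for status */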
Signed-off-by: Heinz Mauelshagen Signed-off-by: Mike Snitzer --- drivers/md/dm-raid.c | 47 ++++++++++++++++++++--------------------------- 1 file changed, 20 insertions(+), 27 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 3b99ef79dbe3..13fa90546a0f 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -3413,10 +3413,9 @@ static const char *__raid_dev_status(struct raid_set *rs, struct md_rdev *rdev) /* Helper to return resync/reshape progress for @rs and runtime flags for raid set in sync / resynching */ static sector_t rs_get_progress(struct raid_set *rs, unsigned long recovery, - sector_t resync_max_sectors) + enum sync_state state, sector_t resync_max_sectors) { sector_t r; - enum sync_state state; struct mddev *mddev = &rs->md; clear_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags); @@ -3427,8 +3426,6 @@ static sector_t rs_get_progress(struct raid_set *rs, unsigned long recovery, set_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags); } else { - state = decipher_sync_action(mddev, recovery); - if (state == st_idle && !test_bit(MD_RECOVERY_INTR, &recovery)) r = mddev->recovery_cp; else @@ -3446,18 +3443,14 @@ static sector_t rs_get_progress(struct raid_set *rs, unsigned long recovery, /* * In case we are recovering, the array is not in sync * and health chars should show the recovering legs. + * + * Already retrieved recovery offset from curr_resync_completed above. */ ; - else if (state == st_resync) - /* - * If "resync" is occurring, the raid set - * is or may be out of sync hence the health - * characters shall be 'a'. - */ - set_bit(RT_FLAG_RS_RESYNCING, &rs->runtime_flags); - else if (state == st_reshape) + + else if (state == st_resync || state == st_reshape) /* - * If "reshape" is occurring, the raid set + * If "resync/reshape" is occurring, the raid set * is or may be out of sync hence the health * characters shall be 'a'. */ @@ -3471,22 +3464,22 @@ static sector_t rs_get_progress(struct raid_set *rs, unsigned long recovery, */ set_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags); - else { - struct md_rdev *rdev; - + else if (test_bit(MD_RECOVERY_NEEDED, &recovery)) /* * We are idle and recovery is needed, prevent 'A' chars race * caused by components still set to in-sync by constructor. */ - if (test_bit(MD_RECOVERY_NEEDED, &recovery)) - set_bit(RT_FLAG_RS_RESYNCING, &rs->runtime_flags); + set_bit(RT_FLAG_RS_RESYNCING, &rs->runtime_flags); + else { /* - * The raid set may be doing an initial sync, or it may - * be rebuilding individual components. If all the - * devices are In_sync, then it is the raid set that is - * being initialized. + * We are idle and the raid set may be doing an initial + * sync, or it may be rebuilding individual components. + * If all the devices are In_sync, then it is the raid set + * that is being initialized. 
*/ + struct md_rdev *rdev; + set_bit(RT_FLAG_RS_IN_SYNC, &rs->runtime_flags); rdev_for_each(rdev, mddev) if (!test_bit(Journal, &rdev->flags) && @@ -3519,7 +3512,7 @@ static void raid_status(struct dm_target *ti, status_type_t type, unsigned int rebuild_disks; unsigned int write_mostly_params = 0; sector_t progress, resync_max_sectors, resync_mismatches; - const char *sync_action; + enum sync_state state; struct raid_type *rt; switch (type) { @@ -3533,14 +3526,14 @@ static void raid_status(struct dm_target *ti, status_type_t type, /* Access most recent mddev properties for status output */ smp_rmb(); - recovery = rs->md.recovery; /* Get sensible max sectors even if raid set not yet started */ resync_max_sectors = test_bit(RT_FLAG_RS_PRERESUMED, &rs->runtime_flags) ? mddev->resync_max_sectors : mddev->dev_sectors; - progress = rs_get_progress(rs, recovery, resync_max_sectors); + recovery = rs->md.recovery; + state = decipher_sync_action(mddev, recovery); + progress = rs_get_progress(rs, recovery, state, resync_max_sectors); resync_mismatches = (mddev->last_sync_action && !strcasecmp(mddev->last_sync_action, "check")) ? atomic64_read(&mddev->resync_mismatches) : 0; - sync_action = sync_str(decipher_sync_action(&rs->md, recovery)); /* HM FIXME: do we want another state char for raid0? It shows 'D'/'A'/'-' now */ for (i = 0; i < rs->raid_disks; i++) @@ -3568,7 +3561,7 @@ static void raid_status(struct dm_target *ti, status_type_t type, * See Documentation/admin-guide/device-mapper/dm-raid.rst for * information on each of these states. */ - DMEMIT(" %s", sync_action); + DMEMIT(" %s", sync_str(state)); /* * v1.5.0+: -- cgit v1.2.3 From 8adeac3be03d400f9c2391d52f85cd27bd188800 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Wed, 2 Oct 2019 14:03:41 -0500 Subject: dm stripe: use struct_size() in kmalloc() One of the more common cases of allocation size calculations is finding the size of a structure that has a zero-sized array at the end, along with memory for some number of elements for that array. For example: struct stripe_c { ... struct stripe stripe[0]; }; In this case alloc_context() and dm_array_too_big() are removed and replaced by the direct use of the struct_size() helper in kmalloc(). Notice that open-coded form is prone to type mistakes. This code was detected with the help of Coccinelle. Signed-off-by: Gustavo A. R. 
Silva Signed-off-by: Mike Snitzer --- drivers/md/dm-stripe.c | 15 +-------------- include/linux/device-mapper.h | 3 --- 2 files changed, 1 insertion(+), 17 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index 8547d7594338..63bbcc20f49a 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c @@ -55,19 +55,6 @@ static void trigger_event(struct work_struct *work) dm_table_event(sc->ti->table); } -static inline struct stripe_c *alloc_context(unsigned int stripes) -{ - size_t len; - - if (dm_array_too_big(sizeof(struct stripe_c), sizeof(struct stripe), - stripes)) - return NULL; - - len = sizeof(struct stripe_c) + (sizeof(struct stripe) * stripes); - - return kmalloc(len, GFP_KERNEL); -} - /* * Parse a single pair */ @@ -142,7 +129,7 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv) return -EINVAL; } - sc = alloc_context(stripes); + sc = kmalloc(struct_size(sc, stripe, stripes), GFP_KERNEL); if (!sc) { ti->error = "Memory allocation for striped context " "failed"; diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 399ad8632356..2e13826898b2 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -594,9 +594,6 @@ void *dm_vcalloc(unsigned long nmemb, unsigned long elem_size); */ #define dm_round_up(n, sz) (dm_div_up((n), (sz)) * (sz)) -#define dm_array_too_big(fixed, obj, num) \ - ((num) > (UINT_MAX - (fixed)) / (obj)) - /* * Sector offset taken relative to the start of the target instead of * relative to the start of the device. -- cgit v1.2.3 From 8dd85873a0bd8f162e95bc9402eea0050dbdac01 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Wed, 2 Oct 2019 07:07:57 -0400 Subject: dm writecache: fix uninitialized variable warning This fixes coverity warning CID 1454301. Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-writecache.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c index d06b8aa41e26..ea7b8be0b833 100644 --- a/drivers/md/dm-writecache.c +++ b/drivers/md/dm-writecache.c @@ -1561,7 +1561,7 @@ static void writecache_writeback(struct work_struct *work) { struct dm_writecache *wc = container_of(work, struct dm_writecache, writeback_work); struct blk_plug plug; - struct wc_entry *f, *g, *e = NULL; + struct wc_entry *f, *uninitialized_var(g), *e = NULL; struct rb_node *node, *next_node; struct list_head skipped; struct writeback_list wbl; -- cgit v1.2.3 From c1005322ff02110a4df7f0033368ea015062b583 Mon Sep 17 00:00:00 2001 From: Maged Mokhtar Date: Wed, 23 Oct 2019 22:41:17 +0200 Subject: dm writecache: handle REQ_FUA Call writecache_flush() on REQ_FUA in writecache_map(). 
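REQ_FUA (forced unit access) requires the written data to be on stable media by the time the bio completes, so a target that batches metadata commits must flush immediately instead of waiting for its autocommit threshold or timer. A condensed sketch of the resulting logic (identifiers as in the hunk below):

	/* A FUA write, like a full autocommit batch, forces an immediate flush. */
	if (unlikely(bio->bi_opf & REQ_FUA ||
		     wc->uncommitted_blocks >= wc->autocommit_blocks))
		writecache_flush(wc);			/* commit now */
	else
		writecache_schedule_autocommit(wc);	/* commit on timer */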
Cc: stable@vger.kernel.org # 4.18+ Signed-off-by: Maged Mokhtar Acked-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-writecache.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c index ea7b8be0b833..7d727a72aa13 100644 --- a/drivers/md/dm-writecache.c +++ b/drivers/md/dm-writecache.c @@ -1218,7 +1218,8 @@ bio_copy: } } while (bio->bi_iter.bi_size); - if (unlikely(wc->uncommitted_blocks >= wc->autocommit_blocks)) + if (unlikely(bio->bi_opf & REQ_FUA || + wc->uncommitted_blocks >= wc->autocommit_blocks)) writecache_flush(wc); else writecache_schedule_autocommit(wc); -- cgit v1.2.3 From 6ca43ed8376a51afec790dd484a51804ade4352a Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Fri, 4 Oct 2019 10:17:37 -0400 Subject: dm clone: replace spin_lock_irqsave with spin_lock_irq If we are in a place where it is known that interrupts are enabled, functions spin_lock_irq/spin_unlock_irq should be used instead of spin_lock_irqsave/spin_unlock_irqrestore. spin_lock_irq and spin_unlock_irq are faster because they don't need to push and pop the flags register. Signed-off-by: Mikulas Patocka Signed-off-by: Nikos Tsironis Signed-off-by: Mike Snitzer --- drivers/md/dm-clone-metadata.c | 29 ++++++++++++----------------- drivers/md/dm-clone-metadata.h | 4 +++- drivers/md/dm-clone-target.c | 28 ++++++++++++---------------- 3 files changed, 27 insertions(+), 34 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-clone-metadata.c b/drivers/md/dm-clone-metadata.c index 6bc8c1d1c351..08c552e5e41b 100644 --- a/drivers/md/dm-clone-metadata.c +++ b/drivers/md/dm-clone-metadata.c @@ -712,7 +712,7 @@ static int __metadata_commit(struct dm_clone_metadata *cmd) static int __flush_dmap(struct dm_clone_metadata *cmd, struct dirty_map *dmap) { int r; - unsigned long word, flags; + unsigned long word; word = 0; do { @@ -736,9 +736,9 @@ static int __flush_dmap(struct dm_clone_metadata *cmd, struct dirty_map *dmap) return r; /* Update the changed flag */ - spin_lock_irqsave(&cmd->bitmap_lock, flags); + spin_lock_irq(&cmd->bitmap_lock); dmap->changed = 0; - spin_unlock_irqrestore(&cmd->bitmap_lock, flags); + spin_unlock_irq(&cmd->bitmap_lock); return 0; } @@ -746,7 +746,6 @@ static int __flush_dmap(struct dm_clone_metadata *cmd, struct dirty_map *dmap) int dm_clone_metadata_commit(struct dm_clone_metadata *cmd) { int r = -EPERM; - unsigned long flags; struct dirty_map *dmap, *next_dmap; down_write(&cmd->lock); @@ -770,9 +769,9 @@ int dm_clone_metadata_commit(struct dm_clone_metadata *cmd) } /* Swap dirty bitmaps */ - spin_lock_irqsave(&cmd->bitmap_lock, flags); + spin_lock_irq(&cmd->bitmap_lock); cmd->current_dmap = next_dmap; - spin_unlock_irqrestore(&cmd->bitmap_lock, flags); + spin_unlock_irq(&cmd->bitmap_lock); /* * No one is accessing the old dirty bitmap anymore, so we can flush @@ -817,9 +816,9 @@ int dm_clone_cond_set_range(struct dm_clone_metadata *cmd, unsigned long start, { int r = 0; struct dirty_map *dmap; - unsigned long word, region_nr, flags; + unsigned long word, region_nr; - spin_lock_irqsave(&cmd->bitmap_lock, flags); + spin_lock_irq(&cmd->bitmap_lock); if (cmd->read_only) { r = -EPERM; @@ -836,7 +835,7 @@ int dm_clone_cond_set_range(struct dm_clone_metadata *cmd, unsigned long start, } } out: - spin_unlock_irqrestore(&cmd->bitmap_lock, flags); + spin_unlock_irq(&cmd->bitmap_lock); return r; } @@ -903,13 +902,11 @@ out: void dm_clone_metadata_set_read_only(struct dm_clone_metadata 
*cmd) { - unsigned long flags; - down_write(&cmd->lock); - spin_lock_irqsave(&cmd->bitmap_lock, flags); + spin_lock_irq(&cmd->bitmap_lock); cmd->read_only = 1; - spin_unlock_irqrestore(&cmd->bitmap_lock, flags); + spin_unlock_irq(&cmd->bitmap_lock); if (!cmd->fail_io) dm_bm_set_read_only(cmd->bm); @@ -919,13 +916,11 @@ void dm_clone_metadata_set_read_only(struct dm_clone_metadata *cmd) void dm_clone_metadata_set_read_write(struct dm_clone_metadata *cmd) { - unsigned long flags; - down_write(&cmd->lock); - spin_lock_irqsave(&cmd->bitmap_lock, flags); + spin_lock_irq(&cmd->bitmap_lock); cmd->read_only = 0; - spin_unlock_irqrestore(&cmd->bitmap_lock, flags); + spin_unlock_irq(&cmd->bitmap_lock); if (!cmd->fail_io) dm_bm_set_read_write(cmd->bm); diff --git a/drivers/md/dm-clone-metadata.h b/drivers/md/dm-clone-metadata.h index 434bff08508b..3fe50a781c11 100644 --- a/drivers/md/dm-clone-metadata.h +++ b/drivers/md/dm-clone-metadata.h @@ -44,7 +44,9 @@ int dm_clone_set_region_hydrated(struct dm_clone_metadata *cmd, unsigned long re * @start: Starting region number * @nr_regions: Number of regions in the range * - * This function doesn't block, so it's safe to call it from interrupt context. + * This function doesn't block, but since it uses spin_lock_irq()/spin_unlock_irq() + * it's NOT safe to call it from any context where interrupts are disabled, e.g., + * from interrupt context. */ int dm_clone_cond_set_range(struct dm_clone_metadata *cmd, unsigned long start, unsigned long nr_regions); diff --git a/drivers/md/dm-clone-target.c b/drivers/md/dm-clone-target.c index 4ca8f1977222..f7d1070a4d40 100644 --- a/drivers/md/dm-clone-target.c +++ b/drivers/md/dm-clone-target.c @@ -332,8 +332,6 @@ static void submit_bios(struct bio_list *bios) */ static void issue_bio(struct clone *clone, struct bio *bio) { - unsigned long flags; - if (!bio_triggers_commit(clone, bio)) { generic_make_request(bio); return; @@ -352,9 +350,9 @@ static void issue_bio(struct clone *clone, struct bio *bio) * Batch together any bios that trigger commits and then issue a single * commit for them in process_deferred_flush_bios(). */ - spin_lock_irqsave(&clone->lock, flags); + spin_lock_irq(&clone->lock); bio_list_add(&clone->deferred_flush_bios, bio); - spin_unlock_irqrestore(&clone->lock, flags); + spin_unlock_irq(&clone->lock); wake_worker(clone); } @@ -469,7 +467,7 @@ static void complete_discard_bio(struct clone *clone, struct bio *bio, bool succ static void process_discard_bio(struct clone *clone, struct bio *bio) { - unsigned long rs, re, flags; + unsigned long rs, re; bio_region_range(clone, bio, &rs, &re); BUG_ON(re > clone->nr_regions); @@ -501,9 +499,9 @@ static void process_discard_bio(struct clone *clone, struct bio *bio) /* * Defer discard processing. 
*/ - spin_lock_irqsave(&clone->lock, flags); + spin_lock_irq(&clone->lock); bio_list_add(&clone->deferred_discard_bios, bio); - spin_unlock_irqrestore(&clone->lock, flags); + spin_unlock_irq(&clone->lock); wake_worker(clone); } @@ -1140,13 +1138,13 @@ static void process_deferred_discards(struct clone *clone) int r = -EPERM; struct bio *bio; struct blk_plug plug; - unsigned long rs, re, flags; + unsigned long rs, re; struct bio_list discards = BIO_EMPTY_LIST; - spin_lock_irqsave(&clone->lock, flags); + spin_lock_irq(&clone->lock); bio_list_merge(&discards, &clone->deferred_discard_bios); bio_list_init(&clone->deferred_discard_bios); - spin_unlock_irqrestore(&clone->lock, flags); + spin_unlock_irq(&clone->lock); if (bio_list_empty(&discards)) return; @@ -1176,13 +1174,12 @@ out: static void process_deferred_bios(struct clone *clone) { - unsigned long flags; struct bio_list bios = BIO_EMPTY_LIST; - spin_lock_irqsave(&clone->lock, flags); + spin_lock_irq(&clone->lock); bio_list_merge(&bios, &clone->deferred_bios); bio_list_init(&clone->deferred_bios); - spin_unlock_irqrestore(&clone->lock, flags); + spin_unlock_irq(&clone->lock); if (bio_list_empty(&bios)) return; @@ -1193,7 +1190,6 @@ static void process_deferred_bios(struct clone *clone) static void process_deferred_flush_bios(struct clone *clone) { struct bio *bio; - unsigned long flags; struct bio_list bios = BIO_EMPTY_LIST; struct bio_list bio_completions = BIO_EMPTY_LIST; @@ -1201,13 +1197,13 @@ static void process_deferred_flush_bios(struct clone *clone) * If there are any deferred flush bios, we must commit the metadata * before issuing them or signaling their completion. */ - spin_lock_irqsave(&clone->lock, flags); + spin_lock_irq(&clone->lock); bio_list_merge(&bios, &clone->deferred_flush_bios); bio_list_init(&clone->deferred_flush_bios); bio_list_merge(&bio_completions, &clone->deferred_flush_completions); bio_list_init(&clone->deferred_flush_completions); - spin_unlock_irqrestore(&clone->lock, flags); + spin_unlock_irq(&clone->lock); if (bio_list_empty(&bios) && bio_list_empty(&bio_completions) && !(dm_clone_changed_this_transaction(clone->cmd) && need_commit_due_to_time(clone))) -- cgit v1.2.3 From 52c67d416b26be1f91fbcbd57cfcf6ffca76ff29 Mon Sep 17 00:00:00 2001 From: Nikos Tsironis Date: Mon, 7 Oct 2019 20:54:41 +0300 Subject: dm clone: add bucket_lock_irq/bucket_unlock_irq helpers Introduce bucket_lock_irq() and bucket_unlock_irq() helpers and use them in places where it is known that interrupts are enabled. 
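The distinction this series of conversions relies on can be shown in a small self-contained sketch (generic example code, not taken from the driver):

	#include <linux/spinlock.h>

	static DEFINE_SPINLOCK(example_lock);

	/*
	 * Caller context unknown (interrupts may already be disabled,
	 * e.g. when called from an interrupt handler): the current
	 * interrupt state must be saved in 'flags' and restored.
	 */
	static void locked_from_any_context(void)
	{
		unsigned long flags;

		spin_lock_irqsave(&example_lock, flags);
		/* ... critical section ... */
		spin_unlock_irqrestore(&example_lock, flags);
	}

	/*
	 * Caller known to run with interrupts enabled (process context):
	 * interrupts are simply disabled and re-enabled, avoiding the
	 * push/pop of the flags register.
	 */
	static void locked_from_process_context(void)
	{
		spin_lock_irq(&example_lock);
		/* ... critical section ... */
		spin_unlock_irq(&example_lock);
	}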
Signed-off-by: Nikos Tsironis Signed-off-by: Mike Snitzer --- drivers/md/dm-clone-target.c | 34 +++++++++++++++++++--------------- 1 file changed, 19 insertions(+), 15 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-clone-target.c b/drivers/md/dm-clone-target.c index f7d1070a4d40..b3d89072d21c 100644 --- a/drivers/md/dm-clone-target.c +++ b/drivers/md/dm-clone-target.c @@ -552,6 +552,12 @@ struct hash_table_bucket { #define bucket_unlock_irqrestore(bucket, flags) \ spin_unlock_irqrestore(&(bucket)->lock, flags) +#define bucket_lock_irq(bucket) \ + spin_lock_irq(&(bucket)->lock) + +#define bucket_unlock_irq(bucket) \ + spin_unlock_irq(&(bucket)->lock) + static int hash_table_init(struct clone *clone) { unsigned int i, sz; @@ -849,7 +855,6 @@ static void hydration_overwrite(struct dm_clone_region_hydration *hd, struct bio */ static void hydrate_bio_region(struct clone *clone, struct bio *bio) { - unsigned long flags; unsigned long region_nr; struct hash_table_bucket *bucket; struct dm_clone_region_hydration *hd, *hd2; @@ -857,19 +862,19 @@ static void hydrate_bio_region(struct clone *clone, struct bio *bio) region_nr = bio_to_region(clone, bio); bucket = get_hash_table_bucket(clone, region_nr); - bucket_lock_irqsave(bucket, flags); + bucket_lock_irq(bucket); hd = __hash_find(bucket, region_nr); if (hd) { /* Someone else is hydrating the region */ bio_list_add(&hd->deferred_bios, bio); - bucket_unlock_irqrestore(bucket, flags); + bucket_unlock_irq(bucket); return; } if (dm_clone_is_region_hydrated(clone->cmd, region_nr)) { /* The region has been hydrated */ - bucket_unlock_irqrestore(bucket, flags); + bucket_unlock_irq(bucket); issue_bio(clone, bio); return; } @@ -878,16 +883,16 @@ static void hydrate_bio_region(struct clone *clone, struct bio *bio) * We must allocate a hydration descriptor and start the hydration of * the corresponding region. */ - bucket_unlock_irqrestore(bucket, flags); + bucket_unlock_irq(bucket); hd = alloc_hydration(clone); hydration_init(hd, region_nr); - bucket_lock_irqsave(bucket, flags); + bucket_lock_irq(bucket); /* Check if the region has been hydrated in the meantime. */ if (dm_clone_is_region_hydrated(clone->cmd, region_nr)) { - bucket_unlock_irqrestore(bucket, flags); + bucket_unlock_irq(bucket); free_hydration(hd); issue_bio(clone, bio); return; @@ -897,7 +902,7 @@ static void hydrate_bio_region(struct clone *clone, struct bio *bio) if (hd2 != hd) { /* Someone else started the region's hydration. */ bio_list_add(&hd2->deferred_bios, bio); - bucket_unlock_irqrestore(bucket, flags); + bucket_unlock_irq(bucket); free_hydration(hd); return; } @@ -909,7 +914,7 @@ static void hydrate_bio_region(struct clone *clone, struct bio *bio) */ if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY)) { hlist_del(&hd->h); - bucket_unlock_irqrestore(bucket, flags); + bucket_unlock_irq(bucket); free_hydration(hd); bio_io_error(bio); return; @@ -923,11 +928,11 @@ static void hydrate_bio_region(struct clone *clone, struct bio *bio) * to the destination device. 
*/ if (is_overwrite_bio(clone, bio)) { - bucket_unlock_irqrestore(bucket, flags); + bucket_unlock_irq(bucket); hydration_overwrite(hd, bio); } else { bio_list_add(&hd->deferred_bios, bio); - bucket_unlock_irqrestore(bucket, flags); + bucket_unlock_irq(bucket); hydration_copy(hd, 1); } } @@ -994,7 +999,6 @@ static unsigned long __start_next_hydration(struct clone *clone, unsigned long offset, struct batch_info *batch) { - unsigned long flags; struct hash_table_bucket *bucket; struct dm_clone_region_hydration *hd; unsigned long nr_regions = clone->nr_regions; @@ -1008,13 +1012,13 @@ static unsigned long __start_next_hydration(struct clone *clone, break; bucket = get_hash_table_bucket(clone, offset); - bucket_lock_irqsave(bucket, flags); + bucket_lock_irq(bucket); if (!dm_clone_is_region_hydrated(clone->cmd, offset) && !__hash_find(bucket, offset)) { hydration_init(hd, offset); __insert_region_hydration(bucket, hd); - bucket_unlock_irqrestore(bucket, flags); + bucket_unlock_irq(bucket); /* Batch hydration */ __batch_hydration(batch, hd); @@ -1022,7 +1026,7 @@ static unsigned long __start_next_hydration(struct clone *clone, return (offset + 1); } - bucket_unlock_irqrestore(bucket, flags); + bucket_unlock_irq(bucket); } while (++offset < nr_regions); -- cgit v1.2.3 From 8e0c9dacc39eccd27e70cf3243896b43f5398c17 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Tue, 15 Oct 2019 08:16:29 -0400 Subject: dm thin: replace spin_lock_irqsave with spin_lock_irq If we are in a place where it is known that interrupts are enabled, functions spin_lock_irq/spin_unlock_irq should be used instead of spin_lock_irqsave/spin_unlock_irqrestore. spin_lock_irq and spin_unlock_irq are faster because they don't need to push and pop the flags register. Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-thin.c | 113 +++++++++++++++++++++------------------------------ 1 file changed, 46 insertions(+), 67 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index fcd887703f95..85ada5ad2121 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -609,13 +609,12 @@ static void error_thin_bio_list(struct thin_c *tc, struct bio_list *master, blk_status_t error) { struct bio_list bios; - unsigned long flags; bio_list_init(&bios); - spin_lock_irqsave(&tc->lock, flags); + spin_lock_irq(&tc->lock); __merge_bio_list(&bios, master); - spin_unlock_irqrestore(&tc->lock, flags); + spin_unlock_irq(&tc->lock); error_bio_list(&bios, error); } @@ -623,15 +622,14 @@ static void error_thin_bio_list(struct thin_c *tc, struct bio_list *master, static void requeue_deferred_cells(struct thin_c *tc) { struct pool *pool = tc->pool; - unsigned long flags; struct list_head cells; struct dm_bio_prison_cell *cell, *tmp; INIT_LIST_HEAD(&cells); - spin_lock_irqsave(&tc->lock, flags); + spin_lock_irq(&tc->lock); list_splice_init(&tc->deferred_cells, &cells); - spin_unlock_irqrestore(&tc->lock, flags); + spin_unlock_irq(&tc->lock); list_for_each_entry_safe(cell, tmp, &cells, user_list) cell_requeue(pool, cell); @@ -640,14 +638,13 @@ static void requeue_deferred_cells(struct thin_c *tc) static void requeue_io(struct thin_c *tc) { struct bio_list bios; - unsigned long flags; bio_list_init(&bios); - spin_lock_irqsave(&tc->lock, flags); + spin_lock_irq(&tc->lock); __merge_bio_list(&bios, &tc->deferred_bio_list); __merge_bio_list(&bios, &tc->retry_on_resume_list); - spin_unlock_irqrestore(&tc->lock, flags); + spin_unlock_irq(&tc->lock); error_bio_list(&bios, BLK_STS_DM_REQUEUE); 
requeue_deferred_cells(tc); @@ -756,7 +753,6 @@ static void inc_all_io_entry(struct pool *pool, struct bio *bio) static void issue(struct thin_c *tc, struct bio *bio) { struct pool *pool = tc->pool; - unsigned long flags; if (!bio_triggers_commit(tc, bio)) { generic_make_request(bio); @@ -777,9 +773,9 @@ static void issue(struct thin_c *tc, struct bio *bio) * Batch together any bios that trigger commits and then issue a * single commit for them in process_deferred_bios(). */ - spin_lock_irqsave(&pool->lock, flags); + spin_lock_irq(&pool->lock); bio_list_add(&pool->deferred_flush_bios, bio); - spin_unlock_irqrestore(&pool->lock, flags); + spin_unlock_irq(&pool->lock); } static void remap_to_origin_and_issue(struct thin_c *tc, struct bio *bio) @@ -960,7 +956,6 @@ static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m) static void complete_overwrite_bio(struct thin_c *tc, struct bio *bio) { struct pool *pool = tc->pool; - unsigned long flags; /* * If the bio has the REQ_FUA flag set we must commit the metadata @@ -985,9 +980,9 @@ static void complete_overwrite_bio(struct thin_c *tc, struct bio *bio) * Batch together any bios that trigger commits and then issue a * single commit for them in process_deferred_bios(). */ - spin_lock_irqsave(&pool->lock, flags); + spin_lock_irq(&pool->lock); bio_list_add(&pool->deferred_flush_completions, bio); - spin_unlock_irqrestore(&pool->lock, flags); + spin_unlock_irq(&pool->lock); } static void process_prepared_mapping(struct dm_thin_new_mapping *m) @@ -1226,14 +1221,13 @@ static void process_prepared_discard_passdown_pt2(struct dm_thin_new_mapping *m) static void process_prepared(struct pool *pool, struct list_head *head, process_mapping_fn *fn) { - unsigned long flags; struct list_head maps; struct dm_thin_new_mapping *m, *tmp; INIT_LIST_HEAD(&maps); - spin_lock_irqsave(&pool->lock, flags); + spin_lock_irq(&pool->lock); list_splice_init(head, &maps); - spin_unlock_irqrestore(&pool->lock, flags); + spin_unlock_irq(&pool->lock); list_for_each_entry_safe(m, tmp, &maps, list) (*fn)(m); @@ -1510,14 +1504,12 @@ static int commit(struct pool *pool) static void check_low_water_mark(struct pool *pool, dm_block_t free_blocks) { - unsigned long flags; - if (free_blocks <= pool->low_water_blocks && !pool->low_water_triggered) { DMWARN("%s: reached low water mark for data device: sending event.", dm_device_name(pool->pool_md)); - spin_lock_irqsave(&pool->lock, flags); + spin_lock_irq(&pool->lock); pool->low_water_triggered = true; - spin_unlock_irqrestore(&pool->lock, flags); + spin_unlock_irq(&pool->lock); dm_table_event(pool->ti->table); } } @@ -1593,11 +1585,10 @@ static void retry_on_resume(struct bio *bio) { struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); struct thin_c *tc = h->tc; - unsigned long flags; - spin_lock_irqsave(&tc->lock, flags); + spin_lock_irq(&tc->lock); bio_list_add(&tc->retry_on_resume_list, bio); - spin_unlock_irqrestore(&tc->lock, flags); + spin_unlock_irq(&tc->lock); } static blk_status_t should_error_unserviceable_bio(struct pool *pool) @@ -2170,7 +2161,6 @@ static void __sort_thin_deferred_bios(struct thin_c *tc) static void process_thin_deferred_bios(struct thin_c *tc) { struct pool *pool = tc->pool; - unsigned long flags; struct bio *bio; struct bio_list bios; struct blk_plug plug; @@ -2184,10 +2174,10 @@ static void process_thin_deferred_bios(struct thin_c *tc) bio_list_init(&bios); - spin_lock_irqsave(&tc->lock, flags); + spin_lock_irq(&tc->lock); if 
(bio_list_empty(&tc->deferred_bio_list)) { - spin_unlock_irqrestore(&tc->lock, flags); + spin_unlock_irq(&tc->lock); return; } @@ -2196,7 +2186,7 @@ static void process_thin_deferred_bios(struct thin_c *tc) bio_list_merge(&bios, &tc->deferred_bio_list); bio_list_init(&tc->deferred_bio_list); - spin_unlock_irqrestore(&tc->lock, flags); + spin_unlock_irq(&tc->lock); blk_start_plug(&plug); while ((bio = bio_list_pop(&bios))) { @@ -2206,10 +2196,10 @@ static void process_thin_deferred_bios(struct thin_c *tc) * prepared mappings to process. */ if (ensure_next_mapping(pool)) { - spin_lock_irqsave(&tc->lock, flags); + spin_lock_irq(&tc->lock); bio_list_add(&tc->deferred_bio_list, bio); bio_list_merge(&tc->deferred_bio_list, &bios); - spin_unlock_irqrestore(&tc->lock, flags); + spin_unlock_irq(&tc->lock); break; } @@ -2264,16 +2254,15 @@ static unsigned sort_cells(struct pool *pool, struct list_head *cells) static void process_thin_deferred_cells(struct thin_c *tc) { struct pool *pool = tc->pool; - unsigned long flags; struct list_head cells; struct dm_bio_prison_cell *cell; unsigned i, j, count; INIT_LIST_HEAD(&cells); - spin_lock_irqsave(&tc->lock, flags); + spin_lock_irq(&tc->lock); list_splice_init(&tc->deferred_cells, &cells); - spin_unlock_irqrestore(&tc->lock, flags); + spin_unlock_irq(&tc->lock); if (list_empty(&cells)) return; @@ -2294,9 +2283,9 @@ static void process_thin_deferred_cells(struct thin_c *tc) for (j = i; j < count; j++) list_add(&pool->cell_sort_array[j]->user_list, &cells); - spin_lock_irqsave(&tc->lock, flags); + spin_lock_irq(&tc->lock); list_splice(&cells, &tc->deferred_cells); - spin_unlock_irqrestore(&tc->lock, flags); + spin_unlock_irq(&tc->lock); return; } @@ -2349,7 +2338,6 @@ static struct thin_c *get_next_thin(struct pool *pool, struct thin_c *tc) static void process_deferred_bios(struct pool *pool) { - unsigned long flags; struct bio *bio; struct bio_list bios, bio_completions; struct thin_c *tc; @@ -2368,13 +2356,13 @@ static void process_deferred_bios(struct pool *pool) bio_list_init(&bios); bio_list_init(&bio_completions); - spin_lock_irqsave(&pool->lock, flags); + spin_lock_irq(&pool->lock); bio_list_merge(&bios, &pool->deferred_flush_bios); bio_list_init(&pool->deferred_flush_bios); bio_list_merge(&bio_completions, &pool->deferred_flush_completions); bio_list_init(&pool->deferred_flush_completions); - spin_unlock_irqrestore(&pool->lock, flags); + spin_unlock_irq(&pool->lock); if (bio_list_empty(&bios) && bio_list_empty(&bio_completions) && !(dm_pool_changed_this_transaction(pool->pmd) && need_commit_due_to_time(pool))) @@ -2657,12 +2645,11 @@ static void metadata_operation_failed(struct pool *pool, const char *op, int r) */ static void thin_defer_bio(struct thin_c *tc, struct bio *bio) { - unsigned long flags; struct pool *pool = tc->pool; - spin_lock_irqsave(&tc->lock, flags); + spin_lock_irq(&tc->lock); bio_list_add(&tc->deferred_bio_list, bio); - spin_unlock_irqrestore(&tc->lock, flags); + spin_unlock_irq(&tc->lock); wake_worker(pool); } @@ -2678,13 +2665,12 @@ static void thin_defer_bio_with_throttle(struct thin_c *tc, struct bio *bio) static void thin_defer_cell(struct thin_c *tc, struct dm_bio_prison_cell *cell) { - unsigned long flags; struct pool *pool = tc->pool; throttle_lock(&pool->throttle); - spin_lock_irqsave(&tc->lock, flags); + spin_lock_irq(&tc->lock); list_add_tail(&cell->user_list, &tc->deferred_cells); - spin_unlock_irqrestore(&tc->lock, flags); + spin_unlock_irq(&tc->lock); throttle_unlock(&pool->throttle); wake_worker(pool); @@ -2810,15 
+2796,14 @@ static int pool_is_congested(struct dm_target_callbacks *cb, int bdi_bits) static void requeue_bios(struct pool *pool) { - unsigned long flags; struct thin_c *tc; rcu_read_lock(); list_for_each_entry_rcu(tc, &pool->active_thins, list) { - spin_lock_irqsave(&tc->lock, flags); + spin_lock_irq(&tc->lock); bio_list_merge(&tc->deferred_bio_list, &tc->retry_on_resume_list); bio_list_init(&tc->retry_on_resume_list); - spin_unlock_irqrestore(&tc->lock, flags); + spin_unlock_irq(&tc->lock); } rcu_read_unlock(); } @@ -3412,15 +3397,14 @@ static int pool_map(struct dm_target *ti, struct bio *bio) int r; struct pool_c *pt = ti->private; struct pool *pool = pt->pool; - unsigned long flags; /* * As this is a singleton target, ti->begin is always zero. */ - spin_lock_irqsave(&pool->lock, flags); + spin_lock_irq(&pool->lock); bio_set_dev(bio, pt->data_dev->bdev); r = DM_MAPIO_REMAPPED; - spin_unlock_irqrestore(&pool->lock, flags); + spin_unlock_irq(&pool->lock); return r; } @@ -3591,7 +3575,6 @@ static void pool_resume(struct dm_target *ti) { struct pool_c *pt = ti->private; struct pool *pool = pt->pool; - unsigned long flags; /* * Must requeue active_thins' bios and then resume @@ -3600,10 +3583,10 @@ static void pool_resume(struct dm_target *ti) requeue_bios(pool); pool_resume_active_thins(pool); - spin_lock_irqsave(&pool->lock, flags); + spin_lock_irq(&pool->lock); pool->low_water_triggered = false; pool->suspended = false; - spin_unlock_irqrestore(&pool->lock, flags); + spin_unlock_irq(&pool->lock); do_waker(&pool->waker.work); } @@ -3612,11 +3595,10 @@ static void pool_presuspend(struct dm_target *ti) { struct pool_c *pt = ti->private; struct pool *pool = pt->pool; - unsigned long flags; - spin_lock_irqsave(&pool->lock, flags); + spin_lock_irq(&pool->lock); pool->suspended = true; - spin_unlock_irqrestore(&pool->lock, flags); + spin_unlock_irq(&pool->lock); pool_suspend_active_thins(pool); } @@ -3625,13 +3607,12 @@ static void pool_presuspend_undo(struct dm_target *ti) { struct pool_c *pt = ti->private; struct pool *pool = pt->pool; - unsigned long flags; pool_resume_active_thins(pool); - spin_lock_irqsave(&pool->lock, flags); + spin_lock_irq(&pool->lock); pool->suspended = false; - spin_unlock_irqrestore(&pool->lock, flags); + spin_unlock_irq(&pool->lock); } static void pool_postsuspend(struct dm_target *ti) @@ -4110,11 +4091,10 @@ static void thin_put(struct thin_c *tc) static void thin_dtr(struct dm_target *ti) { struct thin_c *tc = ti->private; - unsigned long flags; - spin_lock_irqsave(&tc->pool->lock, flags); + spin_lock_irq(&tc->pool->lock); list_del_rcu(&tc->list); - spin_unlock_irqrestore(&tc->pool->lock, flags); + spin_unlock_irq(&tc->pool->lock); synchronize_rcu(); thin_put(tc); @@ -4150,7 +4130,6 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv) struct thin_c *tc; struct dm_dev *pool_dev, *origin_dev; struct mapped_device *pool_md; - unsigned long flags; mutex_lock(&dm_thin_pool_table.mutex); @@ -4244,9 +4223,9 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv) mutex_unlock(&dm_thin_pool_table.mutex); - spin_lock_irqsave(&tc->pool->lock, flags); + spin_lock_irq(&tc->pool->lock); if (tc->pool->suspended) { - spin_unlock_irqrestore(&tc->pool->lock, flags); + spin_unlock_irq(&tc->pool->lock); mutex_lock(&dm_thin_pool_table.mutex); /* reacquire for __pool_dec */ ti->error = "Unable to activate thin device while pool is suspended"; r = -EINVAL; @@ -4255,7 +4234,7 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv) 
refcount_set(&tc->refcount, 1); init_completion(&tc->can_destroy); list_add_tail_rcu(&tc->list, &tc->pool->active_thins); - spin_unlock_irqrestore(&tc->pool->lock, flags); + spin_unlock_irq(&tc->pool->lock); /* * This synchronize_rcu() call is needed here otherwise we risk a * wake_worker() call finding no bios to process (because the newly -- cgit v1.2.3 From 235bc8616060514ce57053e51d0a81f7fab96f01 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Tue, 15 Oct 2019 08:16:51 -0400 Subject: dm bio prison: replace spin_lock_irqsave with spin_lock_irq Replace spin_lock_irqsave/irqrestore with spin_lock_irq/spin_unlock_irq. Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-bio-prison-v1.c | 27 ++++++++++----------------- drivers/md/dm-bio-prison-v2.c | 26 ++++++++++---------------- 2 files changed, 20 insertions(+), 33 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-bio-prison-v1.c b/drivers/md/dm-bio-prison-v1.c index b5389890bbc3..1f8f98efd97a 100644 --- a/drivers/md/dm-bio-prison-v1.c +++ b/drivers/md/dm-bio-prison-v1.c @@ -150,11 +150,10 @@ static int bio_detain(struct dm_bio_prison *prison, struct dm_bio_prison_cell **cell_result) { int r; - unsigned long flags; - spin_lock_irqsave(&prison->lock, flags); + spin_lock_irq(&prison->lock); r = __bio_detain(prison, key, inmate, cell_prealloc, cell_result); - spin_unlock_irqrestore(&prison->lock, flags); + spin_unlock_irq(&prison->lock); return r; } @@ -198,11 +197,9 @@ void dm_cell_release(struct dm_bio_prison *prison, struct dm_bio_prison_cell *cell, struct bio_list *bios) { - unsigned long flags; - - spin_lock_irqsave(&prison->lock, flags); + spin_lock_irq(&prison->lock); __cell_release(prison, cell, bios); - spin_unlock_irqrestore(&prison->lock, flags); + spin_unlock_irq(&prison->lock); } EXPORT_SYMBOL_GPL(dm_cell_release); @@ -250,12 +247,10 @@ void dm_cell_visit_release(struct dm_bio_prison *prison, void *context, struct dm_bio_prison_cell *cell) { - unsigned long flags; - - spin_lock_irqsave(&prison->lock, flags); + spin_lock_irq(&prison->lock); visit_fn(context, cell); rb_erase(&cell->node, &prison->cells); - spin_unlock_irqrestore(&prison->lock, flags); + spin_unlock_irq(&prison->lock); } EXPORT_SYMBOL_GPL(dm_cell_visit_release); @@ -275,11 +270,10 @@ int dm_cell_promote_or_release(struct dm_bio_prison *prison, struct dm_bio_prison_cell *cell) { int r; - unsigned long flags; - spin_lock_irqsave(&prison->lock, flags); + spin_lock_irq(&prison->lock); r = __promote_or_release(prison, cell); - spin_unlock_irqrestore(&prison->lock, flags); + spin_unlock_irq(&prison->lock); return r; } @@ -379,10 +373,9 @@ EXPORT_SYMBOL_GPL(dm_deferred_entry_dec); int dm_deferred_set_add_work(struct dm_deferred_set *ds, struct list_head *work) { int r = 1; - unsigned long flags; unsigned next_entry; - spin_lock_irqsave(&ds->lock, flags); + spin_lock_irq(&ds->lock); if ((ds->sweeper == ds->current_entry) && !ds->entries[ds->current_entry].count) r = 0; @@ -392,7 +385,7 @@ int dm_deferred_set_add_work(struct dm_deferred_set *ds, struct list_head *work) if (!ds->entries[next_entry].count) ds->current_entry = next_entry; } - spin_unlock_irqrestore(&ds->lock, flags); + spin_unlock_irq(&ds->lock); return r; } diff --git a/drivers/md/dm-bio-prison-v2.c b/drivers/md/dm-bio-prison-v2.c index b092cdc8e1ae..8ee019eda32d 100644 --- a/drivers/md/dm-bio-prison-v2.c +++ b/drivers/md/dm-bio-prison-v2.c @@ -177,11 +177,10 @@ bool dm_cell_get_v2(struct dm_bio_prison_v2 *prison, struct dm_bio_prison_cell_v2 **cell_result) { int r; - 
unsigned long flags; - spin_lock_irqsave(&prison->lock, flags); + spin_lock_irq(&prison->lock); r = __get(prison, key, lock_level, inmate, cell_prealloc, cell_result); - spin_unlock_irqrestore(&prison->lock, flags); + spin_unlock_irq(&prison->lock); return r; } @@ -261,11 +260,10 @@ int dm_cell_lock_v2(struct dm_bio_prison_v2 *prison, struct dm_bio_prison_cell_v2 **cell_result) { int r; - unsigned long flags; - spin_lock_irqsave(&prison->lock, flags); + spin_lock_irq(&prison->lock); r = __lock(prison, key, lock_level, cell_prealloc, cell_result); - spin_unlock_irqrestore(&prison->lock, flags); + spin_unlock_irq(&prison->lock); return r; } @@ -285,11 +283,9 @@ void dm_cell_quiesce_v2(struct dm_bio_prison_v2 *prison, struct dm_bio_prison_cell_v2 *cell, struct work_struct *continuation) { - unsigned long flags; - - spin_lock_irqsave(&prison->lock, flags); + spin_lock_irq(&prison->lock); __quiesce(prison, cell, continuation); - spin_unlock_irqrestore(&prison->lock, flags); + spin_unlock_irq(&prison->lock); } EXPORT_SYMBOL_GPL(dm_cell_quiesce_v2); @@ -309,11 +305,10 @@ int dm_cell_lock_promote_v2(struct dm_bio_prison_v2 *prison, unsigned new_lock_level) { int r; - unsigned long flags; - spin_lock_irqsave(&prison->lock, flags); + spin_lock_irq(&prison->lock); r = __promote(prison, cell, new_lock_level); - spin_unlock_irqrestore(&prison->lock, flags); + spin_unlock_irq(&prison->lock); return r; } @@ -342,11 +337,10 @@ bool dm_cell_unlock_v2(struct dm_bio_prison_v2 *prison, struct bio_list *bios) { bool r; - unsigned long flags; - spin_lock_irqsave(&prison->lock, flags); + spin_lock_irq(&prison->lock); r = __unlock(prison, cell, bios); - spin_unlock_irqrestore(&prison->lock, flags); + spin_unlock_irq(&prison->lock); return r; } -- cgit v1.2.3 From 26b924b93c7bd68add372e24d25b86848c507921 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Wed, 23 Oct 2019 09:39:15 -0400 Subject: dm cache: replace spin_lock_irqsave with spin_lock_irq If we are in a place where it is known that interrupts are enabled, functions spin_lock_irq/spin_unlock_irq should be used instead of spin_lock_irqsave/spin_unlock_irqrestore. spin_lock_irq and spin_unlock_irq are faster because they don't need to push and pop the flags register. 
Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-cache-target.c | 77 ++++++++++++++++---------------------------- 1 file changed, 28 insertions(+), 49 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 8346e6d1816c..2d32821b3a5b 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -74,22 +74,19 @@ static bool __iot_idle_for(struct io_tracker *iot, unsigned long jifs) static bool iot_idle_for(struct io_tracker *iot, unsigned long jifs) { bool r; - unsigned long flags; - spin_lock_irqsave(&iot->lock, flags); + spin_lock_irq(&iot->lock); r = __iot_idle_for(iot, jifs); - spin_unlock_irqrestore(&iot->lock, flags); + spin_unlock_irq(&iot->lock); return r; } static void iot_io_begin(struct io_tracker *iot, sector_t len) { - unsigned long flags; - - spin_lock_irqsave(&iot->lock, flags); + spin_lock_irq(&iot->lock); iot->in_flight += len; - spin_unlock_irqrestore(&iot->lock, flags); + spin_unlock_irq(&iot->lock); } static void __iot_io_end(struct io_tracker *iot, sector_t len) @@ -172,7 +169,6 @@ static void __commit(struct work_struct *_ws) { struct batcher *b = container_of(_ws, struct batcher, commit_work); blk_status_t r; - unsigned long flags; struct list_head work_items; struct work_struct *ws, *tmp; struct continuation *k; @@ -186,12 +182,12 @@ static void __commit(struct work_struct *_ws) * We have to grab these before the commit_op to avoid a race * condition. */ - spin_lock_irqsave(&b->lock, flags); + spin_lock_irq(&b->lock); list_splice_init(&b->work_items, &work_items); bio_list_merge(&bios, &b->bios); bio_list_init(&b->bios); b->commit_scheduled = false; - spin_unlock_irqrestore(&b->lock, flags); + spin_unlock_irq(&b->lock); r = b->commit_op(b->commit_context); @@ -238,13 +234,12 @@ static void async_commit(struct batcher *b) static void continue_after_commit(struct batcher *b, struct continuation *k) { - unsigned long flags; bool commit_scheduled; - spin_lock_irqsave(&b->lock, flags); + spin_lock_irq(&b->lock); commit_scheduled = b->commit_scheduled; list_add_tail(&k->ws.entry, &b->work_items); - spin_unlock_irqrestore(&b->lock, flags); + spin_unlock_irq(&b->lock); if (commit_scheduled) async_commit(b); @@ -255,13 +250,12 @@ static void continue_after_commit(struct batcher *b, struct continuation *k) */ static void issue_after_commit(struct batcher *b, struct bio *bio) { - unsigned long flags; bool commit_scheduled; - spin_lock_irqsave(&b->lock, flags); + spin_lock_irq(&b->lock); commit_scheduled = b->commit_scheduled; bio_list_add(&b->bios, bio); - spin_unlock_irqrestore(&b->lock, flags); + spin_unlock_irq(&b->lock); if (commit_scheduled) async_commit(b); @@ -273,12 +267,11 @@ static void issue_after_commit(struct batcher *b, struct bio *bio) static void schedule_commit(struct batcher *b) { bool immediate; - unsigned long flags; - spin_lock_irqsave(&b->lock, flags); + spin_lock_irq(&b->lock); immediate = !list_empty(&b->work_items) || !bio_list_empty(&b->bios); b->commit_scheduled = true; - spin_unlock_irqrestore(&b->lock, flags); + spin_unlock_irq(&b->lock); if (immediate) async_commit(b); @@ -630,23 +623,19 @@ static struct per_bio_data *init_per_bio_data(struct bio *bio) static void defer_bio(struct cache *cache, struct bio *bio) { - unsigned long flags; - - spin_lock_irqsave(&cache->lock, flags); + spin_lock_irq(&cache->lock); bio_list_add(&cache->deferred_bios, bio); - spin_unlock_irqrestore(&cache->lock, flags); + spin_unlock_irq(&cache->lock); 
wake_deferred_bio_worker(cache); } static void defer_bios(struct cache *cache, struct bio_list *bios) { - unsigned long flags; - - spin_lock_irqsave(&cache->lock, flags); + spin_lock_irq(&cache->lock); bio_list_merge(&cache->deferred_bios, bios); bio_list_init(bios); - spin_unlock_irqrestore(&cache->lock, flags); + spin_unlock_irq(&cache->lock); wake_deferred_bio_worker(cache); } @@ -756,33 +745,27 @@ static dm_dblock_t oblock_to_dblock(struct cache *cache, dm_oblock_t oblock) static void set_discard(struct cache *cache, dm_dblock_t b) { - unsigned long flags; - BUG_ON(from_dblock(b) >= from_dblock(cache->discard_nr_blocks)); atomic_inc(&cache->stats.discard_count); - spin_lock_irqsave(&cache->lock, flags); + spin_lock_irq(&cache->lock); set_bit(from_dblock(b), cache->discard_bitset); - spin_unlock_irqrestore(&cache->lock, flags); + spin_unlock_irq(&cache->lock); } static void clear_discard(struct cache *cache, dm_dblock_t b) { - unsigned long flags; - - spin_lock_irqsave(&cache->lock, flags); + spin_lock_irq(&cache->lock); clear_bit(from_dblock(b), cache->discard_bitset); - spin_unlock_irqrestore(&cache->lock, flags); + spin_unlock_irq(&cache->lock); } static bool is_discarded(struct cache *cache, dm_dblock_t b) { int r; - unsigned long flags; - - spin_lock_irqsave(&cache->lock, flags); + spin_lock_irq(&cache->lock); r = test_bit(from_dblock(b), cache->discard_bitset); - spin_unlock_irqrestore(&cache->lock, flags); + spin_unlock_irq(&cache->lock); return r; } @@ -790,12 +773,10 @@ static bool is_discarded(struct cache *cache, dm_dblock_t b) static bool is_discarded_oblock(struct cache *cache, dm_oblock_t b) { int r; - unsigned long flags; - - spin_lock_irqsave(&cache->lock, flags); + spin_lock_irq(&cache->lock); r = test_bit(from_dblock(oblock_to_dblock(cache, b)), cache->discard_bitset); - spin_unlock_irqrestore(&cache->lock, flags); + spin_unlock_irq(&cache->lock); return r; } @@ -827,17 +808,16 @@ static void remap_to_cache(struct cache *cache, struct bio *bio, static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio) { - unsigned long flags; struct per_bio_data *pb; - spin_lock_irqsave(&cache->lock, flags); + spin_lock_irq(&cache->lock); if (cache->need_tick_bio && !op_is_flush(bio->bi_opf) && bio_op(bio) != REQ_OP_DISCARD) { pb = get_per_bio_data(bio); pb->tick = true; cache->need_tick_bio = false; } - spin_unlock_irqrestore(&cache->lock, flags); + spin_unlock_irq(&cache->lock); } static void __remap_to_origin_clear_discard(struct cache *cache, struct bio *bio, @@ -1889,17 +1869,16 @@ static void process_deferred_bios(struct work_struct *ws) { struct cache *cache = container_of(ws, struct cache, deferred_bio_worker); - unsigned long flags; bool commit_needed = false; struct bio_list bios; struct bio *bio; bio_list_init(&bios); - spin_lock_irqsave(&cache->lock, flags); + spin_lock_irq(&cache->lock); bio_list_merge(&bios, &cache->deferred_bios); bio_list_init(&cache->deferred_bios); - spin_unlock_irqrestore(&cache->lock, flags); + spin_unlock_irq(&cache->lock); while ((bio = bio_list_pop(&bios))) { if (bio->bi_opf & REQ_PREFLUSH) -- cgit v1.2.3 From 6ec1be501500787264ce62b8786ed7723bcd93f5 Mon Sep 17 00:00:00 2001 From: Bryan Gurney Date: Fri, 4 Oct 2019 11:42:08 -0400 Subject: dm dust: change result vars to r Change the "result" variables to "r" in dust_status() and dust_message(). 
Signed-off-by: Bryan Gurney Signed-off-by: Mike Snitzer --- drivers/md/dm-dust.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-dust.c b/drivers/md/dm-dust.c index 8288887b7f94..29626ff8b8e8 100644 --- a/drivers/md/dm-dust.c +++ b/drivers/md/dm-dust.c @@ -375,7 +375,7 @@ static int dust_message(struct dm_target *ti, unsigned int argc, char **argv, struct dust_device *dd = ti->private; sector_t size = i_size_read(dd->dev->bdev->bd_inode) >> SECTOR_SHIFT; bool invalid_msg = false; - int result = -EINVAL; + int r = -EINVAL; unsigned long long tmp, block; unsigned long flags; char dummy; @@ -388,45 +388,45 @@ static int dust_message(struct dm_target *ti, unsigned int argc, char **argv, } else if (!strcasecmp(argv[0], "disable")) { DMINFO("disabling read failures on bad sectors"); dd->fail_read_on_bb = false; - result = 0; + r = 0; } else if (!strcasecmp(argv[0], "enable")) { DMINFO("enabling read failures on bad sectors"); dd->fail_read_on_bb = true; - result = 0; + r = 0; } else if (!strcasecmp(argv[0], "countbadblocks")) { spin_lock_irqsave(&dd->dust_lock, flags); DMINFO("countbadblocks: %llu badblock(s) found", dd->badblock_count); spin_unlock_irqrestore(&dd->dust_lock, flags); - result = 0; + r = 0; } else if (!strcasecmp(argv[0], "clearbadblocks")) { - result = dust_clear_badblocks(dd); + r = dust_clear_badblocks(dd); } else if (!strcasecmp(argv[0], "quiet")) { if (!dd->quiet_mode) dd->quiet_mode = true; else dd->quiet_mode = false; - result = 0; + r = 0; } else { invalid_msg = true; } } else if (argc == 2) { if (sscanf(argv[1], "%llu%c", &tmp, &dummy) != 1) - return result; + return r; block = tmp; sector_div(size, dd->sect_per_block); if (block > size) { DMERR("selected block value out of range"); - return result; + return r; } if (!strcasecmp(argv[0], "addbadblock")) - result = dust_add_block(dd, block); + r = dust_add_block(dd, block); else if (!strcasecmp(argv[0], "removebadblock")) - result = dust_remove_block(dd, block); + r = dust_remove_block(dd, block); else if (!strcasecmp(argv[0], "queryblock")) - result = dust_query_block(dd, block); + r = dust_query_block(dd, block); else invalid_msg = true; @@ -436,7 +436,7 @@ static int dust_message(struct dm_target *ti, unsigned int argc, char **argv, if (invalid_msg) DMERR("unrecognized message '%s' received", argv[0]); - return result; + return r; } static void dust_status(struct dm_target *ti, status_type_t type, @@ -499,12 +499,12 @@ static struct target_type dust_target = { static int __init dm_dust_init(void) { - int result = dm_register_target(&dust_target); + int r = dm_register_target(&dust_target); - if (result < 0) - DMERR("dm_register_target failed %d", result); + if (r < 0) + DMERR("dm_register_target failed %d", r); - return result; + return r; } static void __exit dm_dust_exit(void) -- cgit v1.2.3 From cc7a7fb3b689996d35404080dde09de03fc1d09b Mon Sep 17 00:00:00 2001 From: Bryan Gurney Date: Fri, 4 Oct 2019 11:42:09 -0400 Subject: dm dust: change ret to r in dust_map_read and dust_map In the dust_map_read() and dust_map() functions, change the return code variable "ret" to "r", to match the convention of the other device-mapper targets. 
Signed-off-by: Bryan Gurney Signed-off-by: Mike Snitzer --- drivers/md/dm-dust.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-dust.c b/drivers/md/dm-dust.c index 29626ff8b8e8..9a926a799fb4 100644 --- a/drivers/md/dm-dust.c +++ b/drivers/md/dm-dust.c @@ -163,16 +163,16 @@ static int dust_map_read(struct dust_device *dd, sector_t thisblock, bool fail_read_on_bb) { unsigned long flags; - int ret = DM_MAPIO_REMAPPED; + int r = DM_MAPIO_REMAPPED; if (fail_read_on_bb) { thisblock >>= dd->sect_per_block_shift; spin_lock_irqsave(&dd->dust_lock, flags); - ret = __dust_map_read(dd, thisblock); + r = __dust_map_read(dd, thisblock); spin_unlock_irqrestore(&dd->dust_lock, flags); } - return ret; + return r; } static void __dust_map_write(struct dust_device *dd, sector_t thisblock) @@ -209,17 +209,17 @@ static int dust_map_write(struct dust_device *dd, sector_t thisblock, static int dust_map(struct dm_target *ti, struct bio *bio) { struct dust_device *dd = ti->private; - int ret; + int r; bio_set_dev(bio, dd->dev->bdev); bio->bi_iter.bi_sector = dd->start + dm_target_offset(ti, bio->bi_iter.bi_sector); if (bio_data_dir(bio) == READ) - ret = dust_map_read(dd, bio->bi_iter.bi_sector, dd->fail_read_on_bb); + r = dust_map_read(dd, bio->bi_iter.bi_sector, dd->fail_read_on_bb); else - ret = dust_map_write(dd, bio->bi_iter.bi_sector, dd->fail_read_on_bb); + r = dust_map_write(dd, bio->bi_iter.bi_sector, dd->fail_read_on_bb); - return ret; + return r; } static bool __dust_clear_badblocks(struct rb_root *tree, -- cgit v1.2.3 From 72d7df4c8079306e7fbd6243bf951461ed647a47 Mon Sep 17 00:00:00 2001 From: Bryan Gurney Date: Tue, 22 Oct 2019 16:46:01 -0400 Subject: dm dust: add limited write failure mode Add a limited write failure mode which allows a write to a block to fail a specified number of times, prior to remapping. The "addbadblock" message is extended to allow specifying the number of times a write fails. Example: add a bad block at block 60 with 5 write failures: dmsetup message 0 dust1 addbadblock 60 5 The write failure counter will be printed for newly added bad blocks. 
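The semantics of the new counter can be modelled in a few lines of
standalone C (a sketch only; the struct and helper are hypothetical
stand-ins for the badblock tree and __dust_map_write() below, with
DM_MAPIO_KILL/DM_MAPIO_REMAPPED reduced to fail/ok):

    #include <stdio.h>

    struct bb_model {
            unsigned long long bb;
            unsigned char wr_fail_cnt;
            int present;
    };

    /* Fail while the counter is non-zero, then forget the block so
     * subsequent writes are remapped normally. */
    static int model_map_write(struct bb_model *b, unsigned long long blk)
    {
            if (!b->present || b->bb != blk)
                    return 0;               /* remapped */
            if (b->wr_fail_cnt > 0) {
                    b->wr_fail_cnt--;
                    return -1;              /* killed */
            }
            b->present = 0;                 /* clean write clears the block */
            return 0;
    }

    int main(void)
    {
            struct bb_model b = { .bb = 60, .wr_fail_cnt = 5, .present = 1 };
            int i;

            for (i = 0; i < 7; i++)
                    printf("write %d: %s\n", i,
                           model_map_write(&b, 60) ? "fails" : "succeeds");
            return 0;
    }

Run against the dmsetup example above, the first five writes to block
60 fail and the sixth and later writes succeed.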
Signed-off-by: Bryan Gurney Signed-off-by: Mike Snitzer --- drivers/md/dm-dust.c | 53 +++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 46 insertions(+), 7 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-dust.c b/drivers/md/dm-dust.c index 9a926a799fb4..eb37584427a4 100644 --- a/drivers/md/dm-dust.c +++ b/drivers/md/dm-dust.c @@ -17,6 +17,7 @@ struct badblock { struct rb_node node; sector_t bb; + unsigned char wr_fail_cnt; }; struct dust_device { @@ -101,7 +102,8 @@ static int dust_remove_block(struct dust_device *dd, unsigned long long block) return 0; } -static int dust_add_block(struct dust_device *dd, unsigned long long block) +static int dust_add_block(struct dust_device *dd, unsigned long long block, + unsigned char wr_fail_cnt) { struct badblock *bblock; unsigned long flags; @@ -115,6 +117,7 @@ static int dust_add_block(struct dust_device *dd, unsigned long long block) spin_lock_irqsave(&dd->dust_lock, flags); bblock->bb = block; + bblock->wr_fail_cnt = wr_fail_cnt; if (!dust_rb_insert(&dd->badblocklist, bblock)) { if (!dd->quiet_mode) { DMERR("%s: block %llu already in badblocklist", @@ -126,8 +129,10 @@ static int dust_add_block(struct dust_device *dd, unsigned long long block) } dd->badblock_count++; - if (!dd->quiet_mode) - DMINFO("%s: badblock added at block %llu", __func__, block); + if (!dd->quiet_mode) { + DMINFO("%s: badblock added at block %llu with write fail count %hhu", + __func__, block, wr_fail_cnt); + } spin_unlock_irqrestore(&dd->dust_lock, flags); return 0; @@ -175,10 +180,15 @@ static int dust_map_read(struct dust_device *dd, sector_t thisblock, return r; } -static void __dust_map_write(struct dust_device *dd, sector_t thisblock) +static int __dust_map_write(struct dust_device *dd, sector_t thisblock) { struct badblock *bblk = dust_rb_search(&dd->badblocklist, thisblock); + if (bblk && bblk->wr_fail_cnt > 0) { + bblk->wr_fail_cnt--; + return DM_MAPIO_KILL; + } + if (bblk) { rb_erase(&bblk->node, &dd->badblocklist); dd->badblock_count--; @@ -189,21 +199,24 @@ static void __dust_map_write(struct dust_device *dd, sector_t thisblock) (unsigned long long)thisblock); } } + + return DM_MAPIO_REMAPPED; } static int dust_map_write(struct dust_device *dd, sector_t thisblock, bool fail_read_on_bb) { unsigned long flags; + int ret = DM_MAPIO_REMAPPED; if (fail_read_on_bb) { thisblock >>= dd->sect_per_block_shift; spin_lock_irqsave(&dd->dust_lock, flags); - __dust_map_write(dd, thisblock); + ret = __dust_map_write(dd, thisblock); spin_unlock_irqrestore(&dd->dust_lock, flags); } - return DM_MAPIO_REMAPPED; + return ret; } static int dust_map(struct dm_target *ti, struct bio *bio) @@ -377,6 +390,8 @@ static int dust_message(struct dm_target *ti, unsigned int argc, char **argv, bool invalid_msg = false; int r = -EINVAL; unsigned long long tmp, block; + unsigned char wr_fail_cnt; + unsigned int tmp_ui; unsigned long flags; char dummy; @@ -422,7 +437,7 @@ static int dust_message(struct dm_target *ti, unsigned int argc, char **argv, } if (!strcasecmp(argv[0], "addbadblock")) - r = dust_add_block(dd, block); + r = dust_add_block(dd, block, 0); else if (!strcasecmp(argv[0], "removebadblock")) r = dust_remove_block(dd, block); else if (!strcasecmp(argv[0], "queryblock")) @@ -430,6 +445,30 @@ static int dust_message(struct dm_target *ti, unsigned int argc, char **argv, else invalid_msg = true; + } else if (argc == 3) { + if (sscanf(argv[1], "%llu%c", &tmp, &dummy) != 1) + return r; + + if (sscanf(argv[2], "%u%c", &tmp_ui, &dummy) != 1) + return r; + + 
block = tmp; + if (tmp_ui > 255) { + DMERR("selected write fail count out of range"); + return r; + } + wr_fail_cnt = tmp_ui; + sector_div(size, dd->sect_per_block); + if (block > size) { + DMERR("selected block value out of range"); + return r; + } + + if (!strcasecmp(argv[0], "addbadblock")) + r = dust_add_block(dd, block, wr_fail_cnt); + else + invalid_msg = true; + } else DMERR("invalid number of arguments '%d'", argc); -- cgit v1.2.3 From e7fad909b68aa37470d9f2d2731b5bec355ee5d6 Mon Sep 17 00:00:00 2001 From: Dmitry Fomichev Date: Wed, 6 Nov 2019 14:34:35 -0800 Subject: dm zoned: reduce overhead of backing device checks Commit 75d66ffb48efb3 added backing device health checks and as a part of these checks, check_events() block ops template call is invoked in dm-zoned mapping path as well as in reclaim and flush path. Calling check_events() with ATA or SCSI backing devices introduces a blocking scsi_test_unit_ready() call being made in sd_check_events(). Even though the overhead of calling scsi_test_unit_ready() is small for ATA zoned devices, it is much larger for SCSI and it affects performance in a very negative way. Fix this performance regression by executing check_events() only in case of any I/O errors. The function dmz_bdev_is_dying() is modified to call only blk_queue_dying(), while calls to check_events() are made in a new helper function, dmz_check_bdev(). Reported-by: zhangxiaoxu Fixes: 75d66ffb48efb3 ("dm zoned: properly handle backing device failure") Cc: stable@vger.kernel.org Signed-off-by: Dmitry Fomichev Signed-off-by: Mike Snitzer --- drivers/md/dm-zoned-metadata.c | 29 +++++++++++++++-------- drivers/md/dm-zoned-reclaim.c | 8 ++----- drivers/md/dm-zoned-target.c | 54 +++++++++++++++++++++++++++++------------- drivers/md/dm-zoned.h | 2 ++ 4 files changed, 61 insertions(+), 32 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index 595a73110e17..ac1179ca80d9 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -554,6 +554,7 @@ static struct dmz_mblock *dmz_get_mblock(struct dmz_metadata *zmd, TASK_UNINTERRUPTIBLE); if (test_bit(DMZ_META_ERROR, &mblk->state)) { dmz_release_mblock(zmd, mblk); + dmz_check_bdev(zmd->dev); return ERR_PTR(-EIO); } @@ -625,6 +626,8 @@ static int dmz_rdwr_block(struct dmz_metadata *zmd, int op, sector_t block, ret = submit_bio_wait(bio); bio_put(bio); + if (ret) + dmz_check_bdev(zmd->dev); return ret; } @@ -691,6 +694,7 @@ static int dmz_write_dirty_mblocks(struct dmz_metadata *zmd, TASK_UNINTERRUPTIBLE); if (test_bit(DMZ_META_ERROR, &mblk->state)) { clear_bit(DMZ_META_ERROR, &mblk->state); + dmz_check_bdev(zmd->dev); ret = -EIO; } nr_mblks_submitted--; @@ -768,7 +772,7 @@ int dmz_flush_metadata(struct dmz_metadata *zmd) /* If there are no dirty metadata blocks, just flush the device cache */ if (list_empty(&write_list)) { ret = blkdev_issue_flush(zmd->dev->bdev, GFP_NOIO, NULL); - goto out; + goto err; } /* @@ -778,7 +782,7 @@ int dmz_flush_metadata(struct dmz_metadata *zmd) */ ret = dmz_log_dirty_mblocks(zmd, &write_list); if (ret) - goto out; + goto err; /* * The log is on disk. 
It is now safe to update in place @@ -786,11 +790,11 @@ int dmz_flush_metadata(struct dmz_metadata *zmd) */ ret = dmz_write_dirty_mblocks(zmd, &write_list, zmd->mblk_primary); if (ret) - goto out; + goto err; ret = dmz_write_sb(zmd, zmd->mblk_primary); if (ret) - goto out; + goto err; while (!list_empty(&write_list)) { mblk = list_first_entry(&write_list, struct dmz_mblock, link); @@ -805,16 +809,20 @@ int dmz_flush_metadata(struct dmz_metadata *zmd) zmd->sb_gen++; out: - if (ret && !list_empty(&write_list)) { - spin_lock(&zmd->mblk_lock); - list_splice(&write_list, &zmd->mblk_dirty_list); - spin_unlock(&zmd->mblk_lock); - } - dmz_unlock_flush(zmd); up_write(&zmd->mblk_sem); return ret; + +err: + if (!list_empty(&write_list)) { + spin_lock(&zmd->mblk_lock); + list_splice(&write_list, &zmd->mblk_dirty_list); + spin_unlock(&zmd->mblk_lock); + } + if (!dmz_check_bdev(zmd->dev)) + ret = -EIO; + goto out; } /* @@ -1244,6 +1252,7 @@ static int dmz_update_zone(struct dmz_metadata *zmd, struct dm_zone *zone) if (ret) { dmz_dev_err(zmd->dev, "Get zone %u report failed", dmz_id(zmd, zone)); + dmz_check_bdev(zmd->dev); return ret; } diff --git a/drivers/md/dm-zoned-reclaim.c b/drivers/md/dm-zoned-reclaim.c index d240d7ca8a8a..e7ace908a9b7 100644 --- a/drivers/md/dm-zoned-reclaim.c +++ b/drivers/md/dm-zoned-reclaim.c @@ -82,6 +82,7 @@ static int dmz_reclaim_align_wp(struct dmz_reclaim *zrc, struct dm_zone *zone, "Align zone %u wp %llu to %llu (wp+%u) blocks failed %d", dmz_id(zmd, zone), (unsigned long long)wp_block, (unsigned long long)block, nr_blocks, ret); + dmz_check_bdev(zrc->dev); return ret; } @@ -489,12 +490,7 @@ static void dmz_reclaim_work(struct work_struct *work) ret = dmz_do_reclaim(zrc); if (ret) { dmz_dev_debug(zrc->dev, "Reclaim error %d\n", ret); - if (ret == -EIO) - /* - * LLD might be performing some error handling sequence - * at the underlying device. To not interfere, do not - * attempt to schedule the next reclaim run immediately. - */ + if (!dmz_check_bdev(zrc->dev)) return; } diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c index d3bcc4197f5d..4574e0dedbd6 100644 --- a/drivers/md/dm-zoned-target.c +++ b/drivers/md/dm-zoned-target.c @@ -80,6 +80,8 @@ static inline void dmz_bio_endio(struct bio *bio, blk_status_t status) if (status != BLK_STS_OK && bio->bi_status == BLK_STS_OK) bio->bi_status = status; + if (bio->bi_status != BLK_STS_OK) + bioctx->target->dev->flags |= DMZ_CHECK_BDEV; if (refcount_dec_and_test(&bioctx->ref)) { struct dm_zone *zone = bioctx->zone; @@ -565,31 +567,51 @@ out: } /* - * Check the backing device availability. If it's on the way out, + * Check if the backing device is being removed. If it's on the way out, * start failing I/O. Reclaim and metadata components also call this * function to cleanly abort operation in the event of such failure. 
*/ bool dmz_bdev_is_dying(struct dmz_dev *dmz_dev) { - struct gendisk *disk; + if (dmz_dev->flags & DMZ_BDEV_DYING) + return true; - if (!(dmz_dev->flags & DMZ_BDEV_DYING)) { - disk = dmz_dev->bdev->bd_disk; - if (blk_queue_dying(bdev_get_queue(dmz_dev->bdev))) { - dmz_dev_warn(dmz_dev, "Backing device queue dying"); - dmz_dev->flags |= DMZ_BDEV_DYING; - } else if (disk->fops->check_events) { - if (disk->fops->check_events(disk, 0) & - DISK_EVENT_MEDIA_CHANGE) { - dmz_dev_warn(dmz_dev, "Backing device offline"); - dmz_dev->flags |= DMZ_BDEV_DYING; - } - } + if (dmz_dev->flags & DMZ_CHECK_BDEV) + return !dmz_check_bdev(dmz_dev); + + if (blk_queue_dying(bdev_get_queue(dmz_dev->bdev))) { + dmz_dev_warn(dmz_dev, "Backing device queue dying"); + dmz_dev->flags |= DMZ_BDEV_DYING; } return dmz_dev->flags & DMZ_BDEV_DYING; } +/* + * Check the backing device availability. This detects such events as + * backing device going offline due to errors, media removals, etc. + * This check is less efficient than dmz_bdev_is_dying() and should + * only be performed as a part of error handling. + */ +bool dmz_check_bdev(struct dmz_dev *dmz_dev) +{ + struct gendisk *disk; + + dmz_dev->flags &= ~DMZ_CHECK_BDEV; + + if (dmz_bdev_is_dying(dmz_dev)) + return false; + + disk = dmz_dev->bdev->bd_disk; + if (disk->fops->check_events && + disk->fops->check_events(disk, 0) & DISK_EVENT_MEDIA_CHANGE) { + dmz_dev_warn(dmz_dev, "Backing device offline"); + dmz_dev->flags |= DMZ_BDEV_DYING; + } + + return !(dmz_dev->flags & DMZ_BDEV_DYING); +} + /* * Process a new BIO. */ @@ -902,8 +924,8 @@ static int dmz_prepare_ioctl(struct dm_target *ti, struct block_device **bdev) { struct dmz_target *dmz = ti->private; - if (dmz_bdev_is_dying(dmz->dev)) - return -ENODEV; + if (!dmz_check_bdev(dmz->dev)) + return -EIO; *bdev = dmz->dev->bdev; diff --git a/drivers/md/dm-zoned.h b/drivers/md/dm-zoned.h index d8e70b0ade35..5b5e493d479c 100644 --- a/drivers/md/dm-zoned.h +++ b/drivers/md/dm-zoned.h @@ -72,6 +72,7 @@ struct dmz_dev { /* Device flags. */ #define DMZ_BDEV_DYING (1 << 0) +#define DMZ_CHECK_BDEV (2 << 0) /* * Zone descriptor. @@ -255,5 +256,6 @@ void dmz_schedule_reclaim(struct dmz_reclaim *zrc); * Functions defined in dm-zoned-target.c */ bool dmz_bdev_is_dying(struct dmz_dev *dmz_dev); +bool dmz_check_bdev(struct dmz_dev *dmz_dev); #endif /* DM_ZONED_H */ -- cgit v1.2.3 From 35ad035b8398c634d888324a299ec50a84cbc6cc Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Thu, 24 Oct 2019 13:28:03 -0700 Subject: dm raid: Remove unnecessary negation of a shift in raid10_format_to_md_layout When building with Clang + -Wtautological-constant-compare: drivers/md/dm-raid.c:619:8: warning: converting the result of '<<' to a boolean always evaluates to true [-Wtautological-constant-compare] r = !RAID10_OFFSET; ^ drivers/md/dm-raid.c:517:28: note: expanded from macro 'RAID10_OFFSET' #define RAID10_OFFSET (1 << 16) /* stripes with data copies area adjacent on devices */ ^ 1 warning generated. Negating a non-zero number will always make it zero, which is the default value of r in this function so this statement is unnecessary; remove it so that clang no longer warns. 
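The warning's point is easy to demonstrate in standalone C
(illustrative only):

    #include <stdio.h>

    #define RAID10_OFFSET (1 << 16)     /* as defined in dm-raid.c */

    int main(void)
    {
            int r = 0;                  /* the function's default value */

            r = !RAID10_OFFSET;         /* !(non-zero) is always 0 */
            printf("%d\n", r);          /* prints 0: the store was a no-op */
            return 0;
    }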
Link: https://github.com/ClangBuiltLinux/linux/issues/753 Signed-off-by: Nathan Chancellor Acked-by: Heinz Mauelshagen Signed-off-by: Mike Snitzer --- drivers/md/dm-raid.c | 1 - 1 file changed, 1 deletion(-) (limited to 'drivers') diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 13fa90546a0f..c412eaa975fc 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -620,7 +620,6 @@ static int raid10_format_to_md_layout(struct raid_set *rs, } else if (algorithm == ALGORITHM_RAID10_FAR) { f = copies; - r = !RAID10_OFFSET; if (!test_bit(__CTR_FLAG_RAID10_USE_NEAR_SETS, &rs->ctr_flags)) r |= RAID10_USE_FAR_SETS; -- cgit v1.2.3 From d537858ac8aaf4311b51240893add2fc62003b97 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Wed, 13 Nov 2019 06:48:16 -0500 Subject: dm integrity: fix excessive alignment of metadata runs Metadata runs are supposed to be aligned on 4k boundary (so that they work efficiently with disks with 4k sectors). However, there was a programming bug that makes them aligned on 128k boundary instead. The unused space is wasted. Fix this bug by providing a proper 4k alignment. In order to keep existing volumes working, we introduce a new flag SB_FLAG_FIXED_PADDING - when the flag is clear, we calculate the padding the old way. In order to make sure that the old version cannot mount the volume created by the new version, we increase superblock version to 4. Also in order to not break with old integritysetup, we fix alignment only if the parameter "fix_padding" is present when formatting the device. Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- .../admin-guide/device-mapper/dm-integrity.rst | 5 ++++ drivers/md/dm-integrity.c | 28 ++++++++++++++++++---- 2 files changed, 28 insertions(+), 5 deletions(-) (limited to 'drivers') diff --git a/Documentation/admin-guide/device-mapper/dm-integrity.rst b/Documentation/admin-guide/device-mapper/dm-integrity.rst index a30aa91b5fbe..594095b54b29 100644 --- a/Documentation/admin-guide/device-mapper/dm-integrity.rst +++ b/Documentation/admin-guide/device-mapper/dm-integrity.rst @@ -177,6 +177,11 @@ bitmap_flush_interval:number The bitmap flush interval in milliseconds. The metadata buffers are synchronized when this interval expires. +fix_padding + Use a smaller padding of the tag area that is more + space-efficient. If this option is not present, large padding is + used - that is for compatibility with older kernels. 
+ The journal mode (D/J), buffer_sectors, journal_watermark, commit_time can be changed when reloading the target (load an inactive table and swap the diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index dab4446fe7d8..b225b3e445fa 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -53,6 +53,7 @@ #define SB_VERSION_1 1 #define SB_VERSION_2 2 #define SB_VERSION_3 3 +#define SB_VERSION_4 4 #define SB_SECTORS 8 #define MAX_SECTORS_PER_BLOCK 8 @@ -73,6 +74,7 @@ struct superblock { #define SB_FLAG_HAVE_JOURNAL_MAC 0x1 #define SB_FLAG_RECALCULATING 0x2 #define SB_FLAG_DIRTY_BITMAP 0x4 +#define SB_FLAG_FIXED_PADDING 0x8 #define JOURNAL_ENTRY_ROUNDUP 8 @@ -250,6 +252,7 @@ struct dm_integrity_c { bool journal_uptodate; bool just_formatted; bool recalculate_flag; + bool fix_padding; struct alg_spec internal_hash_alg; struct alg_spec journal_crypt_alg; @@ -463,7 +466,9 @@ static void wraparound_section(struct dm_integrity_c *ic, unsigned *sec_ptr) static void sb_set_version(struct dm_integrity_c *ic) { - if (ic->mode == 'B' || ic->sb->flags & cpu_to_le32(SB_FLAG_DIRTY_BITMAP)) + if (ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_PADDING)) + ic->sb->version = SB_VERSION_4; + else if (ic->mode == 'B' || ic->sb->flags & cpu_to_le32(SB_FLAG_DIRTY_BITMAP)) ic->sb->version = SB_VERSION_3; else if (ic->meta_dev || ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)) ic->sb->version = SB_VERSION_2; @@ -2955,6 +2960,7 @@ static void dm_integrity_status(struct dm_target *ti, status_type_t type, arg_count += !!ic->internal_hash_alg.alg_string; arg_count += !!ic->journal_crypt_alg.alg_string; arg_count += !!ic->journal_mac_alg.alg_string; + arg_count += (ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_PADDING)) != 0; DMEMIT("%s %llu %u %c %u", ic->dev->name, (unsigned long long)ic->start, ic->tag_size, ic->mode, arg_count); if (ic->meta_dev) @@ -2974,6 +2980,8 @@ static void dm_integrity_status(struct dm_target *ti, status_type_t type, DMEMIT(" sectors_per_bit:%llu", (unsigned long long)ic->sectors_per_block << ic->log2_blocks_per_bitmap_bit); DMEMIT(" bitmap_flush_interval:%u", jiffies_to_msecs(ic->bitmap_flush_interval)); } + if ((ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_PADDING)) != 0) + DMEMIT(" fix_padding"); #define EMIT_ALG(a, n) \ do { \ @@ -3042,8 +3050,14 @@ static int calculate_device_limits(struct dm_integrity_c *ic) if (!ic->meta_dev) { sector_t last_sector, last_area, last_offset; - ic->metadata_run = roundup((__u64)ic->tag_size << (ic->sb->log2_interleave_sectors - ic->sb->log2_sectors_per_block), - (__u64)(1 << SECTOR_SHIFT << METADATA_PADDING_SECTORS)) >> SECTOR_SHIFT; + /* we have to maintain excessive padding for compatibility with existing volumes */ + __u64 metadata_run_padding = + ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_PADDING) ? 
+ (__u64)(METADATA_PADDING_SECTORS << SECTOR_SHIFT) : + (__u64)(1 << SECTOR_SHIFT << METADATA_PADDING_SECTORS); + + ic->metadata_run = round_up((__u64)ic->tag_size << (ic->sb->log2_interleave_sectors - ic->sb->log2_sectors_per_block), + metadata_run_padding) >> SECTOR_SHIFT; if (!(ic->metadata_run & (ic->metadata_run - 1))) ic->log2_metadata_run = __ffs(ic->metadata_run); else @@ -3086,6 +3100,8 @@ static int initialize_superblock(struct dm_integrity_c *ic, unsigned journal_sec journal_sections = 1; if (!ic->meta_dev) { + if (ic->fix_padding) + ic->sb->flags |= cpu_to_le32(SB_FLAG_FIXED_PADDING); ic->sb->journal_sections = cpu_to_le32(journal_sections); if (!interleave_sectors) interleave_sectors = DEFAULT_INTERLEAVE_SECTORS; @@ -3725,6 +3741,8 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) goto bad; } else if (!strcmp(opt_string, "recalculate")) { ic->recalculate_flag = true; + } else if (!strcmp(opt_string, "fix_padding")) { + ic->fix_padding = true; } else { r = -EINVAL; ti->error = "Invalid argument"; @@ -3867,7 +3885,7 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) should_write_sb = true; } - if (!ic->sb->version || ic->sb->version > SB_VERSION_3) { + if (!ic->sb->version || ic->sb->version > SB_VERSION_4) { r = -EINVAL; ti->error = "Unknown version"; goto bad; @@ -4182,7 +4200,7 @@ static void dm_integrity_dtr(struct dm_target *ti) static struct target_type integrity_target = { .name = "integrity", - .version = {1, 3, 0}, + .version = {1, 4, 0}, .module = THIS_MODULE, .features = DM_TARGET_SINGLETON | DM_TARGET_INTEGRITY, .ctr = dm_integrity_ctr, -- cgit v1.2.3 From d256d796279de0bdc227ff4daef565aa7e80c898 Mon Sep 17 00:00:00 2001 From: Jeffle Xu Date: Mon, 18 Nov 2019 09:50:38 +0800 Subject: dm thin: wakeup worker only when deferred bios exist Single thread fio test (read, bs=4k, ioengine=libaio, iodepth=128, numjobs=1) over dm-thin device has poor performance versus bare nvme device. Further investigation with perf indicates that queue_work_on() consumes over 20% CPU time when doing IO over dm-thin device. The call stack is as follows. - 40.57% thin_map + 22.07% queue_work_on + 9.95% dm_thin_find_block + 2.80% cell_defer_no_holder 1.91% inc_all_io_entry.isra.33.part.34 + 1.78% bio_detain.isra.35 In cell_defer_no_holder(), wakeup_worker() is always called, no matter whether the tc->deferred_bio_list list is empty or not. In single thread IO model, this list is most likely empty. So skip waking up worker thread if tc->deferred_bio_list list is empty. Single thread IO performance improves from 448 MiB/s to 646 MiB/s (+44%) once the needless wake_worker() calls are properly skipped. 
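The underlying pattern generalizes beyond dm-thin. A sketch of it under
assumed names (the context struct and helper are invented, not the
dm-thin code): sample the list while still holding the lock, then wake
the worker only when something was actually queued.

    #include <linux/bio.h>
    #include <linux/spinlock.h>
    #include <linux/workqueue.h>

    struct defer_ctx {
            spinlock_t lock;
            struct bio_list deferred;
            struct workqueue_struct *wq;
            struct work_struct worker;
    };

    static void release_and_maybe_wake(struct defer_ctx *c)
    {
            unsigned long flags;
            bool has_work;

            spin_lock_irqsave(&c->lock, flags);
            /* ... splice released bios onto c->deferred here ... */
            has_work = !bio_list_empty(&c->deferred);
            spin_unlock_irqrestore(&c->lock, flags);

            /* queue_work() showed up prominently in the profile above;
             * skip it entirely when nothing was deferred. */
            if (has_work)
                    queue_work(c->wq, &c->worker);
    }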
Signed-off-by: Jeffle Xu Signed-off-by: Mike Snitzer --- drivers/md/dm-thin.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 85ada5ad2121..5a2c494cb552 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -882,12 +882,15 @@ static void cell_defer_no_holder(struct thin_c *tc, struct dm_bio_prison_cell *c { struct pool *pool = tc->pool; unsigned long flags; + int has_work; spin_lock_irqsave(&tc->lock, flags); cell_release_no_holder(pool, cell, &tc->deferred_bio_list); + has_work = !bio_list_empty(&tc->deferred_bio_list); spin_unlock_irqrestore(&tc->lock, flags); - wake_worker(pool); + if (has_work) + wake_worker(pool); } static void thin_defer_bio(struct thin_c *tc, struct bio *bio); -- cgit v1.2.3 From 443633225ef9e07515578aa0d01bd214fa93db53 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Wed, 20 Nov 2019 21:41:10 +0800 Subject: dm: Fix Kconfig indentation Adjust indentation from spaces to tab (+optional two spaces) as in coding style with command like: $ sed -e 's/^ /\t/' -i */Kconfig Signed-off-by: Krzysztof Kozlowski Signed-off-by: Mike Snitzer --- drivers/md/Kconfig | 54 +++++++++++++++++++++++++++--------------------------- 1 file changed, 27 insertions(+), 27 deletions(-) (limited to 'drivers') diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index aa98953f4462..d6d5ab23c088 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -38,9 +38,9 @@ config MD_AUTODETECT default y ---help--- If you say Y here, then the kernel will try to autodetect raid - arrays as part of its boot process. + arrays as part of its boot process. - If you don't use raid and say Y, this autodetection can cause + If you don't use raid and say Y, this autodetection can cause a several-second delay in the boot time due to various synchronisation steps that are part of this step. @@ -290,7 +290,7 @@ config DM_SNAPSHOT depends on BLK_DEV_DM select DM_BUFIO ---help--- - Allow volume managers to take writable snapshots of a device. + Allow volume managers to take writable snapshots of a device. config DM_THIN_PROVISIONING tristate "Thin provisioning target" @@ -298,7 +298,7 @@ config DM_THIN_PROVISIONING select DM_PERSISTENT_DATA select DM_BIO_PRISON ---help--- - Provides thin provisioning and snapshots that share a data store. + Provides thin provisioning and snapshots that share a data store. config DM_CACHE tristate "Cache target (EXPERIMENTAL)" @@ -307,23 +307,23 @@ config DM_CACHE select DM_PERSISTENT_DATA select DM_BIO_PRISON ---help--- - dm-cache attempts to improve performance of a block device by - moving frequently used data to a smaller, higher performance - device. Different 'policy' plugins can be used to change the - algorithms used to select which blocks are promoted, demoted, - cleaned etc. It supports writeback and writethrough modes. + dm-cache attempts to improve performance of a block device by + moving frequently used data to a smaller, higher performance + device. Different 'policy' plugins can be used to change the + algorithms used to select which blocks are promoted, demoted, + cleaned etc. It supports writeback and writethrough modes. config DM_CACHE_SMQ tristate "Stochastic MQ Cache Policy (EXPERIMENTAL)" depends on DM_CACHE default y ---help--- - A cache policy that uses a multiqueue ordered by recent hits - to select which blocks should be promoted and demoted. - This is meant to be a general purpose policy. It prioritises - reads over writes. 
This SMQ policy (vs MQ) offers the promise - of less memory utilization, improved performance and increased - adaptability in the face of changing workloads. + A cache policy that uses a multiqueue ordered by recent hits + to select which blocks should be promoted and demoted. + This is meant to be a general purpose policy. It prioritises + reads over writes. This SMQ policy (vs MQ) offers the promise + of less memory utilization, improved performance and increased + adaptability in the face of changing workloads. config DM_WRITECACHE tristate "Writecache target" @@ -343,9 +343,9 @@ config DM_ERA select DM_PERSISTENT_DATA select DM_BIO_PRISON ---help--- - dm-era tracks which parts of a block device are written to - over time. Useful for maintaining cache coherency when using - vendor snapshots. + dm-era tracks which parts of a block device are written to + over time. Useful for maintaining cache coherency when using + vendor snapshots. config DM_CLONE tristate "Clone target (EXPERIMENTAL)" @@ -353,20 +353,20 @@ config DM_CLONE default n select DM_PERSISTENT_DATA ---help--- - dm-clone produces a one-to-one copy of an existing, read-only source - device into a writable destination device. The cloned device is - visible/mountable immediately and the copy of the source device to the - destination device happens in the background, in parallel with user - I/O. + dm-clone produces a one-to-one copy of an existing, read-only source + device into a writable destination device. The cloned device is + visible/mountable immediately and the copy of the source device to the + destination device happens in the background, in parallel with user + I/O. - If unsure, say N. + If unsure, say N. config DM_MIRROR tristate "Mirror target" depends on BLK_DEV_DM ---help--- - Allow volume managers to mirror logical volumes, also - needed for live data migration tools such as 'pvmove'. + Allow volume managers to mirror logical volumes, also + needed for live data migration tools such as 'pvmove'. config DM_LOG_USERSPACE tristate "Mirror userspace logging" @@ -483,7 +483,7 @@ config DM_FLAKEY tristate "Flakey target" depends on BLK_DEV_DM ---help--- - A target that intermittently fails I/O for debugging purposes. + A target that intermittently fails I/O for debugging purposes. config DM_VERITY tristate "Verity target support" -- cgit v1.2.3 From f612b2132db529feac4f965f28a1b9258ea7c22b Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Wed, 20 Nov 2019 17:27:39 -0500 Subject: Revert "dm crypt: use WQ_HIGHPRI for the IO and crypt workqueues" This reverts commit a1b89132dc4f61071bdeaab92ea958e0953380a1. Revert required hand-patching due to subsequent changes that were applied since commit a1b89132dc4f61071bdeaab92ea958e0953380a1. 
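For context, a minimal sketch of the allocation this revert restores
(the function and variable names are invented; only the flags mirror
the patch below). WQ_HIGHPRI puts work items on a dedicated
high-priority worker pool, and dropping it returns the kcryptd I/O
queue to ordinary scheduling priority:

    #include <linux/errno.h>
    #include <linux/workqueue.h>

    static struct workqueue_struct *io_wq;

    static int example_setup(const char *devname)
    {
            /*
             * Pre-revert flags: WQ_HIGHPRI | WQ_CPU_INTENSIVE |
             * WQ_MEM_RECLAIM. Post-revert, only WQ_MEM_RECLAIM remains,
             * keeping the queue usable during memory reclaim.
             */
            io_wq = alloc_workqueue("kcryptd_io/%s", WQ_MEM_RECLAIM,
                                    1, devname);
            return io_wq ? 0 : -ENOMEM;
    }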
Requires: ed0302e83098d ("dm crypt: make workqueue names device-specific") Cc: stable@vger.kernel.org Bug: https://bugzilla.kernel.org/show_bug.cgi?id=199857 Reported-by: Vito Caputo Signed-off-by: Mike Snitzer --- drivers/md/dm-crypt.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index f87f6495652f..eb9782fc93fe 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -2700,21 +2700,18 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) } ret = -ENOMEM; - cc->io_queue = alloc_workqueue("kcryptd_io/%s", - WQ_HIGHPRI | WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM, - 1, devname); + cc->io_queue = alloc_workqueue("kcryptd_io/%s", WQ_MEM_RECLAIM, 1, devname); if (!cc->io_queue) { ti->error = "Couldn't create kcryptd io queue"; goto bad; } if (test_bit(DM_CRYPT_SAME_CPU, &cc->flags)) - cc->crypt_queue = alloc_workqueue("kcryptd/%s", - WQ_HIGHPRI | WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM, + cc->crypt_queue = alloc_workqueue("kcryptd/%s", WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM, 1, devname); else cc->crypt_queue = alloc_workqueue("kcryptd/%s", - WQ_HIGHPRI | WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM | WQ_UNBOUND, + WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM | WQ_UNBOUND, num_online_cpus(), devname); if (!cc->crypt_queue) { ti->error = "Couldn't create kcryptd queue"; -- cgit v1.2.3