From 88e7cafdcae8bc6354804c685379784c55e276ca Mon Sep 17 00:00:00 2001 From: Bryan Gurney Date: Thu, 5 Dec 2019 17:08:37 -0500 Subject: dm dust: change ret to r in dust_map_write In the dust_map_write() function, change the return code variable "ret" to "r", to match the convention of the other device-mapper targets. Signed-off-by: Bryan Gurney Signed-off-by: Mike Snitzer --- drivers/md/dm-dust.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-dust.c b/drivers/md/dm-dust.c index eb37584427a4..ff03b90072c5 100644 --- a/drivers/md/dm-dust.c +++ b/drivers/md/dm-dust.c @@ -207,16 +207,16 @@ static int dust_map_write(struct dust_device *dd, sector_t thisblock, bool fail_read_on_bb) { unsigned long flags; - int ret = DM_MAPIO_REMAPPED; + int r = DM_MAPIO_REMAPPED; if (fail_read_on_bb) { thisblock >>= dd->sect_per_block_shift; spin_lock_irqsave(&dd->dust_lock, flags); - ret = __dust_map_write(dd, thisblock); + r = __dust_map_write(dd, thisblock); spin_unlock_irqrestore(&dd->dust_lock, flags); } - return ret; + return r; } static int dust_map(struct dm_target *ti, struct bio *bio) -- cgit v1.2.3 From 43f3952a51f8198d365acb7f51fe42d578fe5d0a Mon Sep 17 00:00:00 2001 From: Heinz Mauelshagen Date: Thu, 19 Dec 2019 17:58:46 +0100 Subject: dm raid: table line rebuild status fixes raid_status() wasn't emitting rebuild flags on the table line properly because the rdev number was not yet set properly; index raid component devices array directly to solve. Also fix wrong argument count on emitted table line caused by 1 too many rebuild/write_mostly argument and consider any journal_(dev|mode) pairs. Link: https://bugzilla.redhat.com/1782045 Signed-off-by: Heinz Mauelshagen Signed-off-by: Mike Snitzer --- .../admin-guide/device-mapper/dm-raid.rst | 2 + drivers/md/dm-raid.c | 43 +++++++++++----------- 2 files changed, 24 insertions(+), 21 deletions(-) (limited to 'drivers/md') diff --git a/Documentation/admin-guide/device-mapper/dm-raid.rst b/Documentation/admin-guide/device-mapper/dm-raid.rst index f6344675e395..695a2ea1d1ae 100644 --- a/Documentation/admin-guide/device-mapper/dm-raid.rst +++ b/Documentation/admin-guide/device-mapper/dm-raid.rst @@ -419,3 +419,5 @@ Version History rebuild errors. 1.15.0 Fix size extensions not being synchronized in case of new MD bitmap pages allocated; also fix those not occuring after previous reductions + 1.15.1 Fix argument count and arguments for rebuild/write_mostly/journal_(dev|mode) + on the status line. diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index c412eaa975fc..9a18bef0a5ff 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -129,7 +129,9 @@ struct raid_dev { CTR_FLAG_RAID10_COPIES | \ CTR_FLAG_RAID10_FORMAT | \ CTR_FLAG_DELTA_DISKS | \ - CTR_FLAG_DATA_OFFSET) + CTR_FLAG_DATA_OFFSET | \ + CTR_FLAG_JOURNAL_DEV | \ + CTR_FLAG_JOURNAL_MODE) /* Valid options definitions per raid level... */ @@ -3001,7 +3003,6 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv) { 1, 254, "Cannot understand number of raid devices parameters" } }; - /* Must have */ arg = dm_shift_arg(&as); if (!arg) { ti->error = "No arguments"; @@ -3508,8 +3509,7 @@ static void raid_status(struct dm_target *ti, status_type_t type, unsigned long recovery; unsigned int raid_param_cnt = 1; /* at least 1 for chunksize */ unsigned int sz = 0; - unsigned int rebuild_disks; - unsigned int write_mostly_params = 0; + unsigned int rebuild_writemostly_count = 0; sector_t progress, resync_max_sectors, resync_mismatches; enum sync_state state; struct raid_type *rt; @@ -3593,18 +3593,20 @@ static void raid_status(struct dm_target *ti, status_type_t type, case STATUSTYPE_TABLE: /* Report the table line string you would use to construct this raid set */ - /* Calculate raid parameter count */ - for (i = 0; i < rs->raid_disks; i++) - if (test_bit(WriteMostly, &rs->dev[i].rdev.flags)) - write_mostly_params += 2; - rebuild_disks = memweight(rs->rebuild_disks, DISKS_ARRAY_ELEMS * sizeof(*rs->rebuild_disks)); - raid_param_cnt += rebuild_disks * 2 + - write_mostly_params + + /* + * Count any rebuild or writemostly argument pairs and subtract the + * hweight count being added below of any rebuild and writemostly ctr flags. + */ + for (i = 0; i < rs->raid_disks; i++) { + rebuild_writemostly_count += (test_bit(i, (void *) rs->rebuild_disks) ? 2 : 0) + + (test_bit(WriteMostly, &rs->dev[i].rdev.flags) ? 2 : 0); + } + rebuild_writemostly_count -= (test_bit(__CTR_FLAG_REBUILD, &rs->ctr_flags) ? 2 : 0) + + (test_bit(__CTR_FLAG_WRITE_MOSTLY, &rs->ctr_flags) ? 2 : 0); + /* Calculate raid parameter count based on ^ rebuild/writemostly argument counts and ctr flags set. */ + raid_param_cnt += rebuild_writemostly_count + hweight32(rs->ctr_flags & CTR_FLAG_OPTIONS_NO_ARGS) + - hweight32(rs->ctr_flags & CTR_FLAG_OPTIONS_ONE_ARG) * 2 + - (test_bit(__CTR_FLAG_JOURNAL_DEV, &rs->ctr_flags) ? 2 : 0) + - (test_bit(__CTR_FLAG_JOURNAL_MODE, &rs->ctr_flags) ? 2 : 0); - + hweight32(rs->ctr_flags & CTR_FLAG_OPTIONS_ONE_ARG) * 2; /* Emit table line */ /* This has to be in the documented order for userspace! */ DMEMIT("%s %u %u", rs->raid_type->name, raid_param_cnt, mddev->new_chunk_sectors); @@ -3612,11 +3614,10 @@ static void raid_status(struct dm_target *ti, status_type_t type, DMEMIT(" %s", dm_raid_arg_name_by_flag(CTR_FLAG_SYNC)); if (test_bit(__CTR_FLAG_NOSYNC, &rs->ctr_flags)) DMEMIT(" %s", dm_raid_arg_name_by_flag(CTR_FLAG_NOSYNC)); - if (rebuild_disks) + if (test_bit(__CTR_FLAG_REBUILD, &rs->ctr_flags)) for (i = 0; i < rs->raid_disks; i++) - if (test_bit(rs->dev[i].rdev.raid_disk, (void *) rs->rebuild_disks)) - DMEMIT(" %s %u", dm_raid_arg_name_by_flag(CTR_FLAG_REBUILD), - rs->dev[i].rdev.raid_disk); + if (test_bit(i, (void *) rs->rebuild_disks)) + DMEMIT(" %s %u", dm_raid_arg_name_by_flag(CTR_FLAG_REBUILD), i); if (test_bit(__CTR_FLAG_DAEMON_SLEEP, &rs->ctr_flags)) DMEMIT(" %s %lu", dm_raid_arg_name_by_flag(CTR_FLAG_DAEMON_SLEEP), mddev->bitmap_info.daemon_sleep); @@ -3626,7 +3627,7 @@ static void raid_status(struct dm_target *ti, status_type_t type, if (test_bit(__CTR_FLAG_MAX_RECOVERY_RATE, &rs->ctr_flags)) DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_MAX_RECOVERY_RATE), mddev->sync_speed_max); - if (write_mostly_params) + if (test_bit(__CTR_FLAG_WRITE_MOSTLY, &rs->ctr_flags)) for (i = 0; i < rs->raid_disks; i++) if (test_bit(WriteMostly, &rs->dev[i].rdev.flags)) DMEMIT(" %s %d", dm_raid_arg_name_by_flag(CTR_FLAG_WRITE_MOSTLY), @@ -4029,7 +4030,7 @@ static void raid_resume(struct dm_target *ti) static struct target_type raid_target = { .name = "raid", - .version = {1, 15, 0}, + .version = {1, 15, 1}, .module = THIS_MODULE, .ctr = raid_ctr, .dtr = raid_dtr, -- cgit v1.2.3 From b39962950339912978484cdac50069258545d753 Mon Sep 17 00:00:00 2001 From: Dmitry Fomichev Date: Mon, 23 Dec 2019 17:05:46 -0800 Subject: dm zoned: support zone sizes smaller than 128MiB dm-zoned is observed to log failed kernel assertions and not work correctly when operating against a device with a zone size smaller than 128MiB (e.g. 32768 bits per 4K block). The reason is that the bitmap size per zone is calculated as zero with such a small zone size. Fix this problem and also make the code related to zone bitmap management be able to handle per zone bitmaps smaller than a single block. A dm-zoned-tools patch is required to properly format dm-zoned devices with zone sizes smaller than 128MiB. Fixes: 3b1a94c88b79 ("dm zoned: drive-managed zoned block device target") Cc: stable@vger.kernel.org Signed-off-by: Dmitry Fomichev Reviewed-by: Damien Le Moal Signed-off-by: Mike Snitzer --- drivers/md/dm-zoned-metadata.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index 22b3cb0050a7..516c7b671d25 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -134,6 +134,7 @@ struct dmz_metadata { sector_t zone_bitmap_size; unsigned int zone_nr_bitmap_blocks; + unsigned int zone_bits_per_mblk; unsigned int nr_bitmap_blocks; unsigned int nr_map_blocks; @@ -1161,7 +1162,10 @@ static int dmz_init_zones(struct dmz_metadata *zmd) /* Init */ zmd->zone_bitmap_size = dev->zone_nr_blocks >> 3; - zmd->zone_nr_bitmap_blocks = zmd->zone_bitmap_size >> DMZ_BLOCK_SHIFT; + zmd->zone_nr_bitmap_blocks = + max_t(sector_t, 1, zmd->zone_bitmap_size >> DMZ_BLOCK_SHIFT); + zmd->zone_bits_per_mblk = min_t(sector_t, dev->zone_nr_blocks, + DMZ_BLOCK_SIZE_BITS); /* Allocate zone array */ zmd->zones = kcalloc(dev->nr_zones, sizeof(struct dm_zone), GFP_KERNEL); @@ -1956,7 +1960,7 @@ int dmz_copy_valid_blocks(struct dmz_metadata *zmd, struct dm_zone *from_zone, dmz_release_mblock(zmd, to_mblk); dmz_release_mblock(zmd, from_mblk); - chunk_block += DMZ_BLOCK_SIZE_BITS; + chunk_block += zmd->zone_bits_per_mblk; } to_zone->weight = from_zone->weight; @@ -2017,7 +2021,7 @@ int dmz_validate_blocks(struct dmz_metadata *zmd, struct dm_zone *zone, /* Set bits */ bit = chunk_block & DMZ_BLOCK_MASK_BITS; - nr_bits = min(nr_blocks, DMZ_BLOCK_SIZE_BITS - bit); + nr_bits = min(nr_blocks, zmd->zone_bits_per_mblk - bit); count = dmz_set_bits((unsigned long *)mblk->data, bit, nr_bits); if (count) { @@ -2096,7 +2100,7 @@ int dmz_invalidate_blocks(struct dmz_metadata *zmd, struct dm_zone *zone, /* Clear bits */ bit = chunk_block & DMZ_BLOCK_MASK_BITS; - nr_bits = min(nr_blocks, DMZ_BLOCK_SIZE_BITS - bit); + nr_bits = min(nr_blocks, zmd->zone_bits_per_mblk - bit); count = dmz_clear_bits((unsigned long *)mblk->data, bit, nr_bits); @@ -2156,6 +2160,7 @@ static int dmz_to_next_set_block(struct dmz_metadata *zmd, struct dm_zone *zone, { struct dmz_mblock *mblk; unsigned int bit, set_bit, nr_bits; + unsigned int zone_bits = zmd->zone_bits_per_mblk; unsigned long *bitmap; int n = 0; @@ -2170,15 +2175,15 @@ static int dmz_to_next_set_block(struct dmz_metadata *zmd, struct dm_zone *zone, /* Get offset */ bitmap = (unsigned long *) mblk->data; bit = chunk_block & DMZ_BLOCK_MASK_BITS; - nr_bits = min(nr_blocks, DMZ_BLOCK_SIZE_BITS - bit); + nr_bits = min(nr_blocks, zone_bits - bit); if (set) - set_bit = find_next_bit(bitmap, DMZ_BLOCK_SIZE_BITS, bit); + set_bit = find_next_bit(bitmap, zone_bits, bit); else - set_bit = find_next_zero_bit(bitmap, DMZ_BLOCK_SIZE_BITS, bit); + set_bit = find_next_zero_bit(bitmap, zone_bits, bit); dmz_release_mblock(zmd, mblk); n += set_bit - bit; - if (set_bit < DMZ_BLOCK_SIZE_BITS) + if (set_bit < zone_bits) break; nr_blocks -= nr_bits; @@ -2281,7 +2286,7 @@ static void dmz_get_zone_weight(struct dmz_metadata *zmd, struct dm_zone *zone) /* Count bits in this block */ bitmap = mblk->data; bit = chunk_block & DMZ_BLOCK_MASK_BITS; - nr_bits = min(nr_blocks, DMZ_BLOCK_SIZE_BITS - bit); + nr_bits = min(nr_blocks, zmd->zone_bits_per_mblk - bit); n += dmz_count_bits(bitmap, bit, nr_bits); dmz_release_mblock(zmd, mblk); -- cgit v1.2.3 From 4ecc5081909ae70e4d988b643d54687cc53ba6eb Mon Sep 17 00:00:00 2001 From: zhengbin Date: Tue, 24 Dec 2019 14:38:00 +0800 Subject: dm mpath: use true/false for bool variable Fixes coccicheck warning: drivers/md/dm-mpath.c:1447:2-13: WARNING: Assignment of 0/1 to bool variable Reported-by: Hulk Robot Signed-off-by: zhengbin Signed-off-by: Mike Snitzer --- drivers/md/dm-mpath.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index e0c32793c248..97c095220c79 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -1444,7 +1444,7 @@ static void pg_init_done(void *data, int errors) break; case SCSI_DH_RETRY: /* Wait before retrying. */ - delay_retry = 1; + delay_retry = true; /* fall through */ case SCSI_DH_IMM_RETRY: case SCSI_DH_RES_TEMP_UNAVAIL: -- cgit v1.2.3 From 67b92d979b70212d7b2272f6b86a4d9229ab1eac Mon Sep 17 00:00:00 2001 From: zhengbin Date: Tue, 24 Dec 2019 14:38:01 +0800 Subject: dm bio prison v2: use true/false for bool variable Fixes coccicheck warning: drivers/md/dm-bio-prison-v2.c:327:2-22: WARNING: Assignment of 0/1 to bool variable Reported-by: Hulk Robot Signed-off-by: zhengbin Signed-off-by: Mike Snitzer --- drivers/md/dm-bio-prison-v2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-bio-prison-v2.c b/drivers/md/dm-bio-prison-v2.c index 8ee019eda32d..9dec3b61cf70 100644 --- a/drivers/md/dm-bio-prison-v2.c +++ b/drivers/md/dm-bio-prison-v2.c @@ -324,7 +324,7 @@ static bool __unlock(struct dm_bio_prison_v2 *prison, bio_list_init(&cell->bios); if (cell->shared_count) { - cell->exclusive_lock = 0; + cell->exclusive_lock = false; return false; } -- cgit v1.2.3 From 1d1dda8ca8ca0586df87d898e59cd69110d3d74b Mon Sep 17 00:00:00 2001 From: zhengbin Date: Tue, 24 Dec 2019 14:38:02 +0800 Subject: dm snapshot: use true/false for bool variable Fixes coccicheck warning: drivers/md/dm-snap.c:1064:3-18: WARNING: Assignment of 0/1 to bool variable drivers/md/dm-snap.c:1152:1-16: WARNING: Assignment of 0/1 to bool variable drivers/md/dm-snap.c:1317:1-16: WARNING: Assignment of 0/1 to bool variable Reported-by: Hulk Robot Signed-off-by: zhengbin Signed-off-by: Mike Snitzer --- drivers/md/dm-snap.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index 4fb1a40e68a0..6b11a266299f 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -1061,7 +1061,7 @@ static void snapshot_merge_next_chunks(struct dm_snapshot *s) DMERR("Read error in exception store: " "shutting down merge"); down_write(&s->lock); - s->merge_failed = 1; + s->merge_failed = true; up_write(&s->lock); } goto shut; @@ -1149,7 +1149,7 @@ static void merge_callback(int read_err, unsigned long write_err, void *context) shut: down_write(&s->lock); - s->merge_failed = 1; + s->merge_failed = true; b = __release_queued_bios_after_merge(s); up_write(&s->lock); error_bios(b); @@ -1314,7 +1314,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) INIT_LIST_HEAD(&s->list); spin_lock_init(&s->pe_lock); s->state_bits = 0; - s->merge_failed = 0; + s->merge_failed = false; s->first_merging_chunk = 0; s->num_merging_chunks = 0; bio_list_init(&s->bios_queued_during_merge); -- cgit v1.2.3 From 63ee92d1c2531008c9e5d73db2f2a1349961b10a Mon Sep 17 00:00:00 2001 From: zhengbin Date: Tue, 24 Dec 2019 14:38:03 +0800 Subject: dm thin metadata: use true/false for bool variable Fixes coccicheck warning: drivers/md/dm-thin-metadata.c:814:3-14: WARNING: Assignment of 0/1 to bool variable drivers/md/dm-thin-metadata.c:1109:1-12: WARNING: Assignment of 0/1 to bool variable drivers/md/dm-thin-metadata.c:1621:1-12: WARNING: Assignment of 0/1 to bool variable drivers/md/dm-thin-metadata.c:1652:1-12: WARNING: Assignment of 0/1 to bool variable drivers/md/dm-thin-metadata.c:1706:1-12: WARNING: Assignment of 0/1 to bool variable Reported-by: Hulk Robot Signed-off-by: zhengbin Signed-off-by: Mike Snitzer --- drivers/md/dm-thin-metadata.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c index b88d6d701f5b..ca03b38c9fa2 100644 --- a/drivers/md/dm-thin-metadata.c +++ b/drivers/md/dm-thin-metadata.c @@ -811,7 +811,7 @@ static int __write_changed_details(struct dm_pool_metadata *pmd) return r; if (td->open_count) - td->changed = 0; + td->changed = false; else { list_del(&td->list); kfree(td); @@ -1106,7 +1106,7 @@ static int __set_snapshot_details(struct dm_pool_metadata *pmd, if (r) return r; - td->changed = 1; + td->changed = true; td->snapshotted_time = time; snap->mapped_blocks = td->mapped_blocks; @@ -1618,7 +1618,7 @@ static int __insert(struct dm_thin_device *td, dm_block_t block, if (r) return r; - td->changed = 1; + td->changed = true; if (inserted) td->mapped_blocks++; @@ -1649,7 +1649,7 @@ static int __remove(struct dm_thin_device *td, dm_block_t block) return r; td->mapped_blocks--; - td->changed = 1; + td->changed = true; return 0; } @@ -1703,7 +1703,7 @@ static int __remove_range(struct dm_thin_device *td, dm_block_t begin, dm_block_ } td->mapped_blocks -= total_count; - td->changed = 1; + td->changed = true; /* * Reinsert the mapping tree. -- cgit v1.2.3 From 4306904053902c7e7827815722f82015dc274ba3 Mon Sep 17 00:00:00 2001 From: Jeffle Xu Date: Mon, 30 Dec 2019 10:54:32 +0800 Subject: dm thin metadata: Fix trivial math error in on-disk format documentation Signed-off-by: Jeffle Xu Signed-off-by: Mike Snitzer --- drivers/md/dm-thin-metadata.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c index ca03b38c9fa2..4e44274adad3 100644 --- a/drivers/md/dm-thin-metadata.c +++ b/drivers/md/dm-thin-metadata.c @@ -28,7 +28,7 @@ * * - A hierarchical btree, with 2 levels which effectively maps (thin * dev id, virtual block) -> block_time. Block time is a 64-bit - * field holding the time in the low 24 bits, and block in the top 48 + * field holding the time in the low 24 bits, and block in the top 40 * bits. * * BTrees consist solely of btree_nodes, that fill a block. Some are -- cgit v1.2.3 From 9402e959014a18b4ebf7558733076875808dd66c Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Thu, 2 Jan 2020 08:23:32 -0500 Subject: dm crypt: fix GFP flags passed to skcipher_request_alloc() GFP_KERNEL is not supposed to be or'd with GFP_NOFS (the result is equivalent to GFP_KERNEL). Also, we use GFP_NOIO instead of GFP_NOFS because we don't want any I/O being submitted in the direct reclaim path. Fixes: 39d13a1ac41d ("dm crypt: reuse eboiv skcipher for IV generation") Cc: stable@vger.kernel.org # v5.4+ Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-crypt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index eb9782fc93fe..9a183882ee4b 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -717,7 +717,7 @@ static int crypt_iv_eboiv_gen(struct crypt_config *cc, u8 *iv, struct crypto_wait wait; int err; - req = skcipher_request_alloc(any_tfm(cc), GFP_KERNEL | GFP_NOFS); + req = skcipher_request_alloc(any_tfm(cc), GFP_NOIO); if (!req) return -ENOMEM; -- cgit v1.2.3 From 0a531c5a39a71279e0a98097562bf14b5a43529e Mon Sep 17 00:00:00 2001 From: "xianrong.zhou" Date: Mon, 6 Jan 2020 20:34:24 -0800 Subject: dm verity: don't prefetch hash blocks for already-verified data Try to skip prefetching hash blocks that won't be needed due to the "check_at_most_once" option being enabled and the corresponding data blocks already having been verified. Since prefetching operates on a range of data blocks, do this by just trimming the two ends of the range. This doesn't skip every unneeded hash block, since data blocks in the middle of the range could also be unneeded, and hash blocks are still prefetched in large clusters as controlled by dm_verity_prefetch_cluster. But it can still help a lot. In a test on Android Q launching 91 apps every 15s repeated 21 times, prefetching was only done for 447177/4776629 = 9.36% of data blocks. Tested-by: ruxian.feng Co-developed-by: yuanjiong.gao Signed-off-by: yuanjiong.gao Signed-off-by: xianrong.zhou [EB: simplified the 'while' loops and improved the commit message] Signed-off-by: Eric Biggers Signed-off-by: Mike Snitzer --- drivers/md/dm-verity-target.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index 4fb33e7562c5..0d61e9c67986 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -611,8 +611,22 @@ no_prefetch_cluster: static void verity_submit_prefetch(struct dm_verity *v, struct dm_verity_io *io) { + sector_t block = io->block; + unsigned int n_blocks = io->n_blocks; struct dm_verity_prefetch_work *pw; + if (v->validated_blocks) { + while (n_blocks && test_bit(block, v->validated_blocks)) { + block++; + n_blocks--; + } + while (n_blocks && test_bit(block + n_blocks - 1, + v->validated_blocks)) + n_blocks--; + if (!n_blocks) + return; + } + pw = kmalloc(sizeof(struct dm_verity_prefetch_work), GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN); @@ -621,8 +635,8 @@ static void verity_submit_prefetch(struct dm_verity *v, struct dm_verity_io *io) INIT_WORK(&pw->work, verity_prefetch_io); pw->v = v; - pw->block = io->block; - pw->n_blocks = io->n_blocks; + pw->block = block; + pw->n_blocks = n_blocks; queue_work(v->verify_wq, &pw->work); } -- cgit v1.2.3 From 4feaef830de7ffdd8352e1fe14ad3bf13c9688f8 Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Tue, 7 Jan 2020 11:58:42 +0000 Subject: dm space map common: fix to ensure new block isn't already in use The space-maps track the reference counts for disk blocks allocated by both the thin-provisioning and cache targets. There are variants for tracking metadata blocks and data blocks. Transactionality is implemented by never touching blocks from the previous transaction, so we can rollback in the event of a crash. When allocating a new block we need to ensure the block is free (has reference count of 0) in both the current and previous transaction. Prior to this fix we were doing this by searching for a free block in the previous transaction, and relying on a 'begin' counter to track where the last allocation in the current transaction was. This 'begin' field was not being updated in all code paths (eg, increment of a data block reference count due to breaking sharing of a neighbour block in the same btree leaf). This fix keeps the 'begin' field, but now it's just a hint to speed up the search. Instead the current transaction is searched for a free block, and then the old transaction is double checked to ensure it's free. Much simpler. This fixes reports of sm_disk_new_block()'s BUG_ON() triggering when DM thin-provisioning's snapshots are heavily used. Reported-by: Eric Wheeler Cc: stable@vger.kernel.org Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer --- drivers/md/persistent-data/dm-space-map-common.c | 27 ++++++++++++++++++++++ drivers/md/persistent-data/dm-space-map-common.h | 2 ++ drivers/md/persistent-data/dm-space-map-disk.c | 6 +++-- drivers/md/persistent-data/dm-space-map-metadata.c | 5 +++- 4 files changed, 37 insertions(+), 3 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/persistent-data/dm-space-map-common.c b/drivers/md/persistent-data/dm-space-map-common.c index bd68f6fef694..d8b4125e338c 100644 --- a/drivers/md/persistent-data/dm-space-map-common.c +++ b/drivers/md/persistent-data/dm-space-map-common.c @@ -380,6 +380,33 @@ int sm_ll_find_free_block(struct ll_disk *ll, dm_block_t begin, return -ENOSPC; } +int sm_ll_find_common_free_block(struct ll_disk *old_ll, struct ll_disk *new_ll, + dm_block_t begin, dm_block_t end, dm_block_t *b) +{ + int r; + uint32_t count; + + do { + r = sm_ll_find_free_block(new_ll, begin, new_ll->nr_blocks, b); + if (r) + break; + + /* double check this block wasn't used in the old transaction */ + if (*b >= old_ll->nr_blocks) + count = 0; + else { + r = sm_ll_lookup(old_ll, *b, &count); + if (r) + break; + + if (count) + begin = *b + 1; + } + } while (count); + + return r; +} + static int sm_ll_mutate(struct ll_disk *ll, dm_block_t b, int (*mutator)(void *context, uint32_t old, uint32_t *new), void *context, enum allocation_event *ev) diff --git a/drivers/md/persistent-data/dm-space-map-common.h b/drivers/md/persistent-data/dm-space-map-common.h index b3078d5eda0c..8de63ce39bdd 100644 --- a/drivers/md/persistent-data/dm-space-map-common.h +++ b/drivers/md/persistent-data/dm-space-map-common.h @@ -109,6 +109,8 @@ int sm_ll_lookup_bitmap(struct ll_disk *ll, dm_block_t b, uint32_t *result); int sm_ll_lookup(struct ll_disk *ll, dm_block_t b, uint32_t *result); int sm_ll_find_free_block(struct ll_disk *ll, dm_block_t begin, dm_block_t end, dm_block_t *result); +int sm_ll_find_common_free_block(struct ll_disk *old_ll, struct ll_disk *new_ll, + dm_block_t begin, dm_block_t end, dm_block_t *result); int sm_ll_insert(struct ll_disk *ll, dm_block_t b, uint32_t ref_count, enum allocation_event *ev); int sm_ll_inc(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev); int sm_ll_dec(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev); diff --git a/drivers/md/persistent-data/dm-space-map-disk.c b/drivers/md/persistent-data/dm-space-map-disk.c index 32adf6b4a9c7..bf4c5e2ccb6f 100644 --- a/drivers/md/persistent-data/dm-space-map-disk.c +++ b/drivers/md/persistent-data/dm-space-map-disk.c @@ -167,8 +167,10 @@ static int sm_disk_new_block(struct dm_space_map *sm, dm_block_t *b) enum allocation_event ev; struct sm_disk *smd = container_of(sm, struct sm_disk, sm); - /* FIXME: we should loop round a couple of times */ - r = sm_ll_find_free_block(&smd->old_ll, smd->begin, smd->old_ll.nr_blocks, b); + /* + * Any block we allocate has to be free in both the old and current ll. + */ + r = sm_ll_find_common_free_block(&smd->old_ll, &smd->ll, smd->begin, smd->ll.nr_blocks, b); if (r) return r; diff --git a/drivers/md/persistent-data/dm-space-map-metadata.c b/drivers/md/persistent-data/dm-space-map-metadata.c index 25328582cc48..9e3c64ec2026 100644 --- a/drivers/md/persistent-data/dm-space-map-metadata.c +++ b/drivers/md/persistent-data/dm-space-map-metadata.c @@ -448,7 +448,10 @@ static int sm_metadata_new_block_(struct dm_space_map *sm, dm_block_t *b) enum allocation_event ev; struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); - r = sm_ll_find_free_block(&smm->old_ll, smm->begin, smm->old_ll.nr_blocks, b); + /* + * Any block we allocate has to be free in both the old and current ll. + */ + r = sm_ll_find_common_free_block(&smm->old_ll, &smm->ll, smm->begin, smm->ll.nr_blocks, b); if (r) return r; -- cgit v1.2.3 From bbb1658461ac85ef7e0563bb11283f94ea5eb651 Mon Sep 17 00:00:00 2001 From: Milan Broz Date: Fri, 3 Jan 2020 09:20:22 +0100 Subject: dm crypt: Implement Elephant diffuser for Bitlocker compatibility Add experimental support for BitLocker encryption with CBC mode and additional Elephant diffuser. The mode was used in older Windows systems and it is provided mainly for compatibility reasons. The userspace support to activate these devices is being added to cryptsetup utility. Read-write activation of such a device is very simple, for example: echo | cryptsetup bitlkOpen bitlk_image.img test The Elephant diffuser uses two rotations in opposite direction for data (Diffuser A and B) and also XOR operation with Sector key over the sector data; Sector key is derived from additional key data. The original public documentation is available here: http://download.microsoft.com/download/0/2/3/0238acaf-d3bf-4a6d-b3d6-0a0be4bbb36e/bitlockercipher200608.pdf The dm-crypt implementation is embedded to "elephant" IV (similar to tcw IV construction). Because we cannot modify original bio data for write (before encryption), an additional internal flag to pre-process data is added. Signed-off-by: Milan Broz Reviewed-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-crypt.c | 323 +++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 319 insertions(+), 4 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 9a183882ee4b..2190f2f55151 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -1,8 +1,8 @@ /* * Copyright (C) 2003 Jana Saout * Copyright (C) 2004 Clemens Fruhwirth - * Copyright (C) 2006-2017 Red Hat, Inc. All rights reserved. - * Copyright (C) 2013-2017 Milan Broz + * Copyright (C) 2006-2020 Red Hat, Inc. All rights reserved. + * Copyright (C) 2013-2020 Milan Broz * * This file is released under the GPL. */ @@ -115,6 +115,11 @@ struct iv_tcw_private { u8 *whitening; }; +#define ELEPHANT_MAX_KEY_SIZE 32 +struct iv_elephant_private { + struct crypto_skcipher *tfm; +}; + /* * Crypt: maps a linear range of a block device * and encrypts / decrypts at the same time. @@ -125,6 +130,7 @@ enum flags { DM_CRYPT_SUSPENDED, DM_CRYPT_KEY_VALID, enum cipher_flags { CRYPT_MODE_INTEGRITY_AEAD, /* Use authenticated mode for cihper */ CRYPT_IV_LARGE_SECTORS, /* Calculate IV from sector_size, not 512B sectors */ + CRYPT_ENCRYPT_PREPROCESS, /* Must preprocess data for encryption (elephant) */ }; /* @@ -152,6 +158,7 @@ struct crypt_config { struct iv_benbi_private benbi; struct iv_lmk_private lmk; struct iv_tcw_private tcw; + struct iv_elephant_private elephant; } iv_gen_private; u64 iv_offset; unsigned int iv_size; @@ -285,6 +292,11 @@ static struct crypto_aead *any_tfm_aead(struct crypt_config *cc) * eboiv: Encrypted byte-offset IV (used in Bitlocker in CBC mode) * The IV is encrypted little-endian byte-offset (with the same key * and cipher as the volume). + * + * elephant: The extended version of eboiv with additional Elephant diffuser + * used with Bitlocker CBC mode. + * This mode was used in older Windows systems + * http://download.microsoft.com/download/0/2/3/0238acaf-d3bf-4a6d-b3d6-0a0be4bbb36e/bitlockercipher200608.pdf */ static int crypt_iv_plain_gen(struct crypt_config *cc, u8 *iv, @@ -734,6 +746,290 @@ static int crypt_iv_eboiv_gen(struct crypt_config *cc, u8 *iv, return err; } +static void crypt_iv_elephant_dtr(struct crypt_config *cc) +{ + struct iv_elephant_private *elephant = &cc->iv_gen_private.elephant; + + crypto_free_skcipher(elephant->tfm); + elephant->tfm = NULL; +} + +static int crypt_iv_elephant_ctr(struct crypt_config *cc, struct dm_target *ti, + const char *opts) +{ + struct iv_elephant_private *elephant = &cc->iv_gen_private.elephant; + int r; + + elephant->tfm = crypto_alloc_skcipher("ecb(aes)", 0, 0); + if (IS_ERR(elephant->tfm)) { + r = PTR_ERR(elephant->tfm); + elephant->tfm = NULL; + return r; + } + + r = crypt_iv_eboiv_ctr(cc, ti, NULL); + if (r) + crypt_iv_elephant_dtr(cc); + return r; +} + +static void diffuser_disk_to_cpu(u32 *d, size_t n) +{ +#ifndef __LITTLE_ENDIAN + int i; + + for (i = 0; i < n; i++) + d[i] = le32_to_cpu((__le32)d[i]); +#endif +} + +static void diffuser_cpu_to_disk(__le32 *d, size_t n) +{ +#ifndef __LITTLE_ENDIAN + int i; + + for (i = 0; i < n; i++) + d[i] = cpu_to_le32((u32)d[i]); +#endif +} + +static void diffuser_a_decrypt(u32 *d, size_t n) +{ + int i, i1, i2, i3; + + for (i = 0; i < 5; i++) { + i1 = 0; + i2 = n - 2; + i3 = n - 5; + + while (i1 < (n - 1)) { + d[i1] += d[i2] ^ (d[i3] << 9 | d[i3] >> 23); + i1++; i2++; i3++; + + if (i3 >= n) + i3 -= n; + + d[i1] += d[i2] ^ d[i3]; + i1++; i2++; i3++; + + if (i2 >= n) + i2 -= n; + + d[i1] += d[i2] ^ (d[i3] << 13 | d[i3] >> 19); + i1++; i2++; i3++; + + d[i1] += d[i2] ^ d[i3]; + i1++; i2++; i3++; + } + } +} + +static void diffuser_a_encrypt(u32 *d, size_t n) +{ + int i, i1, i2, i3; + + for (i = 0; i < 5; i++) { + i1 = n - 1; + i2 = n - 2 - 1; + i3 = n - 5 - 1; + + while (i1 > 0) { + d[i1] -= d[i2] ^ d[i3]; + i1--; i2--; i3--; + + d[i1] -= d[i2] ^ (d[i3] << 13 | d[i3] >> 19); + i1--; i2--; i3--; + + if (i2 < 0) + i2 += n; + + d[i1] -= d[i2] ^ d[i3]; + i1--; i2--; i3--; + + if (i3 < 0) + i3 += n; + + d[i1] -= d[i2] ^ (d[i3] << 9 | d[i3] >> 23); + i1--; i2--; i3--; + } + } +} + +static void diffuser_b_decrypt(u32 *d, size_t n) +{ + int i, i1, i2, i3; + + for (i = 0; i < 3; i++) { + i1 = 0; + i2 = 2; + i3 = 5; + + while (i1 < (n - 1)) { + d[i1] += d[i2] ^ d[i3]; + i1++; i2++; i3++; + + d[i1] += d[i2] ^ (d[i3] << 10 | d[i3] >> 22); + i1++; i2++; i3++; + + if (i2 >= n) + i2 -= n; + + d[i1] += d[i2] ^ d[i3]; + i1++; i2++; i3++; + + if (i3 >= n) + i3 -= n; + + d[i1] += d[i2] ^ (d[i3] << 25 | d[i3] >> 7); + i1++; i2++; i3++; + } + } +} + +static void diffuser_b_encrypt(u32 *d, size_t n) +{ + int i, i1, i2, i3; + + for (i = 0; i < 3; i++) { + i1 = n - 1; + i2 = 2 - 1; + i3 = 5 - 1; + + while (i1 > 0) { + d[i1] -= d[i2] ^ (d[i3] << 25 | d[i3] >> 7); + i1--; i2--; i3--; + + if (i3 < 0) + i3 += n; + + d[i1] -= d[i2] ^ d[i3]; + i1--; i2--; i3--; + + if (i2 < 0) + i2 += n; + + d[i1] -= d[i2] ^ (d[i3] << 10 | d[i3] >> 22); + i1--; i2--; i3--; + + d[i1] -= d[i2] ^ d[i3]; + i1--; i2--; i3--; + } + } +} + +static int crypt_iv_elephant(struct crypt_config *cc, struct dm_crypt_request *dmreq) +{ + struct iv_elephant_private *elephant = &cc->iv_gen_private.elephant; + u8 *es, *ks, *data, *data2, *data_offset; + struct skcipher_request *req; + struct scatterlist *sg, *sg2, src, dst; + struct crypto_wait wait; + int i, r; + + req = skcipher_request_alloc(elephant->tfm, GFP_NOIO); + es = kzalloc(16, GFP_NOIO); /* Key for AES */ + ks = kzalloc(32, GFP_NOIO); /* Elephant sector key */ + + if (!req || !es || !ks) { + r = -ENOMEM; + goto out; + } + + *(__le64 *)es = cpu_to_le64(dmreq->iv_sector * cc->sector_size); + + /* E(Ks, e(s)) */ + sg_init_one(&src, es, 16); + sg_init_one(&dst, ks, 16); + skcipher_request_set_crypt(req, &src, &dst, 16, NULL); + skcipher_request_set_callback(req, 0, crypto_req_done, &wait); + r = crypto_wait_req(crypto_skcipher_encrypt(req), &wait); + if (r) + goto out; + + /* E(Ks, e'(s)) */ + es[15] = 0x80; + sg_init_one(&dst, &ks[16], 16); + r = crypto_wait_req(crypto_skcipher_encrypt(req), &wait); + if (r) + goto out; + + sg = crypt_get_sg_data(cc, dmreq->sg_out); + data = kmap_atomic(sg_page(sg)); + data_offset = data + sg->offset; + + /* Cannot modify original bio, copy to sg_out and apply Elephant to it */ + if (bio_data_dir(dmreq->ctx->bio_in) == WRITE) { + sg2 = crypt_get_sg_data(cc, dmreq->sg_in); + data2 = kmap_atomic(sg_page(sg2)); + memcpy(data_offset, data2 + sg2->offset, cc->sector_size); + kunmap_atomic(data2); + } + + if (bio_data_dir(dmreq->ctx->bio_in) != WRITE) { + diffuser_disk_to_cpu((u32*)data_offset, cc->sector_size / sizeof(u32)); + diffuser_b_decrypt((u32*)data_offset, cc->sector_size / sizeof(u32)); + diffuser_a_decrypt((u32*)data_offset, cc->sector_size / sizeof(u32)); + diffuser_cpu_to_disk((__le32*)data_offset, cc->sector_size / sizeof(u32)); + } + + for (i = 0; i < (cc->sector_size / 32); i++) + crypto_xor(data_offset + i * 32, ks, 32); + + if (bio_data_dir(dmreq->ctx->bio_in) == WRITE) { + diffuser_disk_to_cpu((u32*)data_offset, cc->sector_size / sizeof(u32)); + diffuser_a_encrypt((u32*)data_offset, cc->sector_size / sizeof(u32)); + diffuser_b_encrypt((u32*)data_offset, cc->sector_size / sizeof(u32)); + diffuser_cpu_to_disk((__le32*)data_offset, cc->sector_size / sizeof(u32)); + } + + kunmap_atomic(data); +out: + kzfree(ks); + kzfree(es); + skcipher_request_free(req); + return r; +} + +static int crypt_iv_elephant_gen(struct crypt_config *cc, u8 *iv, + struct dm_crypt_request *dmreq) +{ + int r; + + if (bio_data_dir(dmreq->ctx->bio_in) == WRITE) { + r = crypt_iv_elephant(cc, dmreq); + if (r) + return r; + } + + return crypt_iv_eboiv_gen(cc, iv, dmreq); +} + +static int crypt_iv_elephant_post(struct crypt_config *cc, u8 *iv, + struct dm_crypt_request *dmreq) +{ + if (bio_data_dir(dmreq->ctx->bio_in) != WRITE) + return crypt_iv_elephant(cc, dmreq); + + return 0; +} + +static int crypt_iv_elephant_init(struct crypt_config *cc) +{ + struct iv_elephant_private *elephant = &cc->iv_gen_private.elephant; + int key_offset = cc->key_size - cc->key_extra_size; + + return crypto_skcipher_setkey(elephant->tfm, &cc->key[key_offset], cc->key_extra_size); +} + +static int crypt_iv_elephant_wipe(struct crypt_config *cc) +{ + struct iv_elephant_private *elephant = &cc->iv_gen_private.elephant; + u8 key[ELEPHANT_MAX_KEY_SIZE]; + + memset(key, 0, cc->key_extra_size); + return crypto_skcipher_setkey(elephant->tfm, key, cc->key_extra_size); +} + static const struct crypt_iv_operations crypt_iv_plain_ops = { .generator = crypt_iv_plain_gen }; @@ -787,6 +1083,15 @@ static struct crypt_iv_operations crypt_iv_eboiv_ops = { .generator = crypt_iv_eboiv_gen }; +static struct crypt_iv_operations crypt_iv_elephant_ops = { + .ctr = crypt_iv_elephant_ctr, + .dtr = crypt_iv_elephant_dtr, + .init = crypt_iv_elephant_init, + .wipe = crypt_iv_elephant_wipe, + .generator = crypt_iv_elephant_gen, + .post = crypt_iv_elephant_post +}; + /* * Integrity extensions */ @@ -1103,6 +1408,9 @@ static int crypt_convert_block_skcipher(struct crypt_config *cc, r = cc->iv_gen_ops->generator(cc, org_iv, dmreq); if (r < 0) return r; + /* Data can be already preprocessed in generator */ + if (test_bit(CRYPT_ENCRYPT_PREPROCESS, &cc->cipher_flags)) + sg_in = sg_out; /* Store generated IV in integrity metadata */ if (cc->integrity_iv_size) memcpy(tag_iv, org_iv, cc->integrity_iv_size); @@ -2191,7 +2499,14 @@ static int crypt_ctr_ivmode(struct dm_target *ti, const char *ivmode) cc->iv_gen_ops = &crypt_iv_null_ops; else if (strcmp(ivmode, "eboiv") == 0) cc->iv_gen_ops = &crypt_iv_eboiv_ops; - else if (strcmp(ivmode, "lmk") == 0) { + else if (strcmp(ivmode, "elephant") == 0) { + cc->iv_gen_ops = &crypt_iv_elephant_ops; + cc->key_parts = 2; + cc->key_extra_size = cc->key_size / 2; + if (cc->key_extra_size > ELEPHANT_MAX_KEY_SIZE) + return -EINVAL; + set_bit(CRYPT_ENCRYPT_PREPROCESS, &cc->cipher_flags); + } else if (strcmp(ivmode, "lmk") == 0) { cc->iv_gen_ops = &crypt_iv_lmk_ops; /* * Version 2 and 3 is recognised according @@ -2959,7 +3274,7 @@ static void crypt_io_hints(struct dm_target *ti, struct queue_limits *limits) static struct target_type crypt_target = { .name = "crypt", - .version = {1, 19, 0}, + .version = {1, 20, 0}, .module = THIS_MODULE, .ctr = crypt_ctr, .dtr = crypt_dtr, -- cgit v1.2.3 From 4ea9471fbd1addb25a4d269991dc724e200ca5b5 Mon Sep 17 00:00:00 2001 From: Milan Broz Date: Mon, 6 Jan 2020 10:11:47 +0100 Subject: dm crypt: fix benbi IV constructor crash if used in authenticated mode If benbi IV is used in AEAD construction, for example: cryptsetup luksFormat --cipher twofish-xts-benbi --key-size 512 --integrity=hmac-sha256 the constructor uses wrong skcipher function and crashes: BUG: kernel NULL pointer dereference, address: 00000014 ... EIP: crypt_iv_benbi_ctr+0x15/0x70 [dm_crypt] Call Trace: ? crypt_subkey_size+0x20/0x20 [dm_crypt] crypt_ctr+0x567/0xfc0 [dm_crypt] dm_table_add_target+0x15f/0x340 [dm_mod] Fix this by properly using crypt_aead_blocksize() in this case. Fixes: ef43aa38063a6 ("dm crypt: add cryptographic data integrity protection (authenticated encryption)") Cc: stable@vger.kernel.org # v4.12+ Link: https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=941051 Reported-by: Jerad Simpson Signed-off-by: Milan Broz Signed-off-by: Mike Snitzer --- drivers/md/dm-crypt.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 2190f2f55151..c6a529873d0f 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -343,8 +343,14 @@ static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv, static int crypt_iv_benbi_ctr(struct crypt_config *cc, struct dm_target *ti, const char *opts) { - unsigned bs = crypto_skcipher_blocksize(any_tfm(cc)); - int log = ilog2(bs); + unsigned bs; + int log; + + if (test_bit(CRYPT_MODE_INTEGRITY_AEAD, &cc->cipher_flags)) + bs = crypto_aead_blocksize(any_tfm_aead(cc)); + else + bs = crypto_skcipher_blocksize(any_tfm(cc)); + log = ilog2(bs); /* we need to calculate how far we must shift the sector count * to get the cipher block count, we use this shift in _gen */ -- cgit v1.2.3 From aa9509209c5ac2f0b35d01a922bf9ae072d0c2fc Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Wed, 8 Jan 2020 10:46:05 -0500 Subject: dm writecache: fix incorrect flush sequence when doing SSD mode commit When committing state, the function writecache_flush does the following: 1. write metadata (writecache_commit_flushed) 2. flush disk cache (writecache_commit_flushed) 3. wait for data writes to complete (writecache_wait_for_ios) 4. increase superblock seq_count 5. write the superblock 6. flush disk cache It may happen that at step 3, when we wait for some write to finish, the disk may report the write as finished, but the write only hit the disk cache and it is not yet stored in persistent storage. At step 5 we write the superblock - it may happen that the superblock is written before the write that we waited for in step 3. If the machine crashes, it may result in incorrect data being returned after reboot. In order to fix the bug, we must swap steps 2 and 3 in the above sequence, so that we first wait for writes to complete and then flush the disk cache. Fixes: 48debafe4f2f ("dm: add writecache target") Cc: stable@vger.kernel.org # 4.18+ Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-writecache.c | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c index 7d727a72aa13..9b0a3bf6a4a1 100644 --- a/drivers/md/dm-writecache.c +++ b/drivers/md/dm-writecache.c @@ -442,7 +442,13 @@ static void writecache_notify_io(unsigned long error, void *context) complete(&endio->c); } -static void ssd_commit_flushed(struct dm_writecache *wc) +static void writecache_wait_for_ios(struct dm_writecache *wc, int direction) +{ + wait_event(wc->bio_in_progress_wait[direction], + !atomic_read(&wc->bio_in_progress[direction])); +} + +static void ssd_commit_flushed(struct dm_writecache *wc, bool wait_for_ios) { struct dm_io_region region; struct dm_io_request req; @@ -488,17 +494,20 @@ static void ssd_commit_flushed(struct dm_writecache *wc) writecache_notify_io(0, &endio); wait_for_completion_io(&endio.c); + if (wait_for_ios) + writecache_wait_for_ios(wc, WRITE); + writecache_disk_flush(wc, wc->ssd_dev); memset(wc->dirty_bitmap, 0, wc->dirty_bitmap_size); } -static void writecache_commit_flushed(struct dm_writecache *wc) +static void writecache_commit_flushed(struct dm_writecache *wc, bool wait_for_ios) { if (WC_MODE_PMEM(wc)) wmb(); else - ssd_commit_flushed(wc); + ssd_commit_flushed(wc, wait_for_ios); } static void writecache_disk_flush(struct dm_writecache *wc, struct dm_dev *dev) @@ -522,12 +531,6 @@ static void writecache_disk_flush(struct dm_writecache *wc, struct dm_dev *dev) writecache_error(wc, r, "error flushing metadata: %d", r); } -static void writecache_wait_for_ios(struct dm_writecache *wc, int direction) -{ - wait_event(wc->bio_in_progress_wait[direction], - !atomic_read(&wc->bio_in_progress[direction])); -} - #define WFE_RETURN_FOLLOWING 1 #define WFE_LOWEST_SEQ 2 @@ -724,15 +727,12 @@ static void writecache_flush(struct dm_writecache *wc) e = e2; cond_resched(); } - writecache_commit_flushed(wc); - - if (!WC_MODE_PMEM(wc)) - writecache_wait_for_ios(wc, WRITE); + writecache_commit_flushed(wc, true); wc->seq_count++; pmem_assign(sb(wc)->seq_count, cpu_to_le64(wc->seq_count)); writecache_flush_region(wc, &sb(wc)->seq_count, sizeof sb(wc)->seq_count); - writecache_commit_flushed(wc); + writecache_commit_flushed(wc, false); wc->overwrote_committed = false; @@ -756,7 +756,7 @@ static void writecache_flush(struct dm_writecache *wc) } if (need_flush_after_free) - writecache_commit_flushed(wc); + writecache_commit_flushed(wc, false); } static void writecache_flush_work(struct work_struct *work) @@ -809,7 +809,7 @@ static void writecache_discard(struct dm_writecache *wc, sector_t start, sector_ } if (discarded_something) - writecache_commit_flushed(wc); + writecache_commit_flushed(wc, false); } static bool writecache_wait_for_writeback(struct dm_writecache *wc) @@ -958,7 +958,7 @@ erase_this: if (need_flush) { writecache_flush_all_metadata(wc); - writecache_commit_flushed(wc); + writecache_commit_flushed(wc, false); } wc_unlock(wc); @@ -1342,7 +1342,7 @@ static void __writecache_endio_pmem(struct dm_writecache *wc, struct list_head * wc->writeback_size--; n_walked++; if (unlikely(n_walked >= ENDIO_LATENCY)) { - writecache_commit_flushed(wc); + writecache_commit_flushed(wc, false); wc_unlock(wc); wc_lock(wc); n_walked = 0; @@ -1423,7 +1423,7 @@ pop_from_list: writecache_wait_for_ios(wc, READ); } - writecache_commit_flushed(wc); + writecache_commit_flushed(wc, false); wc_unlock(wc); } @@ -1766,10 +1766,10 @@ static int init_memory(struct dm_writecache *wc) write_original_sector_seq_count(wc, &wc->entries[b], -1, -1); writecache_flush_all_metadata(wc); - writecache_commit_flushed(wc); + writecache_commit_flushed(wc, false); pmem_assign(sb(wc)->magic, cpu_to_le32(MEMORY_SUPERBLOCK_MAGIC)); writecache_flush_region(wc, &sb(wc)->magic, sizeof sb(wc)->magic); - writecache_commit_flushed(wc); + writecache_commit_flushed(wc, false); return 0; } -- cgit v1.2.3 From 44d8ebf436399a40fcd10dd31b29d37823d62fcc Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Mon, 13 Jan 2020 11:18:51 -0500 Subject: dm thin metadata: use pool locking at end of dm_pool_metadata_close Ensure that the pool is locked during calls to __commit_transaction and __destroy_persistent_data_objects. Just being consistent with locking, but reality is dm_pool_metadata_close is called once pool is being destroyed so access to pool shouldn't be contended. Also, use pmd_write_lock_in_core rather than __pmd_write_lock in dm_pool_commit_metadata and rename __pmd_write_lock to pmd_write_lock_in_core -- there was no need for the alias. In addition, verify that the pool is locked in __commit_transaction(). Fixes: 873f258becca ("dm thin metadata: do not write metadata if no changes occurred") Cc: stable@vger.kernel.org Signed-off-by: Mike Snitzer --- drivers/md/dm-thin-metadata.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c index 4e44274adad3..fc9947d6210c 100644 --- a/drivers/md/dm-thin-metadata.c +++ b/drivers/md/dm-thin-metadata.c @@ -387,16 +387,15 @@ static int subtree_equal(void *context, const void *value1_le, const void *value * Variant that is used for in-core only changes or code that * shouldn't put the pool in service on its own (e.g. commit). */ -static inline void __pmd_write_lock(struct dm_pool_metadata *pmd) +static inline void pmd_write_lock_in_core(struct dm_pool_metadata *pmd) __acquires(pmd->root_lock) { down_write(&pmd->root_lock); } -#define pmd_write_lock_in_core(pmd) __pmd_write_lock((pmd)) static inline void pmd_write_lock(struct dm_pool_metadata *pmd) { - __pmd_write_lock(pmd); + pmd_write_lock_in_core(pmd); if (unlikely(!pmd->in_service)) pmd->in_service = true; } @@ -831,6 +830,7 @@ static int __commit_transaction(struct dm_pool_metadata *pmd) * We need to know if the thin_disk_superblock exceeds a 512-byte sector. */ BUILD_BUG_ON(sizeof(struct thin_disk_superblock) > 512); + BUG_ON(!rwsem_is_locked(&pmd->root_lock)); if (unlikely(!pmd->in_service)) return 0; @@ -953,6 +953,7 @@ int dm_pool_metadata_close(struct dm_pool_metadata *pmd) return -EBUSY; } + pmd_write_lock_in_core(pmd); if (!dm_bm_is_read_only(pmd->bm) && !pmd->fail_io) { r = __commit_transaction(pmd); if (r < 0) @@ -961,6 +962,7 @@ int dm_pool_metadata_close(struct dm_pool_metadata *pmd) } if (!pmd->fail_io) __destroy_persistent_data_objects(pmd); + pmd_write_unlock(pmd); kfree(pmd); return 0; @@ -1841,7 +1843,7 @@ int dm_pool_commit_metadata(struct dm_pool_metadata *pmd) * Care is taken to not have commit be what * triggers putting the thin-pool in-service. */ - __pmd_write_lock(pmd); + pmd_write_lock_in_core(pmd); if (pmd->fail_io) goto out; -- cgit v1.2.3 From a4a8d286586d4b28c8517a51db8d86954aadc74b Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Mon, 13 Jan 2020 12:29:04 -0500 Subject: dm thin: fix use-after-free in metadata_pre_commit_callback dm-thin uses struct pool to hold the state of the pool. There may be multiple pool_c's pointing to a given pool, each pool_c represents a loaded target. pool_c's may be created and destroyed arbitrarily and the pool contains a reference count of pool_c's pointing to it. Since commit 694cfe7f31db3 ("dm thin: Flush data device before committing metadata") a pointer to pool_c is passed to dm_pool_register_pre_commit_callback and this function stores it in pmd->pre_commit_context. If this pool_c is freed, but pool is not (because there is another pool_c referencing it), we end up in a situation where pmd->pre_commit_context structure points to freed pool_c. It causes a crash in metadata_pre_commit_callback. Fix this by moving the dm_pool_register_pre_commit_callback() from pool_ctr() to pool_preresume(). This way the in-core thin-pool metadata is only ever armed with callback data whose lifetime matches the active thin-pool target. In should be noted that this fix preserves the ability to load a thin-pool table that uses a different data block device (that contains the same data) -- though it is unclear if that capability is still useful and/or needed. Fixes: 694cfe7f31db3 ("dm thin: Flush data device before committing metadata") Cc: stable@vger.kernel.org Reported-by: Zdenek Kabelac Reported-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-thin.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 57626c27a54b..a2bb2622cdbd 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -3408,10 +3408,6 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv) if (r) goto out_flags_changed; - dm_pool_register_pre_commit_callback(pt->pool->pmd, - metadata_pre_commit_callback, - pt); - pt->callbacks.congested_fn = pool_is_congested; dm_table_add_target_callbacks(ti->table, &pt->callbacks); @@ -3574,6 +3570,9 @@ static int pool_preresume(struct dm_target *ti) if (r) return r; + dm_pool_register_pre_commit_callback(pool->pmd, + metadata_pre_commit_callback, pt); + r = maybe_resize_data_dev(ti, &need_commit1); if (r) return r; -- cgit v1.2.3 From 873937e75f9a8ea231a502c3d29d9cb6ad91b3ef Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Mon, 13 Jan 2020 15:04:37 -0500 Subject: dm thin: don't allow changing data device during thin-pool reload The existing code allows changing the data device when the thin-pool target is reloaded. This capability is not required and only complicates device lifetime guarantees. This can cause crashes like the one reported here: https://bugzilla.redhat.com/show_bug.cgi?id=1788596 where the kernel tries to issue a flush bio located in a structure that was already freed. Take the first step to simplifying the thin-pool's data device lifetime by disallowing changing it. Like the thin-pool's metadata device, the data device is now set in pool_create() and it cannot be changed for a given thin-pool. Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-thin.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index a2bb2622cdbd..4fb6e89c8786 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -231,6 +231,7 @@ struct pool { struct dm_target *ti; /* Only set if a pool target is bound */ struct mapped_device *pool_md; + struct block_device *data_dev; struct block_device *md_dev; struct dm_pool_metadata *pmd; @@ -2933,6 +2934,7 @@ static struct kmem_cache *_new_mapping_cache; static struct pool *pool_create(struct mapped_device *pool_md, struct block_device *metadata_dev, + struct block_device *data_dev, unsigned long block_size, int read_only, char **error) { @@ -3040,6 +3042,7 @@ static struct pool *pool_create(struct mapped_device *pool_md, pool->last_commit_jiffies = jiffies; pool->pool_md = pool_md; pool->md_dev = metadata_dev; + pool->data_dev = data_dev; __pool_table_insert(pool); return pool; @@ -3081,6 +3084,7 @@ static void __pool_dec(struct pool *pool) static struct pool *__pool_find(struct mapped_device *pool_md, struct block_device *metadata_dev, + struct block_device *data_dev, unsigned long block_size, int read_only, char **error, int *created) { @@ -3091,19 +3095,23 @@ static struct pool *__pool_find(struct mapped_device *pool_md, *error = "metadata device already in use by a pool"; return ERR_PTR(-EBUSY); } + if (pool->data_dev != data_dev) { + *error = "data device already in use by a pool"; + return ERR_PTR(-EBUSY); + } __pool_inc(pool); } else { pool = __pool_table_lookup(pool_md); if (pool) { - if (pool->md_dev != metadata_dev) { + if (pool->md_dev != metadata_dev || pool->data_dev != data_dev) { *error = "different pool cannot replace a pool"; return ERR_PTR(-EINVAL); } __pool_inc(pool); } else { - pool = pool_create(pool_md, metadata_dev, block_size, read_only, error); + pool = pool_create(pool_md, metadata_dev, data_dev, block_size, read_only, error); *created = 1; } } @@ -3356,7 +3364,7 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv) goto out; } - pool = __pool_find(dm_table_get_md(ti->table), metadata_dev->bdev, + pool = __pool_find(dm_table_get_md(ti->table), metadata_dev->bdev, data_dev->bdev, block_size, pf.mode == PM_READ_ONLY, &ti->error, &pool_created); if (IS_ERR(pool)) { r = PTR_ERR(pool); @@ -4098,7 +4106,7 @@ static struct target_type pool_target = { .name = "thin-pool", .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE | DM_TARGET_IMMUTABLE, - .version = {1, 21, 0}, + .version = {1, 22, 0}, .module = THIS_MODULE, .ctr = pool_ctr, .dtr = pool_dtr, @@ -4475,7 +4483,7 @@ static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits) static struct target_type thin_target = { .name = "thin", - .version = {1, 21, 0}, + .version = {1, 22, 0}, .module = THIS_MODULE, .ctr = thin_ctr, .dtr = thin_dtr, -- cgit v1.2.3 From f06c03d1ded22cf1c059650c4ba02496e93a0c06 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Mon, 13 Jan 2020 15:41:53 -0500 Subject: dm thin: change data device's flush_bio to be member of struct pool With commit fe64369163c5 ("dm thin: don't allow changing data device during thin-pool load") it is now possible to re-parent the data device's flush_bio from the pool_c to pool structure. Doing so offers improved lifetime guarantees for the flush_bio so that the call to dm_pool_register_pre_commit_callback can now be done safely from pool_ctr(). Depends-on: fe64369163c5 ("dm thin: don't allow changing data device during thin-pool load") Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-thin.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 4fb6e89c8786..fa8d5464c1fb 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -282,6 +282,8 @@ struct pool { struct dm_bio_prison_cell **cell_sort_array; mempool_t mapping_pool; + + struct bio flush_bio; }; static void metadata_operation_failed(struct pool *pool, const char *op, int r); @@ -329,7 +331,6 @@ struct pool_c { dm_block_t low_water_blocks; struct pool_features requested_pf; /* Features requested during table load */ struct pool_features adjusted_pf; /* Features used after adjusting for constituent devices */ - struct bio flush_bio; }; /* @@ -2925,6 +2926,7 @@ static void __pool_destroy(struct pool *pool) if (pool->next_mapping) mempool_free(pool->next_mapping, &pool->mapping_pool); mempool_exit(&pool->mapping_pool); + bio_uninit(&pool->flush_bio); dm_deferred_set_destroy(pool->shared_read_ds); dm_deferred_set_destroy(pool->all_io_ds); kfree(pool); @@ -3005,6 +3007,7 @@ static struct pool *pool_create(struct mapped_device *pool_md, pool->low_water_triggered = false; pool->suspended = true; pool->out_of_data_space = false; + bio_init(&pool->flush_bio, NULL, 0); pool->shared_read_ds = dm_deferred_set_create(); if (!pool->shared_read_ds) { @@ -3132,7 +3135,6 @@ static void pool_dtr(struct dm_target *ti) __pool_dec(pt->pool); dm_put_device(ti, pt->metadata_dev); dm_put_device(ti, pt->data_dev); - bio_uninit(&pt->flush_bio); kfree(pt); mutex_unlock(&dm_thin_pool_table.mutex); @@ -3211,11 +3213,11 @@ static void metadata_low_callback(void *context) */ static int metadata_pre_commit_callback(void *context) { - struct pool_c *pt = context; - struct bio *flush_bio = &pt->flush_bio; + struct pool *pool = context; + struct bio *flush_bio = &pool->flush_bio; bio_reset(flush_bio); - bio_set_dev(flush_bio, pt->data_dev->bdev); + bio_set_dev(flush_bio, pool->data_dev); flush_bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; return submit_bio_wait(flush_bio); @@ -3389,7 +3391,6 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv) pt->data_dev = data_dev; pt->low_water_blocks = low_water_blocks; pt->adjusted_pf = pt->requested_pf = pf; - bio_init(&pt->flush_bio, NULL, 0); ti->num_flush_bios = 1; /* @@ -3416,6 +3417,9 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv) if (r) goto out_flags_changed; + dm_pool_register_pre_commit_callback(pool->pmd, + metadata_pre_commit_callback, pool); + pt->callbacks.congested_fn = pool_is_congested; dm_table_add_target_callbacks(ti->table, &pt->callbacks); @@ -3578,9 +3582,6 @@ static int pool_preresume(struct dm_target *ti) if (r) return r; - dm_pool_register_pre_commit_callback(pool->pmd, - metadata_pre_commit_callback, pt); - r = maybe_resize_data_dev(ti, &need_commit1); if (r) return r; -- cgit v1.2.3 From be240ff5e402df49c4ddbcb0595ef96009239f6a Mon Sep 17 00:00:00 2001 From: Anatol Pomazau Date: Mon, 13 Jan 2020 17:41:27 -0500 Subject: dm mpath: Add timeout mechanism for queue_if_no_path Add a configurable timeout mechanism to disable queue_if_no_path without assistance from userspace multipathd. This reimplements multipathd's no_path_retry mechanism in kernel space. This is motivated by the desire to prevent processes from hanging indefinitely waiting for IO in cases where multipathd might be unable to respond (after a failure or for whatever reason). Despite replicating userspace multipathd's policy configuration in kernel space, it is important to prevent IOs from hanging forever, waiting for userspace that may be incapable of behaving correctly. Use of the provided "queue_if_no_path_timeout_secs" dm-multipath module parameter is optional. This timeout mechanism is disabled by default (by being set to 0). Signed-off-by: Anatol Pomazau Co-developed-by: Gabriel Krisman Bertazi Signed-off-by: Gabriel Krisman Bertazi Signed-off-by: Mike Snitzer --- drivers/md/dm-mpath.c | 66 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) (limited to 'drivers/md') diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 97c095220c79..2bc18c9c3abc 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -29,6 +30,9 @@ #define DM_MSG_PREFIX "multipath" #define DM_PG_INIT_DELAY_MSECS 2000 #define DM_PG_INIT_DELAY_DEFAULT ((unsigned) -1) +#define QUEUE_IF_NO_PATH_TIMEOUT_DEFAULT 0 + +static unsigned long queue_if_no_path_timeout_secs = QUEUE_IF_NO_PATH_TIMEOUT_DEFAULT; /* Path properties */ struct pgpath { @@ -91,6 +95,8 @@ struct multipath { struct work_struct process_queued_bios; struct bio_list queued_bios; + + struct timer_list nopath_timer; /* Timeout for queue_if_no_path */ }; /* @@ -108,6 +114,7 @@ static void trigger_event(struct work_struct *work); static void activate_or_offline_path(struct pgpath *pgpath); static void activate_path_work(struct work_struct *work); static void process_queued_bios(struct work_struct *work); +static void queue_if_no_path_timeout_work(struct timer_list *t); /*----------------------------------------------- * Multipath state flags. @@ -195,6 +202,8 @@ static struct multipath *alloc_multipath(struct dm_target *ti) m->ti = ti; ti->private = m; + + timer_setup(&m->nopath_timer, queue_if_no_path_timeout_work, 0); } return m; @@ -717,6 +726,43 @@ static int queue_if_no_path(struct multipath *m, bool queue_if_no_path, return 0; } +/* + * If the queue_if_no_path timeout fires, turn off queue_if_no_path and + * process any queued I/O. + */ +static void queue_if_no_path_timeout_work(struct timer_list *t) +{ + struct multipath *m = from_timer(m, t, nopath_timer); + struct mapped_device *md = dm_table_get_md(m->ti->table); + + DMWARN("queue_if_no_path timeout on %s, failing queued IO", dm_device_name(md)); + queue_if_no_path(m, false, false); +} + +/* + * Enable the queue_if_no_path timeout if necessary. + * Called with m->lock held. + */ +static void enable_nopath_timeout(struct multipath *m) +{ + unsigned long queue_if_no_path_timeout = + READ_ONCE(queue_if_no_path_timeout_secs) * HZ; + + lockdep_assert_held(&m->lock); + + if (queue_if_no_path_timeout > 0 && + atomic_read(&m->nr_valid_paths) == 0 && + test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) { + mod_timer(&m->nopath_timer, + jiffies + queue_if_no_path_timeout); + } +} + +static void disable_nopath_timeout(struct multipath *m) +{ + del_timer_sync(&m->nopath_timer); +} + /* * An event is triggered whenever a path is taken out of use. * Includes path failure and PG bypass. @@ -1090,6 +1136,7 @@ static int multipath_ctr(struct dm_target *ti, unsigned argc, char **argv) struct dm_arg_set as; unsigned pg_count = 0; unsigned next_pg_num; + unsigned long flags; as.argc = argc; as.argv = argv; @@ -1154,6 +1201,10 @@ static int multipath_ctr(struct dm_target *ti, unsigned argc, char **argv) goto bad; } + spin_lock_irqsave(&m->lock, flags); + enable_nopath_timeout(m); + spin_unlock_irqrestore(&m->lock, flags); + ti->num_flush_bios = 1; ti->num_discard_bios = 1; ti->num_write_same_bios = 1; @@ -1208,6 +1259,7 @@ static void multipath_dtr(struct dm_target *ti) { struct multipath *m = ti->private; + disable_nopath_timeout(m); flush_multipath_work(m); free_multipath(m); } @@ -1241,6 +1293,8 @@ static int fail_path(struct pgpath *pgpath) schedule_work(&m->trigger_event); + enable_nopath_timeout(m); + out: spin_unlock_irqrestore(&m->lock, flags); @@ -1291,6 +1345,9 @@ out: process_queued_io_list(m); } + if (pgpath->is_active) + disable_nopath_timeout(m); + return r; } @@ -1789,6 +1846,7 @@ static int multipath_message(struct dm_target *ti, unsigned argc, char **argv, struct dm_dev *dev; struct multipath *m = ti->private; action_fn action; + unsigned long flags; mutex_lock(&m->work_mutex); @@ -1800,9 +1858,13 @@ static int multipath_message(struct dm_target *ti, unsigned argc, char **argv, if (argc == 1) { if (!strcasecmp(argv[0], "queue_if_no_path")) { r = queue_if_no_path(m, true, false); + spin_lock_irqsave(&m->lock, flags); + enable_nopath_timeout(m); + spin_unlock_irqrestore(&m->lock, flags); goto out; } else if (!strcasecmp(argv[0], "fail_if_no_path")) { r = queue_if_no_path(m, false, false); + disable_nopath_timeout(m); goto out; } } @@ -2065,6 +2127,10 @@ static void __exit dm_multipath_exit(void) module_init(dm_multipath_init); module_exit(dm_multipath_exit); +module_param_named(queue_if_no_path_timeout_secs, + queue_if_no_path_timeout_secs, ulong, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(queue_if_no_path_timeout_secs, "No available paths queue IO timeout in seconds"); + MODULE_DESCRIPTION(DM_NAME " multipath target"); MODULE_AUTHOR("Sistina Software "); MODULE_LICENSE("GPL"); -- cgit v1.2.3 From dcd195071f22d4770911ca46694ca398b6d5101d Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Wed, 15 Jan 2020 04:35:22 -0500 Subject: dm writecache: improve performance of large linear writes on SSDs When dm-writecache is used with SSD as a cache device, it would submit a separate bio for each written block. The I/Os would be merged by the disk scheduler, but this merging degrades performance. Improve dm-writecache performance by submitting larger bios - this is possible as long as there is consecutive free space on the cache device. Benchmark (arm64 with 64k page size, using /dev/ram0 as a cache device): fio --bs=512k --iodepth=32 --size=400M --direct=1 \ --filename=/dev/mapper/cache --rw=randwrite --numjobs=1 --name=test block old new size MiB/s MiB/s --------------------- 512 181 700 1k 347 1256 2k 644 2020 4k 1183 2759 8k 1852 3333 16k 2469 3509 32k 2974 3670 64k 3404 3810 Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-writecache.c | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c index 9b0a3bf6a4a1..b9e27e37a943 100644 --- a/drivers/md/dm-writecache.c +++ b/drivers/md/dm-writecache.c @@ -625,7 +625,7 @@ static void writecache_add_to_freelist(struct dm_writecache *wc, struct wc_entry wc->freelist_size++; } -static struct wc_entry *writecache_pop_from_freelist(struct dm_writecache *wc) +static struct wc_entry *writecache_pop_from_freelist(struct dm_writecache *wc, sector_t expected_sector) { struct wc_entry *e; @@ -634,6 +634,8 @@ static struct wc_entry *writecache_pop_from_freelist(struct dm_writecache *wc) if (unlikely(!wc->current_free)) return NULL; e = wc->current_free; + if (expected_sector != (sector_t)-1 && unlikely(cache_sector(wc, e) != expected_sector)) + return NULL; next = rb_next(&e->rb_node); rb_erase(&e->rb_node, &wc->freetree); if (unlikely(!next)) @@ -643,6 +645,8 @@ static struct wc_entry *writecache_pop_from_freelist(struct dm_writecache *wc) if (unlikely(list_empty(&wc->freelist))) return NULL; e = container_of(wc->freelist.next, struct wc_entry, lru); + if (expected_sector != (sector_t)-1 && unlikely(cache_sector(wc, e) != expected_sector)) + return NULL; list_del(&e->lru); } wc->freelist_size--; @@ -1193,7 +1197,7 @@ read_next_block: goto bio_copy; } } - e = writecache_pop_from_freelist(wc); + e = writecache_pop_from_freelist(wc, (sector_t)-1); if (unlikely(!e)) { writecache_wait_on_freelist(wc); continue; @@ -1205,9 +1209,26 @@ bio_copy: if (WC_MODE_PMEM(wc)) { bio_copy_block(wc, bio, memory_data(wc, e)); } else { - dm_accept_partial_bio(bio, wc->block_size >> SECTOR_SHIFT); + unsigned bio_size = wc->block_size; + sector_t start_cache_sec = cache_sector(wc, e); + sector_t current_cache_sec = start_cache_sec + (bio_size >> SECTOR_SHIFT); + + while (bio_size < bio->bi_iter.bi_size) { + struct wc_entry *f = writecache_pop_from_freelist(wc, current_cache_sec); + if (!f) + break; + write_original_sector_seq_count(wc, f, bio->bi_iter.bi_sector + + (bio_size >> SECTOR_SHIFT), wc->seq_count); + writecache_insert_entry(wc, f); + wc->uncommitted_blocks++; + bio_size += wc->block_size; + current_cache_sec += wc->block_size >> SECTOR_SHIFT; + } + bio_set_dev(bio, wc->ssd_dev->bdev); - bio->bi_iter.bi_sector = cache_sector(wc, e); + bio->bi_iter.bi_sector = start_cache_sec; + dm_accept_partial_bio(bio, bio_size >> SECTOR_SHIFT); + if (unlikely(wc->uncommitted_blocks >= wc->autocommit_blocks)) { wc->uncommitted_blocks = 0; queue_work(wc->writeback_wq, &wc->flush_work); -- cgit v1.2.3 From 47ace7e012b9f7ad71d43ac9063d335ea3d6820b Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Mon, 27 Jan 2020 14:07:23 -0500 Subject: dm: fix potential for q->make_request_fn NULL pointer Move blk_queue_make_request() to dm.c:alloc_dev() so that q->make_request_fn is never NULL during the lifetime of a DM device (even one that is created without a DM table). Otherwise generic_make_request() will crash simply by doing: dmsetup create -n test mount /dev/dm-N /mnt While at it, move ->congested_data initialization out of dm.c:alloc_dev() and into the bio-based specific init method. Reported-by: Stefan Bader BugLink: https://bugs.launchpad.net/bugs/1860231 Fixes: ff36ab34583a ("dm: remove request-based logic from make_request_fn wrapper") Depends-on: c12c9a3c3860c ("dm: various cleanups to md->queue initialization code") Cc: stable@vger.kernel.org Signed-off-by: Mike Snitzer --- drivers/md/dm.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm.c b/drivers/md/dm.c index e8f9661a10a1..b89f07ee2eff 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1859,6 +1859,7 @@ static void dm_init_normal_md_queue(struct mapped_device *md) /* * Initialize aspects of queue that aren't relevant for blk-mq */ + md->queue->backing_dev_info->congested_data = md; md->queue->backing_dev_info->congested_fn = dm_any_congested; } @@ -1949,7 +1950,12 @@ static struct mapped_device *alloc_dev(int minor) if (!md->queue) goto bad; md->queue->queuedata = md; - md->queue->backing_dev_info->congested_data = md; + /* + * default to bio-based required ->make_request_fn until DM + * table is loaded and md->type established. If request-based + * table is loaded: blk-mq will override accordingly. + */ + blk_queue_make_request(md->queue, dm_make_request); md->disk = alloc_disk_node(1, md->numa_node_id); if (!md->disk) @@ -2264,7 +2270,6 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t) case DM_TYPE_DAX_BIO_BASED: case DM_TYPE_NVME_BIO_BASED: dm_init_normal_md_queue(md); - blk_queue_make_request(md->queue, dm_make_request); break; case DM_TYPE_NONE: WARN_ON_ONCE(true); -- cgit v1.2.3