From e2782f560c298efc2e23c7e54b3acf54e8a6ba72 Mon Sep 17 00:00:00 2001
From: Song Liu <songliubraving@fb.com>
Date: Wed, 9 Dec 2020 13:53:31 -0800
Subject: Revert "dm raid: remove unnecessary discard limits for raid10"

This reverts commit f0e90b6c663a7e3b4736cb318c6c7c589f152c28.

Matthew Ruffell reported data corruption in raid10 due to the changes
in discard handling [1]. Revert these changes before we find a proper fix.

[1] https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1907262/
Cc: Matthew Ruffell <matthew.ruffell@canonical.com>
Cc: Xiao Ni <xni@redhat.com>
Cc: Mike Snitzer <snitzer@redhat.com>
Acked-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Song Liu <songliubraving@fb.com>
---
 drivers/md/dm-raid.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 9c1f7c4de65b..dc8568ab96f2 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -3728,6 +3728,17 @@ static void raid_io_hints(struct dm_target *ti, struct queue_limits *limits)
 
 	blk_limits_io_min(limits, chunk_size_bytes);
 	blk_limits_io_opt(limits, chunk_size_bytes * mddev_data_stripes(rs));
+
+	/*
+	 * RAID10 personality requires bio splitting,
+	 * RAID0/1/4/5/6 don't and process large discard bios properly.
+	 */
+	if (rs_is_raid10(rs)) {
+		limits->discard_granularity = max(chunk_size_bytes,
+						  limits->discard_granularity);
+		limits->max_discard_sectors = min_not_zero(rs->md.chunk_sectors,
+							   limits->max_discard_sectors);
+	}
 }
 
 static void raid_postsuspend(struct dm_target *ti)
-- 
cgit v1.2.3


From 82fe9af77cd11ea7bdc133ceed1f7f5fc08f7d25 Mon Sep 17 00:00:00 2001
From: Song Liu <songliubraving@fb.com>
Date: Wed, 9 Dec 2020 11:38:11 -0800
Subject: Revert "md/raid10: improve discard request for far layout"

This reverts commit d3ee2d8415a6256c1c41e1be36e80e640c3e6359.

Matthew Ruffell reported data corruption in raid10 due to the changes
in discard handling [1]. Revert these changes before we find a proper fix.

[1] https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1907262/
Cc: Matthew Ruffell <matthew.ruffell@canonical.com>
Cc: Xiao Ni <xni@redhat.com>
Signed-off-by: Song Liu <songliubraving@fb.com>
---
 drivers/md/raid10.c | 86 ++++++++++++++---------------------------------------
 drivers/md/raid10.h |  1 -
 2 files changed, 23 insertions(+), 64 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index b7bca6703df8..05773ee1b5ba 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1534,28 +1534,6 @@ static struct bio *raid10_split_bio(struct r10conf *conf,
 	return bio;
 }
 
-static void raid_end_discard_bio(struct r10bio *r10bio)
-{
-	struct r10conf *conf = r10bio->mddev->private;
-	struct r10bio *first_r10bio;
-
-	while (atomic_dec_and_test(&r10bio->remaining)) {
-
-		allow_barrier(conf);
-
-		if (!test_bit(R10BIO_Discard, &r10bio->state)) {
-			first_r10bio = (struct r10bio *)r10bio->master_bio;
-			free_r10bio(r10bio);
-			r10bio = first_r10bio;
-		} else {
-			md_write_end(r10bio->mddev);
-			bio_endio(r10bio->master_bio);
-			free_r10bio(r10bio);
-			break;
-		}
-	}
-}
-
 static void raid10_end_discard_request(struct bio *bio)
 {
 	struct r10bio *r10_bio = bio->bi_private;
@@ -1582,7 +1560,11 @@ static void raid10_end_discard_request(struct bio *bio)
 		rdev = conf->mirrors[dev].rdev;
 	}
 
-	raid_end_discard_bio(r10_bio);
+	if (atomic_dec_and_test(&r10_bio->remaining)) {
+		md_write_end(r10_bio->mddev);
+		raid_end_bio_io(r10_bio);
+	}
+
 	rdev_dec_pending(rdev, conf->mddev);
 }
 
@@ -1595,9 +1577,7 @@ static int raid10_handle_discard(struct mddev *mddev, struct bio *bio)
 {
 	struct r10conf *conf = mddev->private;
 	struct geom *geo = &conf->geo;
-	struct r10bio *r10_bio, *first_r10bio;
-	int far_copies = geo->far_copies;
-	bool first_copy = true;
+	struct r10bio *r10_bio;
 
 	int disk;
 	sector_t chunk;
@@ -1636,20 +1616,30 @@ static int raid10_handle_discard(struct mddev *mddev, struct bio *bio)
 	if (bio_sectors(bio) < stripe_size*2)
 		goto out;
 
-	/* For far and far offset layout, if bio is not aligned with stripe size,
-	 * it splits the part that is not aligned with strip size.
+	/* For far offset layout, if bio is not aligned with stripe size, it splits
+	 * the part that is not aligned with strip size.
 	 */
 	div_u64_rem(bio_start, stripe_size, &remainder);
-	if ((far_copies > 1) && remainder) {
+	if (geo->far_offset && remainder) {
 		split_size = stripe_size - remainder;
 		bio = raid10_split_bio(conf, bio, split_size, false);
 	}
 	div_u64_rem(bio_end, stripe_size, &remainder);
-	if ((far_copies > 1) && remainder) {
+	if (geo->far_offset && remainder) {
 		split_size = bio_sectors(bio) - remainder;
 		bio = raid10_split_bio(conf, bio, split_size, true);
 	}
 
+	r10_bio = mempool_alloc(&conf->r10bio_pool, GFP_NOIO);
+	r10_bio->mddev = mddev;
+	r10_bio->state = 0;
+	r10_bio->sectors = 0;
+	memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) * geo->raid_disks);
+
+	wait_blocked_dev(mddev, r10_bio);
+
+	r10_bio->master_bio = bio;
+
 	bio_start = bio->bi_iter.bi_sector;
 	bio_end = bio_end_sector(bio);
 
@@ -1675,28 +1665,6 @@ static int raid10_handle_discard(struct mddev *mddev, struct bio *bio)
 	end_disk_offset = (bio_end & geo->chunk_mask) +
 				(last_stripe_index << geo->chunk_shift);
 
-retry_discard:
-	r10_bio = mempool_alloc(&conf->r10bio_pool, GFP_NOIO);
-	r10_bio->mddev = mddev;
-	r10_bio->state = 0;
-	r10_bio->sectors = 0;
-	memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) * geo->raid_disks);
-	wait_blocked_dev(mddev, r10_bio);
-
-	/* For far layout it needs more than one r10bio to cover all regions.
-	 * Inspired by raid10_sync_request, we can use the first r10bio->master_bio
-	 * to record the discard bio. Other r10bio->master_bio record the first
-	 * r10bio. The first r10bio only release after all other r10bios finish.
-	 * The discard bio returns only first r10bio finishes
-	 */
-	if (first_copy) {
-		r10_bio->master_bio = bio;
-		set_bit(R10BIO_Discard, &r10_bio->state);
-		first_copy = false;
-		first_r10bio = r10_bio;
-	} else
-		r10_bio->master_bio = (struct bio *)first_r10bio;
-
 	rcu_read_lock();
 	for (disk = 0; disk < geo->raid_disks; disk++) {
 		struct md_rdev *rdev = rcu_dereference(conf->mirrors[disk].rdev);
@@ -1787,19 +1755,11 @@ retry_discard:
 		}
 	}
 
-	if (!geo->far_offset && --far_copies) {
-		first_stripe_index += geo->stride >> geo->chunk_shift;
-		start_disk_offset += geo->stride;
-		last_stripe_index += geo->stride >> geo->chunk_shift;
-		end_disk_offset += geo->stride;
-		atomic_inc(&first_r10bio->remaining);
-		raid_end_discard_bio(r10_bio);
-		wait_barrier(conf);
-		goto retry_discard;
+	if (atomic_dec_and_test(&r10_bio->remaining)) {
+		md_write_end(r10_bio->mddev);
+		raid_end_bio_io(r10_bio);
 	}
 
-	raid_end_discard_bio(r10_bio);
-
 	return 0;
 out:
 	allow_barrier(conf);
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
index 1461fd55311b..79cd2b7d3128 100644
--- a/drivers/md/raid10.h
+++ b/drivers/md/raid10.h
@@ -179,6 +179,5 @@ enum r10bio_state {
 	R10BIO_Previous,
 /* failfast devices did receive failfast requests. */
 	R10BIO_FailFast,
-	R10BIO_Discard,
 };
 #endif
-- 
cgit v1.2.3


From d7cb6be0d0cdced2a7a96bd80a8f835e77ec5204 Mon Sep 17 00:00:00 2001
From: Song Liu <songliubraving@fb.com>
Date: Wed, 9 Dec 2020 11:43:08 -0800
Subject: Revert "md/raid10: improve raid10 discard request"

This reverts commit bcc90d280465ebd51ab8688be86e1f00c62dccf9.

Matthew Ruffell reported data corruption in raid10 due to the changes
in discard handling [1]. Revert these changes before we find a proper fix.

[1] https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1907262/
Cc: Matthew Ruffell <matthew.ruffell@canonical.com>
Cc: Xiao Ni <xni@redhat.com>
Signed-off-by: Song Liu <songliubraving@fb.com>
---
 drivers/md/raid10.c | 256 +---------------------------------------------------
 1 file changed, 1 insertion(+), 255 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 05773ee1b5ba..4ad8447fa868 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1516,256 +1516,6 @@ static void __make_request(struct mddev *mddev, struct bio *bio, int sectors)
 		raid10_write_request(mddev, bio, r10_bio);
 }
 
-static struct bio *raid10_split_bio(struct r10conf *conf,
-			struct bio *bio, sector_t sectors, bool want_first)
-{
-	struct bio *split;
-
-	split = bio_split(bio, sectors,	GFP_NOIO, &conf->bio_split);
-	bio_chain(split, bio);
-	allow_barrier(conf);
-	if (want_first) {
-		submit_bio_noacct(bio);
-		bio = split;
-	} else
-		submit_bio_noacct(split);
-	wait_barrier(conf);
-
-	return bio;
-}
-
-static void raid10_end_discard_request(struct bio *bio)
-{
-	struct r10bio *r10_bio = bio->bi_private;
-	struct r10conf *conf = r10_bio->mddev->private;
-	struct md_rdev *rdev = NULL;
-	int dev;
-	int slot, repl;
-
-	/*
-	 * We don't care the return value of discard bio
-	 */
-	if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
-		set_bit(R10BIO_Uptodate, &r10_bio->state);
-
-	dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
-	if (repl)
-		rdev = conf->mirrors[dev].replacement;
-	if (!rdev) {
-		/* raid10_remove_disk uses smp_mb to make sure rdev is set to
-		 * replacement before setting replacement to NULL. It can read
-		 * rdev first without barrier protect even replacment is NULL
-		 */
-		smp_rmb();
-		rdev = conf->mirrors[dev].rdev;
-	}
-
-	if (atomic_dec_and_test(&r10_bio->remaining)) {
-		md_write_end(r10_bio->mddev);
-		raid_end_bio_io(r10_bio);
-	}
-
-	rdev_dec_pending(rdev, conf->mddev);
-}
-
-/* There are some limitations to handle discard bio
- * 1st, the discard size is bigger than stripe_size*2.
- * 2st, if the discard bio spans reshape progress, we use the old way to
- * handle discard bio
- */
-static int raid10_handle_discard(struct mddev *mddev, struct bio *bio)
-{
-	struct r10conf *conf = mddev->private;
-	struct geom *geo = &conf->geo;
-	struct r10bio *r10_bio;
-
-	int disk;
-	sector_t chunk;
-	unsigned int stripe_size;
-	sector_t split_size;
-
-	sector_t bio_start, bio_end;
-	sector_t first_stripe_index, last_stripe_index;
-	sector_t start_disk_offset;
-	unsigned int start_disk_index;
-	sector_t end_disk_offset;
-	unsigned int end_disk_index;
-	unsigned int remainder;
-
-	if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
-		return -EAGAIN;
-
-	wait_barrier(conf);
-
-	/* Check reshape again to avoid reshape happens after checking
-	 * MD_RECOVERY_RESHAPE and before wait_barrier
-	 */
-	if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
-		goto out;
-
-	stripe_size = geo->raid_disks << geo->chunk_shift;
-	bio_start = bio->bi_iter.bi_sector;
-	bio_end = bio_end_sector(bio);
-
-	/* Maybe one discard bio is smaller than strip size or across one stripe
-	 * and discard region is larger than one stripe size. For far offset layout,
-	 * if the discard region is not aligned with stripe size, there is hole
-	 * when we submit discard bio to member disk. For simplicity, we only
-	 * handle discard bio which discard region is bigger than stripe_size*2
-	 */
-	if (bio_sectors(bio) < stripe_size*2)
-		goto out;
-
-	/* For far offset layout, if bio is not aligned with stripe size, it splits
-	 * the part that is not aligned with strip size.
-	 */
-	div_u64_rem(bio_start, stripe_size, &remainder);
-	if (geo->far_offset && remainder) {
-		split_size = stripe_size - remainder;
-		bio = raid10_split_bio(conf, bio, split_size, false);
-	}
-	div_u64_rem(bio_end, stripe_size, &remainder);
-	if (geo->far_offset && remainder) {
-		split_size = bio_sectors(bio) - remainder;
-		bio = raid10_split_bio(conf, bio, split_size, true);
-	}
-
-	r10_bio = mempool_alloc(&conf->r10bio_pool, GFP_NOIO);
-	r10_bio->mddev = mddev;
-	r10_bio->state = 0;
-	r10_bio->sectors = 0;
-	memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) * geo->raid_disks);
-
-	wait_blocked_dev(mddev, r10_bio);
-
-	r10_bio->master_bio = bio;
-
-	bio_start = bio->bi_iter.bi_sector;
-	bio_end = bio_end_sector(bio);
-
-	/* raid10 uses chunk as the unit to store data. It's similar like raid0.
-	 * One stripe contains the chunks from all member disk (one chunk from
-	 * one disk at the same HBA address). For layout detail, see 'man md 4'
-	 */
-	chunk = bio_start >> geo->chunk_shift;
-	chunk *= geo->near_copies;
-	first_stripe_index = chunk;
-	start_disk_index = sector_div(first_stripe_index, geo->raid_disks);
-	if (geo->far_offset)
-		first_stripe_index *= geo->far_copies;
-	start_disk_offset = (bio_start & geo->chunk_mask) +
-				(first_stripe_index << geo->chunk_shift);
-
-	chunk = bio_end >> geo->chunk_shift;
-	chunk *= geo->near_copies;
-	last_stripe_index = chunk;
-	end_disk_index = sector_div(last_stripe_index, geo->raid_disks);
-	if (geo->far_offset)
-		last_stripe_index *= geo->far_copies;
-	end_disk_offset = (bio_end & geo->chunk_mask) +
-				(last_stripe_index << geo->chunk_shift);
-
-	rcu_read_lock();
-	for (disk = 0; disk < geo->raid_disks; disk++) {
-		struct md_rdev *rdev = rcu_dereference(conf->mirrors[disk].rdev);
-		struct md_rdev *rrdev = rcu_dereference(
-			conf->mirrors[disk].replacement);
-
-		r10_bio->devs[disk].bio = NULL;
-		r10_bio->devs[disk].repl_bio = NULL;
-
-		if (rdev && (test_bit(Faulty, &rdev->flags)))
-			rdev = NULL;
-		if (rrdev && (test_bit(Faulty, &rrdev->flags)))
-			rrdev = NULL;
-		if (!rdev && !rrdev)
-			continue;
-
-		if (rdev) {
-			r10_bio->devs[disk].bio = bio;
-			atomic_inc(&rdev->nr_pending);
-		}
-		if (rrdev) {
-			r10_bio->devs[disk].repl_bio = bio;
-			atomic_inc(&rrdev->nr_pending);
-		}
-	}
-	rcu_read_unlock();
-
-	atomic_set(&r10_bio->remaining, 1);
-	for (disk = 0; disk < geo->raid_disks; disk++) {
-		sector_t dev_start, dev_end;
-		struct bio *mbio, *rbio = NULL;
-		struct md_rdev *rdev = rcu_dereference(conf->mirrors[disk].rdev);
-		struct md_rdev *rrdev = rcu_dereference(
-			conf->mirrors[disk].replacement);
-
-		/*
-		 * Now start to calculate the start and end address for each disk.
-		 * The space between dev_start and dev_end is the discard region.
-		 *
-		 * For dev_start, it needs to consider three conditions:
-		 * 1st, the disk is before start_disk, you can imagine the disk in
-		 * the next stripe. So the dev_start is the start address of next
-		 * stripe.
-		 * 2st, the disk is after start_disk, it means the disk is at the
-		 * same stripe of first disk
-		 * 3st, the first disk itself, we can use start_disk_offset directly
-		 */
-		if (disk < start_disk_index)
-			dev_start = (first_stripe_index + 1) * mddev->chunk_sectors;
-		else if (disk > start_disk_index)
-			dev_start = first_stripe_index * mddev->chunk_sectors;
-		else
-			dev_start = start_disk_offset;
-
-		if (disk < end_disk_index)
-			dev_end = (last_stripe_index + 1) * mddev->chunk_sectors;
-		else if (disk > end_disk_index)
-			dev_end = last_stripe_index * mddev->chunk_sectors;
-		else
-			dev_end = end_disk_offset;
-
-		/* It only handles discard bio which size is >= stripe size, so
-		 * dev_end > dev_start all the time
-		 */
-		if (r10_bio->devs[disk].bio) {
-			mbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set);
-			mbio->bi_end_io = raid10_end_discard_request;
-			mbio->bi_private = r10_bio;
-			r10_bio->devs[disk].bio = mbio;
-			r10_bio->devs[disk].devnum = disk;
-			atomic_inc(&r10_bio->remaining);
-			md_submit_discard_bio(mddev, rdev, mbio,
-					dev_start + choose_data_offset(r10_bio, rdev),
-					dev_end - dev_start);
-			bio_endio(mbio);
-		}
-		if (r10_bio->devs[disk].repl_bio) {
-			rbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set);
-			rbio->bi_end_io = raid10_end_discard_request;
-			rbio->bi_private = r10_bio;
-			r10_bio->devs[disk].repl_bio = rbio;
-			r10_bio->devs[disk].devnum = disk;
-			atomic_inc(&r10_bio->remaining);
-			md_submit_discard_bio(mddev, rrdev, rbio,
-					dev_start + choose_data_offset(r10_bio, rrdev),
-					dev_end - dev_start);
-			bio_endio(rbio);
-		}
-	}
-
-	if (atomic_dec_and_test(&r10_bio->remaining)) {
-		md_write_end(r10_bio->mddev);
-		raid_end_bio_io(r10_bio);
-	}
-
-	return 0;
-out:
-	allow_barrier(conf);
-	return -EAGAIN;
-}
-
 static bool raid10_make_request(struct mddev *mddev, struct bio *bio)
 {
 	struct r10conf *conf = mddev->private;
@@ -1780,10 +1530,6 @@ static bool raid10_make_request(struct mddev *mddev, struct bio *bio)
 	if (!md_write_start(mddev, bio))
 		return false;
 
-	if (unlikely(bio_op(bio) == REQ_OP_DISCARD))
-		if (!raid10_handle_discard(mddev, bio))
-			return true;
-
 	/*
 	 * If this request crosses a chunk boundary, we need to split
 	 * it.
@@ -4023,7 +3769,7 @@ static int raid10_run(struct mddev *mddev)
 
 	if (mddev->queue) {
 		blk_queue_max_discard_sectors(mddev->queue,
-					      UINT_MAX);
+					      mddev->chunk_sectors);
 		blk_queue_max_write_same_sectors(mddev->queue, 0);
 		blk_queue_max_write_zeroes_sectors(mddev->queue, 0);
 		blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9);
-- 
cgit v1.2.3


From 4e2c6567efdd6f252e1874c41c8db4abfb0a9bf3 Mon Sep 17 00:00:00 2001
From: Song Liu <songliubraving@fb.com>
Date: Wed, 9 Dec 2020 11:43:43 -0800
Subject: Revert "md/raid10: pull codes that wait for blocked dev into one
 function"

This reverts commit f046f5d0d79cdb968f219ce249e497fd1accf484.

Matthew Ruffell reported data corruption in raid10 due to the changes
in discard handling [1]. Revert these changes before we find a proper fix.

[1] https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1907262/
Cc: Matthew Ruffell <matthew.ruffell@canonical.com>
Cc: Xiao Ni <xni@redhat.com>
Signed-off-by: Song Liu <songliubraving@fb.com>
---
 drivers/md/raid10.c | 118 +++++++++++++++++++++++-----------------------------
 1 file changed, 51 insertions(+), 67 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 4ad8447fa868..f2ec44fda1a0 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1275,75 +1275,12 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio,
 	}
 }
 
-static void wait_blocked_dev(struct mddev *mddev, struct r10bio *r10_bio)
-{
-	int i;
-	struct r10conf *conf = mddev->private;
-	struct md_rdev *blocked_rdev;
-
-retry_wait:
-	blocked_rdev = NULL;
-	rcu_read_lock();
-	for (i = 0; i < conf->copies; i++) {
-		struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
-		struct md_rdev *rrdev = rcu_dereference(
-			conf->mirrors[i].replacement);
-		if (rdev == rrdev)
-			rrdev = NULL;
-		if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
-			atomic_inc(&rdev->nr_pending);
-			blocked_rdev = rdev;
-			break;
-		}
-		if (rrdev && unlikely(test_bit(Blocked, &rrdev->flags))) {
-			atomic_inc(&rrdev->nr_pending);
-			blocked_rdev = rrdev;
-			break;
-		}
-
-		if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) {
-			sector_t first_bad;
-			sector_t dev_sector = r10_bio->devs[i].addr;
-			int bad_sectors;
-			int is_bad;
-
-			/* Discard request doesn't care the write result
-			 * so it doesn't need to wait blocked disk here.
-			 */
-			if (!r10_bio->sectors)
-				continue;
-
-			is_bad = is_badblock(rdev, dev_sector, r10_bio->sectors,
-					     &first_bad, &bad_sectors);
-			if (is_bad < 0) {
-				/* Mustn't write here until the bad block
-				 * is acknowledged
-				 */
-				atomic_inc(&rdev->nr_pending);
-				set_bit(BlockedBadBlocks, &rdev->flags);
-				blocked_rdev = rdev;
-				break;
-			}
-		}
-	}
-	rcu_read_unlock();
-
-	if (unlikely(blocked_rdev)) {
-		/* Have to wait for this device to get unblocked, then retry */
-		allow_barrier(conf);
-		raid10_log(conf->mddev, "%s wait rdev %d blocked",
-				__func__, blocked_rdev->raid_disk);
-		md_wait_for_blocked_rdev(blocked_rdev, mddev);
-		wait_barrier(conf);
-		goto retry_wait;
-	}
-}
-
 static void raid10_write_request(struct mddev *mddev, struct bio *bio,
 				 struct r10bio *r10_bio)
 {
 	struct r10conf *conf = mddev->private;
 	int i;
+	struct md_rdev *blocked_rdev;
 	sector_t sectors;
 	int max_sectors;
 
@@ -1401,9 +1338,8 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
 
 	r10_bio->read_slot = -1; /* make sure repl_bio gets freed */
 	raid10_find_phys(conf, r10_bio);
-
-	wait_blocked_dev(mddev, r10_bio);
-
+retry_write:
+	blocked_rdev = NULL;
 	rcu_read_lock();
 	max_sectors = r10_bio->sectors;
 
@@ -1414,6 +1350,16 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
 			conf->mirrors[d].replacement);
 		if (rdev == rrdev)
 			rrdev = NULL;
+		if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
+			atomic_inc(&rdev->nr_pending);
+			blocked_rdev = rdev;
+			break;
+		}
+		if (rrdev && unlikely(test_bit(Blocked, &rrdev->flags))) {
+			atomic_inc(&rrdev->nr_pending);
+			blocked_rdev = rrdev;
+			break;
+		}
 		if (rdev && (test_bit(Faulty, &rdev->flags)))
 			rdev = NULL;
 		if (rrdev && (test_bit(Faulty, &rrdev->flags)))
@@ -1434,6 +1380,15 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
 
 			is_bad = is_badblock(rdev, dev_sector, max_sectors,
 					     &first_bad, &bad_sectors);
+			if (is_bad < 0) {
+				/* Mustn't write here until the bad block
+				 * is acknowledged
+				 */
+				atomic_inc(&rdev->nr_pending);
+				set_bit(BlockedBadBlocks, &rdev->flags);
+				blocked_rdev = rdev;
+				break;
+			}
 			if (is_bad && first_bad <= dev_sector) {
 				/* Cannot write here at all */
 				bad_sectors -= (dev_sector - first_bad);
@@ -1469,6 +1424,35 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
 	}
 	rcu_read_unlock();
 
+	if (unlikely(blocked_rdev)) {
+		/* Have to wait for this device to get unblocked, then retry */
+		int j;
+		int d;
+
+		for (j = 0; j < i; j++) {
+			if (r10_bio->devs[j].bio) {
+				d = r10_bio->devs[j].devnum;
+				rdev_dec_pending(conf->mirrors[d].rdev, mddev);
+			}
+			if (r10_bio->devs[j].repl_bio) {
+				struct md_rdev *rdev;
+				d = r10_bio->devs[j].devnum;
+				rdev = conf->mirrors[d].replacement;
+				if (!rdev) {
+					/* Race with remove_disk */
+					smp_mb();
+					rdev = conf->mirrors[d].rdev;
+				}
+				rdev_dec_pending(rdev, mddev);
+			}
+		}
+		allow_barrier(conf);
+		raid10_log(conf->mddev, "wait rdev %d blocked", blocked_rdev->raid_disk);
+		md_wait_for_blocked_rdev(blocked_rdev, mddev);
+		wait_barrier(conf);
+		goto retry_write;
+	}
+
 	if (max_sectors < r10_bio->sectors)
 		r10_bio->sectors = max_sectors;
 
-- 
cgit v1.2.3


From 17c28c2a068730e9d065a0e4ed03beed074d8997 Mon Sep 17 00:00:00 2001
From: Song Liu <songliubraving@fb.com>
Date: Wed, 9 Dec 2020 11:44:20 -0800
Subject: Revert "md/raid10: extend r10bio devs to raid disks"

This reverts commit 8650a889017cb1f6ea6813ccf83a2e9f6fa49dd3.

Matthew Ruffell reported data corruption in raid10 due to the changes
in discard handling [1]. Revert these changes before we find a proper fix.

[1] https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1907262/
Cc: Matthew Ruffell <matthew.ruffell@canonical.com>
Cc: Xiao Ni <xni@redhat.com>
Signed-off-by: Song Liu <songliubraving@fb.com>
---
 drivers/md/raid10.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index f2ec44fda1a0..3b598a3cb462 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -91,7 +91,7 @@ static inline struct r10bio *get_resync_r10bio(struct bio *bio)
 static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data)
 {
 	struct r10conf *conf = data;
-	int size = offsetof(struct r10bio, devs[conf->geo.raid_disks]);
+	int size = offsetof(struct r10bio, devs[conf->copies]);
 
 	/* allocate a r10bio with room for raid_disks entries in the
 	 * bios array */
@@ -238,7 +238,7 @@ static void put_all_bios(struct r10conf *conf, struct r10bio *r10_bio)
 {
 	int i;
 
-	for (i = 0; i < conf->geo.raid_disks; i++) {
+	for (i = 0; i < conf->copies; i++) {
 		struct bio **bio = & r10_bio->devs[i].bio;
 		if (!BIO_SPECIAL(*bio))
 			bio_put(*bio);
@@ -327,7 +327,7 @@ static int find_bio_disk(struct r10conf *conf, struct r10bio *r10_bio,
 	int slot;
 	int repl = 0;
 
-	for (slot = 0; slot < conf->geo.raid_disks; slot++) {
+	for (slot = 0; slot < conf->copies; slot++) {
 		if (r10_bio->devs[slot].bio == bio)
 			break;
 		if (r10_bio->devs[slot].repl_bio == bio) {
@@ -336,6 +336,7 @@ static int find_bio_disk(struct r10conf *conf, struct r10bio *r10_bio,
 		}
 	}
 
+	BUG_ON(slot == conf->copies);
 	update_head_pos(slot, r10_bio);
 
 	if (slotp)
@@ -1492,7 +1493,7 @@ static void __make_request(struct mddev *mddev, struct bio *bio, int sectors)
 	r10_bio->mddev = mddev;
 	r10_bio->sector = bio->bi_iter.bi_sector;
 	r10_bio->state = 0;
-	memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) * conf->geo.raid_disks);
+	memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) * conf->copies);
 
 	if (bio_data_dir(bio) == READ)
 		raid10_read_request(mddev, bio, r10_bio);
-- 
cgit v1.2.3


From 57a0f3a81ef21fe51d6223aa78a1a890098d4ada Mon Sep 17 00:00:00 2001
From: Song Liu <songliubraving@fb.com>
Date: Wed, 9 Dec 2020 11:44:42 -0800
Subject: Revert "md: add md_submit_discard_bio() for submitting discard bio"

This reverts commit 2628089b74d5a64bd0bcb5d247a18f78d7b6f4d0.

Matthew Ruffell reported data corruption in raid10 due to the changes
in discard handling [1]. Revert these changes before we find a proper fix.

[1] https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1907262/
Cc: Matthew Ruffell <matthew.ruffell@canonical.com>
Cc: Xiao Ni <xni@redhat.com>
Signed-off-by: Song Liu <songliubraving@fb.com>
---
 drivers/md/md.c    | 20 --------------------
 drivers/md/md.h    |  2 --
 drivers/md/raid0.c | 14 ++++++++++++--
 3 files changed, 12 insertions(+), 24 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 98bac4f304ae..0037c6ecab65 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -8582,26 +8582,6 @@ void md_write_end(struct mddev *mddev)
 
 EXPORT_SYMBOL(md_write_end);
 
-/* This is used by raid0 and raid10 */
-void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev,
-			struct bio *bio, sector_t start, sector_t size)
-{
-	struct bio *discard_bio = NULL;
-
-	if (__blkdev_issue_discard(rdev->bdev, start, size,
-		GFP_NOIO, 0, &discard_bio) || !discard_bio)
-		return;
-
-	bio_chain(discard_bio, bio);
-	bio_clone_blkg_association(discard_bio, bio);
-	if (mddev->gendisk)
-		trace_block_bio_remap(bdev_get_queue(rdev->bdev),
-			discard_bio, disk_devt(mddev->gendisk),
-			bio->bi_iter.bi_sector);
-	submit_bio_noacct(discard_bio);
-}
-EXPORT_SYMBOL(md_submit_discard_bio);
-
 /* md_allow_write(mddev)
  * Calling this ensures that the array is marked 'active' so that writes
  * may proceed without blocking.  It is important to call this before
diff --git a/drivers/md/md.h b/drivers/md/md.h
index ccfb69868c2e..2175a5ac4f7c 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -713,8 +713,6 @@ extern void md_write_end(struct mddev *mddev);
 extern void md_done_sync(struct mddev *mddev, int blocks, int ok);
 extern void md_error(struct mddev *mddev, struct md_rdev *rdev);
 extern void md_finish_reshape(struct mddev *mddev);
-extern void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev,
-			struct bio *bio, sector_t start, sector_t size);
 
 extern bool __must_check md_flush_request(struct mddev *mddev, struct bio *bio);
 extern void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 6f44177593a5..35843df15b5e 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -477,6 +477,7 @@ static void raid0_handle_discard(struct mddev *mddev, struct bio *bio)
 
 	for (disk = 0; disk < zone->nb_dev; disk++) {
 		sector_t dev_start, dev_end;
+		struct bio *discard_bio = NULL;
 		struct md_rdev *rdev;
 
 		if (disk < start_disk_index)
@@ -499,9 +500,18 @@ static void raid0_handle_discard(struct mddev *mddev, struct bio *bio)
 
 		rdev = conf->devlist[(zone - conf->strip_zone) *
 			conf->strip_zone[0].nb_dev + disk];
-		md_submit_discard_bio(mddev, rdev, bio,
+		if (__blkdev_issue_discard(rdev->bdev,
 			dev_start + zone->dev_start + rdev->data_offset,
-			dev_end - dev_start);
+			dev_end - dev_start, GFP_NOIO, 0, &discard_bio) ||
+		    !discard_bio)
+			continue;
+		bio_chain(discard_bio, bio);
+		bio_clone_blkg_association(discard_bio, bio);
+		if (mddev->gendisk)
+			trace_block_bio_remap(bdev_get_queue(rdev->bdev),
+				discard_bio, disk_devt(mddev->gendisk),
+				bio->bi_iter.bi_sector);
+		submit_bio_noacct(discard_bio);
 	}
 	bio_endio(bio);
 }
-- 
cgit v1.2.3


From 6ffeb1c3f8226244c08105bcdbeecc04bad6b89a Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Sat, 12 Dec 2020 11:55:37 -0500
Subject: md: change mddev 'chunk_sectors' from int to unsigned
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Commit e2782f560c29 ("Revert "dm raid: remove unnecessary discard
limits for raid10"") exposed compiler warnings introduced by commit
e0910c8e4f87 ("dm raid: fix discard limits for raid1 and raid10"):

In file included from ./include/linux/kernel.h:14,
                 from ./include/asm-generic/bug.h:20,
                 from ./arch/x86/include/asm/bug.h:93,
                 from ./include/linux/bug.h:5,
                 from ./include/linux/mmdebug.h:5,
                 from ./include/linux/gfp.h:5,
                 from ./include/linux/slab.h:15,
                 from drivers/md/dm-raid.c:8:
drivers/md/dm-raid.c: In function ‘raid_io_hints’:
./include/linux/minmax.h:18:28: warning: comparison of distinct pointer types lacks a cast
  (!!(sizeof((typeof(x) *)1 == (typeof(y) *)1)))
                            ^~
./include/linux/minmax.h:32:4: note: in expansion of macro ‘__typecheck’
   (__typecheck(x, y) && __no_side_effects(x, y))
    ^~~~~~~~~~~
./include/linux/minmax.h:42:24: note: in expansion of macro ‘__safe_cmp’
  __builtin_choose_expr(__safe_cmp(x, y), \
                        ^~~~~~~~~~
./include/linux/minmax.h:51:19: note: in expansion of macro ‘__careful_cmp’
 #define min(x, y) __careful_cmp(x, y, <)
                   ^~~~~~~~~~~~~
./include/linux/minmax.h:84:39: note: in expansion of macro ‘min’
  __x == 0 ? __y : ((__y == 0) ? __x : min(__x, __y)); })
                                       ^~~
drivers/md/dm-raid.c:3739:33: note: in expansion of macro ‘min_not_zero’
   limits->max_discard_sectors = min_not_zero(rs->md.chunk_sectors,
                                 ^~~~~~~~~~~~

Fix this by changing the chunk_sectors member of 'struct mddev' from
int to 'unsigned int' to match the type used for the 'chunk_sectors'
member of 'struct queue_limits'.  Various MD code still uses 'int' but
none of it appears to ever make use of signed int; and storing
positive signed int in unsigned is perfectly safe.

Reported-by: Song Liu <songliubraving@fb.com>
Fixes: e2782f560c29 ("Revert "dm raid: remove unnecessary discard limits for raid10"")
Fixes: e0910c8e4f87 ("dm raid: fix discard limits for raid1 and raid10")
Cc: stable@vger,kernel.org # e0910c8e4f87 was marked for stable@
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Reviewed-by: Song Liu <song@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/md/md.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/md.h b/drivers/md/md.h
index 2175a5ac4f7c..bb645bc3ba6d 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -311,7 +311,7 @@ struct mddev {
 	int				external;	/* metadata is
 							 * managed externally */
 	char				metadata_type[17]; /* externally set*/
-	int				chunk_sectors;
+	unsigned int			chunk_sectors;
 	time64_t			ctime, utime;
 	int				level, layout;
 	char				clevel[16];
@@ -339,7 +339,7 @@ struct mddev {
 	 */
 	sector_t			reshape_position;
 	int				delta_disks, new_level, new_layout;
-	int				new_chunk_sectors;
+	unsigned int			new_chunk_sectors;
 	int				reshape_backwards;
 
 	struct md_thread		*thread;	/* management thread */
-- 
cgit v1.2.3