From d9c0fa509eafb31605bddda34e2e39f4354563b6 Mon Sep 17 00:00:00 2001
From: Xiao Ni <xni@redhat.com>
Date: Tue, 30 Jun 2020 15:55:36 +0800
Subject: md: fix max sectors calculation for super 1.0

To grow size of super 1.0 raid array, it is necessary to check the device
max usable size.

Now it uses rdev->sectors for max usable size. If one disk is 500G and the
raid device only uses the 100GB of this disk. rdev->sectors can't tell the
real max usable size. The max usable size should be

dev_size-(superblock_size+bitmap_size+badblock_size).

Also, remove unnecessary sb_start update in super_1_rdev_size_change().

Signed-off-by: Xiao Ni <xni@redhat.com>
Signed-off-by: Song Liu <songliubraving@fb.com>
---
 drivers/md/md.c | 35 +++++++++++++++++++++++++++++++----
 1 file changed, 31 insertions(+), 4 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index ea48bc25cce1..5e8f772ab117 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -2193,6 +2193,24 @@ retry:
 	sb->sb_csum = calc_sb_1_csum(sb);
 }
 
+static sector_t super_1_choose_bm_space(sector_t dev_size)
+{
+	sector_t bm_space;
+
+	/* if the device is bigger than 8Gig, save 64k for bitmap
+	 * usage, if bigger than 200Gig, save 128k
+	 */
+	if (dev_size < 64*2)
+		bm_space = 0;
+	else if (dev_size - 64*2 >= 200*1024*1024*2)
+		bm_space = 128*2;
+	else if (dev_size - 4*2 > 8*1024*1024*2)
+		bm_space = 64*2;
+	else
+		bm_space = 4*2;
+	return bm_space;
+}
+
 static unsigned long long
 super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
 {
@@ -2213,13 +2231,22 @@ super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
 		return 0;
 	} else {
 		/* minor version 0; superblock after data */
-		sector_t sb_start;
-		sb_start = (i_size_read(rdev->bdev->bd_inode) >> 9) - 8*2;
+		sector_t sb_start, bm_space;
+		sector_t dev_size = i_size_read(rdev->bdev->bd_inode) >> 9;
+
+		/* 8K is for superblock */
+		sb_start = dev_size - 8*2;
 		sb_start &= ~(sector_t)(4*2 - 1);
-		max_sectors = rdev->sectors + sb_start - rdev->sb_start;
+
+		bm_space = super_1_choose_bm_space(dev_size);
+
+		/* Space that can be used to store date needs to decrease
+		 * superblock bitmap space and bad block space(4K)
+		 */
+		max_sectors = sb_start - bm_space - 4*2;
+
 		if (!num_sectors || num_sectors > max_sectors)
 			num_sectors = max_sectors;
-		rdev->sb_start = sb_start;
 	}
 	sb = page_address(rdev->sb_page);
 	sb->data_size = cpu_to_le64(num_sectors);
-- 
cgit v1.2.3


From ec164d07aa771370c7c24c1fa7f7692ad30f01cb Mon Sep 17 00:00:00 2001
From: Sebastian Parschauer <s.parschauer@gmx.de>
Date: Tue, 28 Jul 2020 12:01:39 +0200
Subject: md: register new md sysfs file 'uuid' read-only

Report the UUID of the MD array in the following format:
xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx

This is useful if you don't want to wait for udev to identify array.
And it is also easy for script to monitor it with the format.

Signed-off-by: Sebastian Parschauer <s.parschauer@gmx.de>
[Guoqing: mention the change in md.rst]
Signed-off-by: Guoqing Jiang <guoqing.jiang@cloud.ionos.com>
Signed-off-by: Song Liu <songliubraving@fb.com>
---
 Documentation/admin-guide/md.rst | 4 ++++
 drivers/md/md.c                  | 9 +++++++++
 2 files changed, 13 insertions(+)

diff --git a/Documentation/admin-guide/md.rst b/Documentation/admin-guide/md.rst
index d973d469ffc4..cc8781b96b4d 100644
--- a/Documentation/admin-guide/md.rst
+++ b/Documentation/admin-guide/md.rst
@@ -426,6 +426,10 @@ All md devices contain:
      The accepted values when writing to this file are ``ppl`` and ``resync``,
      used to enable and disable PPL.
 
+  uuid
+     This indicates the UUID of the array in the following format:
+     xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx
+
 
 As component devices are added to an md array, they appear in the ``md``
 directory as new directories named::
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 5e8f772ab117..5116d29b74e1 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -4252,6 +4252,14 @@ out_unlock:
 static struct md_sysfs_entry md_raid_disks =
 __ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store);
 
+static ssize_t
+uuid_show(struct mddev *mddev, char *page)
+{
+	return sprintf(page, "%pU\n", mddev->uuid);
+}
+static struct md_sysfs_entry md_uuid =
+__ATTR(uuid, S_IRUGO, uuid_show, NULL);
+
 static ssize_t
 chunk_size_show(struct mddev *mddev, char *page)
 {
@@ -5508,6 +5516,7 @@ static struct attribute *md_default_attrs[] = {
 	&md_level.attr,
 	&md_layout.attr,
 	&md_raid_disks.attr,
+	&md_uuid.attr,
 	&md_chunk_size.attr,
 	&md_size.attr,
 	&md_resync_start.attr,
-- 
cgit v1.2.3


From e3914d596f79742ce3038ffdf66e4fa585ad7cd5 Mon Sep 17 00:00:00 2001
From: Guoqing Jiang <guoqing.jiang@cloud.ionos.com>
Date: Tue, 28 Jul 2020 12:01:40 +0200
Subject: md/raid5: remove the redundant setting of STRIPE_HANDLE

The flag is already set before compare rcw with rmw, so it is
not necessary to do it again.

Signed-off-by: Guoqing Jiang <guoqing.jiang@cloud.ionos.com>
Signed-off-by: Song Liu <songliubraving@fb.com>
---
 drivers/md/raid5.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index a6ff6e1e039b..790d91aa5f40 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -3995,10 +3995,8 @@ static int handle_stripe_dirtying(struct r5conf *conf,
 					set_bit(R5_LOCKED, &dev->flags);
 					set_bit(R5_Wantread, &dev->flags);
 					s->locked++;
-				} else {
+				} else
 					set_bit(STRIPE_DELAYED, &sh->state);
-					set_bit(STRIPE_HANDLE, &sh->state);
-				}
 			}
 		}
 	}
@@ -4023,10 +4021,8 @@ static int handle_stripe_dirtying(struct r5conf *conf,
 					set_bit(R5_Wantread, &dev->flags);
 					s->locked++;
 					qread++;
-				} else {
+				} else
 					set_bit(STRIPE_DELAYED, &sh->state);
-					set_bit(STRIPE_HANDLE, &sh->state);
-				}
 			}
 		}
 		if (rcw && conf->mddev->queue)
-- 
cgit v1.2.3


From b3db8a216393cc228ab4add580f488da55177b66 Mon Sep 17 00:00:00 2001
From: Guoqing Jiang <guoqing.jiang@cloud.ionos.com>
Date: Tue, 28 Jul 2020 12:01:41 +0200
Subject: md: print errno in super_written

It is better to print errno instead of bi_status.

Signed-off-by: Guoqing Jiang <guoqing.jiang@cloud.ionos.com>
Signed-off-by: Song Liu <songliubraving@fb.com>
---
 drivers/md/md.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 5116d29b74e1..9c69084cae73 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -978,7 +978,8 @@ static void super_written(struct bio *bio)
 	struct mddev *mddev = rdev->mddev;
 
 	if (bio->bi_status) {
-		pr_err("md: super_written gets error=%d\n", bio->bi_status);
+		pr_err("md: %s gets error=%d\n", __func__,
+		       blk_status_to_errno(bio->bi_status));
 		md_error(mddev, rdev);
 		if (!test_bit(Faulty, &rdev->flags)
 		    && (bio->bi_opf & MD_FAILFAST)) {
-- 
cgit v1.2.3


From 01b5d32a57fe83820d014485a4c41965bc3b5ce4 Mon Sep 17 00:00:00 2001
From: Guoqing Jiang <guoqing.jiang@cloud.ionos.com>
Date: Tue, 28 Jul 2020 12:01:42 +0200
Subject: raid5-cache: hold spinlock instead of mutex in r5c_journal_mode_show

Replace mddev_lock with spin_lock to align with other show methods in
raid5_attrs.

Signed-off-by: Guoqing Jiang <guoqing.jiang@cloud.ionos.com>
Signed-off-by: Song Liu <songliubraving@fb.com>
---
 drivers/md/raid5-cache.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 82eb4a906e31..4337ae0e6af2 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -2537,13 +2537,10 @@ static ssize_t r5c_journal_mode_show(struct mddev *mddev, char *page)
 	struct r5conf *conf;
 	int ret;
 
-	ret = mddev_lock(mddev);
-	if (ret)
-		return ret;
-
+	spin_lock(&mddev->lock);
 	conf = mddev->private;
 	if (!conf || !conf->log) {
-		mddev_unlock(mddev);
+		spin_unlock(&mddev->lock);
 		return 0;
 	}
 
@@ -2563,7 +2560,7 @@ static ssize_t r5c_journal_mode_show(struct mddev *mddev, char *page)
 	default:
 		ret = 0;
 	}
-	mddev_unlock(mddev);
+	spin_unlock(&mddev->lock);
 	return ret;
 }
 
-- 
cgit v1.2.3


From 3a31cf3d210f85d57fd302d83514832ebacb3ca7 Mon Sep 17 00:00:00 2001
From: Guoqing Jiang <guoqing.jiang@cloud.ionos.com>
Date: Tue, 28 Jul 2020 12:01:43 +0200
Subject: raid5: don't duplicate code for different paths in handle_stripe

As we can see, R5_LOCKED is set and s.locked is increased whether
R5_ReWrite is set or not, so move it to common path.

Signed-off-by: Guoqing Jiang <guoqing.jiang@cloud.ionos.com>
Signed-off-by: Song Liu <songliubraving@fb.com>
---
 drivers/md/raid5.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 790d91aa5f40..b06edfaa73b0 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -4966,14 +4966,11 @@ static void handle_stripe(struct stripe_head *sh)
 				if (!test_bit(R5_ReWrite, &dev->flags)) {
 					set_bit(R5_Wantwrite, &dev->flags);
 					set_bit(R5_ReWrite, &dev->flags);
-					set_bit(R5_LOCKED, &dev->flags);
-					s.locked++;
-				} else {
+				} else
 					/* let's read it back */
 					set_bit(R5_Wantread, &dev->flags);
-					set_bit(R5_LOCKED, &dev->flags);
-					s.locked++;
-				}
+				set_bit(R5_LOCKED, &dev->flags);
+				s.locked++;
 			}
 		}
 
-- 
cgit v1.2.3


From a1c6ae3d9f3dd6aa5981a332a6f700cf1c25edef Mon Sep 17 00:00:00 2001
From: ChangSyun Peng <allenpeng@synology.com>
Date: Fri, 31 Jul 2020 17:50:17 +0800
Subject: md/raid5: Fix Force reconstruct-write io stuck in degraded raid5

In degraded raid5, we need to read parity to do reconstruct-write when
data disks fail. However, we can not read parity from
handle_stripe_dirtying() in force reconstruct-write mode.

Reproducible Steps:

1. Create degraded raid5
mdadm -C /dev/md2 --assume-clean -l5 -n3 /dev/sda2 /dev/sdb2 missing
2. Set rmw_level to 0
echo 0 > /sys/block/md2/md/rmw_level
3. IO to raid5

Now some io may be stuck in raid5. We can use handle_stripe_fill() to read
the parity in this situation.

Cc: <stable@vger.kernel.org> # v4.4+
Reviewed-by: Alex Wu <alexwu@synology.com>
Reviewed-by: BingJing Chang <bingjingc@synology.com>
Reviewed-by: Danny Shih <dannyshih@synology.com>
Signed-off-by: ChangSyun Peng <allenpeng@synology.com>
Signed-off-by: Song Liu <songliubraving@fb.com>
---
 drivers/md/raid5.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index b06edfaa73b0..9aab66fd1003 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -3626,6 +3626,7 @@ static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s,
 	 * is missing/faulty, then we need to read everything we can.
 	 */
 	if (sh->raid_conf->level != 6 &&
+	    sh->raid_conf->rmw_level != PARITY_DISABLE_RMW &&
 	    sh->sector < sh->raid_conf->mddev->recovery_cp)
 		/* reconstruct-write isn't being forced */
 		return 0;
@@ -4862,7 +4863,7 @@ static void handle_stripe(struct stripe_head *sh)
 	 * or to load a block that is being partially written.
 	 */
 	if (s.to_read || s.non_overwrite
-	    || (conf->level == 6 && s.to_write && s.failed)
+	    || (s.to_write && s.failed)
 	    || (s.syncing && (s.uptodate + s.compute < disks))
 	    || s.replacing
 	    || s.expanding)
-- 
cgit v1.2.3


From 45a4d8fd6c7926e7991a1b29233d725fe12935da Mon Sep 17 00:00:00 2001
From: ChangSyun Peng <allenpeng@synology.com>
Date: Fri, 31 Jul 2020 17:50:31 +0800
Subject: md/raid5: Allow degraded raid6 to do rmw

Degraded raid6 always do reconstruct-write now. With raid6 xor supported,
we can do rmw in degraded raid6. This patch can reduce many read IOs to
improve performance.

If the failed disk is P, Q or the disk we want to write to, we may need to
do reconstruct-write in max degraded raid6. In this situation we can not
read enough data from handle_stripe_dirtying() so we have to set force_rcw
in handle_stripe_fill() to read all data.

Reviewed-by: Alex Wu <alexwu@synology.com>
Reviewed-by: BingJing Chang <bingjingc@synology.com>
Reviewed-by: Danny Shih <dannyshih@synology.com>
Signed-off-by: ChangSyun Peng <allenpeng@synology.com>
Signed-off-by: Song Liu <songliubraving@fb.com>
---
 drivers/md/raid5.c | 22 ++++++++++++++++------
 1 file changed, 16 insertions(+), 6 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 9aab66fd1003..657634a7e8d1 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -3557,6 +3557,7 @@ static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s,
 	struct r5dev *fdev[2] = { &sh->dev[s->failed_num[0]],
 				  &sh->dev[s->failed_num[1]] };
 	int i;
+	bool force_rcw = (sh->raid_conf->rmw_level == PARITY_DISABLE_RMW);
 
 
 	if (test_bit(R5_LOCKED, &dev->flags) ||
@@ -3615,18 +3616,27 @@ static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s,
 			 * devices must be read.
 			 */
 			return 1;
+
+		if (s->failed >= 2 &&
+		    (fdev[i]->towrite ||
+		     s->failed_num[i] == sh->pd_idx ||
+		     s->failed_num[i] == sh->qd_idx) &&
+		    !test_bit(R5_UPTODATE, &fdev[i]->flags))
+			/* In max degraded raid6, If the failed disk is P, Q,
+			 * or we want to read the failed disk, we need to do
+			 * reconstruct-write.
+			 */
+			force_rcw = true;
 	}
 
-	/* If we are forced to do a reconstruct-write, either because
-	 * the current RAID6 implementation only supports that, or
-	 * because parity cannot be trusted and we are currently
-	 * recovering it, there is extra need to be careful.
+	/* If we are forced to do a reconstruct-write, because parity
+	 * cannot be trusted and we are currently recovering it, there
+	 * is extra need to be careful.
 	 * If one of the devices that we would need to read, because
 	 * it is not being overwritten (and maybe not written at all)
 	 * is missing/faulty, then we need to read everything we can.
 	 */
-	if (sh->raid_conf->level != 6 &&
-	    sh->raid_conf->rmw_level != PARITY_DISABLE_RMW &&
+	if (!force_rcw &&
 	    sh->sector < sh->raid_conf->mddev->recovery_cp)
 		/* reconstruct-write isn't being forced */
 		return 0;
-- 
cgit v1.2.3