From d9c0fa509eafb31605bddda34e2e39f4354563b6 Mon Sep 17 00:00:00 2001 From: Xiao Ni Date: Tue, 30 Jun 2020 15:55:36 +0800 Subject: md: fix max sectors calculation for super 1.0 To grow size of super 1.0 raid array, it is necessary to check the device max usable size. Now it uses rdev->sectors for max usable size. If one disk is 500GB and the raid device only uses 100GB of this disk, rdev->sectors can't tell the real max usable size. The max usable size should be dev_size-(superblock_size+bitmap_size+badblock_size). Also, remove unnecessary sb_start update in super_1_rdev_size_change(). Signed-off-by: Xiao Ni Signed-off-by: Song Liu --- drivers/md/md.c | 35 +++++++++++++++++++++++++++++++---- 1 file changed, 31 insertions(+), 4 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index ea48bc25cce1..5e8f772ab117 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2193,6 +2193,24 @@ retry: sb->sb_csum = calc_sb_1_csum(sb); } +static sector_t super_1_choose_bm_space(sector_t dev_size) +{ + sector_t bm_space; + + /* if the device is bigger than 8Gig, save 64k for bitmap + * usage, if bigger than 200Gig, save 128k + */ + if (dev_size < 64*2) + bm_space = 0; + else if (dev_size - 64*2 >= 200*1024*1024*2) + bm_space = 128*2; + else if (dev_size - 4*2 > 8*1024*1024*2) + bm_space = 64*2; + else + bm_space = 4*2; + return bm_space; +} + static unsigned long long super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) { @@ -2213,13 +2231,22 @@ super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) return 0; } else { /* minor version 0; superblock after data */ - sector_t sb_start; - sb_start = (i_size_read(rdev->bdev->bd_inode) >> 9) - 8*2; + sector_t sb_start, bm_space; + sector_t dev_size = i_size_read(rdev->bdev->bd_inode) >> 9; + + /* 8K is for superblock */ + sb_start = dev_size - 8*2; sb_start &= ~(sector_t)(4*2 - 1); - max_sectors = rdev->sectors + sb_start - rdev->sb_start; + + bm_space = super_1_choose_bm_space(dev_size); + + 
/* Space usable for data is what remains after subtracting + * the superblock, bitmap space and bad block space (4K) + */ + max_sectors = sb_start - bm_space - 4*2; + if (!num_sectors || num_sectors > max_sectors) num_sectors = max_sectors; - rdev->sb_start = sb_start; } sb = page_address(rdev->sb_page); sb->data_size = cpu_to_le64(num_sectors); -- cgit v1.2.3 From ec164d07aa771370c7c24c1fa7f7692ad30f01cb Mon Sep 17 00:00:00 2001 From: Sebastian Parschauer Date: Tue, 28 Jul 2020 12:01:39 +0200 Subject: md: register new md sysfs file 'uuid' read-only Report the UUID of the MD array in the following format: xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx This is useful if you don't want to wait for udev to identify the array. It is also easy for scripts to monitor it in this format. Signed-off-by: Sebastian Parschauer [Guoqing: mention the change in md.rst] Signed-off-by: Guoqing Jiang Signed-off-by: Song Liu --- Documentation/admin-guide/md.rst | 4 ++++ drivers/md/md.c | 9 +++++++++ 2 files changed, 13 insertions(+) diff --git a/Documentation/admin-guide/md.rst b/Documentation/admin-guide/md.rst index d973d469ffc4..cc8781b96b4d 100644 --- a/Documentation/admin-guide/md.rst +++ b/Documentation/admin-guide/md.rst @@ -426,6 +426,10 @@ All md devices contain: The accepted values when writing to this file are ``ppl`` and ``resync``, used to enable and disable PPL. 
+ uuid + This indicates the UUID of the array in the following format: + xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx + As component devices are added to an md array, they appear in the ``md`` directory as new directories named:: diff --git a/drivers/md/md.c b/drivers/md/md.c index 5e8f772ab117..5116d29b74e1 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -4252,6 +4252,14 @@ out_unlock: static struct md_sysfs_entry md_raid_disks = __ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store); +static ssize_t +uuid_show(struct mddev *mddev, char *page) +{ + return sprintf(page, "%pU\n", mddev->uuid); +} +static struct md_sysfs_entry md_uuid = +__ATTR(uuid, S_IRUGO, uuid_show, NULL); + static ssize_t chunk_size_show(struct mddev *mddev, char *page) { @@ -5508,6 +5516,7 @@ static struct attribute *md_default_attrs[] = { &md_level.attr, &md_layout.attr, &md_raid_disks.attr, + &md_uuid.attr, &md_chunk_size.attr, &md_size.attr, &md_resync_start.attr, -- cgit v1.2.3 From e3914d596f79742ce3038ffdf66e4fa585ad7cd5 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Tue, 28 Jul 2020 12:01:40 +0200 Subject: md/raid5: remove the redundant setting of STRIPE_HANDLE The flag is already set before compare rcw with rmw, so it is not necessary to do it again. 
Signed-off-by: Guoqing Jiang Signed-off-by: Song Liu --- drivers/md/raid5.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index a6ff6e1e039b..790d91aa5f40 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -3995,10 +3995,8 @@ static int handle_stripe_dirtying(struct r5conf *conf, set_bit(R5_LOCKED, &dev->flags); set_bit(R5_Wantread, &dev->flags); s->locked++; - } else { + } else set_bit(STRIPE_DELAYED, &sh->state); - set_bit(STRIPE_HANDLE, &sh->state); - } } } } @@ -4023,10 +4021,8 @@ static int handle_stripe_dirtying(struct r5conf *conf, set_bit(R5_Wantread, &dev->flags); s->locked++; qread++; - } else { + } else set_bit(STRIPE_DELAYED, &sh->state); - set_bit(STRIPE_HANDLE, &sh->state); - } } } if (rcw && conf->mddev->queue) -- cgit v1.2.3 From b3db8a216393cc228ab4add580f488da55177b66 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Tue, 28 Jul 2020 12:01:41 +0200 Subject: md: print errno in super_written It is better to print errno instead of bi_status. Signed-off-by: Guoqing Jiang Signed-off-by: Song Liu --- drivers/md/md.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 5116d29b74e1..9c69084cae73 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -978,7 +978,8 @@ static void super_written(struct bio *bio) struct mddev *mddev = rdev->mddev; if (bio->bi_status) { - pr_err("md: super_written gets error=%d\n", bio->bi_status); + pr_err("md: %s gets error=%d\n", __func__, + blk_status_to_errno(bio->bi_status)); md_error(mddev, rdev); if (!test_bit(Faulty, &rdev->flags) && (bio->bi_opf & MD_FAILFAST)) { -- cgit v1.2.3 From 01b5d32a57fe83820d014485a4c41965bc3b5ce4 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Tue, 28 Jul 2020 12:01:42 +0200 Subject: raid5-cache: hold spinlock instead of mutex in r5c_journal_mode_show Replace mddev_lock with spin_lock to align with other show methods in raid5_attrs. 
Signed-off-by: Guoqing Jiang Signed-off-by: Song Liu --- drivers/md/raid5-cache.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 82eb4a906e31..4337ae0e6af2 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -2537,13 +2537,10 @@ static ssize_t r5c_journal_mode_show(struct mddev *mddev, char *page) struct r5conf *conf; int ret; - ret = mddev_lock(mddev); - if (ret) - return ret; - + spin_lock(&mddev->lock); conf = mddev->private; if (!conf || !conf->log) { - mddev_unlock(mddev); + spin_unlock(&mddev->lock); return 0; } @@ -2563,7 +2560,7 @@ static ssize_t r5c_journal_mode_show(struct mddev *mddev, char *page) default: ret = 0; } - mddev_unlock(mddev); + spin_unlock(&mddev->lock); return ret; } -- cgit v1.2.3 From 3a31cf3d210f85d57fd302d83514832ebacb3ca7 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Tue, 28 Jul 2020 12:01:43 +0200 Subject: raid5: don't duplicate code for different paths in handle_stripe As we can see, R5_LOCKED is set and s.locked is increased whether R5_ReWrite is set or not, so move it to common path. 
Signed-off-by: Guoqing Jiang Signed-off-by: Song Liu --- drivers/md/raid5.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 790d91aa5f40..b06edfaa73b0 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -4966,14 +4966,11 @@ static void handle_stripe(struct stripe_head *sh) if (!test_bit(R5_ReWrite, &dev->flags)) { set_bit(R5_Wantwrite, &dev->flags); set_bit(R5_ReWrite, &dev->flags); - set_bit(R5_LOCKED, &dev->flags); - s.locked++; - } else { + } else /* let's read it back */ set_bit(R5_Wantread, &dev->flags); - set_bit(R5_LOCKED, &dev->flags); - s.locked++; - } + set_bit(R5_LOCKED, &dev->flags); + s.locked++; } } -- cgit v1.2.3 From a1c6ae3d9f3dd6aa5981a332a6f700cf1c25edef Mon Sep 17 00:00:00 2001 From: ChangSyun Peng Date: Fri, 31 Jul 2020 17:50:17 +0800 Subject: md/raid5: Fix Force reconstruct-write io stuck in degraded raid5 In degraded raid5, we need to read parity to do reconstruct-write when data disks fail. However, we can not read parity from handle_stripe_dirtying() in force reconstruct-write mode. Reproducible Steps: 1. Create degraded raid5 mdadm -C /dev/md2 --assume-clean -l5 -n3 /dev/sda2 /dev/sdb2 missing 2. Set rmw_level to 0 echo 0 > /sys/block/md2/md/rmw_level 3. IO to raid5 Now some io may be stuck in raid5. We can use handle_stripe_fill() to read the parity in this situation. Cc: # v4.4+ Reviewed-by: Alex Wu Reviewed-by: BingJing Chang Reviewed-by: Danny Shih Signed-off-by: ChangSyun Peng Signed-off-by: Song Liu --- drivers/md/raid5.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index b06edfaa73b0..9aab66fd1003 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -3626,6 +3626,7 @@ static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s, * is missing/faulty, then we need to read everything we can. 
*/ if (sh->raid_conf->level != 6 && + sh->raid_conf->rmw_level != PARITY_DISABLE_RMW && sh->sector < sh->raid_conf->mddev->recovery_cp) /* reconstruct-write isn't being forced */ return 0; @@ -4862,7 +4863,7 @@ static void handle_stripe(struct stripe_head *sh) * or to load a block that is being partially written. */ if (s.to_read || s.non_overwrite - || (conf->level == 6 && s.to_write && s.failed) + || (s.to_write && s.failed) || (s.syncing && (s.uptodate + s.compute < disks)) || s.replacing || s.expanding) -- cgit v1.2.3 From 45a4d8fd6c7926e7991a1b29233d725fe12935da Mon Sep 17 00:00:00 2001 From: ChangSyun Peng Date: Fri, 31 Jul 2020 17:50:31 +0800 Subject: md/raid5: Allow degraded raid6 to do rmw Degraded raid6 always do reconstruct-write now. With raid6 xor supported, we can do rmw in degraded raid6. This patch can reduce many read IOs to improve performance. If the failed disk is P, Q or the disk we want to write to, we may need to do reconstruct-write in max degraded raid6. In this situation we can not read enough data from handle_stripe_dirtying() so we have to set force_rcw in handle_stripe_fill() to read all data. Reviewed-by: Alex Wu Reviewed-by: BingJing Chang Reviewed-by: Danny Shih Signed-off-by: ChangSyun Peng Signed-off-by: Song Liu --- drivers/md/raid5.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 9aab66fd1003..657634a7e8d1 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -3557,6 +3557,7 @@ static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s, struct r5dev *fdev[2] = { &sh->dev[s->failed_num[0]], &sh->dev[s->failed_num[1]] }; int i; + bool force_rcw = (sh->raid_conf->rmw_level == PARITY_DISABLE_RMW); if (test_bit(R5_LOCKED, &dev->flags) || @@ -3615,18 +3616,27 @@ static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s, * devices must be read. 
*/ return 1; + + if (s->failed >= 2 && + (fdev[i]->towrite || + s->failed_num[i] == sh->pd_idx || + s->failed_num[i] == sh->qd_idx) && + !test_bit(R5_UPTODATE, &fdev[i]->flags)) + /* In max degraded raid6, if the failed disk is P, Q, + * or a disk we want to write to, we need to do + * reconstruct-write. + */ + force_rcw = true; } - /* If we are forced to do a reconstruct-write, either because - * the current RAID6 implementation only supports that, or - * because parity cannot be trusted and we are currently - * recovering it, there is extra need to be careful. + /* If we are forced to do a reconstruct-write, because parity + * cannot be trusted and we are currently recovering it, there + * is extra need to be careful. * If one of the devices that we would need to read, because * it is not being overwritten (and maybe not written at all) * is missing/faulty, then we need to read everything we can. */ - if (sh->raid_conf->level != 6 && - sh->raid_conf->rmw_level != PARITY_DISABLE_RMW && + if (!force_rcw && sh->sector < sh->raid_conf->mddev->recovery_cp) /* reconstruct-write isn't being forced */ return 0; -- cgit v1.2.3