From e3f948cd3283e4fbe5907f1f3967c839912f480e Mon Sep 17 00:00:00 2001
From: Shaohua Li
Date: Thu, 6 Oct 2016 14:09:16 -0700
Subject: RAID1: ignore discard error

If a write error occurs, raid1 will try to rewrite the bio in smaller
chunks. If the rewrite fails, raid1 records the error as a bad block.
narrow_write_error always uses WRITE for the bio, but the bio could
actually be a discard. Since a discard bio has no payload, writing it
out causes various problems. A discard error isn't fatal, though, so we
can safely ignore it, which is what this patch does.

This issue has existed since discard support was added, but it was only
exposed by the recent arbitrary bio size feature.

Reported-and-tested-by: Sitsofe Wheeler
Cc: stable@vger.kernel.org (v3.6)
Signed-off-by: Shaohua Li
---
 drivers/md/raid1.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 1961d827dbd1..db536a68b2ee 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -403,11 +403,14 @@ static void raid1_end_write_request(struct bio *bio)
 	struct bio *to_put = NULL;
 	int mirror = find_bio_disk(r1_bio, bio);
 	struct md_rdev *rdev = conf->mirrors[mirror].rdev;
+	bool discard_error;
+
+	discard_error = bio->bi_error && bio_op(bio) == REQ_OP_DISCARD;

 	/*
 	 * 'one mirror IO has finished' event handler:
 	 */
-	if (bio->bi_error) {
+	if (bio->bi_error && !discard_error) {
 		set_bit(WriteErrorSeen, &rdev->flags);
 		if (!test_and_set_bit(WantReplacement, &rdev->flags))
 			set_bit(MD_RECOVERY_NEEDED, &
@@ -444,7 +447,7 @@ static void raid1_end_write_request(struct bio *bio)

 		/* Maybe we can clear some bad blocks. */
 		if (is_badblock(rdev, r1_bio->sector, r1_bio->sectors,
-				&first_bad, &bad_sectors)) {
+				&first_bad, &bad_sectors) && !discard_error) {
 			r1_bio->bios[mirror] = IO_MADE_GOOD;
 			set_bit(R1BIO_MadeGood, &r1_bio->state);
 		}
--
cgit v1.2.3

From 579ed34f7b751b8add233cba4cf755258dbdd60a Mon Sep 17 00:00:00 2001
From: Shaohua Li
Date: Thu, 6 Oct 2016 14:13:52 -0700
Subject: RAID10: ignore discard error

This is the counterpart of the raid1 fix. If a write error occurs,
raid10 will try to rewrite the bio in smaller chunks. If the rewrite
fails, raid10 records the error as a bad block. narrow_write_error
always uses WRITE for the bio, but the bio could actually be a discard.
Since a discard bio has no payload, writing it out causes various
problems. A discard error isn't fatal, though, so we can safely ignore
it, which is what this patch does.

This issue has existed since discard support was added, but it was only
exposed by the recent arbitrary bio size feature.

Cc: Sitsofe Wheeler
Cc: stable@vger.kernel.org (v3.6)
Signed-off-by: Shaohua Li
---
 drivers/md/raid10.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index be1a9fca3b2d..39fddda2fef2 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -447,6 +447,9 @@ static void raid10_end_write_request(struct bio *bio)
 	struct r10conf *conf = r10_bio->mddev->private;
 	int slot, repl;
 	struct md_rdev *rdev = NULL;
+	bool discard_error;
+
+	discard_error = bio->bi_error && bio_op(bio) == REQ_OP_DISCARD;

 	dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl);

@@ -460,7 +463,7 @@ static void raid10_end_write_request(struct bio *bio)
 	/*
 	 * this branch is our 'one mirror IO has finished' event handler:
 	 */
-	if (bio->bi_error) {
+	if (bio->bi_error && !discard_error) {
 		if (repl)
 			/* Never record new bad blocks to replacement,
 			 * just fail it.
 			 */
@@ -503,7 +506,7 @@ static void raid10_end_write_request(struct bio *bio)

 		if (is_badblock(rdev,
 				r10_bio->devs[slot].addr,
 				r10_bio->sectors,
-				&first_bad, &bad_sectors)) {
+				&first_bad, &bad_sectors) && !discard_error) {
 			bio_put(bio);
 			if (repl)
 				r10_bio->devs[slot].repl_bio = IO_MADE_GOOD;
--
cgit v1.2.3
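Both patches hinge on the same two-line classification at the top of the
write-completion handler. The following is a minimal user-space sketch of
that pattern; the struct and enum are simplified stand-ins for the kernel's
bio API, not the real thing:

	/* Sketch of the error classification both patches add; the types
	 * are simplified stand-ins for the kernel's bio, not the real API. */
	#include <stdbool.h>
	#include <stdio.h>

	enum bio_op { OP_WRITE, OP_DISCARD };

	struct bio {
		int bi_error;		/* nonzero on failure */
		enum bio_op op;		/* what bio_op(bio) would return */
	};

	/* A failed discard is ignored: it carries no payload, so it cannot
	 * be retried in smaller chunks the way narrow_write_error retries
	 * writes. */
	static bool record_write_error(const struct bio *bio)
	{
		bool discard_error = bio->bi_error && bio->op == OP_DISCARD;

		return bio->bi_error && !discard_error;
	}

	int main(void)
	{
		struct bio failed_write = { .bi_error = -5, .op = OP_WRITE };
		struct bio failed_discard = { .bi_error = -5, .op = OP_DISCARD };

		printf("failed write recorded: %d\n",
		       record_write_error(&failed_write));	/* 1 */
		printf("failed discard recorded: %d\n",
		       record_write_error(&failed_discard));	/* 0 */
		return 0;
	}

Classifying once at the top lets every later error check in the handler
reuse the same flag, which is how both hunks are structured.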
From 28cd88e2b4c54a466dcae7eea1efac766d42386b Mon Sep 17 00:00:00 2001
From: Zhengyuan Liu
Date: Mon, 24 Oct 2016 09:55:20 +0800
Subject: md/raid5: initialize next_checkpoint field before use

This field was never initialized when we load/recover the log; it was
only assigned once I/O to a raid disk had finished. As a result,
r5l_quiesce may use a wrong next_checkpoint to reclaim log space, which
confuses the reclaimable space calculation.

Signed-off-by: Zhengyuan Liu
Signed-off-by: Shaohua Li
---
 drivers/md/raid5-cache.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'drivers')

diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 1b1ab4a1d132..998ea0025dd0 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -1096,6 +1096,8 @@ static int r5l_recovery_log(struct r5l_log *log)
 		log->seq = ctx.seq + 11;
 		log->log_start = r5l_ring_add(log, ctx.pos, BLOCK_SECTORS);
 		r5l_write_super(log, ctx.pos);
+		log->last_checkpoint = ctx.pos;
+		log->next_checkpoint = ctx.pos;
 	} else {
 		log->log_start = ctx.pos;
 		log->seq = ctx.seq;
@@ -1168,6 +1170,7 @@ create:
 	if (log->max_free_space > RECLAIM_MAX_FREE_SPACE)
 		log->max_free_space = RECLAIM_MAX_FREE_SPACE;
 	log->last_checkpoint = cp;
+	log->next_checkpoint = cp;

 	__free_page(page);
--
cgit v1.2.3

From 56056c2e7d58ee705755efbe780aefff987a1dc8 Mon Sep 17 00:00:00 2001
From: Zhengyuan Liu
Date: Mon, 24 Oct 2016 16:15:59 +0800
Subject: md/raid5: write an empty meta-block when creating log super-block

If the superblock points to an invalid meta block, r5l_load_log will
set create_super to true and create a new superblock. This path is
always taken if no write I/O has been issued to the array since it was
created. Writing an empty meta block when the log superblock is first
created avoids this unnecessary action.

Another reason is the correctness of log recovery. Currently we have
the code below to guarantee that log recovery is correct:

	if (ctx.seq > log->last_cp_seq + 1) {
		int ret;

		ret = r5l_log_write_empty_meta_block(log, ctx.pos, ctx.seq + 10);
		if (ret)
			return ret;
		log->seq = ctx.seq + 11;
		log->log_start = r5l_ring_add(log, ctx.pos, BLOCK_SECTORS);
		r5l_write_super(log, ctx.pos);
	} else {
		log->log_start = ctx.pos;
		log->seq = ctx.seq;
	}

Suppose we just created an array with a journal device, so
log->log_start and log->last_checkpoint are both 0, and we then write
three meta blocks of which all but the middle one are valid before a
crash happens. After recovery, ctx.seq would equal log->last_cp_seq + 1
and log->log_start would be set to the position of the invalid middle
meta block; this leads to problems that this patch avoids.

Signed-off-by: Zhengyuan Liu
Signed-off-by: Shaohua Li
---
 drivers/md/raid5-cache.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'drivers')

diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 998ea0025dd0..981f85515191 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -1156,6 +1156,7 @@ create:
 	if (create_super) {
 		log->last_cp_seq = prandom_u32();
 		cp = 0;
+		r5l_log_write_empty_meta_block(log, cp, log->last_cp_seq);
 		/*
 		 * Make sure super points to correct address. Log might have
 		 * data very soon. If super hasn't correct log tail address,
--
cgit v1.2.3
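The reclaim logic computes free space on the circular log from the
checkpoint position, so a next_checkpoint left at its zero initial value
skews the arithmetic. Below is a rough user-space model of that failure
mode, with made-up sizes and simplified fields; it illustrates the bug the
first patch fixes, and is not the kernel's reclaim code:

	/* Model of why next_checkpoint must be set alongside
	 * last_checkpoint; fields and numbers are invented. */
	#include <stdio.h>

	struct log {
		unsigned long device_size;	/* log ring size, sectors */
		unsigned long log_start;	/* where the next write goes */
		unsigned long next_checkpoint;	/* data before this is reclaimable */
	};

	/* ring distance from checkpoint to log head = space still in use */
	static unsigned long used_space(const struct log *log)
	{
		if (log->log_start >= log->next_checkpoint)
			return log->log_start - log->next_checkpoint;
		return log->device_size - (log->next_checkpoint - log->log_start);
	}

	int main(void)
	{
		/* recovery left log_start at 5000; checkpoint never set */
		struct log stale = { .device_size = 8192, .log_start = 5000,
				     .next_checkpoint = 0 };
		/* the patch: initialize next_checkpoint to the same position */
		struct log fixed = stale;
		fixed.next_checkpoint = 5000;

		printf("stale: %lu sectors look in use\n", used_space(&stale)); /* 5000 */
		printf("fixed: %lu sectors in use\n", used_space(&fixed));      /* 0 */
		return 0;
	}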
From 16f889499a5214ebe038f8bd00f4c0094ed0ed75 Mon Sep 17 00:00:00 2001
From: Tomasz Majchrzak
Date: Mon, 24 Oct 2016 12:47:28 +0200
Subject: md: report 'write_pending' state when array in sync

If there is a bad block on a disk and a recovery is performed from that
disk, the same bad block is reported for the new disk. This involves
setting the MD_CHANGE_PENDING flag in rdev_set_badblocks. For external
metadata this flag is never cleared because the array state is reported
as 'clean'. A read request to the bad block in a RAID5 array then gets
stuck waiting for the flag to be cleared - as per commit c3cce6cda162
("md/raid5: ensure device failure recorded before write request
returns.").

The meaning of the MD_CHANGE_PENDING and MD_CHANGE_CLEAN flags was
clarified in commit 070dc6dd7103 ("md: resolve confusion of
MD_CHANGE_CLEAN"); however, MD_CHANGE_PENDING has since been used in
personality error handlers in a way that doesn't fully match its
original purpose. It was supposed to signal that a write request is
about to start, but now it is also used to request a metadata update.

Originally (in md_allow_write and md_write_start) the MD_CHANGE_PENDING
flag was set and in_sync was set to 0 at the same time. The error
handlers just set the flag without modifying in_sync. The sysfs array
state is a single value, so it currently reports 'clean' when the
MD_CHANGE_PENDING flag is set and in_sync is 1, and userspace has no
idea it is expected to take any action.

Swap the order in which the array state is checked so 'write_pending'
is reported ahead of 'clean' ('write_pending' is a misleading name, but
it is too late to rename it now).

Signed-off-by: Tomasz Majchrzak
Signed-off-by: Shaohua Li
---
 drivers/md/md.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index eac84d8ff724..25b57a55db1a 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -3887,10 +3887,10 @@ array_state_show(struct mddev *mddev, char *page)
 			st = read_auto;
 		break;
 	case 0:
-		if (mddev->in_sync)
-			st = clean;
-		else if (test_bit(MD_CHANGE_PENDING, &mddev->flags))
+		if (test_bit(MD_CHANGE_PENDING, &mddev->flags))
 			st = write_pending;
+		else if (mddev->in_sync)
+			st = clean;
 		else if (mddev->safemode)
 			st = active_idle;
 		else
--
cgit v1.2.3

From 9a8b27fac5bbb77337cc2e5d31d37c9936782d87 Mon Sep 17 00:00:00 2001
From: Shaohua Li
Date: Thu, 27 Oct 2016 15:22:13 -0700
Subject: raid5-cache: correct condition for empty metadata write

As long as we have recovered at least one metadata block, we should
write the empty metadata block. The original code could corrupt
recovery if only one meta block is valid.

Reported-by: Zhengyuan Liu
Signed-off-by: Shaohua Li
---
 drivers/md/raid5-cache.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers')

diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 981f85515191..a227a9f3ee65 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -1087,7 +1087,7 @@ static int r5l_recovery_log(struct r5l_log *log)
 	 * 1's seq + 10 and let superblock points to meta2. The same recovery will
 	 * not think meta 3 is a valid meta, because its seq doesn't match
 	 */
-	if (ctx.seq > log->last_cp_seq + 1) {
+	if (ctx.seq > log->last_cp_seq) {
 		int ret;

 		ret = r5l_log_write_empty_meta_block(log, ctx.pos, ctx.seq + 10);
--
cgit v1.2.3
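The two raid5-cache recovery patches above both revolve around the
sequence-number check that decides whether a meta block belongs to the log.
Here is a toy model of that check, with an invented three-block layout
rather than the real on-disk format, showing why the tail is rewritten with
a bumped seq:

	/* Toy model of seq-based meta validation during log recovery;
	 * the layout and numbers are invented for illustration. */
	#include <stdbool.h>
	#include <stdio.h>

	struct meta { unsigned long long seq; bool valid; };

	/* Recovery accepts consecutive blocks only while each carries
	 * exactly the expected, incrementing sequence number. */
	static int count_recovered(const struct meta *ring, int n,
				   unsigned long long expect)
	{
		int recovered = 0;

		for (int i = 0; i < n; i++, expect++) {
			if (!ring[i].valid || ring[i].seq != expect)
				break;
			recovered++;
		}
		return recovered;
	}

	int main(void)
	{
		struct meta ring[] = {
			{ .seq = 41, .valid = true },	/* meta1 */
			{ .seq = 42, .valid = false },	/* meta2: bad checksum */
			{ .seq = 43, .valid = true },	/* meta3: stale, must not be reused */
		};

		/* recovery stops at the corrupt meta2 */
		printf("recovered: %d block(s)\n", count_recovered(ring, 3, 41));

		/* the fix: write an empty block with seq + 10 where recovery
		 * stopped, so the next recovery expects 51, then 52, and the
		 * stale meta3 (seq 43) can never match */
		ring[1] = (struct meta){ .seq = 41 + 10, .valid = true };
		printf("accepted from new tail: %d block(s)\n",
		       count_recovered(&ring[1], 2, 51));	/* 1: meta3 rejected */
		return 0;
	}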
From 7449f699b2fb23bdee0a0f03aa4efb5f96fd403f Mon Sep 17 00:00:00 2001
From: Tomasz Majchrzak
Date: Fri, 28 Oct 2016 14:45:58 +0200
Subject: raid1: handle read error also in readonly mode

If a write is the first operation on a disk and it happens not to be
aligned to the page size, the block layer sends a read request first.
If that read fails, the disk is marked failed, because no attempt to
fix the error is made while the array is in auto-readonly mode.
Similarly, the disk is marked failed for a read-only array.

Take the same approach as in raid10: don't fail the disk if the array
is in readonly or auto-readonly mode. Try to redirect the request first
and, if unsuccessful, return a read error.

Signed-off-by: Tomasz Majchrzak
Signed-off-by: Shaohua Li
---
 drivers/md/raid1.c | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index db536a68b2ee..29e2df5cd77b 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -2297,17 +2297,23 @@ static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio)
 	 * This is all done synchronously while the array is
 	 * frozen
 	 */
+
+	bio = r1_bio->bios[r1_bio->read_disk];
+	bdevname(bio->bi_bdev, b);
+	bio_put(bio);
+	r1_bio->bios[r1_bio->read_disk] = NULL;
+
 	if (mddev->ro == 0) {
 		freeze_array(conf, 1);
 		fix_read_error(conf, r1_bio->read_disk,
 			       r1_bio->sector, r1_bio->sectors);
 		unfreeze_array(conf);
-	} else
-		md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev);
+	} else {
+		r1_bio->bios[r1_bio->read_disk] = IO_BLOCKED;
+	}
+
 	rdev_dec_pending(conf->mirrors[r1_bio->read_disk].rdev, conf->mddev);

-	bio = r1_bio->bios[r1_bio->read_disk];
-	bdevname(bio->bi_bdev, b);
 read_more:
 	disk = read_balance(conf, r1_bio, &max_sectors);
 	if (disk == -1) {
@@ -2318,11 +2324,6 @@ read_more:
 	} else {
 		const unsigned long do_sync
			= r1_bio->master_bio->bi_opf & REQ_SYNC;
-		if (bio) {
-			r1_bio->bios[r1_bio->read_disk] =
-				mddev->ro ? IO_BLOCKED : NULL;
-			bio_put(bio);
-		}
 		r1_bio->read_disk = disk;
 		bio = bio_clone_mddev(r1_bio->master_bio, GFP_NOIO, mddev);
 		bio_trim(bio, r1_bio->sector - bio->bi_iter.bi_sector,
--
cgit v1.2.3
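After this change, a read failure on a read-only array blocks the failing
mirror for this request only and lets read_balance pick another one; the
device itself is not failed. A simplified sketch of that redirect flow
follows, where the mirror-state array and IO_BLOCKED value are stand-ins
for the r1bio bookkeeping, not the kernel structures:

	/* Simplified sketch of redirect-on-read-error for a read-only
	 * array; mirror state and IO_BLOCKED are stand-ins. */
	#include <stdio.h>

	#define MIRRORS 3
	#define IO_BLOCKED (-1)

	/* pick the first mirror not blocked for this request */
	static int read_balance(const int state[MIRRORS])
	{
		for (int i = 0; i < MIRRORS; i++)
			if (state[i] != IO_BLOCKED)
				return i;
		return -1;	/* none left: return a read error to the caller */
	}

	int main(void)
	{
		int state[MIRRORS] = { 0, 0, 0 };

		/* the read from mirror 0 failed; the array is read-only, so
		 * block that mirror for this request instead of failing the
		 * device */
		state[0] = IO_BLOCKED;

		int disk = read_balance(state);
		if (disk >= 0)
			printf("redirected to mirror %d\n", disk);
		else
			printf("read error returned to caller\n");
		return 0;
	}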
From 1217e1d1999ed6c9c1e1b1acae0a74ab70464ae2 Mon Sep 17 00:00:00 2001
From: NeilBrown
Date: Fri, 28 Oct 2016 15:59:41 +1100
Subject: md: be careful not to leak internal curr_resync value into metadata

mddev->curr_resync usually records where the current resync is up to,
but during the starting phase it has some "magic" values:

 1 - means that the array is trying to start a resync, but has yielded
     to another array which shares physical devices, and also needs to
     start a resync
 2 - means the array is trying to start a resync, but has found another
     array which shares physical devices and has already started a
     resync
 3 - means that resync has commenced, but it is possible that nothing
     has actually been resynced yet

It is important that this value not be visible to user-space, and
particularly that it doesn't get written to the metadata as the resync
or recovery checkpoint. In part, this is because it may be slightly
higher than the correct value, though this is very rare. In part, it is
because it is not a multiple of 4K, and some devices only support
4K-aligned accesses.

There are two places where this value is propagated into either
->curr_resync_completed or ->recovery_cp or ->recovery_offset. These
currently avoid propagating the values 1 and 2, but will allow 3 to
leak through. Change them to only propagate the value if it is > 3.

As this can cause an array to fail, the patch is suitable for -stable.

Cc: stable@vger.kernel.org (v3.7+)
Reported-by: Viswesh
Signed-off-by: NeilBrown
Signed-off-by: Shaohua Li
---
 drivers/md/md.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'drivers')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 25b57a55db1a..2089d46b0eb8 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -8144,14 +8144,14 @@ void md_do_sync(struct md_thread *thread)

 	if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
 	    !test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
-	    mddev->curr_resync > 2) {
+	    mddev->curr_resync > 3) {
 		mddev->curr_resync_completed = mddev->curr_resync;
 		sysfs_notify(&mddev->kobj, NULL, "sync_completed");
 	}
 	mddev->pers->sync_request(mddev, max_sectors, &skipped);

 	if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&
-	    mddev->curr_resync > 2) {
+	    mddev->curr_resync > 3) {
 		if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
 			if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
 				if (mddev->curr_resync >= mddev->recovery_cp) {
-- 
cgit v1.2.3
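The fix is a one-value change in a guard, but the guard encodes the magic
values listed in the message. A tiny self-contained illustration of the
off-by-one follows; checkpoint_is_real() is a made-up name for checks the
kernel performs inline in md_do_sync:

	/* Illustration of the guard change; checkpoint_is_real() is a
	 * made-up name for the inline checks in md_do_sync. */
	#include <stdio.h>

	/* curr_resync start-up magic: 1 = yielded to a peer array,
	 * 2 = peer already resyncing, 3 = commenced, nothing done yet */
	static int checkpoint_is_real(unsigned long long curr_resync)
	{
		return curr_resync > 3;	/* the old "> 2" let the magic 3 leak */
	}

	int main(void)
	{
		printf("curr_resync=3: propagate=%d\n",
		       checkpoint_is_real(3));		/* 0 */
		printf("curr_resync=4096: propagate=%d\n",
		       checkpoint_is_real(4096));	/* 1 */
		return 0;
	}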