From 535ae4eb1225f19e1d1848c65eafea8b7e9112f4 Mon Sep 17 00:00:00 2001
From: Shaohua Li <shli@fb.com>
Date: Wed, 15 Feb 2017 19:37:32 -0800
Subject: md/raid5: prioritize stripes for writeback

In raid5-cache writeback mode, we have two types of stripes to handle.
- stripes which aren't cached yet
- stripes which are cached and flushing out to raid disks

The upper layer is generally more sensitive to the latency of the first
type of stripes. But we only have one handle list for all these stripes,
where the two types of stripes are mixed together. When reclaim flushes a
lot of stripes, the first type of stripes could be noticeably delayed. On
the other hand, if the log space is tight, we'd like to handle the second
type of stripes faster and free log space.

This patch distinguishes the two types of stripes. They are added to
different handle lists. When we try to get a stripe to handle, we prefer
the first type of stripes unless log space is tight.

This should have no impact on the !writeback case.

Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/raid5.c | 48 +++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 39 insertions(+), 9 deletions(-)

(limited to 'drivers/md/raid5.c')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index ed5cd705b985..5a28bd9b5b5f 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -176,6 +176,13 @@ static int stripe_operations_active(struct stripe_head *sh)
 	       test_bit(STRIPE_COMPUTE_RUN, &sh->state);
 }
 
+static bool stripe_is_lowprio(struct stripe_head *sh)
+{
+	return (test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state) ||
+		test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) &&
+	       !test_bit(STRIPE_R5C_CACHING, &sh->state);
+}
+
 static void raid5_wakeup_stripe_thread(struct stripe_head *sh)
 {
 	struct r5conf *conf = sh->raid_conf;
@@ -191,7 +198,10 @@ static void raid5_wakeup_stripe_thread(struct stripe_head *sh)
 	if (list_empty(&sh->lru)) {
 		struct r5worker_group *group;
 		group = conf->worker_groups + cpu_to_group(cpu);
-		list_add_tail(&sh->lru, &group->handle_list);
+		if (stripe_is_lowprio(sh))
+			list_add_tail(&sh->lru, &group->loprio_list);
+		else
+			list_add_tail(&sh->lru, &group->handle_list);
 		group->stripes_cnt++;
 		sh->group = group;
 	}
@@ -254,7 +264,12 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
 			clear_bit(STRIPE_DELAYED, &sh->state);
 			clear_bit(STRIPE_BIT_DELAY, &sh->state);
 			if (conf->worker_cnt_per_group == 0) {
-				list_add_tail(&sh->lru, &conf->handle_list);
+				if (stripe_is_lowprio(sh))
+					list_add_tail(&sh->lru,
+							&conf->loprio_list);
+				else
+					list_add_tail(&sh->lru,
+							&conf->handle_list);
 			} else {
 				raid5_wakeup_stripe_thread(sh);
 				return;
@@ -5172,19 +5187,27 @@ static struct bio *chunk_aligned_read(struct mddev *mddev, struct bio *raid_bio)
  */
 static struct stripe_head *__get_priority_stripe(struct r5conf *conf, int group)
 {
-	struct stripe_head *sh = NULL, *tmp;
+	struct stripe_head *sh, *tmp;
 	struct list_head *handle_list = NULL;
-	struct r5worker_group *wg = NULL;
+	struct r5worker_group *wg;
+	bool second_try = !r5c_is_writeback(conf->log);
+	bool try_loprio = test_bit(R5C_LOG_TIGHT, &conf->cache_state);
 
+again:
+	wg = NULL;
+	sh = NULL;
 	if (conf->worker_cnt_per_group == 0) {
-		handle_list = &conf->handle_list;
+		handle_list = try_loprio ? &conf->loprio_list :
+					&conf->handle_list;
 	} else if (group != ANY_GROUP) {
-		handle_list = &conf->worker_groups[group].handle_list;
+		handle_list = try_loprio ? &conf->worker_groups[group].loprio_list :
+				&conf->worker_groups[group].handle_list;
 		wg = &conf->worker_groups[group];
 	} else {
 		int i;
 		for (i = 0; i < conf->group_cnt; i++) {
-			handle_list = &conf->worker_groups[i].handle_list;
+			handle_list = try_loprio ? &conf->worker_groups[i].loprio_list :
+				&conf->worker_groups[i].handle_list;
 			wg = &conf->worker_groups[i];
 			if (!list_empty(handle_list))
 				break;
@@ -5235,8 +5258,13 @@ static struct stripe_head *__get_priority_stripe(struct r5conf *conf, int group)
 		wg = NULL;
 	}
 
-	if (!sh)
-		return NULL;
+	if (!sh) {
+		if (second_try)
+			return NULL;
+		second_try = true;
+		try_loprio = !try_loprio;
+		goto again;
+	}
 
 	if (wg) {
 		wg->stripes_cnt--;
@@ -6546,6 +6574,7 @@ static int alloc_thread_groups(struct r5conf *conf, int cnt,
 
 		group = &(*worker_groups)[i];
 		INIT_LIST_HEAD(&group->handle_list);
+		INIT_LIST_HEAD(&group->loprio_list);
 		group->conf = conf;
 		group->workers = workers + i * cnt;
 
@@ -6773,6 +6802,7 @@ static struct r5conf *setup_conf(struct mddev *mddev)
 	init_waitqueue_head(&conf->wait_for_stripe);
 	init_waitqueue_head(&conf->wait_for_overlap);
 	INIT_LIST_HEAD(&conf->handle_list);
+	INIT_LIST_HEAD(&conf->loprio_list);
 	INIT_LIST_HEAD(&conf->hold_list);
 	INIT_LIST_HEAD(&conf->delayed_list);
 	INIT_LIST_HEAD(&conf->bitmap_list);
-- 
cgit v1.2.3


From aaf9f12ebfafd1ea603d61ead6dbcf456a86e0f3 Mon Sep 17 00:00:00 2001
From: Shaohua Li <shli@fb.com>
Date: Fri, 3 Mar 2017 22:06:12 -0800
Subject: md/raid5: sort bios

A previous patch (raid5: only dispatch IO from raid5d for harddisk raid)
defers IO dispatching. The goal is to create a better IO pattern. At that
time, we didn't sort the deferred IO and hoped the block layer could do
the IO merging and sorting. Now the raid5-cache writeback can create a
large number of bios. And if we enable multi-threading for stripe
handling, we can't control when IO is dispatched to the raid disks. Much
of the time, we are dispatching IO which the block layer can't merge
effectively.

This patch takes the deferral of IO dispatching further. We accumulate
bios, but we don't dispatch all of them once a threshold is met. This
'dispatch a partial portion of the bios' strategy allows bios arriving
within a large time window to be sent to disks together. At dispatch
time, there is a good chance the block layer can merge the bios. To make
this more effective, we dispatch IO in ascending order of stripe sector.
This increases the chance of request merging and reduces disk seeks.

Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/raid5.c | 138 +++++++++++++++++++++++++++++++++++++++++++----------
 drivers/md/raid5.h |  14 +++++-
 2 files changed, 126 insertions(+), 26 deletions(-)

(limited to 'drivers/md/raid5.c')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 5a28bd9b5b5f..013398ce2080 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -58,6 +58,7 @@
 #include <linux/sched/signal.h>
 
 #include <trace/events/block.h>
+#include <linux/list_sort.h>
 
 #include "md.h"
 #include "raid5.h"
@@ -878,41 +879,107 @@ static int use_new_offset(struct r5conf *conf, struct stripe_head *sh)
 	return 1;
 }
 
-static void flush_deferred_bios(struct r5conf *conf)
+static void dispatch_bio_list(struct bio_list *tmp)
 {
-	struct bio_list tmp;
 	struct bio *bio;
 
-	if (!conf->batch_bio_dispatch || !conf->group_cnt)
+	while ((bio = bio_list_pop(tmp)))
+		generic_make_request(bio);
+}
+
+static int cmp_stripe(void *priv, struct list_head *a, struct list_head *b)
+{
+	const struct r5pending_data *da = list_entry(a,
+				struct r5pending_data, sibling);
+	const struct r5pending_data *db = list_entry(b,
+				struct r5pending_data, sibling);
+	if (da->sector > db->sector)
+		return 1;
+	if (da->sector < db->sector)
+		return -1;
+	return 0;
+}
+
+static void dispatch_defer_bios(struct r5conf *conf, int target,
+				struct bio_list *list)
+{
+	struct r5pending_data *data;
+	struct list_head *first, *next = NULL;
+	int cnt = 0;
+
+	if (conf->pending_data_cnt == 0)
+		return;
+
+	list_sort(NULL, &conf->pending_list, cmp_stripe);
+
+	first = conf->pending_list.next;
+
+	/* temporarily move the head */
+	if (conf->next_pending_data)
+		list_move_tail(&conf->pending_list,
+				&conf->next_pending_data->sibling);
+
+	while (!list_empty(&conf->pending_list)) {
+		data = list_first_entry(&conf->pending_list,
+			struct r5pending_data, sibling);
+		if (&data->sibling == first)
+			first = data->sibling.next;
+		next = data->sibling.next;
+
+		bio_list_merge(list, &data->bios);
+		list_move(&data->sibling, &conf->free_list);
+		cnt++;
+		if (cnt >= target)
+			break;
+	}
+	conf->pending_data_cnt -= cnt;
+	BUG_ON(conf->pending_data_cnt < 0 || cnt < target);
+
+	if (next != &conf->pending_list)
+		conf->next_pending_data = list_entry(next,
+				struct r5pending_data, sibling);
+	else
+		conf->next_pending_data = NULL;
+	/* list isn't empty */
+	if (first != &conf->pending_list)
+		list_move_tail(&conf->pending_list, first);
+}
+
+static void flush_deferred_bios(struct r5conf *conf)
+{
+	struct bio_list tmp = BIO_EMPTY_LIST;
+
+	if (conf->pending_data_cnt == 0)
 		return;
 
-	bio_list_init(&tmp);
 	spin_lock(&conf->pending_bios_lock);
-	bio_list_merge(&tmp, &conf->pending_bios);
-	bio_list_init(&conf->pending_bios);
+	dispatch_defer_bios(conf, conf->pending_data_cnt, &tmp);
+	BUG_ON(conf->pending_data_cnt != 0);
 	spin_unlock(&conf->pending_bios_lock);
 
-	while ((bio = bio_list_pop(&tmp)))
-		generic_make_request(bio);
+	dispatch_bio_list(&tmp);
 }
 
-static void defer_bio_issue(struct r5conf *conf, struct bio *bio)
+static void defer_issue_bios(struct r5conf *conf, sector_t sector,
+				struct bio_list *bios)
 {
-	/*
-	 * change group_cnt will drain all bios, so this is safe
-	 *
-	 * A read generally means a read-modify-write, which usually means a
-	 * randwrite, so we don't delay it
-	 */
-	if (!conf->batch_bio_dispatch || !conf->group_cnt ||
-	    bio_op(bio) == REQ_OP_READ) {
-		generic_make_request(bio);
-		return;
-	}
+	struct bio_list tmp = BIO_EMPTY_LIST;
+	struct r5pending_data *ent;
+
 	spin_lock(&conf->pending_bios_lock);
-	bio_list_add(&conf->pending_bios, bio);
+	ent = list_first_entry(&conf->free_list, struct r5pending_data,
+							sibling);
+	list_move_tail(&ent->sibling, &conf->pending_list);
+	ent->sector = sector;
+	bio_list_init(&ent->bios);
+	bio_list_merge(&ent->bios, bios);
+	conf->pending_data_cnt++;
+	if (conf->pending_data_cnt >= PENDING_IO_MAX)
+		dispatch_defer_bios(conf, PENDING_IO_ONE_FLUSH, &tmp);
+
 	spin_unlock(&conf->pending_bios_lock);
-	md_wakeup_thread(conf->mddev->thread);
+
+	dispatch_bio_list(&tmp);
 }
 
 static void
@@ -925,6 +992,8 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
 	struct r5conf *conf = sh->raid_conf;
 	int i, disks = sh->disks;
 	struct stripe_head *head_sh = sh;
+	struct bio_list pending_bios = BIO_EMPTY_LIST;
+	bool should_defer;
 
 	might_sleep();
 
@@ -941,6 +1010,8 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
 		}
 	}
 
+	should_defer = conf->batch_bio_dispatch && conf->group_cnt;
+
 	for (i = disks; i--; ) {
 		int op, op_flags = 0;
 		int replace_only = 0;
@@ -1095,7 +1166,10 @@ again:
 				trace_block_bio_remap(bdev_get_queue(bi->bi_bdev),
 						      bi, disk_devt(conf->mddev->gendisk),
 						      sh->dev[i].sector);
-			defer_bio_issue(conf, bi);
+			if (should_defer && op_is_write(op))
+				bio_list_add(&pending_bios, bi);
+			else
+				generic_make_request(bi);
 		}
 		if (rrdev) {
 			if (s->syncing || s->expanding || s->expanded
@@ -1140,7 +1214,10 @@ again:
 				trace_block_bio_remap(bdev_get_queue(rbi->bi_bdev),
 						      rbi, disk_devt(conf->mddev->gendisk),
 						      sh->dev[i].sector);
-			defer_bio_issue(conf, rbi);
+			if (should_defer && op_is_write(op))
+				bio_list_add(&pending_bios, rbi);
+			else
+				generic_make_request(rbi);
 		}
 		if (!rdev && !rrdev) {
 			if (op_is_write(op))
@@ -1158,6 +1235,9 @@ again:
 		if (sh != head_sh)
 			goto again;
 	}
+
+	if (should_defer && !bio_list_empty(&pending_bios))
+		defer_issue_bios(conf, head_sh->sector, &pending_bios);
 }
 
 static struct dma_async_tx_descriptor *
@@ -6678,6 +6758,7 @@ static void free_conf(struct r5conf *conf)
 			put_page(conf->disks[i].extra_page);
 	kfree(conf->disks);
 	kfree(conf->stripe_hashtbl);
+	kfree(conf->pending_data);
 	kfree(conf);
 }
 
@@ -6787,6 +6868,14 @@ static struct r5conf *setup_conf(struct mddev *mddev)
 	conf = kzalloc(sizeof(struct r5conf), GFP_KERNEL);
 	if (conf == NULL)
 		goto abort;
+	INIT_LIST_HEAD(&conf->free_list);
+	INIT_LIST_HEAD(&conf->pending_list);
+	conf->pending_data = kzalloc(sizeof(struct r5pending_data) *
+		PENDING_IO_MAX, GFP_KERNEL);
+	if (!conf->pending_data)
+		goto abort;
+	for (i = 0; i < PENDING_IO_MAX; i++)
+		list_add(&conf->pending_data[i].sibling, &conf->free_list);
 	/* Don't enable multi-threading by default*/
 	if (!alloc_thread_groups(conf, 0, &group_cnt, &worker_cnt_per_group,
 				 &new_group)) {
@@ -6811,7 +6900,6 @@ static struct r5conf *setup_conf(struct mddev *mddev)
 	atomic_set(&conf->active_stripes, 0);
 	atomic_set(&conf->preread_active_stripes, 0);
 	atomic_set(&conf->active_aligned_reads, 0);
-	bio_list_init(&conf->pending_bios);
 	spin_lock_init(&conf->pending_bios_lock);
 	conf->batch_bio_dispatch = true;
 	rdev_for_each(rdev, mddev) {
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 6b9d2e839e6d..985cdc4850c2 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -572,6 +572,14 @@ enum r5_cache_state {
 				 */
 };
 
+#define PENDING_IO_MAX 512
+#define PENDING_IO_ONE_FLUSH 128
+struct r5pending_data {
+	struct list_head sibling;
+	sector_t sector; /* stripe sector */
+	struct bio_list bios;
+};
+
 struct r5conf {
 	struct hlist_head	*stripe_hashtbl;
 	/* only protect corresponding hash list and inactive_list */
@@ -689,9 +697,13 @@ struct r5conf {
 	int			worker_cnt_per_group;
 	struct r5l_log		*log;
 
-	struct bio_list		pending_bios;
 	spinlock_t		pending_bios_lock;
 	bool			batch_bio_dispatch;
+	struct r5pending_data	*pending_data;
+	struct list_head	free_list;
+	struct list_head	pending_list;
+	int			pending_data_cnt;
+	struct r5pending_data	*next_pending_data;
 };
 
 
-- 
cgit v1.2.3


From ff875738edd44e3bc892d378deacc50bccc9d70c Mon Sep 17 00:00:00 2001
From: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
Date: Thu, 9 Mar 2017 09:59:58 +0100
Subject: raid5: separate header for log functions

Move raid5-cache declarations from raid5.h to raid5-log.h, add inline
wrappers for functions which will be shared with ppl and use them in
raid5 core instead of direct calls to raid5-cache.

Remove unused parameter from r5c_cache_data(), move two duplicated
pr_debug() calls to r5l_init_log().

Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/raid5-cache.c | 17 +++++++---
 drivers/md/raid5-log.h   | 81 ++++++++++++++++++++++++++++++++++++++++++++++++
 drivers/md/raid5.c       | 48 ++++++++--------------------
 drivers/md/raid5.h       | 30 ------------------
 4 files changed, 107 insertions(+), 69 deletions(-)
 create mode 100644 drivers/md/raid5-log.h

(limited to 'drivers/md/raid5.c')

diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 5c8640c86b90..5f82dabdda6f 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -344,6 +344,8 @@ void r5c_handle_cached_data_endio(struct r5conf *conf,
 	}
 }
 
+void r5l_wake_reclaim(struct r5l_log *log, sector_t space);
+
 /* Check whether we should flush some stripes to free up stripe cache */
 void r5c_check_stripe_cache_usage(struct r5conf *conf)
 {
@@ -2749,9 +2751,7 @@ void r5c_finish_stripe_write_out(struct r5conf *conf,
 	}
 }
 
-int
-r5c_cache_data(struct r5l_log *log, struct stripe_head *sh,
-	       struct stripe_head_state *s)
+int r5c_cache_data(struct r5l_log *log, struct stripe_head *sh)
 {
 	struct r5conf *conf = sh->raid_conf;
 	int pages = 0;
@@ -2914,6 +2914,10 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
 {
 	struct request_queue *q = bdev_get_queue(rdev->bdev);
 	struct r5l_log *log;
+	char b[BDEVNAME_SIZE];
+
+	pr_debug("md/raid:%s: using device %s as journal\n",
+		 mdname(conf->mddev), bdevname(rdev->bdev, b));
 
 	if (PAGE_SIZE != 4096)
 		return -EINVAL;
@@ -3016,8 +3020,13 @@ io_kc:
 	return -EINVAL;
 }
 
-void r5l_exit_log(struct r5l_log *log)
+void r5l_exit_log(struct r5conf *conf)
 {
+	struct r5l_log *log = conf->log;
+
+	conf->log = NULL;
+	synchronize_rcu();
+
 	flush_work(&log->disable_writeback_work);
 	md_unregister_thread(&log->reclaim_thread);
 	mempool_destroy(log->meta_pool);
diff --git a/drivers/md/raid5-log.h b/drivers/md/raid5-log.h
new file mode 100644
index 000000000000..2da4bd3bbd79
--- /dev/null
+++ b/drivers/md/raid5-log.h
@@ -0,0 +1,81 @@
+#ifndef _RAID5_LOG_H
+#define _RAID5_LOG_H
+
+extern int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev);
+extern void r5l_exit_log(struct r5conf *conf);
+extern int r5l_write_stripe(struct r5l_log *log, struct stripe_head *head_sh);
+extern void r5l_write_stripe_run(struct r5l_log *log);
+extern void r5l_flush_stripe_to_raid(struct r5l_log *log);
+extern void r5l_stripe_write_finished(struct stripe_head *sh);
+extern int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio);
+extern void r5l_quiesce(struct r5l_log *log, int state);
+extern bool r5l_log_disk_error(struct r5conf *conf);
+extern bool r5c_is_writeback(struct r5l_log *log);
+extern int
+r5c_try_caching_write(struct r5conf *conf, struct stripe_head *sh,
+		      struct stripe_head_state *s, int disks);
+extern void
+r5c_finish_stripe_write_out(struct r5conf *conf, struct stripe_head *sh,
+			    struct stripe_head_state *s);
+extern void r5c_release_extra_page(struct stripe_head *sh);
+extern void r5c_use_extra_page(struct stripe_head *sh);
+extern void r5l_wake_reclaim(struct r5l_log *log, sector_t space);
+extern void r5c_handle_cached_data_endio(struct r5conf *conf,
+	struct stripe_head *sh, int disks, struct bio_list *return_bi);
+extern int r5c_cache_data(struct r5l_log *log, struct stripe_head *sh);
+extern void r5c_make_stripe_write_out(struct stripe_head *sh);
+extern void r5c_flush_cache(struct r5conf *conf, int num);
+extern void r5c_check_stripe_cache_usage(struct r5conf *conf);
+extern void r5c_check_cached_full_stripe(struct r5conf *conf);
+extern struct md_sysfs_entry r5c_journal_mode;
+extern void r5c_update_on_rdev_error(struct mddev *mddev);
+extern bool r5c_big_stripe_cached(struct r5conf *conf, sector_t sect);
+
+static inline int log_stripe(struct stripe_head *sh, struct stripe_head_state *s)
+{
+	struct r5conf *conf = sh->raid_conf;
+
+	if (conf->log) {
+		if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) {
+			/* writing out phase */
+			if (s->waiting_extra_page)
+				return 0;
+			return r5l_write_stripe(conf->log, sh);
+		} else if (test_bit(STRIPE_LOG_TRAPPED, &sh->state)) {
+			/* caching phase */
+			return r5c_cache_data(conf->log, sh);
+		}
+	}
+
+	return -EAGAIN;
+}
+
+static inline void log_stripe_write_finished(struct stripe_head *sh)
+{
+	struct r5conf *conf = sh->raid_conf;
+
+	if (conf->log)
+		r5l_stripe_write_finished(sh);
+}
+
+static inline void log_write_stripe_run(struct r5conf *conf)
+{
+	if (conf->log)
+		r5l_write_stripe_run(conf->log);
+}
+
+static inline void log_exit(struct r5conf *conf)
+{
+	if (conf->log)
+		r5l_exit_log(conf);
+}
+
+static inline int log_init(struct r5conf *conf, struct md_rdev *journal_dev)
+{
+	if (journal_dev)
+		return r5l_init_log(conf, journal_dev);
+
+	return 0;
+}
+
+#endif
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 013398ce2080..f575f40d2acb 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -64,6 +64,7 @@
 #include "raid5.h"
 #include "raid0.h"
 #include "bitmap.h"
+#include "raid5-log.h"
 
 #define UNSUPPORTED_MDDEV_FLAGS	(1L << MD_FAILFAST_SUPPORTED)
 
@@ -997,18 +998,8 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
 
 	might_sleep();
 
-	if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) {
-		/* writing out phase */
-		if (s->waiting_extra_page)
-			return;
-		if (r5l_write_stripe(conf->log, sh) == 0)
-			return;
-	} else {  /* caching phase */
-		if (test_bit(STRIPE_LOG_TRAPPED, &sh->state)) {
-			r5c_cache_data(conf->log, sh, s);
-			return;
-		}
-	}
+	if (log_stripe(sh, s) == 0)
+		return;
 
 	should_defer = conf->batch_bio_dispatch && conf->group_cnt;
 
@@ -3345,7 +3336,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
 		if (bi)
 			bitmap_end = 1;
 
-		r5l_stripe_write_finished(sh);
+		log_stripe_write_finished(sh);
 
 		if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
 			wake_up(&conf->wait_for_overlap);
@@ -3764,7 +3755,7 @@ returnbi:
 				discard_pending = 1;
 		}
 
-	r5l_stripe_write_finished(sh);
+	log_stripe_write_finished(sh);
 
 	if (!discard_pending &&
 	    test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)) {
@@ -4754,7 +4745,7 @@ static void handle_stripe(struct stripe_head *sh)
 
 	if (s.just_cached)
 		r5c_handle_cached_data_endio(conf, sh, disks, &s.return_bi);
-	r5l_stripe_write_finished(sh);
+	log_stripe_write_finished(sh);
 
 	/* Now we might consider reading some blocks, either to check/generate
 	 * parity, or to satisfy requests
@@ -6168,7 +6159,7 @@ static int handle_active_stripes(struct r5conf *conf, int group,
 
 	for (i = 0; i < batch_size; i++)
 		handle_stripe(batch[i]);
-	r5l_write_stripe_run(conf->log);
+	log_write_stripe_run(conf);
 
 	cond_resched();
 
@@ -6745,8 +6736,8 @@ static void free_conf(struct r5conf *conf)
 {
 	int i;
 
-	if (conf->log)
-		r5l_exit_log(conf->log);
+	log_exit(conf);
+
 	if (conf->shrinker.nr_deferred)
 		unregister_shrinker(&conf->shrinker);
 
@@ -7436,14 +7427,8 @@ static int raid5_run(struct mddev *mddev)
 		blk_queue_max_hw_sectors(mddev->queue, UINT_MAX);
 	}
 
-	if (journal_dev) {
-		char b[BDEVNAME_SIZE];
-
-		pr_debug("md/raid:%s: using device %s as journal\n",
-			 mdname(mddev), bdevname(journal_dev->bdev, b));
-		if (r5l_init_log(conf, journal_dev))
-			goto abort;
-	}
+	if (log_init(conf, journal_dev))
+		goto abort;
 
 	return 0;
 abort:
@@ -7557,17 +7542,13 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
 
 	print_raid5_conf(conf);
 	if (test_bit(Journal, &rdev->flags) && conf->log) {
-		struct r5l_log *log;
 		/*
 		 * we can't wait pending write here, as this is called in
 		 * raid5d, wait will deadlock.
 		 */
 		if (atomic_read(&mddev->writes_pending))
 			return -EBUSY;
-		log = conf->log;
-		conf->log = NULL;
-		synchronize_rcu();
-		r5l_exit_log(log);
+		log_exit(conf);
 		return 0;
 	}
 	if (rdev == p->rdev)
@@ -7636,7 +7617,6 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 	int last = conf->raid_disks - 1;
 
 	if (test_bit(Journal, &rdev->flags)) {
-		char b[BDEVNAME_SIZE];
 		if (conf->log)
 			return -EBUSY;
 
@@ -7645,9 +7625,7 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 		 * The array is in readonly mode if journal is missing, so no
 		 * write requests running. We should be safe
 		 */
-		r5l_init_log(conf, rdev);
-		pr_debug("md/raid:%s: using device %s as journal\n",
-			 mdname(mddev), bdevname(rdev->bdev, b));
+		log_init(conf, rdev);
 		return 0;
 	}
 	if (mddev->recovery_disabled == conf->recovery_disabled)
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 985cdc4850c2..6dd295a80ee1 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -779,34 +779,4 @@ extern struct stripe_head *
 raid5_get_active_stripe(struct r5conf *conf, sector_t sector,
 			int previous, int noblock, int noquiesce);
 extern int raid5_calc_degraded(struct r5conf *conf);
-extern int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev);
-extern void r5l_exit_log(struct r5l_log *log);
-extern int r5l_write_stripe(struct r5l_log *log, struct stripe_head *head_sh);
-extern void r5l_write_stripe_run(struct r5l_log *log);
-extern void r5l_flush_stripe_to_raid(struct r5l_log *log);
-extern void r5l_stripe_write_finished(struct stripe_head *sh);
-extern int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio);
-extern void r5l_quiesce(struct r5l_log *log, int state);
-extern bool r5l_log_disk_error(struct r5conf *conf);
-extern bool r5c_is_writeback(struct r5l_log *log);
-extern int
-r5c_try_caching_write(struct r5conf *conf, struct stripe_head *sh,
-		      struct stripe_head_state *s, int disks);
-extern void
-r5c_finish_stripe_write_out(struct r5conf *conf, struct stripe_head *sh,
-			    struct stripe_head_state *s);
-extern void r5c_release_extra_page(struct stripe_head *sh);
-extern void r5c_use_extra_page(struct stripe_head *sh);
-extern void r5l_wake_reclaim(struct r5l_log *log, sector_t space);
-extern void r5c_handle_cached_data_endio(struct r5conf *conf,
-	struct stripe_head *sh, int disks, struct bio_list *return_bi);
-extern int r5c_cache_data(struct r5l_log *log, struct stripe_head *sh,
-			  struct stripe_head_state *s);
-extern void r5c_make_stripe_write_out(struct stripe_head *sh);
-extern void r5c_flush_cache(struct r5conf *conf, int num);
-extern void r5c_check_stripe_cache_usage(struct r5conf *conf);
-extern void r5c_check_cached_full_stripe(struct r5conf *conf);
-extern struct md_sysfs_entry r5c_journal_mode;
-extern void r5c_update_on_rdev_error(struct mddev *mddev);
-extern bool r5c_big_stripe_cached(struct r5conf *conf, sector_t sect);
 #endif
-- 
cgit v1.2.3


From 3418d036c81dcb604b7c7c71b209d5890a8418aa Mon Sep 17 00:00:00 2001
From: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
Date: Thu, 9 Mar 2017 09:59:59 +0100
Subject: raid5-ppl: Partial Parity Log write logging implementation

Implement the calculation of partial parity for a stripe and PPL write
logging functionality. The description of PPL is added to the
documentation. More details can be found in the comments in raid5-ppl.c.

Attach a page for holding the partial parity data to stripe_head.
Allocate it only if mddev has the MD_HAS_PPL flag set.

Partial parity is the xor of not modified data chunks of a stripe and is
calculated as follows:

- reconstruct-write case:
  xor data from all not updated disks in a stripe

- read-modify-write case:
  xor old data and parity from all updated disks in a stripe

Implement it using the async_tx API and integrate into raid_run_ops().
It must be called when we still have access to old data, so do it when
STRIPE_OP_BIODRAIN is set, but before ops_run_prexor5(). The result is
stored into sh->ppl_page.

Partial parity is not meaningful for full stripe write and is not stored
in the log or used for recovery, so don't attempt to calculate it when
stripe has STRIPE_FULL_WRITE.

Put the PPL metadata structures in md_p.h because userspace tools
(mdadm) will also need to read/write PPL.

For now, warn about using PPL when a disk's volatile write-back cache is
enabled. The warning can be removed once disk cache flushing before
writing PPL is implemented.

Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 Documentation/md/raid5-ppl.txt |  44 +++
 drivers/md/Makefile            |   2 +-
 drivers/md/raid5-log.h         |  24 ++
 drivers/md/raid5-ppl.c         | 703 +++++++++++++++++++++++++++++++++++++++++
 drivers/md/raid5.c             |  64 +++-
 drivers/md/raid5.h             |  10 +-
 include/uapi/linux/raid/md_p.h |  27 ++
 7 files changed, 869 insertions(+), 5 deletions(-)
 create mode 100644 Documentation/md/raid5-ppl.txt
 create mode 100644 drivers/md/raid5-ppl.c

(limited to 'drivers/md/raid5.c')

diff --git a/Documentation/md/raid5-ppl.txt b/Documentation/md/raid5-ppl.txt
new file mode 100644
index 000000000000..127072b09363
--- /dev/null
+++ b/Documentation/md/raid5-ppl.txt
@@ -0,0 +1,44 @@
+Partial Parity Log
+
+Partial Parity Log (PPL) is a feature available for RAID5 arrays. The issue
+addressed by PPL is that after a dirty shutdown, parity of a particular stripe
+may become inconsistent with data on other member disks. If the array is also
+in degraded state, there is no way to recalculate parity, because one of the
+disks is missing. This can lead to silent data corruption when rebuilding the
+array or using it as degraded - data calculated from parity for array blocks
+that have not been touched by a write request during the unclean shutdown can
+be incorrect. Such condition is known as the RAID5 Write Hole. Because of
+this, md by default does not allow starting a dirty degraded array.
+
+Partial parity for a write operation is the XOR of stripe data chunks not
+modified by this write. It is just enough data needed for recovering from the
+write hole. XORing partial parity with the modified chunks produces parity for
+the stripe, consistent with its state before the write operation, regardless of
+which chunk writes have completed. If one of the not modified data disks of
+this stripe is missing, this updated parity can be used to recover its
+contents. PPL recovery is also performed when starting an array after an
+unclean shutdown and all disks are available, eliminating the need to resync
+the array. Because of this, using write-intent bitmap and PPL together is not
+supported.
+
+When handling a write request PPL writes partial parity before new data and
+parity are dispatched to disks. PPL is a distributed log - it is stored on
+array member drives in the metadata area, on the parity drive of a particular
+stripe.  It does not require a dedicated journaling drive. Write performance is
+reduced by up to 30%-40% but it scales with the number of drives in the array
+and the journaling drive does not become a bottleneck or a single point of
+failure.
+
+Unlike raid5-cache, the other solution in md for closing the write hole, PPL is
+not a true journal. It does not protect from losing in-flight data, only from
+silent data corruption. If a dirty disk of a stripe is lost, no PPL recovery is
+performed for this stripe (parity is not updated). So it is possible to have
+arbitrary data in the written part of a stripe if that disk is lost. In such
+case the behavior is the same as in plain raid5.
+
+PPL is available for md version-1 metadata and external (specifically IMSM)
+metadata arrays. It can be enabled using mdadm option --consistency-policy=ppl.
+
+Currently, volatile write-back cache should be disabled on all member drives
+when using PPL. Otherwise it cannot guarantee consistency in case of power
+failure.
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 3cbda1af87a0..4d48714ccc6b 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -18,7 +18,7 @@ dm-cache-cleaner-y += dm-cache-policy-cleaner.o
 dm-era-y	+= dm-era-target.o
 dm-verity-y	+= dm-verity-target.o
 md-mod-y	+= md.o bitmap.o
-raid456-y	+= raid5.o raid5-cache.o
+raid456-y	+= raid5.o raid5-cache.o raid5-ppl.o
 
 # Note: link order is important.  All raid personalities
 # and must come before md.o, as they each initialise 
diff --git a/drivers/md/raid5-log.h b/drivers/md/raid5-log.h
index 2da4bd3bbd79..a67fb58513b9 100644
--- a/drivers/md/raid5-log.h
+++ b/drivers/md/raid5-log.h
@@ -31,6 +31,20 @@ extern struct md_sysfs_entry r5c_journal_mode;
 extern void r5c_update_on_rdev_error(struct mddev *mddev);
 extern bool r5c_big_stripe_cached(struct r5conf *conf, sector_t sect);
 
+extern struct dma_async_tx_descriptor *
+ops_run_partial_parity(struct stripe_head *sh, struct raid5_percpu *percpu,
+		       struct dma_async_tx_descriptor *tx);
+extern int ppl_init_log(struct r5conf *conf);
+extern void ppl_exit_log(struct r5conf *conf);
+extern int ppl_write_stripe(struct r5conf *conf, struct stripe_head *sh);
+extern void ppl_write_stripe_run(struct r5conf *conf);
+extern void ppl_stripe_write_finished(struct stripe_head *sh);
+
+static inline bool raid5_has_ppl(struct r5conf *conf)
+{
+	return test_bit(MD_HAS_PPL, &conf->mddev->flags);
+}
+
 static inline int log_stripe(struct stripe_head *sh, struct stripe_head_state *s)
 {
 	struct r5conf *conf = sh->raid_conf;
@@ -45,6 +59,8 @@ static inline int log_stripe(struct stripe_head *sh, struct stripe_head_state *s
 			/* caching phase */
 			return r5c_cache_data(conf->log, sh);
 		}
+	} else if (raid5_has_ppl(conf)) {
+		return ppl_write_stripe(conf, sh);
 	}
 
 	return -EAGAIN;
@@ -56,24 +72,32 @@ static inline void log_stripe_write_finished(struct stripe_head *sh)
 
 	if (conf->log)
 		r5l_stripe_write_finished(sh);
+	else if (raid5_has_ppl(conf))
+		ppl_stripe_write_finished(sh);
 }
 
 static inline void log_write_stripe_run(struct r5conf *conf)
 {
 	if (conf->log)
 		r5l_write_stripe_run(conf->log);
+	else if (raid5_has_ppl(conf))
+		ppl_write_stripe_run(conf);
 }
 
 static inline void log_exit(struct r5conf *conf)
 {
 	if (conf->log)
 		r5l_exit_log(conf);
+	else if (raid5_has_ppl(conf))
+		ppl_exit_log(conf);
 }
 
 static inline int log_init(struct r5conf *conf, struct md_rdev *journal_dev)
 {
 	if (journal_dev)
 		return r5l_init_log(conf, journal_dev);
+	else if (raid5_has_ppl(conf))
+		return ppl_init_log(conf);
 
 	return 0;
 }
diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c
new file mode 100644
index 000000000000..db5b72b11594
--- /dev/null
+++ b/drivers/md/raid5-ppl.c
@@ -0,0 +1,703 @@
+/*
+ * Partial Parity Log for closing the RAID5 write hole
+ * Copyright (c) 2017, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/blkdev.h>
+#include <linux/slab.h>
+#include <linux/crc32c.h>
+#include <linux/flex_array.h>
+#include <linux/async_tx.h>
+#include <linux/raid/md_p.h>
+#include "md.h"
+#include "raid5.h"
+
+/*
+ * PPL consists of a 4KB header (struct ppl_header) and at least 128KB for
+ * partial parity data. The header contains an array of entries
+ * (struct ppl_header_entry) which describe the logged write requests.
+ * Partial parity for the entries comes after the header, written in the same
+ * sequence as the entries:
+ *
+ * Header
+ *   entry0
+ *   ...
+ *   entryN
+ * PP data
+ *   PP for entry0
+ *   ...
+ *   PP for entryN
+ *
+ * An entry describes one or more consecutive stripe_heads, up to a full
+ * stripe. The modified raid data chunks form an m-by-n matrix, where m is the
+ * number of stripe_heads in the entry and n is the number of modified data
+ * disks. Every stripe_head in the entry must write to the same data disks.
+ * An example of a valid case described by a single entry (writes to the first
+ * stripe of a 4 disk array, 16k chunk size):
+ *
+ * sh->sector   dd0   dd1   dd2    ppl
+ *            +-----+-----+-----+
+ * 0          | --- | --- | --- | +----+
+ * 8          | -W- | -W- | --- | | pp |   data_sector = 8
+ * 16         | -W- | -W- | --- | | pp |   data_size = 3 * 2 * 4k
+ * 24         | -W- | -W- | --- | | pp |   pp_size = 3 * 4k
+ *            +-----+-----+-----+ +----+
+ *
+ * data_sector is the first raid sector of the modified data, data_size is the
+ * total size of modified data and pp_size is the size of partial parity for
+ * this entry. Entries for full stripe writes contain no partial parity
+ * (pp_size = 0), they only mark the stripes for which parity should be
+ * recalculated after an unclean shutdown. Every entry holds a checksum of its
+ * partial parity, the header also has a checksum of the header itself.
+ *
+ * A write request is always logged to the PPL instance stored on the parity
+ * disk of the corresponding stripe. For each member disk there is one ppl_log
+ * used to handle logging for this disk, independently from others. They are
+ * grouped in child_logs array in struct ppl_conf, which is assigned to
+ * r5conf->log_private.
+ *
+ * ppl_io_unit represents a full PPL write, header_page contains the ppl_header.
+ * PPL entries for logged stripes are added in ppl_log_stripe(). A stripe_head
+ * can be appended to the last entry if it meets the conditions for a valid
+ * entry described above, otherwise a new entry is added. Checksums of entries
+ * are calculated incrementally as stripes containing partial parity are being
+ * added. ppl_submit_iounit() calculates the checksum of the header and submits
+ * a bio containing the header page and partial parity pages (sh->ppl_page) for
+ * all stripes of the io_unit. When the PPL write completes, the stripes
+ * associated with the io_unit are released and raid5d starts writing their data
+ * and parity. When all stripes are written, the io_unit is freed and the next
+ * can be submitted.
+ *
+ * An io_unit is used to gather stripes until it is submitted or becomes full
+ * (if the maximum number of entries or size of PPL is reached). Another io_unit
+ * can't be submitted until the previous has completed (PPL and stripe
+ * data+parity is written). The log->io_list tracks all io_units of a log
+ * (for a single member disk). New io_units are added to the end of the list
+ * and the first io_unit is submitted, if it is not submitted already.
+ * The current io_unit accepting new stripes is always at the end of the list.
+ */
+
+struct ppl_conf {
+	struct mddev *mddev;
+
+	/* array of child logs, one for each raid disk */
+	struct ppl_log *child_logs;
+	int count;
+
+	int block_size;		/* the logical block size used for data_sector
+				 * in ppl_header_entry */
+	u32 signature;		/* raid array identifier */
+	atomic64_t seq;		/* current log write sequence number */
+
+	struct kmem_cache *io_kc;
+	mempool_t *io_pool;
+	struct bio_set *bs;
+	mempool_t *meta_pool;
+};
+
+struct ppl_log {
+	struct ppl_conf *ppl_conf;	/* shared between all log instances */
+
+	struct md_rdev *rdev;		/* array member disk associated with
+					 * this log instance */
+	struct mutex io_mutex;
+	struct ppl_io_unit *current_io;	/* current io_unit accepting new data
+					 * always at the end of io_list */
+	spinlock_t io_list_lock;
+	struct list_head io_list;	/* all io_units of this log */
+	struct list_head no_mem_stripes;/* stripes to retry if failed to
+					 * allocate io_unit */
+};
+
+#define PPL_IO_INLINE_BVECS 32
+
+struct ppl_io_unit {
+	struct ppl_log *log;
+
+	struct page *header_page;	/* for ppl_header */
+
+	unsigned int entries_count;	/* number of entries in ppl_header */
+	unsigned int pp_size;		/* current total size of partial parity */
+
+	u64 seq;			/* sequence number of this log write */
+	struct list_head log_sibling;	/* log->io_list */
+
+	struct list_head stripe_list;	/* stripes added to the io_unit */
+	atomic_t pending_stripes;	/* how many stripes not written to raid */
+
+	bool submitted;			/* true if write to log started */
+
+	/* inline bio and its biovec for submitting the iounit */
+	struct bio bio;
+	struct bio_vec biovec[PPL_IO_INLINE_BVECS];
+};
+
+struct dma_async_tx_descriptor *
+ops_run_partial_parity(struct stripe_head *sh, struct raid5_percpu *percpu,
+		       struct dma_async_tx_descriptor *tx)
+{
+	int disks = sh->disks;
+	struct page **xor_srcs = flex_array_get(percpu->scribble, 0);
+	int count = 0, pd_idx = sh->pd_idx, i;
+	struct async_submit_ctl submit;
+
+	pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);
+
+	/*
+	 * Partial parity is the XOR of stripe data chunks that are not changed
+	 * during the write request. Depending on available data
+	 * (read-modify-write vs. reconstruct-write case) we calculate it
+	 * differently.
+	 */
+	if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
+		/* rmw: xor old data and parity from updated disks */
+		for (i = disks; i--;) {
+			struct r5dev *dev = &sh->dev[i];
+			if (test_bit(R5_Wantdrain, &dev->flags) || i == pd_idx)
+				xor_srcs[count++] = dev->page;
+		}
+	} else if (sh->reconstruct_state == reconstruct_state_drain_run) {
+		/* rcw: xor data from all not updated disks */
+		for (i = disks; i--;) {
+			struct r5dev *dev = &sh->dev[i];
+			if (test_bit(R5_UPTODATE, &dev->flags))
+				xor_srcs[count++] = dev->page;
+		}
+	} else {
+		return tx;
+	}
+
+	init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, tx,
+			  NULL, sh, flex_array_get(percpu->scribble, 0)
+			  + sizeof(struct page *) * (sh->disks + 2));
+
+	if (count == 1)
+		tx = async_memcpy(sh->ppl_page, xor_srcs[0], 0, 0, PAGE_SIZE,
+				  &submit);
+	else
+		tx = async_xor(sh->ppl_page, xor_srcs, 0, count, PAGE_SIZE,
+			       &submit);
+
+	return tx;
+}
+
+static struct ppl_io_unit *ppl_new_iounit(struct ppl_log *log,
+					  struct stripe_head *sh)
+{
+	struct ppl_conf *ppl_conf = log->ppl_conf;
+	struct ppl_io_unit *io;
+	struct ppl_header *pplhdr;
+
+	io = mempool_alloc(ppl_conf->io_pool, GFP_ATOMIC);
+	if (!io)
+		return NULL;
+
+	memset(io, 0, sizeof(*io));
+	io->log = log;
+	INIT_LIST_HEAD(&io->log_sibling);
+	INIT_LIST_HEAD(&io->stripe_list);
+	atomic_set(&io->pending_stripes, 0);
+	bio_init(&io->bio, io->biovec, PPL_IO_INLINE_BVECS);
+
+	io->header_page = mempool_alloc(ppl_conf->meta_pool, GFP_NOIO);
+	pplhdr = page_address(io->header_page);
+	clear_page(pplhdr);
+	memset(pplhdr->reserved, 0xff, PPL_HDR_RESERVED);
+	pplhdr->signature = cpu_to_le32(ppl_conf->signature);
+
+	io->seq = atomic64_add_return(1, &ppl_conf->seq);
+	pplhdr->generation = cpu_to_le64(io->seq);
+
+	return io;
+}
+
+static int ppl_log_stripe(struct ppl_log *log, struct stripe_head *sh)
+{
+	struct ppl_io_unit *io = log->current_io;
+	struct ppl_header_entry *e = NULL;
+	struct ppl_header *pplhdr;
+	int i;
+	sector_t data_sector = 0;
+	int data_disks = 0;
+	unsigned int entry_space = (log->rdev->ppl.size << 9) - PPL_HEADER_SIZE;
+	struct r5conf *conf = sh->raid_conf;
+
+	pr_debug("%s: stripe: %llu\n", __func__, (unsigned long long)sh->sector);
+
+	/* check if current io_unit is full */
+	if (io && (io->pp_size == entry_space ||
+		   io->entries_count == PPL_HDR_MAX_ENTRIES)) {
+		pr_debug("%s: add io_unit blocked by seq: %llu\n",
+			 __func__, io->seq);
+		io = NULL;
+	}
+
+	/* add a new unit if there is none or the current is full */
+	if (!io) {
+		io = ppl_new_iounit(log, sh);
+		if (!io)
+			return -ENOMEM;
+		spin_lock_irq(&log->io_list_lock);
+		list_add_tail(&io->log_sibling, &log->io_list);
+		spin_unlock_irq(&log->io_list_lock);
+
+		log->current_io = io;
+	}
+
+	for (i = 0; i < sh->disks; i++) {
+		struct r5dev *dev = &sh->dev[i];
+
+		if (i != sh->pd_idx && test_bit(R5_Wantwrite, &dev->flags)) {
+			if (!data_disks || dev->sector < data_sector)
+				data_sector = dev->sector;
+			data_disks++;
+		}
+	}
+	BUG_ON(!data_disks);
+
+	pr_debug("%s: seq: %llu data_sector: %llu data_disks: %d\n", __func__,
+		 io->seq, (unsigned long long)data_sector, data_disks);
+
+	pplhdr = page_address(io->header_page);
+
+	if (io->entries_count > 0) {
+		struct ppl_header_entry *last =
+				&pplhdr->entries[io->entries_count - 1];
+		struct stripe_head *sh_last = list_last_entry(
+				&io->stripe_list, struct stripe_head, log_list);
+		u64 data_sector_last = le64_to_cpu(last->data_sector);
+		u32 data_size_last = le32_to_cpu(last->data_size);
+
+		/*
+		 * Check if we can append the stripe to the last entry. It must
+		 * be just after the last logged stripe and write to the same
+		 * disks. Use bit shift and logarithm to avoid 64-bit division.
+		 */
+		if ((sh->sector == sh_last->sector + STRIPE_SECTORS) &&
+		    (data_sector >> ilog2(conf->chunk_sectors) ==
+		     data_sector_last >> ilog2(conf->chunk_sectors)) &&
+		    ((data_sector - data_sector_last) * data_disks ==
+		     data_size_last >> 9))
+			e = last;
+	}
+
+	if (!e) {
+		e = &pplhdr->entries[io->entries_count++];
+		e->data_sector = cpu_to_le64(data_sector);
+		e->parity_disk = cpu_to_le32(sh->pd_idx);
+		e->checksum = cpu_to_le32(~0);
+	}
+
+	le32_add_cpu(&e->data_size, data_disks << PAGE_SHIFT);
+
+	/* don't write any PP if full stripe write */
+	if (!test_bit(STRIPE_FULL_WRITE, &sh->state)) {
+		le32_add_cpu(&e->pp_size, PAGE_SIZE);
+		io->pp_size += PAGE_SIZE;
+		e->checksum = cpu_to_le32(crc32c_le(le32_to_cpu(e->checksum),
+						    page_address(sh->ppl_page),
+						    PAGE_SIZE));
+	}
+
+	list_add_tail(&sh->log_list, &io->stripe_list);
+	atomic_inc(&io->pending_stripes);
+	sh->ppl_io = io;
+
+	return 0;
+}
+
+int ppl_write_stripe(struct r5conf *conf, struct stripe_head *sh)
+{
+	struct ppl_conf *ppl_conf = conf->log_private;
+	struct ppl_io_unit *io = sh->ppl_io;
+	struct ppl_log *log;
+
+	if (io || test_bit(STRIPE_SYNCING, &sh->state) ||
+	    !test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags) ||
+	    !test_bit(R5_Insync, &sh->dev[sh->pd_idx].flags)) {
+		clear_bit(STRIPE_LOG_TRAPPED, &sh->state);
+		return -EAGAIN;
+	}
+
+	log = &ppl_conf->child_logs[sh->pd_idx];
+
+	mutex_lock(&log->io_mutex);
+
+	if (!log->rdev || test_bit(Faulty, &log->rdev->flags)) {
+		mutex_unlock(&log->io_mutex);
+		return -EAGAIN;
+	}
+
+	set_bit(STRIPE_LOG_TRAPPED, &sh->state);
+	clear_bit(STRIPE_DELAYED, &sh->state);
+	atomic_inc(&sh->count);
+
+	if (ppl_log_stripe(log, sh)) {
+		spin_lock_irq(&log->io_list_lock);
+		list_add_tail(&sh->log_list, &log->no_mem_stripes);
+		spin_unlock_irq(&log->io_list_lock);
+	}
+
+	mutex_unlock(&log->io_mutex);
+
+	return 0;
+}
+
+static void ppl_log_endio(struct bio *bio)
+{
+	struct ppl_io_unit *io = bio->bi_private;
+	struct ppl_log *log = io->log;
+	struct ppl_conf *ppl_conf = log->ppl_conf;
+	struct stripe_head *sh, *next;
+
+	pr_debug("%s: seq: %llu\n", __func__, io->seq);
+
+	if (bio->bi_error)
+		md_error(ppl_conf->mddev, log->rdev);
+
+	mempool_free(io->header_page, ppl_conf->meta_pool);
+
+	list_for_each_entry_safe(sh, next, &io->stripe_list, log_list) {
+		list_del_init(&sh->log_list);
+
+		set_bit(STRIPE_HANDLE, &sh->state);
+		raid5_release_stripe(sh);
+	}
+}
+
+static void ppl_submit_iounit_bio(struct ppl_io_unit *io, struct bio *bio)
+{
+	char b[BDEVNAME_SIZE];
+
+	pr_debug("%s: seq: %llu size: %u sector: %llu dev: %s\n",
+		 __func__, io->seq, bio->bi_iter.bi_size,
+		 (unsigned long long)bio->bi_iter.bi_sector,
+		 bdevname(bio->bi_bdev, b));
+
+	submit_bio(bio);
+}
+
+static void ppl_submit_iounit(struct ppl_io_unit *io)
+{
+	struct ppl_log *log = io->log;
+	struct ppl_conf *ppl_conf = log->ppl_conf;
+	struct ppl_header *pplhdr = page_address(io->header_page);
+	struct bio *bio = &io->bio;
+	struct stripe_head *sh;
+	int i;
+
+	for (i = 0; i < io->entries_count; i++) {
+		struct ppl_header_entry *e = &pplhdr->entries[i];
+
+		pr_debug("%s: seq: %llu entry: %d data_sector: %llu pp_size: %u data_size: %u\n",
+			 __func__, io->seq, i, le64_to_cpu(e->data_sector),
+			 le32_to_cpu(e->pp_size), le32_to_cpu(e->data_size));
+
+		e->data_sector = cpu_to_le64(le64_to_cpu(e->data_sector) >>
+					     ilog2(ppl_conf->block_size >> 9));
+		e->checksum = cpu_to_le32(~le32_to_cpu(e->checksum));
+	}
+
+	pplhdr->entries_count = cpu_to_le32(io->entries_count);
+	pplhdr->checksum = cpu_to_le32(~crc32c_le(~0, pplhdr, PPL_HEADER_SIZE));
+
+	bio->bi_private = io;
+	bio->bi_end_io = ppl_log_endio;
+	bio->bi_opf = REQ_OP_WRITE | REQ_FUA;
+	bio->bi_bdev = log->rdev->bdev;
+	bio->bi_iter.bi_sector = log->rdev->ppl.sector;
+	bio_add_page(bio, io->header_page, PAGE_SIZE, 0);
+
+	list_for_each_entry(sh, &io->stripe_list, log_list) {
+		/* entries for full stripe writes have no partial parity */
+		if (test_bit(STRIPE_FULL_WRITE, &sh->state))
+			continue;
+
+		if (!bio_add_page(bio, sh->ppl_page, PAGE_SIZE, 0)) {
+			struct bio *prev = bio;
+
+			bio = bio_alloc_bioset(GFP_NOIO, BIO_MAX_PAGES,
+					       ppl_conf->bs);
+			bio->bi_opf = prev->bi_opf;
+			bio->bi_bdev = prev->bi_bdev;
+			bio->bi_iter.bi_sector = bio_end_sector(prev);
+			bio_add_page(bio, sh->ppl_page, PAGE_SIZE, 0);
+
+			bio_chain(bio, prev);
+			ppl_submit_iounit_bio(io, prev);
+		}
+	}
+
+	ppl_submit_iounit_bio(io, bio);
+}
+
+static void ppl_submit_current_io(struct ppl_log *log)
+{
+	struct ppl_io_unit *io;
+
+	spin_lock_irq(&log->io_list_lock);
+
+	io = list_first_entry_or_null(&log->io_list, struct ppl_io_unit,
+				      log_sibling);
+	if (io && io->submitted)
+		io = NULL;
+
+	spin_unlock_irq(&log->io_list_lock);
+
+	if (io) {
+		io->submitted = true;
+
+		if (io == log->current_io)
+			log->current_io = NULL;
+
+		ppl_submit_iounit(io);
+	}
+}
+
+void ppl_write_stripe_run(struct r5conf *conf)
+{
+	struct ppl_conf *ppl_conf = conf->log_private;
+	struct ppl_log *log;
+	int i;
+
+	for (i = 0; i < ppl_conf->count; i++) {
+		log = &ppl_conf->child_logs[i];
+
+		mutex_lock(&log->io_mutex);
+		ppl_submit_current_io(log);
+		mutex_unlock(&log->io_mutex);
+	}
+}
+
+static void ppl_io_unit_finished(struct ppl_io_unit *io)
+{
+	struct ppl_log *log = io->log;
+	unsigned long flags;
+
+	pr_debug("%s: seq: %llu\n", __func__, io->seq);
+
+	spin_lock_irqsave(&log->io_list_lock, flags);
+
+	list_del(&io->log_sibling);
+	mempool_free(io, log->ppl_conf->io_pool);
+
+	if (!list_empty(&log->no_mem_stripes)) {
+		struct stripe_head *sh = list_first_entry(&log->no_mem_stripes,
+							  struct stripe_head,
+							  log_list);
+		list_del_init(&sh->log_list);
+		set_bit(STRIPE_HANDLE, &sh->state);
+		raid5_release_stripe(sh);
+	}
+
+	spin_unlock_irqrestore(&log->io_list_lock, flags);
+}
+
+void ppl_stripe_write_finished(struct stripe_head *sh)
+{
+	struct ppl_io_unit *io;
+
+	io = sh->ppl_io;
+	sh->ppl_io = NULL;
+
+	if (io && atomic_dec_and_test(&io->pending_stripes))
+		ppl_io_unit_finished(io);
+}
+
+static void __ppl_exit_log(struct ppl_conf *ppl_conf)
+{
+	clear_bit(MD_HAS_PPL, &ppl_conf->mddev->flags);
+
+	kfree(ppl_conf->child_logs);
+
+	mempool_destroy(ppl_conf->meta_pool);
+	if (ppl_conf->bs)
+		bioset_free(ppl_conf->bs);
+	mempool_destroy(ppl_conf->io_pool);
+	kmem_cache_destroy(ppl_conf->io_kc);
+
+	kfree(ppl_conf);
+}
+
+void ppl_exit_log(struct r5conf *conf)
+{
+	struct ppl_conf *ppl_conf = conf->log_private;
+
+	if (ppl_conf) {
+		__ppl_exit_log(ppl_conf);
+		conf->log_private = NULL;
+	}
+}
+
+static int ppl_validate_rdev(struct md_rdev *rdev)
+{
+	char b[BDEVNAME_SIZE];
+	int ppl_data_sectors;
+	int ppl_size_new;
+
+	/*
+	 * The configured PPL size must be enough to store
+	 * the header and (at the very least) partial parity
+	 * for one stripe. Round it down to ensure the data
+	 * space is cleanly divisible by stripe size.
+	 */
+	ppl_data_sectors = rdev->ppl.size - (PPL_HEADER_SIZE >> 9);
+
+	if (ppl_data_sectors > 0)
+		ppl_data_sectors = rounddown(ppl_data_sectors, STRIPE_SECTORS);
+
+	if (ppl_data_sectors <= 0) {
+		pr_warn("md/raid:%s: PPL space too small on %s\n",
+			mdname(rdev->mddev), bdevname(rdev->bdev, b));
+		return -ENOSPC;
+	}
+
+	ppl_size_new = ppl_data_sectors + (PPL_HEADER_SIZE >> 9);
+
+	if ((rdev->ppl.sector < rdev->data_offset &&
+	     rdev->ppl.sector + ppl_size_new > rdev->data_offset) ||
+	    (rdev->ppl.sector >= rdev->data_offset &&
+	     rdev->data_offset + rdev->sectors > rdev->ppl.sector)) {
+		pr_warn("md/raid:%s: PPL space overlaps with data on %s\n",
+			mdname(rdev->mddev), bdevname(rdev->bdev, b));
+		return -EINVAL;
+	}
+
+	if (!rdev->mddev->external &&
+	    ((rdev->ppl.offset > 0 && rdev->ppl.offset < (rdev->sb_size >> 9)) ||
+	     (rdev->ppl.offset <= 0 && rdev->ppl.offset + ppl_size_new > 0))) {
+		pr_warn("md/raid:%s: PPL space overlaps with superblock on %s\n",
+			mdname(rdev->mddev), bdevname(rdev->bdev, b));
+		return -EINVAL;
+	}
+
+	rdev->ppl.size = ppl_size_new;
+
+	return 0;
+}
+
+int ppl_init_log(struct r5conf *conf)
+{
+	struct ppl_conf *ppl_conf;
+	struct mddev *mddev = conf->mddev;
+	int ret = 0;
+	int i;
+	bool need_cache_flush = false;	/* set if a member has QUEUE_FLAG_WC */
+
+	pr_debug("md/raid:%s: enabling distributed Partial Parity Log\n",
+		 mdname(conf->mddev));
+
+	if (PAGE_SIZE != 4096)
+		return -EINVAL;
+
+	if (mddev->level != 5) {
+		pr_warn("md/raid:%s PPL is not compatible with raid level %d\n",
+			mdname(mddev), mddev->level);
+		return -EINVAL;
+	}
+
+	if (mddev->bitmap_info.file || mddev->bitmap_info.offset) {
+		pr_warn("md/raid:%s PPL is not compatible with bitmap\n",
+			mdname(mddev));
+		return -EINVAL;
+	}
+
+	if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) {
+		pr_warn("md/raid:%s PPL is not compatible with journal\n",
+			mdname(mddev));
+		return -EINVAL;
+	}
+
+	ppl_conf = kzalloc(sizeof(struct ppl_conf), GFP_KERNEL);
+	if (!ppl_conf)
+		return -ENOMEM;
+
+	ppl_conf->mddev = mddev;
+
+	ppl_conf->io_kc = KMEM_CACHE(ppl_io_unit, 0);
+	if (!ppl_conf->io_kc) {
+		ret = -EINVAL;
+		goto err;
+	}
+
+	ppl_conf->io_pool = mempool_create_slab_pool(conf->raid_disks, ppl_conf->io_kc);
+	if (!ppl_conf->io_pool) {
+		ret = -EINVAL;
+		goto err;
+	}
+
+	ppl_conf->bs = bioset_create(conf->raid_disks, 0);
+	if (!ppl_conf->bs) {
+		ret = -EINVAL;
+		goto err;
+	}
+
+	ppl_conf->meta_pool = mempool_create_page_pool(conf->raid_disks, 0);
+	if (!ppl_conf->meta_pool) {
+		ret = -EINVAL;
+		goto err;
+	}
+
+	ppl_conf->count = conf->raid_disks;
+	ppl_conf->child_logs = kcalloc(ppl_conf->count, sizeof(struct ppl_log),
+				       GFP_KERNEL);
+	if (!ppl_conf->child_logs) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	atomic64_set(&ppl_conf->seq, 0);
+
+	if (!mddev->external) {
+		ppl_conf->signature = ~crc32c_le(~0, mddev->uuid, sizeof(mddev->uuid));
+		ppl_conf->block_size = 512;
+	} else {
+		ppl_conf->block_size = queue_logical_block_size(mddev->queue);
+	}
+
+	for (i = 0; i < ppl_conf->count; i++) {
+		struct ppl_log *log = &ppl_conf->child_logs[i];
+		struct md_rdev *rdev = conf->disks[i].rdev;
+
+		mutex_init(&log->io_mutex);
+		spin_lock_init(&log->io_list_lock);
+		INIT_LIST_HEAD(&log->io_list);
+		INIT_LIST_HEAD(&log->no_mem_stripes);
+
+		log->ppl_conf = ppl_conf;
+		log->rdev = rdev;
+
+		if (rdev) {
+			struct request_queue *q;
+
+			ret = ppl_validate_rdev(rdev);
+			if (ret)
+				goto err;
+
+			q = bdev_get_queue(rdev->bdev);
+			if (test_bit(QUEUE_FLAG_WC, &q->queue_flags))
+				need_cache_flush = true;
+		}
+	}
+
+	if (need_cache_flush)
+		pr_warn("md/raid:%s: Volatile write-back cache should be disabled on all member drives when using PPL!\n",
+			mdname(mddev));
+
+	conf->log_private = ppl_conf;
+
+	return 0;
+err:
+	__ppl_exit_log(ppl_conf);
+	return ret;
+}
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index f575f40d2acb..6b86e0826afe 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -482,6 +482,11 @@ static void shrink_buffers(struct stripe_head *sh)
 		sh->dev[i].page = NULL;
 		put_page(p);
 	}
+
+	if (sh->ppl_page) {
+		put_page(sh->ppl_page);
+		sh->ppl_page = NULL;
+	}
 }
 
 static int grow_buffers(struct stripe_head *sh, gfp_t gfp)
@@ -498,6 +503,13 @@ static int grow_buffers(struct stripe_head *sh, gfp_t gfp)
 		sh->dev[i].page = page;
 		sh->dev[i].orig_page = page;
 	}
+
+	if (raid5_has_ppl(sh->raid_conf)) {
+		sh->ppl_page = alloc_page(gfp);
+		if (!sh->ppl_page)
+			return 1;
+	}
+
 	return 0;
 }
 
@@ -746,7 +758,7 @@ static bool stripe_can_batch(struct stripe_head *sh)
 {
 	struct r5conf *conf = sh->raid_conf;
 
-	if (conf->log)
+	if (conf->log || raid5_has_ppl(conf))
 		return false;
 	return test_bit(STRIPE_BATCH_READY, &sh->state) &&
 		!test_bit(STRIPE_BITMAP_PENDING, &sh->state) &&
@@ -2093,6 +2105,9 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
 			async_tx_ack(tx);
 	}
 
+	if (test_bit(STRIPE_OP_PARTIAL_PARITY, &ops_request))
+		tx = ops_run_partial_parity(sh, percpu, tx);
+
 	if (test_bit(STRIPE_OP_PREXOR, &ops_request)) {
 		if (level < 6)
 			tx = ops_run_prexor5(sh, percpu, tx);
@@ -3168,6 +3183,12 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
 		s->locked++;
 	}
 
+	if (raid5_has_ppl(sh->raid_conf) &&
+	    test_bit(STRIPE_OP_BIODRAIN, &s->ops_request) &&
+	    !test_bit(STRIPE_FULL_WRITE, &sh->state) &&
+	    test_bit(R5_Insync, &sh->dev[pd_idx].flags))
+		set_bit(STRIPE_OP_PARTIAL_PARITY, &s->ops_request);
+
 	pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n",
 		__func__, (unsigned long long)sh->sector,
 		s->locked, s->ops_request);
@@ -3215,6 +3236,36 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx,
 	if (*bip && (*bip)->bi_iter.bi_sector < bio_end_sector(bi))
 		goto overlap;
 
+	if (forwrite && raid5_has_ppl(conf)) {
+		/*
+		 * With PPL only writes to consecutive data chunks within a
+		 * stripe are allowed because for a single stripe_head we can
+		 * only have one PPL entry at a time, which describes one data
+		 * range. Not really an overlap, but wait_for_overlap can be
+		 * used to handle this.
+		 */
+		sector_t sector;
+		sector_t first = 0;
+		sector_t last = 0;
+		int count = 0;
+		int i;
+
+		for (i = 0; i < sh->disks; i++) {
+			if (i != sh->pd_idx &&
+			    (i == dd_idx || sh->dev[i].towrite)) {
+				sector = sh->dev[i].sector;
+				if (count == 0 || sector < first)
+					first = sector;
+				if (sector > last)
+					last = sector;
+				count++;
+			}
+		}
+
+		if (first + conf->chunk_sectors * (count - 1) != last)
+			goto overlap;
+	}
+
 	if (!forwrite || previous)
 		clear_bit(STRIPE_BATCH_READY, &sh->state);
 
@@ -7208,6 +7259,13 @@ static int raid5_run(struct mddev *mddev)
 		BUG_ON(mddev->delta_disks != 0);
 	}
 
+	if (test_bit(MD_HAS_JOURNAL, &mddev->flags) &&
+	    test_bit(MD_HAS_PPL, &mddev->flags)) {
+		pr_warn("md/raid:%s: using journal device and PPL not allowed - disabling PPL\n",
+			mdname(mddev));
+		clear_bit(MD_HAS_PPL, &mddev->flags);
+	}
+
 	if (mddev->private == NULL)
 		conf = setup_conf(mddev);
 	else
@@ -7689,7 +7747,7 @@ static int raid5_resize(struct mddev *mddev, sector_t sectors)
 	sector_t newsize;
 	struct r5conf *conf = mddev->private;
 
-	if (conf->log)
+	if (conf->log || raid5_has_ppl(conf))
 		return -EINVAL;
 	sectors &= ~((sector_t)conf->chunk_sectors - 1);
 	newsize = raid5_size(mddev, sectors, mddev->raid_disks);
@@ -7740,7 +7798,7 @@ static int check_reshape(struct mddev *mddev)
 {
 	struct r5conf *conf = mddev->private;
 
-	if (conf->log)
+	if (conf->log || raid5_has_ppl(conf))
 		return -EINVAL;
 	if (mddev->delta_disks == 0 &&
 	    mddev->new_layout == mddev->layout &&
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 6dd295a80ee1..ba5b7a3790af 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -224,10 +224,16 @@ struct stripe_head {
 	spinlock_t		batch_lock; /* only header's lock is useful */
 	struct list_head	batch_list; /* protected by head's batch lock*/
 
-	struct r5l_io_unit	*log_io;
+	union {
+		struct r5l_io_unit	*log_io;
+		struct ppl_io_unit	*ppl_io;
+	};
+
 	struct list_head	log_list;
 	sector_t		log_start; /* first meta block on the journal */
 	struct list_head	r5c; /* for r5c_cache->stripe_in_journal */
+
+	struct page		*ppl_page; /* partial parity of this stripe */
 	/**
 	 * struct stripe_operations
 	 * @target - STRIPE_OP_COMPUTE_BLK target
@@ -400,6 +406,7 @@ enum {
 	STRIPE_OP_BIODRAIN,
 	STRIPE_OP_RECONSTRUCT,
 	STRIPE_OP_CHECK,
+	STRIPE_OP_PARTIAL_PARITY,
 };
 
 /*
@@ -696,6 +703,7 @@ struct r5conf {
 	int			group_cnt;
 	int			worker_cnt_per_group;
 	struct r5l_log		*log;
+	void			*log_private;
 
 	spinlock_t		pending_bios_lock;
 	bool			batch_bio_dispatch;
diff --git a/include/uapi/linux/raid/md_p.h b/include/uapi/linux/raid/md_p.h
index fe2112810c43..d9a1ead867b9 100644
--- a/include/uapi/linux/raid/md_p.h
+++ b/include/uapi/linux/raid/md_p.h
@@ -398,4 +398,31 @@ struct r5l_meta_block {
 
 #define R5LOG_VERSION 0x1
 #define R5LOG_MAGIC 0x6433c509
+
+struct ppl_header_entry {
+	__le64 data_sector;	/* raid sector of the new data */
+	__le32 pp_size;		/* length of partial parity */
+	__le32 data_size;	/* length of data */
+	__le32 parity_disk;	/* member disk containing parity */
+	__le32 checksum;	/* checksum of partial parity data for this
+				 * entry (~crc32c) */
+} __attribute__ ((__packed__));
+
+#define PPL_HEADER_SIZE 4096
+#define PPL_HDR_RESERVED 512
+#define PPL_HDR_ENTRY_SPACE \
+	(PPL_HEADER_SIZE - PPL_HDR_RESERVED - 4 * sizeof(u32) - sizeof(u64))
+#define PPL_HDR_MAX_ENTRIES \
+	(PPL_HDR_ENTRY_SPACE / sizeof(struct ppl_header_entry))
+
+struct ppl_header {
+	__u8 reserved[PPL_HDR_RESERVED];/* reserved space, fill with 0xff */
+	__le32 signature;		/* signature (family number of volume) */
+	__le32 padding;			/* zero pad */
+	__le64 generation;		/* generation number of the header */
+	__le32 entries_count;		/* number of entries in entry array */
+	__le32 checksum;		/* checksum of the header (~crc32c) */
+	struct ppl_header_entry entries[PPL_HDR_MAX_ENTRIES];
+} __attribute__ ((__packed__));
+
 #endif
-- 
cgit v1.2.3


From 4536bf9ba2d03404655586b07f8830b6f2106242 Mon Sep 17 00:00:00 2001
From: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
Date: Thu, 9 Mar 2017 10:00:01 +0100
Subject: raid5-ppl: load and recover the log

Load the log from each disk when starting the array and recover if the
array is dirty.

The initial empty PPL is written by mdadm. When loading the log we
verify the header checksum and signature. For external metadata arrays
the signature is verified in userspace, so here we read it from the
header, verifying only if it matches on all disks, and use it later when
writing PPL.

In addition to the header checksum, each header entry also contains a
checksum of its partial parity data. If the header is valid, recovery is
performed for each entry until an invalid entry is found. If the array
is not degraded and recovery using PPL fully succeeds, there is no need
to resync the array because data and parity will be consistent, so in
this case resync will be disabled.

Due to compatibility with IMSM implementations on other systems, we
can't assume that the recovery data block size is always 4K. Writes
generated by MD raid5 don't have this issue, but when recovering PPL
written in other environments it is possible to have entries with
512-byte sector granularity. The recovery code takes this into account
and also the logical sector size of the underlying drives.

Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/raid5-ppl.c | 489 +++++++++++++++++++++++++++++++++++++++++++++++++
 drivers/md/raid5.c     |   5 +-
 2 files changed, 493 insertions(+), 1 deletion(-)

(limited to 'drivers/md/raid5.c')

diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c
index db5b72b11594..d336c024eef9 100644
--- a/drivers/md/raid5-ppl.c
+++ b/drivers/md/raid5-ppl.c
@@ -103,6 +103,10 @@ struct ppl_conf {
 	mempool_t *io_pool;
 	struct bio_set *bs;
 	mempool_t *meta_pool;
+
+	/* used only for recovery */
+	int recovered_entries;
+	int mismatch_count;
 };
 
 struct ppl_log {
@@ -514,6 +518,474 @@ void ppl_stripe_write_finished(struct stripe_head *sh)
 		ppl_io_unit_finished(io);
 }
 
+static void ppl_xor(int size, struct page *page1, struct page *page2)
+{
+	struct async_submit_ctl submit;
+	struct dma_async_tx_descriptor *tx;
+	struct page *xor_srcs[] = { page1, page2 };
+
+	init_async_submit(&submit, ASYNC_TX_ACK|ASYNC_TX_XOR_DROP_DST,
+			  NULL, NULL, NULL, NULL);
+	tx = async_xor(page1, xor_srcs, 0, 2, size, &submit);
+
+	async_tx_quiesce(&tx);
+}
+
+/*
+ * PPL recovery strategy: xor partial parity and data from all modified data
+ * disks within a stripe and write the result as the new stripe parity. If all
+ * stripe data disks are modified (full stripe write), no partial parity is
+ * available, so just xor the data disks.
+ *
+ * Recovery of a PPL entry shall occur only if all modified data disks are
+ * available and read from all of them succeeds.
+ *
+ * A PPL entry applies to a stripe, partial parity size for an entry is at most
+ * the size of the chunk. Examples of possible cases for a single entry:
+ *
+ * case 0: single data disk write:
+ *   data0    data1    data2     ppl        parity
+ * +--------+--------+--------+           +--------------------+
+ * | ------ | ------ | ------ | +----+    | (no change)        |
+ * | ------ | -data- | ------ | | pp | -> | data1 ^ pp         |
+ * | ------ | -data- | ------ | | pp | -> | data1 ^ pp         |
+ * | ------ | ------ | ------ | +----+    | (no change)        |
+ * +--------+--------+--------+           +--------------------+
+ * pp_size = data_size
+ *
+ * case 1: more than one data disk write:
+ *   data0    data1    data2     ppl        parity
+ * +--------+--------+--------+           +--------------------+
+ * | ------ | ------ | ------ | +----+    | (no change)        |
+ * | -data- | -data- | ------ | | pp | -> | data0 ^ data1 ^ pp |
+ * | -data- | -data- | ------ | | pp | -> | data0 ^ data1 ^ pp |
+ * | ------ | ------ | ------ | +----+    | (no change)        |
+ * +--------+--------+--------+           +--------------------+
+ * pp_size = data_size / modified_data_disks
+ *
+ * case 2: write to all data disks (also full stripe write):
+ *   data0    data1    data2                parity
+ * +--------+--------+--------+           +--------------------+
+ * | ------ | ------ | ------ |           | (no change)        |
+ * | -data- | -data- | -data- | --------> | xor all data       |
+ * | ------ | ------ | ------ | --------> | (no change)        |
+ * | ------ | ------ | ------ |           | (no change)        |
+ * +--------+--------+--------+           +--------------------+
+ * pp_size = 0
+ *
+ * The following cases are possible only in other implementations. The recovery
+ * code can handle them, but they are not generated at runtime because they can
+ * be reduced to cases 0, 1 and 2:
+ *
+ * case 3:
+ *   data0    data1    data2     ppl        parity
+ * +--------+--------+--------+ +----+    +--------------------+
+ * | ------ | -data- | -data- | | pp |    | data1 ^ data2 ^ pp |
+ * | ------ | -data- | -data- | | pp | -> | data1 ^ data2 ^ pp |
+ * | -data- | -data- | -data- | | -- | -> | xor all data       |
+ * | -data- | -data- | ------ | | pp |    | data0 ^ data1 ^ pp |
+ * +--------+--------+--------+ +----+    +--------------------+
+ * pp_size = chunk_size
+ *
+ * case 4:
+ *   data0    data1    data2     ppl        parity
+ * +--------+--------+--------+ +----+    +--------------------+
+ * | ------ | -data- | ------ | | pp |    | data1 ^ pp         |
+ * | ------ | ------ | ------ | | -- | -> | (no change)        |
+ * | ------ | ------ | ------ | | -- | -> | (no change)        |
+ * | -data- | ------ | ------ | | pp |    | data0 ^ pp         |
+ * +--------+--------+--------+ +----+    +--------------------+
+ * pp_size = chunk_size
+ */
+static int ppl_recover_entry(struct ppl_log *log, struct ppl_header_entry *e,
+			     sector_t ppl_sector)
+{
+	struct ppl_conf *ppl_conf = log->ppl_conf;
+	struct mddev *mddev = ppl_conf->mddev;
+	struct r5conf *conf = mddev->private;
+	int block_size = ppl_conf->block_size;
+	struct page *page1;
+	struct page *page2;
+	sector_t r_sector_first;
+	sector_t r_sector_last;
+	int strip_sectors;
+	int data_disks;
+	int i;
+	int ret = 0;
+	char b[BDEVNAME_SIZE];
+	unsigned int pp_size = le32_to_cpu(e->pp_size);
+	unsigned int data_size = le32_to_cpu(e->data_size);
+
+	page1 = alloc_page(GFP_KERNEL);
+	page2 = alloc_page(GFP_KERNEL);
+
+	if (!page1 || !page2) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	r_sector_first = le64_to_cpu(e->data_sector) * (block_size >> 9);
+
+	if ((pp_size >> 9) < conf->chunk_sectors) {
+		if (pp_size > 0) {
+			data_disks = data_size / pp_size;
+			strip_sectors = pp_size >> 9;
+		} else {
+			data_disks = conf->raid_disks - conf->max_degraded;
+			strip_sectors = (data_size >> 9) / data_disks;
+		}
+		r_sector_last = r_sector_first +
+				(data_disks - 1) * conf->chunk_sectors +
+				strip_sectors;
+	} else {
+		data_disks = conf->raid_disks - conf->max_degraded;
+		strip_sectors = conf->chunk_sectors;
+		r_sector_last = r_sector_first + (data_size >> 9);
+	}
+
+	pr_debug("%s: array sector first: %llu last: %llu\n", __func__,
+		 (unsigned long long)r_sector_first,
+		 (unsigned long long)r_sector_last);
+
+	/* if start and end are 4k aligned, use a 4k block */
+	if (block_size == 512 &&
+	    (r_sector_first & (STRIPE_SECTORS - 1)) == 0 &&
+	    (r_sector_last & (STRIPE_SECTORS - 1)) == 0)
+		block_size = STRIPE_SIZE;
+
+	/* iterate through blocks in strip */
+	for (i = 0; i < strip_sectors; i += (block_size >> 9)) {
+		bool update_parity = false;
+		sector_t parity_sector;
+		struct md_rdev *parity_rdev;
+		struct stripe_head sh;
+		int disk;
+		int indent = 0;
+
+		pr_debug("%s:%*s iter %d start\n", __func__, indent, "", i);
+		indent += 2;
+
+		memset(page_address(page1), 0, PAGE_SIZE);
+
+		/* iterate through data member disks */
+		for (disk = 0; disk < data_disks; disk++) {
+			int dd_idx;
+			struct md_rdev *rdev;
+			sector_t sector;
+			sector_t r_sector = r_sector_first + i +
+					    (disk * conf->chunk_sectors);
+
+			pr_debug("%s:%*s data member disk %d start\n",
+				 __func__, indent, "", disk);
+			indent += 2;
+
+			if (r_sector >= r_sector_last) {
+				pr_debug("%s:%*s array sector %llu doesn't need parity update\n",
+					 __func__, indent, "",
+					 (unsigned long long)r_sector);
+				indent -= 2;
+				continue;
+			}
+
+			update_parity = true;
+
+			/* map raid sector to member disk */
+			sector = raid5_compute_sector(conf, r_sector, 0,
+						      &dd_idx, NULL);
+			pr_debug("%s:%*s processing array sector %llu => data member disk %d, sector %llu\n",
+				 __func__, indent, "",
+				 (unsigned long long)r_sector, dd_idx,
+				 (unsigned long long)sector);
+
+			rdev = conf->disks[dd_idx].rdev;
+			if (!rdev) {
+				pr_debug("%s:%*s data member disk %d missing\n",
+					 __func__, indent, "", dd_idx);
+				update_parity = false;
+				break;
+			}
+
+			pr_debug("%s:%*s reading data member disk %s sector %llu\n",
+				 __func__, indent, "", bdevname(rdev->bdev, b),
+				 (unsigned long long)sector);
+			if (!sync_page_io(rdev, sector, block_size, page2,
+					REQ_OP_READ, 0, false)) {
+				md_error(mddev, rdev);
+				pr_debug("%s:%*s read failed!\n", __func__,
+					 indent, "");
+				ret = -EIO;
+				goto out;
+			}
+
+			ppl_xor(block_size, page1, page2);
+
+			indent -= 2;
+		}
+
+		if (!update_parity)
+			continue;
+
+		if (pp_size > 0) {
+			pr_debug("%s:%*s reading pp disk sector %llu\n",
+				 __func__, indent, "",
+				 (unsigned long long)(ppl_sector + i));
+			if (!sync_page_io(log->rdev,
+					ppl_sector - log->rdev->data_offset + i,
+					block_size, page2, REQ_OP_READ, 0,
+					false)) {
+				pr_debug("%s:%*s read failed!\n", __func__,
+					 indent, "");
+				md_error(mddev, log->rdev);
+				ret = -EIO;
+				goto out;
+			}
+
+			ppl_xor(block_size, page1, page2);
+		}
+
+		/* map raid sector to parity disk */
+		parity_sector = raid5_compute_sector(conf, r_sector_first + i,
+				0, &disk, &sh);
+		BUG_ON(sh.pd_idx != le32_to_cpu(e->parity_disk));
+		parity_rdev = conf->disks[sh.pd_idx].rdev;
+
+		BUG_ON(parity_rdev->bdev->bd_dev != log->rdev->bdev->bd_dev);
+		pr_debug("%s:%*s write parity at sector %llu, disk %s\n",
+			 __func__, indent, "",
+			 (unsigned long long)parity_sector,
+			 bdevname(parity_rdev->bdev, b));
+		if (!sync_page_io(parity_rdev, parity_sector, block_size,
+				page1, REQ_OP_WRITE, 0, false)) {
+			pr_debug("%s:%*s parity write error!\n", __func__,
+				 indent, "");
+			md_error(mddev, parity_rdev);
+			ret = -EIO;
+			goto out;
+		}
+	}
+out:
+	if (page1)
+		__free_page(page1);
+	if (page2)
+		__free_page(page2);
+	return ret;
+}
+
+static int ppl_recover(struct ppl_log *log, struct ppl_header *pplhdr)
+{
+	struct ppl_conf *ppl_conf = log->ppl_conf;
+	struct md_rdev *rdev = log->rdev;
+	struct mddev *mddev = rdev->mddev;
+	sector_t ppl_sector = rdev->ppl.sector + (PPL_HEADER_SIZE >> 9);
+	struct page *page;
+	int i;
+	int ret = 0;
+
+	page = alloc_page(GFP_KERNEL);
+	if (!page)
+		return -ENOMEM;
+
+	/* iterate through all PPL entries saved */
+	for (i = 0; i < le32_to_cpu(pplhdr->entries_count); i++) {
+		struct ppl_header_entry *e = &pplhdr->entries[i];
+		u32 pp_size = le32_to_cpu(e->pp_size);
+		sector_t sector = ppl_sector;
+		int ppl_entry_sectors = pp_size >> 9;
+		u32 crc, crc_stored;
+
+		pr_debug("%s: disk: %d entry: %d ppl_sector: %llu pp_size: %u\n",
+			 __func__, rdev->raid_disk, i,
+			 (unsigned long long)ppl_sector, pp_size);
+
+		crc = ~0;
+		crc_stored = le32_to_cpu(e->checksum);
+
+		/* read partial parity for this entry and calculate its checksum */
+		while (pp_size) {
+			int s = pp_size > PAGE_SIZE ? PAGE_SIZE : pp_size;
+
+			if (!sync_page_io(rdev, sector - rdev->data_offset,
+					s, page, REQ_OP_READ, 0, false)) {
+				md_error(mddev, rdev);
+				ret = -EIO;
+				goto out;
+			}
+
+			crc = crc32c_le(crc, page_address(page), s);
+
+			pp_size -= s;
+			sector += s >> 9;
+		}
+
+		crc = ~crc;
+
+		if (crc != crc_stored) {
+			/*
+			 * Don't recover this entry if the checksum does not
+			 * match, but keep going and try to recover other
+			 * entries.
+			 */
+			pr_debug("%s: ppl entry crc does not match: stored: 0x%x calculated: 0x%x\n",
+				 __func__, crc_stored, crc);
+			ppl_conf->mismatch_count++;
+		} else {
+			ret = ppl_recover_entry(log, e, ppl_sector);
+			if (ret)
+				goto out;
+			ppl_conf->recovered_entries++;
+		}
+
+		ppl_sector += ppl_entry_sectors;
+	}
+
+	/* flush the disk cache after recovery if necessary */
+	ret = blkdev_issue_flush(rdev->bdev, GFP_KERNEL, NULL);
+out:
+	__free_page(page);
+	return ret;
+}
+
+static int ppl_write_empty_header(struct ppl_log *log)
+{
+	struct page *page;
+	struct ppl_header *pplhdr;
+	struct md_rdev *rdev = log->rdev;
+	int ret = 0;
+
+	pr_debug("%s: disk: %d ppl_sector: %llu\n", __func__,
+		 rdev->raid_disk, (unsigned long long)rdev->ppl.sector);
+
+	page = alloc_page(GFP_NOIO | __GFP_ZERO);
+	if (!page)
+		return -ENOMEM;
+
+	pplhdr = page_address(page);
+	memset(pplhdr->reserved, 0xff, PPL_HDR_RESERVED);
+	pplhdr->signature = cpu_to_le32(log->ppl_conf->signature);
+	pplhdr->checksum = cpu_to_le32(~crc32c_le(~0, pplhdr, PAGE_SIZE));
+
+	if (!sync_page_io(rdev, rdev->ppl.sector - rdev->data_offset,
+			  PPL_HEADER_SIZE, page, REQ_OP_WRITE | REQ_FUA, 0,
+			  false)) {
+		md_error(rdev->mddev, rdev);
+		ret = -EIO;
+	}
+
+	__free_page(page);
+	return ret;
+}
+
+static int ppl_load_distributed(struct ppl_log *log)
+{
+	struct ppl_conf *ppl_conf = log->ppl_conf;
+	struct md_rdev *rdev = log->rdev;
+	struct mddev *mddev = rdev->mddev;
+	struct page *page;
+	struct ppl_header *pplhdr;
+	u32 crc, crc_stored;
+	u32 signature;
+	int ret = 0;
+
+	pr_debug("%s: disk: %d\n", __func__, rdev->raid_disk);
+
+	/* read PPL header */
+	page = alloc_page(GFP_KERNEL);
+	if (!page)
+		return -ENOMEM;
+
+	if (!sync_page_io(rdev, rdev->ppl.sector - rdev->data_offset,
+			  PAGE_SIZE, page, REQ_OP_READ, 0, false)) {
+		md_error(mddev, rdev);
+		ret = -EIO;
+		goto out;
+	}
+	pplhdr = page_address(page);
+
+	/* check header validity */
+	crc_stored = le32_to_cpu(pplhdr->checksum);
+	pplhdr->checksum = 0;
+	crc = ~crc32c_le(~0, pplhdr, PAGE_SIZE);
+
+	if (crc_stored != crc) {
+		pr_debug("%s: ppl header crc does not match: stored: 0x%x calculated: 0x%x\n",
+			 __func__, crc_stored, crc);
+		ppl_conf->mismatch_count++;
+		goto out;
+	}
+
+	signature = le32_to_cpu(pplhdr->signature);
+
+	if (mddev->external) {
+		/*
+		 * For external metadata the header signature is set and
+		 * validated in userspace.
+		 */
+		ppl_conf->signature = signature;
+	} else if (ppl_conf->signature != signature) {
+		pr_debug("%s: ppl header signature does not match: stored: 0x%x configured: 0x%x\n",
+			 __func__, signature, ppl_conf->signature);
+		ppl_conf->mismatch_count++;
+		goto out;
+	}
+
+	/* attempt to recover from log if we are starting a dirty array */
+	if (!mddev->pers && mddev->recovery_cp != MaxSector)
+		ret = ppl_recover(log, pplhdr);
+out:
+	/* write empty header if we are starting the array */
+	if (!ret && !mddev->pers)
+		ret = ppl_write_empty_header(log);
+
+	__free_page(page);
+
+	pr_debug("%s: return: %d mismatch_count: %d recovered_entries: %d\n",
+		 __func__, ret, ppl_conf->mismatch_count,
+		 ppl_conf->recovered_entries);
+	return ret;
+}
+
+static int ppl_load(struct ppl_conf *ppl_conf)
+{
+	int ret = 0;
+	u32 signature = 0;
+	bool signature_set = false;
+	int i;
+
+	for (i = 0; i < ppl_conf->count; i++) {
+		struct ppl_log *log = &ppl_conf->child_logs[i];
+
+		/* skip missing drive */
+		if (!log->rdev)
+			continue;
+
+		ret = ppl_load_distributed(log);
+		if (ret)
+			break;
+
+		/*
+		 * For external metadata we can't check if the signature is
+		 * correct on a single drive, but we can check if it is the same
+		 * on all drives.
+		 */
+		if (ppl_conf->mddev->external) {
+			if (!signature_set) {
+				signature = ppl_conf->signature;
+				signature_set = true;
+			} else if (signature != ppl_conf->signature) {
+				pr_warn("md/raid:%s: PPL header signature does not match on all member drives\n",
+					mdname(ppl_conf->mddev));
+				ret = -EINVAL;
+				break;
+			}
+		}
+	}
+
+	pr_debug("%s: return: %d mismatch_count: %d recovered_entries: %d\n",
+		 __func__, ret, ppl_conf->mismatch_count,
+		 ppl_conf->recovered_entries);
+	return ret;
+}
+
 static void __ppl_exit_log(struct ppl_conf *ppl_conf)
 {
 	clear_bit(MD_HAS_PPL, &ppl_conf->mddev->flags);
@@ -694,6 +1166,23 @@ int ppl_init_log(struct r5conf *conf)
 		pr_warn("md/raid:%s: Volatile write-back cache should be disabled on all member drives when using PPL!\n",
 			mdname(mddev));
 
+	/* load and possibly recover the logs from the member disks */
+	ret = ppl_load(ppl_conf);
+
+	if (ret) {
+		goto err;
+	} else if (!mddev->pers &&
+		   mddev->recovery_cp == 0 && !mddev->degraded &&
+		   ppl_conf->recovered_entries > 0 &&
+		   ppl_conf->mismatch_count == 0) {
+		/*
+		 * If we are starting a dirty array and the recovery succeeds
+		 * without any issues, set the array as clean.
+		 */
+		mddev->recovery_cp = MaxSector;
+		set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
+	}
+
 	conf->log_private = ppl_conf;
 
 	return 0;
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 6b86e0826afe..78ed5748d33d 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -7357,7 +7357,10 @@ static int raid5_run(struct mddev *mddev)
 
 	if (mddev->degraded > dirty_parity_disks &&
 	    mddev->recovery_cp != MaxSector) {
-		if (mddev->ok_start_degraded)
+		if (test_bit(MD_HAS_PPL, &mddev->flags))
+			pr_crit("md/raid:%s: starting dirty degraded array with PPL.\n",
+				mdname(mddev));
+		else if (mddev->ok_start_degraded)
 			pr_crit("md/raid:%s: starting dirty degraded array - data corruption possible.\n",
 				mdname(mddev));
 		else {
-- 
cgit v1.2.3


From 6358c239d88c751a9f14152a8d4ad2b69f5be48f Mon Sep 17 00:00:00 2001
From: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
Date: Thu, 9 Mar 2017 10:00:02 +0100
Subject: raid5-ppl: support disk hot add/remove with PPL

Add a function to modify the log by removing an rdev when a drive fails
or adding when a spare/replacement is activated as a raid member.

Removing a disk just clears the child log rdev pointer. No new stripes
will be accepted for this child log in ppl_write_stripe() and running io
units will be processed without writing PPL to the device.

Adding a disk sets the child log rdev pointer and writes an empty PPL
header.

Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/raid5-log.h |  9 +++++++++
 drivers/md/raid5-ppl.c | 45 ++++++++++++++++++++++++++++++++++++++++++++-
 drivers/md/raid5.c     | 12 +++++++++++-
 3 files changed, 64 insertions(+), 2 deletions(-)

(limited to 'drivers/md/raid5.c')

diff --git a/drivers/md/raid5-log.h b/drivers/md/raid5-log.h
index a67fb58513b9..4f5a0f4e0b1f 100644
--- a/drivers/md/raid5-log.h
+++ b/drivers/md/raid5-log.h
@@ -39,6 +39,7 @@ extern void ppl_exit_log(struct r5conf *conf);
 extern int ppl_write_stripe(struct r5conf *conf, struct stripe_head *sh);
 extern void ppl_write_stripe_run(struct r5conf *conf);
 extern void ppl_stripe_write_finished(struct stripe_head *sh);
+extern int ppl_modify_log(struct r5conf *conf, struct md_rdev *rdev, bool add);
 
 static inline bool raid5_has_ppl(struct r5conf *conf)
 {
@@ -102,4 +103,12 @@ static inline int log_init(struct r5conf *conf, struct md_rdev *journal_dev)
 	return 0;
 }
 
+static inline int log_modify(struct r5conf *conf, struct md_rdev *rdev, bool add)
+{
+	if (raid5_has_ppl(conf))
+		return ppl_modify_log(conf, rdev, add);
+
+	return 0;
+}
+
 #endif
diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c
index d336c024eef9..4af420f4d8c0 100644
--- a/drivers/md/raid5-ppl.c
+++ b/drivers/md/raid5-ppl.c
@@ -400,6 +400,13 @@ static void ppl_submit_iounit(struct ppl_io_unit *io)
 	struct stripe_head *sh;
 	int i;
 
+	bio->bi_private = io;
+
+	if (!log->rdev || test_bit(Faulty, &log->rdev->flags)) {
+		ppl_log_endio(bio);
+		return;
+	}
+
 	for (i = 0; i < io->entries_count; i++) {
 		struct ppl_header_entry *e = &pplhdr->entries[i];
 
@@ -415,7 +422,6 @@ static void ppl_submit_iounit(struct ppl_io_unit *io)
 	pplhdr->entries_count = cpu_to_le32(io->entries_count);
 	pplhdr->checksum = cpu_to_le32(~crc32c_le(~0, pplhdr, PPL_HEADER_SIZE));
 
-	bio->bi_private = io;
 	bio->bi_end_io = ppl_log_endio;
 	bio->bi_opf = REQ_OP_WRITE | REQ_FUA;
 	bio->bi_bdev = log->rdev->bdev;
@@ -1190,3 +1196,40 @@ err:
 	__ppl_exit_log(ppl_conf);
 	return ret;
 }
+
+int ppl_modify_log(struct r5conf *conf, struct md_rdev *rdev, bool add)
+{
+	struct ppl_conf *ppl_conf = conf->log_private;
+	struct ppl_log *log;
+	int ret = 0;
+	char b[BDEVNAME_SIZE];
+
+	if (!rdev)
+		return -EINVAL;
+
+	pr_debug("%s: disk: %d operation: %s dev: %s\n",
+		 __func__, rdev->raid_disk, add ? "add" : "remove",
+		 bdevname(rdev->bdev, b));
+
+	if (rdev->raid_disk < 0)
+		return 0;
+
+	if (rdev->raid_disk >= ppl_conf->count)
+		return -ENODEV;
+
+	log = &ppl_conf->child_logs[rdev->raid_disk];
+
+	mutex_lock(&log->io_mutex);
+	if (add) {
+		ret = ppl_validate_rdev(rdev);
+		if (!ret) {
+			log->rdev = rdev;
+			ret = ppl_write_empty_header(log);
+		}
+	} else {
+		log->rdev = NULL;
+	}
+	mutex_unlock(&log->io_mutex);
+
+	return ret;
+}
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 78ed5748d33d..6760af251864 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -7648,6 +7648,11 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
 			*rdevp = rdev;
 		}
 	}
+	if (!err) {
+		err = log_modify(conf, rdev, false);
+		if (err)
+			goto abort;
+	}
 	if (p->replacement) {
 		/* We must have just cleared 'rdev' */
 		p->rdev = p->replacement;
@@ -7657,6 +7662,9 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
 			   */
 		p->replacement = NULL;
 		clear_bit(WantReplacement, &rdev->flags);
+
+		if (!err)
+			err = log_modify(conf, p->rdev, true);
 	} else
 		/* We might have just removed the Replacement as faulty-
 		 * clear the bit just in case
@@ -7713,10 +7721,12 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 		if (p->rdev == NULL) {
 			clear_bit(In_sync, &rdev->flags);
 			rdev->raid_disk = disk;
-			err = 0;
 			if (rdev->saved_raid_disk != disk)
 				conf->fullsync = 1;
 			rcu_assign_pointer(p->rdev, rdev);
+
+			err = log_modify(conf, rdev, true);
+
 			goto out;
 		}
 	}
-- 
cgit v1.2.3


From ba903a3ea465bd2f2bb9316054b295e79a7a518e Mon Sep 17 00:00:00 2001
From: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
Date: Thu, 9 Mar 2017 10:00:03 +0100
Subject: raid5-ppl: runtime PPL enabling or disabling

Allow writing to 'consistency_policy' attribute when the array is
active. Add a new function 'change_consistency_policy' to the
md_personality operations structure to handle the change in the
personality code. Values "ppl" and "resync" are accepted and
turn PPL on and off respectively.

When enabling PPL its location and size should first be set using
'ppl_sector' and 'ppl_size' attributes and a valid PPL header should be
written at this location on each member device.

Enabling or disabling PPL is performed under a suspended array.  The
raid5_reset_stripe_cache function frees the stripe cache and allocates
it again in order to allocate or free the ppl_pages for the stripes in
the stripe cache.

Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/md.c        | 12 +++++++++---
 drivers/md/md.h        |  2 ++
 drivers/md/raid5-ppl.c |  4 ++++
 drivers/md/raid5.c     | 53 ++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 68 insertions(+), 3 deletions(-)

(limited to 'drivers/md/raid5.c')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index a7740306cbbd..af9118711228 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -4996,14 +4996,20 @@ consistency_policy_show(struct mddev *mddev, char *page)
 static ssize_t
 consistency_policy_store(struct mddev *mddev, const char *buf, size_t len)
 {
+	int err = 0;
+
 	if (mddev->pers) {
-		return -EBUSY;
+		if (mddev->pers->change_consistency_policy)
+			err = mddev->pers->change_consistency_policy(mddev, buf);
+		else
+			err = -EBUSY;
 	} else if (mddev->external && strncmp(buf, "ppl", 3) == 0) {
 		set_bit(MD_HAS_PPL, &mddev->flags);
-		return len;
 	} else {
-		return -EINVAL;
+		err = -EINVAL;
 	}
+
+	return err ? err : len;
 }
 
 static struct md_sysfs_entry md_consistency_policy =
diff --git a/drivers/md/md.h b/drivers/md/md.h
index a7b2f16452c4..e0940064c3ec 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -545,6 +545,8 @@ struct md_personality
 	/* congested implements bdi.congested_fn().
 	 * Will not be called while array is 'suspended' */
 	int (*congested)(struct mddev *mddev, int bits);
+	/* Changes the consistency policy of an active array. */
+	int (*change_consistency_policy)(struct mddev *mddev, const char *buf);
 };
 
 struct md_sysfs_entry {
diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c
index 4af420f4d8c0..27bad3e2d7ce 100644
--- a/drivers/md/raid5-ppl.c
+++ b/drivers/md/raid5-ppl.c
@@ -1187,6 +1187,10 @@ int ppl_init_log(struct r5conf *conf)
 		 */
 		mddev->recovery_cp = MaxSector;
 		set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
+	} else if (mddev->pers && ppl_conf->mismatch_count > 0) {
+		/* no mismatch allowed when enabling PPL for a running array */
+		ret = -EINVAL;
+		goto err;
 	}
 
 	conf->log_private = ppl_conf;
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 6760af251864..88cc8981bd49 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -8334,6 +8334,58 @@ static void *raid6_takeover(struct mddev *mddev)
 	return setup_conf(mddev);
 }
 
+static void raid5_reset_stripe_cache(struct mddev *mddev)
+{
+	struct r5conf *conf = mddev->private;
+
+	mutex_lock(&conf->cache_size_mutex);
+	while (conf->max_nr_stripes &&
+	       drop_one_stripe(conf))
+		;
+	while (conf->min_nr_stripes > conf->max_nr_stripes &&
+	       grow_one_stripe(conf, GFP_KERNEL))
+		;
+	mutex_unlock(&conf->cache_size_mutex);
+}
+
+static int raid5_change_consistency_policy(struct mddev *mddev, const char *buf)
+{
+	struct r5conf *conf;
+	int err;
+
+	err = mddev_lock(mddev);
+	if (err)
+		return err;
+	conf = mddev->private;
+	if (!conf) {
+		mddev_unlock(mddev);
+		return -ENODEV;
+	}
+
+	if (strncmp(buf, "ppl", 3) == 0 && !raid5_has_ppl(conf)) {
+		mddev_suspend(mddev);
+		set_bit(MD_HAS_PPL, &mddev->flags);
+		err = log_init(conf, NULL);
+		if (!err)
+			raid5_reset_stripe_cache(mddev);
+		mddev_resume(mddev);
+	} else if (strncmp(buf, "resync", 6) == 0 && raid5_has_ppl(conf)) {
+		mddev_suspend(mddev);
+		log_exit(conf);
+		raid5_reset_stripe_cache(mddev);
+		mddev_resume(mddev);
+	} else {
+		err = -EINVAL;
+	}
+
+	if (!err)
+		md_update_sb(mddev, 1);
+
+	mddev_unlock(mddev);
+
+	return err;
+}
+
 static struct md_personality raid6_personality =
 {
 	.name		= "raid6",
@@ -8379,6 +8431,7 @@ static struct md_personality raid5_personality =
 	.quiesce	= raid5_quiesce,
 	.takeover	= raid5_takeover,
 	.congested	= raid5_congested,
+	.change_consistency_policy = raid5_change_consistency_policy,
 };
 
 static struct md_personality raid4_personality =
-- 
cgit v1.2.3


From 497280509f32340d90feac030bce18006a3e3605 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.com>
Date: Wed, 15 Mar 2017 14:05:12 +1100
Subject: md/raid5: use md_write_start to count stripes, not bios

We use md_write_start() to increase the count of pending writes, and
md_write_end() to decrement the count.  We currently count bios
submitted to md/raid5.  Change it to count stripe_heads that a WRITE bio
has been attached to.

So now, raid5_make_request() calls md_write_start() and then
md_write_end() to keep the count elevated during the setup of the
request.

add_stripe_bio() calls md_write_start() for each stripe_head, and the
completion routines always call md_write_end(), instead of only
calling it when raid5_dec_bi_active_stripes() returns 0.
make_discard_request also calls md_write_start/end().

The parallel between md_write_{start,end} and use of bi_phys_segments
can be seen in that:
 Whenever we set bi_phys_segments to 1, we now call md_write_start.
 Whenever we increment it on non-read requests with
   raid5_inc_bi_active_stripes(), we now call md_write_start().
 Whenever we decrement bi_phys_segments on non-read requests with
    raid5_dec_bi_active_stripes(), we now call md_write_end().

This reduces our dependence on keeping a per-bio count of active
stripes in bi_phys_segments.

md_write_inc() is added which parallels md_write_start(), but requires
that a write has already been started, and is certain never to sleep.
This can be used inside a spinlocked region when adding to a write
request.

Signed-off-by: NeilBrown <neilb@suse.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/md.c          | 17 +++++++++++++++++
 drivers/md/md.h          |  1 +
 drivers/md/raid5-cache.c |  2 +-
 drivers/md/raid5.c       | 27 +++++++++++++--------------
 4 files changed, 32 insertions(+), 15 deletions(-)

(limited to 'drivers/md/raid5.c')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 42e68b2e0b41..41f766ab824a 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -7907,6 +7907,23 @@ void md_write_start(struct mddev *mddev, struct bio *bi)
 }
 EXPORT_SYMBOL(md_write_start);
 
+/* md_write_inc can only be called when md_write_start() has
+ * already been called at least once for the current request.
+ * It increments the counter and is useful when a single request
+ * is split into several parts.  Each part causes an increment and
+ * so needs a matching md_write_end().
+ * Unlike md_write_start(), it is safe to call md_write_inc() inside
+ * a spinlocked region.
+ */
+void md_write_inc(struct mddev *mddev, struct bio *bi)
+{
+	if (bio_data_dir(bi) != WRITE)
+		return;
+	WARN_ON_ONCE(mddev->in_sync || mddev->ro);
+	atomic_inc(&mddev->writes_pending);
+}
+EXPORT_SYMBOL(md_write_inc);
+
 void md_write_end(struct mddev *mddev)
 {
 	if (atomic_dec_and_test(&mddev->writes_pending)) {
diff --git a/drivers/md/md.h b/drivers/md/md.h
index e0940064c3ec..0cd12721a536 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -648,6 +648,7 @@ extern void md_wakeup_thread(struct md_thread *thread);
 extern void md_check_recovery(struct mddev *mddev);
 extern void md_reap_sync_thread(struct mddev *mddev);
 extern void md_write_start(struct mddev *mddev, struct bio *bi);
+extern void md_write_inc(struct mddev *mddev, struct bio *bi);
 extern void md_write_end(struct mddev *mddev);
 extern void md_done_sync(struct mddev *mddev, int blocks, int ok);
 extern void md_error(struct mddev *mddev, struct md_rdev *rdev);
diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 64493132470b..f5034ecb4e94 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -318,8 +318,8 @@ r5c_return_dev_pending_writes(struct r5conf *conf, struct r5dev *dev,
 	while (wbi && wbi->bi_iter.bi_sector <
 	       dev->sector + STRIPE_SECTORS) {
 		wbi2 = r5_next_bio(wbi, dev->sector);
+		md_write_end(conf->mddev);
 		if (!raid5_dec_bi_active_stripes(wbi)) {
-			md_write_end(conf->mddev);
 			bio_list_add(return_bi, wbi);
 		}
 		wbi = wbi2;
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 88cc8981bd49..a684003fc965 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -3274,6 +3274,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx,
 		bi->bi_next = *bip;
 	*bip = bi;
 	raid5_inc_bi_active_stripes(bi);
+	md_write_inc(conf->mddev, bi);
 
 	if (forwrite) {
 		/* check if page is covered */
@@ -3397,10 +3398,9 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
 			struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
 
 			bi->bi_error = -EIO;
-			if (!raid5_dec_bi_active_stripes(bi)) {
-				md_write_end(conf->mddev);
+			md_write_end(conf->mddev);
+			if (!raid5_dec_bi_active_stripes(bi))
 				bio_list_add(return_bi, bi);
-			}
 			bi = nextbi;
 		}
 		if (bitmap_end)
@@ -3421,10 +3421,9 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
 			struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
 
 			bi->bi_error = -EIO;
-			if (!raid5_dec_bi_active_stripes(bi)) {
-				md_write_end(conf->mddev);
+			md_write_end(conf->mddev);
+			if (!raid5_dec_bi_active_stripes(bi))
 				bio_list_add(return_bi, bi);
-			}
 			bi = bi2;
 		}
 
@@ -3781,10 +3780,9 @@ returnbi:
 				while (wbi && wbi->bi_iter.bi_sector <
 					dev->sector + STRIPE_SECTORS) {
 					wbi2 = r5_next_bio(wbi, dev->sector);
-					if (!raid5_dec_bi_active_stripes(wbi)) {
-						md_write_end(conf->mddev);
+					md_write_end(conf->mddev);
+					if (!raid5_dec_bi_active_stripes(wbi))
 						bio_list_add(return_bi, wbi);
-					}
 					wbi = wbi2;
 				}
 				bitmap_endwrite(conf->mddev->bitmap, sh->sector,
@@ -5487,6 +5485,7 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
 
 	bi->bi_next = NULL;
 	bi->bi_phys_segments = 1; /* over-loaded to count active stripes */
+	md_write_start(mddev, bi);
 
 	stripe_sectors = conf->chunk_sectors *
 		(conf->raid_disks - conf->max_degraded);
@@ -5533,6 +5532,7 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
 			sh->dev[d].towrite = bi;
 			set_bit(R5_OVERWRITE, &sh->dev[d].flags);
 			raid5_inc_bi_active_stripes(bi);
+			md_write_inc(mddev, bi);
 			sh->overwrite_disks++;
 		}
 		spin_unlock_irq(&sh->stripe_lock);
@@ -5555,9 +5555,9 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
 		release_stripe_plug(mddev, sh);
 	}
 
+	md_write_end(mddev);
 	remaining = raid5_dec_bi_active_stripes(bi);
 	if (remaining == 0) {
-		md_write_end(mddev);
 		bio_endio(bi);
 	}
 }
@@ -5592,8 +5592,6 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
 		do_flush = bi->bi_opf & REQ_PREFLUSH;
 	}
 
-	md_write_start(mddev, bi);
-
 	/*
 	 * If array is degraded, better not do chunk aligned read because
 	 * later we might have to read it again in order to reconstruct
@@ -5615,6 +5613,7 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
 	last_sector = bio_end_sector(bi);
 	bi->bi_next = NULL;
 	bi->bi_phys_segments = 1;	/* over-loaded to count active stripes */
+	md_write_start(mddev, bi);
 
 	prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
 	for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
@@ -5749,11 +5748,11 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
 	}
 	finish_wait(&conf->wait_for_overlap, &w);
 
+	if (rw == WRITE)
+		md_write_end(mddev);
 	remaining = raid5_dec_bi_active_stripes(bi);
 	if (remaining == 0) {
 
-		if ( rw == WRITE )
-			md_write_end(mddev);
 
 		trace_block_bio_complete(bdev_get_queue(bi->bi_bdev),
 					 bi, 0);
-- 
cgit v1.2.3


From 16d997b78b157315f5c90fcbc2f9ce575cb3879f Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.com>
Date: Wed, 15 Mar 2017 14:05:12 +1100
Subject: md/raid5: simplify delaying of writes while metadata is updated.

If a device fails during a write, we must ensure the failure is
recorded in the metadata before the completion of the write is
acknowledged.

Commit c3cce6cda162 ("md/raid5: ensure device failure recorded before
write request returns.")  added code for this, but it was
unnecessarily complicated.  We already had similar functionality for
handling updates to the bad-block-list, thanks to Commit de393cdea66c
("md: make it easier to wait for bad blocks to be acknowledged.")

So revert most of the former commit, and instead avoid collecting
completed writes if MD_SB_CHANGE_PENDING is set.  raid5d() will then
flush the metadata and retry the stripe_head.
As this change can leave a stripe_head ready for handling immediately
after handle_active_stripes() returns, we change raid5_do_work() to
pause while MD_SB_CHANGE_PENDING is set, so that it doesn't spin.

We check MD_SB_CHANGE_PENDING *after* analyse_stripe() as it could be
set asynchronously.  After analyse_stripe(), we have collected stable
data about the state of devices, which will be used to make decisions.

Signed-off-by: NeilBrown <neilb@suse.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/raid5.c | 31 ++++++++-----------------------
 drivers/md/raid5.h |  3 ---
 2 files changed, 8 insertions(+), 26 deletions(-)

(limited to 'drivers/md/raid5.c')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index a684003fc965..a2c9ddc35335 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -4691,7 +4691,8 @@ static void handle_stripe(struct stripe_head *sh)
 	if (test_bit(STRIPE_LOG_TRAPPED, &sh->state))
 		goto finish;
 
-	if (s.handle_bad_blocks) {
+	if (s.handle_bad_blocks ||
+	    test_bit(MD_SB_CHANGE_PENDING, &conf->mddev->sb_flags)) {
 		set_bit(STRIPE_HANDLE, &sh->state);
 		goto finish;
 	}
@@ -5021,15 +5022,8 @@ finish:
 			md_wakeup_thread(conf->mddev->thread);
 	}
 
-	if (!bio_list_empty(&s.return_bi)) {
-		if (test_bit(MD_SB_CHANGE_PENDING, &conf->mddev->sb_flags)) {
-			spin_lock_irq(&conf->device_lock);
-			bio_list_merge(&conf->return_bi, &s.return_bi);
-			spin_unlock_irq(&conf->device_lock);
-			md_wakeup_thread(conf->mddev->thread);
-		} else
-			return_io(&s.return_bi);
-	}
+	if (!bio_list_empty(&s.return_bi))
+		return_io(&s.return_bi);
 
 	clear_bit_unlock(STRIPE_ACTIVE, &sh->state);
 }
@@ -6226,6 +6220,7 @@ static void raid5_do_work(struct work_struct *work)
 	struct r5worker *worker = container_of(work, struct r5worker, work);
 	struct r5worker_group *group = worker->group;
 	struct r5conf *conf = group->conf;
+	struct mddev *mddev = conf->mddev;
 	int group_id = group - conf->worker_groups;
 	int handled;
 	struct blk_plug plug;
@@ -6246,6 +6241,9 @@ static void raid5_do_work(struct work_struct *work)
 		if (!batch_size && !released)
 			break;
 		handled += batch_size;
+		wait_event_lock_irq(mddev->sb_wait,
+			!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags),
+			conf->device_lock);
 	}
 	pr_debug("%d stripes handled\n", handled);
 
@@ -6273,18 +6271,6 @@ static void raid5d(struct md_thread *thread)
 
 	md_check_recovery(mddev);
 
-	if (!bio_list_empty(&conf->return_bi) &&
-	    !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
-		struct bio_list tmp = BIO_EMPTY_LIST;
-		spin_lock_irq(&conf->device_lock);
-		if (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
-			bio_list_merge(&tmp, &conf->return_bi);
-			bio_list_init(&conf->return_bi);
-		}
-		spin_unlock_irq(&conf->device_lock);
-		return_io(&tmp);
-	}
-
 	blk_start_plug(&plug);
 	handled = 0;
 	spin_lock_irq(&conf->device_lock);
@@ -6936,7 +6922,6 @@ static struct r5conf *setup_conf(struct mddev *mddev)
 	INIT_LIST_HEAD(&conf->hold_list);
 	INIT_LIST_HEAD(&conf->delayed_list);
 	INIT_LIST_HEAD(&conf->bitmap_list);
-	bio_list_init(&conf->return_bi);
 	init_llist_head(&conf->released_stripes);
 	atomic_set(&conf->active_stripes, 0);
 	atomic_set(&conf->preread_active_stripes, 0);
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index ba5b7a3790af..13800dc9dd88 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -638,9 +638,6 @@ struct r5conf {
 	int			skip_copy; /* Don't copy data from bio to stripe cache */
 	struct list_head	*last_hold; /* detect hold_list promotions */
 
-	/* bios to have bi_end_io called after metadata is synced */
-	struct bio_list		return_bi;
-
 	atomic_t		reshape_stripes; /* stripes with pending writes for reshape */
 	/* unfortunately we need two cache names as we temporarily have
 	 * two caches.
-- 
cgit v1.2.3


From bd83d0a28c68bacba88a3193a1bd6a083bb8d9f5 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.com>
Date: Wed, 15 Mar 2017 14:05:12 +1100
Subject: md/raid5: call bio_endio() directly rather than queueing for later.

We currently gather bios that need to be returned into a bio_list
and call bio_endio() on them all together.
The original reason for this was to avoid making the calls while
holding a spinlock.
Locking has changed a lot since then, and that reason is no longer
valid.

So discard return_io() and various return_bi lists, and just call
bio_endio() directly as needed.

Signed-off-by: NeilBrown <neilb@suse.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/raid5-cache.c | 13 +++++--------
 drivers/md/raid5-log.h   |  2 +-
 drivers/md/raid5.c       | 38 ++++++++++----------------------------
 drivers/md/raid5.h       |  1 -
 4 files changed, 16 insertions(+), 38 deletions(-)

(limited to 'drivers/md/raid5.c')

diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index f5034ecb4e94..5be8dbc5d91b 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -308,8 +308,7 @@ static void __r5l_set_io_unit_state(struct r5l_io_unit *io,
 }
 
 static void
-r5c_return_dev_pending_writes(struct r5conf *conf, struct r5dev *dev,
-			      struct bio_list *return_bi)
+r5c_return_dev_pending_writes(struct r5conf *conf, struct r5dev *dev)
 {
 	struct bio *wbi, *wbi2;
 
@@ -319,23 +318,21 @@ r5c_return_dev_pending_writes(struct r5conf *conf, struct r5dev *dev,
 	       dev->sector + STRIPE_SECTORS) {
 		wbi2 = r5_next_bio(wbi, dev->sector);
 		md_write_end(conf->mddev);
-		if (!raid5_dec_bi_active_stripes(wbi)) {
-			bio_list_add(return_bi, wbi);
-		}
+		if (!raid5_dec_bi_active_stripes(wbi))
+			bio_endio(wbi);
 		wbi = wbi2;
 	}
 }
 
 void r5c_handle_cached_data_endio(struct r5conf *conf,
-	  struct stripe_head *sh, int disks, struct bio_list *return_bi)
+				  struct stripe_head *sh, int disks)
 {
 	int i;
 
 	for (i = sh->disks; i--; ) {
 		if (sh->dev[i].written) {
 			set_bit(R5_UPTODATE, &sh->dev[i].flags);
-			r5c_return_dev_pending_writes(conf, &sh->dev[i],
-						      return_bi);
+			r5c_return_dev_pending_writes(conf, &sh->dev[i]);
 			bitmap_endwrite(conf->mddev->bitmap, sh->sector,
 					STRIPE_SECTORS,
 					!test_bit(STRIPE_DEGRADED, &sh->state),
diff --git a/drivers/md/raid5-log.h b/drivers/md/raid5-log.h
index 4f5a0f4e0b1f..738930ff5d17 100644
--- a/drivers/md/raid5-log.h
+++ b/drivers/md/raid5-log.h
@@ -21,7 +21,7 @@ extern void r5c_release_extra_page(struct stripe_head *sh);
 extern void r5c_use_extra_page(struct stripe_head *sh);
 extern void r5l_wake_reclaim(struct r5l_log *log, sector_t space);
 extern void r5c_handle_cached_data_endio(struct r5conf *conf,
-	struct stripe_head *sh, int disks, struct bio_list *return_bi);
+	struct stripe_head *sh, int disks);
 extern int r5c_cache_data(struct r5l_log *log, struct stripe_head *sh);
 extern void r5c_make_stripe_write_out(struct stripe_head *sh);
 extern void r5c_flush_cache(struct r5conf *conf, int num);
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index a2c9ddc35335..44c8ceba13fe 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -158,17 +158,6 @@ static int raid6_idx_to_slot(int idx, struct stripe_head *sh,
 	return slot;
 }
 
-static void return_io(struct bio_list *return_bi)
-{
-	struct bio *bi;
-	while ((bi = bio_list_pop(return_bi)) != NULL) {
-		bi->bi_iter.bi_size = 0;
-		trace_block_bio_complete(bdev_get_queue(bi->bi_bdev),
-					 bi, 0);
-		bio_endio(bi);
-	}
-}
-
 static void print_raid5_conf (struct r5conf *conf);
 
 static int stripe_operations_active(struct stripe_head *sh)
@@ -1310,7 +1299,6 @@ async_copy_data(int frombio, struct bio *bio, struct page **page,
 static void ops_complete_biofill(void *stripe_head_ref)
 {
 	struct stripe_head *sh = stripe_head_ref;
-	struct bio_list return_bi = BIO_EMPTY_LIST;
 	int i;
 
 	pr_debug("%s: stripe %llu\n", __func__,
@@ -1335,15 +1323,13 @@ static void ops_complete_biofill(void *stripe_head_ref)
 				dev->sector + STRIPE_SECTORS) {
 				rbi2 = r5_next_bio(rbi, dev->sector);
 				if (!raid5_dec_bi_active_stripes(rbi))
-					bio_list_add(&return_bi, rbi);
+					bio_endio(rbi);
 				rbi = rbi2;
 			}
 		}
 	}
 	clear_bit(STRIPE_BIOFILL_RUN, &sh->state);
 
-	return_io(&return_bi);
-
 	set_bit(STRIPE_HANDLE, &sh->state);
 	raid5_release_stripe(sh);
 }
@@ -3351,8 +3337,7 @@ static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous,
 
 static void
 handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
-				struct stripe_head_state *s, int disks,
-				struct bio_list *return_bi)
+		     struct stripe_head_state *s, int disks)
 {
 	int i;
 	BUG_ON(sh->batch_head);
@@ -3400,7 +3385,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
 			bi->bi_error = -EIO;
 			md_write_end(conf->mddev);
 			if (!raid5_dec_bi_active_stripes(bi))
-				bio_list_add(return_bi, bi);
+				bio_endio(bi);
 			bi = nextbi;
 		}
 		if (bitmap_end)
@@ -3423,7 +3408,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
 			bi->bi_error = -EIO;
 			md_write_end(conf->mddev);
 			if (!raid5_dec_bi_active_stripes(bi))
-				bio_list_add(return_bi, bi);
+				bio_endio(bi);
 			bi = bi2;
 		}
 
@@ -3449,7 +3434,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
 
 				bi->bi_error = -EIO;
 				if (!raid5_dec_bi_active_stripes(bi))
-					bio_list_add(return_bi, bi);
+					bio_endio(bi);
 				bi = nextbi;
 			}
 		}
@@ -3748,7 +3733,7 @@ static void break_stripe_batch_list(struct stripe_head *head_sh,
  * never LOCKED, so we don't need to test 'failed' directly.
  */
 static void handle_stripe_clean_event(struct r5conf *conf,
-	struct stripe_head *sh, int disks, struct bio_list *return_bi)
+	struct stripe_head *sh, int disks)
 {
 	int i;
 	struct r5dev *dev;
@@ -3782,7 +3767,7 @@ returnbi:
 					wbi2 = r5_next_bio(wbi, dev->sector);
 					md_write_end(conf->mddev);
 					if (!raid5_dec_bi_active_stripes(wbi))
-						bio_list_add(return_bi, wbi);
+						bio_endio(wbi);
 					wbi = wbi2;
 				}
 				bitmap_endwrite(conf->mddev->bitmap, sh->sector,
@@ -4725,7 +4710,7 @@ static void handle_stripe(struct stripe_head *sh)
 		sh->reconstruct_state = 0;
 		break_stripe_batch_list(sh, 0);
 		if (s.to_read+s.to_write+s.written)
-			handle_failed_stripe(conf, sh, &s, disks, &s.return_bi);
+			handle_failed_stripe(conf, sh, &s, disks);
 		if (s.syncing + s.replacing)
 			handle_failed_sync(conf, sh, &s);
 	}
@@ -4791,10 +4776,10 @@ static void handle_stripe(struct stripe_head *sh)
 			     && !test_bit(R5_LOCKED, &qdev->flags)
 			     && (test_bit(R5_UPTODATE, &qdev->flags) ||
 				 test_bit(R5_Discard, &qdev->flags))))))
-		handle_stripe_clean_event(conf, sh, disks, &s.return_bi);
+		handle_stripe_clean_event(conf, sh, disks);
 
 	if (s.just_cached)
-		r5c_handle_cached_data_endio(conf, sh, disks, &s.return_bi);
+		r5c_handle_cached_data_endio(conf, sh, disks);
 	log_stripe_write_finished(sh);
 
 	/* Now we might consider reading some blocks, either to check/generate
@@ -5022,9 +5007,6 @@ finish:
 			md_wakeup_thread(conf->mddev->thread);
 	}
 
-	if (!bio_list_empty(&s.return_bi))
-		return_io(&s.return_bi);
-
 	clear_bit_unlock(STRIPE_ACTIVE, &sh->state);
 }
 
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 13800dc9dd88..fd5c21cde77f 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -278,7 +278,6 @@ struct stripe_head_state {
 	int dec_preread_active;
 	unsigned long ops_request;
 
-	struct bio_list return_bi;
 	struct md_rdev *blocked_rdev;
 	int handle_bad_blocks;
 	int log_failed;
-- 
cgit v1.2.3


From 016c76ac76e4c678b01a75a602dc6be0282f5b29 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.com>
Date: Wed, 15 Mar 2017 14:05:13 +1100
Subject: md/raid5: use bio_inc_remaining() instead of repurposing
 bi_phys_segments as a counter

md/raid5 needs to keep track of how many stripe_heads are processing a
bio so that it can delay calling bio_endio() until all stripe_heads
have completed.  It currently uses 16 bits of ->bi_phys_segments for
this purpose.

With 4K stripes, 16 bits is only enough for requests of up to 256MB,
and it is possible for a single bio to be larger than this, which
causes problems.  Also, the bio struct contains a larger counter,
__bi_remaining, which has a purpose very similar to the purpose of our
counter.  So stop using ->bi_phys_segments, and instead use
__bi_remaining.

This means we don't need to initialize the counter, as our caller
initializes it to '1'.  It also means we can call bio_endio() directly
as it tests this counter internally.

Signed-off-by: NeilBrown <neilb@suse.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/raid5-cache.c |  3 +--
 drivers/md/raid5.c       | 57 +++++++++++-------------------------------------
 drivers/md/raid5.h       | 17 +--------------
 3 files changed, 15 insertions(+), 62 deletions(-)

(limited to 'drivers/md/raid5.c')

diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 5be8dbc5d91b..25eb048298fe 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -318,8 +318,7 @@ r5c_return_dev_pending_writes(struct r5conf *conf, struct r5dev *dev)
 	       dev->sector + STRIPE_SECTORS) {
 		wbi2 = r5_next_bio(wbi, dev->sector);
 		md_write_end(conf->mddev);
-		if (!raid5_dec_bi_active_stripes(wbi))
-			bio_endio(wbi);
+		bio_endio(wbi);
 		wbi = wbi2;
 	}
 }
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 44c8ceba13fe..0ec9e0212158 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1322,8 +1322,7 @@ static void ops_complete_biofill(void *stripe_head_ref)
 			while (rbi && rbi->bi_iter.bi_sector <
 				dev->sector + STRIPE_SECTORS) {
 				rbi2 = r5_next_bio(rbi, dev->sector);
-				if (!raid5_dec_bi_active_stripes(rbi))
-					bio_endio(rbi);
+				bio_endio(rbi);
 				rbi = rbi2;
 			}
 		}
@@ -3196,14 +3195,6 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx,
 		(unsigned long long)bi->bi_iter.bi_sector,
 		(unsigned long long)sh->sector);
 
-	/*
-	 * If several bio share a stripe. The bio bi_phys_segments acts as a
-	 * reference count to avoid race. The reference count should already be
-	 * increased before this function is called (for example, in
-	 * raid5_make_request()), so other bio sharing this stripe will not free the
-	 * stripe. If a stripe is owned by one stripe, the stripe lock will
-	 * protect it.
-	 */
 	spin_lock_irq(&sh->stripe_lock);
 	/* Don't allow new IO added to stripes in batch list */
 	if (sh->batch_head)
@@ -3259,7 +3250,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx,
 	if (*bip)
 		bi->bi_next = *bip;
 	*bip = bi;
-	raid5_inc_bi_active_stripes(bi);
+	bio_inc_remaining(bi);
 	md_write_inc(conf->mddev, bi);
 
 	if (forwrite) {
@@ -3384,8 +3375,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
 
 			bi->bi_error = -EIO;
 			md_write_end(conf->mddev);
-			if (!raid5_dec_bi_active_stripes(bi))
-				bio_endio(bi);
+			bio_endio(bi);
 			bi = nextbi;
 		}
 		if (bitmap_end)
@@ -3407,8 +3397,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
 
 			bi->bi_error = -EIO;
 			md_write_end(conf->mddev);
-			if (!raid5_dec_bi_active_stripes(bi))
-				bio_endio(bi);
+			bio_endio(bi);
 			bi = bi2;
 		}
 
@@ -3433,8 +3422,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
 					r5_next_bio(bi, sh->dev[i].sector);
 
 				bi->bi_error = -EIO;
-				if (!raid5_dec_bi_active_stripes(bi))
-					bio_endio(bi);
+				bio_endio(bi);
 				bi = nextbi;
 			}
 		}
@@ -3766,8 +3754,7 @@ returnbi:
 					dev->sector + STRIPE_SECTORS) {
 					wbi2 = r5_next_bio(wbi, dev->sector);
 					md_write_end(conf->mddev);
-					if (!raid5_dec_bi_active_stripes(wbi))
-						bio_endio(wbi);
+					bio_endio(wbi);
 					wbi = wbi2;
 				}
 				bitmap_endwrite(conf->mddev->bitmap, sh->sector,
@@ -5112,7 +5099,7 @@ static struct bio *remove_bio_from_retry(struct r5conf *conf)
 		 * this sets the active strip count to 1 and the processed
 		 * strip count to zero (upper 8 bits)
 		 */
-		raid5_set_bi_stripes(bi, 1); /* biased count of active stripes */
+		raid5_set_bi_processed_stripes(bi, 0);
 	}
 
 	return bi;
@@ -5449,7 +5436,6 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
 	struct r5conf *conf = mddev->private;
 	sector_t logical_sector, last_sector;
 	struct stripe_head *sh;
-	int remaining;
 	int stripe_sectors;
 
 	if (mddev->reshape_position != MaxSector)
@@ -5460,7 +5446,6 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
 	last_sector = bi->bi_iter.bi_sector + (bi->bi_iter.bi_size>>9);
 
 	bi->bi_next = NULL;
-	bi->bi_phys_segments = 1; /* over-loaded to count active stripes */
 	md_write_start(mddev, bi);
 
 	stripe_sectors = conf->chunk_sectors *
@@ -5507,7 +5492,7 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
 				continue;
 			sh->dev[d].towrite = bi;
 			set_bit(R5_OVERWRITE, &sh->dev[d].flags);
-			raid5_inc_bi_active_stripes(bi);
+			bio_inc_remaining(bi);
 			md_write_inc(mddev, bi);
 			sh->overwrite_disks++;
 		}
@@ -5532,10 +5517,7 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
 	}
 
 	md_write_end(mddev);
-	remaining = raid5_dec_bi_active_stripes(bi);
-	if (remaining == 0) {
-		bio_endio(bi);
-	}
+	bio_endio(bi);
 }
 
 static void raid5_make_request(struct mddev *mddev, struct bio * bi)
@@ -5546,7 +5528,6 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
 	sector_t logical_sector, last_sector;
 	struct stripe_head *sh;
 	const int rw = bio_data_dir(bi);
-	int remaining;
 	DEFINE_WAIT(w);
 	bool do_prepare;
 	bool do_flush = false;
@@ -5588,7 +5569,6 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
 	logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1);
 	last_sector = bio_end_sector(bi);
 	bi->bi_next = NULL;
-	bi->bi_phys_segments = 1;	/* over-loaded to count active stripes */
 	md_write_start(mddev, bi);
 
 	prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
@@ -5726,14 +5706,7 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
 
 	if (rw == WRITE)
 		md_write_end(mddev);
-	remaining = raid5_dec_bi_active_stripes(bi);
-	if (remaining == 0) {
-
-
-		trace_block_bio_complete(bdev_get_queue(bi->bi_bdev),
-					 bi, 0);
-		bio_endio(bi);
-	}
+	bio_endio(bi);
 }
 
 static sector_t raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks);
@@ -6098,7 +6071,6 @@ static int  retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
 	int dd_idx;
 	sector_t sector, logical_sector, last_sector;
 	int scnt = 0;
-	int remaining;
 	int handled = 0;
 
 	logical_sector = raid_bio->bi_iter.bi_sector &
@@ -6137,12 +6109,9 @@ static int  retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
 		raid5_release_stripe(sh);
 		handled++;
 	}
-	remaining = raid5_dec_bi_active_stripes(raid_bio);
-	if (remaining == 0) {
-		trace_block_bio_complete(bdev_get_queue(raid_bio->bi_bdev),
-					 raid_bio, 0);
-		bio_endio(raid_bio);
-	}
+
+	bio_endio(raid_bio);
+
 	if (atomic_dec_and_test(&conf->active_aligned_reads))
 		wake_up(&conf->wait_for_quiescent);
 	return handled;
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index fd5c21cde77f..7d74fb3f2ec6 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -488,8 +488,7 @@ static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector)
 }
 
 /*
- * We maintain a biased count of active stripes in the bottom 16 bits of
- * bi_phys_segments, and a count of processed stripes in the upper 16 bits
+ * We maintain a count of processed stripes in the upper 16 bits
  */
 static inline int raid5_bi_processed_stripes(struct bio *bio)
 {
@@ -498,20 +497,6 @@ static inline int raid5_bi_processed_stripes(struct bio *bio)
 	return (atomic_read(segments) >> 16) & 0xffff;
 }
 
-static inline int raid5_dec_bi_active_stripes(struct bio *bio)
-{
-	atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
-
-	return atomic_sub_return(1, segments) & 0xffff;
-}
-
-static inline void raid5_inc_bi_active_stripes(struct bio *bio)
-{
-	atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
-
-	atomic_inc(segments);
-}
-
 static inline void raid5_set_bi_processed_stripes(struct bio *bio,
 	unsigned int cnt)
 {
-- 
cgit v1.2.3


From 0472a42ba1f89ec85f070c731f4440d7cc38c44c Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.com>
Date: Wed, 15 Mar 2017 14:05:13 +1100
Subject: md/raid5: remove over-loading of ->bi_phys_segments.

When a read request, which bypassed the cache, fails, we need to retry
it through the cache.
This involves attaching it to a sequence of stripe_heads, and it may not
be possible to get all the stripe_heads we need at once.
We do what we can, and record how far we got in ->bi_phys_segments so
we can pick up again later.

There is only ever one bio which may have a non-zero offset stored in
->bi_phys_segments, the one that is either active in the single thread
which calls retry_aligned_read(), or is in conf->retry_read_aligned
waiting for retry_aligned_read() to be called again.

So we only need to store one offset value.  This can be in a local
variable passed between remove_bio_from_retry() and
retry_aligned_read(), or in the r5conf structure next to the
->retry_read_aligned pointer.

Storing it there allows the last usage of ->bi_phys_segments to be
removed from md/raid5.c.

Signed-off-by: NeilBrown <neilb@suse.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/raid5.c | 24 ++++++++++++------------
 drivers/md/raid5.h | 30 +-----------------------------
 2 files changed, 13 insertions(+), 41 deletions(-)

(limited to 'drivers/md/raid5.c')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 0ec9e0212158..1c8be667e9a9 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -5082,12 +5082,14 @@ static void add_bio_to_retry(struct bio *bi,struct r5conf *conf)
 	md_wakeup_thread(conf->mddev->thread);
 }
 
-static struct bio *remove_bio_from_retry(struct r5conf *conf)
+static struct bio *remove_bio_from_retry(struct r5conf *conf,
+					 unsigned int *offset)
 {
 	struct bio *bi;
 
 	bi = conf->retry_read_aligned;
 	if (bi) {
+		*offset = conf->retry_read_offset;
 		conf->retry_read_aligned = NULL;
 		return bi;
 	}
@@ -5095,11 +5097,7 @@ static struct bio *remove_bio_from_retry(struct r5conf *conf)
 	if(bi) {
 		conf->retry_read_aligned_list = bi->bi_next;
 		bi->bi_next = NULL;
-		/*
-		 * this sets the active strip count to 1 and the processed
-		 * strip count to zero (upper 8 bits)
-		 */
-		raid5_set_bi_processed_stripes(bi, 0);
+		*offset = 0;
 	}
 
 	return bi;
@@ -6055,7 +6053,8 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n
 	return STRIPE_SECTORS;
 }
 
-static int  retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
+static int  retry_aligned_read(struct r5conf *conf, struct bio *raid_bio,
+			       unsigned int offset)
 {
 	/* We may not be able to submit a whole bio at once as there
 	 * may not be enough stripe_heads available.
@@ -6084,7 +6083,7 @@ static int  retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
 		     sector += STRIPE_SECTORS,
 		     scnt++) {
 
-		if (scnt < raid5_bi_processed_stripes(raid_bio))
+		if (scnt < offset)
 			/* already done this stripe */
 			continue;
 
@@ -6092,15 +6091,15 @@ static int  retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
 
 		if (!sh) {
 			/* failed to get a stripe - must wait */
-			raid5_set_bi_processed_stripes(raid_bio, scnt);
 			conf->retry_read_aligned = raid_bio;
+			conf->retry_read_offset = scnt;
 			return handled;
 		}
 
 		if (!add_stripe_bio(sh, raid_bio, dd_idx, 0, 0)) {
 			raid5_release_stripe(sh);
-			raid5_set_bi_processed_stripes(raid_bio, scnt);
 			conf->retry_read_aligned = raid_bio;
+			conf->retry_read_offset = scnt;
 			return handled;
 		}
 
@@ -6228,6 +6227,7 @@ static void raid5d(struct md_thread *thread)
 	while (1) {
 		struct bio *bio;
 		int batch_size, released;
+		unsigned int offset;
 
 		released = release_stripe_list(conf, conf->temp_inactive_list);
 		if (released)
@@ -6245,10 +6245,10 @@ static void raid5d(struct md_thread *thread)
 		}
 		raid5_activate_delayed(conf);
 
-		while ((bio = remove_bio_from_retry(conf))) {
+		while ((bio = remove_bio_from_retry(conf, &offset))) {
 			int ok;
 			spin_unlock_irq(&conf->device_lock);
-			ok = retry_aligned_read(conf, bio);
+			ok = retry_aligned_read(conf, bio, offset);
 			spin_lock_irq(&conf->device_lock);
 			if (!ok)
 				break;
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 7d74fb3f2ec6..cdc7f92e1806 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -487,35 +487,6 @@ static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector)
 		return NULL;
 }
 
-/*
- * We maintain a count of processed stripes in the upper 16 bits
- */
-static inline int raid5_bi_processed_stripes(struct bio *bio)
-{
-	atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
-
-	return (atomic_read(segments) >> 16) & 0xffff;
-}
-
-static inline void raid5_set_bi_processed_stripes(struct bio *bio,
-	unsigned int cnt)
-{
-	atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
-	int old, new;
-
-	do {
-		old = atomic_read(segments);
-		new = (old & 0xffff) | (cnt << 16);
-	} while (atomic_cmpxchg(segments, old, new) != old);
-}
-
-static inline void raid5_set_bi_stripes(struct bio *bio, unsigned int cnt)
-{
-	atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
-
-	atomic_set(segments, cnt);
-}
-
 /* NOTE NR_STRIPE_HASH_LOCKS must remain below 64.
  * This is because we sometimes take all the spinlocks
  * and creating that much locking depth can cause
@@ -613,6 +584,7 @@ struct r5conf {
 	struct list_head	delayed_list; /* stripes that have plugged requests */
 	struct list_head	bitmap_list; /* stripes delaying awaiting bitmap update */
 	struct bio		*retry_read_aligned; /* currently retrying aligned bios   */
+	unsigned int		retry_read_offset; /* sector offset into retry_read_aligned */
 	struct bio		*retry_read_aligned_list; /* aligned bios retry list  */
 	atomic_t		preread_active_stripes; /* stripes with scheduled io */
 	atomic_t		active_aligned_reads;
-- 
cgit v1.2.3


From 97d53438081edd25ccb1de34051efe084d240828 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.com>
Date: Wed, 15 Mar 2017 14:05:13 +1100
Subject: Revert "md/raid5: limit request size according to implementation
 limits"

This reverts commit e8d7c33232e5fdfa761c3416539bc5b4acd12db5.

Now that raid5 doesn't abuse bi_phys_segments any more, we no longer
need to impose these limits.

Signed-off-by: NeilBrown <neilb@suse.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/raid5.c | 9 ---------
 1 file changed, 9 deletions(-)

(limited to 'drivers/md/raid5.c')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 1c8be667e9a9..00a34faabcdf 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -7361,15 +7361,6 @@ static int raid5_run(struct mddev *mddev)
 			stripe = (stripe | (stripe-1)) + 1;
 		mddev->queue->limits.discard_alignment = stripe;
 		mddev->queue->limits.discard_granularity = stripe;
-
-		/*
-		 * We use 16-bit counter of active stripes in bi_phys_segments
-		 * (minus one for over-loaded initialization)
-		 */
-		blk_queue_max_hw_sectors(mddev->queue, 0xfffe * STRIPE_SECTORS);
-		blk_queue_max_discard_sectors(mddev->queue,
-					      0xfffe * STRIPE_SECTORS);
-
 		/*
 		 * unaligned part of discard request will be ignored, so can't
 		 * guarantee discard_zeroes_data
-- 
cgit v1.2.3


From 84dd97a69092cef858483b775f1900d743d796a4 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.com>
Date: Wed, 15 Mar 2017 14:05:14 +1100
Subject: md/raid5: don't test ->writes_pending in raid5_remove_disk

This test on ->writes_pending cannot be safe as the counter
can be incremented at any moment and cannot be locked against.

Change it to test conf->active_stripes, which at least
can be locked against.  More changes are still needed.

A future patch will change ->writes_pending, and testing it here will
be very inconvenient.

Signed-off-by: NeilBrown <neilb@suse.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/raid5.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'drivers/md/raid5.c')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 00a34faabcdf..0b1a4339a437 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -7532,9 +7532,12 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
 		/*
 		 * we can't wait pending write here, as this is called in
 		 * raid5d, wait will deadlock.
+		 * neilb: there is no locking about new writes here,
+		 * so this cannot be safe.
 		 */
-		if (atomic_read(&mddev->writes_pending))
+		if (atomic_read(&conf->active_stripes)) {
 			return -EBUSY;
+		}
 		log_exit(conf);
 		return 0;
 	}
-- 
cgit v1.2.3


From 3560741e316b3ea52cfb27901ae284921445180f Mon Sep 17 00:00:00 2001
From: Zhilong Liu <zlliu@suse.com>
Date: Wed, 15 Mar 2017 16:14:53 +0800
Subject: md: fix several trivial typos in comments

Signed-off-by: Zhilong Liu <zlliu@suse.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/bitmap.c | 2 +-
 drivers/md/raid5.c  | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'drivers/md/raid5.c')

diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index cc7bbd21813e..bf7419a56454 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -697,7 +697,7 @@ re_read:
 
 out:
 	kunmap_atomic(sb);
-	/* Assiging chunksize is required for "re_read" */
+	/* Assigning chunksize is required for "re_read" */
 	bitmap->mddev->bitmap_info.chunksize = chunksize;
 	if (err == 0 && nodes && (bitmap->cluster_slot < 0)) {
 		err = md_setup_cluster(bitmap->mddev, nodes);
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 0b1a4339a437..266d661dc69b 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2296,7 +2296,7 @@ static int resize_stripes(struct r5conf *conf, int newsize)
 	 *    pages have been transferred over, and the old kmem_cache is
 	 *    freed when all stripes are done.
 	 * 3/ reallocate conf->disks to be suitable bigger.  If this fails,
-	 *    we simple return a failre status - no need to clean anything up.
+	 *    we simple return a failure status - no need to clean anything up.
 	 * 4/ allocate new pages for the new slots in the new stripe_heads.
 	 *    If this fails, we don't bother trying the shrink the
 	 *    stripe_heads down again, we just leave them as they are.
@@ -3558,7 +3558,7 @@ static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s,
 	    !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
 		/* Pre-reads at not permitted until after short delay
 		 * to gather multiple requests.  However if this
-		 * device is no Insync, the block could only be be computed
+		 * device is no Insync, the block could only be computed
 		 * and there is no need to delay that.
 		 */
 		return 0;
@@ -3577,7 +3577,7 @@ static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s,
 
 	/* If we are forced to do a reconstruct-write, either because
 	 * the current RAID6 implementation only supports that, or
-	 * or because parity cannot be trusted and we are currently
+	 * because parity cannot be trusted and we are currently
 	 * recovering it, there is extra need to be careful.
 	 * If one of the devices that we would need to read, because
 	 * it is not being overwritten (and maybe not written at all)
-- 
cgit v1.2.3


From 0bb0c10500ba634216238c40e1eeddce92b4d488 Mon Sep 17 00:00:00 2001
From: Song Liu <songliubraving@fb.com>
Date: Mon, 27 Mar 2017 10:51:33 -0700
Subject: md/raid5: use consistency_policy to remove journal feature

When journal device of an array fails, the array is forced into read-only
mode. To make the array normal without adding another journal device, we
need to remove journal _feature_ from the array.

This patch allows removing the journal _feature_ from an array. To do so,
the existing journal device must be either missing or faulty.

To remove journal feature, it is necessary to remove the journal device
first:

  mdadm --fail /dev/md0 /dev/sdb
  mdadm: set /dev/sdb faulty in /dev/md0
  mdadm --remove /dev/md0 /dev/sdb
  mdadm: hot removed /dev/sdb from /dev/md0

Then the journal feature can be removed by echoing into the sysfs file:

 cat /sys/block/md0/md/consistency_policy
 journal

 echo resync > /sys/block/md0/md/consistency_policy
 cat /sys/block/md0/md/consistency_policy
 resync

Signed-off-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/raid5.c | 46 ++++++++++++++++++++++++++++++++++++----------
 1 file changed, 36 insertions(+), 10 deletions(-)

(limited to 'drivers/md/raid5.c')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 266d661dc69b..6036d5e41ddd 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -8292,17 +8292,41 @@ static int raid5_change_consistency_policy(struct mddev *mddev, const char *buf)
 	}
 
 	if (strncmp(buf, "ppl", 3) == 0 && !raid5_has_ppl(conf)) {
-		mddev_suspend(mddev);
-		set_bit(MD_HAS_PPL, &mddev->flags);
-		err = log_init(conf, NULL);
-		if (!err)
+		/* ppl only works with RAID 5 */
+		if (conf->level == 5) {
+			mddev_suspend(mddev);
+			set_bit(MD_HAS_PPL, &mddev->flags);
+			err = log_init(conf, NULL);
+			if (!err)
+				raid5_reset_stripe_cache(mddev);
+			mddev_resume(mddev);
+		} else
+			err = -EINVAL;
+	} else if (strncmp(buf, "resync", 6) == 0) {
+		if (raid5_has_ppl(conf)) {
+			mddev_suspend(mddev);
+			log_exit(conf);
 			raid5_reset_stripe_cache(mddev);
-		mddev_resume(mddev);
-	} else if (strncmp(buf, "resync", 6) == 0 && raid5_has_ppl(conf)) {
-		mddev_suspend(mddev);
-		log_exit(conf);
-		raid5_reset_stripe_cache(mddev);
-		mddev_resume(mddev);
+			mddev_resume(mddev);
+		} else if (test_bit(MD_HAS_JOURNAL, &conf->mddev->flags) &&
+			   r5l_log_disk_error(conf)) {
+			bool journal_dev_exists = false;
+			struct md_rdev *rdev;
+
+			rdev_for_each(rdev, mddev)
+				if (test_bit(Journal, &rdev->flags)) {
+					journal_dev_exists = true;
+					break;
+				}
+
+			if (!journal_dev_exists) {
+				mddev_suspend(mddev);
+				clear_bit(MD_HAS_JOURNAL, &mddev->flags);
+				mddev_resume(mddev);
+			} else  /* need remove journal device first */
+				err = -EBUSY;
+		} else
+			err = -EINVAL;
 	} else {
 		err = -EINVAL;
 	}
@@ -8337,6 +8361,7 @@ static struct md_personality raid6_personality =
 	.quiesce	= raid5_quiesce,
 	.takeover	= raid6_takeover,
 	.congested	= raid5_congested,
+	.change_consistency_policy = raid5_change_consistency_policy,
 };
 static struct md_personality raid5_personality =
 {
@@ -8385,6 +8410,7 @@ static struct md_personality raid4_personality =
 	.quiesce	= raid5_quiesce,
 	.takeover	= raid4_takeover,
 	.congested	= raid5_congested,
+	.change_consistency_policy = raid5_change_consistency_policy,
 };
 
 static int __init raid5_init(void)
-- 
cgit v1.2.3


From 583da48e388f472e8818d9bb60ef6a1d40ee9f9d Mon Sep 17 00:00:00 2001
From: Dennis Yang <dennisyang@qnap.com>
Date: Wed, 29 Mar 2017 15:46:13 +0800
Subject: md: update slab_cache before releasing new stripes when stripes
 resizing

When growing a raid5 device on a machine with little memory, there is a chance
that mdadm will be killed and the following bug report can be observed. The
same bug can also be reproduced in linux-4.10.6.

[57600.075774] BUG: unable to handle kernel NULL pointer dereference at           (null)
[57600.083796] IP: [<ffffffff81a6aa87>] _raw_spin_lock+0x7/0x20
[57600.110378] PGD 421cf067 PUD 4442d067 PMD 0
[57600.114678] Oops: 0002 [#1] SMP
[57600.180799] CPU: 1 PID: 25990 Comm: mdadm Tainted: P           O    4.2.8 #1
[57600.187849] Hardware name: To be filled by O.E.M. To be filled by O.E.M./MAHOBAY, BIOS QV05AR66 03/06/2013
[57600.197490] task: ffff880044e47240 ti: ffff880043070000 task.ti: ffff880043070000
[57600.204963] RIP: 0010:[<ffffffff81a6aa87>]  [<ffffffff81a6aa87>] _raw_spin_lock+0x7/0x20
[57600.213057] RSP: 0018:ffff880043073810  EFLAGS: 00010046
[57600.218359] RAX: 0000000000000000 RBX: 000000000000000c RCX: ffff88011e296dd0
[57600.225486] RDX: 0000000000000001 RSI: ffffe8ffffcb46c0 RDI: 0000000000000000
[57600.232613] RBP: ffff880043073878 R08: ffff88011e5f8170 R09: 0000000000000282
[57600.239739] R10: 0000000000000005 R11: 28f5c28f5c28f5c3 R12: ffff880043073838
[57600.246872] R13: ffffe8ffffcb46c0 R14: 0000000000000000 R15: ffff8800b9706a00
[57600.253999] FS:  00007f576106c700(0000) GS:ffff88011e280000(0000) knlGS:0000000000000000
[57600.262078] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[57600.267817] CR2: 0000000000000000 CR3: 00000000428fe000 CR4: 00000000001406e0
[57600.274942] Stack:
[57600.276949]  ffffffff8114ee35 ffff880043073868 0000000000000282 000000000000eb3f
[57600.284383]  ffffffff81119043 ffff880043073838 ffff880043073838 ffff88003e197b98
[57600.291820]  ffffe8ffffcb46c0 ffff88003e197360 0000000000000286 ffff880043073968
[57600.299254] Call Trace:
[57600.301698]  [<ffffffff8114ee35>] ? cache_flusharray+0x35/0xe0
[57600.307523]  [<ffffffff81119043>] ? __page_cache_release+0x23/0x110
[57600.313779]  [<ffffffff8114eb53>] kmem_cache_free+0x63/0xc0
[57600.319344]  [<ffffffff81579942>] drop_one_stripe+0x62/0x90
[57600.324915]  [<ffffffff81579b5b>] raid5_cache_scan+0x8b/0xb0
[57600.330563]  [<ffffffff8111b98a>] shrink_slab.part.36+0x19a/0x250
[57600.336650]  [<ffffffff8111e38c>] shrink_zone+0x23c/0x250
[57600.342039]  [<ffffffff8111e4f3>] do_try_to_free_pages+0x153/0x420
[57600.348210]  [<ffffffff8111e851>] try_to_free_pages+0x91/0xa0
[57600.353959]  [<ffffffff811145b1>] __alloc_pages_nodemask+0x4d1/0x8b0
[57600.360303]  [<ffffffff8157a30b>] check_reshape+0x62b/0x770
[57600.365866]  [<ffffffff8157a4a5>] raid5_check_reshape+0x55/0xa0
[57600.371778]  [<ffffffff81583df7>] update_raid_disks+0xc7/0x110
[57600.377604]  [<ffffffff81592b73>] md_ioctl+0xd83/0x1b10
[57600.382827]  [<ffffffff81385380>] blkdev_ioctl+0x170/0x690
[57600.388307]  [<ffffffff81195238>] block_ioctl+0x38/0x40
[57600.393525]  [<ffffffff811731c5>] do_vfs_ioctl+0x2b5/0x480
[57600.399010]  [<ffffffff8115e07b>] ? vfs_write+0x14b/0x1f0
[57600.404400]  [<ffffffff811733cc>] SyS_ioctl+0x3c/0x70
[57600.409447]  [<ffffffff81a6ad97>] entry_SYSCALL_64_fastpath+0x12/0x6a
[57600.415875] Code: 00 00 00 00 55 48 89 e5 8b 07 85 c0 74 04 31 c0 5d c3 ba 01 00 00 00 f0 0f b1 17 85 c0 75 ef b0 01 5d c3 90 31 c0 ba 01 00 00 00 <f0> 0f b1 17 85 c0 75 01 c3 55 89 c6 48 89 e5 e8 85 d1 63 ff 5d
[57600.435460] RIP  [<ffffffff81a6aa87>] _raw_spin_lock+0x7/0x20
[57600.441208]  RSP <ffff880043073810>
[57600.444690] CR2: 0000000000000000
[57600.448000] ---[ end trace cbc6b5cc4bf9831d ]---

The problem is that resize_stripes() releases new stripe_heads before assigning the new
slab cache to conf->slab_cache. If the shrinker function raid5_cache_scan() gets called
after resize_stripes() starts releasing new stripes but right before the new slab cache
is assigned, it is possible that these new stripe_heads will be freed with the old
slab_cache, which has already been destroyed, and that triggers this bug.

Signed-off-by: Dennis Yang <dennisyang@qnap.com>
Fixes: edbe83ab4c27 ("md/raid5: allow the stripe_cache to grow and shrink.")
Cc: stable@vger.kernel.org (4.1+)
Reviewed-by: NeilBrown <neilb@suse.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/raid5.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'drivers/md/raid5.c')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 6036d5e41ddd..a5676559e7a6 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2409,6 +2409,10 @@ static int resize_stripes(struct r5conf *conf, int newsize)
 		err = -ENOMEM;
 
 	mutex_unlock(&conf->cache_size_mutex);
+
+	conf->slab_cache = sc;
+	conf->active_name = 1-conf->active_name;
+
 	/* Step 4, return new stripes to service */
 	while(!list_empty(&newstripes)) {
 		nsh = list_entry(newstripes.next, struct stripe_head, lru);
@@ -2426,8 +2430,6 @@ static int resize_stripes(struct r5conf *conf, int newsize)
 	}
 	/* critical section pass, GFP_NOIO no longer needed */
 
-	conf->slab_cache = sc;
-	conf->active_name = 1-conf->active_name;
 	if (!err)
 		conf->pool_size = newsize;
 	return err;
-- 
cgit v1.2.3


From 7471fb77ce4dc4cb81291189947fcdf621a97987 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.com>
Date: Mon, 3 Apr 2017 12:11:32 +1000
Subject: md/raid6: Fix anomily when recovering a single device in RAID6.

When recovering a single missing/failed device in a RAID6,
those stripes where the Q block is on the missing device are
handled a bit differently.  In these cases it is easy to
check that the P block is correct, so we do.  This results
in the P block being destroyed.  Consequently the P block needs
to be read a second time in order to compute Q.  This causes
lots of seeks and hurts performance.

It shouldn't be necessary to re-read P as it can be computed
from the DATA.  But we only compute blocks on missing
devices, since c337869d9501 ("md: do not compute parity
unless it is on a failed drive").

So relax the change made in that commit to allow computing
of the P block in a RAID6 when it is the only missing
block.

This makes RAID6 recovery run much faster as the disk just
"before" the recovering device is no longer seeking
back-and-forth.

Reported-by-tested-by: Brad Campbell <lists2009@fnarfbargle.com>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: NeilBrown <neilb@suse.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/raid5.c | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

(limited to 'drivers/md/raid5.c')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index a5676559e7a6..09d94ad5e52b 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -3619,9 +3619,20 @@ static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s,
 		BUG_ON(test_bit(R5_Wantcompute, &dev->flags));
 		BUG_ON(test_bit(R5_Wantread, &dev->flags));
 		BUG_ON(sh->batch_head);
+
+		/*
+		 * In the raid6 case if the only non-uptodate disk is P
+		 * then we already trusted P to compute the other failed
+		 * drives. It is safe to compute rather than re-read P.
+		 * In other cases we only compute blocks from failed
+		 * devices, otherwise check/repair might fail to detect
+		 * a real inconsistency.
+		 */
+
 		if ((s->uptodate == disks - 1) &&
+		    ((sh->qd_idx >= 0 && sh->pd_idx == disk_idx) ||
 		    (s->failed && (disk_idx == s->failed_num[0] ||
-				   disk_idx == s->failed_num[1]))) {
+				   disk_idx == s->failed_num[1])))) {
 			/* have disk failed, and we're requested to fetch it;
 			 * do compute it
 			 */
-- 
cgit v1.2.3


From 845b9e229fe0716ab6b4d94b4364c99069667b59 Mon Sep 17 00:00:00 2001
From: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
Date: Tue, 4 Apr 2017 13:13:57 +0200
Subject: raid5-ppl: use resize_stripes() when enabling or disabling ppl

Use resize_stripes() instead of raid5_reset_stripe_cache() to allocate
or free sh->ppl_page at runtime for all stripes in the stripe cache.
raid5_reset_stripe_cache() required suspending the mddev and could
deadlock because of GFP_KERNEL allocations.

Move the 'newsize' check to check_reshape() to allow reallocating the
stripes with the same number of disks. Allocate sh->ppl_page in
alloc_stripe() instead of grow_buffers(). Pass 'struct r5conf *conf' as
a parameter to alloc_stripe() because it is needed to check whether to
allocate ppl_page. Add free_stripe() and use it to free stripes rather
than directly call kmem_cache_free(). Also free sh->ppl_page in
free_stripe().

Set MD_HAS_PPL at the end of ppl_init_log() instead of explicitly
setting it in advance and add another parameter to log_init() to allow
calling ppl_init_log() without the bit set. Don't try to calculate
partial parity or add a stripe to log if it does not have ppl_page set.

Enabling ppl can now be performed without suspending the mddev, because
the log won't be used until new stripes are allocated with ppl_page.
Calling mddev_suspend/resume is still necessary when disabling ppl,
because we want all stripes to finish before stopping the log, but
resize_stripes() can be called after mddev_resume() when ppl is no
longer active.

Suggested-by: NeilBrown <neilb@suse.com>
Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/raid5-log.h |  5 +--
 drivers/md/raid5-ppl.c |  3 +-
 drivers/md/raid5.c     | 88 ++++++++++++++++++++++----------------------------
 3 files changed, 43 insertions(+), 53 deletions(-)

(limited to 'drivers/md/raid5.c')

diff --git a/drivers/md/raid5-log.h b/drivers/md/raid5-log.h
index 738930ff5d17..27097101ccca 100644
--- a/drivers/md/raid5-log.h
+++ b/drivers/md/raid5-log.h
@@ -93,11 +93,12 @@ static inline void log_exit(struct r5conf *conf)
 		ppl_exit_log(conf);
 }
 
-static inline int log_init(struct r5conf *conf, struct md_rdev *journal_dev)
+static inline int log_init(struct r5conf *conf, struct md_rdev *journal_dev,
+			   bool ppl)
 {
 	if (journal_dev)
 		return r5l_init_log(conf, journal_dev);
-	else if (raid5_has_ppl(conf))
+	else if (ppl)
 		return ppl_init_log(conf);
 
 	return 0;
diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c
index 355cf3581ef8..71968cf47d7d 100644
--- a/drivers/md/raid5-ppl.c
+++ b/drivers/md/raid5-ppl.c
@@ -328,7 +328,7 @@ int ppl_write_stripe(struct r5conf *conf, struct stripe_head *sh)
 	struct ppl_io_unit *io = sh->ppl_io;
 	struct ppl_log *log;
 
-	if (io || test_bit(STRIPE_SYNCING, &sh->state) ||
+	if (io || test_bit(STRIPE_SYNCING, &sh->state) || !sh->ppl_page ||
 	    !test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags) ||
 	    !test_bit(R5_Insync, &sh->dev[sh->pd_idx].flags)) {
 		clear_bit(STRIPE_LOG_TRAPPED, &sh->state);
@@ -1204,6 +1204,7 @@ int ppl_init_log(struct r5conf *conf)
 	}
 
 	conf->log_private = ppl_conf;
+	set_bit(MD_HAS_PPL, &ppl_conf->mddev->flags);
 
 	return 0;
 err:
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 09d94ad5e52b..e04d7b11bc87 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -471,11 +471,6 @@ static void shrink_buffers(struct stripe_head *sh)
 		sh->dev[i].page = NULL;
 		put_page(p);
 	}
-
-	if (sh->ppl_page) {
-		put_page(sh->ppl_page);
-		sh->ppl_page = NULL;
-	}
 }
 
 static int grow_buffers(struct stripe_head *sh, gfp_t gfp)
@@ -493,12 +488,6 @@ static int grow_buffers(struct stripe_head *sh, gfp_t gfp)
 		sh->dev[i].orig_page = page;
 	}
 
-	if (raid5_has_ppl(sh->raid_conf)) {
-		sh->ppl_page = alloc_page(gfp);
-		if (!sh->ppl_page)
-			return 1;
-	}
-
 	return 0;
 }
 
@@ -2132,8 +2121,15 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
 	put_cpu();
 }
 
+static void free_stripe(struct kmem_cache *sc, struct stripe_head *sh)
+{
+	if (sh->ppl_page)
+		__free_page(sh->ppl_page);
+	kmem_cache_free(sc, sh);
+}
+
 static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp,
-	int disks)
+	int disks, struct r5conf *conf)
 {
 	struct stripe_head *sh;
 	int i;
@@ -2147,6 +2143,7 @@ static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp,
 		INIT_LIST_HEAD(&sh->r5c);
 		INIT_LIST_HEAD(&sh->log_list);
 		atomic_set(&sh->count, 1);
+		sh->raid_conf = conf;
 		sh->log_start = MaxSector;
 		for (i = 0; i < disks; i++) {
 			struct r5dev *dev = &sh->dev[i];
@@ -2154,6 +2151,14 @@ static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp,
 			bio_init(&dev->req, &dev->vec, 1);
 			bio_init(&dev->rreq, &dev->rvec, 1);
 		}
+
+		if (raid5_has_ppl(conf)) {
+			sh->ppl_page = alloc_page(gfp);
+			if (!sh->ppl_page) {
+				free_stripe(sc, sh);
+				sh = NULL;
+			}
+		}
 	}
 	return sh;
 }
@@ -2161,15 +2166,13 @@ static int grow_one_stripe(struct r5conf *conf, gfp_t gfp)
 {
 	struct stripe_head *sh;
 
-	sh = alloc_stripe(conf->slab_cache, gfp, conf->pool_size);
+	sh = alloc_stripe(conf->slab_cache, gfp, conf->pool_size, conf);
 	if (!sh)
 		return 0;
 
-	sh->raid_conf = conf;
-
 	if (grow_buffers(sh, gfp)) {
 		shrink_buffers(sh);
-		kmem_cache_free(conf->slab_cache, sh);
+		free_stripe(conf->slab_cache, sh);
 		return 0;
 	}
 	sh->hash_lock_index =
@@ -2314,9 +2317,6 @@ static int resize_stripes(struct r5conf *conf, int newsize)
 	int i;
 	int hash, cnt;
 
-	if (newsize <= conf->pool_size)
-		return 0; /* never bother to shrink */
-
 	err = md_allow_write(conf->mddev);
 	if (err)
 		return err;
@@ -2332,11 +2332,10 @@ static int resize_stripes(struct r5conf *conf, int newsize)
 	mutex_lock(&conf->cache_size_mutex);
 
 	for (i = conf->max_nr_stripes; i; i--) {
-		nsh = alloc_stripe(sc, GFP_KERNEL, newsize);
+		nsh = alloc_stripe(sc, GFP_KERNEL, newsize, conf);
 		if (!nsh)
 			break;
 
-		nsh->raid_conf = conf;
 		list_add(&nsh->lru, &newstripes);
 	}
 	if (i) {
@@ -2344,7 +2343,7 @@ static int resize_stripes(struct r5conf *conf, int newsize)
 		while (!list_empty(&newstripes)) {
 			nsh = list_entry(newstripes.next, struct stripe_head, lru);
 			list_del(&nsh->lru);
-			kmem_cache_free(sc, nsh);
+			free_stripe(sc, nsh);
 		}
 		kmem_cache_destroy(sc);
 		mutex_unlock(&conf->cache_size_mutex);
@@ -2370,7 +2369,7 @@ static int resize_stripes(struct r5conf *conf, int newsize)
 			nsh->dev[i].orig_page = osh->dev[i].page;
 		}
 		nsh->hash_lock_index = hash;
-		kmem_cache_free(conf->slab_cache, osh);
+		free_stripe(conf->slab_cache, osh);
 		cnt++;
 		if (cnt >= conf->max_nr_stripes / NR_STRIPE_HASH_LOCKS +
 		    !!((conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS) > hash)) {
@@ -2447,7 +2446,7 @@ static int drop_one_stripe(struct r5conf *conf)
 		return 0;
 	BUG_ON(atomic_read(&sh->count));
 	shrink_buffers(sh);
-	kmem_cache_free(conf->slab_cache, sh);
+	free_stripe(conf->slab_cache, sh);
 	atomic_dec(&conf->active_stripes);
 	conf->max_nr_stripes--;
 	return 1;
@@ -3170,7 +3169,7 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
 		s->locked++;
 	}
 
-	if (raid5_has_ppl(sh->raid_conf) &&
+	if (raid5_has_ppl(sh->raid_conf) && sh->ppl_page &&
 	    test_bit(STRIPE_OP_BIODRAIN, &s->ops_request) &&
 	    !test_bit(STRIPE_FULL_WRITE, &sh->state) &&
 	    test_bit(R5_Insync, &sh->dev[pd_idx].flags))
@@ -7427,7 +7426,7 @@ static int raid5_run(struct mddev *mddev)
 		blk_queue_max_hw_sectors(mddev->queue, UINT_MAX);
 	}
 
-	if (log_init(conf, journal_dev))
+	if (log_init(conf, journal_dev, raid5_has_ppl(conf)))
 		goto abort;
 
 	return 0;
@@ -7636,7 +7635,7 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 		 * The array is in readonly mode if journal is missing, so no
 		 * write requests running. We should be safe
 		 */
-		log_init(conf, rdev);
+		log_init(conf, rdev, false);
 		return 0;
 	}
 	if (mddev->recovery_disabled == conf->recovery_disabled)
@@ -7786,6 +7785,9 @@ static int check_reshape(struct mddev *mddev)
 				      mddev->chunk_sectors)
 			    ) < 0)
 			return -ENOMEM;
+
+	if (conf->previous_raid_disks + mddev->delta_disks <= conf->pool_size)
+		return 0; /* never bother to shrink */
 	return resize_stripes(conf, (conf->previous_raid_disks
 				     + mddev->delta_disks));
 }
@@ -8276,20 +8278,6 @@ static void *raid6_takeover(struct mddev *mddev)
 	return setup_conf(mddev);
 }
 
-static void raid5_reset_stripe_cache(struct mddev *mddev)
-{
-	struct r5conf *conf = mddev->private;
-
-	mutex_lock(&conf->cache_size_mutex);
-	while (conf->max_nr_stripes &&
-	       drop_one_stripe(conf))
-		;
-	while (conf->min_nr_stripes > conf->max_nr_stripes &&
-	       grow_one_stripe(conf, GFP_KERNEL))
-		;
-	mutex_unlock(&conf->cache_size_mutex);
-}
-
 static int raid5_change_consistency_policy(struct mddev *mddev, const char *buf)
 {
 	struct r5conf *conf;
@@ -8304,23 +8292,23 @@ static int raid5_change_consistency_policy(struct mddev *mddev, const char *buf)
 		return -ENODEV;
 	}
 
-	if (strncmp(buf, "ppl", 3) == 0 && !raid5_has_ppl(conf)) {
+	if (strncmp(buf, "ppl", 3) == 0) {
 		/* ppl only works with RAID 5 */
-		if (conf->level == 5) {
-			mddev_suspend(mddev);
-			set_bit(MD_HAS_PPL, &mddev->flags);
-			err = log_init(conf, NULL);
-			if (!err)
-				raid5_reset_stripe_cache(mddev);
-			mddev_resume(mddev);
+		if (!raid5_has_ppl(conf) && conf->level == 5) {
+			err = log_init(conf, NULL, true);
+			if (!err) {
+				err = resize_stripes(conf, conf->pool_size);
+				if (err)
+					log_exit(conf);
+			}
 		} else
 			err = -EINVAL;
 	} else if (strncmp(buf, "resync", 6) == 0) {
 		if (raid5_has_ppl(conf)) {
 			mddev_suspend(mddev);
 			log_exit(conf);
-			raid5_reset_stripe_cache(mddev);
 			mddev_resume(mddev);
+			err = resize_stripes(conf, conf->pool_size);
 		} else if (test_bit(MD_HAS_JOURNAL, &conf->mddev->flags) &&
 			   r5l_log_disk_error(conf)) {
 			bool journal_dev_exists = false;
-- 
cgit v1.2.3


From ae1713e296449caf820635d384a99936ce281a71 Mon Sep 17 00:00:00 2001
From: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
Date: Tue, 4 Apr 2017 13:13:58 +0200
Subject: raid5-ppl: partial parity calculation optimization

In case of read-modify-write, partial parity is the same as the result
of ops_run_prexor5(), so we can just copy sh->dev[pd_idx].page into
sh->ppl_page instead of calculating it again.

Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/raid5-ppl.c | 20 ++++++++++----------
 drivers/md/raid5.c     |  6 +++---
 2 files changed, 13 insertions(+), 13 deletions(-)

(limited to 'drivers/md/raid5.c')

diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c
index 71968cf47d7d..4eb0ebcf9c29 100644
--- a/drivers/md/raid5-ppl.c
+++ b/drivers/md/raid5-ppl.c
@@ -153,7 +153,7 @@ ops_run_partial_parity(struct stripe_head *sh, struct raid5_percpu *percpu,
 		       struct dma_async_tx_descriptor *tx)
 {
 	int disks = sh->disks;
-	struct page **xor_srcs = flex_array_get(percpu->scribble, 0);
+	struct page **srcs = flex_array_get(percpu->scribble, 0);
 	int count = 0, pd_idx = sh->pd_idx, i;
 	struct async_submit_ctl submit;
 
@@ -166,18 +166,18 @@ ops_run_partial_parity(struct stripe_head *sh, struct raid5_percpu *percpu,
 	 * differently.
 	 */
 	if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
-		/* rmw: xor old data and parity from updated disks */
-		for (i = disks; i--;) {
-			struct r5dev *dev = &sh->dev[i];
-			if (test_bit(R5_Wantdrain, &dev->flags) || i == pd_idx)
-				xor_srcs[count++] = dev->page;
-		}
+		/*
+		 * rmw: xor old data and parity from updated disks
+		 * This is calculated earlier by ops_run_prexor5() so just copy
+		 * the parity dev page.
+		 */
+		srcs[count++] = sh->dev[pd_idx].page;
 	} else if (sh->reconstruct_state == reconstruct_state_drain_run) {
 		/* rcw: xor data from all not updated disks */
 		for (i = disks; i--;) {
 			struct r5dev *dev = &sh->dev[i];
 			if (test_bit(R5_UPTODATE, &dev->flags))
-				xor_srcs[count++] = dev->page;
+				srcs[count++] = dev->page;
 		}
 	} else {
 		return tx;
@@ -188,10 +188,10 @@ ops_run_partial_parity(struct stripe_head *sh, struct raid5_percpu *percpu,
 			  + sizeof(struct page *) * (sh->disks + 2));
 
 	if (count == 1)
-		tx = async_memcpy(sh->ppl_page, xor_srcs[0], 0, 0, PAGE_SIZE,
+		tx = async_memcpy(sh->ppl_page, srcs[0], 0, 0, PAGE_SIZE,
 				  &submit);
 	else
-		tx = async_xor(sh->ppl_page, xor_srcs, 0, count, PAGE_SIZE,
+		tx = async_xor(sh->ppl_page, srcs, 0, count, PAGE_SIZE,
 			       &submit);
 
 	return tx;
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index e04d7b11bc87..f3692ff4262b 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2079,9 +2079,6 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
 			async_tx_ack(tx);
 	}
 
-	if (test_bit(STRIPE_OP_PARTIAL_PARITY, &ops_request))
-		tx = ops_run_partial_parity(sh, percpu, tx);
-
 	if (test_bit(STRIPE_OP_PREXOR, &ops_request)) {
 		if (level < 6)
 			tx = ops_run_prexor5(sh, percpu, tx);
@@ -2089,6 +2086,9 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
 			tx = ops_run_prexor6(sh, percpu, tx);
 	}
 
+	if (test_bit(STRIPE_OP_PARTIAL_PARITY, &ops_request))
+		tx = ops_run_partial_parity(sh, percpu, tx);
+
 	if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) {
 		tx = ops_run_biodrain(sh, tx);
 		overlap_clear++;
-- 
cgit v1.2.3


From dd7a8f5dee81ffb1794df1103f07c63fd4f1d766 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.com>
Date: Wed, 5 Apr 2017 14:05:51 +1000
Subject: md/raid5: make chunk_aligned_read() split bios more cleanly.

chunk_aligned_read() currently uses fs_bio_set - which is meant for
filesystems to use - and loops if multiple splits are needed, which is
not best practice.
As this is only used for READ requests, not writes, it is unlikely
to cause a problem.  However it is best to be consistent in how
we split bios, and to follow the pattern used in raid1/raid10.

So create a private bioset, bio_split, and use it to perform a single
split, submitting the remainder to generic_make_request() for later
processing.

Signed-off-by: NeilBrown <neilb@suse.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/raid5.c | 33 +++++++++++++++++----------------
 drivers/md/raid5.h |  1 +
 2 files changed, 18 insertions(+), 16 deletions(-)

(limited to 'drivers/md/raid5.c')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index f3692ff4262b..356cd9c7c753 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -5246,24 +5246,20 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
 static struct bio *chunk_aligned_read(struct mddev *mddev, struct bio *raid_bio)
 {
 	struct bio *split;
+	sector_t sector = raid_bio->bi_iter.bi_sector;
+	unsigned chunk_sects = mddev->chunk_sectors;
+	unsigned sectors = chunk_sects - (sector & (chunk_sects-1));
 
-	do {
-		sector_t sector = raid_bio->bi_iter.bi_sector;
-		unsigned chunk_sects = mddev->chunk_sectors;
-		unsigned sectors = chunk_sects - (sector & (chunk_sects-1));
-
-		if (sectors < bio_sectors(raid_bio)) {
-			split = bio_split(raid_bio, sectors, GFP_NOIO, fs_bio_set);
-			bio_chain(split, raid_bio);
-		} else
-			split = raid_bio;
+	if (sectors < bio_sectors(raid_bio)) {
+		struct r5conf *conf = mddev->private;
+		split = bio_split(raid_bio, sectors, GFP_NOIO, conf->bio_split);
+		bio_chain(split, raid_bio);
+		generic_make_request(raid_bio);
+		raid_bio = split;
+	}
 
-		if (!raid5_read_one_chunk(mddev, split)) {
-			if (split != raid_bio)
-				generic_make_request(raid_bio);
-			return split;
-		}
-	} while (split != raid_bio);
+	if (!raid5_read_one_chunk(mddev, raid_bio))
+		return raid_bio;
 
 	return NULL;
 }
@@ -6747,6 +6743,8 @@ static void free_conf(struct r5conf *conf)
 		if (conf->disks[i].extra_page)
 			put_page(conf->disks[i].extra_page);
 	kfree(conf->disks);
+	if (conf->bio_split)
+		bioset_free(conf->bio_split);
 	kfree(conf->stripe_hashtbl);
 	kfree(conf->pending_data);
 	kfree(conf);
@@ -6922,6 +6920,9 @@ static struct r5conf *setup_conf(struct mddev *mddev)
 			goto abort;
 	}
 
+	conf->bio_split = bioset_create(BIO_POOL_SIZE, 0);
+	if (!conf->bio_split)
+		goto abort;
 	conf->mddev = mddev;
 
 	if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL)
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index cdc7f92e1806..625c7f16fd6b 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -646,6 +646,7 @@ struct r5conf {
 	int			pool_size; /* number of disks in stripeheads in pool */
 	spinlock_t		device_lock;
 	struct disk_info	*disks;
+	struct bio_set		*bio_split;
 
 	/* When taking over an array from a different personality, we store
 	 * the new thread here until we fully activate the array.
-- 
cgit v1.2.3


From e5bc9c3c5432f5531a58e6fdd9f6c6587f2137b3 Mon Sep 17 00:00:00 2001
From: Guoqing Jiang <gqjiang@suse.com>
Date: Mon, 24 Apr 2017 15:58:04 +0800
Subject: md: clear WantReplacement once disk is removed

We can clear the 'WantReplacement' flag directly, no
matter whether its replacement exists or not, since the
semantics are the same as before.

Also since the disk is removed from array, then
it is straightforward to remove 'WantReplacement'
flag and the comments in raid10/5 can be removed
as well.

Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/raid1.c  | 6 +++---
 drivers/md/raid10.c | 8 ++------
 drivers/md/raid5.c  | 9 +++------
 3 files changed, 8 insertions(+), 15 deletions(-)

(limited to 'drivers/md/raid5.c')

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 14a9d36b25b8..70a596c10306 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1831,9 +1831,9 @@ static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
 			p->rdev = repl;
 			conf->mirrors[conf->raid_disks + number].rdev = NULL;
 			unfreeze_array(conf);
-			clear_bit(WantReplacement, &rdev->flags);
-		} else
-			clear_bit(WantReplacement, &rdev->flags);
+		}
+
+		clear_bit(WantReplacement, &rdev->flags);
 		err = md_integrity_register(mddev);
 	}
 abort:
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 5de951bcd24c..2883b720a265 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1874,13 +1874,9 @@ static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
 			   * but will never see neither -- if they are careful.
 			   */
 		p->replacement = NULL;
-		clear_bit(WantReplacement, &rdev->flags);
-	} else
-		/* We might have just remove the Replacement as faulty
-		 * Clear the flag just in case
-		 */
-		clear_bit(WantReplacement, &rdev->flags);
+	}
 
+	clear_bit(WantReplacement, &rdev->flags);
 	err = md_integrity_register(mddev);
 
 abort:
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 356cd9c7c753..3d971e5a1b0e 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -7603,15 +7603,12 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
 			   * but will never see neither - if they are careful
 			   */
 		p->replacement = NULL;
-		clear_bit(WantReplacement, &rdev->flags);
 
 		if (!err)
 			err = log_modify(conf, p->rdev, true);
-	} else
-		/* We might have just removed the Replacement as faulty-
-		 * clear the bit just in case
-		 */
-		clear_bit(WantReplacement, &rdev->flags);
+	}
+
+	clear_bit(WantReplacement, &rdev->flags);
 abort:
 
 	print_raid5_conf(conf);
-- 
cgit v1.2.3