From d5d885fd514fcebc9da5503c88aa0112df7514ef Mon Sep 17 00:00:00 2001
From: Song Liu
Date: Sun, 19 Nov 2017 22:17:01 -0800
Subject: md: introduce new personality function start()

In do_md_run(), md threads should not wake up until the array is fully
initialized in md_run(). However, in raid5_run(), raid5-cache may wake
up mddev->thread to flush stripes that need to be written back. This
design doesn't break badly right now, but it could lead to a bad bug in
the future.

This patch tries to resolve this problem by splitting start-up work
into two personality functions, run() and start(). Tasks that do not
require the md threads should go into run(), while tasks that require
the md threads go into start().

r5l_load_log() is moved to raid5_start(), so it is not called until
the md threads are started in do_md_run().

Signed-off-by: Song Liu
Signed-off-by: Shaohua Li
---
 drivers/md/dm-raid.c     |  9 +++++++++
 drivers/md/md.c          | 26 ++++++++++++++++++++------
 drivers/md/md.h          |  8 ++++++++
 drivers/md/raid5-cache.c | 22 +++++++++++++++++-----
 drivers/md/raid5-log.h   |  1 +
 drivers/md/raid5.c       | 10 ++++++++++
 6 files changed, 65 insertions(+), 11 deletions(-)

diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 6319d846e0ad..e5ef0757fe23 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -3151,6 +3151,14 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 		goto bad;
 	}
 
+	r = md_start(&rs->md);
+
+	if (r) {
+		ti->error = "Failed to start raid array";
+		mddev_unlock(&rs->md);
+		goto bad_md_start;
+	}
+
 	rs->callbacks.congested_fn = raid_is_congested;
 	dm_table_add_target_callbacks(ti->table, &rs->callbacks);
 
@@ -3198,6 +3206,7 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	mddev_unlock(&rs->md);
 	return 0;
 
+bad_md_start:
 bad_journal_mode_set:
 bad_stripe_cache:
 bad_check_reshape:
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 4e4dee0ec2de..a71adb3c34b9 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -5560,11 +5560,6 @@ int md_run(struct mddev *mddev)
 	if (start_readonly && mddev->ro == 0)
 		mddev->ro = 2; /* read-only, but switch on first write */
 
-	/*
-	 * NOTE: some pers->run(), for example r5l_recovery_log(), wakes
-	 * up mddev->thread. It is important to initialize critical
-	 * resources for mddev->thread BEFORE calling pers->run().
-	 */
 	err = pers->run(mddev);
 	if (err)
 		pr_warn("md: pers->run() failed ...\n");
@@ -5678,6 +5673,9 @@ static int do_md_run(struct mddev *mddev)
 	if (mddev_is_clustered(mddev))
 		md_allow_write(mddev);
 
+	/* run start-up tasks that require md_thread */
+	md_start(mddev);
+
 	md_wakeup_thread(mddev->thread);
 	md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
 
@@ -5689,6 +5687,21 @@ out:
 	return err;
 }
 
+int md_start(struct mddev *mddev)
+{
+	int ret = 0;
+
+	if (mddev->pers->start) {
+		set_bit(MD_RECOVERY_WAIT, &mddev->recovery);
+		md_wakeup_thread(mddev->thread);
+		ret = mddev->pers->start(mddev);
+		clear_bit(MD_RECOVERY_WAIT, &mddev->recovery);
+		md_wakeup_thread(mddev->sync_thread);
+	}
+	return ret;
+}
+EXPORT_SYMBOL_GPL(md_start);
+
 static int restart_array(struct mddev *mddev)
 {
 	struct gendisk *disk = mddev->gendisk;
@@ -8169,7 +8182,8 @@ void md_do_sync(struct md_thread *thread)
 	int ret;
 
 	/* just incase thread restarts... */
-	if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
+	if (test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
+	    test_bit(MD_RECOVERY_WAIT, &mddev->recovery))
 		return;
 	if (mddev->ro) {/* never try to sync a read-only array */
 		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 7d6bcf0eba0c..be8f72a9e30b 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -485,6 +485,7 @@ enum recovery_flags {
 	MD_RECOVERY_RESHAPE,	/* A reshape is happening */
 	MD_RECOVERY_FROZEN,	/* User request to abort, and not restart, any action */
 	MD_RECOVERY_ERROR,	/* sync-action interrupted because io-error */
+	MD_RECOVERY_WAIT,	/* waiting for pers->start() to finish */
 };
 
 static inline int __must_check mddev_lock(struct mddev *mddev)
@@ -523,7 +524,13 @@ struct md_personality
 	struct list_head list;
 	struct module *owner;
 	bool (*make_request)(struct mddev *mddev, struct bio *bio);
+	/*
+	 * start-up tasks that do NOT require md threads. Tasks that
+	 * require md threads should go into start().
+	 */
 	int (*run)(struct mddev *mddev);
+	/* start-up tasks that require md threads */
+	int (*start)(struct mddev *mddev);
 	void (*free)(struct mddev *mddev, void *priv);
 	void (*status)(struct seq_file *seq, struct mddev *mddev);
 	/* error_handler must set ->faulty and clear ->in_sync
@@ -687,6 +694,7 @@ extern int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale);
 extern void mddev_init(struct mddev *mddev);
 extern int md_run(struct mddev *mddev);
+extern int md_start(struct mddev *mddev);
 extern void md_stop(struct mddev *mddev);
 extern void md_stop_writes(struct mddev *mddev);
 extern int md_rdev_init(struct md_rdev *rdev);
diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 39f31f07ffe9..f259a5fd3fbd 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -2448,7 +2448,6 @@ static void r5c_recovery_flush_data_only_stripes(struct r5l_log *log,
 		raid5_release_stripe(sh);
 	}
 
-	md_wakeup_thread(conf->mddev->thread);
 	/* reuse conf->wait_for_quiescent in recovery */
 	wait_event(conf->wait_for_quiescent,
 		   atomic_read(&conf->active_stripes) == 0);
@@ -3036,6 +3035,23 @@ ioerr:
 	return ret;
 }
 
+int r5l_start(struct r5l_log *log)
+{
+	int ret;
+
+	if (!log)
+		return 0;
+
+	ret = r5l_load_log(log);
+	if (ret) {
+		struct mddev *mddev = log->rdev->mddev;
+		struct r5conf *conf = mddev->private;
+
+		r5l_exit_log(conf);
+	}
+	return ret;
+}
+
 void r5c_update_on_rdev_error(struct mddev *mddev, struct md_rdev *rdev)
 {
 	struct r5conf *conf = mddev->private;
@@ -3138,13 +3154,9 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
 
 	rcu_assign_pointer(conf->log, log);
 
-	if (r5l_load_log(log))
-		goto error;
-
 	set_bit(MD_HAS_JOURNAL, &conf->mddev->flags);
 	return 0;
 
-error:
 	rcu_assign_pointer(conf->log, NULL);
 	md_unregister_thread(&log->reclaim_thread);
 reclaim_thread:
diff --git a/drivers/md/raid5-log.h b/drivers/md/raid5-log.h
index 284578b0a349..3860041e8b74 100644
--- a/drivers/md/raid5-log.h
+++ b/drivers/md/raid5-log.h
@@ -32,6 +32,7 @@ extern struct md_sysfs_entry r5c_journal_mode;
 extern void r5c_update_on_rdev_error(struct mddev *mddev,
 				     struct md_rdev *rdev);
 extern bool r5c_big_stripe_cached(struct r5conf *conf, sector_t sect);
+extern int r5l_start(struct r5l_log *log);
 
 extern struct dma_async_tx_descriptor *
 ops_run_partial_parity(struct stripe_head *sh, struct raid5_percpu *percpu,
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 98ce4272ace9..5a2a29bd02dd 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -8364,6 +8364,13 @@ static int raid5_change_consistency_policy(struct mddev *mddev, const char *buf)
 	return err;
 }
 
+static int raid5_start(struct mddev *mddev)
+{
+	struct r5conf *conf = mddev->private;
+
+	return r5l_start(conf->log);
+}
+
 static struct md_personality raid6_personality =
 {
 	.name = "raid6",
@@ -8371,6 +8378,7 @@ static struct md_personality raid6_personality =
 	.owner = THIS_MODULE,
 	.make_request = raid5_make_request,
 	.run = raid5_run,
+	.start = raid5_start,
 	.free = raid5_free,
 	.status = raid5_status,
 	.error_handler = raid5_error,
@@ -8395,6 +8403,7 @@ static struct md_personality raid5_personality =
 	.owner = THIS_MODULE,
 	.make_request = raid5_make_request,
 	.run = raid5_run,
+	.start = raid5_start,
 	.free = raid5_free,
 	.status = raid5_status,
 	.error_handler = raid5_error,
@@ -8420,6 +8429,7 @@ static struct md_personality raid4_personality =
 	.owner = THIS_MODULE,
 	.make_request = raid5_make_request,
 	.run = raid5_run,
+	.start = raid5_start,
 	.free = raid5_free,
 	.status = raid5_status,
 	.error_handler = raid5_error,
-- 
cgit v1.2.3

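The split is easiest to see outside the kernel. Below is a minimal userspace
sketch of the contract this patch establishes; it is not kernel code, a
pthread stands in for md_thread, and all names (pers_run, pers_start, struct
array) are illustrative. run() may only do setup that is safe before the
worker thread exists, while start() runs strictly after the thread has been
created, mirroring md_run() followed by md_start() in do_md_run().

/*
 * Userspace sketch of the run()/start() two-phase init pattern.
 * Build with: cc -pthread sketch.c
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct array {
        pthread_t thread;             /* stand-in for mddev->thread */
        bool thread_running;
        int (*run)(struct array *);   /* thread-free setup */
        int (*start)(struct array *); /* may rely on the thread */
};

static void *worker(void *arg)
{
        /* stand-in for the md thread: would flush stripes, etc. */
        return NULL;
}

static int pers_run(struct array *a)
{
        /* safe here: allocate and validate state, don't wake the thread */
        printf("run(): thread-free initialization\n");
        return 0;
}

static int pers_start(struct array *a)
{
        /* only reached once the worker thread exists */
        if (!a->thread_running)
                return -1;
        printf("start(): log replay that needs the worker thread\n");
        return 0;
}

int main(void)
{
        struct array a = { .run = pers_run, .start = pers_start };

        if (a.run(&a))                          /* like md_run() */
                return EXIT_FAILURE;
        if (pthread_create(&a.thread, NULL, worker, &a))
                return EXIT_FAILURE;
        a.thread_running = true;
        if (a.start(&a))                        /* like md_start() */
                return EXIT_FAILURE;
        pthread_join(a.thread, NULL);
        return EXIT_SUCCESS;
}
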
From 474beb575c03e0e7f1a704ac428916898f81b3cd Mon Sep 17 00:00:00 2001
From: NeilBrown
Date: Mon, 4 Dec 2017 08:21:04 +1100
Subject: md/raid1,raid10: silence warning about wait-within-wait

If you prepare_to_wait() after a previous prepare_to_wait(), but before
calling schedule(), you get a warning:

  do not call blocking ops when !TASK_RUNNING; state=2

This is appropriate, as it is often a bug. The event that the first
prepare_to_wait() expects might wake up the schedule following the
second prepare_to_wait(), which could be confusing.

However, if both prepare_to_wait()s are part of simple wait_event()
loops, and if the inner one is rarely called, then there is no problem.
The inner loop is too simple to get confused by a stray wakeup, and the
outer loop won't spin unduly because the inner one doesn't affect it
often.

This pattern occurs in both raid1.c and raid10.c in the use of
flush_pending_writes(). The warning can be silenced by setting
current->state to TASK_RUNNING.

Signed-off-by: NeilBrown
Signed-off-by: Shaohua Li
---
 drivers/md/raid1.c  | 11 +++++++++++
 drivers/md/raid10.c | 12 ++++++++++++
 2 files changed, 23 insertions(+)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 6df398e3a008..b2eae332e1a2 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -815,6 +815,17 @@ static void flush_pending_writes(struct r1conf *conf)
 		bio = bio_list_get(&conf->pending_bio_list);
 		conf->pending_count = 0;
 		spin_unlock_irq(&conf->device_lock);
+
+		/*
+		 * As this is called in a wait_event() loop (see freeze_array),
+		 * current->state might be TASK_UNINTERRUPTIBLE which will
+		 * cause a warning when we prepare to wait again. As it is
+		 * rare that this path is taken, it is perfectly safe to force
+		 * us to go around the wait_event() loop again, so the warning
+		 * is a false-positive. Silence the warning by resetting
+		 * thread state
+		 */
+		__set_current_state(TASK_RUNNING);
 		blk_start_plug(&plug);
 		flush_bio_list(conf, bio);
 		blk_finish_plug(&plug);
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index c131835cf008..99c9207899a7 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -900,6 +900,18 @@ static void flush_pending_writes(struct r10conf *conf)
 		bio = bio_list_get(&conf->pending_bio_list);
 		conf->pending_count = 0;
 		spin_unlock_irq(&conf->device_lock);
+
+		/*
+		 * As this is called in a wait_event() loop (see freeze_array),
+		 * current->state might be TASK_UNINTERRUPTIBLE which will
+		 * cause a warning when we prepare to wait again. As it is
+		 * rare that this path is taken, it is perfectly safe to force
+		 * us to go around the wait_event() loop again, so the warning
+		 * is a false-positive. Silence the warning by resetting
+		 * thread state
+		 */
+		__set_current_state(TASK_RUNNING);
+
 		blk_start_plug(&plug);
 		/* flush any pending bitmap writes to
 		 * disk before proceeding w/ I/O */
-- 
cgit v1.2.3

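The rule being worked around here can be modeled in a few lines of userspace
C. The sketch below is illustrative only (the real check lives in the
scheduler's might_sleep() machinery, and these enum and function names are
invented): a task's "state" must be running before anything that may block,
and the rarely-taken inner path restores the running state first, exactly as
__set_current_state(TASK_RUNNING) does in the hunks above.

/* Userspace model of the wait-within-wait warning. cc model.c */
#include <stdio.h>

enum task_state { STATE_RUNNING, STATE_UNINTERRUPTIBLE };

static enum task_state cur_state = STATE_RUNNING;

static void prepare_to_wait_model(void)
{
        /* the kernel warns if a blocking op runs while !TASK_RUNNING */
        if (cur_state != STATE_RUNNING)
                fprintf(stderr, "warning: blocking op while !TASK_RUNNING\n");
        cur_state = STATE_UNINTERRUPTIBLE;
}

static void finish_wait_model(void)
{
        cur_state = STATE_RUNNING;
}

static void flush_pending_writes_model(void)
{
        /* inner, rarely-taken path: reset state so nested waiting is safe */
        cur_state = STATE_RUNNING;      /* __set_current_state(TASK_RUNNING) */
        /* ... submit the queued writes, which may block ... */
}

int main(void)
{
        /* outer wait_event()-style loop, as in freeze_array() */
        for (int pass = 0; pass < 2; pass++) {
                prepare_to_wait_model();
                flush_pending_writes_model();   /* no warning: state reset */
                finish_wait_model();
        }
        puts("no warnings: inner path restored the running state");
        return 0;
}
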
From 92e6245deab80f0934a102ba969d8b891b8ba5bf Mon Sep 17 00:00:00 2001
From: Song Liu
Date: Tue, 19 Dec 2017 11:43:07 -0800
Subject: md/r5cache: print more info of log recovery

Log recovery is critical for raid5 journal/cache. Printing information
about each recovery by default will help the system admin monitor the
status of the array.

Signed-off-by: Song Liu
Signed-off-by: Shaohua Li
---
 drivers/md/raid5-cache.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index f259a5fd3fbd..8b8f9db05e91 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -2490,10 +2490,10 @@ static int r5l_recovery_log(struct r5l_log *log)
 	ctx->seq += 10000;
 
 	if ((ctx->data_only_stripes == 0) && (ctx->data_parity_stripes == 0))
-		pr_debug("md/raid:%s: starting from clean shutdown\n",
+		pr_info("md/raid:%s: starting from clean shutdown\n",
 			 mdname(mddev));
 	else
-		pr_debug("md/raid:%s: recovering %d data-only stripes and %d data-parity stripes\n",
+		pr_info("md/raid:%s: recovering %d data-only stripes and %d data-parity stripes\n",
 			 mdname(mddev), ctx->data_only_stripes,
 			 ctx->data_parity_stripes);
-- 
cgit v1.2.3

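For context, the reason switching to pr_info() makes these messages visible
is that pr_debug() normally compiles to nothing unless the file is built with
DEBUG (or dynamic debug enables it at runtime), while pr_info() always prints
at the KERN_INFO level. A simplified userspace model, not the actual kernel
macros:

/* Simplified model of printk levels. cc levels.c [-DDEBUG] */
#include <stdio.h>

#define pr_info(fmt, ...)  printf("[info] " fmt, ##__VA_ARGS__)
#ifdef DEBUG
#define pr_debug(fmt, ...) printf("[debug] " fmt, ##__VA_ARGS__)
#else
#define pr_debug(fmt, ...) do { } while (0)     /* compiled out */
#endif

int main(void)
{
        int data_only = 3, data_parity = 5;

        /* with the patch, admins see this in a default build: */
        pr_info("md/raid:md0: recovering %d data-only stripes and %d data-parity stripes\n",
                data_only, data_parity);
        /* before the patch, this vanished unless built with -DDEBUG: */
        pr_debug("md/raid:md0: starting from clean shutdown\n");
        return 0;
}
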
From 1532d9e87e8b2377f12929f9e40724d5fbe6ecc5 Mon Sep 17 00:00:00 2001
From: Tomasz Majchrzak
Date: Wed, 27 Dec 2017 10:31:40 +0100
Subject: raid5-ppl: PPL support for disks with write-back cache enabled

To provide data consistency with PPL for disks with write-back cache
enabled, all data has to be flushed to disks before the next PPL entry.
The disks to be flushed are marked in a bitmap, which is modified under
a mutex and only read after the PPL io unit is submitted.

A limit of 64 disks in the array has been introduced to keep the data
structures and implementation simple. RAID5 arrays with so many disks
are unlikely, given the high risk of multiple simultaneous disk
failures, so this restriction should not be a real-life limitation.

With write-back cache disabled, the next PPL entry is submitted when
the data write for the current one completes. A data flush defers the
next log submission, so trigger it when no stripes are found for
handling.

As PPL ensures all data is flushed to disk at request completion, just
acknowledge the flush request when PPL is enabled.

Signed-off-by: Tomasz Majchrzak
Signed-off-by: Shaohua Li
---
 Documentation/md/raid5-ppl.txt |   7 +-
 drivers/md/md.c                |   5 +-
 drivers/md/md.h                |   1 +
 drivers/md/raid5-cache.c       |   5 --
 drivers/md/raid5-log.h         |  29 +++++++
 drivers/md/raid5-ppl.c         | 167 ++++++++++++++++++++++++++++++++++++++---
 drivers/md/raid5.c             |   6 +-
 7 files changed, 195 insertions(+), 25 deletions(-)

diff --git a/Documentation/md/raid5-ppl.txt b/Documentation/md/raid5-ppl.txt
index 127072b09363..bfa092589e00 100644
--- a/Documentation/md/raid5-ppl.txt
+++ b/Documentation/md/raid5-ppl.txt
@@ -39,6 +39,7 @@ case the behavior is the same as in plain raid5.
 
 PPL is available for md version-1 metadata and external (specifically IMSM)
 metadata arrays. It can be enabled using mdadm option --consistency-policy=ppl.
 
-Currently, volatile write-back cache should be disabled on all member drives
-when using PPL. Otherwise it cannot guarantee consistency in case of power
-failure.
+There is a limit of 64 disks in the array when using PPL, which keeps the
+data structures and implementation simple. RAID5 arrays with so many disks
+are unlikely, given the high risk of multiple simultaneous disk failures,
+so this restriction should not be a real-life limitation.
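The 64-disk limit falls directly out of the data structure: the set of disks
needing a flush is kept in a single unsigned long bitmap (disk_flush_bitmap,
added to struct ppl_log later in this patch), so the cap is
sizeof(unsigned long) * 8 bits, i.e. 64 on LP64 systems (32 with a 32-bit
unsigned long). A small standalone sketch of the arithmetic and bit
operations involved; the kernel itself uses FIELD_SIZEOF(), set_bit() and
for_each_set_bit():

/* Sketch of the disk_flush_bitmap sizing and bit ops. cc bitmap.c */
#include <limits.h>
#include <stdio.h>

#define BITS_PER_BYTE CHAR_BIT

int main(void)
{
        unsigned long disk_flush_bitmap = 0;
        int max_disks = sizeof(disk_flush_bitmap) * BITS_PER_BYTE;

        printf("max PPL disks: %d\n", max_disks);  /* 64 on LP64 */

        /* mark disks 0 and 5 as needing a flush before the next io_unit */
        disk_flush_bitmap |= 1UL << 0;
        disk_flush_bitmap |= 1UL << 5;

        /* walk the set bits, as for_each_set_bit() does in ppl_do_flush() */
        for (int i = 0; i < max_disks; i++)
                if (disk_flush_bitmap & (1UL << i))
                        printf("flush member disk %d\n", i);
        return 0;
}
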
diff --git a/drivers/md/md.c b/drivers/md/md.c
index a71adb3c34b9..1b5c9af2c47d 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -711,7 +711,7 @@ static struct md_rdev *find_rdev(struct mddev *mddev, dev_t dev)
 	return NULL;
 }
 
-static struct md_rdev *find_rdev_rcu(struct mddev *mddev, dev_t dev)
+struct md_rdev *md_find_rdev_rcu(struct mddev *mddev, dev_t dev)
 {
 	struct md_rdev *rdev;
 
@@ -721,6 +721,7 @@ static struct md_rdev *find_rdev_rcu(struct mddev *mddev, dev_t dev)
 
 	return NULL;
 }
+EXPORT_SYMBOL_GPL(md_find_rdev_rcu);
 
 static struct md_personality *find_pers(int level, char *clevel)
 {
@@ -7010,7 +7011,7 @@ static int set_disk_faulty(struct mddev *mddev, dev_t dev)
 		return -ENODEV;
 
 	rcu_read_lock();
-	rdev = find_rdev_rcu(mddev, dev);
+	rdev = md_find_rdev_rcu(mddev, dev);
 	if (!rdev)
 		err = -ENODEV;
 	else {
diff --git a/drivers/md/md.h b/drivers/md/md.h
index be8f72a9e30b..58cd20a5e85e 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -710,6 +710,7 @@ extern void md_reload_sb(struct mddev *mddev, int raid_disk);
 extern void md_update_sb(struct mddev *mddev, int force);
 extern void md_kick_rdev_from_array(struct md_rdev * rdev);
 struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr);
+struct md_rdev *md_find_rdev_rcu(struct mddev *mddev, dev_t dev);
 
 static inline void rdev_dec_pending(struct md_rdev *rdev, struct mddev *mddev)
 {
diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 8b8f9db05e91..3c65f52b68f5 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -1111,9 +1111,6 @@ void r5l_write_stripe_run(struct r5l_log *log)
 
 int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio)
 {
-	if (!log)
-		return -ENODEV;
-
 	if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) {
 		/*
 		 * in write through (journal only)
@@ -1592,8 +1589,6 @@ void r5l_wake_reclaim(struct r5l_log *log, sector_t space)
 void r5l_quiesce(struct r5l_log *log, int quiesce)
 {
 	struct mddev *mddev;
-	if (!log)
-		return;
 
 	if (quiesce) {
 		/* make sure r5l_write_super_and_discard_space exits */
diff --git a/drivers/md/raid5-log.h b/drivers/md/raid5-log.h
index 3860041e8b74..0c76bcedfc1c 100644
--- a/drivers/md/raid5-log.h
+++ b/drivers/md/raid5-log.h
@@ -43,6 +43,7 @@ extern int ppl_write_stripe(struct r5conf *conf, struct stripe_head *sh);
 extern void ppl_write_stripe_run(struct r5conf *conf);
 extern void ppl_stripe_write_finished(struct stripe_head *sh);
 extern int ppl_modify_log(struct r5conf *conf, struct md_rdev *rdev, bool add);
+extern void ppl_quiesce(struct r5conf *conf, int quiesce);
 
 static inline bool raid5_has_ppl(struct r5conf *conf)
 {
@@ -88,6 +89,34 @@ static inline void log_write_stripe_run(struct r5conf *conf)
 		ppl_write_stripe_run(conf);
 }
 
+static inline void log_flush_stripe_to_raid(struct r5conf *conf)
+{
+	if (conf->log)
+		r5l_flush_stripe_to_raid(conf->log);
+	else if (raid5_has_ppl(conf))
+		ppl_write_stripe_run(conf);
+}
+
+static inline int log_handle_flush_request(struct r5conf *conf, struct bio *bio)
+{
+	int ret = -ENODEV;
+
+	if (conf->log)
+		ret = r5l_handle_flush_request(conf->log, bio);
+	else if (raid5_has_ppl(conf))
+		ret = 0;
+
+	return ret;
+}
+
+static inline void log_quiesce(struct r5conf *conf, int quiesce)
+{
+	if (conf->log)
+		r5l_quiesce(conf->log, quiesce);
+	else if (raid5_has_ppl(conf))
+		ppl_quiesce(conf, quiesce);
+}
+
 static inline void log_exit(struct r5conf *conf)
 {
 	if (conf->log)
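The quiesce path this patch adds (ppl_quiesce(), in the raid5-ppl.c hunks
that follow) relies on the classic wait-queue pattern: re-check a condition
under a lock and sleep until it becomes true, being woken by each completion.
A userspace sketch of that pattern, with pthread primitives standing in for
wait_event_lock_irq() and wake_up(); all names are illustrative:

/* Userspace sketch of the quiesce/wait pattern. cc -pthread quiesce.c */
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t io_list_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t wait_for_quiescent = PTHREAD_COND_INITIALIZER;
static int submitted_io_units = 2;

static void *io_completion(void *arg)
{
        /* each completion drops the count and wakes the waiter */
        for (int i = 0; i < 2; i++) {
                usleep(1000);
                pthread_mutex_lock(&io_list_lock);
                submitted_io_units--;
                pthread_cond_signal(&wait_for_quiescent);
                pthread_mutex_unlock(&io_list_lock);
        }
        return NULL;
}

static void quiesce(void)
{
        pthread_mutex_lock(&io_list_lock);
        while (submitted_io_units > 0)  /* condition re-checked under lock */
                pthread_cond_wait(&wait_for_quiescent, &io_list_lock);
        pthread_mutex_unlock(&io_list_lock);
}

int main(void)
{
        pthread_t t;

        pthread_create(&t, NULL, io_completion, NULL);
        quiesce();
        printf("quiesced: no submitted io_units remain\n");
        pthread_join(t, NULL);
        return 0;
}
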
diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c
index 628c0bf7b9fd..2764c2290062 100644
--- a/drivers/md/raid5-ppl.c
+++ b/drivers/md/raid5-ppl.c
@@ -85,6 +85,9 @@
  * (for a single member disk). New io_units are added to the end of the list
  * and the first io_unit is submitted, if it is not submitted already.
  * The current io_unit accepting new stripes is always at the end of the list.
+ *
+ * If write-back cache is enabled for any of the disks in the array, its data
+ * must be flushed before the next io_unit is submitted.
  */
 
 #define PPL_SPACE_SIZE (128 * 1024)
@@ -104,6 +107,7 @@ struct ppl_conf {
 	struct kmem_cache *io_kc;
 	mempool_t *io_pool;
 	struct bio_set *bs;
+	struct bio_set *flush_bs;
 
 	/* used only for recovery */
 	int recovered_entries;
@@ -128,6 +132,8 @@ struct ppl_log {
 	sector_t next_io_sector;
 	unsigned int entry_space;
 	bool use_multippl;
+	bool wb_cache_on;
+	unsigned long disk_flush_bitmap;
 };
 
 #define PPL_IO_INLINE_BVECS 32
@@ -145,6 +151,7 @@ struct ppl_io_unit {
 
 	struct list_head stripe_list;	/* stripes added to the io_unit */
 	atomic_t pending_stripes;	/* how many stripes not written to raid */
+	atomic_t pending_flushes;	/* how many disk flushes are in progress */
 
 	bool submitted;			/* true if write to log started */
 
@@ -249,6 +256,7 @@ static struct ppl_io_unit *ppl_new_iounit(struct ppl_log *log,
 	INIT_LIST_HEAD(&io->log_sibling);
 	INIT_LIST_HEAD(&io->stripe_list);
 	atomic_set(&io->pending_stripes, 0);
+	atomic_set(&io->pending_flushes, 0);
 	bio_init(&io->bio, io->biovec, PPL_IO_INLINE_BVECS);
 
 	pplhdr = page_address(io->header_page);
@@ -475,7 +483,18 @@ static void ppl_submit_iounit(struct ppl_io_unit *io)
 	if (log->use_multippl)
 		log->next_io_sector += (PPL_HEADER_SIZE + io->pp_size) >> 9;
 
+	WARN_ON(log->disk_flush_bitmap != 0);
+
 	list_for_each_entry(sh, &io->stripe_list, log_list) {
+		for (i = 0; i < sh->disks; i++) {
+			struct r5dev *dev = &sh->dev[i];
+
+			if ((ppl_conf->child_logs[i].wb_cache_on) &&
+			    (test_bit(R5_Wantwrite, &dev->flags))) {
+				set_bit(i, &log->disk_flush_bitmap);
+			}
+		}
+
 		/* entries for full stripe writes have no partial parity */
 		if (test_bit(STRIPE_FULL_WRITE, &sh->state))
 			continue;
@@ -540,6 +559,7 @@ static void ppl_io_unit_finished(struct ppl_io_unit *io)
 {
 	struct ppl_log *log = io->log;
 	struct ppl_conf *ppl_conf = log->ppl_conf;
+	struct r5conf *conf = ppl_conf->mddev->private;
 	unsigned long flags;
 
 	pr_debug("%s: seq: %llu\n", __func__, io->seq);
@@ -565,6 +585,112 @@ static void ppl_io_unit_finished(struct ppl_io_unit *io)
 
 	spin_unlock(&ppl_conf->no_mem_stripes_lock);
 	local_irq_restore(flags);
+
+	wake_up(&conf->wait_for_quiescent);
+}
+
+static void ppl_flush_endio(struct bio *bio)
+{
+	struct ppl_io_unit *io = bio->bi_private;
+	struct ppl_log *log = io->log;
+	struct ppl_conf *ppl_conf = log->ppl_conf;
+	struct r5conf *conf = ppl_conf->mddev->private;
+	char b[BDEVNAME_SIZE];
+
+	pr_debug("%s: dev: %s\n", __func__, bio_devname(bio, b));
+
+	if (bio->bi_status) {
+		struct md_rdev *rdev;
+
+		rcu_read_lock();
+		rdev = md_find_rdev_rcu(conf->mddev, bio_dev(bio));
+		if (rdev)
+			md_error(rdev->mddev, rdev);
+		rcu_read_unlock();
+	}
+
+	bio_put(bio);
+
+	if (atomic_dec_and_test(&io->pending_flushes)) {
+		ppl_io_unit_finished(io);
+		md_wakeup_thread(conf->mddev->thread);
+	}
+}
+
+static void ppl_do_flush(struct ppl_io_unit *io)
+{
+	struct ppl_log *log = io->log;
+	struct ppl_conf *ppl_conf = log->ppl_conf;
+	struct r5conf *conf = ppl_conf->mddev->private;
+	int raid_disks = conf->raid_disks;
+	int flushed_disks = 0;
+	int i;
+
+	atomic_set(&io->pending_flushes, raid_disks);
+
+	for_each_set_bit(i, &log->disk_flush_bitmap, raid_disks) {
+		struct md_rdev *rdev;
+		struct block_device *bdev = NULL;
+
+		rcu_read_lock();
+		rdev = rcu_dereference(conf->disks[i].rdev);
+		if (rdev && !test_bit(Faulty, &rdev->flags))
+			bdev = rdev->bdev;
+		rcu_read_unlock();
+
+		if (bdev) {
+			struct bio *bio;
+			char b[BDEVNAME_SIZE];
+
+			bio = bio_alloc_bioset(GFP_NOIO, 0, ppl_conf->flush_bs);
+			bio_set_dev(bio, bdev);
+			bio->bi_private = io;
+			bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
+			bio->bi_end_io = ppl_flush_endio;
+
+			pr_debug("%s: dev: %s\n", __func__,
+				 bio_devname(bio, b));
+
+			submit_bio(bio);
+			flushed_disks++;
+		}
+	}
+
+	log->disk_flush_bitmap = 0;
+
+	for (i = flushed_disks; i < raid_disks; i++) {
+		if (atomic_dec_and_test(&io->pending_flushes))
+			ppl_io_unit_finished(io);
+	}
+}
+
+static inline bool ppl_no_io_unit_submitted(struct r5conf *conf,
+					    struct ppl_log *log)
+{
+	struct ppl_io_unit *io;
+
+	io = list_first_entry_or_null(&log->io_list, struct ppl_io_unit,
+				      log_sibling);
+
+	return !io || !io->submitted;
+}
+
+void ppl_quiesce(struct r5conf *conf, int quiesce)
+{
+	struct ppl_conf *ppl_conf = conf->log_private;
+	int i;
+
+	if (quiesce) {
+		for (i = 0; i < ppl_conf->count; i++) {
+			struct ppl_log *log = &ppl_conf->child_logs[i];
+
+			spin_lock_irq(&log->io_list_lock);
+			wait_event_lock_irq(conf->wait_for_quiescent,
+					    ppl_no_io_unit_submitted(conf, log),
+					    log->io_list_lock);
+			spin_unlock_irq(&log->io_list_lock);
+		}
+	}
 }
 
 void ppl_stripe_write_finished(struct stripe_head *sh)
@@ -574,8 +700,12 @@ void ppl_stripe_write_finished(struct stripe_head *sh)
 	io = sh->ppl_io;
 	sh->ppl_io = NULL;
 
-	if (io && atomic_dec_and_test(&io->pending_stripes))
-		ppl_io_unit_finished(io);
+	if (io && atomic_dec_and_test(&io->pending_stripes)) {
+		if (io->log->disk_flush_bitmap)
+			ppl_do_flush(io);
+		else
+			ppl_io_unit_finished(io);
+	}
 }
 
 static void ppl_xor(int size, struct page *page1, struct page *page2)
@@ -1108,6 +1238,8 @@ static void __ppl_exit_log(struct ppl_conf *ppl_conf)
 
 	if (ppl_conf->bs)
 		bioset_free(ppl_conf->bs);
+	if (ppl_conf->flush_bs)
+		bioset_free(ppl_conf->flush_bs);
 	mempool_destroy(ppl_conf->io_pool);
 	kmem_cache_destroy(ppl_conf->io_kc);
 
@@ -1173,6 +1305,8 @@ static int ppl_validate_rdev(struct md_rdev *rdev)
 
 static void ppl_init_child_log(struct ppl_log *log, struct md_rdev *rdev)
 {
+	struct request_queue *q;
+
 	if ((rdev->ppl.size << 9) >= (PPL_SPACE_SIZE +
 				      PPL_HEADER_SIZE) * 2) {
 		log->use_multippl = true;
@@ -1185,6 +1319,10 @@ static void ppl_init_child_log(struct ppl_log *log, struct md_rdev *rdev)
 			PPL_HEADER_SIZE;
 	}
 	log->next_io_sector = rdev->ppl.sector;
+
+	q = bdev_get_queue(rdev->bdev);
+	if (test_bit(QUEUE_FLAG_WC, &q->queue_flags))
+		log->wb_cache_on = true;
 }
 
 int ppl_init_log(struct r5conf *conf)
@@ -1192,8 +1330,8 @@ int ppl_init_log(struct r5conf *conf)
 	struct ppl_conf *ppl_conf;
 	struct mddev *mddev = conf->mddev;
 	int ret = 0;
+	int max_disks;
 	int i;
-	bool need_cache_flush = false;
 
 	pr_debug("md/raid:%s: enabling distributed Partial Parity Log\n",
 		 mdname(conf->mddev));
@@ -1219,6 +1357,14 @@ int ppl_init_log(struct r5conf *conf)
 		return -EINVAL;
 	}
 
+	max_disks = FIELD_SIZEOF(struct ppl_log, disk_flush_bitmap) *
+		BITS_PER_BYTE;
+	if (conf->raid_disks > max_disks) {
+		pr_warn("md/raid:%s PPL doesn't support over %d disks in the array\n",
+			mdname(mddev), max_disks);
+		return -EINVAL;
+	}
+
 	ppl_conf = kzalloc(sizeof(struct ppl_conf), GFP_KERNEL);
 	if (!ppl_conf)
 		return -ENOMEM;
@@ -1244,6 +1390,12 @@ int ppl_init_log(struct r5conf *conf)
 		goto err;
 	}
 
+	ppl_conf->flush_bs = bioset_create(conf->raid_disks, 0, 0);
+	if (!ppl_conf->flush_bs) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
 	ppl_conf->count = conf->raid_disks;
 	ppl_conf->child_logs = kcalloc(ppl_conf->count, sizeof(struct ppl_log),
 				       GFP_KERNEL);
@@ -1275,23 +1427,14 @@ int ppl_init_log(struct r5conf *conf)
 		log->rdev = rdev;
 
 		if (rdev) {
-			struct request_queue *q;
-
 			ret = ppl_validate_rdev(rdev);
 			if (ret)
 				goto err;
 
-			q = bdev_get_queue(rdev->bdev);
-			if (test_bit(QUEUE_FLAG_WC, &q->queue_flags))
-				need_cache_flush = true;
-
 			ppl_init_child_log(log, rdev);
 		}
 	}
 
-	if (need_cache_flush)
-		pr_warn("md/raid:%s: Volatile write-back cache should be disabled on all member drives when using PPL!\n",
-			mdname(mddev));
-
 	/* load and possibly recover the logs from the member disks */
 	ret = ppl_load(ppl_conf);
 
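The completion accounting in ppl_do_flush()/ppl_flush_endio() above is worth
spelling out: pending_flushes starts at raid_disks, the references held for
disks that needed no flush are dropped inline, and each flush bio completion
drops one more; whoever brings the count to zero finishes the io_unit. A
standalone sketch of the same counting scheme, with C11 stdatomic standing in
for the kernel's atomic_t and invented names throughout:

/* Sketch of the flush fan-out refcount. cc -std=c11 fanout.c */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define RAID_DISKS 4

static atomic_int pending_flushes;

static void io_unit_finished(void)
{
        printf("io_unit finished: all flushes accounted for\n");
}

/* models ppl_flush_endio(): one call per completed flush bio */
static void flush_endio(void)
{
        /* atomic_fetch_sub returns the previous value; 1 means we hit 0 */
        if (atomic_fetch_sub(&pending_flushes, 1) == 1)
                io_unit_finished();
}

int main(void)
{
        bool needs_flush[RAID_DISKS] = { true, false, true, false };
        int flushed_disks = 0;

        atomic_store(&pending_flushes, RAID_DISKS);

        /* "submit" a flush to every disk marked in the bitmap */
        for (int i = 0; i < RAID_DISKS; i++)
                if (needs_flush[i]) {
                        printf("flush submitted to disk %d\n", i);
                        flushed_disks++;
                }

        /* drop the references held for disks that needed no flush */
        for (int i = flushed_disks; i < RAID_DISKS; i++)
                flush_endio();

        /* completions for the submitted flushes arrive later */
        for (int i = 0; i < flushed_disks; i++)
                flush_endio();
        return 0;
}
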
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 5a2a29bd02dd..50d01144b805 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -5563,7 +5563,7 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
 	bool do_flush = false;
 
 	if (unlikely(bi->bi_opf & REQ_PREFLUSH)) {
-		int ret = r5l_handle_flush_request(conf->log, bi);
+		int ret = log_handle_flush_request(conf, bi);
 
 		if (ret == 0)
 			return true;
@@ -6168,7 +6168,7 @@ static int handle_active_stripes(struct r5conf *conf, int group,
 			break;
 	if (i == NR_STRIPE_HASH_LOCKS) {
 		spin_unlock_irq(&conf->device_lock);
-		r5l_flush_stripe_to_raid(conf->log);
+		log_flush_stripe_to_raid(conf);
 		spin_lock_irq(&conf->device_lock);
 		return batch_size;
 	}
@@ -8060,7 +8060,7 @@ static void raid5_quiesce(struct mddev *mddev, int quiesce)
 		wake_up(&conf->wait_for_overlap);
 		unlock_all_device_hash_locks_irq(conf);
 	}
-	r5l_quiesce(conf->log, quiesce);
+	log_quiesce(conf, quiesce);
 }
 
 static void *raid45_takeover_raid0(struct mddev *mddev, int level)
-- 
cgit v1.2.3