From 33182d15c6bf182f7ae32a66ea4a547d979cd6d7 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 8 Aug 2017 16:56:36 +1000 Subject: md: always clear ->safemode when md_check_recovery gets the mddev lock. If ->safemode == 1, md_check_recovery() will try to get the mddev lock and perform various other checks. If mddev->in_sync is zero, it will call set_in_sync, and clear ->safemode. However if mddev->in_sync is not zero, ->safemode will not be cleared. When md_check_recovery() drops the mddev lock, the thread is woken up again. Normally it would just check if there was anything else to do, find nothing, and go to sleep. However as ->safemode was not cleared, it will take the mddev lock again, then wake itself up when unlocking. This results in an infinite loop, repeatedly calling md_check_recovery(), which RCU or the soft-lockup detector will eventually complain about. Prior to commit 4ad23a976413 ("MD: use per-cpu counter for writes_pending"), safemode would only be set to one when the writes_pending counter reached zero, and would be cleared again when writes_pending is incremented. Since that patch, safemode is set more freely, but is not reliably cleared. So in md_check_recovery() clear ->safemode before checking ->in_sync. Fixes: 4ad23a976413 ("MD: use per-cpu counter for writes_pending") Cc: stable@vger.kernel.org (4.12+) Reported-by: Dominik Brodowski Reported-by: David R Signed-off-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/md.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'drivers/md') diff --git a/drivers/md/md.c b/drivers/md/md.c index c99634612fc4..d84aceede1cb 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -8656,6 +8656,9 @@ void md_check_recovery(struct mddev *mddev) if (mddev_trylock(mddev)) { int spares = 0; + if (mddev->safemode == 1) + mddev->safemode = 0; + if (mddev->ro) { struct md_rdev *rdev; if (!mddev->external && mddev->in_sync) -- cgit v1.2.3 From 81fe48e9aa00bdd509bd3c37a76d1132da6b9f09 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 8 Aug 2017 16:56:36 +1000 Subject: md: fix test in md_write_start() md_write_start() needs to clear the in_sync flag is it is set, or if there might be a race with set_in_sync() such that the later will set it very soon. In the later case it is sufficient to take the spinlock to synchronize with set_in_sync(), and then set the flag if needed. The current test is incorrect. It should be: if "flag is set" or "race is possible" "flag is set" is trivially "mddev->in_sync". "race is possible" should be tested by "mddev->sync_checkers". If sync_checkers is 0, then there can be no race. set_in_sync() will wait in percpu_ref_switch_to_atomic_sync() for an RCU grace period, and as md_write_start() holds the rcu_read_lock(), set_in_sync() will be sure ot see the update to writes_pending. If sync_checkers is > 0, there could be race. If md_write_start() happened entirely between if (!mddev->in_sync && percpu_ref_is_zero(&mddev->writes_pending)) { and mddev->in_sync = 1; in set_in_sync(), then it would not see that is_sync had been set, and set_in_sync() would not see that writes_pending had been incremented. This bug means that in_sync is sometimes not set when it should be. Consequently there is a small chance that the array will be marked as "clean" when in fact it is inconsistent. Fixes: 4ad23a976413 ("MD: use per-cpu counter for writes_pending") cc: stable@vger.kernel.org (v4.12+) Signed-off-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/md.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/md.c b/drivers/md/md.c index d84aceede1cb..e4ba95f6cd59 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -7996,7 +7996,7 @@ bool md_write_start(struct mddev *mddev, struct bio *bi) if (mddev->safemode == 1) mddev->safemode = 0; /* sync_checkers is always 0 when writes_pending is in per-cpu mode */ - if (mddev->in_sync || !mddev->sync_checkers) { + if (mddev->in_sync || mddev->sync_checkers) { spin_lock(&mddev->lock); if (mddev->in_sync) { mddev->in_sync = 0; -- cgit v1.2.3 From b44886c54a999771060371c3a05d5fedfc7e2102 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 31 Jul 2017 14:52:26 -0700 Subject: md/r5cache: call mddev_lock/unlock() in r5c_journal_mode_set In r5c_journal_mode_set(), it is necessary to call mddev_lock() before accessing conf and conf->log. Otherwise, the conf->log may change (and become NULL). Shaohua: fix unlock in failure cases Signed-off-by: Song Liu Signed-off-by: Shaohua Li --- drivers/md/raid5-cache.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index bfa1e907c472..ce98414a6e34 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -2540,23 +2540,32 @@ static ssize_t r5c_journal_mode_show(struct mddev *mddev, char *page) */ int r5c_journal_mode_set(struct mddev *mddev, int mode) { - struct r5conf *conf = mddev->private; - struct r5l_log *log = conf->log; - - if (!log) - return -ENODEV; + struct r5conf *conf; + int err; if (mode < R5C_JOURNAL_MODE_WRITE_THROUGH || mode > R5C_JOURNAL_MODE_WRITE_BACK) return -EINVAL; + err = mddev_lock(mddev); + if (err) + return err; + conf = mddev->private; + if (!conf || !conf->log) { + mddev_unlock(mddev); + return -ENODEV; + } + if (raid5_calc_degraded(conf) > 0 && - mode == R5C_JOURNAL_MODE_WRITE_BACK) + mode == R5C_JOURNAL_MODE_WRITE_BACK) { + mddev_unlock(mddev); return -EINVAL; + } mddev_suspend(mddev); conf->log->r5c_journal_mode = mode; mddev_resume(mddev); + mddev_unlock(mddev); pr_debug("md/raid:%s: setting r5c cache mode to %d: %s\n", mdname(mddev), mode, r5c_journal_mode_str[mode]); -- cgit v1.2.3 From a9501d742127e613d744e29814e9532bacb147e8 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Thu, 3 Aug 2017 10:03:17 -0700 Subject: md/r5cache: fix io_unit handling in r5l_log_endio() In r5l_log_endio(), once log->io_list_lock is released, the io unit may be accessed (or even freed) by other threads. Current code doesn't handle the io_unit properly, which leads to potential race conditions. This patch solves this race condition by: 1. Add a pending_stripe count flush_payload. Multiple flush_payloads are counted as only one pending_stripe. Flag has_flush_payload is added to show whether the io unit has flush_payload; 2. In r5l_log_endio(), check flags has_null_flush and has_flush_payload with log->io_list_lock held. After the lock is released, this IO unit is only accessed when we know the pending_stripe counter cannot be zeroed by other threads. Signed-off-by: Song Liu Signed-off-by: Shaohua Li --- drivers/md/raid5-cache.c | 40 +++++++++++++++++++++++++++++++--------- 1 file changed, 31 insertions(+), 9 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index ce98414a6e34..2dcbafa8e66c 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -236,9 +236,10 @@ struct r5l_io_unit { bool need_split_bio; struct bio *split_bio; - unsigned int has_flush:1; /* include flush request */ - unsigned int has_fua:1; /* include fua request */ - unsigned int has_null_flush:1; /* include empty flush request */ + unsigned int has_flush:1; /* include flush request */ + unsigned int has_fua:1; /* include fua request */ + unsigned int has_null_flush:1; /* include null flush request */ + unsigned int has_flush_payload:1; /* include flush payload */ /* * io isn't sent yet, flush/fua request can only be submitted till it's * the first IO in running_ios list @@ -571,6 +572,8 @@ static void r5l_log_endio(struct bio *bio) struct r5l_io_unit *io_deferred; struct r5l_log *log = io->log; unsigned long flags; + bool has_null_flush; + bool has_flush_payload; if (bio->bi_status) md_error(log->rdev->mddev, log->rdev); @@ -580,6 +583,16 @@ static void r5l_log_endio(struct bio *bio) spin_lock_irqsave(&log->io_list_lock, flags); __r5l_set_io_unit_state(io, IO_UNIT_IO_END); + + /* + * if the io doesn't not have null_flush or flush payload, + * it is not safe to access it after releasing io_list_lock. + * Therefore, it is necessary to check the condition with + * the lock held. + */ + has_null_flush = io->has_null_flush; + has_flush_payload = io->has_flush_payload; + if (log->need_cache_flush && !list_empty(&io->stripe_list)) r5l_move_to_end_ios(log); else @@ -600,19 +613,23 @@ static void r5l_log_endio(struct bio *bio) if (log->need_cache_flush) md_wakeup_thread(log->rdev->mddev->thread); - if (io->has_null_flush) { + /* finish flush only io_unit and PAYLOAD_FLUSH only io_unit */ + if (has_null_flush) { struct bio *bi; WARN_ON(bio_list_empty(&io->flush_barriers)); while ((bi = bio_list_pop(&io->flush_barriers)) != NULL) { bio_endio(bi); - atomic_dec(&io->pending_stripe); + if (atomic_dec_and_test(&io->pending_stripe)) { + __r5l_stripe_write_finished(io); + return; + } } } - - /* finish flush only io_unit and PAYLOAD_FLUSH only io_unit */ - if (atomic_read(&io->pending_stripe) == 0) - __r5l_stripe_write_finished(io); + /* decrease pending_stripe for flush payload */ + if (has_flush_payload) + if (atomic_dec_and_test(&io->pending_stripe)) + __r5l_stripe_write_finished(io); } static void r5l_do_submit_io(struct r5l_log *log, struct r5l_io_unit *io) @@ -881,6 +898,11 @@ static void r5l_append_flush_payload(struct r5l_log *log, sector_t sect) payload->size = cpu_to_le32(sizeof(__le64)); payload->flush_stripes[0] = cpu_to_le64(sect); io->meta_offset += meta_size; + /* multiple flush payloads count as one pending_stripe */ + if (!io->has_flush_payload) { + io->has_flush_payload = 1; + atomic_inc(&io->pending_stripe); + } mutex_unlock(&log->io_mutex); } -- cgit v1.2.3 From afc1f55ca44e257f69da8f43e0714a76686ae8d1 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Fri, 11 Aug 2017 20:34:45 -0700 Subject: MD: not clear ->safemode for external metadata array ->safemode should be triggered by mdadm for external metadaa array, otherwise array's state confuses mdadm. Fixes: 33182d15c6bf(md: always clear ->safemode when md_check_recovery gets the mddev lock.) Cc: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/md.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/md.c b/drivers/md/md.c index e4ba95f6cd59..b01e458d31e9 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -8656,7 +8656,7 @@ void md_check_recovery(struct mddev *mddev) if (mddev_trylock(mddev)) { int spares = 0; - if (mddev->safemode == 1) + if (!mddev->external && mddev->safemode == 1) mddev->safemode = 0; if (mddev->ro) { -- cgit v1.2.3