From bb8bf15bd6f7c3432ce9ad631f2f59c49c1e1853 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Thu, 2 Jun 2016 23:32:04 -0400 Subject: md-cluster: fix deadlock issue when add disk to an recoverying array Add a disk to an array which is performing recovery is a little complicated, we need to do both reap the sync thread and perform add disk for the case, then it caused deadlock as follows. linux44:~ # ps aux|grep md|grep D root 1822 0.0 0.0 0 0 ? D 16:50 0:00 [md127_resync] root 1848 0.0 0.0 19860 952 pts/0 D+ 16:50 0:00 mdadm --manage /dev/md127 --re-add /dev/vdb linux44:~ # cat /proc/1848/stack [] kthread_stop+0x6e/0x120 [] md_unregister_thread+0x40/0x80 [md_mod] [] md_reap_sync_thread+0x15/0x150 [md_mod] [] action_store+0x260/0x270 [md_mod] [] md_attr_store+0xb4/0x100 [md_mod] [] sysfs_write_file+0xbe/0x140 [] vfs_write+0xb8/0x1e0 [] SyS_write+0x48/0xa0 [] system_call_fastpath+0x16/0x1b [<00007f068ea1ed30>] 0x7f068ea1ed30 linux44:~ # cat /proc/1822/stack [] md_do_sync+0x846/0xf40 [md_mod] [] md_thread+0x16d/0x180 [md_mod] [] kthread+0xb4/0xc0 [] ret_from_fork+0x58/0x90 Task1848 Task1822 md_attr_store (held reconfig_mutex by call mddev_lock()) action_store md_reap_sync_thread md_unregister_thread kthread_stop md_wakeup_thread(mddev->thread); wait_event(mddev->sb_wait, !test_bit(MD_CHANGE_PENDING)) md_check_recovery is triggered by wakeup mddev->thread, but it can't clear MD_CHANGE_PENDING flag since it can't get lock which was held by md_attr_store already. To solve the deadlock problem, we move "->resync_finish()" from md_do_sync to md_reap_sync_thread (after md_update_sb), also MD_HELD_RESYNC_LOCK is introduced since it is possible that node can't get resync lock in md_do_sync. Then we do not need to wait for MD_CHANGE_PENDING is cleared or not since metadata should be updated after md_update_sb, so just call resync_finish if MD_HELD_RESYNC_LOCK is set. We also unified the code after skip label, since set PENDING for non-clustered case should be harmless. Reviewed-by: NeilBrown Signed-off-by: Guoqing Jiang Signed-off-by: Shaohua Li --- drivers/md/md.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'drivers/md/md.h') diff --git a/drivers/md/md.h b/drivers/md/md.h index b5c4be73e6e4..03b19aad4921 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -204,6 +204,9 @@ struct mddev { #define MD_RELOAD_SB 7 /* Reload the superblock because another node * updated it. */ +#define MD_CLUSTER_RESYNC_LOCKED 8 /* cluster raid only, which means node + * already took resync lock, need to + * release the lock */ int suspended; atomic_t active_io; -- cgit v1.2.3 From d787be4092e27728cb4c012bee9762098ef3c662 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 2 Jun 2016 16:19:53 +1000 Subject: md: reduce the number of synchronize_rcu() calls when multiple devices fail. Every time a device is removed with ->hot_remove_disk() a synchronize_rcu() call is made which can delay several milliseconds in some case. If lots of devices fail at once - as could happen with a large RAID10 where one set of devices are removed all at once - these delays can add up to be very inconcenient. As failure is not reversible we can check for that first, setting a separate flag if it is found, and then all synchronize_rcu() once for all the flagged devices. Then ->hot_remove_disk() function can skip the synchronize_rcu() step if the flag is set. fix build error(Shaohua) Signed-off-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/md.c | 29 ++++++++++++++++++++++++++--- drivers/md/md.h | 5 +++++ drivers/md/multipath.c | 14 ++++++++------ drivers/md/raid1.c | 17 ++++++++++------- drivers/md/raid10.c | 19 +++++++++++-------- drivers/md/raid5.c | 15 +++++++++------ 6 files changed, 69 insertions(+), 30 deletions(-) (limited to 'drivers/md/md.h') diff --git a/drivers/md/md.c b/drivers/md/md.c index 0793754eeffd..2ed547f5c3b6 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -8180,15 +8180,34 @@ static int remove_and_add_spares(struct mddev *mddev, struct md_rdev *rdev; int spares = 0; int removed = 0; + bool remove_some = false; - rdev_for_each(rdev, mddev) + rdev_for_each(rdev, mddev) { + if ((this == NULL || rdev == this) && + rdev->raid_disk >= 0 && + !test_bit(Blocked, &rdev->flags) && + test_bit(Faulty, &rdev->flags) && + atomic_read(&rdev->nr_pending)==0) { + /* Faulty non-Blocked devices with nr_pending == 0 + * never get nr_pending incremented, + * never get Faulty cleared, and never get Blocked set. + * So we can synchronize_rcu now rather than once per device + */ + remove_some = true; + set_bit(RemoveSynchronized, &rdev->flags); + } + } + + if (remove_some) + synchronize_rcu(); + rdev_for_each(rdev, mddev) { if ((this == NULL || rdev == this) && rdev->raid_disk >= 0 && !test_bit(Blocked, &rdev->flags) && - (test_bit(Faulty, &rdev->flags) || + ((test_bit(RemoveSynchronized, &rdev->flags) || (!test_bit(In_sync, &rdev->flags) && !test_bit(Journal, &rdev->flags))) && - atomic_read(&rdev->nr_pending)==0) { + atomic_read(&rdev->nr_pending)==0)) { if (mddev->pers->hot_remove_disk( mddev, rdev) == 0) { sysfs_unlink_rdev(mddev, rdev); @@ -8196,6 +8215,10 @@ static int remove_and_add_spares(struct mddev *mddev, removed++; } } + if (remove_some && test_bit(RemoveSynchronized, &rdev->flags)) + clear_bit(RemoveSynchronized, &rdev->flags); + } + if (removed && mddev->kobj.sd) sysfs_notify(&mddev->kobj, NULL, "degraded"); diff --git a/drivers/md/md.h b/drivers/md/md.h index 03b19aad4921..dc65ca65b26e 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -163,6 +163,11 @@ enum flag_bits { * than other devices in the array */ ClusterRemove, + RemoveSynchronized, /* synchronize_rcu() was called after + * this device was known to be faulty, + * so it is safe to remove without + * another synchronize_rcu() call. + */ }; static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors, diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index 7eb9972a37e6..c145a5a114eb 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -298,12 +298,14 @@ static int multipath_remove_disk(struct mddev *mddev, struct md_rdev *rdev) goto abort; } p->rdev = NULL; - synchronize_rcu(); - if (atomic_read(&rdev->nr_pending)) { - /* lost the race, try later */ - err = -EBUSY; - p->rdev = rdev; - goto abort; + if (!test_bit(RemoveSynchronized, &rdev->flags)) { + synchronize_rcu(); + if (atomic_read(&rdev->nr_pending)) { + /* lost the race, try later */ + err = -EBUSY; + p->rdev = rdev; + goto abort; + } } err = md_integrity_register(mddev); } diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 34f20c03d1f6..5027ef4752ac 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1656,13 +1656,16 @@ static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev) goto abort; } p->rdev = NULL; - synchronize_rcu(); - if (atomic_read(&rdev->nr_pending)) { - /* lost the race, try later */ - err = -EBUSY; - p->rdev = rdev; - goto abort; - } else if (conf->mirrors[conf->raid_disks + number].rdev) { + if (!test_bit(RemoveSynchronized, &rdev->flags)) { + synchronize_rcu(); + if (atomic_read(&rdev->nr_pending)) { + /* lost the race, try later */ + err = -EBUSY; + p->rdev = rdev; + goto abort; + } + } + if (conf->mirrors[conf->raid_disks + number].rdev) { /* We just removed a device that is being replaced. * Move down the replacement. We drain all IO before * doing this to avoid confusion. diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 34facda18e72..8ee5d96e6a2d 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1766,7 +1766,7 @@ static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev) err = -EBUSY; goto abort; } - /* Only remove faulty devices if recovery + /* Only remove non-faulty devices if recovery * is not possible. */ if (!test_bit(Faulty, &rdev->flags) && @@ -1778,13 +1778,16 @@ static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev) goto abort; } *rdevp = NULL; - synchronize_rcu(); - if (atomic_read(&rdev->nr_pending)) { - /* lost the race, try later */ - err = -EBUSY; - *rdevp = rdev; - goto abort; - } else if (p->replacement) { + if (!test_bit(RemoveSynchronized, &rdev->flags)) { + synchronize_rcu(); + if (atomic_read(&rdev->nr_pending)) { + /* lost the race, try later */ + err = -EBUSY; + *rdevp = rdev; + goto abort; + } + } + if (p->replacement) { /* We must have just cleared 'rdev' */ p->rdev = p->replacement; clear_bit(Replacement, &p->replacement->flags); diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index f6a191aaaa91..413cc7d847da 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -7197,12 +7197,15 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev) goto abort; } *rdevp = NULL; - synchronize_rcu(); - if (atomic_read(&rdev->nr_pending)) { - /* lost the race, try later */ - err = -EBUSY; - *rdevp = rdev; - } else if (p->replacement) { + if (!test_bit(RemoveSynchronized, &rdev->flags)) { + synchronize_rcu(); + if (atomic_read(&rdev->nr_pending)) { + /* lost the race, try later */ + err = -EBUSY; + *rdevp = rdev; + } + } + if (p->replacement) { /* We must have just cleared 'rdev' */ p->rdev = p->replacement; clear_bit(Replacement, &p->replacement->flags); -- cgit v1.2.3 From 0e3ef49eda5bae3aa75aa8c0276411bf0f27e03a Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 17 Jun 2016 17:33:10 +0200 Subject: md: use seconds granularity for error logging The md code stores the exact time of the last error in the last_read_error variable using a timespec structure. It only ever uses the seconds portion of that though, so we can use a scalar for it. There won't be an overflow in 2038 here, because it already used monotonic time and 32-bit is enough for that, but I've decided to use time64_t for consistency in the conversion. Signed-off-by: Arnd Bergmann Signed-off-by: Shaohua Li --- drivers/md/md.c | 3 +-- drivers/md/md.h | 2 +- drivers/md/raid10.c | 11 +++++------ 3 files changed, 7 insertions(+), 9 deletions(-) (limited to 'drivers/md/md.h') diff --git a/drivers/md/md.c b/drivers/md/md.c index 2ed547f5c3b6..4b8a264e9777 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -3173,8 +3173,7 @@ int md_rdev_init(struct md_rdev *rdev) rdev->data_offset = 0; rdev->new_data_offset = 0; rdev->sb_events = 0; - rdev->last_read_error.tv_sec = 0; - rdev->last_read_error.tv_nsec = 0; + rdev->last_read_error = 0; rdev->sb_loaded = 0; rdev->bb_page = NULL; atomic_set(&rdev->nr_pending, 0); diff --git a/drivers/md/md.h b/drivers/md/md.h index dc65ca65b26e..fd56cfd8c368 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -99,7 +99,7 @@ struct md_rdev { atomic_t read_errors; /* number of consecutive read errors that * we have tried to ignore. */ - struct timespec last_read_error; /* monotonic time since our + time64_t last_read_error; /* monotonic time since our * last read error */ atomic_t corrected_errors; /* number of corrected read errors, diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 8ee5d96e6a2d..f7f3c8a63419 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -2174,21 +2174,20 @@ static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio) */ static void check_decay_read_errors(struct mddev *mddev, struct md_rdev *rdev) { - struct timespec cur_time_mon; + long cur_time_mon; unsigned long hours_since_last; unsigned int read_errors = atomic_read(&rdev->read_errors); - ktime_get_ts(&cur_time_mon); + cur_time_mon = ktime_get_seconds(); - if (rdev->last_read_error.tv_sec == 0 && - rdev->last_read_error.tv_nsec == 0) { + if (rdev->last_read_error == 0) { /* first time we've seen a read error */ rdev->last_read_error = cur_time_mon; return; } - hours_since_last = (cur_time_mon.tv_sec - - rdev->last_read_error.tv_sec) / 3600; + hours_since_last = (long)(cur_time_mon - + rdev->last_read_error) / 3600; rdev->last_read_error = cur_time_mon; -- cgit v1.2.3