From e566aef12a166732b7fd85897f8736ccf4fc7814 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Fri, 12 Aug 2016 13:42:34 +0800 Subject: md-cluster: call md_kick_rdev_from_array once ack failed The new_disk_ack could return failure if WAITING_FOR_NEWDISK is not set, so we need to kick the dev from array in case failure happened. And we missed to check err before call new_disk_ack othwise we could kick a rdev which isn't in array, thanks for the reminder from Shaohua. Reviewed-by: NeilBrown Signed-off-by: Guoqing Jiang Signed-off-by: Shaohua Li --- drivers/md/md.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/md.c b/drivers/md/md.c index 915e84d631a2..7eaf5496c8d9 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -6101,9 +6101,14 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info) export_rdev(rdev); if (mddev_is_clustered(mddev)) { - if (info->state & (1 << MD_DISK_CANDIDATE)) - md_cluster_ops->new_disk_ack(mddev, (err == 0)); - else { + if (info->state & (1 << MD_DISK_CANDIDATE)) { + if (!err) { + err = md_cluster_ops->new_disk_ack(mddev, + err == 0); + if (err) + md_kick_rdev_from_array(rdev); + } + } else { if (err) md_cluster_ops->add_new_disk_cancel(mddev); else -- cgit v1.2.3 From 400cb454a4205ec1d7311bc3dd8104859c26ba46 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Fri, 12 Aug 2016 13:42:35 +0800 Subject: md-cluster: use FORCEUNLOCK in lockres_free For dlm_unlock, we need to pass flag to dlm_unlock as the third parameter instead of set res->flags. Also, DLM_LKF_FORCEUNLOCK is more suitable for dlm_unlock since it works even the lock is on waiting or convert queue. Acked-by: NeilBrown Signed-off-by: Guoqing Jiang Signed-off-by: Shaohua Li --- drivers/md/md-cluster.c | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index 34a840d9df76..ccd756fdc00e 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -194,25 +194,21 @@ out_err: static void lockres_free(struct dlm_lock_resource *res) { - int ret; + int ret = 0; if (!res) return; - /* cancel a lock request or a conversion request that is blocked */ - res->flags |= DLM_LKF_CANCEL; -retry: - ret = dlm_unlock(res->ls, res->lksb.sb_lkid, 0, &res->lksb, res); - if (unlikely(ret != 0)) { - pr_info("%s: failed to unlock %s return %d\n", __func__, res->name, ret); - - /* if a lock conversion is cancelled, then the lock is put - * back to grant queue, need to ensure it is unlocked */ - if (ret == -DLM_ECANCEL) - goto retry; - } - res->flags &= ~DLM_LKF_CANCEL; - wait_for_completion(&res->completion); + /* + * use FORCEUNLOCK flag, so we can unlock even the lock is on the + * waiting or convert queue + */ + ret = dlm_unlock(res->ls, res->lksb.sb_lkid, DLM_LKF_FORCEUNLOCK, + &res->lksb, res); + if (unlikely(ret != 0)) + pr_err("failed to unlock %s return %d\n", res->name, ret); + else + wait_for_completion(&res->completion); kfree(res->name); kfree(res->lksb.sb_lvbptr); -- cgit v1.2.3 From e3f924d3dfc672dd1292bc7eb6f2a305c13981ec Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Fri, 12 Aug 2016 13:42:36 +0800 Subject: md-cluster: remove some unnecessary dlm_unlock_sync Since DLM_LKF_FORCEUNLOCK is used in lockres_free, we don't need to call dlm_unlock_sync before free lock resource. Reviewed-by: NeilBrown Signed-off-by: Guoqing Jiang Signed-off-by: Shaohua Li --- drivers/md/md-cluster.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index ccd756fdc00e..67a735840639 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -284,7 +284,7 @@ static void recover_bitmaps(struct md_thread *thread) ret = bitmap_copy_from_slot(mddev, slot, &lo, &hi, true); if (ret) { pr_err("md-cluster: Could not copy data from bitmap %d\n", slot); - goto dlm_unlock; + goto clear_bit; } if (hi > 0) { if (lo < mddev->recovery_cp) @@ -296,8 +296,6 @@ static void recover_bitmaps(struct md_thread *thread) md_wakeup_thread(mddev->thread); } } -dlm_unlock: - dlm_unlock_sync(bm_lockres); clear_bit: lockres_free(bm_lockres); clear_bit(slot, &cinfo->recovery_map); @@ -766,7 +764,6 @@ static int gather_all_resync_info(struct mddev *mddev, int total_slots) md_check_recovery(mddev); } - dlm_unlock_sync(bm_lockres); lockres_free(bm_lockres); } out: @@ -1182,7 +1179,6 @@ static void unlock_all_bitmaps(struct mddev *mddev) if (cinfo->other_bitmap_lockres) { for (i = 0; i < mddev->bitmap_info.nodes - 1; i++) { if (cinfo->other_bitmap_lockres[i]) { - dlm_unlock_sync(cinfo->other_bitmap_lockres[i]); lockres_free(cinfo->other_bitmap_lockres[i]); } } -- cgit v1.2.3 From af8d8e6f031589ccf32b08eea91def53db8cfa95 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Fri, 12 Aug 2016 13:42:37 +0800 Subject: md: changes for MD_STILL_CLOSED flag When stop clustered raid while it is pending on resync, MD_STILL_CLOSED flag could be cleared since udev rule is triggered to open the mddev. So obviously array can't be stopped soon and returns EBUSY. mdadm -Ss md-raid-arrays.rules set MD_STILL_CLOSED md_open() ... ... ... clear MD_STILL_CLOSED do_md_stop We make below changes to resolve this issue: 1. rename MD_STILL_CLOSED to MD_CLOSING since it is set when stop array and it means we are stopping array. 2. let md_open returns early if CLOSING is set, so no other threads will open array if one thread is trying to close it. 3. no need to clear CLOSING bit in md_open because 1 has ensure the bit is cleared, then we also don't need to test CLOSING bit in do_md_stop. Reviewed-by: NeilBrown Signed-off-by: Guoqing Jiang Signed-off-by: Shaohua Li --- drivers/md/md.c | 14 ++++++++------ drivers/md/md.h | 5 ++--- 2 files changed, 10 insertions(+), 9 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/md.c b/drivers/md/md.c index 7eaf5496c8d9..b6ad04b58766 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -5573,8 +5573,7 @@ static int md_set_readonly(struct mddev *mddev, struct block_device *bdev) mutex_lock(&mddev->open_mutex); if ((mddev->pers && atomic_read(&mddev->openers) > !!bdev) || mddev->sync_thread || - test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || - (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags))) { + test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { printk("md: %s still in use.\n",mdname(mddev)); if (did_freeze) { clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); @@ -5636,8 +5635,7 @@ static int do_md_stop(struct mddev *mddev, int mode, if ((mddev->pers && atomic_read(&mddev->openers) > !!bdev) || mddev->sysfs_active || mddev->sync_thread || - test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || - (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags))) { + test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { printk("md: %s still in use.\n",mdname(mddev)); mutex_unlock(&mddev->open_mutex); if (did_freeze) { @@ -6826,7 +6824,7 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, err = -EBUSY; goto out; } - set_bit(MD_STILL_CLOSED, &mddev->flags); + set_bit(MD_CLOSING, &mddev->flags); mutex_unlock(&mddev->open_mutex); sync_blockdev(bdev); } @@ -7075,9 +7073,13 @@ static int md_open(struct block_device *bdev, fmode_t mode) if ((err = mutex_lock_interruptible(&mddev->open_mutex))) goto out; + if (test_bit(MD_CLOSING, &mddev->flags)) { + mutex_unlock(&mddev->open_mutex); + return -ENODEV; + } + err = 0; atomic_inc(&mddev->openers); - clear_bit(MD_STILL_CLOSED, &mddev->flags); mutex_unlock(&mddev->open_mutex); check_disk_change(bdev); diff --git a/drivers/md/md.h b/drivers/md/md.h index 20c667579ede..2b2041773e79 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -201,9 +201,8 @@ struct mddev { #define MD_CHANGE_PENDING 2 /* switch from 'clean' to 'active' in progress */ #define MD_UPDATE_SB_FLAGS (1 | 2 | 4) /* If these are set, md_update_sb needed */ #define MD_ARRAY_FIRST_USE 3 /* First use of array, needs initialization */ -#define MD_STILL_CLOSED 4 /* If set, then array has not been opened since - * md_ioctl checked on it. - */ +#define MD_CLOSING 4 /* If set, we are closing the array, do not open + * it then */ #define MD_JOURNAL_CLEAN 5 /* A raid with journal is already clean */ #define MD_HAS_JOURNAL 6 /* The raid array has journal feature set */ #define MD_RELOAD_SB 7 /* Reload the superblock because another node -- cgit v1.2.3 From c20c33f0e2abdb8bab1ec755ed668d7894bf9336 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Fri, 12 Aug 2016 13:42:38 +0800 Subject: md-cluster: clean related infos of cluster cluster_info and bitmap_info.nodes also need to be cleared when array is stopped. Reviewed-by: NeilBrown Signed-off-by: Guoqing Jiang Signed-off-by: Shaohua Li --- drivers/md/md.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'drivers/md') diff --git a/drivers/md/md.c b/drivers/md/md.c index b6ad04b58766..cd6797b3cdf7 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -5454,12 +5454,14 @@ static void md_clean(struct mddev *mddev) mddev->degraded = 0; mddev->safemode = 0; mddev->private = NULL; + mddev->cluster_info = NULL; mddev->bitmap_info.offset = 0; mddev->bitmap_info.default_offset = 0; mddev->bitmap_info.default_space = 0; mddev->bitmap_info.chunksize = 0; mddev->bitmap_info.daemon_sleep = 0; mddev->bitmap_info.max_write_behind = 0; + mddev->bitmap_info.nodes = 0; } static void __md_stop_writes(struct mddev *mddev) -- cgit v1.2.3 From 5f0aa21da6cc620b08e5f69f51db29cb1f722174 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Fri, 12 Aug 2016 13:42:39 +0800 Subject: md-cluster: protect md_find_rdev_nr_rcu with rcu lock We need to use rcu_read_lock/unlock to avoid potential race. Reported-by: Shaohua Li Reviewed-by: NeilBrown Signed-off-by: Guoqing Jiang Signed-off-by: Shaohua Li --- drivers/md/md-cluster.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index 67a735840639..b4dc211923c7 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -489,9 +489,10 @@ static void process_metadata_update(struct mddev *mddev, struct cluster_msg *msg static void process_remove_disk(struct mddev *mddev, struct cluster_msg *msg) { - struct md_rdev *rdev = md_find_rdev_nr_rcu(mddev, - le32_to_cpu(msg->raid_slot)); + struct md_rdev *rdev; + rcu_read_lock(); + rdev = md_find_rdev_nr_rcu(mddev, le32_to_cpu(msg->raid_slot)); if (rdev) { set_bit(ClusterRemove, &rdev->flags); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); @@ -500,18 +501,21 @@ static void process_remove_disk(struct mddev *mddev, struct cluster_msg *msg) else pr_warn("%s: %d Could not find disk(%d) to REMOVE\n", __func__, __LINE__, le32_to_cpu(msg->raid_slot)); + rcu_read_unlock(); } static void process_readd_disk(struct mddev *mddev, struct cluster_msg *msg) { - struct md_rdev *rdev = md_find_rdev_nr_rcu(mddev, - le32_to_cpu(msg->raid_slot)); + struct md_rdev *rdev; + rcu_read_lock(); + rdev = md_find_rdev_nr_rcu(mddev, le32_to_cpu(msg->raid_slot)); if (rdev && test_bit(Faulty, &rdev->flags)) clear_bit(Faulty, &rdev->flags); else pr_warn("%s: %d Could not find disk(%d) which is faulty", __func__, __LINE__, le32_to_cpu(msg->raid_slot)); + rcu_read_unlock(); } static int process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg) -- cgit v1.2.3 From fccb60a42cdd863aa80f32214ae58ae13936c927 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Fri, 12 Aug 2016 13:42:41 +0800 Subject: md-cluster: convert the completion to wait queue Previously, we used completion to sync between require dlm lock and sync_ast, however we will have to expose completion.wait and completion.done in dlm_lock_sync_interruptible (introduced later), it is not a common usage for completion, so convert related things to wait queue. Reviewed-by: NeilBrown Signed-off-by: Guoqing Jiang Signed-off-by: Shaohua Li --- drivers/md/md-cluster.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index b4dc211923c7..c94715159b9b 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -25,7 +25,8 @@ struct dlm_lock_resource { struct dlm_lksb lksb; char *name; /* lock name. */ uint32_t flags; /* flags to pass to dlm_lock() */ - struct completion completion; /* completion for synchronized locking */ + wait_queue_head_t sync_locking; /* wait queue for synchronized locking */ + bool sync_locking_done; void (*bast)(void *arg, int mode); /* blocking AST function pointer*/ struct mddev *mddev; /* pointing back to mddev. */ int mode; @@ -118,7 +119,8 @@ static void sync_ast(void *arg) struct dlm_lock_resource *res; res = arg; - complete(&res->completion); + res->sync_locking_done = true; + wake_up(&res->sync_locking); } static int dlm_lock_sync(struct dlm_lock_resource *res, int mode) @@ -130,7 +132,8 @@ static int dlm_lock_sync(struct dlm_lock_resource *res, int mode) 0, sync_ast, res, res->bast); if (ret) return ret; - wait_for_completion(&res->completion); + wait_event(res->sync_locking, res->sync_locking_done); + res->sync_locking_done = false; if (res->lksb.sb_status == 0) res->mode = mode; return res->lksb.sb_status; @@ -151,7 +154,8 @@ static struct dlm_lock_resource *lockres_init(struct mddev *mddev, res = kzalloc(sizeof(struct dlm_lock_resource), GFP_KERNEL); if (!res) return NULL; - init_completion(&res->completion); + init_waitqueue_head(&res->sync_locking); + res->sync_locking_done = false; res->ls = cinfo->lockspace; res->mddev = mddev; res->mode = DLM_LOCK_IV; @@ -208,7 +212,7 @@ static void lockres_free(struct dlm_lock_resource *res) if (unlikely(ret != 0)) pr_err("failed to unlock %s return %d\n", res->name, ret); else - wait_for_completion(&res->completion); + wait_event(res->sync_locking, res->sync_locking_done); kfree(res->name); kfree(res->lksb.sb_lvbptr); -- cgit v1.2.3 From 7bcda7149dd6911bee15a51b22477f592f8b9620 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Fri, 12 Aug 2016 13:42:42 +0800 Subject: md-cluster: introduce dlm_lock_sync_interruptible to fix tasks hang When some node leaves cluster, then it's bitmap need to be synced by another node, so "md*_recover" thread is triggered for the purpose. However, with below steps. we can find tasks hang happened either in B or C. 1. Node A create a resyncing cluster raid1, assemble it in other two nodes (B and C). 2. stop array in B and C. 3. stop array in A. linux44:~ # ps aux|grep md|grep D root 5938 0.0 0.1 19852 1964 pts/0 D+ 14:52 0:00 mdadm -S md0 root 5939 0.0 0.0 0 0 ? D 14:52 0:00 [md0_recover] linux44:~ # cat /proc/5939/stack [] dlm_lock_sync+0x71/0x90 [md_cluster] [] recover_bitmaps+0x125/0x220 [md_cluster] [] md_thread+0x16d/0x180 [md_mod] [] kthread+0xb4/0xc0 [] ret_from_fork+0x58/0x90 linux44:~ # cat /proc/5938/stack [] kthread_stop+0x6e/0x120 [] md_unregister_thread+0x40/0x80 [md_mod] [] leave+0x70/0x120 [md_cluster] [] md_cluster_stop+0x14/0x30 [md_mod] [] bitmap_free+0x14b/0x150 [md_mod] [] do_md_stop+0x35b/0x5a0 [md_mod] [] md_ioctl+0x873/0x1590 [md_mod] [] blkdev_ioctl+0x214/0x7d0 [] block_ioctl+0x3d/0x40 [] do_vfs_ioctl+0x2d4/0x4b0 [] SyS_ioctl+0x88/0xa0 [] system_call_fastpath+0x16/0x1b The problem is caused by recover_bitmaps can't reliably abort when the thread is unregistered. So dlm_lock_sync_interruptible is introduced to detect the thread's situation to fix the problem. Reviewed-by: NeilBrown Signed-off-by: Guoqing Jiang Signed-off-by: Shaohua Li --- drivers/md/md-cluster.c | 40 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 39 insertions(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index c94715159b9b..43b90485448d 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -10,6 +10,7 @@ #include +#include #include #include #include @@ -144,6 +145,43 @@ static int dlm_unlock_sync(struct dlm_lock_resource *res) return dlm_lock_sync(res, DLM_LOCK_NL); } +/* + * An variation of dlm_lock_sync, which make lock request could + * be interrupted + */ +static int dlm_lock_sync_interruptible(struct dlm_lock_resource *res, int mode, + struct mddev *mddev) +{ + int ret = 0; + + ret = dlm_lock(res->ls, mode, &res->lksb, + res->flags, res->name, strlen(res->name), + 0, sync_ast, res, res->bast); + if (ret) + return ret; + + wait_event(res->sync_locking, res->sync_locking_done + || kthread_should_stop()); + if (!res->sync_locking_done) { + /* + * the convert queue contains the lock request when request is + * interrupted, and sync_ast could still be run, so need to + * cancel the request and reset completion + */ + ret = dlm_unlock(res->ls, res->lksb.sb_lkid, DLM_LKF_CANCEL, + &res->lksb, res); + res->sync_locking_done = false; + if (unlikely(ret != 0)) + pr_info("failed to cancel previous lock request " + "%s return %d\n", res->name, ret); + return -EPERM; + } else + res->sync_locking_done = false; + if (res->lksb.sb_status == 0) + res->mode = mode; + return res->lksb.sb_status; +} + static struct dlm_lock_resource *lockres_init(struct mddev *mddev, char *name, void (*bastfn)(void *arg, int mode), int with_lvb) { @@ -279,7 +317,7 @@ static void recover_bitmaps(struct md_thread *thread) goto clear_bit; } - ret = dlm_lock_sync(bm_lockres, DLM_LOCK_PW); + ret = dlm_lock_sync_interruptible(bm_lockres, DLM_LOCK_PW, mddev); if (ret) { pr_err("md-cluster: Could not DLM lock %s: %d\n", str, ret); -- cgit v1.2.3 From d6385db94196b253ae5eb3678fa95cdf1f839fcc Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Fri, 12 Aug 2016 13:42:43 +0800 Subject: md-cluster: make resync lock also could be interruptted When one node is perform resync or recovery, other nodes can't get resync lock and could block for a while before it holds the lock, so we can't stop array immediately for this scenario. To make array could be stop quickly, we check MD_CLOSING in dlm_lock_sync_interruptible to make us can interrupt the lock request. Reviewed-by: NeilBrown Signed-off-by: Guoqing Jiang Signed-off-by: Shaohua Li --- drivers/md/md-cluster.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index 43b90485448d..2b13117fb918 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -161,7 +161,8 @@ static int dlm_lock_sync_interruptible(struct dlm_lock_resource *res, int mode, return ret; wait_event(res->sync_locking, res->sync_locking_done - || kthread_should_stop()); + || kthread_should_stop() + || test_bit(MD_CLOSING, &mddev->flags)); if (!res->sync_locking_done) { /* * the convert queue contains the lock request when request is @@ -1045,7 +1046,7 @@ static void metadata_update_cancel(struct mddev *mddev) static int resync_start(struct mddev *mddev) { struct md_cluster_info *cinfo = mddev->cluster_info; - return dlm_lock_sync(cinfo->resync_lockres, DLM_LOCK_EX); + return dlm_lock_sync_interruptible(cinfo->resync_lockres, DLM_LOCK_EX, mddev); } static int resync_info_update(struct mddev *mddev, sector_t lo, sector_t hi) -- cgit v1.2.3 From 1dffddddd8315863f1a6d79c512b737864ef6a1a Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 8 Sep 2016 10:49:06 -0700 Subject: raid5: allow arbitrary max_hw_sectors raid5 will split bio to proper size internally, there is no point to use underlayer disk's max_hw_sectors. In my qemu system, without the change, the raid5 only receives 128k size bio, which reduces the chance of bio merge sending to underlayer disks. Signed-off-by: Shaohua Li --- drivers/md/raid5.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'drivers/md') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index ee7fc3701700..5883ef0d95bf 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -7066,6 +7066,8 @@ static int raid5_run(struct mddev *mddev) else queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, mddev->queue); + + blk_queue_max_hw_sectors(mddev->queue, UINT_MAX); } if (journal_dev) { -- cgit v1.2.3 From f71f1cf97c781db1be8ae0190e0983e1fceac14a Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Tue, 13 Sep 2016 10:28:00 -0700 Subject: md/bitmap: fix wrong cleanup if bitmap_create fails, the bitmap is already cleaned up and the returned value is an error number. We can't do the cleanup again. Reported-by: Christophe JAILLET Signed-off-by: Shaohua Li --- drivers/md/bitmap.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 13041ee37ad6..2d826927a3bf 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -1903,10 +1903,8 @@ int bitmap_copy_from_slot(struct mddev *mddev, int slot, struct bitmap_counts *counts; struct bitmap *bitmap = bitmap_create(mddev, slot); - if (IS_ERR(bitmap)) { - bitmap_free(bitmap); + if (IS_ERR(bitmap)) return PTR_ERR(bitmap); - } rv = bitmap_init_from_disk(bitmap, 0); if (rv) -- cgit v1.2.3 From 90bcf1338193da4c87fb7492c716f225b907acf4 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 14 Sep 2016 14:26:54 -0700 Subject: md: fix a potential deadlock lockdep reports a potential deadlock. Fix this by droping the mutex before md_import_device [ 1137.126601] ====================================================== [ 1137.127013] [ INFO: possible circular locking dependency detected ] [ 1137.127013] 4.8.0-rc4+ #538 Not tainted [ 1137.127013] ------------------------------------------------------- [ 1137.127013] mdadm/16675 is trying to acquire lock: [ 1137.127013] (&bdev->bd_mutex){+.+.+.}, at: [] __blkdev_get+0x63/0x450 [ 1137.127013] but task is already holding lock: [ 1137.127013] (detected_devices_mutex){+.+.+.}, at: [] md_ioctl+0x2ac/0x1f50 [ 1137.127013] which lock already depends on the new lock. [ 1137.127013] the existing dependency chain (in reverse order) is: [ 1137.127013] -> #1 (detected_devices_mutex){+.+.+.}: [ 1137.127013] [] lock_acquire+0xb9/0x220 [ 1137.127013] [] mutex_lock_nested+0x67/0x3d0 [ 1137.127013] [] md_autodetect_dev+0x3f/0x90 [ 1137.127013] [] rescan_partitions+0x1a8/0x2c0 [ 1137.127013] [] __blkdev_reread_part+0x71/0xb0 [ 1137.127013] [] blkdev_reread_part+0x25/0x40 [ 1137.127013] [] blkdev_ioctl+0x51b/0xa30 [ 1137.127013] [] block_ioctl+0x41/0x50 [ 1137.127013] [] do_vfs_ioctl+0x96/0x6e0 [ 1137.127013] [] SyS_ioctl+0x41/0x70 [ 1137.127013] [] entry_SYSCALL_64_fastpath+0x18/0xa8 [ 1137.127013] -> #0 (&bdev->bd_mutex){+.+.+.}: [ 1137.127013] [] __lock_acquire+0x1662/0x1690 [ 1137.127013] [] lock_acquire+0xb9/0x220 [ 1137.127013] [] mutex_lock_nested+0x67/0x3d0 [ 1137.127013] [] __blkdev_get+0x63/0x450 [ 1137.127013] [] blkdev_get+0x227/0x350 [ 1137.127013] [] blkdev_get_by_dev+0x36/0x50 [ 1137.127013] [] lock_rdev+0x35/0x80 [ 1137.127013] [] md_import_device+0xb4/0x1b0 [ 1137.127013] [] md_ioctl+0x2f6/0x1f50 [ 1137.127013] [] blkdev_ioctl+0x283/0xa30 [ 1137.127013] [] block_ioctl+0x41/0x50 [ 1137.127013] [] do_vfs_ioctl+0x96/0x6e0 [ 1137.127013] [] SyS_ioctl+0x41/0x70 [ 1137.127013] [] entry_SYSCALL_64_fastpath+0x18/0xa8 [ 1137.127013] other info that might help us debug this: [ 1137.127013] Possible unsafe locking scenario: [ 1137.127013] CPU0 CPU1 [ 1137.127013] ---- ---- [ 1137.127013] lock(detected_devices_mutex); [ 1137.127013] lock(&bdev->bd_mutex); [ 1137.127013] lock(detected_devices_mutex); [ 1137.127013] lock(&bdev->bd_mutex); [ 1137.127013] *** DEADLOCK *** Cc: Cong Wang Signed-off-by: Shaohua Li --- drivers/md/md.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'drivers/md') diff --git a/drivers/md/md.c b/drivers/md/md.c index cd6797b3cdf7..457b53863117 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -8882,7 +8882,9 @@ static void autostart_arrays(int part) list_del(&node_detected_dev->list); dev = node_detected_dev->dev; kfree(node_detected_dev); + mutex_unlock(&detected_devices_mutex); rdev = md_import_device(dev,0, 90); + mutex_lock(&detected_devices_mutex); if (IS_ERR(rdev)) continue; -- cgit v1.2.3 From 6a0f53ff351dfd10e74752e57b9c27d3397a3c4d Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 20 Sep 2016 10:33:57 +0800 Subject: raid5: fix to detect failure of register_shrinker register_shrinker can fail after commit 1d3d4437eae1 ("vmscan: per-node deferred work"), we should detect the failure of it, otherwise we may fail to register shrinker after raid5 configuration was setup successfully. Signed-off-by: Chao Yu Signed-off-by: Shaohua Li --- drivers/md/raid5.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 5883ef0d95bf..08274b4b4009 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -6670,7 +6670,12 @@ static struct r5conf *setup_conf(struct mddev *mddev) conf->shrinker.count_objects = raid5_cache_count; conf->shrinker.batch = 128; conf->shrinker.flags = 0; - register_shrinker(&conf->shrinker); + if (register_shrinker(&conf->shrinker)) { + printk(KERN_ERR + "md/raid:%s: couldn't register shrinker.\n", + mdname(mddev)); + goto abort; + } sprintf(pers_name, "raid%d", mddev->new_level); conf->thread = md_register_thread(raid5d, mddev, pers_name); -- cgit v1.2.3 From 30c8946566f32493f7f1143437319c42c8a542e9 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 21 Sep 2016 09:07:13 -0700 Subject: raid5: handle register_shrinker failure register_shrinker() now can fail. When it happens, shrinker.nr_deferred is null. We use it to determine if unregister_shrinker is required. Signed-off-by: Shaohua Li --- drivers/md/raid5.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 08274b4b4009..f94472dd0323 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -6372,7 +6372,7 @@ static void free_conf(struct r5conf *conf) { if (conf->log) r5l_exit_log(conf->log); - if (conf->shrinker.seeks) + if (conf->shrinker.nr_deferred) unregister_shrinker(&conf->shrinker); free_thread_groups(conf); -- cgit v1.2.3 From bb086a89a406b5d877ee616f1490fcc81f8e1b2b Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Fri, 30 Sep 2016 09:45:40 -0700 Subject: md: set rotational bit if all disks in an array are non-rotational, set the array non-rotational. This only works for array with all disks populated at startup. Support for disk hotadd/hotremove could be added later if necessary. Acked-by: Tejun Heo Signed-off-by: Shaohua Li --- drivers/md/md.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'drivers/md') diff --git a/drivers/md/md.c b/drivers/md/md.c index 457b53863117..eac84d8ff724 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -5297,6 +5297,21 @@ int md_run(struct mddev *mddev) return err; } if (mddev->queue) { + bool nonrot = true; + + rdev_for_each(rdev, mddev) { + if (rdev->raid_disk >= 0 && + !blk_queue_nonrot(bdev_get_queue(rdev->bdev))) { + nonrot = false; + break; + } + } + if (mddev->degraded) + nonrot = false; + if (nonrot) + queue_flag_set_unlocked(QUEUE_FLAG_NONROT, mddev->queue); + else + queue_flag_clear_unlocked(QUEUE_FLAG_NONROT, mddev->queue); mddev->queue->backing_dev_info.congested_data = mddev; mddev->queue->backing_dev_info.congested_fn = md_congested; } -- cgit v1.2.3