diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2015-09-05 17:52:22 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2015-09-05 17:52:22 -0700 |
commit | 2a013e37ce691a7c072df27b35e9790fc8f5a82f (patch) | |
tree | 0d81e512ebb93484602a0802396ed7c1f465d3b4 /drivers | |
parent | 17447717a3266965e257d3eae79d89539ce3ec0a (diff) | |
parent | e89c6fdf9e0eb1b5a03574d4ca73e83eae8deb91 (diff) | |
download | linux-2a013e37ce691a7c072df27b35e9790fc8f5a82f.tar.bz2 |
Merge tag 'md/4.3' of git://neil.brown.name/md
Pull md updates from Neil Brown:
- an assortment of little fixes, several for minor races only likely to
be hit during testing
- further cluster-md-raid1 development, not ready for real use yet.
- new RAID6 syndrome code for ARM NEON
- fix a race where a write can return before failure of one device is
properly recorded in metadata, so an immediate crash might result in
that write being lost.
* tag 'md/4.3' of git://neil.brown.name/md: (33 commits)
md/raid5: ensure device failure recorded before write request returns.
md/raid5: use bio_list for the list of bios to return.
md/raid10: ensure device failure recorded before write request returns.
md/raid1: ensure device failure recorded before write request returns.
md-cluster: remove inappropriate try_module_get from join()
md: extend spinlock protection in register_md_cluster_operations
md-cluster: Read the disk bitmap sb and check if it needs recovery
md-cluster: only call complete(&cinfo->completion) when node join cluster
md-cluster: add missed lockres_free
md-cluster: remove the unused sb_lock
md-cluster: init suspend_list and suspend_lock early in join
md-cluster: add the error check if failed to get dlm lock
md-cluster: init completion within lockres_init
md-cluster: fix deadlock issue on message lock
md-cluster: transfer the resync ownership to another node
md-cluster: split recover_slot for future code reuse
md-cluster: use %pU to print UUIDs
md: setup safemode_timer before it's being used
md/raid5: handle possible race as reshape completes.
md: sync sync_completed has correct value as recovery finishes.
...
Diffstat (limited to 'drivers')
-rw-r--r-- | drivers/md/md-cluster.c | 159 | ||||
-rw-r--r-- | drivers/md/md.c | 110 | ||||
-rw-r--r-- | drivers/md/raid0.c | 75 | ||||
-rw-r--r-- | drivers/md/raid1.c | 30 | ||||
-rw-r--r-- | drivers/md/raid1.h | 5 | ||||
-rw-r--r-- | drivers/md/raid10.c | 33 | ||||
-rw-r--r-- | drivers/md/raid10.h | 6 | ||||
-rw-r--r-- | drivers/md/raid5.c | 140 | ||||
-rw-r--r-- | drivers/md/raid5.h | 5 |
9 files changed, 373 insertions, 190 deletions
diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index 0072190515e0..11e3bc9d2a4b 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -45,6 +45,7 @@ struct resync_info { /* md_cluster_info flags */ #define MD_CLUSTER_WAITING_FOR_NEWDISK 1 #define MD_CLUSTER_SUSPEND_READ_BALANCING 2 +#define MD_CLUSTER_BEGIN_JOIN_CLUSTER 3 struct md_cluster_info { @@ -52,7 +53,6 @@ struct md_cluster_info { dlm_lockspace_t *lockspace; int slot_number; struct completion completion; - struct dlm_lock_resource *sb_lock; struct mutex sb_mutex; struct dlm_lock_resource *bitmap_lockres; struct list_head suspend_list; @@ -75,6 +75,7 @@ enum msg_type { NEWDISK, REMOVE, RE_ADD, + BITMAP_NEEDS_SYNC, }; struct cluster_msg { @@ -99,7 +100,6 @@ static int dlm_lock_sync(struct dlm_lock_resource *res, int mode) { int ret = 0; - init_completion(&res->completion); ret = dlm_lock(res->ls, mode, &res->lksb, res->flags, res->name, strlen(res->name), 0, sync_ast, res, res->bast); @@ -124,6 +124,7 @@ static struct dlm_lock_resource *lockres_init(struct mddev *mddev, res = kzalloc(sizeof(struct dlm_lock_resource), GFP_KERNEL); if (!res) return NULL; + init_completion(&res->completion); res->ls = cinfo->lockspace; res->mddev = mddev; namelen = strlen(name); @@ -165,11 +166,24 @@ out_err: static void lockres_free(struct dlm_lock_resource *res) { + int ret; + if (!res) return; - init_completion(&res->completion); - dlm_unlock(res->ls, res->lksb.sb_lkid, 0, &res->lksb, res); + /* cancel a lock request or a conversion request that is blocked */ + res->flags |= DLM_LKF_CANCEL; +retry: + ret = dlm_unlock(res->ls, res->lksb.sb_lkid, 0, &res->lksb, res); + if (unlikely(ret != 0)) { + pr_info("%s: failed to unlock %s return %d\n", __func__, res->name, ret); + + /* if a lock conversion is cancelled, then the lock is put + * back to grant queue, need to ensure it is unlocked */ + if (ret == -DLM_ECANCEL) + goto retry; + } + res->flags &= ~DLM_LKF_CANCEL; wait_for_completion(&res->completion); kfree(res->name); @@ -177,18 +191,6 @@ static void lockres_free(struct dlm_lock_resource *res) kfree(res); } -static char *pretty_uuid(char *dest, char *src) -{ - int i, len = 0; - - for (i = 0; i < 16; i++) { - if (i == 4 || i == 6 || i == 8 || i == 10) - len += sprintf(dest + len, "-"); - len += sprintf(dest + len, "%02x", (__u8)src[i]); - } - return dest; -} - static void add_resync_info(struct mddev *mddev, struct dlm_lock_resource *lockres, sector_t lo, sector_t hi) { @@ -281,16 +283,11 @@ static void recover_prep(void *arg) set_bit(MD_CLUSTER_SUSPEND_READ_BALANCING, &cinfo->state); } -static void recover_slot(void *arg, struct dlm_slot *slot) +static void __recover_slot(struct mddev *mddev, int slot) { - struct mddev *mddev = arg; struct md_cluster_info *cinfo = mddev->cluster_info; - pr_info("md-cluster: %s Node %d/%d down. My slot: %d. Initiating recovery.\n", - mddev->bitmap_info.cluster_name, - slot->nodeid, slot->slot, - cinfo->slot_number); - set_bit(slot->slot - 1, &cinfo->recovery_map); + set_bit(slot, &cinfo->recovery_map); if (!cinfo->recovery_thread) { cinfo->recovery_thread = md_register_thread(recover_bitmaps, mddev, "recover"); @@ -302,6 +299,20 @@ static void recover_slot(void *arg, struct dlm_slot *slot) md_wakeup_thread(cinfo->recovery_thread); } +static void recover_slot(void *arg, struct dlm_slot *slot) +{ + struct mddev *mddev = arg; + struct md_cluster_info *cinfo = mddev->cluster_info; + + pr_info("md-cluster: %s Node %d/%d down. My slot: %d. Initiating recovery.\n", + mddev->bitmap_info.cluster_name, + slot->nodeid, slot->slot, + cinfo->slot_number); + /* deduct one since dlm slot starts from one while the num of + * cluster-md begins with 0 */ + __recover_slot(mddev, slot->slot - 1); +} + static void recover_done(void *arg, struct dlm_slot *slots, int num_slots, int our_slot, uint32_t generation) @@ -310,10 +321,17 @@ static void recover_done(void *arg, struct dlm_slot *slots, struct md_cluster_info *cinfo = mddev->cluster_info; cinfo->slot_number = our_slot; - complete(&cinfo->completion); + /* completion is only need to be complete when node join cluster, + * it doesn't need to run during another node's failure */ + if (test_bit(MD_CLUSTER_BEGIN_JOIN_CLUSTER, &cinfo->state)) { + complete(&cinfo->completion); + clear_bit(MD_CLUSTER_BEGIN_JOIN_CLUSTER, &cinfo->state); + } clear_bit(MD_CLUSTER_SUSPEND_READ_BALANCING, &cinfo->state); } +/* the ops is called when node join the cluster, and do lock recovery + * if node failure occurs */ static const struct dlm_lockspace_ops md_ls_ops = { .recover_prep = recover_prep, .recover_slot = recover_slot, @@ -388,7 +406,7 @@ static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg) int len; len = snprintf(disk_uuid, 64, "DEVICE_UUID="); - pretty_uuid(disk_uuid + len, cmsg->uuid); + sprintf(disk_uuid + len, "%pU", cmsg->uuid); snprintf(raid_slot, 16, "RAID_DISK=%d", cmsg->raid_slot); pr_info("%s:%d Sending kobject change with %s and %s\n", __func__, __LINE__, disk_uuid, raid_slot); init_completion(&cinfo->newdisk_completion); @@ -457,6 +475,11 @@ static void process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg) __func__, __LINE__, msg->slot); process_readd_disk(mddev, msg); break; + case BITMAP_NEEDS_SYNC: + pr_info("%s: %d Received BITMAP_NEEDS_SYNC from %d\n", + __func__, __LINE__, msg->slot); + __recover_slot(mddev, msg->slot); + break; default: pr_warn("%s:%d Received unknown message from %d\n", __func__, __LINE__, msg->slot); @@ -472,6 +495,7 @@ static void recv_daemon(struct md_thread *thread) struct dlm_lock_resource *ack_lockres = cinfo->ack_lockres; struct dlm_lock_resource *message_lockres = cinfo->message_lockres; struct cluster_msg msg; + int ret; /*get CR on Message*/ if (dlm_lock_sync(message_lockres, DLM_LOCK_CR)) { @@ -484,13 +508,21 @@ static void recv_daemon(struct md_thread *thread) process_recvd_msg(thread->mddev, &msg); /*release CR on ack_lockres*/ - dlm_unlock_sync(ack_lockres); - /*up-convert to EX on message_lockres*/ - dlm_lock_sync(message_lockres, DLM_LOCK_EX); + ret = dlm_unlock_sync(ack_lockres); + if (unlikely(ret != 0)) + pr_info("unlock ack failed return %d\n", ret); + /*up-convert to PR on message_lockres*/ + ret = dlm_lock_sync(message_lockres, DLM_LOCK_PR); + if (unlikely(ret != 0)) + pr_info("lock PR on msg failed return %d\n", ret); /*get CR on ack_lockres again*/ - dlm_lock_sync(ack_lockres, DLM_LOCK_CR); + ret = dlm_lock_sync(ack_lockres, DLM_LOCK_CR); + if (unlikely(ret != 0)) + pr_info("lock CR on ack failed return %d\n", ret); /*release CR on message_lockres*/ - dlm_unlock_sync(message_lockres); + ret = dlm_unlock_sync(message_lockres); + if (unlikely(ret != 0)) + pr_info("unlock msg failed return %d\n", ret); } /* lock_comm() @@ -519,7 +551,7 @@ static void unlock_comm(struct md_cluster_info *cinfo) * The function: * 1. Grabs the message lockresource in EX mode * 2. Copies the message to the message LVB - * 3. Downconverts message lockresource to CR + * 3. Downconverts message lockresource to CW * 4. Upconverts ack lock resource from CR to EX. This forces the BAST on other nodes * and the other nodes read the message. The thread will wait here until all other * nodes have released ack lock resource. @@ -540,12 +572,12 @@ static int __sendmsg(struct md_cluster_info *cinfo, struct cluster_msg *cmsg) memcpy(cinfo->message_lockres->lksb.sb_lvbptr, (void *)cmsg, sizeof(struct cluster_msg)); - /*down-convert EX to CR on Message*/ - error = dlm_lock_sync(cinfo->message_lockres, DLM_LOCK_CR); + /*down-convert EX to CW on Message*/ + error = dlm_lock_sync(cinfo->message_lockres, DLM_LOCK_CW); if (error) { - pr_err("md-cluster: failed to convert EX to CR on MESSAGE(%d)\n", + pr_err("md-cluster: failed to convert EX to CW on MESSAGE(%d)\n", error); - goto failed_message; + goto failed_ack; } /*up-convert CR to EX on Ack*/ @@ -565,7 +597,13 @@ static int __sendmsg(struct md_cluster_info *cinfo, struct cluster_msg *cmsg) } failed_ack: - dlm_unlock_sync(cinfo->message_lockres); + error = dlm_unlock_sync(cinfo->message_lockres); + if (unlikely(error != 0)) { + pr_err("md-cluster: failed convert to NL on MESSAGE(%d)\n", + error); + /* in case the message can't be released due to some reason */ + goto failed_ack; + } failed_message: return error; } @@ -587,6 +625,7 @@ static int gather_all_resync_info(struct mddev *mddev, int total_slots) struct dlm_lock_resource *bm_lockres; struct suspend_info *s; char str[64]; + sector_t lo, hi; for (i = 0; i < total_slots; i++) { @@ -617,9 +656,24 @@ static int gather_all_resync_info(struct mddev *mddev, int total_slots) lockres_free(bm_lockres); continue; } - if (ret) + if (ret) { + lockres_free(bm_lockres); goto out; - /* TODO: Read the disk bitmap sb and check if it needs recovery */ + } + + /* Read the disk bitmap sb and check if it needs recovery */ + ret = bitmap_copy_from_slot(mddev, i, &lo, &hi, false); + if (ret) { + pr_warn("md-cluster: Could not gather bitmaps from slot %d", i); + lockres_free(bm_lockres); + continue; + } + if ((hi > 0) && (lo < mddev->recovery_cp)) { + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + mddev->recovery_cp = lo; + md_check_recovery(mddev); + } + dlm_unlock_sync(bm_lockres); lockres_free(bm_lockres); } @@ -633,20 +687,20 @@ static int join(struct mddev *mddev, int nodes) int ret, ops_rv; char str[64]; - if (!try_module_get(THIS_MODULE)) - return -ENOENT; - cinfo = kzalloc(sizeof(struct md_cluster_info), GFP_KERNEL); if (!cinfo) return -ENOMEM; + INIT_LIST_HEAD(&cinfo->suspend_list); + spin_lock_init(&cinfo->suspend_lock); init_completion(&cinfo->completion); + set_bit(MD_CLUSTER_BEGIN_JOIN_CLUSTER, &cinfo->state); mutex_init(&cinfo->sb_mutex); mddev->cluster_info = cinfo; memset(str, 0, 64); - pretty_uuid(str, mddev->uuid); + sprintf(str, "%pU", mddev->uuid); ret = dlm_new_lockspace(str, mddev->bitmap_info.cluster_name, DLM_LSFL_FS, LVB_SIZE, &md_ls_ops, mddev, &ops_rv, &cinfo->lockspace); @@ -659,12 +713,6 @@ static int join(struct mddev *mddev, int nodes) ret = -ERANGE; goto err; } - cinfo->sb_lock = lockres_init(mddev, "cmd-super", - NULL, 0); - if (!cinfo->sb_lock) { - ret = -ENOMEM; - goto err; - } /* Initiate the communication resources */ ret = -ENOMEM; cinfo->recv_thread = md_register_thread(recv_daemon, mddev, "cluster_recv"); @@ -705,9 +753,6 @@ static int join(struct mddev *mddev, int nodes) goto err; } - INIT_LIST_HEAD(&cinfo->suspend_list); - spin_lock_init(&cinfo->suspend_lock); - ret = gather_all_resync_info(mddev, nodes); if (ret) goto err; @@ -719,12 +764,10 @@ err: lockres_free(cinfo->ack_lockres); lockres_free(cinfo->no_new_dev_lockres); lockres_free(cinfo->bitmap_lockres); - lockres_free(cinfo->sb_lock); if (cinfo->lockspace) dlm_release_lockspace(cinfo->lockspace, 2); mddev->cluster_info = NULL; kfree(cinfo); - module_put(THIS_MODULE); return ret; } @@ -740,7 +783,6 @@ static int leave(struct mddev *mddev) lockres_free(cinfo->token_lockres); lockres_free(cinfo->ack_lockres); lockres_free(cinfo->no_new_dev_lockres); - lockres_free(cinfo->sb_lock); lockres_free(cinfo->bitmap_lockres); dlm_release_lockspace(cinfo->lockspace, 2); return 0; @@ -817,8 +859,17 @@ static int resync_start(struct mddev *mddev, sector_t lo, sector_t hi) static void resync_finish(struct mddev *mddev) { + struct md_cluster_info *cinfo = mddev->cluster_info; + struct cluster_msg cmsg; + int slot = cinfo->slot_number - 1; + pr_info("%s:%d\n", __func__, __LINE__); resync_send(mddev, RESYNCING, 0, 0); + if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { + cmsg.type = cpu_to_le32(BITMAP_NEEDS_SYNC); + cmsg.slot = cpu_to_le32(slot); + sendmsg(cinfo, &cmsg); + } } static int area_resyncing(struct mddev *mddev, int direction, diff --git a/drivers/md/md.c b/drivers/md/md.c index 40332625713b..4f5ecbe94ccb 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -483,6 +483,8 @@ static void mddev_put(struct mddev *mddev) bioset_free(bs); } +static void md_safemode_timeout(unsigned long data); + void mddev_init(struct mddev *mddev) { mutex_init(&mddev->open_mutex); @@ -490,7 +492,8 @@ void mddev_init(struct mddev *mddev) mutex_init(&mddev->bitmap_info.mutex); INIT_LIST_HEAD(&mddev->disks); INIT_LIST_HEAD(&mddev->all_mddevs); - init_timer(&mddev->safemode_timer); + setup_timer(&mddev->safemode_timer, md_safemode_timeout, + (unsigned long) mddev); atomic_set(&mddev->active, 1); atomic_set(&mddev->openers, 0); atomic_set(&mddev->active_io, 0); @@ -3255,8 +3258,6 @@ int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale) return 0; } -static void md_safemode_timeout(unsigned long data); - static ssize_t safe_delay_show(struct mddev *mddev, char *page) { @@ -4189,6 +4190,8 @@ action_show(struct mddev *mddev, char *page) type = "repair"; } else if (test_bit(MD_RECOVERY_RECOVER, &recovery)) type = "recover"; + else if (mddev->reshape_position != MaxSector) + type = "reshape"; } return sprintf(page, "%s\n", type); } @@ -5180,8 +5183,6 @@ int md_run(struct mddev *mddev) atomic_set(&mddev->max_corr_read_errors, MD_DEFAULT_MAX_CORRECTED_READ_ERRORS); mddev->safemode = 0; - mddev->safemode_timer.function = md_safemode_timeout; - mddev->safemode_timer.data = (unsigned long) mddev; mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */ mddev->in_sync = 1; smp_wmb(); @@ -5194,6 +5195,11 @@ int md_run(struct mddev *mddev) if (sysfs_link_rdev(mddev, rdev)) /* failure here is OK */; + if (mddev->degraded && !mddev->ro) + /* This ensures that recovering status is reported immediately + * via sysfs - until a lack of spares is confirmed. + */ + set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); if (mddev->flags & MD_UPDATE_SB_FLAGS) @@ -5741,16 +5747,16 @@ static int get_bitmap_file(struct mddev *mddev, void __user * arg) err = 0; spin_lock(&mddev->lock); - /* bitmap disabled, zero the first byte and copy out */ - if (!mddev->bitmap_info.file) - file->pathname[0] = '\0'; - else if ((ptr = file_path(mddev->bitmap_info.file, - file->pathname, sizeof(file->pathname))), - IS_ERR(ptr)) - err = PTR_ERR(ptr); - else - memmove(file->pathname, ptr, - sizeof(file->pathname)-(ptr-file->pathname)); + /* bitmap enabled */ + if (mddev->bitmap_info.file) { + ptr = file_path(mddev->bitmap_info.file, file->pathname, + sizeof(file->pathname)); + if (IS_ERR(ptr)) + err = PTR_ERR(ptr); + else + memmove(file->pathname, ptr, + sizeof(file->pathname)-(ptr-file->pathname)); + } spin_unlock(&mddev->lock); if (err == 0 && @@ -7069,7 +7075,7 @@ static void status_unused(struct seq_file *seq) seq_printf(seq, "\n"); } -static void status_resync(struct seq_file *seq, struct mddev *mddev) +static int status_resync(struct seq_file *seq, struct mddev *mddev) { sector_t max_sectors, resync, res; unsigned long dt, db; @@ -7077,18 +7083,32 @@ static void status_resync(struct seq_file *seq, struct mddev *mddev) int scale; unsigned int per_milli; - if (mddev->curr_resync <= 3) - resync = 0; - else - resync = mddev->curr_resync - - atomic_read(&mddev->recovery_active); - if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) max_sectors = mddev->resync_max_sectors; else max_sectors = mddev->dev_sectors; + resync = mddev->curr_resync; + if (resync <= 3) { + if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) + /* Still cleaning up */ + resync = max_sectors; + } else + resync -= atomic_read(&mddev->recovery_active); + + if (resync == 0) { + if (mddev->recovery_cp < MaxSector) { + seq_printf(seq, "\tresync=PENDING"); + return 1; + } + return 0; + } + if (resync < 3) { + seq_printf(seq, "\tresync=DELAYED"); + return 1; + } + WARN_ON(max_sectors == 0); /* Pick 'scale' such that (resync>>scale)*1000 will fit * in a sector_t, and (max_sectors>>scale) will fit in a @@ -7153,6 +7173,7 @@ static void status_resync(struct seq_file *seq, struct mddev *mddev) ((unsigned long)rt % 60)/6); seq_printf(seq, " speed=%ldK/sec", db/2/dt); + return 1; } static void *md_seq_start(struct seq_file *seq, loff_t *pos) @@ -7298,13 +7319,8 @@ static int md_seq_show(struct seq_file *seq, void *v) mddev->pers->status(seq, mddev); seq_printf(seq, "\n "); if (mddev->pers->sync_request) { - if (mddev->curr_resync > 2) { - status_resync(seq, mddev); + if (status_resync(seq, mddev)) seq_printf(seq, "\n "); - } else if (mddev->curr_resync >= 1) - seq_printf(seq, "\tresync=DELAYED\n "); - else if (mddev->recovery_cp < MaxSector) - seq_printf(seq, "\tresync=PENDING\n "); } } else seq_printf(seq, "\n "); @@ -7387,15 +7403,19 @@ int unregister_md_personality(struct md_personality *p) } EXPORT_SYMBOL(unregister_md_personality); -int register_md_cluster_operations(struct md_cluster_operations *ops, struct module *module) +int register_md_cluster_operations(struct md_cluster_operations *ops, + struct module *module) { - if (md_cluster_ops != NULL) - return -EALREADY; + int ret = 0; spin_lock(&pers_lock); - md_cluster_ops = ops; - md_cluster_mod = module; + if (md_cluster_ops != NULL) + ret = -EALREADY; + else { + md_cluster_ops = ops; + md_cluster_mod = module; + } spin_unlock(&pers_lock); - return 0; + return ret; } EXPORT_SYMBOL(register_md_cluster_operations); @@ -7793,7 +7813,8 @@ void md_do_sync(struct md_thread *thread) > (max_sectors >> 4)) || time_after_eq(jiffies, update_time + UPDATE_FREQUENCY) || (j - mddev->curr_resync_completed)*2 - >= mddev->resync_max - mddev->curr_resync_completed + >= mddev->resync_max - mddev->curr_resync_completed || + mddev->curr_resync_completed > mddev->resync_max )) { /* time to update curr_resync_completed */ wait_event(mddev->recovery_wait, @@ -7838,6 +7859,9 @@ void md_do_sync(struct md_thread *thread) break; j += sectors; + if (j > max_sectors) + /* when skipping, extra large numbers can be returned. */ + j = max_sectors; if (j > 2) mddev->curr_resync = j; if (mddev_is_clustered(mddev)) @@ -7906,12 +7930,15 @@ void md_do_sync(struct md_thread *thread) blk_finish_plug(&plug); wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); + if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && + !test_bit(MD_RECOVERY_INTR, &mddev->recovery) && + mddev->curr_resync > 2) { + mddev->curr_resync_completed = mddev->curr_resync; + sysfs_notify(&mddev->kobj, NULL, "sync_completed"); + } /* tell personality that we are finished */ mddev->pers->sync_request(mddev, max_sectors, &skipped); - if (mddev_is_clustered(mddev)) - md_cluster_ops->resync_finish(mddev); - if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) && mddev->curr_resync > 2) { if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { @@ -7945,6 +7972,9 @@ void md_do_sync(struct md_thread *thread) } } skip: + if (mddev_is_clustered(mddev)) + md_cluster_ops->resync_finish(mddev); + set_bit(MD_CHANGE_DEVS, &mddev->flags); spin_lock(&mddev->lock); @@ -7955,11 +7985,11 @@ void md_do_sync(struct md_thread *thread) mddev->resync_max = MaxSector; } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) mddev->resync_min = mddev->curr_resync_completed; + set_bit(MD_RECOVERY_DONE, &mddev->recovery); mddev->curr_resync = 0; spin_unlock(&mddev->lock); wake_up(&resync_wait); - set_bit(MD_RECOVERY_DONE, &mddev->recovery); md_wakeup_thread(mddev->thread); return; } @@ -8128,6 +8158,7 @@ void md_check_recovery(struct mddev *mddev) */ set_bit(MD_RECOVERY_INTR, &mddev->recovery); md_reap_sync_thread(mddev); + clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); goto unlock; } @@ -8574,6 +8605,7 @@ int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, /* Make sure they get written out promptly */ sysfs_notify_dirent_safe(rdev->sysfs_state); set_bit(MD_CHANGE_CLEAN, &rdev->mddev->flags); + set_bit(MD_CHANGE_PENDING, &rdev->mddev->flags); md_wakeup_thread(rdev->mddev->thread); } return rv; diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 59cda501a224..63e619b2f44e 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -83,7 +83,7 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) char b[BDEVNAME_SIZE]; char b2[BDEVNAME_SIZE]; struct r0conf *conf = kzalloc(sizeof(*conf), GFP_KERNEL); - bool discard_supported = false; + unsigned short blksize = 512; if (!conf) return -ENOMEM; @@ -98,6 +98,9 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) sector_div(sectors, mddev->chunk_sectors); rdev1->sectors = sectors * mddev->chunk_sectors; + blksize = max(blksize, queue_logical_block_size( + rdev1->bdev->bd_disk->queue)); + rdev_for_each(rdev2, mddev) { pr_debug("md/raid0:%s: comparing %s(%llu)" " with %s(%llu)\n", @@ -134,6 +137,18 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) } pr_debug("md/raid0:%s: FINAL %d zones\n", mdname(mddev), conf->nr_strip_zones); + /* + * now since we have the hard sector sizes, we can make sure + * chunk size is a multiple of that sector size + */ + if ((mddev->chunk_sectors << 9) % blksize) { + printk(KERN_ERR "md/raid0:%s: chunk_size of %d not multiple of block size %d\n", + mdname(mddev), + mddev->chunk_sectors << 9, blksize); + err = -EINVAL; + goto abort; + } + err = -ENOMEM; conf->strip_zone = kzalloc(sizeof(struct strip_zone)* conf->nr_strip_zones, GFP_KERNEL); @@ -188,16 +203,9 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) } dev[j] = rdev1; - if (mddev->queue) - disk_stack_limits(mddev->gendisk, rdev1->bdev, - rdev1->data_offset << 9); - if (!smallest || (rdev1->sectors < smallest->sectors)) smallest = rdev1; cnt++; - - if (blk_queue_discard(bdev_get_queue(rdev1->bdev))) - discard_supported = true; } if (cnt != mddev->raid_disks) { printk(KERN_ERR "md/raid0:%s: too few disks (%d of %d) - " @@ -258,28 +266,6 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) (unsigned long long)smallest->sectors); } - /* - * now since we have the hard sector sizes, we can make sure - * chunk size is a multiple of that sector size - */ - if ((mddev->chunk_sectors << 9) % queue_logical_block_size(mddev->queue)) { - printk(KERN_ERR "md/raid0:%s: chunk_size of %d not valid\n", - mdname(mddev), - mddev->chunk_sectors << 9); - goto abort; - } - - if (mddev->queue) { - blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9); - blk_queue_io_opt(mddev->queue, - (mddev->chunk_sectors << 9) * mddev->raid_disks); - - if (!discard_supported) - queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, mddev->queue); - else - queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue); - } - pr_debug("md/raid0:%s: done.\n", mdname(mddev)); *private_conf = conf; @@ -378,12 +364,6 @@ static int raid0_run(struct mddev *mddev) if (md_check_no_bitmap(mddev)) return -EINVAL; - if (mddev->queue) { - blk_queue_max_hw_sectors(mddev->queue, mddev->chunk_sectors); - blk_queue_max_write_same_sectors(mddev->queue, mddev->chunk_sectors); - blk_queue_max_discard_sectors(mddev->queue, mddev->chunk_sectors); - } - /* if private is not null, we are here after takeover */ if (mddev->private == NULL) { ret = create_strip_zones(mddev, &conf); @@ -392,6 +372,29 @@ static int raid0_run(struct mddev *mddev) mddev->private = conf; } conf = mddev->private; + if (mddev->queue) { + struct md_rdev *rdev; + bool discard_supported = false; + + rdev_for_each(rdev, mddev) { + disk_stack_limits(mddev->gendisk, rdev->bdev, + rdev->data_offset << 9); + if (blk_queue_discard(bdev_get_queue(rdev->bdev))) + discard_supported = true; + } + blk_queue_max_hw_sectors(mddev->queue, mddev->chunk_sectors); + blk_queue_max_write_same_sectors(mddev->queue, mddev->chunk_sectors); + blk_queue_max_discard_sectors(mddev->queue, mddev->chunk_sectors); + + blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9); + blk_queue_io_opt(mddev->queue, + (mddev->chunk_sectors << 9) * mddev->raid_disks); + + if (!discard_supported) + queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, mddev->queue); + else + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue); + } /* calculate array device size */ md_set_array_sectors(mddev, raid0_size(mddev, 0, 0)); diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index f39d69f884de..4517f06c41ba 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1474,6 +1474,7 @@ static void error(struct mddev *mddev, struct md_rdev *rdev) */ set_bit(MD_RECOVERY_INTR, &mddev->recovery); set_bit(MD_CHANGE_DEVS, &mddev->flags); + set_bit(MD_CHANGE_PENDING, &mddev->flags); printk(KERN_ALERT "md/raid1:%s: Disk failure on %s, disabling device.\n" "md/raid1:%s: Operation continuing on %d devices.\n", @@ -2235,6 +2236,7 @@ static void handle_sync_write_finished(struct r1conf *conf, struct r1bio *r1_bio static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio) { int m; + bool fail = false; for (m = 0; m < conf->raid_disks * 2 ; m++) if (r1_bio->bios[m] == IO_MADE_GOOD) { struct md_rdev *rdev = conf->mirrors[m].rdev; @@ -2247,6 +2249,7 @@ static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio) * narrow down and record precise write * errors. */ + fail = true; if (!narrow_write_error(r1_bio, m)) { md_error(conf->mddev, conf->mirrors[m].rdev); @@ -2258,7 +2261,13 @@ static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio) } if (test_bit(R1BIO_WriteError, &r1_bio->state)) close_write(r1_bio); - raid_end_bio_io(r1_bio); + if (fail) { + spin_lock_irq(&conf->device_lock); + list_add(&r1_bio->retry_list, &conf->bio_end_io_list); + spin_unlock_irq(&conf->device_lock); + md_wakeup_thread(conf->mddev->thread); + } else + raid_end_bio_io(r1_bio); } static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio) @@ -2364,6 +2373,23 @@ static void raid1d(struct md_thread *thread) md_check_recovery(mddev); + if (!list_empty_careful(&conf->bio_end_io_list) && + !test_bit(MD_CHANGE_PENDING, &mddev->flags)) { + LIST_HEAD(tmp); + spin_lock_irqsave(&conf->device_lock, flags); + if (!test_bit(MD_CHANGE_PENDING, &mddev->flags)) { + list_add(&tmp, &conf->bio_end_io_list); + list_del_init(&conf->bio_end_io_list); + } + spin_unlock_irqrestore(&conf->device_lock, flags); + while (!list_empty(&tmp)) { + r1_bio = list_first_entry(&conf->bio_end_io_list, + struct r1bio, retry_list); + list_del(&r1_bio->retry_list); + raid_end_bio_io(r1_bio); + } + } + blk_start_plug(&plug); for (;;) { @@ -2763,6 +2789,7 @@ static struct r1conf *setup_conf(struct mddev *mddev) conf->raid_disks = mddev->raid_disks; conf->mddev = mddev; INIT_LIST_HEAD(&conf->retry_list); + INIT_LIST_HEAD(&conf->bio_end_io_list); spin_lock_init(&conf->resync_lock); init_waitqueue_head(&conf->wait_barrier); @@ -3057,6 +3084,7 @@ static int raid1_reshape(struct mddev *mddev) unfreeze_array(conf); + set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h index 14ebb288c1ef..c52d7139c5d7 100644 --- a/drivers/md/raid1.h +++ b/drivers/md/raid1.h @@ -61,6 +61,11 @@ struct r1conf { * block, or anything else. */ struct list_head retry_list; + /* A separate list of r1bio which just need raid_end_bio_io called. + * This mustn't happen for writes which had any errors if the superblock + * needs to be written. + */ + struct list_head bio_end_io_list; /* queue pending writes to be submitted on unplug */ struct bio_list pending_bio_list; diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index b0fce2ebf7ad..0fc33eb88855 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1589,6 +1589,7 @@ static void error(struct mddev *mddev, struct md_rdev *rdev) set_bit(Blocked, &rdev->flags); set_bit(Faulty, &rdev->flags); set_bit(MD_CHANGE_DEVS, &mddev->flags); + set_bit(MD_CHANGE_PENDING, &mddev->flags); spin_unlock_irqrestore(&conf->device_lock, flags); printk(KERN_ALERT "md/raid10:%s: Disk failure on %s, disabling device.\n" @@ -2623,6 +2624,7 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio) } put_buf(r10_bio); } else { + bool fail = false; for (m = 0; m < conf->copies; m++) { int dev = r10_bio->devs[m].devnum; struct bio *bio = r10_bio->devs[m].bio; @@ -2634,6 +2636,7 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio) r10_bio->sectors, 0); rdev_dec_pending(rdev, conf->mddev); } else if (bio != NULL && bio->bi_error) { + fail = true; if (!narrow_write_error(r10_bio, m)) { md_error(conf->mddev, rdev); set_bit(R10BIO_Degraded, @@ -2654,7 +2657,13 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio) if (test_bit(R10BIO_WriteError, &r10_bio->state)) close_write(r10_bio); - raid_end_bio_io(r10_bio); + if (fail) { + spin_lock_irq(&conf->device_lock); + list_add(&r10_bio->retry_list, &conf->bio_end_io_list); + spin_unlock_irq(&conf->device_lock); + md_wakeup_thread(conf->mddev->thread); + } else + raid_end_bio_io(r10_bio); } } @@ -2669,6 +2678,23 @@ static void raid10d(struct md_thread *thread) md_check_recovery(mddev); + if (!list_empty_careful(&conf->bio_end_io_list) && + !test_bit(MD_CHANGE_PENDING, &mddev->flags)) { + LIST_HEAD(tmp); + spin_lock_irqsave(&conf->device_lock, flags); + if (!test_bit(MD_CHANGE_PENDING, &mddev->flags)) { + list_add(&tmp, &conf->bio_end_io_list); + list_del_init(&conf->bio_end_io_list); + } + spin_unlock_irqrestore(&conf->device_lock, flags); + while (!list_empty(&tmp)) { + r10_bio = list_first_entry(&conf->bio_end_io_list, + struct r10bio, retry_list); + list_del(&r10_bio->retry_list); + raid_end_bio_io(r10_bio); + } + } + blk_start_plug(&plug); for (;;) { @@ -3443,6 +3469,7 @@ static struct r10conf *setup_conf(struct mddev *mddev) conf->reshape_safe = conf->reshape_progress; spin_lock_init(&conf->device_lock); INIT_LIST_HEAD(&conf->retry_list); + INIT_LIST_HEAD(&conf->bio_end_io_list); spin_lock_init(&conf->resync_lock); init_waitqueue_head(&conf->wait_barrier); @@ -4097,7 +4124,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, * at a time, possibly less if that exceeds RESYNC_PAGES, * or we hit a bad block or something. * This might mean we pause for normal IO in the middle of - * a chunk, but that is not a problem was mddev->reshape_position + * a chunk, but that is not a problem as mddev->reshape_position * can record any location. * * If we will want to write to a location that isn't @@ -4121,7 +4148,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, * * In all this the minimum difference in data offsets * (conf->offset_diff - always positive) allows a bit of slack, - * so next can be after 'safe', but not by more than offset_disk + * so next can be after 'safe', but not by more than offset_diff * * We need to prepare all the bios here before we start any IO * to ensure the size we choose is acceptable to all devices. diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h index 5ee6473ddc2c..6fc2c75759bf 100644 --- a/drivers/md/raid10.h +++ b/drivers/md/raid10.h @@ -53,6 +53,12 @@ struct r10conf { sector_t offset_diff; struct list_head retry_list; + /* A separate list of r1bio which just need raid_end_bio_io called. + * This mustn't happen for writes which had any errors if the superblock + * needs to be written. + */ + struct list_head bio_end_io_list; + /* queue pending writes and submit them on unplug */ struct bio_list pending_bio_list; int pending_count; diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index b29e89cb815b..15ef2c641b2b 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -223,18 +223,14 @@ static int raid6_idx_to_slot(int idx, struct stripe_head *sh, return slot; } -static void return_io(struct bio *return_bi) +static void return_io(struct bio_list *return_bi) { - struct bio *bi = return_bi; - while (bi) { - - return_bi = bi->bi_next; - bi->bi_next = NULL; + struct bio *bi; + while ((bi = bio_list_pop(return_bi)) != NULL) { bi->bi_iter.bi_size = 0; trace_block_bio_complete(bdev_get_queue(bi->bi_bdev), bi, 0); bio_endio(bi); - bi = return_bi; } } @@ -1177,7 +1173,7 @@ async_copy_data(int frombio, struct bio *bio, struct page **page, static void ops_complete_biofill(void *stripe_head_ref) { struct stripe_head *sh = stripe_head_ref; - struct bio *return_bi = NULL; + struct bio_list return_bi = BIO_EMPTY_LIST; int i; pr_debug("%s: stripe %llu\n", __func__, @@ -1201,17 +1197,15 @@ static void ops_complete_biofill(void *stripe_head_ref) while (rbi && rbi->bi_iter.bi_sector < dev->sector + STRIPE_SECTORS) { rbi2 = r5_next_bio(rbi, dev->sector); - if (!raid5_dec_bi_active_stripes(rbi)) { - rbi->bi_next = return_bi; - return_bi = rbi; - } + if (!raid5_dec_bi_active_stripes(rbi)) + bio_list_add(&return_bi, rbi); rbi = rbi2; } } } clear_bit(STRIPE_BIOFILL_RUN, &sh->state); - return_io(return_bi); + return_io(&return_bi); set_bit(STRIPE_HANDLE, &sh->state); release_stripe(sh); @@ -2517,6 +2511,7 @@ static void error(struct mddev *mddev, struct md_rdev *rdev) set_bit(Blocked, &rdev->flags); set_bit(Faulty, &rdev->flags); set_bit(MD_CHANGE_DEVS, &mddev->flags); + set_bit(MD_CHANGE_PENDING, &mddev->flags); printk(KERN_ALERT "md/raid:%s: Disk failure on %s, disabling device.\n" "md/raid:%s: Operation continuing on %d devices.\n", @@ -3069,7 +3064,7 @@ static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous, static void handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, struct stripe_head_state *s, int disks, - struct bio **return_bi) + struct bio_list *return_bi) { int i; BUG_ON(sh->batch_head); @@ -3114,8 +3109,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, bi->bi_error = -EIO; if (!raid5_dec_bi_active_stripes(bi)) { md_write_end(conf->mddev); - bi->bi_next = *return_bi; - *return_bi = bi; + bio_list_add(return_bi, bi); } bi = nextbi; } @@ -3139,8 +3133,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, bi->bi_error = -EIO; if (!raid5_dec_bi_active_stripes(bi)) { md_write_end(conf->mddev); - bi->bi_next = *return_bi; - *return_bi = bi; + bio_list_add(return_bi, bi); } bi = bi2; } @@ -3163,10 +3156,8 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, r5_next_bio(bi, sh->dev[i].sector); bi->bi_error = -EIO; - if (!raid5_dec_bi_active_stripes(bi)) { - bi->bi_next = *return_bi; - *return_bi = bi; - } + if (!raid5_dec_bi_active_stripes(bi)) + bio_list_add(return_bi, bi); bi = nextbi; } } @@ -3445,7 +3436,7 @@ static void break_stripe_batch_list(struct stripe_head *head_sh, * never LOCKED, so we don't need to test 'failed' directly. */ static void handle_stripe_clean_event(struct r5conf *conf, - struct stripe_head *sh, int disks, struct bio **return_bi) + struct stripe_head *sh, int disks, struct bio_list *return_bi) { int i; struct r5dev *dev; @@ -3479,8 +3470,7 @@ returnbi: wbi2 = r5_next_bio(wbi, dev->sector); if (!raid5_dec_bi_active_stripes(wbi)) { md_write_end(conf->mddev); - wbi->bi_next = *return_bi; - *return_bi = wbi; + bio_list_add(return_bi, wbi); } wbi = wbi2; } @@ -4613,7 +4603,15 @@ finish: md_wakeup_thread(conf->mddev->thread); } - return_io(s.return_bi); + if (!bio_list_empty(&s.return_bi)) { + if (test_bit(MD_CHANGE_PENDING, &conf->mddev->flags)) { + spin_lock_irq(&conf->device_lock); + bio_list_merge(&conf->return_bi, &s.return_bi); + spin_unlock_irq(&conf->device_lock); + md_wakeup_thread(conf->mddev->thread); + } else + return_io(&s.return_bi); + } clear_bit_unlock(STRIPE_ACTIVE, &sh->state); } @@ -4672,12 +4670,12 @@ static int raid5_congested(struct mddev *mddev, int bits) static int in_chunk_boundary(struct mddev *mddev, struct bio *bio) { + struct r5conf *conf = mddev->private; sector_t sector = bio->bi_iter.bi_sector + get_start_sect(bio->bi_bdev); - unsigned int chunk_sectors = mddev->chunk_sectors; + unsigned int chunk_sectors; unsigned int bio_sectors = bio_sectors(bio); - if (mddev->new_chunk_sectors < mddev->chunk_sectors) - chunk_sectors = mddev->new_chunk_sectors; + chunk_sectors = min(conf->chunk_sectors, conf->prev_chunk_sectors); return chunk_sectors >= ((sector & (chunk_sectors - 1)) + bio_sectors); } @@ -5325,6 +5323,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk sector_t stripe_addr; int reshape_sectors; struct list_head stripes; + sector_t retn; if (sector_nr == 0) { /* If restarting in the middle, skip the initial sectors */ @@ -5332,6 +5331,10 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk conf->reshape_progress < raid5_size(mddev, 0, 0)) { sector_nr = raid5_size(mddev, 0, 0) - conf->reshape_progress; + } else if (mddev->reshape_backwards && + conf->reshape_progress == MaxSector) { + /* shouldn't happen, but just in case, finish up.*/ + sector_nr = MaxSector; } else if (!mddev->reshape_backwards && conf->reshape_progress > 0) sector_nr = conf->reshape_progress; @@ -5340,7 +5343,8 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk mddev->curr_resync_completed = sector_nr; sysfs_notify(&mddev->kobj, NULL, "sync_completed"); *skipped = 1; - return sector_nr; + retn = sector_nr; + goto finish; } } @@ -5348,10 +5352,8 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk * If old and new chunk sizes differ, we need to process the * largest of these */ - if (mddev->new_chunk_sectors > mddev->chunk_sectors) - reshape_sectors = mddev->new_chunk_sectors; - else - reshape_sectors = mddev->chunk_sectors; + + reshape_sectors = max(conf->chunk_sectors, conf->prev_chunk_sectors); /* We update the metadata at least every 10 seconds, or when * the data about to be copied would over-write the source of @@ -5366,11 +5368,16 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk safepos = conf->reshape_safe; sector_div(safepos, data_disks); if (mddev->reshape_backwards) { - writepos -= min_t(sector_t, reshape_sectors, writepos); + BUG_ON(writepos < reshape_sectors); + writepos -= reshape_sectors; readpos += reshape_sectors; safepos += reshape_sectors; } else { writepos += reshape_sectors; + /* readpos and safepos are worst-case calculations. + * A negative number is overly pessimistic, and causes + * obvious problems for unsigned storage. So clip to 0. + */ readpos -= min_t(sector_t, reshape_sectors, readpos); safepos -= min_t(sector_t, reshape_sectors, safepos); } @@ -5513,7 +5520,10 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk * then we need to write out the superblock. */ sector_nr += reshape_sectors; - if ((sector_nr - mddev->curr_resync_completed) * 2 + retn = reshape_sectors; +finish: + if (mddev->curr_resync_completed > mddev->resync_max || + (sector_nr - mddev->curr_resync_completed) * 2 >= mddev->resync_max - mddev->curr_resync_completed) { /* Cannot proceed until we've updated the superblock... */ wait_event(conf->wait_for_overlap, @@ -5538,7 +5548,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk sysfs_notify(&mddev->kobj, NULL, "sync_completed"); } ret: - return reshape_sectors; + return retn; } static inline sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipped) @@ -5794,6 +5804,18 @@ static void raid5d(struct md_thread *thread) md_check_recovery(mddev); + if (!bio_list_empty(&conf->return_bi) && + !test_bit(MD_CHANGE_PENDING, &mddev->flags)) { + struct bio_list tmp = BIO_EMPTY_LIST; + spin_lock_irq(&conf->device_lock); + if (!test_bit(MD_CHANGE_PENDING, &mddev->flags)) { + bio_list_merge(&tmp, &conf->return_bi); + bio_list_init(&conf->return_bi); + } + spin_unlock_irq(&conf->device_lock); + return_io(&tmp); + } + blk_start_plug(&plug); handled = 0; spin_lock_irq(&conf->device_lock); @@ -6234,8 +6256,8 @@ raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks) /* size is defined by the smallest of previous and new size */ raid_disks = min(conf->raid_disks, conf->previous_raid_disks); - sectors &= ~((sector_t)mddev->chunk_sectors - 1); - sectors &= ~((sector_t)mddev->new_chunk_sectors - 1); + sectors &= ~((sector_t)conf->chunk_sectors - 1); + sectors &= ~((sector_t)conf->prev_chunk_sectors - 1); return sectors * (raid_disks - conf->max_degraded); } @@ -6453,6 +6475,7 @@ static struct r5conf *setup_conf(struct mddev *mddev) INIT_LIST_HEAD(&conf->hold_list); INIT_LIST_HEAD(&conf->delayed_list); INIT_LIST_HEAD(&conf->bitmap_list); + bio_list_init(&conf->return_bi); init_llist_head(&conf->released_stripes); atomic_set(&conf->active_stripes, 0); atomic_set(&conf->preread_active_stripes, 0); @@ -6542,6 +6565,9 @@ static struct r5conf *setup_conf(struct mddev *mddev) if (conf->reshape_progress != MaxSector) { conf->prev_chunk_sectors = mddev->chunk_sectors; conf->prev_algo = mddev->layout; + } else { + conf->prev_chunk_sectors = conf->chunk_sectors; + conf->prev_algo = conf->algorithm; } conf->min_nr_stripes = NR_STRIPES; @@ -6661,6 +6687,8 @@ static int run(struct mddev *mddev) sector_t here_new, here_old; int old_disks; int max_degraded = (mddev->level == 6 ? 2 : 1); + int chunk_sectors; + int new_data_disks; if (mddev->new_level != mddev->level) { printk(KERN_ERR "md/raid:%s: unsupported reshape " @@ -6672,28 +6700,25 @@ static int run(struct mddev *mddev) /* reshape_position must be on a new-stripe boundary, and one * further up in new geometry must map after here in old * geometry. + * If the chunk sizes are different, then as we perform reshape + * in units of the largest of the two, reshape_position needs + * be a multiple of the largest chunk size times new data disks. */ here_new = mddev->reshape_position; - if (sector_div(here_new, mddev->new_chunk_sectors * - (mddev->raid_disks - max_degraded))) { + chunk_sectors = max(mddev->chunk_sectors, mddev->new_chunk_sectors); + new_data_disks = mddev->raid_disks - max_degraded; + if (sector_div(here_new, chunk_sectors * new_data_disks)) { printk(KERN_ERR "md/raid:%s: reshape_position not " "on a stripe boundary\n", mdname(mddev)); return -EINVAL; } - reshape_offset = here_new * mddev->new_chunk_sectors; + reshape_offset = here_new * chunk_sectors; /* here_new is the stripe we will write to */ here_old = mddev->reshape_position; - sector_div(here_old, mddev->chunk_sectors * - (old_disks-max_degraded)); + sector_div(here_old, chunk_sectors * (old_disks-max_degraded)); /* here_old is the first stripe that we might need to read * from */ if (mddev->delta_disks == 0) { - if ((here_new * mddev->new_chunk_sectors != - here_old * mddev->chunk_sectors)) { - printk(KERN_ERR "md/raid:%s: reshape position is" - " confused - aborting\n", mdname(mddev)); - return -EINVAL; - } /* We cannot be sure it is safe to start an in-place * reshape. It is only safe if user-space is monitoring * and taking constant backups. @@ -6712,10 +6737,10 @@ static int run(struct mddev *mddev) return -EINVAL; } } else if (mddev->reshape_backwards - ? (here_new * mddev->new_chunk_sectors + min_offset_diff <= - here_old * mddev->chunk_sectors) - : (here_new * mddev->new_chunk_sectors >= - here_old * mddev->chunk_sectors + (-min_offset_diff))) { + ? (here_new * chunk_sectors + min_offset_diff <= + here_old * chunk_sectors) + : (here_new * chunk_sectors >= + here_old * chunk_sectors + (-min_offset_diff))) { /* Reading from the same stripe as writing to - bad */ printk(KERN_ERR "md/raid:%s: reshape_position too early for " "auto-recovery - aborting.\n", @@ -6967,7 +6992,7 @@ static void status(struct seq_file *seq, struct mddev *mddev) int i; seq_printf(seq, " level %d, %dk chunk, algorithm %d", mddev->level, - mddev->chunk_sectors / 2, mddev->layout); + conf->chunk_sectors / 2, mddev->layout); seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->raid_disks - mddev->degraded); for (i = 0; i < conf->raid_disks; i++) seq_printf (seq, "%s", @@ -7173,7 +7198,9 @@ static int raid5_resize(struct mddev *mddev, sector_t sectors) * worth it. */ sector_t newsize; - sectors &= ~((sector_t)mddev->chunk_sectors - 1); + struct r5conf *conf = mddev->private; + + sectors &= ~((sector_t)conf->chunk_sectors - 1); newsize = raid5_size(mddev, sectors, mddev->raid_disks); if (mddev->external_size && mddev->array_sectors > newsize) @@ -7412,6 +7439,7 @@ static void end_reshape(struct r5conf *conf) rdev->data_offset = rdev->new_data_offset; smp_wmb(); conf->reshape_progress = MaxSector; + conf->mddev->reshape_position = MaxSector; spin_unlock_irq(&conf->device_lock); wake_up(&conf->wait_for_overlap); diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index d05144278690..828c2925e68f 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -265,7 +265,7 @@ struct stripe_head_state { int dec_preread_active; unsigned long ops_request; - struct bio *return_bi; + struct bio_list return_bi; struct md_rdev *blocked_rdev; int handle_bad_blocks; }; @@ -476,6 +476,9 @@ struct r5conf { int skip_copy; /* Don't copy data from bio to stripe cache */ struct list_head *last_hold; /* detect hold_list promotions */ + /* bios to have bi_end_io called after metadata is synced */ + struct bio_list return_bi; + atomic_t reshape_stripes; /* stripes with pending writes for reshape */ /* unfortunately we need two cache names as we temporarily have * two caches. |