diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2015-11-04 21:12:47 -0800 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2015-11-04 21:12:47 -0800 |
commit | ac322de6bf5416cb145b58599297b8be73cd86ac (patch) | |
tree | 1a1be9f8b9241159fb4cde14a548eba9a4155b28 | |
parent | ccf21b69a83afaee4d5499e0d03eacf23946e08c (diff) | |
parent | 339421def582abb14c2217aa8c8f28bb2e299174 (diff) | |
download | linux-ac322de6bf5416cb145b58599297b8be73cd86ac.tar.bz2 |
Merge tag 'md/4.4' of git://neil.brown.name/md
Pull md updates from Neil Brown:
"Two major components to this update.
1) The clustered-raid1 support from SUSE is nearly complete. There
are a few outstanding issues being worked on. Maybe half a dozen
patches will bring this to a usable state.
2) The first stage of journalled-raid5 support from Facebook makes an
appearance. With a journal device configured (typically NVRAM or
SSD), the "RAID5 write hole" should be closed - a crash during
degraded operations cannot result in data corruption.
The next stage will be to use the journal as a write-behind cache
so that latency can be reduced and in some cases throughput
increased by performing more full-stripe writes.
* tag 'md/4.4' of git://neil.brown.name/md: (66 commits)
MD: when RAID journal is missing/faulty, block RESTART_ARRAY_RW
MD: set journal disk ->raid_disk
MD: kick out journal disk if it's not fresh
raid5-cache: start raid5 readonly if journal is missing
MD: add new bit to indicate raid array with journal
raid5-cache: IO error handling
raid5: journal disk can't be removed
raid5-cache: add trim support for log
MD: fix info output for journal disk
raid5-cache: use bio chaining
raid5-cache: small log->seq cleanup
raid5-cache: new helper: r5_reserve_log_entry
raid5-cache: inline r5l_alloc_io_unit into r5l_new_meta
raid5-cache: take rdev->data_offset into account early on
raid5-cache: refactor bio allocation
raid5-cache: clean up r5l_get_meta
raid5-cache: simplify state machine when caches flushes are not needed
raid5-cache: factor out a helper to run all stripes for an I/O unit
raid5-cache: rename flushed_ios to finished_ios
raid5-cache: free I/O units earlier
...
-rw-r--r-- | drivers/md/Makefile | 2 | ||||
-rw-r--r-- | drivers/md/bitmap.c | 14 | ||||
-rw-r--r-- | drivers/md/bitmap.h | 4 | ||||
-rw-r--r-- | drivers/md/md-cluster.c | 228 | ||||
-rw-r--r-- | drivers/md/md-cluster.h | 12 | ||||
-rw-r--r-- | drivers/md/md.c | 495 | ||||
-rw-r--r-- | drivers/md/md.h | 17 | ||||
-rw-r--r-- | drivers/md/raid1.c | 41 | ||||
-rw-r--r-- | drivers/md/raid1.h | 7 | ||||
-rw-r--r-- | drivers/md/raid10.c | 2 | ||||
-rw-r--r-- | drivers/md/raid5-cache.c | 1191 | ||||
-rw-r--r-- | drivers/md/raid5.c | 189 | ||||
-rw-r--r-- | drivers/md/raid5.h | 24 | ||||
-rw-r--r-- | include/uapi/linux/raid/md_p.h | 73 |
14 files changed, 1989 insertions, 310 deletions
diff --git a/drivers/md/Makefile b/drivers/md/Makefile index 462f443a4f85..f34979cd141a 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -17,7 +17,7 @@ dm-cache-smq-y += dm-cache-policy-smq.o dm-cache-cleaner-y += dm-cache-policy-cleaner.o dm-era-y += dm-era-target.o md-mod-y += md.o bitmap.o -raid456-y += raid5.o +raid456-y += raid5.o raid5-cache.o # Note: link order is important. All raid personalities # and must come before md.o, as they each initialise diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 48b5890c28e3..4f22e919787a 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -613,12 +613,10 @@ re_read: daemon_sleep = le32_to_cpu(sb->daemon_sleep) * HZ; write_behind = le32_to_cpu(sb->write_behind); sectors_reserved = le32_to_cpu(sb->sectors_reserved); - /* XXX: This is a hack to ensure that we don't use clustering - * in case: - * - dm-raid is in use and - * - the nodes written in bitmap_sb is erroneous. + /* Setup nodes/clustername only if bitmap version is + * cluster-compatible */ - if (!bitmap->mddev->sync_super) { + if (sb->version == cpu_to_le32(BITMAP_MAJOR_CLUSTERED)) { nodes = le32_to_cpu(sb->nodes); strlcpy(bitmap->mddev->bitmap_info.cluster_name, sb->cluster_name, 64); @@ -628,7 +626,7 @@ re_read: if (sb->magic != cpu_to_le32(BITMAP_MAGIC)) reason = "bad magic"; else if (le32_to_cpu(sb->version) < BITMAP_MAJOR_LO || - le32_to_cpu(sb->version) > BITMAP_MAJOR_HI) + le32_to_cpu(sb->version) > BITMAP_MAJOR_CLUSTERED) reason = "unrecognized superblock version"; else if (chunksize < 512) reason = "bitmap chunksize too small"; @@ -1572,7 +1570,7 @@ void bitmap_close_sync(struct bitmap *bitmap) } EXPORT_SYMBOL(bitmap_close_sync); -void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector) +void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector, bool force) { sector_t s = 0; sector_t blocks; @@ -1583,7 +1581,7 @@ void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector) bitmap->last_end_sync = jiffies; return; } - if (time_before(jiffies, (bitmap->last_end_sync + if (!force && time_before(jiffies, (bitmap->last_end_sync + bitmap->mddev->bitmap_info.daemon_sleep))) return; wait_event(bitmap->mddev->recovery_wait, diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h index f1f4dd01090d..7d5c3a610ca5 100644 --- a/drivers/md/bitmap.h +++ b/drivers/md/bitmap.h @@ -9,8 +9,10 @@ #define BITMAP_MAJOR_LO 3 /* version 4 insists the bitmap is in little-endian order * with version 3, it is host-endian which is non-portable + * Version 5 is currently set only for clustered devices */ #define BITMAP_MAJOR_HI 4 +#define BITMAP_MAJOR_CLUSTERED 5 #define BITMAP_MAJOR_HOSTENDIAN 3 /* @@ -255,7 +257,7 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, int degraded); void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, int aborted); void bitmap_close_sync(struct bitmap *bitmap); -void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector); +void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector, bool force); void bitmap_unplug(struct bitmap *bitmap); void bitmap_daemon_work(struct mddev *mddev); diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index 11e3bc9d2a4b..d6a1126d85ce 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -28,6 +28,7 @@ struct dlm_lock_resource { struct completion completion; /* completion for synchronized locking */ void (*bast)(void *arg, int mode); /* blocking AST function pointer*/ struct mddev *mddev; /* pointing back to mddev. */ + int mode; }; struct suspend_info { @@ -53,8 +54,8 @@ struct md_cluster_info { dlm_lockspace_t *lockspace; int slot_number; struct completion completion; - struct mutex sb_mutex; struct dlm_lock_resource *bitmap_lockres; + struct dlm_lock_resource *resync_lockres; struct list_head suspend_list; spinlock_t suspend_lock; struct md_thread *recovery_thread; @@ -79,20 +80,20 @@ enum msg_type { }; struct cluster_msg { - int type; - int slot; + __le32 type; + __le32 slot; /* TODO: Unionize this for smaller footprint */ - sector_t low; - sector_t high; + __le64 low; + __le64 high; char uuid[16]; - int raid_slot; + __le32 raid_slot; }; static void sync_ast(void *arg) { struct dlm_lock_resource *res; - res = (struct dlm_lock_resource *) arg; + res = arg; complete(&res->completion); } @@ -106,6 +107,8 @@ static int dlm_lock_sync(struct dlm_lock_resource *res, int mode) if (ret) return ret; wait_for_completion(&res->completion); + if (res->lksb.sb_status == 0) + res->mode = mode; return res->lksb.sb_status; } @@ -127,6 +130,7 @@ static struct dlm_lock_resource *lockres_init(struct mddev *mddev, init_completion(&res->completion); res->ls = cinfo->lockspace; res->mddev = mddev; + res->mode = DLM_LOCK_IV; namelen = strlen(name); res->name = kzalloc(namelen + 1, GFP_KERNEL); if (!res->name) { @@ -191,8 +195,8 @@ retry: kfree(res); } -static void add_resync_info(struct mddev *mddev, struct dlm_lock_resource *lockres, - sector_t lo, sector_t hi) +static void add_resync_info(struct dlm_lock_resource *lockres, + sector_t lo, sector_t hi) { struct resync_info *ri; @@ -210,7 +214,7 @@ static struct suspend_info *read_resync_info(struct mddev *mddev, struct dlm_loc dlm_lock_sync(lockres, DLM_LOCK_CR); memcpy(&ri, lockres->lksb.sb_lvbptr, sizeof(struct resync_info)); hi = le64_to_cpu(ri.hi); - if (ri.hi > 0) { + if (hi > 0) { s = kzalloc(sizeof(struct suspend_info), GFP_KERNEL); if (!s) goto out; @@ -345,7 +349,7 @@ static const struct dlm_lockspace_ops md_ls_ops = { */ static void ack_bast(void *arg, int mode) { - struct dlm_lock_resource *res = (struct dlm_lock_resource *)arg; + struct dlm_lock_resource *res = arg; struct md_cluster_info *cinfo = res->mddev->cluster_info; if (mode == DLM_LOCK_EX) @@ -358,29 +362,32 @@ static void __remove_suspend_info(struct md_cluster_info *cinfo, int slot) list_for_each_entry_safe(s, tmp, &cinfo->suspend_list, list) if (slot == s->slot) { - pr_info("%s:%d Deleting suspend_info: %d\n", - __func__, __LINE__, slot); list_del(&s->list); kfree(s); break; } } -static void remove_suspend_info(struct md_cluster_info *cinfo, int slot) +static void remove_suspend_info(struct mddev *mddev, int slot) { + struct md_cluster_info *cinfo = mddev->cluster_info; spin_lock_irq(&cinfo->suspend_lock); __remove_suspend_info(cinfo, slot); spin_unlock_irq(&cinfo->suspend_lock); + mddev->pers->quiesce(mddev, 2); } -static void process_suspend_info(struct md_cluster_info *cinfo, +static void process_suspend_info(struct mddev *mddev, int slot, sector_t lo, sector_t hi) { + struct md_cluster_info *cinfo = mddev->cluster_info; struct suspend_info *s; if (!hi) { - remove_suspend_info(cinfo, slot); + remove_suspend_info(mddev, slot); + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + md_wakeup_thread(mddev->thread); return; } s = kzalloc(sizeof(struct suspend_info), GFP_KERNEL); @@ -389,11 +396,14 @@ static void process_suspend_info(struct md_cluster_info *cinfo, s->slot = slot; s->lo = lo; s->hi = hi; + mddev->pers->quiesce(mddev, 1); + mddev->pers->quiesce(mddev, 0); spin_lock_irq(&cinfo->suspend_lock); /* Remove existing entry (if exists) before adding */ __remove_suspend_info(cinfo, slot); list_add(&s->list, &cinfo->suspend_list); spin_unlock_irq(&cinfo->suspend_lock); + mddev->pers->quiesce(mddev, 2); } static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg) @@ -407,7 +417,7 @@ static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg) len = snprintf(disk_uuid, 64, "DEVICE_UUID="); sprintf(disk_uuid + len, "%pU", cmsg->uuid); - snprintf(raid_slot, 16, "RAID_DISK=%d", cmsg->raid_slot); + snprintf(raid_slot, 16, "RAID_DISK=%d", le32_to_cpu(cmsg->raid_slot)); pr_info("%s:%d Sending kobject change with %s and %s\n", __func__, __LINE__, disk_uuid, raid_slot); init_completion(&cinfo->newdisk_completion); set_bit(MD_CLUSTER_WAITING_FOR_NEWDISK, &cinfo->state); @@ -421,64 +431,59 @@ static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg) static void process_metadata_update(struct mddev *mddev, struct cluster_msg *msg) { struct md_cluster_info *cinfo = mddev->cluster_info; - - md_reload_sb(mddev); + md_reload_sb(mddev, le32_to_cpu(msg->raid_slot)); dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR); } static void process_remove_disk(struct mddev *mddev, struct cluster_msg *msg) { - struct md_rdev *rdev = md_find_rdev_nr_rcu(mddev, msg->raid_slot); + struct md_rdev *rdev = md_find_rdev_nr_rcu(mddev, + le32_to_cpu(msg->raid_slot)); if (rdev) md_kick_rdev_from_array(rdev); else - pr_warn("%s: %d Could not find disk(%d) to REMOVE\n", __func__, __LINE__, msg->raid_slot); + pr_warn("%s: %d Could not find disk(%d) to REMOVE\n", + __func__, __LINE__, le32_to_cpu(msg->raid_slot)); } static void process_readd_disk(struct mddev *mddev, struct cluster_msg *msg) { - struct md_rdev *rdev = md_find_rdev_nr_rcu(mddev, msg->raid_slot); + struct md_rdev *rdev = md_find_rdev_nr_rcu(mddev, + le32_to_cpu(msg->raid_slot)); if (rdev && test_bit(Faulty, &rdev->flags)) clear_bit(Faulty, &rdev->flags); else - pr_warn("%s: %d Could not find disk(%d) which is faulty", __func__, __LINE__, msg->raid_slot); + pr_warn("%s: %d Could not find disk(%d) which is faulty", + __func__, __LINE__, le32_to_cpu(msg->raid_slot)); } static void process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg) { - switch (msg->type) { + if (WARN(mddev->cluster_info->slot_number - 1 == le32_to_cpu(msg->slot), + "node %d received it's own msg\n", le32_to_cpu(msg->slot))) + return; + switch (le32_to_cpu(msg->type)) { case METADATA_UPDATED: - pr_info("%s: %d Received message: METADATA_UPDATE from %d\n", - __func__, __LINE__, msg->slot); process_metadata_update(mddev, msg); break; case RESYNCING: - pr_info("%s: %d Received message: RESYNCING from %d\n", - __func__, __LINE__, msg->slot); - process_suspend_info(mddev->cluster_info, msg->slot, - msg->low, msg->high); + process_suspend_info(mddev, le32_to_cpu(msg->slot), + le64_to_cpu(msg->low), + le64_to_cpu(msg->high)); break; case NEWDISK: - pr_info("%s: %d Received message: NEWDISK from %d\n", - __func__, __LINE__, msg->slot); process_add_new_disk(mddev, msg); break; case REMOVE: - pr_info("%s: %d Received REMOVE from %d\n", - __func__, __LINE__, msg->slot); process_remove_disk(mddev, msg); break; case RE_ADD: - pr_info("%s: %d Received RE_ADD from %d\n", - __func__, __LINE__, msg->slot); process_readd_disk(mddev, msg); break; case BITMAP_NEEDS_SYNC: - pr_info("%s: %d Received BITMAP_NEEDS_SYNC from %d\n", - __func__, __LINE__, msg->slot); - __recover_slot(mddev, msg->slot); + __recover_slot(mddev, le32_to_cpu(msg->slot)); break; default: pr_warn("%s:%d Received unknown message from %d\n", @@ -528,11 +533,17 @@ static void recv_daemon(struct md_thread *thread) /* lock_comm() * Takes the lock on the TOKEN lock resource so no other * node can communicate while the operation is underway. + * If called again, and the TOKEN lock is alread in EX mode + * return success. However, care must be taken that unlock_comm() + * is called only once. */ static int lock_comm(struct md_cluster_info *cinfo) { int error; + if (cinfo->token_lockres->mode == DLM_LOCK_EX) + return 0; + error = dlm_lock_sync(cinfo->token_lockres, DLM_LOCK_EX); if (error) pr_err("md-cluster(%s:%d): failed to get EX on TOKEN (%d)\n", @@ -542,6 +553,7 @@ static int lock_comm(struct md_cluster_info *cinfo) static void unlock_comm(struct md_cluster_info *cinfo) { + WARN_ON(cinfo->token_lockres->mode != DLM_LOCK_EX); dlm_unlock_sync(cinfo->token_lockres); } @@ -696,7 +708,6 @@ static int join(struct mddev *mddev, int nodes) init_completion(&cinfo->completion); set_bit(MD_CLUSTER_BEGIN_JOIN_CLUSTER, &cinfo->state); - mutex_init(&cinfo->sb_mutex); mddev->cluster_info = cinfo; memset(str, 0, 64); @@ -753,6 +764,10 @@ static int join(struct mddev *mddev, int nodes) goto err; } + cinfo->resync_lockres = lockres_init(mddev, "resync", NULL, 0); + if (!cinfo->resync_lockres) + goto err; + ret = gather_all_resync_info(mddev, nodes); if (ret) goto err; @@ -763,6 +778,7 @@ err: lockres_free(cinfo->token_lockres); lockres_free(cinfo->ack_lockres); lockres_free(cinfo->no_new_dev_lockres); + lockres_free(cinfo->resync_lockres); lockres_free(cinfo->bitmap_lockres); if (cinfo->lockspace) dlm_release_lockspace(cinfo->lockspace, 2); @@ -771,12 +787,32 @@ err: return ret; } +static void resync_bitmap(struct mddev *mddev) +{ + struct md_cluster_info *cinfo = mddev->cluster_info; + struct cluster_msg cmsg = {0}; + int err; + + cmsg.type = cpu_to_le32(BITMAP_NEEDS_SYNC); + err = sendmsg(cinfo, &cmsg); + if (err) + pr_err("%s:%d: failed to send BITMAP_NEEDS_SYNC message (%d)\n", + __func__, __LINE__, err); +} + static int leave(struct mddev *mddev) { struct md_cluster_info *cinfo = mddev->cluster_info; if (!cinfo) return 0; + + /* BITMAP_NEEDS_SYNC message should be sent when node + * is leaving the cluster with dirty bitmap, also we + * can only deliver it when dlm connection is available */ + if (cinfo->slot_number > 0 && mddev->recovery_cp != MaxSector) + resync_bitmap(mddev); + md_unregister_thread(&cinfo->recovery_thread); md_unregister_thread(&cinfo->recv_thread); lockres_free(cinfo->message_lockres); @@ -799,15 +835,6 @@ static int slot_number(struct mddev *mddev) return cinfo->slot_number - 1; } -static void resync_info_update(struct mddev *mddev, sector_t lo, sector_t hi) -{ - struct md_cluster_info *cinfo = mddev->cluster_info; - - add_resync_info(mddev, cinfo->bitmap_lockres, lo, hi); - /* Re-acquire the lock to refresh LVB */ - dlm_lock_sync(cinfo->bitmap_lockres, DLM_LOCK_PW); -} - static int metadata_update_start(struct mddev *mddev) { return lock_comm(mddev->cluster_info); @@ -817,59 +844,62 @@ static int metadata_update_finish(struct mddev *mddev) { struct md_cluster_info *cinfo = mddev->cluster_info; struct cluster_msg cmsg; - int ret; + struct md_rdev *rdev; + int ret = 0; + int raid_slot = -1; memset(&cmsg, 0, sizeof(cmsg)); cmsg.type = cpu_to_le32(METADATA_UPDATED); - ret = __sendmsg(cinfo, &cmsg); + /* Pick up a good active device number to send. + */ + rdev_for_each(rdev, mddev) + if (rdev->raid_disk > -1 && !test_bit(Faulty, &rdev->flags)) { + raid_slot = rdev->desc_nr; + break; + } + if (raid_slot >= 0) { + cmsg.raid_slot = cpu_to_le32(raid_slot); + ret = __sendmsg(cinfo, &cmsg); + } else + pr_warn("md-cluster: No good device id found to send\n"); unlock_comm(cinfo); return ret; } -static int metadata_update_cancel(struct mddev *mddev) +static void metadata_update_cancel(struct mddev *mddev) { struct md_cluster_info *cinfo = mddev->cluster_info; + unlock_comm(cinfo); +} - return dlm_unlock_sync(cinfo->token_lockres); +static int resync_start(struct mddev *mddev) +{ + struct md_cluster_info *cinfo = mddev->cluster_info; + cinfo->resync_lockres->flags |= DLM_LKF_NOQUEUE; + return dlm_lock_sync(cinfo->resync_lockres, DLM_LOCK_EX); } -static int resync_send(struct mddev *mddev, enum msg_type type, - sector_t lo, sector_t hi) +static int resync_info_update(struct mddev *mddev, sector_t lo, sector_t hi) { struct md_cluster_info *cinfo = mddev->cluster_info; - struct cluster_msg cmsg; - int slot = cinfo->slot_number - 1; + struct cluster_msg cmsg = {0}; - pr_info("%s:%d lo: %llu hi: %llu\n", __func__, __LINE__, - (unsigned long long)lo, - (unsigned long long)hi); - resync_info_update(mddev, lo, hi); - cmsg.type = cpu_to_le32(type); - cmsg.slot = cpu_to_le32(slot); + add_resync_info(cinfo->bitmap_lockres, lo, hi); + /* Re-acquire the lock to refresh LVB */ + dlm_lock_sync(cinfo->bitmap_lockres, DLM_LOCK_PW); + cmsg.type = cpu_to_le32(RESYNCING); cmsg.low = cpu_to_le64(lo); cmsg.high = cpu_to_le64(hi); - return sendmsg(cinfo, &cmsg); -} -static int resync_start(struct mddev *mddev, sector_t lo, sector_t hi) -{ - pr_info("%s:%d\n", __func__, __LINE__); - return resync_send(mddev, RESYNCING, lo, hi); + return sendmsg(cinfo, &cmsg); } -static void resync_finish(struct mddev *mddev) +static int resync_finish(struct mddev *mddev) { struct md_cluster_info *cinfo = mddev->cluster_info; - struct cluster_msg cmsg; - int slot = cinfo->slot_number - 1; - - pr_info("%s:%d\n", __func__, __LINE__); - resync_send(mddev, RESYNCING, 0, 0); - if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { - cmsg.type = cpu_to_le32(BITMAP_NEEDS_SYNC); - cmsg.slot = cpu_to_le32(slot); - sendmsg(cinfo, &cmsg); - } + cinfo->resync_lockres->flags &= ~DLM_LKF_NOQUEUE; + dlm_unlock_sync(cinfo->resync_lockres); + return resync_info_update(mddev, 0, 0); } static int area_resyncing(struct mddev *mddev, int direction, @@ -896,7 +926,11 @@ out: return ret; } -static int add_new_disk_start(struct mddev *mddev, struct md_rdev *rdev) +/* add_new_disk() - initiates a disk add + * However, if this fails before writing md_update_sb(), + * add_new_disk_cancel() must be called to release token lock + */ +static int add_new_disk(struct mddev *mddev, struct md_rdev *rdev) { struct md_cluster_info *cinfo = mddev->cluster_info; struct cluster_msg cmsg; @@ -907,7 +941,7 @@ static int add_new_disk_start(struct mddev *mddev, struct md_rdev *rdev) memset(&cmsg, 0, sizeof(cmsg)); cmsg.type = cpu_to_le32(NEWDISK); memcpy(cmsg.uuid, uuid, 16); - cmsg.raid_slot = rdev->desc_nr; + cmsg.raid_slot = cpu_to_le32(rdev->desc_nr); lock_comm(cinfo); ret = __sendmsg(cinfo, &cmsg); if (ret) @@ -918,22 +952,17 @@ static int add_new_disk_start(struct mddev *mddev, struct md_rdev *rdev) /* Some node does not "see" the device */ if (ret == -EAGAIN) ret = -ENOENT; + if (ret) + unlock_comm(cinfo); else dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR); return ret; } -static int add_new_disk_finish(struct mddev *mddev) +static void add_new_disk_cancel(struct mddev *mddev) { - struct cluster_msg cmsg; struct md_cluster_info *cinfo = mddev->cluster_info; - int ret; - /* Write sb and inform others */ - md_update_sb(mddev, 1); - cmsg.type = METADATA_UPDATED; - ret = __sendmsg(cinfo, &cmsg); unlock_comm(cinfo); - return ret; } static int new_disk_ack(struct mddev *mddev, bool ack) @@ -953,10 +982,10 @@ static int new_disk_ack(struct mddev *mddev, bool ack) static int remove_disk(struct mddev *mddev, struct md_rdev *rdev) { - struct cluster_msg cmsg; + struct cluster_msg cmsg = {0}; struct md_cluster_info *cinfo = mddev->cluster_info; - cmsg.type = REMOVE; - cmsg.raid_slot = rdev->desc_nr; + cmsg.type = cpu_to_le32(REMOVE); + cmsg.raid_slot = cpu_to_le32(rdev->desc_nr); return __sendmsg(cinfo, &cmsg); } @@ -964,12 +993,12 @@ static int gather_bitmaps(struct md_rdev *rdev) { int sn, err; sector_t lo, hi; - struct cluster_msg cmsg; + struct cluster_msg cmsg = {0}; struct mddev *mddev = rdev->mddev; struct md_cluster_info *cinfo = mddev->cluster_info; - cmsg.type = RE_ADD; - cmsg.raid_slot = rdev->desc_nr; + cmsg.type = cpu_to_le32(RE_ADD); + cmsg.raid_slot = cpu_to_le32(rdev->desc_nr); err = sendmsg(cinfo, &cmsg); if (err) goto out; @@ -993,15 +1022,15 @@ static struct md_cluster_operations cluster_ops = { .join = join, .leave = leave, .slot_number = slot_number, - .resync_info_update = resync_info_update, .resync_start = resync_start, .resync_finish = resync_finish, + .resync_info_update = resync_info_update, .metadata_update_start = metadata_update_start, .metadata_update_finish = metadata_update_finish, .metadata_update_cancel = metadata_update_cancel, .area_resyncing = area_resyncing, - .add_new_disk_start = add_new_disk_start, - .add_new_disk_finish = add_new_disk_finish, + .add_new_disk = add_new_disk, + .add_new_disk_cancel = add_new_disk_cancel, .new_disk_ack = new_disk_ack, .remove_disk = remove_disk, .gather_bitmaps = gather_bitmaps, @@ -1022,5 +1051,6 @@ static void cluster_exit(void) module_init(cluster_init); module_exit(cluster_exit); +MODULE_AUTHOR("SUSE"); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Clustering support for MD"); diff --git a/drivers/md/md-cluster.h b/drivers/md/md-cluster.h index 00defe2badbc..e75ea2613184 100644 --- a/drivers/md/md-cluster.h +++ b/drivers/md/md-cluster.h @@ -12,15 +12,15 @@ struct md_cluster_operations { int (*join)(struct mddev *mddev, int nodes); int (*leave)(struct mddev *mddev); int (*slot_number)(struct mddev *mddev); - void (*resync_info_update)(struct mddev *mddev, sector_t lo, sector_t hi); - int (*resync_start)(struct mddev *mddev, sector_t lo, sector_t hi); - void (*resync_finish)(struct mddev *mddev); + int (*resync_info_update)(struct mddev *mddev, sector_t lo, sector_t hi); int (*metadata_update_start)(struct mddev *mddev); int (*metadata_update_finish)(struct mddev *mddev); - int (*metadata_update_cancel)(struct mddev *mddev); + void (*metadata_update_cancel)(struct mddev *mddev); + int (*resync_start)(struct mddev *mddev); + int (*resync_finish)(struct mddev *mddev); int (*area_resyncing)(struct mddev *mddev, int direction, sector_t lo, sector_t hi); - int (*add_new_disk_start)(struct mddev *mddev, struct md_rdev *rdev); - int (*add_new_disk_finish)(struct mddev *mddev); + int (*add_new_disk)(struct mddev *mddev, struct md_rdev *rdev); + void (*add_new_disk_cancel)(struct mddev *mddev); int (*new_disk_ack)(struct mddev *mddev, bool ack); int (*remove_disk)(struct mddev *mddev, struct md_rdev *rdev); int (*gather_bitmaps)(struct md_rdev *rdev); diff --git a/drivers/md/md.c b/drivers/md/md.c index 740ee9bc5ad3..3f9a514b5b9d 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1608,7 +1608,8 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) ++ev1; if (rdev->desc_nr >= 0 && rdev->desc_nr < le32_to_cpu(sb->max_dev) && - le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < 0xfffe) + (le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX || + le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL)) if (ev1 < mddev->events) return -EINVAL; } else if (mddev->bitmap) { @@ -1628,16 +1629,29 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) int role; if (rdev->desc_nr < 0 || rdev->desc_nr >= le32_to_cpu(sb->max_dev)) { - role = 0xffff; + role = MD_DISK_ROLE_SPARE; rdev->desc_nr = -1; } else role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); switch(role) { - case 0xffff: /* spare */ + case MD_DISK_ROLE_SPARE: /* spare */ break; - case 0xfffe: /* faulty */ + case MD_DISK_ROLE_FAULTY: /* faulty */ set_bit(Faulty, &rdev->flags); break; + case MD_DISK_ROLE_JOURNAL: /* journal device */ + if (!(le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)) { + /* journal device without journal feature */ + printk(KERN_WARNING + "md: journal device provided without journal feature, ignoring the device\n"); + return -EINVAL; + } + set_bit(Journal, &rdev->flags); + rdev->journal_tail = le64_to_cpu(sb->journal_tail); + if (mddev->recovery_cp == MaxSector) + set_bit(MD_JOURNAL_CLEAN, &mddev->flags); + rdev->raid_disk = mddev->raid_disks; + break; default: rdev->saved_raid_disk = role; if ((le32_to_cpu(sb->feature_map) & @@ -1655,6 +1669,8 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) set_bit(WriteMostly, &rdev->flags); if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT) set_bit(Replacement, &rdev->flags); + if (le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL) + set_bit(MD_HAS_JOURNAL, &mddev->flags); } else /* MULTIPATH are always insync */ set_bit(In_sync, &rdev->flags); @@ -1679,6 +1695,8 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev) sb->events = cpu_to_le64(mddev->events); if (mddev->in_sync) sb->resync_offset = cpu_to_le64(mddev->recovery_cp); + else if (test_bit(MD_JOURNAL_CLEAN, &mddev->flags)) + sb->resync_offset = cpu_to_le64(MaxSector); else sb->resync_offset = cpu_to_le64(0); @@ -1702,7 +1720,7 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev) sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET); } - if (rdev->raid_disk >= 0 && + if (rdev->raid_disk >= 0 && !test_bit(Journal, &rdev->flags) && !test_bit(In_sync, &rdev->flags)) { sb->feature_map |= cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET); @@ -1712,6 +1730,9 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev) sb->feature_map |= cpu_to_le32(MD_FEATURE_RECOVERY_BITMAP); } + /* Note: recovery_offset and journal_tail share space */ + if (test_bit(Journal, &rdev->flags)) + sb->journal_tail = cpu_to_le64(rdev->journal_tail); if (test_bit(Replacement, &rdev->flags)) sb->feature_map |= cpu_to_le32(MD_FEATURE_REPLACEMENT); @@ -1735,6 +1756,9 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev) } } + if (mddev_is_clustered(mddev)) + sb->feature_map |= cpu_to_le32(MD_FEATURE_CLUSTERED); + if (rdev->badblocks.count == 0) /* Nothing to do for bad blocks*/ ; else if (sb->bblog_offset == 0) @@ -1785,18 +1809,23 @@ retry: max_dev = le32_to_cpu(sb->max_dev); for (i=0; i<max_dev;i++) - sb->dev_roles[i] = cpu_to_le16(0xfffe); + sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_FAULTY); + + if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) + sb->feature_map |= cpu_to_le32(MD_FEATURE_JOURNAL); rdev_for_each(rdev2, mddev) { i = rdev2->desc_nr; if (test_bit(Faulty, &rdev2->flags)) - sb->dev_roles[i] = cpu_to_le16(0xfffe); + sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_FAULTY); else if (test_bit(In_sync, &rdev2->flags)) sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); + else if (test_bit(Journal, &rdev2->flags)) + sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_JOURNAL); else if (rdev2->raid_disk >= 0) sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); else - sb->dev_roles[i] = cpu_to_le16(0xffff); + sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE); } sb->sb_csum = calc_sb_1_csum(sb); @@ -1912,13 +1941,23 @@ static int match_mddev_units(struct mddev *mddev1, struct mddev *mddev2) struct md_rdev *rdev, *rdev2; rcu_read_lock(); - rdev_for_each_rcu(rdev, mddev1) - rdev_for_each_rcu(rdev2, mddev2) + rdev_for_each_rcu(rdev, mddev1) { + if (test_bit(Faulty, &rdev->flags) || + test_bit(Journal, &rdev->flags) || + rdev->raid_disk == -1) + continue; + rdev_for_each_rcu(rdev2, mddev2) { + if (test_bit(Faulty, &rdev2->flags) || + test_bit(Journal, &rdev2->flags) || + rdev2->raid_disk == -1) + continue; if (rdev->bdev->bd_contains == rdev2->bdev->bd_contains) { rcu_read_unlock(); return 1; } + } + } rcu_read_unlock(); return 0; } @@ -2194,23 +2233,77 @@ static void sync_sbs(struct mddev *mddev, int nospares) } } +static bool does_sb_need_changing(struct mddev *mddev) +{ + struct md_rdev *rdev; + struct mdp_superblock_1 *sb; + int role; + + /* Find a good rdev */ + rdev_for_each(rdev, mddev) + if ((rdev->raid_disk >= 0) && !test_bit(Faulty, &rdev->flags)) + break; + + /* No good device found. */ + if (!rdev) + return false; + + sb = page_address(rdev->sb_page); + /* Check if a device has become faulty or a spare become active */ + rdev_for_each(rdev, mddev) { + role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); + /* Device activated? */ + if (role == 0xffff && rdev->raid_disk >=0 && + !test_bit(Faulty, &rdev->flags)) + return true; + /* Device turned faulty? */ + if (test_bit(Faulty, &rdev->flags) && (role < 0xfffd)) + return true; + } + + /* Check if any mddev parameters have changed */ + if ((mddev->dev_sectors != le64_to_cpu(sb->size)) || + (mddev->reshape_position != le64_to_cpu(sb->reshape_position)) || + (mddev->layout != le64_to_cpu(sb->layout)) || + (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) || + (mddev->chunk_sectors != le32_to_cpu(sb->chunksize))) + return true; + + return false; +} + void md_update_sb(struct mddev *mddev, int force_change) { struct md_rdev *rdev; int sync_req; int nospares = 0; int any_badblocks_changed = 0; + int ret = -1; if (mddev->ro) { if (force_change) set_bit(MD_CHANGE_DEVS, &mddev->flags); return; } + + if (mddev_is_clustered(mddev)) { + if (test_and_clear_bit(MD_CHANGE_DEVS, &mddev->flags)) + force_change = 1; + ret = md_cluster_ops->metadata_update_start(mddev); + /* Has someone else has updated the sb */ + if (!does_sb_need_changing(mddev)) { + if (ret == 0) + md_cluster_ops->metadata_update_cancel(mddev); + clear_bit(MD_CHANGE_PENDING, &mddev->flags); + return; + } + } repeat: /* First make sure individual recovery_offsets are correct */ rdev_for_each(rdev, mddev) { if (rdev->raid_disk >= 0 && mddev->delta_disks >= 0 && + !test_bit(Journal, &rdev->flags) && !test_bit(In_sync, &rdev->flags) && mddev->curr_resync_completed > rdev->recovery_offset) rdev->recovery_offset = mddev->curr_resync_completed; @@ -2354,6 +2447,9 @@ repeat: clear_bit(BlockedBadBlocks, &rdev->flags); wake_up(&rdev->blocked_wait); } + + if (mddev_is_clustered(mddev) && ret == 0) + md_cluster_ops->metadata_update_finish(mddev); } EXPORT_SYMBOL(md_update_sb); @@ -2429,6 +2525,10 @@ state_show(struct md_rdev *rdev, char *page) len += sprintf(page+len, "%sin_sync",sep); sep = ","; } + if (test_bit(Journal, &flags)) { + len += sprintf(page+len, "%sjournal",sep); + sep = ","; + } if (test_bit(WriteMostly, &flags)) { len += sprintf(page+len, "%swrite_mostly",sep); sep = ","; @@ -2440,6 +2540,7 @@ state_show(struct md_rdev *rdev, char *page) sep = ","; } if (!test_bit(Faulty, &flags) && + !test_bit(Journal, &flags) && !test_bit(In_sync, &flags)) { len += sprintf(page+len, "%sspare", sep); sep = ","; @@ -2488,17 +2589,16 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len) err = -EBUSY; else { struct mddev *mddev = rdev->mddev; - if (mddev_is_clustered(mddev)) - md_cluster_ops->remove_disk(mddev, rdev); - md_kick_rdev_from_array(rdev); - if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_start(mddev); - if (mddev->pers) - md_update_sb(mddev, 1); - md_new_event(mddev); - if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_finish(mddev); err = 0; + if (mddev_is_clustered(mddev)) + err = md_cluster_ops->remove_disk(mddev, rdev); + + if (err == 0) { + md_kick_rdev_from_array(rdev); + if (mddev->pers) + md_update_sb(mddev, 1); + md_new_event(mddev); + } } } else if (cmd_match(buf, "writemostly")) { set_bit(WriteMostly, &rdev->flags); @@ -2527,7 +2627,8 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len) } else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) { set_bit(In_sync, &rdev->flags); err = 0; - } else if (cmd_match(buf, "-insync") && rdev->raid_disk >= 0) { + } else if (cmd_match(buf, "-insync") && rdev->raid_disk >= 0 && + !test_bit(Journal, &rdev->flags)) { if (rdev->mddev->pers == NULL) { clear_bit(In_sync, &rdev->flags); rdev->saved_raid_disk = rdev->raid_disk; @@ -2546,6 +2647,7 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len) * check if recovery is needed. */ if (rdev->raid_disk >= 0 && + !test_bit(Journal, &rdev->flags) && !test_bit(Replacement, &rdev->flags)) set_bit(WantReplacement, &rdev->flags); set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); @@ -2623,7 +2725,9 @@ __ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store); static ssize_t slot_show(struct md_rdev *rdev, char *page) { - if (rdev->raid_disk < 0) + if (test_bit(Journal, &rdev->flags)) + return sprintf(page, "journal\n"); + else if (rdev->raid_disk < 0) return sprintf(page, "none\n"); else return sprintf(page, "%d\n", rdev->raid_disk); @@ -2635,6 +2739,8 @@ slot_store(struct md_rdev *rdev, const char *buf, size_t len) int slot; int err; + if (test_bit(Journal, &rdev->flags)) + return -EBUSY; if (strncmp(buf, "none", 4)==0) slot = -1; else { @@ -2686,15 +2792,9 @@ slot_store(struct md_rdev *rdev, const char *buf, size_t len) rdev->saved_raid_disk = -1; clear_bit(In_sync, &rdev->flags); clear_bit(Bitmap_sync, &rdev->flags); - err = rdev->mddev->pers-> - hot_add_disk(rdev->mddev, rdev); - if (err) { - rdev->raid_disk = -1; - return err; - } else - sysfs_notify_dirent_safe(rdev->sysfs_state); - if (sysfs_link_rdev(rdev->mddev, rdev)) - /* failure here is OK */; + remove_and_add_spares(rdev->mddev, rdev); + if (rdev->raid_disk == -1) + return -EBUSY; /* don't wakeup anyone, leave that to userspace. */ } else { if (slot >= rdev->mddev->raid_disks && @@ -2839,6 +2939,8 @@ rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len) sector_t oldsectors = rdev->sectors; sector_t sectors; + if (test_bit(Journal, &rdev->flags)) + return -EBUSY; if (strict_blocks_to_sectors(buf, §ors) < 0) return -EINVAL; if (rdev->data_offset != rdev->new_data_offset) @@ -3196,20 +3298,14 @@ static void analyze_sbs(struct mddev *mddev) md_kick_rdev_from_array(rdev); continue; } - /* No device should have a Candidate flag - * when reading devices - */ - if (test_bit(Candidate, &rdev->flags)) { - pr_info("md: kicking Cluster Candidate %s from array!\n", - bdevname(rdev->bdev, b)); - md_kick_rdev_from_array(rdev); - } } if (mddev->level == LEVEL_MULTIPATH) { rdev->desc_nr = i++; rdev->raid_disk = rdev->desc_nr; set_bit(In_sync, &rdev->flags); - } else if (rdev->raid_disk >= (mddev->raid_disks - min(0, mddev->delta_disks))) { + } else if (rdev->raid_disk >= + (mddev->raid_disks - min(0, mddev->delta_disks)) && + !test_bit(Journal, &rdev->flags)) { rdev->raid_disk = -1; clear_bit(In_sync, &rdev->flags); } @@ -3267,6 +3363,11 @@ safe_delay_store(struct mddev *mddev, const char *cbuf, size_t len) { unsigned long msec; + if (mddev_is_clustered(mddev)) { + pr_info("md: Safemode is disabled for clustered mode\n"); + return -EINVAL; + } + if (strict_strtoul_scaled(cbuf, &msec, 3) < 0) return -EINVAL; if (msec == 0) @@ -3867,7 +3968,9 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len) break; case clean: if (mddev->pers) { - restart_array(mddev); + err = restart_array(mddev); + if (err) + break; spin_lock(&mddev->lock); if (atomic_read(&mddev->writes_pending) == 0) { if (mddev->in_sync == 0) { @@ -3885,7 +3988,9 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len) break; case active: if (mddev->pers) { - restart_array(mddev); + err = restart_array(mddev); + if (err) + break; clear_bit(MD_CHANGE_PENDING, &mddev->flags); wake_up(&mddev->sb_wait); err = 0; @@ -4064,12 +4169,8 @@ size_store(struct mddev *mddev, const char *buf, size_t len) if (err) return err; if (mddev->pers) { - if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_start(mddev); err = update_size(mddev, sectors); md_update_sb(mddev, 1); - if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_finish(mddev); } else { if (mddev->dev_sectors == 0 || mddev->dev_sectors > sectors) @@ -5181,7 +5282,10 @@ int md_run(struct mddev *mddev) atomic_set(&mddev->max_corr_read_errors, MD_DEFAULT_MAX_CORRECTED_READ_ERRORS); mddev->safemode = 0; - mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */ + if (mddev_is_clustered(mddev)) + mddev->safemode_delay = 0; + else + mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */ mddev->in_sync = 1; smp_wmb(); spin_lock(&mddev->lock); @@ -5224,6 +5328,9 @@ static int do_md_run(struct mddev *mddev) goto out; } + if (mddev_is_clustered(mddev)) + md_allow_write(mddev); + md_wakeup_thread(mddev->thread); md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ @@ -5246,6 +5353,25 @@ static int restart_array(struct mddev *mddev) return -EINVAL; if (!mddev->ro) return -EBUSY; + if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) { + struct md_rdev *rdev; + bool has_journal = false; + + rcu_read_lock(); + rdev_for_each_rcu(rdev, mddev) { + if (test_bit(Journal, &rdev->flags) && + !test_bit(Faulty, &rdev->flags)) { + has_journal = true; + break; + } + } + rcu_read_unlock(); + + /* Don't restart rw with journal missing/faulty */ + if (!has_journal) + return -EINVAL; + } + mddev->safemode = 0; mddev->ro = 0; set_disk_ro(disk, 0); @@ -5307,8 +5433,6 @@ static void md_clean(struct mddev *mddev) static void __md_stop_writes(struct mddev *mddev) { - if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_start(mddev); set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); flush_workqueue(md_misc_wq); if (mddev->sync_thread) { @@ -5322,13 +5446,13 @@ static void __md_stop_writes(struct mddev *mddev) md_super_wait(mddev); if (mddev->ro == 0 && - (!mddev->in_sync || (mddev->flags & MD_UPDATE_SB_FLAGS))) { + ((!mddev->in_sync && !mddev_is_clustered(mddev)) || + (mddev->flags & MD_UPDATE_SB_FLAGS))) { /* mark array as shutdown cleanly */ - mddev->in_sync = 1; + if (!mddev_is_clustered(mddev)) + mddev->in_sync = 1; md_update_sb(mddev, 1); } - if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_finish(mddev); } void md_stop_writes(struct mddev *mddev) @@ -5789,6 +5913,8 @@ static int get_disk_info(struct mddev *mddev, void __user * arg) info.state |= (1<<MD_DISK_ACTIVE); info.state |= (1<<MD_DISK_SYNC); } + if (test_bit(Journal, &rdev->flags)) + info.state |= (1<<MD_DISK_JOURNAL); if (test_bit(WriteMostly, &rdev->flags)) info.state |= (1<<MD_DISK_WRITEMOSTLY); } else { @@ -5903,23 +6029,18 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info) else clear_bit(WriteMostly, &rdev->flags); + if (info->state & (1<<MD_DISK_JOURNAL)) + set_bit(Journal, &rdev->flags); /* * check whether the device shows up in other nodes */ if (mddev_is_clustered(mddev)) { - if (info->state & (1 << MD_DISK_CANDIDATE)) { - /* Through --cluster-confirm */ + if (info->state & (1 << MD_DISK_CANDIDATE)) set_bit(Candidate, &rdev->flags); - err = md_cluster_ops->new_disk_ack(mddev, true); - if (err) { - export_rdev(rdev); - return err; - } - } else if (info->state & (1 << MD_DISK_CLUSTER_ADD)) { + else if (info->state & (1 << MD_DISK_CLUSTER_ADD)) { /* --add initiated by this node */ - err = md_cluster_ops->add_new_disk_start(mddev, rdev); + err = md_cluster_ops->add_new_disk(mddev, rdev); if (err) { - md_cluster_ops->add_new_disk_finish(mddev); export_rdev(rdev); return err; } @@ -5928,13 +6049,23 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info) rdev->raid_disk = -1; err = bind_rdev_to_array(rdev, mddev); + if (err) export_rdev(rdev); - else + + if (mddev_is_clustered(mddev)) { + if (info->state & (1 << MD_DISK_CANDIDATE)) + md_cluster_ops->new_disk_ack(mddev, (err == 0)); + else { + if (err) + md_cluster_ops->add_new_disk_cancel(mddev); + else + err = add_bound_rdev(rdev); + } + + } else if (!err) err = add_bound_rdev(rdev); - if (mddev_is_clustered(mddev) && - (info->state & (1 << MD_DISK_CLUSTER_ADD))) - md_cluster_ops->add_new_disk_finish(mddev); + return err; } @@ -5990,13 +6121,17 @@ static int hot_remove_disk(struct mddev *mddev, dev_t dev) { char b[BDEVNAME_SIZE]; struct md_rdev *rdev; + int ret = -1; rdev = find_rdev(mddev, dev); if (!rdev) return -ENXIO; if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_start(mddev); + ret = md_cluster_ops->metadata_update_start(mddev); + + if (rdev->raid_disk < 0) + goto kick_rdev; clear_bit(Blocked, &rdev->flags); remove_and_add_spares(mddev, rdev); @@ -6004,20 +6139,19 @@ static int hot_remove_disk(struct mddev *mddev, dev_t dev) if (rdev->raid_disk >= 0) goto busy; - if (mddev_is_clustered(mddev)) +kick_rdev: + if (mddev_is_clustered(mddev) && ret == 0) md_cluster_ops->remove_disk(mddev, rdev); md_kick_rdev_from_array(rdev); md_update_sb(mddev, 1); md_new_event(mddev); - if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_finish(mddev); - return 0; busy: - if (mddev_is_clustered(mddev)) + if (mddev_is_clustered(mddev) && ret == 0) md_cluster_ops->metadata_update_cancel(mddev); + printk(KERN_WARNING "md: cannot remove active disk %s from %s ...\n", bdevname(rdev->bdev,b), mdname(mddev)); return -EBUSY; @@ -6068,14 +6202,12 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev) goto abort_export; } - if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_start(mddev); clear_bit(In_sync, &rdev->flags); rdev->desc_nr = -1; rdev->saved_raid_disk = -1; err = bind_rdev_to_array(rdev, mddev); if (err) - goto abort_clustered; + goto abort_export; /* * The rest should better be atomic, we can have disk failures @@ -6085,9 +6217,6 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev) rdev->raid_disk = -1; md_update_sb(mddev, 1); - - if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_finish(mddev); /* * Kick recovery, maybe this spare has to be added to the * array immediately. @@ -6097,9 +6226,6 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev) md_new_event(mddev); return 0; -abort_clustered: - if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_cancel(mddev); abort_export: export_rdev(rdev); return err; @@ -6417,8 +6543,6 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) return rv; } } - if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_start(mddev); if (info->size >= 0 && mddev->dev_sectors / 2 != info->size) rv = update_size(mddev, (sector_t)info->size * 2); @@ -6476,12 +6600,8 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) } } md_update_sb(mddev, 1); - if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_finish(mddev); return rv; err: - if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_cancel(mddev); return rv; } @@ -7282,6 +7402,8 @@ static int md_seq_show(struct seq_file *seq, void *v) bdevname(rdev->bdev,b), rdev->desc_nr); if (test_bit(WriteMostly, &rdev->flags)) seq_printf(seq, "(W)"); + if (test_bit(Journal, &rdev->flags)) + seq_printf(seq, "(J)"); if (test_bit(Faulty, &rdev->flags)) { seq_printf(seq, "(F)"); continue; @@ -7594,11 +7716,7 @@ int md_allow_write(struct mddev *mddev) mddev->safemode == 0) mddev->safemode = 1; spin_unlock(&mddev->lock); - if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_start(mddev); md_update_sb(mddev, 0); - if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_finish(mddev); sysfs_notify_dirent_safe(mddev->sysfs_state); } else spin_unlock(&mddev->lock); @@ -7630,6 +7748,7 @@ void md_do_sync(struct md_thread *thread) struct md_rdev *rdev; char *desc, *action = NULL; struct blk_plug plug; + bool cluster_resync_finished = false; /* just incase thread restarts... */ if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) @@ -7739,6 +7858,7 @@ void md_do_sync(struct md_thread *thread) rcu_read_lock(); rdev_for_each_rcu(rdev, mddev) if (rdev->raid_disk >= 0 && + !test_bit(Journal, &rdev->flags) && !test_bit(Faulty, &rdev->flags) && !test_bit(In_sync, &rdev->flags) && rdev->recovery_offset < j) @@ -7799,9 +7919,6 @@ void md_do_sync(struct md_thread *thread) md_new_event(mddev); update_time = jiffies; - if (mddev_is_clustered(mddev)) - md_cluster_ops->resync_start(mddev, j, max_sectors); - blk_start_plug(&plug); while (j < max_sectors) { sector_t sectors; @@ -7865,8 +7982,6 @@ void md_do_sync(struct md_thread *thread) j = max_sectors; if (j > 2) mddev->curr_resync = j; - if (mddev_is_clustered(mddev)) - md_cluster_ops->resync_info_update(mddev, j, max_sectors); mddev->curr_mark_cnt = io_sectors; if (last_check == 0) /* this is the earliest that rebuild will be @@ -7937,7 +8052,11 @@ void md_do_sync(struct md_thread *thread) mddev->curr_resync_completed = mddev->curr_resync; sysfs_notify(&mddev->kobj, NULL, "sync_completed"); } - /* tell personality that we are finished */ + /* tell personality and other nodes that we are finished */ + if (mddev_is_clustered(mddev)) { + md_cluster_ops->resync_finish(mddev); + cluster_resync_finished = true; + } mddev->pers->sync_request(mddev, max_sectors, &skipped); if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) && @@ -7965,6 +8084,7 @@ void md_do_sync(struct md_thread *thread) rdev_for_each_rcu(rdev, mddev) if (rdev->raid_disk >= 0 && mddev->delta_disks >= 0 && + !test_bit(Journal, &rdev->flags) && !test_bit(Faulty, &rdev->flags) && !test_bit(In_sync, &rdev->flags) && rdev->recovery_offset < mddev->curr_resync) @@ -7973,11 +8093,13 @@ void md_do_sync(struct md_thread *thread) } } skip: - if (mddev_is_clustered(mddev)) - md_cluster_ops->resync_finish(mddev); - set_bit(MD_CHANGE_DEVS, &mddev->flags); + if (mddev_is_clustered(mddev) && + test_bit(MD_RECOVERY_INTR, &mddev->recovery) && + !cluster_resync_finished) + md_cluster_ops->resync_finish(mddev); + spin_lock(&mddev->lock); if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { /* We completed so min/max setting can be forgotten if used. */ @@ -8008,7 +8130,8 @@ static int remove_and_add_spares(struct mddev *mddev, rdev->raid_disk >= 0 && !test_bit(Blocked, &rdev->flags) && (test_bit(Faulty, &rdev->flags) || - ! test_bit(In_sync, &rdev->flags)) && + (!test_bit(In_sync, &rdev->flags) && + !test_bit(Journal, &rdev->flags))) && atomic_read(&rdev->nr_pending)==0) { if (mddev->pers->hot_remove_disk( mddev, rdev) == 0) { @@ -8020,18 +8143,25 @@ static int remove_and_add_spares(struct mddev *mddev, if (removed && mddev->kobj.sd) sysfs_notify(&mddev->kobj, NULL, "degraded"); - if (this) + if (this && removed) goto no_add; rdev_for_each(rdev, mddev) { + if (this && this != rdev) + continue; + if (test_bit(Candidate, &rdev->flags)) + continue; if (rdev->raid_disk >= 0 && !test_bit(In_sync, &rdev->flags) && + !test_bit(Journal, &rdev->flags) && !test_bit(Faulty, &rdev->flags)) spares++; if (rdev->raid_disk >= 0) continue; if (test_bit(Faulty, &rdev->flags)) continue; + if (test_bit(Journal, &rdev->flags)) + continue; if (mddev->ro && ! (rdev->saved_raid_disk >= 0 && !test_bit(Bitmap_sync, &rdev->flags))) @@ -8056,14 +8186,25 @@ no_add: static void md_start_sync(struct work_struct *ws) { struct mddev *mddev = container_of(ws, struct mddev, del_work); + int ret = 0; + + if (mddev_is_clustered(mddev)) { + ret = md_cluster_ops->resync_start(mddev); + if (ret) { + mddev->sync_thread = NULL; + goto out; + } + } mddev->sync_thread = md_register_thread(md_do_sync, mddev, "resync"); +out: if (!mddev->sync_thread) { - printk(KERN_ERR "%s: could not start resync" - " thread...\n", - mdname(mddev)); + if (!(mddev_is_clustered(mddev) && ret == -EAGAIN)) + printk(KERN_ERR "%s: could not start resync" + " thread...\n", + mdname(mddev)); /* leave the spares where they are, it shouldn't hurt */ clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); @@ -8182,13 +8323,8 @@ void md_check_recovery(struct mddev *mddev) sysfs_notify_dirent_safe(mddev->sysfs_state); } - if (mddev->flags & MD_UPDATE_SB_FLAGS) { - if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_start(mddev); + if (mddev->flags & MD_UPDATE_SB_FLAGS) md_update_sb(mddev, 0); - if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_finish(mddev); - } if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) { @@ -8286,8 +8422,6 @@ void md_reap_sync_thread(struct mddev *mddev) set_bit(MD_CHANGE_DEVS, &mddev->flags); } } - if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_start(mddev); if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && mddev->pers->finish_reshape) mddev->pers->finish_reshape(mddev); @@ -8300,8 +8434,6 @@ void md_reap_sync_thread(struct mddev *mddev) rdev->saved_raid_disk = -1; md_update_sb(mddev, 1); - if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_finish(mddev); clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); clear_bit(MD_RECOVERY_DONE, &mddev->recovery); clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); @@ -8924,25 +9056,128 @@ err_wq: return ret; } -void md_reload_sb(struct mddev *mddev) +static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev) { - struct md_rdev *rdev, *tmp; + struct mdp_superblock_1 *sb = page_address(rdev->sb_page); + struct md_rdev *rdev2; + int role, ret; + char b[BDEVNAME_SIZE]; - rdev_for_each_safe(rdev, tmp, mddev) { - rdev->sb_loaded = 0; - ClearPageUptodate(rdev->sb_page); + /* Check for change of roles in the active devices */ + rdev_for_each(rdev2, mddev) { + if (test_bit(Faulty, &rdev2->flags)) + continue; + + /* Check if the roles changed */ + role = le16_to_cpu(sb->dev_roles[rdev2->desc_nr]); + + if (test_bit(Candidate, &rdev2->flags)) { + if (role == 0xfffe) { + pr_info("md: Removing Candidate device %s because add failed\n", bdevname(rdev2->bdev,b)); + md_kick_rdev_from_array(rdev2); + continue; + } + else + clear_bit(Candidate, &rdev2->flags); + } + + if (role != rdev2->raid_disk) { + /* got activated */ + if (rdev2->raid_disk == -1 && role != 0xffff) { + rdev2->saved_raid_disk = role; + ret = remove_and_add_spares(mddev, rdev2); + pr_info("Activated spare: %s\n", + bdevname(rdev2->bdev,b)); + continue; + } + /* device faulty + * We just want to do the minimum to mark the disk + * as faulty. The recovery is performed by the + * one who initiated the error. + */ + if ((role == 0xfffe) || (role == 0xfffd)) { + md_error(mddev, rdev2); + clear_bit(Blocked, &rdev2->flags); + } + } } - mddev->raid_disks = 0; - analyze_sbs(mddev); - rdev_for_each_safe(rdev, tmp, mddev) { - struct mdp_superblock_1 *sb = page_address(rdev->sb_page); - /* since we don't write to faulty devices, we figure out if the - * disk is faulty by comparing events - */ - if (mddev->events > sb->events) - set_bit(Faulty, &rdev->flags); + + if (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) + update_raid_disks(mddev, le32_to_cpu(sb->raid_disks)); + + /* Finally set the event to be up to date */ + mddev->events = le64_to_cpu(sb->events); +} + +static int read_rdev(struct mddev *mddev, struct md_rdev *rdev) +{ + int err; + struct page *swapout = rdev->sb_page; + struct mdp_superblock_1 *sb; + + /* Store the sb page of the rdev in the swapout temporary + * variable in case we err in the future + */ + rdev->sb_page = NULL; + alloc_disk_sb(rdev); + ClearPageUptodate(rdev->sb_page); + rdev->sb_loaded = 0; + err = super_types[mddev->major_version].load_super(rdev, NULL, mddev->minor_version); + + if (err < 0) { + pr_warn("%s: %d Could not reload rdev(%d) err: %d. Restoring old values\n", + __func__, __LINE__, rdev->desc_nr, err); + put_page(rdev->sb_page); + rdev->sb_page = swapout; + rdev->sb_loaded = 1; + return err; } + sb = page_address(rdev->sb_page); + /* Read the offset unconditionally, even if MD_FEATURE_RECOVERY_OFFSET + * is not set + */ + + if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RECOVERY_OFFSET)) + rdev->recovery_offset = le64_to_cpu(sb->recovery_offset); + + /* The other node finished recovery, call spare_active to set + * device In_sync and mddev->degraded + */ + if (rdev->recovery_offset == MaxSector && + !test_bit(In_sync, &rdev->flags) && + mddev->pers->spare_active(mddev)) + sysfs_notify(&mddev->kobj, NULL, "degraded"); + + put_page(swapout); + return 0; +} + +void md_reload_sb(struct mddev *mddev, int nr) +{ + struct md_rdev *rdev; + int err; + + /* Find the rdev */ + rdev_for_each_rcu(rdev, mddev) { + if (rdev->desc_nr == nr) + break; + } + + if (!rdev || rdev->desc_nr != nr) { + pr_warn("%s: %d Could not find rdev with nr %d\n", __func__, __LINE__, nr); + return; + } + + err = read_rdev(mddev, rdev); + if (err < 0) + return; + + check_sb_changes(mddev, rdev); + + /* Read all rdev's to update recovery_offset */ + rdev_for_each_rcu(rdev, mddev) + read_rdev(mddev, rdev); } EXPORT_SYMBOL(md_reload_sb); diff --git a/drivers/md/md.h b/drivers/md/md.h index ab339571e57f..2bea51edfab7 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -87,10 +87,16 @@ struct md_rdev { * array and could again if we did a partial * resync from the bitmap */ - sector_t recovery_offset;/* If this device has been partially + union { + sector_t recovery_offset;/* If this device has been partially * recovered, this is where we were * up to. */ + sector_t journal_tail; /* If this device is a journal device, + * this is the journal tail (journal + * recovery start point) + */ + }; atomic_t nr_pending; /* number of pending requests. * only maintained for arrays that @@ -172,6 +178,11 @@ enum flag_bits { * This device is seen locally but not * by the whole cluster */ + Journal, /* This device is used as journal for + * raid-5/6. + * Usually, this device should be faster + * than other devices in the array + */ }; #define BB_LEN_MASK (0x00000000000001FFULL) @@ -221,6 +232,8 @@ struct mddev { #define MD_STILL_CLOSED 4 /* If set, then array has not been opened since * md_ioctl checked on it. */ +#define MD_JOURNAL_CLEAN 5 /* A raid with journal is already clean */ +#define MD_HAS_JOURNAL 6 /* The raid array has journal feature set */ int suspended; atomic_t active_io; @@ -658,7 +671,7 @@ extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs, struct mddev *mddev); extern void md_unplug(struct blk_plug_cb *cb, bool from_schedule); -extern void md_reload_sb(struct mddev *mddev); +extern void md_reload_sb(struct mddev *mddev, int raid_disk); extern void md_update_sb(struct mddev *mddev, int force); extern void md_kick_rdev_from_array(struct md_rdev * rdev); struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr); diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 33e59876678b..e2169ff6e0f0 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -90,6 +90,8 @@ static void r1bio_pool_free(void *r1_bio, void *data) #define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE) #define RESYNC_WINDOW (RESYNC_BLOCK_SIZE * RESYNC_DEPTH) #define RESYNC_WINDOW_SECTORS (RESYNC_WINDOW >> 9) +#define CLUSTER_RESYNC_WINDOW (16 * RESYNC_WINDOW) +#define CLUSTER_RESYNC_WINDOW_SECTORS (CLUSTER_RESYNC_WINDOW >> 9) #define NEXT_NORMALIO_DISTANCE (3 * RESYNC_WINDOW_SECTORS) static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data) @@ -1590,6 +1592,15 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev) if (rdev->raid_disk >= 0) first = last = rdev->raid_disk; + /* + * find the disk ... but prefer rdev->saved_raid_disk + * if possible. + */ + if (rdev->saved_raid_disk >= 0 && + rdev->saved_raid_disk >= first && + conf->mirrors[rdev->saved_raid_disk].rdev == NULL) + first = last = rdev->saved_raid_disk; + for (mirror = first; mirror <= last; mirror++) { p = conf->mirrors+mirror; if (!p->rdev) { @@ -2495,6 +2506,11 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp bitmap_close_sync(mddev->bitmap); close_sync(conf); + + if (mddev_is_clustered(mddev)) { + conf->cluster_sync_low = 0; + conf->cluster_sync_high = 0; + } return 0; } @@ -2515,7 +2531,12 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp return sync_blocks; } - bitmap_cond_end_sync(mddev->bitmap, sector_nr); + /* we are incrementing sector_nr below. To be safe, we check against + * sector_nr + two times RESYNC_SECTORS + */ + + bitmap_cond_end_sync(mddev->bitmap, sector_nr, + mddev_is_clustered(mddev) && (sector_nr + 2 * RESYNC_SECTORS > conf->cluster_sync_high)); r1_bio = mempool_alloc(conf->r1buf_pool, GFP_NOIO); raise_barrier(conf, sector_nr); @@ -2706,6 +2727,16 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp bio_full: r1_bio->sectors = nr_sectors; + if (mddev_is_clustered(mddev) && + conf->cluster_sync_high < sector_nr + nr_sectors) { + conf->cluster_sync_low = mddev->curr_resync_completed; + conf->cluster_sync_high = conf->cluster_sync_low + CLUSTER_RESYNC_WINDOW_SECTORS; + /* Send resync message */ + md_cluster_ops->resync_info_update(mddev, + conf->cluster_sync_low, + conf->cluster_sync_high); + } + /* For a user-requested sync, we read all readable devices and do a * compare */ @@ -3020,9 +3051,11 @@ static int raid1_reshape(struct mddev *mddev) return -EINVAL; } - err = md_allow_write(mddev); - if (err) - return err; + if (!mddev_is_clustered(mddev)) { + err = md_allow_write(mddev); + if (err) + return err; + } raid_disks = mddev->raid_disks + mddev->delta_disks; diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h index c52d7139c5d7..61c39b390cd8 100644 --- a/drivers/md/raid1.h +++ b/drivers/md/raid1.h @@ -111,6 +111,13 @@ struct r1conf { * the new thread here until we fully activate the array. */ struct md_thread *thread; + + /* Keep track of cluster resync window to send to other + * nodes. + */ + sector_t cluster_sync_low; + sector_t cluster_sync_high; + }; /* diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 826210f095be..41d70bc9ba2f 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -3149,7 +3149,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, /* resync. Schedule a read for every block at this virt offset */ int count = 0; - bitmap_cond_end_sync(mddev->bitmap, sector_nr); + bitmap_cond_end_sync(mddev->bitmap, sector_nr, 0); if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, mddev->degraded) && diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c new file mode 100644 index 000000000000..b887e04d7e5c --- /dev/null +++ b/drivers/md/raid5-cache.c @@ -0,0 +1,1191 @@ +/* + * Copyright (C) 2015 Shaohua Li <shli@fb.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + */ +#include <linux/kernel.h> +#include <linux/wait.h> +#include <linux/blkdev.h> +#include <linux/slab.h> +#include <linux/raid/md_p.h> +#include <linux/crc32c.h> +#include <linux/random.h> +#include "md.h" +#include "raid5.h" + +/* + * metadata/data stored in disk with 4k size unit (a block) regardless + * underneath hardware sector size. only works with PAGE_SIZE == 4096 + */ +#define BLOCK_SECTORS (8) + +/* + * reclaim runs every 1/4 disk size or 10G reclaimable space. This can prevent + * recovery scans a very long log + */ +#define RECLAIM_MAX_FREE_SPACE (10 * 1024 * 1024 * 2) /* sector */ +#define RECLAIM_MAX_FREE_SPACE_SHIFT (2) + +struct r5l_log { + struct md_rdev *rdev; + + u32 uuid_checksum; + + sector_t device_size; /* log device size, round to + * BLOCK_SECTORS */ + sector_t max_free_space; /* reclaim run if free space is at + * this size */ + + sector_t last_checkpoint; /* log tail. where recovery scan + * starts from */ + u64 last_cp_seq; /* log tail sequence */ + + sector_t log_start; /* log head. where new data appends */ + u64 seq; /* log head sequence */ + + sector_t next_checkpoint; + u64 next_cp_seq; + + struct mutex io_mutex; + struct r5l_io_unit *current_io; /* current io_unit accepting new data */ + + spinlock_t io_list_lock; + struct list_head running_ios; /* io_units which are still running, + * and have not yet been completely + * written to the log */ + struct list_head io_end_ios; /* io_units which have been completely + * written to the log but not yet written + * to the RAID */ + struct list_head flushing_ios; /* io_units which are waiting for log + * cache flush */ + struct list_head finished_ios; /* io_units which settle down in log disk */ + struct bio flush_bio; + + struct kmem_cache *io_kc; + + struct md_thread *reclaim_thread; + unsigned long reclaim_target; /* number of space that need to be + * reclaimed. if it's 0, reclaim spaces + * used by io_units which are in + * IO_UNIT_STRIPE_END state (eg, reclaim + * dones't wait for specific io_unit + * switching to IO_UNIT_STRIPE_END + * state) */ + wait_queue_head_t iounit_wait; + + struct list_head no_space_stripes; /* pending stripes, log has no space */ + spinlock_t no_space_stripes_lock; + + bool need_cache_flush; + bool in_teardown; +}; + +/* + * an IO range starts from a meta data block and end at the next meta data + * block. The io unit's the meta data block tracks data/parity followed it. io + * unit is written to log disk with normal write, as we always flush log disk + * first and then start move data to raid disks, there is no requirement to + * write io unit with FLUSH/FUA + */ +struct r5l_io_unit { + struct r5l_log *log; + + struct page *meta_page; /* store meta block */ + int meta_offset; /* current offset in meta_page */ + + struct bio *current_bio;/* current_bio accepting new data */ + + atomic_t pending_stripe;/* how many stripes not flushed to raid */ + u64 seq; /* seq number of the metablock */ + sector_t log_start; /* where the io_unit starts */ + sector_t log_end; /* where the io_unit ends */ + struct list_head log_sibling; /* log->running_ios */ + struct list_head stripe_list; /* stripes added to the io_unit */ + + int state; + bool need_split_bio; +}; + +/* r5l_io_unit state */ +enum r5l_io_unit_state { + IO_UNIT_RUNNING = 0, /* accepting new IO */ + IO_UNIT_IO_START = 1, /* io_unit bio start writing to log, + * don't accepting new bio */ + IO_UNIT_IO_END = 2, /* io_unit bio finish writing to log */ + IO_UNIT_STRIPE_END = 3, /* stripes data finished writing to raid */ +}; + +static sector_t r5l_ring_add(struct r5l_log *log, sector_t start, sector_t inc) +{ + start += inc; + if (start >= log->device_size) + start = start - log->device_size; + return start; +} + +static sector_t r5l_ring_distance(struct r5l_log *log, sector_t start, + sector_t end) +{ + if (end >= start) + return end - start; + else + return end + log->device_size - start; +} + +static bool r5l_has_free_space(struct r5l_log *log, sector_t size) +{ + sector_t used_size; + + used_size = r5l_ring_distance(log, log->last_checkpoint, + log->log_start); + + return log->device_size > used_size + size; +} + +static void r5l_free_io_unit(struct r5l_log *log, struct r5l_io_unit *io) +{ + __free_page(io->meta_page); + kmem_cache_free(log->io_kc, io); +} + +static void r5l_move_io_unit_list(struct list_head *from, struct list_head *to, + enum r5l_io_unit_state state) +{ + struct r5l_io_unit *io; + + while (!list_empty(from)) { + io = list_first_entry(from, struct r5l_io_unit, log_sibling); + /* don't change list order */ + if (io->state >= state) + list_move_tail(&io->log_sibling, to); + else + break; + } +} + +static void __r5l_set_io_unit_state(struct r5l_io_unit *io, + enum r5l_io_unit_state state) +{ + if (WARN_ON(io->state >= state)) + return; + io->state = state; +} + +static void r5l_io_run_stripes(struct r5l_io_unit *io) +{ + struct stripe_head *sh, *next; + + list_for_each_entry_safe(sh, next, &io->stripe_list, log_list) { + list_del_init(&sh->log_list); + set_bit(STRIPE_HANDLE, &sh->state); + raid5_release_stripe(sh); + } +} + +static void r5l_log_run_stripes(struct r5l_log *log) +{ + struct r5l_io_unit *io, *next; + + assert_spin_locked(&log->io_list_lock); + + list_for_each_entry_safe(io, next, &log->running_ios, log_sibling) { + /* don't change list order */ + if (io->state < IO_UNIT_IO_END) + break; + + list_move_tail(&io->log_sibling, &log->finished_ios); + r5l_io_run_stripes(io); + } +} + +static void r5l_log_endio(struct bio *bio) +{ + struct r5l_io_unit *io = bio->bi_private; + struct r5l_log *log = io->log; + unsigned long flags; + + if (bio->bi_error) + md_error(log->rdev->mddev, log->rdev); + + bio_put(bio); + + spin_lock_irqsave(&log->io_list_lock, flags); + __r5l_set_io_unit_state(io, IO_UNIT_IO_END); + if (log->need_cache_flush) + r5l_move_io_unit_list(&log->running_ios, &log->io_end_ios, + IO_UNIT_IO_END); + else + r5l_log_run_stripes(log); + spin_unlock_irqrestore(&log->io_list_lock, flags); + + if (log->need_cache_flush) + md_wakeup_thread(log->rdev->mddev->thread); +} + +static void r5l_submit_current_io(struct r5l_log *log) +{ + struct r5l_io_unit *io = log->current_io; + struct r5l_meta_block *block; + unsigned long flags; + u32 crc; + + if (!io) + return; + + block = page_address(io->meta_page); + block->meta_size = cpu_to_le32(io->meta_offset); + crc = crc32c_le(log->uuid_checksum, block, PAGE_SIZE); + block->checksum = cpu_to_le32(crc); + + log->current_io = NULL; + spin_lock_irqsave(&log->io_list_lock, flags); + __r5l_set_io_unit_state(io, IO_UNIT_IO_START); + spin_unlock_irqrestore(&log->io_list_lock, flags); + + submit_bio(WRITE, io->current_bio); +} + +static struct bio *r5l_bio_alloc(struct r5l_log *log) +{ + struct bio *bio = bio_kmalloc(GFP_NOIO | __GFP_NOFAIL, BIO_MAX_PAGES); + + bio->bi_rw = WRITE; + bio->bi_bdev = log->rdev->bdev; + bio->bi_iter.bi_sector = log->rdev->data_offset + log->log_start; + + return bio; +} + +static void r5_reserve_log_entry(struct r5l_log *log, struct r5l_io_unit *io) +{ + log->log_start = r5l_ring_add(log, log->log_start, BLOCK_SECTORS); + + /* + * If we filled up the log device start from the beginning again, + * which will require a new bio. + * + * Note: for this to work properly the log size needs to me a multiple + * of BLOCK_SECTORS. + */ + if (log->log_start == 0) + io->need_split_bio = true; + + io->log_end = log->log_start; +} + +static struct r5l_io_unit *r5l_new_meta(struct r5l_log *log) +{ + struct r5l_io_unit *io; + struct r5l_meta_block *block; + + /* We can't handle memory allocate failure so far */ + io = kmem_cache_zalloc(log->io_kc, GFP_NOIO | __GFP_NOFAIL); + io->log = log; + INIT_LIST_HEAD(&io->log_sibling); + INIT_LIST_HEAD(&io->stripe_list); + io->state = IO_UNIT_RUNNING; + + io->meta_page = alloc_page(GFP_NOIO | __GFP_NOFAIL | __GFP_ZERO); + block = page_address(io->meta_page); + block->magic = cpu_to_le32(R5LOG_MAGIC); + block->version = R5LOG_VERSION; + block->seq = cpu_to_le64(log->seq); + block->position = cpu_to_le64(log->log_start); + + io->log_start = log->log_start; + io->meta_offset = sizeof(struct r5l_meta_block); + io->seq = log->seq++; + + io->current_bio = r5l_bio_alloc(log); + io->current_bio->bi_end_io = r5l_log_endio; + io->current_bio->bi_private = io; + bio_add_page(io->current_bio, io->meta_page, PAGE_SIZE, 0); + + r5_reserve_log_entry(log, io); + + spin_lock_irq(&log->io_list_lock); + list_add_tail(&io->log_sibling, &log->running_ios); + spin_unlock_irq(&log->io_list_lock); + + return io; +} + +static int r5l_get_meta(struct r5l_log *log, unsigned int payload_size) +{ + if (log->current_io && + log->current_io->meta_offset + payload_size > PAGE_SIZE) + r5l_submit_current_io(log); + + if (!log->current_io) + log->current_io = r5l_new_meta(log); + return 0; +} + +static void r5l_append_payload_meta(struct r5l_log *log, u16 type, + sector_t location, + u32 checksum1, u32 checksum2, + bool checksum2_valid) +{ + struct r5l_io_unit *io = log->current_io; + struct r5l_payload_data_parity *payload; + + payload = page_address(io->meta_page) + io->meta_offset; + payload->header.type = cpu_to_le16(type); + payload->header.flags = cpu_to_le16(0); + payload->size = cpu_to_le32((1 + !!checksum2_valid) << + (PAGE_SHIFT - 9)); + payload->location = cpu_to_le64(location); + payload->checksum[0] = cpu_to_le32(checksum1); + if (checksum2_valid) + payload->checksum[1] = cpu_to_le32(checksum2); + + io->meta_offset += sizeof(struct r5l_payload_data_parity) + + sizeof(__le32) * (1 + !!checksum2_valid); +} + +static void r5l_append_payload_page(struct r5l_log *log, struct page *page) +{ + struct r5l_io_unit *io = log->current_io; + + if (io->need_split_bio) { + struct bio *prev = io->current_bio; + + io->current_bio = r5l_bio_alloc(log); + bio_chain(io->current_bio, prev); + + submit_bio(WRITE, prev); + } + + if (!bio_add_page(io->current_bio, page, PAGE_SIZE, 0)) + BUG(); + + r5_reserve_log_entry(log, io); +} + +static void r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh, + int data_pages, int parity_pages) +{ + int i; + int meta_size; + struct r5l_io_unit *io; + + meta_size = + ((sizeof(struct r5l_payload_data_parity) + sizeof(__le32)) + * data_pages) + + sizeof(struct r5l_payload_data_parity) + + sizeof(__le32) * parity_pages; + + r5l_get_meta(log, meta_size); + io = log->current_io; + + for (i = 0; i < sh->disks; i++) { + if (!test_bit(R5_Wantwrite, &sh->dev[i].flags)) + continue; + if (i == sh->pd_idx || i == sh->qd_idx) + continue; + r5l_append_payload_meta(log, R5LOG_PAYLOAD_DATA, + raid5_compute_blocknr(sh, i, 0), + sh->dev[i].log_checksum, 0, false); + r5l_append_payload_page(log, sh->dev[i].page); + } + + if (sh->qd_idx >= 0) { + r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY, + sh->sector, sh->dev[sh->pd_idx].log_checksum, + sh->dev[sh->qd_idx].log_checksum, true); + r5l_append_payload_page(log, sh->dev[sh->pd_idx].page); + r5l_append_payload_page(log, sh->dev[sh->qd_idx].page); + } else { + r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY, + sh->sector, sh->dev[sh->pd_idx].log_checksum, + 0, false); + r5l_append_payload_page(log, sh->dev[sh->pd_idx].page); + } + + list_add_tail(&sh->log_list, &io->stripe_list); + atomic_inc(&io->pending_stripe); + sh->log_io = io; +} + +static void r5l_wake_reclaim(struct r5l_log *log, sector_t space); +/* + * running in raid5d, where reclaim could wait for raid5d too (when it flushes + * data from log to raid disks), so we shouldn't wait for reclaim here + */ +int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh) +{ + int write_disks = 0; + int data_pages, parity_pages; + int meta_size; + int reserve; + int i; + + if (!log) + return -EAGAIN; + /* Don't support stripe batch */ + if (sh->log_io || !test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags) || + test_bit(STRIPE_SYNCING, &sh->state)) { + /* the stripe is written to log, we start writing it to raid */ + clear_bit(STRIPE_LOG_TRAPPED, &sh->state); + return -EAGAIN; + } + + for (i = 0; i < sh->disks; i++) { + void *addr; + + if (!test_bit(R5_Wantwrite, &sh->dev[i].flags)) + continue; + write_disks++; + /* checksum is already calculated in last run */ + if (test_bit(STRIPE_LOG_TRAPPED, &sh->state)) + continue; + addr = kmap_atomic(sh->dev[i].page); + sh->dev[i].log_checksum = crc32c_le(log->uuid_checksum, + addr, PAGE_SIZE); + kunmap_atomic(addr); + } + parity_pages = 1 + !!(sh->qd_idx >= 0); + data_pages = write_disks - parity_pages; + + meta_size = + ((sizeof(struct r5l_payload_data_parity) + sizeof(__le32)) + * data_pages) + + sizeof(struct r5l_payload_data_parity) + + sizeof(__le32) * parity_pages; + /* Doesn't work with very big raid array */ + if (meta_size + sizeof(struct r5l_meta_block) > PAGE_SIZE) + return -EINVAL; + + set_bit(STRIPE_LOG_TRAPPED, &sh->state); + /* + * The stripe must enter state machine again to finish the write, so + * don't delay. + */ + clear_bit(STRIPE_DELAYED, &sh->state); + atomic_inc(&sh->count); + + mutex_lock(&log->io_mutex); + /* meta + data */ + reserve = (1 + write_disks) << (PAGE_SHIFT - 9); + if (r5l_has_free_space(log, reserve)) + r5l_log_stripe(log, sh, data_pages, parity_pages); + else { + spin_lock(&log->no_space_stripes_lock); + list_add_tail(&sh->log_list, &log->no_space_stripes); + spin_unlock(&log->no_space_stripes_lock); + + r5l_wake_reclaim(log, reserve); + } + mutex_unlock(&log->io_mutex); + + return 0; +} + +void r5l_write_stripe_run(struct r5l_log *log) +{ + if (!log) + return; + mutex_lock(&log->io_mutex); + r5l_submit_current_io(log); + mutex_unlock(&log->io_mutex); +} + +int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio) +{ + if (!log) + return -ENODEV; + /* + * we flush log disk cache first, then write stripe data to raid disks. + * So if bio is finished, the log disk cache is flushed already. The + * recovery guarantees we can recovery the bio from log disk, so we + * don't need to flush again + */ + if (bio->bi_iter.bi_size == 0) { + bio_endio(bio); + return 0; + } + bio->bi_rw &= ~REQ_FLUSH; + return -EAGAIN; +} + +/* This will run after log space is reclaimed */ +static void r5l_run_no_space_stripes(struct r5l_log *log) +{ + struct stripe_head *sh; + + spin_lock(&log->no_space_stripes_lock); + while (!list_empty(&log->no_space_stripes)) { + sh = list_first_entry(&log->no_space_stripes, + struct stripe_head, log_list); + list_del_init(&sh->log_list); + set_bit(STRIPE_HANDLE, &sh->state); + raid5_release_stripe(sh); + } + spin_unlock(&log->no_space_stripes_lock); +} + +static sector_t r5l_reclaimable_space(struct r5l_log *log) +{ + return r5l_ring_distance(log, log->last_checkpoint, + log->next_checkpoint); +} + +static bool r5l_complete_finished_ios(struct r5l_log *log) +{ + struct r5l_io_unit *io, *next; + bool found = false; + + assert_spin_locked(&log->io_list_lock); + + list_for_each_entry_safe(io, next, &log->finished_ios, log_sibling) { + /* don't change list order */ + if (io->state < IO_UNIT_STRIPE_END) + break; + + log->next_checkpoint = io->log_start; + log->next_cp_seq = io->seq; + + list_del(&io->log_sibling); + r5l_free_io_unit(log, io); + + found = true; + } + + return found; +} + +static void __r5l_stripe_write_finished(struct r5l_io_unit *io) +{ + struct r5l_log *log = io->log; + unsigned long flags; + + spin_lock_irqsave(&log->io_list_lock, flags); + __r5l_set_io_unit_state(io, IO_UNIT_STRIPE_END); + + if (!r5l_complete_finished_ios(log)) { + spin_unlock_irqrestore(&log->io_list_lock, flags); + return; + } + + if (r5l_reclaimable_space(log) > log->max_free_space) + r5l_wake_reclaim(log, 0); + + spin_unlock_irqrestore(&log->io_list_lock, flags); + wake_up(&log->iounit_wait); +} + +void r5l_stripe_write_finished(struct stripe_head *sh) +{ + struct r5l_io_unit *io; + + io = sh->log_io; + sh->log_io = NULL; + + if (io && atomic_dec_and_test(&io->pending_stripe)) + __r5l_stripe_write_finished(io); +} + +static void r5l_log_flush_endio(struct bio *bio) +{ + struct r5l_log *log = container_of(bio, struct r5l_log, + flush_bio); + unsigned long flags; + struct r5l_io_unit *io; + + if (bio->bi_error) + md_error(log->rdev->mddev, log->rdev); + + spin_lock_irqsave(&log->io_list_lock, flags); + list_for_each_entry(io, &log->flushing_ios, log_sibling) + r5l_io_run_stripes(io); + list_splice_tail_init(&log->flushing_ios, &log->finished_ios); + spin_unlock_irqrestore(&log->io_list_lock, flags); +} + +/* + * Starting dispatch IO to raid. + * io_unit(meta) consists of a log. There is one situation we want to avoid. A + * broken meta in the middle of a log causes recovery can't find meta at the + * head of log. If operations require meta at the head persistent in log, we + * must make sure meta before it persistent in log too. A case is: + * + * stripe data/parity is in log, we start write stripe to raid disks. stripe + * data/parity must be persistent in log before we do the write to raid disks. + * + * The solution is we restrictly maintain io_unit list order. In this case, we + * only write stripes of an io_unit to raid disks till the io_unit is the first + * one whose data/parity is in log. + */ +void r5l_flush_stripe_to_raid(struct r5l_log *log) +{ + bool do_flush; + + if (!log || !log->need_cache_flush) + return; + + spin_lock_irq(&log->io_list_lock); + /* flush bio is running */ + if (!list_empty(&log->flushing_ios)) { + spin_unlock_irq(&log->io_list_lock); + return; + } + list_splice_tail_init(&log->io_end_ios, &log->flushing_ios); + do_flush = !list_empty(&log->flushing_ios); + spin_unlock_irq(&log->io_list_lock); + + if (!do_flush) + return; + bio_reset(&log->flush_bio); + log->flush_bio.bi_bdev = log->rdev->bdev; + log->flush_bio.bi_end_io = r5l_log_flush_endio; + submit_bio(WRITE_FLUSH, &log->flush_bio); +} + +static void r5l_write_super(struct r5l_log *log, sector_t cp); +static void r5l_write_super_and_discard_space(struct r5l_log *log, + sector_t end) +{ + struct block_device *bdev = log->rdev->bdev; + struct mddev *mddev; + + r5l_write_super(log, end); + + if (!blk_queue_discard(bdev_get_queue(bdev))) + return; + + mddev = log->rdev->mddev; + /* + * This is to avoid a deadlock. r5l_quiesce holds reconfig_mutex and + * wait for this thread to finish. This thread waits for + * MD_CHANGE_PENDING clear, which is supposed to be done in + * md_check_recovery(). md_check_recovery() tries to get + * reconfig_mutex. Since r5l_quiesce already holds the mutex, + * md_check_recovery() fails, so the PENDING never get cleared. The + * in_teardown check workaround this issue. + */ + if (!log->in_teardown) { + set_bit(MD_CHANGE_DEVS, &mddev->flags); + set_bit(MD_CHANGE_PENDING, &mddev->flags); + md_wakeup_thread(mddev->thread); + wait_event(mddev->sb_wait, + !test_bit(MD_CHANGE_PENDING, &mddev->flags) || + log->in_teardown); + /* + * r5l_quiesce could run after in_teardown check and hold + * mutex first. Superblock might get updated twice. + */ + if (log->in_teardown) + md_update_sb(mddev, 1); + } else { + WARN_ON(!mddev_is_locked(mddev)); + md_update_sb(mddev, 1); + } + + /* discard IO error really doesn't matter, ignore it */ + if (log->last_checkpoint < end) { + blkdev_issue_discard(bdev, + log->last_checkpoint + log->rdev->data_offset, + end - log->last_checkpoint, GFP_NOIO, 0); + } else { + blkdev_issue_discard(bdev, + log->last_checkpoint + log->rdev->data_offset, + log->device_size - log->last_checkpoint, + GFP_NOIO, 0); + blkdev_issue_discard(bdev, log->rdev->data_offset, end, + GFP_NOIO, 0); + } +} + + +static void r5l_do_reclaim(struct r5l_log *log) +{ + sector_t reclaim_target = xchg(&log->reclaim_target, 0); + sector_t reclaimable; + sector_t next_checkpoint; + u64 next_cp_seq; + + spin_lock_irq(&log->io_list_lock); + /* + * move proper io_unit to reclaim list. We should not change the order. + * reclaimable/unreclaimable io_unit can be mixed in the list, we + * shouldn't reuse space of an unreclaimable io_unit + */ + while (1) { + reclaimable = r5l_reclaimable_space(log); + if (reclaimable >= reclaim_target || + (list_empty(&log->running_ios) && + list_empty(&log->io_end_ios) && + list_empty(&log->flushing_ios) && + list_empty(&log->finished_ios))) + break; + + md_wakeup_thread(log->rdev->mddev->thread); + wait_event_lock_irq(log->iounit_wait, + r5l_reclaimable_space(log) > reclaimable, + log->io_list_lock); + } + + next_checkpoint = log->next_checkpoint; + next_cp_seq = log->next_cp_seq; + spin_unlock_irq(&log->io_list_lock); + + BUG_ON(reclaimable < 0); + if (reclaimable == 0) + return; + + /* + * write_super will flush cache of each raid disk. We must write super + * here, because the log area might be reused soon and we don't want to + * confuse recovery + */ + r5l_write_super_and_discard_space(log, next_checkpoint); + + mutex_lock(&log->io_mutex); + log->last_checkpoint = next_checkpoint; + log->last_cp_seq = next_cp_seq; + mutex_unlock(&log->io_mutex); + + r5l_run_no_space_stripes(log); +} + +static void r5l_reclaim_thread(struct md_thread *thread) +{ + struct mddev *mddev = thread->mddev; + struct r5conf *conf = mddev->private; + struct r5l_log *log = conf->log; + + if (!log) + return; + r5l_do_reclaim(log); +} + +static void r5l_wake_reclaim(struct r5l_log *log, sector_t space) +{ + unsigned long target; + unsigned long new = (unsigned long)space; /* overflow in theory */ + + do { + target = log->reclaim_target; + if (new < target) + return; + } while (cmpxchg(&log->reclaim_target, target, new) != target); + md_wakeup_thread(log->reclaim_thread); +} + +void r5l_quiesce(struct r5l_log *log, int state) +{ + struct mddev *mddev; + if (!log || state == 2) + return; + if (state == 0) { + log->in_teardown = 0; + log->reclaim_thread = md_register_thread(r5l_reclaim_thread, + log->rdev->mddev, "reclaim"); + } else if (state == 1) { + /* + * at this point all stripes are finished, so io_unit is at + * least in STRIPE_END state + */ + log->in_teardown = 1; + /* make sure r5l_write_super_and_discard_space exits */ + mddev = log->rdev->mddev; + wake_up(&mddev->sb_wait); + r5l_wake_reclaim(log, -1L); + md_unregister_thread(&log->reclaim_thread); + r5l_do_reclaim(log); + } +} + +bool r5l_log_disk_error(struct r5conf *conf) +{ + /* don't allow write if journal disk is missing */ + if (!conf->log) + return test_bit(MD_HAS_JOURNAL, &conf->mddev->flags); + return test_bit(Faulty, &conf->log->rdev->flags); +} + +struct r5l_recovery_ctx { + struct page *meta_page; /* current meta */ + sector_t meta_total_blocks; /* total size of current meta and data */ + sector_t pos; /* recovery position */ + u64 seq; /* recovery position seq */ +}; + +static int r5l_read_meta_block(struct r5l_log *log, + struct r5l_recovery_ctx *ctx) +{ + struct page *page = ctx->meta_page; + struct r5l_meta_block *mb; + u32 crc, stored_crc; + + if (!sync_page_io(log->rdev, ctx->pos, PAGE_SIZE, page, READ, false)) + return -EIO; + + mb = page_address(page); + stored_crc = le32_to_cpu(mb->checksum); + mb->checksum = 0; + + if (le32_to_cpu(mb->magic) != R5LOG_MAGIC || + le64_to_cpu(mb->seq) != ctx->seq || + mb->version != R5LOG_VERSION || + le64_to_cpu(mb->position) != ctx->pos) + return -EINVAL; + + crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE); + if (stored_crc != crc) + return -EINVAL; + + if (le32_to_cpu(mb->meta_size) > PAGE_SIZE) + return -EINVAL; + + ctx->meta_total_blocks = BLOCK_SECTORS; + + return 0; +} + +static int r5l_recovery_flush_one_stripe(struct r5l_log *log, + struct r5l_recovery_ctx *ctx, + sector_t stripe_sect, + int *offset, sector_t *log_offset) +{ + struct r5conf *conf = log->rdev->mddev->private; + struct stripe_head *sh; + struct r5l_payload_data_parity *payload; + int disk_index; + + sh = raid5_get_active_stripe(conf, stripe_sect, 0, 0, 0); + while (1) { + payload = page_address(ctx->meta_page) + *offset; + + if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) { + raid5_compute_sector(conf, + le64_to_cpu(payload->location), 0, + &disk_index, sh); + + sync_page_io(log->rdev, *log_offset, PAGE_SIZE, + sh->dev[disk_index].page, READ, false); + sh->dev[disk_index].log_checksum = + le32_to_cpu(payload->checksum[0]); + set_bit(R5_Wantwrite, &sh->dev[disk_index].flags); + ctx->meta_total_blocks += BLOCK_SECTORS; + } else { + disk_index = sh->pd_idx; + sync_page_io(log->rdev, *log_offset, PAGE_SIZE, + sh->dev[disk_index].page, READ, false); + sh->dev[disk_index].log_checksum = + le32_to_cpu(payload->checksum[0]); + set_bit(R5_Wantwrite, &sh->dev[disk_index].flags); + + if (sh->qd_idx >= 0) { + disk_index = sh->qd_idx; + sync_page_io(log->rdev, + r5l_ring_add(log, *log_offset, BLOCK_SECTORS), + PAGE_SIZE, sh->dev[disk_index].page, + READ, false); + sh->dev[disk_index].log_checksum = + le32_to_cpu(payload->checksum[1]); + set_bit(R5_Wantwrite, + &sh->dev[disk_index].flags); + } + ctx->meta_total_blocks += BLOCK_SECTORS * conf->max_degraded; + } + + *log_offset = r5l_ring_add(log, *log_offset, + le32_to_cpu(payload->size)); + *offset += sizeof(struct r5l_payload_data_parity) + + sizeof(__le32) * + (le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9)); + if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_PARITY) + break; + } + + for (disk_index = 0; disk_index < sh->disks; disk_index++) { + void *addr; + u32 checksum; + + if (!test_bit(R5_Wantwrite, &sh->dev[disk_index].flags)) + continue; + addr = kmap_atomic(sh->dev[disk_index].page); + checksum = crc32c_le(log->uuid_checksum, addr, PAGE_SIZE); + kunmap_atomic(addr); + if (checksum != sh->dev[disk_index].log_checksum) + goto error; + } + + for (disk_index = 0; disk_index < sh->disks; disk_index++) { + struct md_rdev *rdev, *rrdev; + + if (!test_and_clear_bit(R5_Wantwrite, + &sh->dev[disk_index].flags)) + continue; + + /* in case device is broken */ + rdev = rcu_dereference(conf->disks[disk_index].rdev); + if (rdev) + sync_page_io(rdev, stripe_sect, PAGE_SIZE, + sh->dev[disk_index].page, WRITE, false); + rrdev = rcu_dereference(conf->disks[disk_index].replacement); + if (rrdev) + sync_page_io(rrdev, stripe_sect, PAGE_SIZE, + sh->dev[disk_index].page, WRITE, false); + } + raid5_release_stripe(sh); + return 0; + +error: + for (disk_index = 0; disk_index < sh->disks; disk_index++) + sh->dev[disk_index].flags = 0; + raid5_release_stripe(sh); + return -EINVAL; +} + +static int r5l_recovery_flush_one_meta(struct r5l_log *log, + struct r5l_recovery_ctx *ctx) +{ + struct r5conf *conf = log->rdev->mddev->private; + struct r5l_payload_data_parity *payload; + struct r5l_meta_block *mb; + int offset; + sector_t log_offset; + sector_t stripe_sector; + + mb = page_address(ctx->meta_page); + offset = sizeof(struct r5l_meta_block); + log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS); + + while (offset < le32_to_cpu(mb->meta_size)) { + int dd; + + payload = (void *)mb + offset; + stripe_sector = raid5_compute_sector(conf, + le64_to_cpu(payload->location), 0, &dd, NULL); + if (r5l_recovery_flush_one_stripe(log, ctx, stripe_sector, + &offset, &log_offset)) + return -EINVAL; + } + return 0; +} + +/* copy data/parity from log to raid disks */ +static void r5l_recovery_flush_log(struct r5l_log *log, + struct r5l_recovery_ctx *ctx) +{ + while (1) { + if (r5l_read_meta_block(log, ctx)) + return; + if (r5l_recovery_flush_one_meta(log, ctx)) + return; + ctx->seq++; + ctx->pos = r5l_ring_add(log, ctx->pos, ctx->meta_total_blocks); + } +} + +static int r5l_log_write_empty_meta_block(struct r5l_log *log, sector_t pos, + u64 seq) +{ + struct page *page; + struct r5l_meta_block *mb; + u32 crc; + + page = alloc_page(GFP_KERNEL | __GFP_ZERO); + if (!page) + return -ENOMEM; + mb = page_address(page); + mb->magic = cpu_to_le32(R5LOG_MAGIC); + mb->version = R5LOG_VERSION; + mb->meta_size = cpu_to_le32(sizeof(struct r5l_meta_block)); + mb->seq = cpu_to_le64(seq); + mb->position = cpu_to_le64(pos); + crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE); + mb->checksum = cpu_to_le32(crc); + + if (!sync_page_io(log->rdev, pos, PAGE_SIZE, page, WRITE_FUA, false)) { + __free_page(page); + return -EIO; + } + __free_page(page); + return 0; +} + +static int r5l_recovery_log(struct r5l_log *log) +{ + struct r5l_recovery_ctx ctx; + + ctx.pos = log->last_checkpoint; + ctx.seq = log->last_cp_seq; + ctx.meta_page = alloc_page(GFP_KERNEL); + if (!ctx.meta_page) + return -ENOMEM; + + r5l_recovery_flush_log(log, &ctx); + __free_page(ctx.meta_page); + + /* + * we did a recovery. Now ctx.pos points to an invalid meta block. New + * log will start here. but we can't let superblock point to last valid + * meta block. The log might looks like: + * | meta 1| meta 2| meta 3| + * meta 1 is valid, meta 2 is invalid. meta 3 could be valid. If + * superblock points to meta 1, we write a new valid meta 2n. if crash + * happens again, new recovery will start from meta 1. Since meta 2n is + * valid now, recovery will think meta 3 is valid, which is wrong. + * The solution is we create a new meta in meta2 with its seq == meta + * 1's seq + 10 and let superblock points to meta2. The same recovery will + * not think meta 3 is a valid meta, because its seq doesn't match + */ + if (ctx.seq > log->last_cp_seq + 1) { + int ret; + + ret = r5l_log_write_empty_meta_block(log, ctx.pos, ctx.seq + 10); + if (ret) + return ret; + log->seq = ctx.seq + 11; + log->log_start = r5l_ring_add(log, ctx.pos, BLOCK_SECTORS); + r5l_write_super(log, ctx.pos); + } else { + log->log_start = ctx.pos; + log->seq = ctx.seq; + } + return 0; +} + +static void r5l_write_super(struct r5l_log *log, sector_t cp) +{ + struct mddev *mddev = log->rdev->mddev; + + log->rdev->journal_tail = cp; + set_bit(MD_CHANGE_DEVS, &mddev->flags); +} + +static int r5l_load_log(struct r5l_log *log) +{ + struct md_rdev *rdev = log->rdev; + struct page *page; + struct r5l_meta_block *mb; + sector_t cp = log->rdev->journal_tail; + u32 stored_crc, expected_crc; + bool create_super = false; + int ret; + + /* Make sure it's valid */ + if (cp >= rdev->sectors || round_down(cp, BLOCK_SECTORS) != cp) + cp = 0; + page = alloc_page(GFP_KERNEL); + if (!page) + return -ENOMEM; + + if (!sync_page_io(rdev, cp, PAGE_SIZE, page, READ, false)) { + ret = -EIO; + goto ioerr; + } + mb = page_address(page); + + if (le32_to_cpu(mb->magic) != R5LOG_MAGIC || + mb->version != R5LOG_VERSION) { + create_super = true; + goto create; + } + stored_crc = le32_to_cpu(mb->checksum); + mb->checksum = 0; + expected_crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE); + if (stored_crc != expected_crc) { + create_super = true; + goto create; + } + if (le64_to_cpu(mb->position) != cp) { + create_super = true; + goto create; + } +create: + if (create_super) { + log->last_cp_seq = prandom_u32(); + cp = 0; + /* + * Make sure super points to correct address. Log might have + * data very soon. If super hasn't correct log tail address, + * recovery can't find the log + */ + r5l_write_super(log, cp); + } else + log->last_cp_seq = le64_to_cpu(mb->seq); + + log->device_size = round_down(rdev->sectors, BLOCK_SECTORS); + log->max_free_space = log->device_size >> RECLAIM_MAX_FREE_SPACE_SHIFT; + if (log->max_free_space > RECLAIM_MAX_FREE_SPACE) + log->max_free_space = RECLAIM_MAX_FREE_SPACE; + log->last_checkpoint = cp; + + __free_page(page); + + return r5l_recovery_log(log); +ioerr: + __free_page(page); + return ret; +} + +int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev) +{ + struct r5l_log *log; + + if (PAGE_SIZE != 4096) + return -EINVAL; + log = kzalloc(sizeof(*log), GFP_KERNEL); + if (!log) + return -ENOMEM; + log->rdev = rdev; + + log->need_cache_flush = (rdev->bdev->bd_disk->queue->flush_flags != 0); + + log->uuid_checksum = crc32c_le(~0, rdev->mddev->uuid, + sizeof(rdev->mddev->uuid)); + + mutex_init(&log->io_mutex); + + spin_lock_init(&log->io_list_lock); + INIT_LIST_HEAD(&log->running_ios); + INIT_LIST_HEAD(&log->io_end_ios); + INIT_LIST_HEAD(&log->flushing_ios); + INIT_LIST_HEAD(&log->finished_ios); + bio_init(&log->flush_bio); + + log->io_kc = KMEM_CACHE(r5l_io_unit, 0); + if (!log->io_kc) + goto io_kc; + + log->reclaim_thread = md_register_thread(r5l_reclaim_thread, + log->rdev->mddev, "reclaim"); + if (!log->reclaim_thread) + goto reclaim_thread; + init_waitqueue_head(&log->iounit_wait); + + INIT_LIST_HEAD(&log->no_space_stripes); + spin_lock_init(&log->no_space_stripes_lock); + + if (r5l_load_log(log)) + goto error; + + conf->log = log; + return 0; +error: + md_unregister_thread(&log->reclaim_thread); +reclaim_thread: + kmem_cache_destroy(log->io_kc); +io_kc: + kfree(log); + return -EINVAL; +} + +void r5l_exit_log(struct r5l_log *log) +{ + md_unregister_thread(&log->reclaim_thread); + kmem_cache_destroy(log->io_kc); + kfree(log); +} diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 45933c160697..704ef7fcfbf8 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -353,7 +353,7 @@ static void release_inactive_stripe_list(struct r5conf *conf, struct list_head *list = &temp_inactive_list[size - 1]; /* - * We don't hold any lock here yet, get_active_stripe() might + * We don't hold any lock here yet, raid5_get_active_stripe() might * remove stripes from the list */ if (!list_empty_careful(list)) { @@ -413,7 +413,7 @@ static int release_stripe_list(struct r5conf *conf, return count; } -static void release_stripe(struct stripe_head *sh) +void raid5_release_stripe(struct stripe_head *sh) { struct r5conf *conf = sh->raid_conf; unsigned long flags; @@ -658,9 +658,9 @@ static int has_failed(struct r5conf *conf) return 0; } -static struct stripe_head * -get_active_stripe(struct r5conf *conf, sector_t sector, - int previous, int noblock, int noquiesce) +struct stripe_head * +raid5_get_active_stripe(struct r5conf *conf, sector_t sector, + int previous, int noblock, int noquiesce) { struct stripe_head *sh; int hash = stripe_hash_locks_hash(sector); @@ -755,6 +755,10 @@ static void unlock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2) /* Only freshly new full stripe normal write stripe can be added to a batch list */ static bool stripe_can_batch(struct stripe_head *sh) { + struct r5conf *conf = sh->raid_conf; + + if (conf->log) + return false; return test_bit(STRIPE_BATCH_READY, &sh->state) && !test_bit(STRIPE_BITMAP_PENDING, &sh->state) && is_full_stripe_write(sh); @@ -858,7 +862,7 @@ static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh unlock_out: unlock_two_stripes(head, sh); out: - release_stripe(head); + raid5_release_stripe(head); } /* Determine if 'data_offset' or 'new_data_offset' should be used @@ -895,6 +899,8 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) might_sleep(); + if (r5l_write_stripe(conf->log, sh) == 0) + return; for (i = disks; i--; ) { int rw; int replace_only = 0; @@ -1208,7 +1214,7 @@ static void ops_complete_biofill(void *stripe_head_ref) return_io(&return_bi); set_bit(STRIPE_HANDLE, &sh->state); - release_stripe(sh); + raid5_release_stripe(sh); } static void ops_run_biofill(struct stripe_head *sh) @@ -1271,7 +1277,7 @@ static void ops_complete_compute(void *stripe_head_ref) if (sh->check_state == check_state_compute_run) sh->check_state = check_state_compute_result; set_bit(STRIPE_HANDLE, &sh->state); - release_stripe(sh); + raid5_release_stripe(sh); } /* return a pointer to the address conversion region of the scribble buffer */ @@ -1697,7 +1703,7 @@ static void ops_complete_reconstruct(void *stripe_head_ref) } set_bit(STRIPE_HANDLE, &sh->state); - release_stripe(sh); + raid5_release_stripe(sh); } static void @@ -1855,7 +1861,7 @@ static void ops_complete_check(void *stripe_head_ref) sh->check_state = check_state_check_result; set_bit(STRIPE_HANDLE, &sh->state); - release_stripe(sh); + raid5_release_stripe(sh); } static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu) @@ -2017,7 +2023,7 @@ static int grow_one_stripe(struct r5conf *conf, gfp_t gfp) /* we just created an active stripe so... */ atomic_inc(&conf->active_stripes); - release_stripe(sh); + raid5_release_stripe(sh); conf->max_nr_stripes++; return 1; } @@ -2236,7 +2242,7 @@ static int resize_stripes(struct r5conf *conf, int newsize) if (!p) err = -ENOMEM; } - release_stripe(nsh); + raid5_release_stripe(nsh); } /* critical section pass, GFP_NOIO no longer needed */ @@ -2394,7 +2400,7 @@ static void raid5_end_read_request(struct bio * bi) rdev_dec_pending(rdev, conf->mddev); clear_bit(R5_LOCKED, &sh->dev[i].flags); set_bit(STRIPE_HANDLE, &sh->state); - release_stripe(sh); + raid5_release_stripe(sh); } static void raid5_end_write_request(struct bio *bi) @@ -2468,14 +2474,12 @@ static void raid5_end_write_request(struct bio *bi) if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags)) clear_bit(R5_LOCKED, &sh->dev[i].flags); set_bit(STRIPE_HANDLE, &sh->state); - release_stripe(sh); + raid5_release_stripe(sh); if (sh->batch_head && sh != sh->batch_head) - release_stripe(sh->batch_head); + raid5_release_stripe(sh->batch_head); } -static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous); - static void raid5_build_block(struct stripe_head *sh, int i, int previous) { struct r5dev *dev = &sh->dev[i]; @@ -2491,7 +2495,7 @@ static void raid5_build_block(struct stripe_head *sh, int i, int previous) dev->rreq.bi_private = sh; dev->flags = 0; - dev->sector = compute_blocknr(sh, i, previous); + dev->sector = raid5_compute_blocknr(sh, i, previous); } static void error(struct mddev *mddev, struct md_rdev *rdev) @@ -2524,9 +2528,9 @@ static void error(struct mddev *mddev, struct md_rdev *rdev) * Input: a 'big' sector number, * Output: index of the data and parity disk, and the sector # in them. */ -static sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector, - int previous, int *dd_idx, - struct stripe_head *sh) +sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector, + int previous, int *dd_idx, + struct stripe_head *sh) { sector_t stripe, stripe2; sector_t chunk_number; @@ -2726,7 +2730,7 @@ static sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector, return new_sector; } -static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous) +sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous) { struct r5conf *conf = sh->raid_conf; int raid_disks = sh->disks; @@ -3098,6 +3102,8 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, if (bi) bitmap_end = 1; + r5l_stripe_write_finished(sh); + if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) wake_up(&conf->wait_for_overlap); @@ -3141,6 +3147,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, * the data has not reached the cache yet. */ if (!test_bit(R5_Wantfill, &sh->dev[i].flags) && + s->failed > conf->max_degraded && (!test_bit(R5_Insync, &sh->dev[i].flags) || test_bit(R5_ReadError, &sh->dev[i].flags))) { spin_lock_irq(&sh->stripe_lock); @@ -3497,6 +3504,9 @@ returnbi: WARN_ON(test_bit(R5_SkipCopy, &dev->flags)); WARN_ON(dev->page != dev->orig_page); } + + r5l_stripe_write_finished(sh); + if (!discard_pending && test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)) { int hash; @@ -3939,10 +3949,10 @@ static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh) struct stripe_head *sh2; struct async_submit_ctl submit; - sector_t bn = compute_blocknr(sh, i, 1); + sector_t bn = raid5_compute_blocknr(sh, i, 1); sector_t s = raid5_compute_sector(conf, bn, 0, &dd_idx, NULL); - sh2 = get_active_stripe(conf, s, 0, 1, 1); + sh2 = raid5_get_active_stripe(conf, s, 0, 1, 1); if (sh2 == NULL) /* so far only the early blocks of this stripe * have been requested. When later blocks @@ -3952,7 +3962,7 @@ static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh) if (!test_bit(STRIPE_EXPANDING, &sh2->state) || test_bit(R5_Expanded, &sh2->dev[dd_idx].flags)) { /* must have already done this block */ - release_stripe(sh2); + raid5_release_stripe(sh2); continue; } @@ -3973,7 +3983,7 @@ static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh) set_bit(STRIPE_EXPAND_READY, &sh2->state); set_bit(STRIPE_HANDLE, &sh2->state); } - release_stripe(sh2); + raid5_release_stripe(sh2); } /* done submitting copies, wait for them to complete */ @@ -4008,6 +4018,7 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state) && !sh->batch_head; s->failed_num[0] = -1; s->failed_num[1] = -1; + s->log_failed = r5l_log_disk_error(conf); /* Now to look around and see what can be done */ rcu_read_lock(); @@ -4259,7 +4270,7 @@ static void break_stripe_batch_list(struct stripe_head *head_sh, if (handle_flags == 0 || sh->state & handle_flags) set_bit(STRIPE_HANDLE, &sh->state); - release_stripe(sh); + raid5_release_stripe(sh); } spin_lock_irq(&head_sh->stripe_lock); head_sh->batch_head = NULL; @@ -4320,6 +4331,9 @@ static void handle_stripe(struct stripe_head *sh) analyse_stripe(sh, &s); + if (test_bit(STRIPE_LOG_TRAPPED, &sh->state)) + goto finish; + if (s.handle_bad_blocks) { set_bit(STRIPE_HANDLE, &sh->state); goto finish; @@ -4348,7 +4362,7 @@ static void handle_stripe(struct stripe_head *sh) /* check if the array has lost more than max_degraded devices and, * if so, some requests might need to be failed. */ - if (s.failed > conf->max_degraded) { + if (s.failed > conf->max_degraded || s.log_failed) { sh->check_state = 0; sh->reconstruct_state = 0; break_stripe_batch_list(sh, 0); @@ -4506,7 +4520,7 @@ static void handle_stripe(struct stripe_head *sh) /* Finish reconstruct operations initiated by the expansion process */ if (sh->reconstruct_state == reconstruct_state_result) { struct stripe_head *sh_src - = get_active_stripe(conf, sh->sector, 1, 1, 1); + = raid5_get_active_stripe(conf, sh->sector, 1, 1, 1); if (sh_src && test_bit(STRIPE_EXPAND_SOURCE, &sh_src->state)) { /* sh cannot be written until sh_src has been read. * so arrange for sh to be delayed a little @@ -4516,11 +4530,11 @@ static void handle_stripe(struct stripe_head *sh) if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh_src->state)) atomic_inc(&conf->preread_active_stripes); - release_stripe(sh_src); + raid5_release_stripe(sh_src); goto finish; } if (sh_src) - release_stripe(sh_src); + raid5_release_stripe(sh_src); sh->reconstruct_state = reconstruct_state_idle; clear_bit(STRIPE_EXPANDING, &sh->state); @@ -5012,7 +5026,7 @@ static void release_stripe_plug(struct mddev *mddev, struct raid5_plug_cb *cb; if (!blk_cb) { - release_stripe(sh); + raid5_release_stripe(sh); return; } @@ -5028,7 +5042,7 @@ static void release_stripe_plug(struct mddev *mddev, if (!test_and_set_bit(STRIPE_ON_UNPLUG_LIST, &sh->state)) list_add_tail(&sh->lru, &cb->list); else - release_stripe(sh); + raid5_release_stripe(sh); } static void make_discard_request(struct mddev *mddev, struct bio *bi) @@ -5063,12 +5077,12 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi) DEFINE_WAIT(w); int d; again: - sh = get_active_stripe(conf, logical_sector, 0, 0, 0); + sh = raid5_get_active_stripe(conf, logical_sector, 0, 0, 0); prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); set_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags); if (test_bit(STRIPE_SYNCING, &sh->state)) { - release_stripe(sh); + raid5_release_stripe(sh); schedule(); goto again; } @@ -5080,7 +5094,7 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi) if (sh->dev[d].towrite || sh->dev[d].toread) { set_bit(R5_Overlap, &sh->dev[d].flags); spin_unlock_irq(&sh->stripe_lock); - release_stripe(sh); + raid5_release_stripe(sh); schedule(); goto again; } @@ -5136,8 +5150,15 @@ static void make_request(struct mddev *mddev, struct bio * bi) bool do_prepare; if (unlikely(bi->bi_rw & REQ_FLUSH)) { - md_flush_request(mddev, bi); - return; + int ret = r5l_handle_flush_request(conf->log, bi); + + if (ret == 0) + return; + if (ret == -ENODEV) { + md_flush_request(mddev, bi); + return; + } + /* ret == -EAGAIN, fallback */ } md_write_start(mddev, bi); @@ -5210,7 +5231,7 @@ static void make_request(struct mddev *mddev, struct bio * bi) (unsigned long long)new_sector, (unsigned long long)logical_sector); - sh = get_active_stripe(conf, new_sector, previous, + sh = raid5_get_active_stripe(conf, new_sector, previous, (bi->bi_rw&RWA_MASK), 0); if (sh) { if (unlikely(previous)) { @@ -5231,7 +5252,7 @@ static void make_request(struct mddev *mddev, struct bio * bi) must_retry = 1; spin_unlock_irq(&conf->device_lock); if (must_retry) { - release_stripe(sh); + raid5_release_stripe(sh); schedule(); do_prepare = true; goto retry; @@ -5241,14 +5262,14 @@ static void make_request(struct mddev *mddev, struct bio * bi) /* Might have got the wrong stripe_head * by accident */ - release_stripe(sh); + raid5_release_stripe(sh); goto retry; } if (rw == WRITE && logical_sector >= mddev->suspend_lo && logical_sector < mddev->suspend_hi) { - release_stripe(sh); + raid5_release_stripe(sh); /* As the suspend_* range is controlled by * userspace, we want an interruptible * wait. @@ -5271,7 +5292,7 @@ static void make_request(struct mddev *mddev, struct bio * bi) * and wait a while */ md_wakeup_thread(mddev->thread); - release_stripe(sh); + raid5_release_stripe(sh); schedule(); do_prepare = true; goto retry; @@ -5458,7 +5479,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) { int j; int skipped_disk = 0; - sh = get_active_stripe(conf, stripe_addr+i, 0, 0, 1); + sh = raid5_get_active_stripe(conf, stripe_addr+i, 0, 0, 1); set_bit(STRIPE_EXPANDING, &sh->state); atomic_inc(&conf->reshape_stripes); /* If any of this stripe is beyond the end of the old @@ -5471,7 +5492,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk if (conf->level == 6 && j == sh->qd_idx) continue; - s = compute_blocknr(sh, j, 0); + s = raid5_compute_blocknr(sh, j, 0); if (s < raid5_size(mddev, 0, 0)) { skipped_disk = 1; continue; @@ -5507,10 +5528,10 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk if (last_sector >= mddev->dev_sectors) last_sector = mddev->dev_sectors - 1; while (first_sector <= last_sector) { - sh = get_active_stripe(conf, first_sector, 1, 0, 1); + sh = raid5_get_active_stripe(conf, first_sector, 1, 0, 1); set_bit(STRIPE_EXPAND_SOURCE, &sh->state); set_bit(STRIPE_HANDLE, &sh->state); - release_stripe(sh); + raid5_release_stripe(sh); first_sector += STRIPE_SECTORS; } /* Now that the sources are clearly marked, we can release @@ -5519,7 +5540,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk while (!list_empty(&stripes)) { sh = list_entry(stripes.next, struct stripe_head, lru); list_del_init(&sh->lru); - release_stripe(sh); + raid5_release_stripe(sh); } /* If this takes us to the resync_max point where we have to pause, * then we need to write out the superblock. @@ -5615,11 +5636,11 @@ static inline sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */ } - bitmap_cond_end_sync(mddev->bitmap, sector_nr); + bitmap_cond_end_sync(mddev->bitmap, sector_nr, false); - sh = get_active_stripe(conf, sector_nr, 0, 1, 0); + sh = raid5_get_active_stripe(conf, sector_nr, 0, 1, 0); if (sh == NULL) { - sh = get_active_stripe(conf, sector_nr, 0, 0, 0); + sh = raid5_get_active_stripe(conf, sector_nr, 0, 0, 0); /* make sure we don't swamp the stripe cache if someone else * is trying to get access */ @@ -5643,7 +5664,7 @@ static inline sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int set_bit(STRIPE_SYNC_REQUESTED, &sh->state); set_bit(STRIPE_HANDLE, &sh->state); - release_stripe(sh); + raid5_release_stripe(sh); return STRIPE_SECTORS; } @@ -5682,7 +5703,7 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio) /* already done this stripe */ continue; - sh = get_active_stripe(conf, sector, 0, 1, 1); + sh = raid5_get_active_stripe(conf, sector, 0, 1, 1); if (!sh) { /* failed to get a stripe - must wait */ @@ -5692,7 +5713,7 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio) } if (!add_stripe_bio(sh, raid_bio, dd_idx, 0, 0)) { - release_stripe(sh); + raid5_release_stripe(sh); raid5_set_bi_processed_stripes(raid_bio, scnt); conf->retry_read_aligned = raid_bio; return handled; @@ -5700,7 +5721,7 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio) set_bit(R5_ReadNoMerge, &sh->dev[dd_idx].flags); handle_stripe(sh); - release_stripe(sh); + raid5_release_stripe(sh); handled++; } remaining = raid5_dec_bi_active_stripes(raid_bio); @@ -5730,8 +5751,12 @@ static int handle_active_stripes(struct r5conf *conf, int group, for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) if (!list_empty(temp_inactive_list + i)) break; - if (i == NR_STRIPE_HASH_LOCKS) + if (i == NR_STRIPE_HASH_LOCKS) { + spin_unlock_irq(&conf->device_lock); + r5l_flush_stripe_to_raid(conf->log); + spin_lock_irq(&conf->device_lock); return batch_size; + } release_inactive = true; } spin_unlock_irq(&conf->device_lock); @@ -5739,6 +5764,7 @@ static int handle_active_stripes(struct r5conf *conf, int group, release_inactive_stripe_list(conf, temp_inactive_list, NR_STRIPE_HASH_LOCKS); + r5l_flush_stripe_to_raid(conf->log); if (release_inactive) { spin_lock_irq(&conf->device_lock); return 0; @@ -5746,6 +5772,7 @@ static int handle_active_stripes(struct r5conf *conf, int group, for (i = 0; i < batch_size; i++) handle_stripe(batch[i]); + r5l_write_stripe_run(conf->log); cond_resched(); @@ -5879,6 +5906,8 @@ static void raid5d(struct md_thread *thread) mutex_unlock(&conf->cache_size_mutex); } + r5l_flush_stripe_to_raid(conf->log); + async_tx_issue_pending_all(); blk_finish_plug(&plug); @@ -6316,8 +6345,11 @@ static void raid5_free_percpu(struct r5conf *conf) static void free_conf(struct r5conf *conf) { + if (conf->log) + r5l_exit_log(conf->log); if (conf->shrinker.seeks) unregister_shrinker(&conf->shrinker); + free_thread_groups(conf); shrink_stripes(conf); raid5_free_percpu(conf); @@ -6530,7 +6562,7 @@ static struct r5conf *setup_conf(struct mddev *mddev) rdev_for_each(rdev, mddev) { raid_disk = rdev->raid_disk; if (raid_disk >= max_disks - || raid_disk < 0) + || raid_disk < 0 || test_bit(Journal, &rdev->flags)) continue; disk = conf->disks + raid_disk; @@ -6650,6 +6682,7 @@ static int run(struct mddev *mddev) int working_disks = 0; int dirty_parity_disks = 0; struct md_rdev *rdev; + struct md_rdev *journal_dev = NULL; sector_t reshape_offset = 0; int i; long long min_offset_diff = 0; @@ -6662,6 +6695,11 @@ static int run(struct mddev *mddev) rdev_for_each(rdev, mddev) { long long diff; + + if (test_bit(Journal, &rdev->flags)) { + journal_dev = rdev; + continue; + } if (rdev->raid_disk < 0) continue; diff = (rdev->new_data_offset - rdev->data_offset); @@ -6695,6 +6733,12 @@ static int run(struct mddev *mddev) int chunk_sectors; int new_data_disks; + if (journal_dev) { + printk(KERN_ERR "md/raid:%s: don't support reshape with journal - aborting.\n", + mdname(mddev)); + return -EINVAL; + } + if (mddev->new_level != mddev->level) { printk(KERN_ERR "md/raid:%s: unsupported reshape " "required - aborting.\n", @@ -6770,6 +6814,13 @@ static int run(struct mddev *mddev) if (IS_ERR(conf)) return PTR_ERR(conf); + if (test_bit(MD_HAS_JOURNAL, &mddev->flags) && !journal_dev) { + printk(KERN_ERR "md/raid:%s: journal disk is missing, force array readonly\n", + mdname(mddev)); + mddev->ro = 1; + set_disk_ro(mddev->gendisk, 1); + } + conf->min_offset_diff = min_offset_diff; mddev->thread = conf->thread; conf->thread = NULL; @@ -6973,6 +7024,14 @@ static int run(struct mddev *mddev) mddev->queue); } + if (journal_dev) { + char b[BDEVNAME_SIZE]; + + printk(KERN_INFO"md/raid:%s: using device %s as journal\n", + mdname(mddev), bdevname(journal_dev->bdev, b)); + r5l_init_log(conf, journal_dev); + } + return 0; abort: md_unregister_thread(&mddev->thread); @@ -7082,6 +7141,15 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev) struct disk_info *p = conf->disks + number; print_raid5_conf(conf); + if (test_bit(Journal, &rdev->flags)) { + /* + * journal disk is not removable, but we need give a chance to + * update superblock of other disks. Otherwise journal disk + * will be considered as 'fresh' + */ + set_bit(MD_CHANGE_DEVS, &mddev->flags); + return -EINVAL; + } if (rdev == p->rdev) rdevp = &p->rdev; else if (rdev == p->replacement) @@ -7144,6 +7212,8 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev) int first = 0; int last = conf->raid_disks - 1; + if (test_bit(Journal, &rdev->flags)) + return -EINVAL; if (mddev->recovery_disabled == conf->recovery_disabled) return -EBUSY; @@ -7205,6 +7275,8 @@ static int raid5_resize(struct mddev *mddev, sector_t sectors) sector_t newsize; struct r5conf *conf = mddev->private; + if (conf->log) + return -EINVAL; sectors &= ~((sector_t)conf->chunk_sectors - 1); newsize = raid5_size(mddev, sectors, mddev->raid_disks); if (mddev->external_size && @@ -7256,6 +7328,8 @@ static int check_reshape(struct mddev *mddev) { struct r5conf *conf = mddev->private; + if (conf->log) + return -EINVAL; if (mddev->delta_disks == 0 && mddev->new_layout == mddev->layout && mddev->new_chunk_sectors == mddev->chunk_sectors) @@ -7532,6 +7606,7 @@ static void raid5_quiesce(struct mddev *mddev, int state) unlock_all_device_hash_locks_irq(conf); break; } + r5l_quiesce(conf->log, state); } static void *raid45_takeover_raid0(struct mddev *mddev, int level) diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 828c2925e68f..a415e1cd39b8 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -223,6 +223,9 @@ struct stripe_head { struct stripe_head *batch_head; /* protected by stripe lock */ spinlock_t batch_lock; /* only header's lock is useful */ struct list_head batch_list; /* protected by head's batch lock*/ + + struct r5l_io_unit *log_io; + struct list_head log_list; /** * struct stripe_operations * @target - STRIPE_OP_COMPUTE_BLK target @@ -244,6 +247,7 @@ struct stripe_head { struct bio *toread, *read, *towrite, *written; sector_t sector; /* sector of this page */ unsigned long flags; + u32 log_checksum; } dev[1]; /* allocated with extra space depending of RAID geometry */ }; @@ -268,6 +272,7 @@ struct stripe_head_state { struct bio_list return_bi; struct md_rdev *blocked_rdev; int handle_bad_blocks; + int log_failed; }; /* Flags for struct r5dev.flags */ @@ -340,6 +345,7 @@ enum { STRIPE_BITMAP_PENDING, /* Being added to bitmap, don't add * to batch yet. */ + STRIPE_LOG_TRAPPED, /* trapped into log */ }; #define STRIPE_EXPAND_SYNC_FLAGS \ @@ -543,6 +549,7 @@ struct r5conf { struct r5worker_group *worker_groups; int group_cnt; int worker_cnt_per_group; + struct r5l_log *log; }; @@ -609,4 +616,21 @@ static inline int algorithm_is_DDF(int layout) extern void md_raid5_kick_device(struct r5conf *conf); extern int raid5_set_cache_size(struct mddev *mddev, int size); +extern sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous); +extern void raid5_release_stripe(struct stripe_head *sh); +extern sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector, + int previous, int *dd_idx, + struct stripe_head *sh); +extern struct stripe_head * +raid5_get_active_stripe(struct r5conf *conf, sector_t sector, + int previous, int noblock, int noquiesce); +extern int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev); +extern void r5l_exit_log(struct r5l_log *log); +extern int r5l_write_stripe(struct r5l_log *log, struct stripe_head *head_sh); +extern void r5l_write_stripe_run(struct r5l_log *log); +extern void r5l_flush_stripe_to_raid(struct r5l_log *log); +extern void r5l_stripe_write_finished(struct stripe_head *sh); +extern int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio); +extern void r5l_quiesce(struct r5l_log *log, int state); +extern bool r5l_log_disk_error(struct r5conf *conf); #endif diff --git a/include/uapi/linux/raid/md_p.h b/include/uapi/linux/raid/md_p.h index 2ae6131e69a5..c3e654c6d518 100644 --- a/include/uapi/linux/raid/md_p.h +++ b/include/uapi/linux/raid/md_p.h @@ -89,6 +89,12 @@ * read requests will only be sent here in * dire need */ +#define MD_DISK_JOURNAL 18 /* disk is used as the write journal in RAID-5/6 */ + +#define MD_DISK_ROLE_SPARE 0xffff +#define MD_DISK_ROLE_FAULTY 0xfffe +#define MD_DISK_ROLE_JOURNAL 0xfffd +#define MD_DISK_ROLE_MAX 0xff00 /* max value of regular disk role */ typedef struct mdp_device_descriptor_s { __u32 number; /* 0 Device number in the entire set */ @@ -252,7 +258,10 @@ struct mdp_superblock_1 { __le64 data_offset; /* sector start of data, often 0 */ __le64 data_size; /* sectors in this device that can be used for data */ __le64 super_offset; /* sector start of this superblock */ - __le64 recovery_offset;/* sectors before this offset (from data_offset) have been recovered */ + union { + __le64 recovery_offset;/* sectors before this offset (from data_offset) have been recovered */ + __le64 journal_tail;/* journal tail of journal device (from data_offset) */ + }; __le32 dev_number; /* permanent identifier of this device - not role in raid */ __le32 cnt_corrected_read; /* number of read errors that were corrected by re-writing */ __u8 device_uuid[16]; /* user-space setable, ignored by kernel */ @@ -302,6 +311,8 @@ struct mdp_superblock_1 { #define MD_FEATURE_RECOVERY_BITMAP 128 /* recovery that is happening * is guided by bitmap. */ +#define MD_FEATURE_CLUSTERED 256 /* clustered MD */ +#define MD_FEATURE_JOURNAL 512 /* support write cache */ #define MD_FEATURE_ALL (MD_FEATURE_BITMAP_OFFSET \ |MD_FEATURE_RECOVERY_OFFSET \ |MD_FEATURE_RESHAPE_ACTIVE \ @@ -310,6 +321,66 @@ struct mdp_superblock_1 { |MD_FEATURE_RESHAPE_BACKWARDS \ |MD_FEATURE_NEW_OFFSET \ |MD_FEATURE_RECOVERY_BITMAP \ + |MD_FEATURE_CLUSTERED \ + |MD_FEATURE_JOURNAL \ ) +struct r5l_payload_header { + __le16 type; + __le16 flags; +} __attribute__ ((__packed__)); + +enum r5l_payload_type { + R5LOG_PAYLOAD_DATA = 0, + R5LOG_PAYLOAD_PARITY = 1, + R5LOG_PAYLOAD_FLUSH = 2, +}; + +struct r5l_payload_data_parity { + struct r5l_payload_header header; + __le32 size; /* sector. data/parity size. each 4k + * has a checksum */ + __le64 location; /* sector. For data, it's raid sector. For + * parity, it's stripe sector */ + __le32 checksum[]; +} __attribute__ ((__packed__)); + +enum r5l_payload_data_parity_flag { + R5LOG_PAYLOAD_FLAG_DISCARD = 1, /* payload is discard */ + /* + * RESHAPED/RESHAPING is only set when there is reshape activity. Note, + * both data/parity of a stripe should have the same flag set + * + * RESHAPED: reshape is running, and this stripe finished reshape + * RESHAPING: reshape is running, and this stripe isn't reshaped + */ + R5LOG_PAYLOAD_FLAG_RESHAPED = 2, + R5LOG_PAYLOAD_FLAG_RESHAPING = 3, +}; + +struct r5l_payload_flush { + struct r5l_payload_header header; + __le32 size; /* flush_stripes size, bytes */ + __le64 flush_stripes[]; +} __attribute__ ((__packed__)); + +enum r5l_payload_flush_flag { + R5LOG_PAYLOAD_FLAG_FLUSH_STRIPE = 1, /* data represents whole stripe */ +}; + +struct r5l_meta_block { + __le32 magic; + __le32 checksum; + __u8 version; + __u8 __zero_pading_1; + __le16 __zero_pading_2; + __le32 meta_size; /* whole size of the block */ + + __le64 seq; + __le64 position; /* sector, start from rdev->data_offset, current position */ + struct r5l_payload_header payloads[]; +} __attribute__ ((__packed__)); + +#define R5LOG_VERSION 0x1 +#define R5LOG_MAGIC 0x6433c509 #endif |