summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2009-08-02 21:31:40 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2009-08-02 21:31:40 -0700
commita33a052f19a21d727847391c8c1aff3fb221c472 (patch)
tree87c6a534109fac71fb84a02bfda34c15d00878b2
parent4905f92ed752d49ebe9cce4fe78a4bc39e710523 (diff)
parent449aad3e25358812c43afc60918c5ad3819488e7 (diff)
downloadlinux-a33a052f19a21d727847391c8c1aff3fb221c472.tar.bz2
Merge branch 'for-linus' of git://neil.brown.name/md
* 'for-linus' of git://neil.brown.name/md: md: Use revalidate_disk to effect changes in size of device. md: allow raid5_quiesce to work properly when reshape is happening. md/raid5: set reshape_position correctly when reshape starts. md: Handle growth of v1.x metadata correctly. md: avoid array overflow with bad v1.x metadata md: when a level change reduces the number of devices, remove the excess. md: Push down data integrity code to personalities. md/raid6: release spare page at ->stop()
-rw-r--r--drivers/md/linear.c2
-rw-r--r--drivers/md/md.c144
-rw-r--r--drivers/md/md.h2
-rw-r--r--drivers/md/multipath.c5
-rw-r--r--drivers/md/raid0.c1
-rw-r--r--drivers/md/raid1.c7
-rw-r--r--drivers/md/raid10.c4
-rw-r--r--drivers/md/raid5.c51
8 files changed, 132 insertions, 84 deletions
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 5810fa906af0..5fe39c2a3d2b 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -220,6 +220,7 @@ static int linear_run (mddev_t *mddev)
mddev->queue->unplug_fn = linear_unplug;
mddev->queue->backing_dev_info.congested_fn = linear_congested;
mddev->queue->backing_dev_info.congested_data = mddev;
+ md_integrity_register(mddev);
return 0;
}
@@ -256,6 +257,7 @@ static int linear_add(mddev_t *mddev, mdk_rdev_t *rdev)
rcu_assign_pointer(mddev->private, newconf);
md_set_array_sectors(mddev, linear_size(mddev, 0, 0));
set_capacity(mddev->gendisk, mddev->array_sectors);
+ revalidate_disk(mddev->gendisk);
call_rcu(&oldconf->rcu, free_conf);
return 0;
}
diff --git a/drivers/md/md.c b/drivers/md/md.c
index d4351ff0849f..5b98bea4ff9b 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1308,7 +1308,12 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
}
if (mddev->level != LEVEL_MULTIPATH) {
int role;
- role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
+ if (rdev->desc_nr < 0 ||
+ rdev->desc_nr >= le32_to_cpu(sb->max_dev)) {
+ role = 0xffff;
+ rdev->desc_nr = -1;
+ } else
+ role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
switch(role) {
case 0xffff: /* spare */
break;
@@ -1394,8 +1399,14 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
if (rdev2->desc_nr+1 > max_dev)
max_dev = rdev2->desc_nr+1;
- if (max_dev > le32_to_cpu(sb->max_dev))
+ if (max_dev > le32_to_cpu(sb->max_dev)) {
+ int bmask;
sb->max_dev = cpu_to_le32(max_dev);
+ rdev->sb_size = max_dev * 2 + 256;
+ bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1;
+ if (rdev->sb_size & bmask)
+ rdev->sb_size = (rdev->sb_size | bmask) + 1;
+ }
for (i=0; i<max_dev;i++)
sb->dev_roles[i] = cpu_to_le16(0xfffe);
@@ -1487,37 +1498,76 @@ static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2)
static LIST_HEAD(pending_raid_disks);
-static void md_integrity_check(mdk_rdev_t *rdev, mddev_t *mddev)
+/*
+ * Try to register data integrity profile for an mddev
+ *
+ * This is called when an array is started and after a disk has been kicked
+ * from the array. It only succeeds if all working and active component devices
+ * are integrity capable with matching profiles.
+ */
+int md_integrity_register(mddev_t *mddev)
+{
+ mdk_rdev_t *rdev, *reference = NULL;
+
+ if (list_empty(&mddev->disks))
+ return 0; /* nothing to do */
+ if (blk_get_integrity(mddev->gendisk))
+ return 0; /* already registered */
+ list_for_each_entry(rdev, &mddev->disks, same_set) {
+ /* skip spares and non-functional disks */
+ if (test_bit(Faulty, &rdev->flags))
+ continue;
+ if (rdev->raid_disk < 0)
+ continue;
+ /*
+ * If at least one rdev is not integrity capable, we can not
+ * enable data integrity for the md device.
+ */
+ if (!bdev_get_integrity(rdev->bdev))
+ return -EINVAL;
+ if (!reference) {
+ /* Use the first rdev as the reference */
+ reference = rdev;
+ continue;
+ }
+ /* does this rdev's profile match the reference profile? */
+ if (blk_integrity_compare(reference->bdev->bd_disk,
+ rdev->bdev->bd_disk) < 0)
+ return -EINVAL;
+ }
+ /*
+ * All component devices are integrity capable and have matching
+ * profiles, register the common profile for the md device.
+ */
+ if (blk_integrity_register(mddev->gendisk,
+ bdev_get_integrity(reference->bdev)) != 0) {
+ printk(KERN_ERR "md: failed to register integrity for %s\n",
+ mdname(mddev));
+ return -EINVAL;
+ }
+ printk(KERN_NOTICE "md: data integrity on %s enabled\n",
+ mdname(mddev));
+ return 0;
+}
+EXPORT_SYMBOL(md_integrity_register);
+
+/* Disable data integrity if non-capable/non-matching disk is being added */
+void md_integrity_add_rdev(mdk_rdev_t *rdev, mddev_t *mddev)
{
- struct mdk_personality *pers = mddev->pers;
- struct gendisk *disk = mddev->gendisk;
struct blk_integrity *bi_rdev = bdev_get_integrity(rdev->bdev);
- struct blk_integrity *bi_mddev = blk_get_integrity(disk);
+ struct blk_integrity *bi_mddev = blk_get_integrity(mddev->gendisk);
- /* Data integrity passthrough not supported on RAID 4, 5 and 6 */
- if (pers && pers->level >= 4 && pers->level <= 6)
+ if (!bi_mddev) /* nothing to do */
return;
-
- /* If rdev is integrity capable, register profile for mddev */
- if (!bi_mddev && bi_rdev) {
- if (blk_integrity_register(disk, bi_rdev))
- printk(KERN_ERR "%s: %s Could not register integrity!\n",
- __func__, disk->disk_name);
- else
- printk(KERN_NOTICE "Enabling data integrity on %s\n",
- disk->disk_name);
+ if (rdev->raid_disk < 0) /* skip spares */
return;
- }
-
- /* Check that mddev and rdev have matching profiles */
- if (blk_integrity_compare(disk, rdev->bdev->bd_disk) < 0) {
- printk(KERN_ERR "%s: %s/%s integrity mismatch!\n", __func__,
- disk->disk_name, rdev->bdev->bd_disk->disk_name);
- printk(KERN_NOTICE "Disabling data integrity on %s\n",
- disk->disk_name);
- blk_integrity_unregister(disk);
- }
+ if (bi_rdev && blk_integrity_compare(mddev->gendisk,
+ rdev->bdev->bd_disk) >= 0)
+ return;
+ printk(KERN_NOTICE "disabling data integrity on %s\n", mdname(mddev));
+ blk_integrity_unregister(mddev->gendisk);
}
+EXPORT_SYMBOL(md_integrity_add_rdev);
static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
{
@@ -1591,7 +1641,6 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
/* May as well allow recovery to be retried once */
mddev->recovery_disabled = 0;
- md_integrity_check(rdev, mddev);
return 0;
fail:
@@ -2657,6 +2706,7 @@ level_store(mddev_t *mddev, const char *buf, size_t len)
ssize_t rv = len;
struct mdk_personality *pers;
void *priv;
+ mdk_rdev_t *rdev;
if (mddev->pers == NULL) {
if (len == 0)
@@ -2736,6 +2786,12 @@ level_store(mddev_t *mddev, const char *buf, size_t len)
mddev_suspend(mddev);
mddev->pers->stop(mddev);
module_put(mddev->pers->owner);
+ /* Invalidate devices that are now superfluous */
+ list_for_each_entry(rdev, &mddev->disks, same_set)
+ if (rdev->raid_disk >= mddev->raid_disks) {
+ rdev->raid_disk = -1;
+ clear_bit(In_sync, &rdev->flags);
+ }
mddev->pers = pers;
mddev->private = priv;
strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
@@ -3685,17 +3741,8 @@ array_size_store(mddev_t *mddev, const char *buf, size_t len)
mddev->array_sectors = sectors;
set_capacity(mddev->gendisk, mddev->array_sectors);
- if (mddev->pers) {
- struct block_device *bdev = bdget_disk(mddev->gendisk, 0);
-
- if (bdev) {
- mutex_lock(&bdev->bd_inode->i_mutex);
- i_size_write(bdev->bd_inode,
- (loff_t)mddev->array_sectors << 9);
- mutex_unlock(&bdev->bd_inode->i_mutex);
- bdput(bdev);
- }
- }
+ if (mddev->pers)
+ revalidate_disk(mddev->gendisk);
return len;
}
@@ -4048,10 +4095,6 @@ static int do_md_run(mddev_t * mddev)
}
strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
- if (pers->level >= 4 && pers->level <= 6)
- /* Cannot support integrity (yet) */
- blk_integrity_unregister(mddev->gendisk);
-
if (mddev->reshape_position != MaxSector &&
pers->start_reshape == NULL) {
/* This personality cannot handle reshaping... */
@@ -4189,6 +4232,7 @@ static int do_md_run(mddev_t * mddev)
md_wakeup_thread(mddev->thread);
md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
+ revalidate_disk(mddev->gendisk);
mddev->changed = 1;
md_new_event(mddev);
sysfs_notify_dirent(mddev->sysfs_state);
@@ -5087,18 +5131,8 @@ static int update_size(mddev_t *mddev, sector_t num_sectors)
return -ENOSPC;
}
rv = mddev->pers->resize(mddev, num_sectors);
- if (!rv) {
- struct block_device *bdev;
-
- bdev = bdget_disk(mddev->gendisk, 0);
- if (bdev) {
- mutex_lock(&bdev->bd_inode->i_mutex);
- i_size_write(bdev->bd_inode,
- (loff_t)mddev->array_sectors << 9);
- mutex_unlock(&bdev->bd_inode->i_mutex);
- bdput(bdev);
- }
- }
+ if (!rv)
+ revalidate_disk(mddev->gendisk);
return rv;
}
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 9430a110db93..78f03168baf9 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -431,5 +431,7 @@ extern int md_allow_write(mddev_t *mddev);
extern void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev);
extern void md_set_array_sectors(mddev_t *mddev, sector_t array_sectors);
extern int md_check_no_bitmap(mddev_t *mddev);
+extern int md_integrity_register(mddev_t *mddev);
+void md_integrity_add_rdev(mdk_rdev_t *rdev, mddev_t *mddev);
#endif /* _MD_MD_H */
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index 237fe3fd235c..7140909f6662 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -313,6 +313,7 @@ static int multipath_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
set_bit(In_sync, &rdev->flags);
rcu_assign_pointer(p->rdev, rdev);
err = 0;
+ md_integrity_add_rdev(rdev, mddev);
break;
}
@@ -345,7 +346,9 @@ static int multipath_remove_disk(mddev_t *mddev, int number)
/* lost the race, try later */
err = -EBUSY;
p->rdev = rdev;
+ goto abort;
}
+ md_integrity_register(mddev);
}
abort:
@@ -519,7 +522,7 @@ static int multipath_run (mddev_t *mddev)
mddev->queue->unplug_fn = multipath_unplug;
mddev->queue->backing_dev_info.congested_fn = multipath_congested;
mddev->queue->backing_dev_info.congested_data = mddev;
-
+ md_integrity_register(mddev);
return 0;
out_free_conf:
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 335f490dcad6..898e2bdfee47 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -351,6 +351,7 @@ static int raid0_run(mddev_t *mddev)
blk_queue_merge_bvec(mddev->queue, raid0_mergeable_bvec);
dump_zones(mddev);
+ md_integrity_register(mddev);
return 0;
}
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 0569efba0c02..8726fd7ebce5 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1144,7 +1144,7 @@ static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
rcu_assign_pointer(p->rdev, rdev);
break;
}
-
+ md_integrity_add_rdev(rdev, mddev);
print_conf(conf);
return err;
}
@@ -1178,7 +1178,9 @@ static int raid1_remove_disk(mddev_t *mddev, int number)
/* lost the race, try later */
err = -EBUSY;
p->rdev = rdev;
+ goto abort;
}
+ md_integrity_register(mddev);
}
abort:
@@ -2067,7 +2069,7 @@ static int run(mddev_t *mddev)
mddev->queue->unplug_fn = raid1_unplug;
mddev->queue->backing_dev_info.congested_fn = raid1_congested;
mddev->queue->backing_dev_info.congested_data = mddev;
-
+ md_integrity_register(mddev);
return 0;
out_no_mem:
@@ -2132,6 +2134,7 @@ static int raid1_resize(mddev_t *mddev, sector_t sectors)
return -EINVAL;
set_capacity(mddev->gendisk, mddev->array_sectors);
mddev->changed = 1;
+ revalidate_disk(mddev->gendisk);
if (sectors > mddev->dev_sectors &&
mddev->recovery_cp == MaxSector) {
mddev->recovery_cp = mddev->dev_sectors;
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 7298a5e5a183..3d9020cf6f6e 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1170,6 +1170,7 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
break;
}
+ md_integrity_add_rdev(rdev, mddev);
print_conf(conf);
return err;
}
@@ -1203,7 +1204,9 @@ static int raid10_remove_disk(mddev_t *mddev, int number)
/* lost the race, try later */
err = -EBUSY;
p->rdev = rdev;
+ goto abort;
}
+ md_integrity_register(mddev);
}
abort:
@@ -2225,6 +2228,7 @@ static int run(mddev_t *mddev)
if (conf->near_copies < mddev->raid_disks)
blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec);
+ md_integrity_register(mddev);
return 0;
out_free_conf:
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 37835538b58e..2b521ee67dfa 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -3999,6 +3999,9 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski
return 0;
}
+ /* Allow raid5_quiesce to complete */
+ wait_event(conf->wait_for_overlap, conf->quiesce != 2);
+
if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
return reshape_request(mddev, sector_nr, skipped);
@@ -4316,6 +4319,15 @@ raid5_size(mddev_t *mddev, sector_t sectors, int raid_disks)
return sectors * (raid_disks - conf->max_degraded);
}
+static void free_conf(raid5_conf_t *conf)
+{
+ shrink_stripes(conf);
+ safe_put_page(conf->spare_page);
+ kfree(conf->disks);
+ kfree(conf->stripe_hashtbl);
+ kfree(conf);
+}
+
static raid5_conf_t *setup_conf(mddev_t *mddev)
{
raid5_conf_t *conf;
@@ -4447,11 +4459,7 @@ static raid5_conf_t *setup_conf(mddev_t *mddev)
abort:
if (conf) {
- shrink_stripes(conf);
- safe_put_page(conf->spare_page);
- kfree(conf->disks);
- kfree(conf->stripe_hashtbl);
- kfree(conf);
+ free_conf(conf);
return ERR_PTR(-EIO);
} else
return ERR_PTR(-ENOMEM);
@@ -4629,12 +4637,8 @@ abort:
md_unregister_thread(mddev->thread);
mddev->thread = NULL;
if (conf) {
- shrink_stripes(conf);
print_raid5_conf(conf);
- safe_put_page(conf->spare_page);
- kfree(conf->disks);
- kfree(conf->stripe_hashtbl);
- kfree(conf);
+ free_conf(conf);
}
mddev->private = NULL;
printk(KERN_ALERT "raid5: failed to run raid set %s\n", mdname(mddev));
@@ -4649,13 +4653,10 @@ static int stop(mddev_t *mddev)
md_unregister_thread(mddev->thread);
mddev->thread = NULL;
- shrink_stripes(conf);
- kfree(conf->stripe_hashtbl);
mddev->queue->backing_dev_info.congested_fn = NULL;
blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
sysfs_remove_group(&mddev->kobj, &raid5_attrs_group);
- kfree(conf->disks);
- kfree(conf);
+ free_conf(conf);
mddev->private = NULL;
return 0;
}
@@ -4857,6 +4858,7 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors)
return -EINVAL;
set_capacity(mddev->gendisk, mddev->array_sectors);
mddev->changed = 1;
+ revalidate_disk(mddev->gendisk);
if (sectors > mddev->dev_sectors && mddev->recovery_cp == MaxSector) {
mddev->recovery_cp = mddev->dev_sectors;
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
@@ -5002,7 +5004,7 @@ static int raid5_start_reshape(mddev_t *mddev)
spin_unlock_irqrestore(&conf->device_lock, flags);
}
mddev->raid_disks = conf->raid_disks;
- mddev->reshape_position = 0;
+ mddev->reshape_position = conf->reshape_progress;
set_bit(MD_CHANGE_DEVS, &mddev->flags);
clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
@@ -5057,7 +5059,6 @@ static void end_reshape(raid5_conf_t *conf)
*/
static void raid5_finish_reshape(mddev_t *mddev)
{
- struct block_device *bdev;
raid5_conf_t *conf = mddev->private;
if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
@@ -5066,15 +5067,7 @@ static void raid5_finish_reshape(mddev_t *mddev)
md_set_array_sectors(mddev, raid5_size(mddev, 0, 0));
set_capacity(mddev->gendisk, mddev->array_sectors);
mddev->changed = 1;
-
- bdev = bdget_disk(mddev->gendisk, 0);
- if (bdev) {
- mutex_lock(&bdev->bd_inode->i_mutex);
- i_size_write(bdev->bd_inode,
- (loff_t)mddev->array_sectors << 9);
- mutex_unlock(&bdev->bd_inode->i_mutex);
- bdput(bdev);
- }
+ revalidate_disk(mddev->gendisk);
} else {
int d;
mddev->degraded = conf->raid_disks;
@@ -5106,12 +5099,18 @@ static void raid5_quiesce(mddev_t *mddev, int state)
case 1: /* stop all writes */
spin_lock_irq(&conf->device_lock);
- conf->quiesce = 1;
+ /* '2' tells resync/reshape to pause so that all
+ * active stripes can drain
+ */
+ conf->quiesce = 2;
wait_event_lock_irq(conf->wait_for_stripe,
atomic_read(&conf->active_stripes) == 0 &&
atomic_read(&conf->active_aligned_reads) == 0,
conf->device_lock, /* nothing */);
+ conf->quiesce = 1;
spin_unlock_irq(&conf->device_lock);
+ /* allow reshape to continue */
+ wake_up(&conf->wait_for_overlap);
break;
case 0: /* re-enable writes */