From 4b80991c6cb9efa607bc4fd6f3ecdf5511c31bb0 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 21 Jul 2008 17:05:25 +1000 Subject: md: Protect access to mddev->disks list using RCU All modifications and most access to the mddev->disks list are made under the reconfig_mutex lock. However there are three places where the list is walked without any locking. If a reconfig happens at this time, havoc (and oops) can ensue. So use RCU to protect these accesses: - wrap them in rcu_read_{,un}lock() - use list_for_each_entry_rcu - add to the list with list_add_rcu - delete from the list with list_del_rcu - delay the 'free' with call_rcu rather than schedule_work Note that export_rdev did a list_del_init on this list. In almost all cases the entry was not in the list anymore so it was a no-op and so safe. It is no longer safe as after list_del_rcu we may not touch the list_head. An audit shows that export_rdev is called: - after unbind_rdev_from_array, in which case the delete has already been done, - after bind_rdev_to_array fails, in which case the delete isn't needed. - before the device has been put on a list at all (e.g. in add_new_disk where reading the superblock fails). - and in autorun devices after a failure when the device is on a different list. So remove the list_del_init call from export_rdev, and add it back immediately before the called to export_rdev for that last case. Note also that ->same_set is sometimes used for lists other than mddev->list (e.g. candidates). In these cases rcu is not needed. Signed-off-by: NeilBrown --- drivers/md/bitmap.c | 15 ++++++++++----- drivers/md/md.c | 30 ++++++++++++++++++------------ include/linux/raid/md_k.h | 3 +++ 3 files changed, 31 insertions(+), 17 deletions(-) diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index eba83e25b678..621a272a2c74 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -241,10 +241,10 @@ static struct page *read_sb_page(mddev_t *mddev, long offset, unsigned long inde static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait) { mdk_rdev_t *rdev; - struct list_head *tmp; mddev_t *mddev = bitmap->mddev; - rdev_for_each(rdev, tmp, mddev) + rcu_read_lock(); + rdev_for_each_rcu(rdev, mddev) if (test_bit(In_sync, &rdev->flags) && !test_bit(Faulty, &rdev->flags)) { int size = PAGE_SIZE; @@ -260,11 +260,11 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait) + (long)(page->index * (PAGE_SIZE/512)) + size/512 > 0) /* bitmap runs in to metadata */ - return -EINVAL; + goto bad_alignment; if (rdev->data_offset + mddev->size*2 > rdev->sb_start + bitmap->offset) /* data runs in to bitmap */ - return -EINVAL; + goto bad_alignment; } else if (rdev->sb_start < rdev->data_offset) { /* METADATA BITMAP DATA */ if (rdev->sb_start @@ -272,7 +272,7 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait) + page->index*(PAGE_SIZE/512) + size/512 > rdev->data_offset) /* bitmap runs in to data */ - return -EINVAL; + goto bad_alignment; } else { /* DATA METADATA BITMAP - no problems */ } @@ -282,10 +282,15 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait) size, page); } + rcu_read_unlock(); if (wait) md_super_wait(mddev); return 0; + + bad_alignment: + rcu_read_unlock(); + return -EINVAL; } static void bitmap_file_kick(struct bitmap *bitmap); diff --git a/drivers/md/md.c b/drivers/md/md.c index 450f30b6edf9..c2ff77ccec50 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1395,15 +1395,17 @@ static struct super_type super_types[] = { static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2) { - struct list_head *tmp, *tmp2; mdk_rdev_t *rdev, *rdev2; - rdev_for_each(rdev, tmp, mddev1) - rdev_for_each(rdev2, tmp2, mddev2) + rcu_read_lock(); + rdev_for_each_rcu(rdev, mddev1) + rdev_for_each_rcu(rdev2, mddev2) if (rdev->bdev->bd_contains == - rdev2->bdev->bd_contains) + rdev2->bdev->bd_contains) { + rcu_read_unlock(); return 1; - + } + rcu_read_unlock(); return 0; } @@ -1470,7 +1472,7 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) kobject_del(&rdev->kobj); goto fail; } - list_add(&rdev->same_set, &mddev->disks); + list_add_rcu(&rdev->same_set, &mddev->disks); bd_claim_by_disk(rdev->bdev, rdev->bdev->bd_holder, mddev->gendisk); return 0; @@ -1495,14 +1497,16 @@ static void unbind_rdev_from_array(mdk_rdev_t * rdev) return; } bd_release_from_disk(rdev->bdev, rdev->mddev->gendisk); - list_del_init(&rdev->same_set); + list_del_rcu(&rdev->same_set); printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b)); rdev->mddev = NULL; sysfs_remove_link(&rdev->kobj, "block"); /* We need to delay this, otherwise we can deadlock when - * writing to 'remove' to "dev/state" + * writing to 'remove' to "dev/state". We also need + * to delay it due to rcu usage. */ + synchronize_rcu(); INIT_WORK(&rdev->del_work, md_delayed_delete); kobject_get(&rdev->kobj); schedule_work(&rdev->del_work); @@ -1558,7 +1562,6 @@ static void export_rdev(mdk_rdev_t * rdev) if (rdev->mddev) MD_BUG(); free_disk_sb(rdev); - list_del_init(&rdev->same_set); #ifndef MODULE if (test_bit(AutoDetected, &rdev->flags)) md_autodetect_dev(rdev->bdev->bd_dev); @@ -4062,8 +4065,10 @@ static void autorun_devices(int part) /* on success, candidates will be empty, on error * it won't... */ - rdev_for_each_list(rdev, tmp, candidates) + rdev_for_each_list(rdev, tmp, candidates) { + list_del_init(&rdev->same_set); export_rdev(rdev); + } mddev_put(mddev); } printk(KERN_INFO "md: ... autorun DONE.\n"); @@ -5529,12 +5534,12 @@ int unregister_md_personality(struct mdk_personality *p) static int is_mddev_idle(mddev_t *mddev) { mdk_rdev_t * rdev; - struct list_head *tmp; int idle; long curr_events; idle = 1; - rdev_for_each(rdev, tmp, mddev) { + rcu_read_lock(); + rdev_for_each_rcu(rdev, mddev) { struct gendisk *disk = rdev->bdev->bd_contains->bd_disk; curr_events = disk_stat_read(disk, sectors[0]) + disk_stat_read(disk, sectors[1]) - @@ -5566,6 +5571,7 @@ static int is_mddev_idle(mddev_t *mddev) idle = 0; } } + rcu_read_unlock(); return idle; } diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h index 4bef4791d80d..9f2549ac0e2d 100644 --- a/include/linux/raid/md_k.h +++ b/include/linux/raid/md_k.h @@ -339,6 +339,9 @@ static inline char * mdname (mddev_t * mddev) #define rdev_for_each(rdev, tmp, mddev) \ rdev_for_each_list(rdev, tmp, (mddev)->disks) +#define rdev_for_each_rcu(rdev, mddev) \ + list_for_each_entry_rcu(rdev, &((mddev)->disks), same_set) + typedef struct mdk_thread_s { void (*run) (mddev_t *mddev); mddev_t *mddev; -- cgit v1.2.3