From cc27b0c78c79680d128dbac79de0d40556d041bb Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 5 Jun 2017 16:49:39 +1000 Subject: md: fix deadlock between mddev_suspend() and md_write_start() If mddev_suspend() races with md_write_start() we can deadlock with mddev_suspend() waiting for the request that is currently in md_write_start() to complete the ->make_request() call, and md_write_start() waiting for the metadata to be updated to mark the array as 'dirty'. As metadata updates done by md_check_recovery() only happen then the mddev_lock() can be claimed, and as mddev_suspend() is often called with the lock held, these threads wait indefinitely for each other. We fix this by having md_write_start() abort if mddev_suspend() is happening, and ->make_request() aborts if md_write_start() aborted. md_make_request() can detect this abort, decrease the ->active_io count, and wait for mddev_suspend(). Reported-by: Nix Fix: 68866e425be2(MD: no sync IO while suspended) Cc: stable@vger.kernel.org Signed-off-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/faulty.c | 5 +++-- drivers/md/linear.c | 7 ++++--- drivers/md/md.c | 22 +++++++++++++++++----- drivers/md/md.h | 4 ++-- drivers/md/multipath.c | 8 ++++---- drivers/md/raid0.c | 7 ++++--- drivers/md/raid1.c | 11 +++++++---- drivers/md/raid10.c | 10 ++++++---- drivers/md/raid5.c | 17 +++++++++-------- 9 files changed, 56 insertions(+), 35 deletions(-) (limited to 'drivers') diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c index b0536cfd8e17..06a64d5d8c6c 100644 --- a/drivers/md/faulty.c +++ b/drivers/md/faulty.c @@ -170,7 +170,7 @@ static void add_sector(struct faulty_conf *conf, sector_t start, int mode) conf->nfaults = n+1; } -static void faulty_make_request(struct mddev *mddev, struct bio *bio) +static bool faulty_make_request(struct mddev *mddev, struct bio *bio) { struct faulty_conf *conf = mddev->private; int failit = 0; @@ -182,7 +182,7 @@ static void faulty_make_request(struct mddev *mddev, struct bio *bio) * just fail immediately */ bio_io_error(bio); - return; + return true; } if (check_sector(conf, bio->bi_iter.bi_sector, @@ -224,6 +224,7 @@ static void faulty_make_request(struct mddev *mddev, struct bio *bio) bio->bi_bdev = conf->rdev->bdev; generic_make_request(bio); + return true; } static void faulty_status(struct seq_file *seq, struct mddev *mddev) diff --git a/drivers/md/linear.c b/drivers/md/linear.c index df6f2c98eca7..5f1eb9189542 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -245,7 +245,7 @@ static void linear_free(struct mddev *mddev, void *priv) kfree(conf); } -static void linear_make_request(struct mddev *mddev, struct bio *bio) +static bool linear_make_request(struct mddev *mddev, struct bio *bio) { char b[BDEVNAME_SIZE]; struct dev_info *tmp_dev; @@ -254,7 +254,7 @@ static void linear_make_request(struct mddev *mddev, struct bio *bio) if (unlikely(bio->bi_opf & REQ_PREFLUSH)) { md_flush_request(mddev, bio); - return; + return true; } tmp_dev = which_dev(mddev, bio_sector); @@ -292,7 +292,7 @@ static void linear_make_request(struct mddev *mddev, struct bio *bio) mddev_check_write_zeroes(mddev, bio); generic_make_request(bio); } - return; + return true; out_of_bounds: pr_err("md/linear:%s: make_request: Sector %llu out of bounds on dev %s: %llu sectors, offset %llu\n", @@ -302,6 +302,7 @@ out_of_bounds: (unsigned long long)tmp_dev->rdev->sectors, (unsigned long long)start_sector); bio_io_error(bio); + return true; } static void linear_status (struct seq_file *seq, struct mddev *mddev) diff --git a/drivers/md/md.c b/drivers/md/md.c index 87edc342ccb3..d7847014821a 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -277,7 +277,7 @@ static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio) bio_endio(bio); return BLK_QC_T_NONE; } - smp_rmb(); /* Ensure implications of 'active' are visible */ +check_suspended: rcu_read_lock(); if (mddev->suspended) { DEFINE_WAIT(__wait); @@ -302,7 +302,11 @@ static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio) sectors = bio_sectors(bio); /* bio could be mergeable after passing to underlayer */ bio->bi_opf &= ~REQ_NOMERGE; - mddev->pers->make_request(mddev, bio); + if (!mddev->pers->make_request(mddev, bio)) { + atomic_dec(&mddev->active_io); + wake_up(&mddev->sb_wait); + goto check_suspended; + } cpu = part_stat_lock(); part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]); @@ -327,6 +331,7 @@ void mddev_suspend(struct mddev *mddev) if (mddev->suspended++) return; synchronize_rcu(); + wake_up(&mddev->sb_wait); wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0); mddev->pers->quiesce(mddev, 1); @@ -7950,12 +7955,14 @@ EXPORT_SYMBOL(md_done_sync); * If we need to update some array metadata (e.g. 'active' flag * in superblock) before writing, schedule a superblock update * and wait for it to complete. + * A return value of 'false' means that the write wasn't recorded + * and cannot proceed as the array is being suspend. */ -void md_write_start(struct mddev *mddev, struct bio *bi) +bool md_write_start(struct mddev *mddev, struct bio *bi) { int did_change = 0; if (bio_data_dir(bi) != WRITE) - return; + return true; BUG_ON(mddev->ro == 1); if (mddev->ro == 2) { @@ -7987,7 +7994,12 @@ void md_write_start(struct mddev *mddev, struct bio *bi) if (did_change) sysfs_notify_dirent_safe(mddev->sysfs_state); wait_event(mddev->sb_wait, - !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); + !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags) && !mddev->suspended); + if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) { + percpu_ref_put(&mddev->writes_pending); + return false; + } + return true; } EXPORT_SYMBOL(md_write_start); diff --git a/drivers/md/md.h b/drivers/md/md.h index 0fa1de42c42b..63d342d560b8 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -510,7 +510,7 @@ struct md_personality int level; struct list_head list; struct module *owner; - void (*make_request)(struct mddev *mddev, struct bio *bio); + bool (*make_request)(struct mddev *mddev, struct bio *bio); int (*run)(struct mddev *mddev); void (*free)(struct mddev *mddev, void *priv); void (*status)(struct seq_file *seq, struct mddev *mddev); @@ -649,7 +649,7 @@ extern void md_wakeup_thread(struct md_thread *thread); extern void md_check_recovery(struct mddev *mddev); extern void md_reap_sync_thread(struct mddev *mddev); extern int mddev_init_writes_pending(struct mddev *mddev); -extern void md_write_start(struct mddev *mddev, struct bio *bi); +extern bool md_write_start(struct mddev *mddev, struct bio *bi); extern void md_write_inc(struct mddev *mddev, struct bio *bi); extern void md_write_end(struct mddev *mddev); extern void md_done_sync(struct mddev *mddev, int blocks, int ok); diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index e95d521d93e9..c8d985ba008d 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -106,7 +106,7 @@ static void multipath_end_request(struct bio *bio) rdev_dec_pending(rdev, conf->mddev); } -static void multipath_make_request(struct mddev *mddev, struct bio * bio) +static bool multipath_make_request(struct mddev *mddev, struct bio * bio) { struct mpconf *conf = mddev->private; struct multipath_bh * mp_bh; @@ -114,7 +114,7 @@ static void multipath_make_request(struct mddev *mddev, struct bio * bio) if (unlikely(bio->bi_opf & REQ_PREFLUSH)) { md_flush_request(mddev, bio); - return; + return true; } mp_bh = mempool_alloc(conf->pool, GFP_NOIO); @@ -126,7 +126,7 @@ static void multipath_make_request(struct mddev *mddev, struct bio * bio) if (mp_bh->path < 0) { bio_io_error(bio); mempool_free(mp_bh, conf->pool); - return; + return true; } multipath = conf->multipaths + mp_bh->path; @@ -141,7 +141,7 @@ static void multipath_make_request(struct mddev *mddev, struct bio * bio) mddev_check_writesame(mddev, &mp_bh->bio); mddev_check_write_zeroes(mddev, &mp_bh->bio); generic_make_request(&mp_bh->bio); - return; + return true; } static void multipath_status(struct seq_file *seq, struct mddev *mddev) diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index d6c0bc76e837..94d9ae9b0fd0 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -548,7 +548,7 @@ static void raid0_handle_discard(struct mddev *mddev, struct bio *bio) bio_endio(bio); } -static void raid0_make_request(struct mddev *mddev, struct bio *bio) +static bool raid0_make_request(struct mddev *mddev, struct bio *bio) { struct strip_zone *zone; struct md_rdev *tmp_dev; @@ -559,12 +559,12 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio) if (unlikely(bio->bi_opf & REQ_PREFLUSH)) { md_flush_request(mddev, bio); - return; + return true; } if (unlikely((bio_op(bio) == REQ_OP_DISCARD))) { raid0_handle_discard(mddev, bio); - return; + return true; } bio_sector = bio->bi_iter.bi_sector; @@ -599,6 +599,7 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio) mddev_check_writesame(mddev, bio); mddev_check_write_zeroes(mddev, bio); generic_make_request(bio); + return true; } static void raid0_status(struct seq_file *seq, struct mddev *mddev) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index e1a7e3d4c5e4..c71739b87ab7 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1321,7 +1321,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, * Continue immediately if no resync is active currently. */ - md_write_start(mddev, bio); /* wait on superblock update early */ if ((bio_end_sector(bio) > mddev->suspend_lo && bio->bi_iter.bi_sector < mddev->suspend_hi) || @@ -1550,13 +1549,13 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, wake_up(&conf->wait_barrier); } -static void raid1_make_request(struct mddev *mddev, struct bio *bio) +static bool raid1_make_request(struct mddev *mddev, struct bio *bio) { sector_t sectors; if (unlikely(bio->bi_opf & REQ_PREFLUSH)) { md_flush_request(mddev, bio); - return; + return true; } /* @@ -1571,8 +1570,12 @@ static void raid1_make_request(struct mddev *mddev, struct bio *bio) if (bio_data_dir(bio) == READ) raid1_read_request(mddev, bio, sectors, NULL); - else + else { + if (!md_write_start(mddev,bio)) + return false; raid1_write_request(mddev, bio, sectors); + } + return true; } static void raid1_status(struct seq_file *seq, struct mddev *mddev) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 797ed60abd5e..52acffa7a06a 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1303,8 +1303,6 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio, sector_t sectors; int max_sectors; - md_write_start(mddev, bio); - /* * Register the new request and wait if the reconstruction * thread has put up a bar for new requests. @@ -1525,7 +1523,7 @@ static void __make_request(struct mddev *mddev, struct bio *bio, int sectors) raid10_write_request(mddev, bio, r10_bio); } -static void raid10_make_request(struct mddev *mddev, struct bio *bio) +static bool raid10_make_request(struct mddev *mddev, struct bio *bio) { struct r10conf *conf = mddev->private; sector_t chunk_mask = (conf->geo.chunk_mask & conf->prev.chunk_mask); @@ -1534,9 +1532,12 @@ static void raid10_make_request(struct mddev *mddev, struct bio *bio) if (unlikely(bio->bi_opf & REQ_PREFLUSH)) { md_flush_request(mddev, bio); - return; + return true; } + if (!md_write_start(mddev, bio)) + return false; + /* * If this request crosses a chunk boundary, we need to split * it. @@ -1553,6 +1554,7 @@ static void raid10_make_request(struct mddev *mddev, struct bio *bio) /* In case raid10d snuck in to freeze_array */ wake_up(&conf->wait_barrier); + return true; } static void raid10_status(struct seq_file *seq, struct mddev *mddev) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index ec0f951ae19f..b218a42fd702 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5479,7 +5479,6 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi) last_sector = bi->bi_iter.bi_sector + (bi->bi_iter.bi_size>>9); bi->bi_next = NULL; - md_write_start(mddev, bi); stripe_sectors = conf->chunk_sectors * (conf->raid_disks - conf->max_degraded); @@ -5549,11 +5548,10 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi) release_stripe_plug(mddev, sh); } - md_write_end(mddev); bio_endio(bi); } -static void raid5_make_request(struct mddev *mddev, struct bio * bi) +static bool raid5_make_request(struct mddev *mddev, struct bio * bi) { struct r5conf *conf = mddev->private; int dd_idx; @@ -5569,10 +5567,10 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi) int ret = r5l_handle_flush_request(conf->log, bi); if (ret == 0) - return; + return true; if (ret == -ENODEV) { md_flush_request(mddev, bi); - return; + return true; } /* ret == -EAGAIN, fallback */ /* @@ -5582,6 +5580,8 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi) do_flush = bi->bi_opf & REQ_PREFLUSH; } + if (!md_write_start(mddev, bi)) + return false; /* * If array is degraded, better not do chunk aligned read because * later we might have to read it again in order to reconstruct @@ -5591,18 +5591,18 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi) mddev->reshape_position == MaxSector) { bi = chunk_aligned_read(mddev, bi); if (!bi) - return; + return true; } if (unlikely(bio_op(bi) == REQ_OP_DISCARD)) { make_discard_request(mddev, bi); - return; + md_write_end(mddev); + return true; } logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1); last_sector = bio_end_sector(bi); bi->bi_next = NULL; - md_write_start(mddev, bi); prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) { @@ -5740,6 +5740,7 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi) if (rw == WRITE) md_write_end(mddev); bio_endio(bi); + return true; } static sector_t raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks); -- cgit v1.2.3 From f9c79bc05a2a91f4fba8bfd653579e066714b1ec Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Wed, 7 Jun 2017 19:05:31 -0400 Subject: md: don't use flush_signals in userspace processes The function flush_signals clears all pending signals for the process. It may be used by kernel threads when we need to prepare a kernel thread for responding to signals. However using this function for an userspaces processes is incorrect - clearing signals without the program expecting it can cause misbehavior. The raid1 and raid5 code uses flush_signals in its request routine because it wants to prepare for an interruptible wait. This patch drops flush_signals and uses sigprocmask instead to block all signals (including SIGKILL) around the schedule() call. The signals are not lost, but the schedule() call won't respond to them. Signed-off-by: Mikulas Patocka Cc: stable@vger.kernel.org Acked-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/raid1.c | 5 ++++- drivers/md/raid5.c | 5 ++++- 2 files changed, 8 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index c71739b87ab7..7866563338fa 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1334,7 +1334,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, */ DEFINE_WAIT(w); for (;;) { - flush_signals(current); + sigset_t full, old; prepare_to_wait(&conf->wait_barrier, &w, TASK_INTERRUPTIBLE); if (bio_end_sector(bio) <= mddev->suspend_lo || @@ -1344,7 +1344,10 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, bio->bi_iter.bi_sector, bio_end_sector(bio)))) break; + sigfillset(&full); + sigprocmask(SIG_BLOCK, &full, &old); schedule(); + sigprocmask(SIG_SETMASK, &old, NULL); } finish_wait(&conf->wait_barrier, &w); } diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index b218a42fd702..547d5fa45a42 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5693,12 +5693,15 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi) * userspace, we want an interruptible * wait. */ - flush_signals(current); prepare_to_wait(&conf->wait_for_overlap, &w, TASK_INTERRUPTIBLE); if (logical_sector >= mddev->suspend_lo && logical_sector < mddev->suspend_hi) { + sigset_t full, old; + sigfillset(&full); + sigprocmask(SIG_BLOCK, &full, &old); schedule(); + sigprocmask(SIG_SETMASK, &old, NULL); do_prepare = true; } goto retry; -- cgit v1.2.3 From 1cdd1257949c85c5ddff8313fe3b1e39c5bee8b8 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Tue, 13 Jun 2017 11:16:08 +0800 Subject: md/raid10: fix FailFast test for wrong device We need to test FailFast flag for replacement device here since the set up for writing is for the replacement, so we need fix it like: - if (test_bit(FailFast, &conf->mirrors[d].rdev->flags)) + if (test_bit(FailFast, &conf->mirrors[d].replacement->flags)) Since commit f90145f317ef ("md/raid10: add rcu protection to rdev access in raid10_sync_request.") had added the rcu protection for the part, so let's extend the range protected by rcu and use rdev directly. Fixes: 1919cbb ("md/raid10: add failfast handling for writes.") Reviewed-by: NeilBrown Signed-off-by: Guoqing Jiang Signed-off-by: Shaohua Li --- drivers/md/raid10.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'drivers') diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 52acffa7a06a..305b0db5a293 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -3295,7 +3295,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, biolist = bio; bio->bi_end_io = end_sync_read; bio_set_op_attrs(bio, REQ_OP_READ, 0); - if (test_bit(FailFast, &conf->mirrors[d].rdev->flags)) + if (test_bit(FailFast, &rdev->flags)) bio->bi_opf |= MD_FAILFAST; bio->bi_iter.bi_sector = sector + rdev->data_offset; bio->bi_bdev = rdev->bdev; @@ -3307,7 +3307,6 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, continue; } atomic_inc(&rdev->nr_pending); - rcu_read_unlock(); /* Need to set up for writing to the replacement */ bio = r10_bio->devs[i].repl_bio; @@ -3318,11 +3317,12 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, biolist = bio; bio->bi_end_io = end_sync_write; bio_set_op_attrs(bio, REQ_OP_WRITE, 0); - if (test_bit(FailFast, &conf->mirrors[d].rdev->flags)) + if (test_bit(FailFast, &rdev->flags)) bio->bi_opf |= MD_FAILFAST; bio->bi_iter.bi_sector = sector + rdev->data_offset; bio->bi_bdev = rdev->bdev; count++; + rcu_read_unlock(); } if (count < 2) { -- cgit v1.2.3 From 037d2ff62cbfd3adf634553fbc25c9af7c25a702 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Thu, 15 Jun 2017 17:00:25 +0800 Subject: md/raid1: remove unused bio in sync_request_write The "bio" is not used in sync_request_write after commit a68e58703575 ("md/raid1: split out two sub-functions from sync_request_write"). Signed-off-by: Guoqing Jiang Signed-off-by: Shaohua Li --- drivers/md/raid1.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'drivers') diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 7866563338fa..659e1ab922b1 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -2171,9 +2171,7 @@ static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio) struct r1conf *conf = mddev->private; int i; int disks = conf->raid_disks * 2; - struct bio *bio, *wbio; - - bio = r1_bio->bios[r1_bio->read_disk]; + struct bio *wbio; if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) /* ouch - failed to read all of that. */ -- cgit v1.2.3 From 8df72024393cdba2543e55d51297f2b2c4ede46f Mon Sep 17 00:00:00 2001 From: Lidong Zhong Date: Mon, 12 Jun 2017 10:45:55 +0800 Subject: md: change the initialization value for a spare device spot to MD_DISK_ROLE_SPARE The value for spare spot of sb->dev_roles is changed from MD_DISK_ROLE_FAULTY to MD_DISK_ROLE_SPARE to keep align with the value when the superblock is firstly created in userspace. Signed-off-by: Lidong Zhong Signed-off-by: Shaohua Li --- drivers/md/md.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/md/md.c b/drivers/md/md.c index d7847014821a..528c1452ce54 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1857,7 +1857,7 @@ retry: max_dev = le32_to_cpu(sb->max_dev); for (i=0; idev_roles[i] = cpu_to_le16(MD_DISK_ROLE_FAULTY); + sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE); if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) sb->feature_map |= cpu_to_le32(MD_FEATURE_JOURNAL); -- cgit v1.2.3 From 5a85071c2cbcc7d8d8f764b33bf64c76e47d268d Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 21 Jun 2017 09:12:21 +1000 Subject: md: use a separate bio_set for synchronous IO. md devices allocate a bio_set and use it for two distinct purposes. mddev->bio_set is used to clone bios as part of sending upper level requests down to lower level devices, and it is also use for synchronous IO such as superblock and bitmap updates, and for correcting read errors. This multiple usage can lead to deadlocks. It is likely that cloned bios might be queued for write and to be waiting for a metadata update before the write can be permitted. If the cloning exhausted mddev->bio_set, the metadata update may not be able to proceed. This scenario has been seen during heavy testing, with lots of IO and lots of memory pressure. Address this by adding a new bio_set specifically for synchronous IO. All synchronous IO goes directly to the underlying device and is not queued at the md level, so request using entries from the new mddev->sync_set will complete in a timely fashion. Requests that use mddev->bio_set will sometimes need to wait for synchronous IO, but will no longer risk deadlocking that iO. Also: small simplification in mddev_put(): there is no need to wait until the spinlock is released before calling bioset_free(). Signed-off-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/md.c | 27 ++++++++++++++++++++------- drivers/md/md.h | 3 +++ 2 files changed, 23 insertions(+), 7 deletions(-) (limited to 'drivers') diff --git a/drivers/md/md.c b/drivers/md/md.c index 528c1452ce54..65ad837aeb54 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -203,6 +203,14 @@ struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs, } EXPORT_SYMBOL_GPL(bio_alloc_mddev); +static struct bio *md_bio_alloc_sync(struct mddev *mddev) +{ + if (!mddev->sync_set) + return bio_alloc(GFP_NOIO, 1); + + return bio_alloc_bioset(GFP_NOIO, 1, mddev->sync_set); +} + /* * We have a system wide 'event count' that is incremented * on any 'interesting' event, and readers of /proc/mdstat @@ -467,8 +475,6 @@ static void mddev_delayed_delete(struct work_struct *ws); static void mddev_put(struct mddev *mddev) { - struct bio_set *bs = NULL; - if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock)) return; if (!mddev->raid_disks && list_empty(&mddev->disks) && @@ -476,8 +482,12 @@ static void mddev_put(struct mddev *mddev) /* Array is not configured at all, and not held active, * so destroy it */ list_del_init(&mddev->all_mddevs); - bs = mddev->bio_set; + if (mddev->bio_set) + bioset_free(mddev->bio_set); + if (mddev->sync_set) + bioset_free(mddev->sync_set); mddev->bio_set = NULL; + mddev->sync_set = NULL; if (mddev->gendisk) { /* We did a probe so need to clean up. Call * queue_work inside the spinlock so that @@ -490,8 +500,6 @@ static void mddev_put(struct mddev *mddev) kfree(mddev); } spin_unlock(&all_mddevs_lock); - if (bs) - bioset_free(bs); } static void md_safemode_timeout(unsigned long data); @@ -756,7 +764,7 @@ void md_super_write(struct mddev *mddev, struct md_rdev *rdev, if (test_bit(Faulty, &rdev->flags)) return; - bio = bio_alloc_mddev(GFP_NOIO, 1, mddev); + bio = md_bio_alloc_sync(mddev); atomic_inc(&rdev->nr_pending); @@ -788,7 +796,7 @@ int md_super_wait(struct mddev *mddev) int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, struct page *page, int op, int op_flags, bool metadata_op) { - struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, rdev->mddev); + struct bio *bio = md_bio_alloc_sync(rdev->mddev); int ret; bio->bi_bdev = (metadata_op && rdev->meta_bdev) ? @@ -5437,6 +5445,11 @@ int md_run(struct mddev *mddev) if (!mddev->bio_set) return -ENOMEM; } + if (mddev->sync_set == NULL) { + mddev->sync_set = bioset_create(BIO_POOL_SIZE, 0); + if (!mddev->sync_set) + return -ENOMEM; + } spin_lock(&pers_lock); pers = find_pers(mddev->level, mddev->clevel); diff --git a/drivers/md/md.h b/drivers/md/md.h index 63d342d560b8..991f0fe2dcc6 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -444,6 +444,9 @@ struct mddev { struct attribute_group *to_remove; struct bio_set *bio_set; + struct bio_set *sync_set; /* for sync operations like + * metadata and bitmap writes + */ /* Generic flush handling. * The last to finish preflush schedules a worker to submit -- cgit v1.2.3 From 7f053a6a745557b3f3ad63e9d28ba85c3c0b1563 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Fri, 23 Jun 2017 09:19:49 -0700 Subject: MD: fix a null dereference rdev->mddev could be null in start time. Reported-by: Ming Lei Fix: 5a85071c2cbc(md: use a separate bio_set for synchronous IO.) Cc: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/md.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/md/md.c b/drivers/md/md.c index 65ad837aeb54..092b48f8095e 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -205,7 +205,7 @@ EXPORT_SYMBOL_GPL(bio_alloc_mddev); static struct bio *md_bio_alloc_sync(struct mddev *mddev) { - if (!mddev->sync_set) + if (!mddev || !mddev->sync_set) return bio_alloc(GFP_NOIO, 1); return bio_alloc_bioset(GFP_NOIO, 1, mddev->sync_set); -- cgit v1.2.3 From 7184ef8bab0cb865c3cea9dd1a675771145df0af Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Mon, 3 Jul 2017 14:34:23 -0700 Subject: MD: fix sleep in atomic bioset_free() will take a mutex, so can't get called with spinlock hold. Fix: 5a85071c2cbc(md: use a separate bio_set for synchronous IO.) Cc: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/md.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'drivers') diff --git a/drivers/md/md.c b/drivers/md/md.c index 092b48f8095e..66f6b928a80b 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -475,6 +475,8 @@ static void mddev_delayed_delete(struct work_struct *ws); static void mddev_put(struct mddev *mddev) { + struct bio_set *bs = NULL, *sync_bs = NULL; + if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock)) return; if (!mddev->raid_disks && list_empty(&mddev->disks) && @@ -482,10 +484,8 @@ static void mddev_put(struct mddev *mddev) /* Array is not configured at all, and not held active, * so destroy it */ list_del_init(&mddev->all_mddevs); - if (mddev->bio_set) - bioset_free(mddev->bio_set); - if (mddev->sync_set) - bioset_free(mddev->sync_set); + bs = mddev->bio_set; + sync_bs = mddev->sync_set; mddev->bio_set = NULL; mddev->sync_set = NULL; if (mddev->gendisk) { @@ -500,6 +500,10 @@ static void mddev_put(struct mddev *mddev) kfree(mddev); } spin_unlock(&all_mddevs_lock); + if (bs) + bioset_free(bs); + if (sync_bs) + bioset_free(sync_bs); } static void md_safemode_timeout(unsigned long data); -- cgit v1.2.3