From 4ed8731d8e6bd2a88a30697fbf4f7e6e979a6c46 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 11 Oct 2012 13:34:00 +1100 Subject: MD: change the parameter of md thread Change the thread parameter, so the thread can carry extra info. Next patch will use it. Signed-off-by: Shaohua Li Signed-off-by: NeilBrown --- drivers/md/raid5.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index adda94df5eb2..81c02d63440b 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -4625,8 +4625,9 @@ static int handle_active_stripes(struct r5conf *conf) * During the scan, completed stripes are saved for us by the interrupt * handler, so that they will not have to wait for our next wakeup. */ -static void raid5d(struct mddev *mddev) +static void raid5d(struct md_thread *thread) { + struct mddev *mddev = thread->mddev; struct r5conf *conf = mddev->private; int handled; struct blk_plug plug; -- cgit v1.2.3 From 620125f2bf8ff0c4969b79653b54d7bcc9d40637 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 11 Oct 2012 13:49:05 +1100 Subject: MD: raid5 trim support Discard for raid4/5/6 has limitation. If discard request size is small, we do discard for one disk, but we need calculate parity and write parity disk. To correctly calculate parity, zero_after_discard must be guaranteed. Even it's true, we need do discard for one disk but write another disks, which makes the parity disks wear out fast. This doesn't make sense. So an efficient discard for raid4/5/6 should discard all data disks and parity disks, which requires the write pattern to be (A, A+chunk_size, A+chunk_size*2...). If A's size is smaller than chunk_size, such pattern is almost impossible in practice. So in this patch, I only handle the case that A's size equals to chunk_size. That is discard request should be aligned to stripe size and its size is multiple of stripe size. Since we can only handle request with specific alignment and size (or part of the request fitting stripes), we can't guarantee zero_after_discard even zero_after_discard is true in low level drives. The block layer doesn't send down correctly aligned requests even correct discard alignment is set, so I must filter out. For raid4/5/6 parity calculation, if data is 0, parity is 0. So if zero_after_discard is true for all disks, data is consistent after discard. Otherwise, data might be lost. Let's consider a scenario: discard a stripe, write data to one disk and write parity disk. The stripe could be still inconsistent till then depending on using data from other data disks or parity disks to calculate new parity. If the disk is broken, we can't restore it. So in this patch, we only enable discard support if all disks have zero_after_discard. If discard fails in one disk, we face the similar inconsistent issue above. The patch will make discard follow the same path as normal write request. If discard fails, a resync will be scheduled to make the data consistent. This isn't good to have extra writes, but data consistency is important. If a subsequent read/write request hits raid5 cache of a discarded stripe, the discarded dev page should have zero filled, so the data is consistent. This patch will always zero dev page for discarded request stripe. This isn't optimal because discard request doesn't need such payload. Next patch will avoid it. Signed-off-by: Shaohua Li Signed-off-by: NeilBrown --- drivers/md/raid5.c | 168 ++++++++++++++++++++++++++++++++++++++++++++++++++++- drivers/md/raid5.h | 1 + 2 files changed, 166 insertions(+), 3 deletions(-) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 81c02d63440b..74dcf19cfe68 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -547,6 +547,8 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) rw = WRITE_FUA; else rw = WRITE; + if (test_and_clear_bit(R5_Discard, &sh->dev[i].flags)) + rw |= REQ_DISCARD; } else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags)) rw = READ; else if (test_and_clear_bit(R5_WantReplace, @@ -1170,8 +1172,13 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) set_bit(R5_WantFUA, &dev->flags); if (wbi->bi_rw & REQ_SYNC) set_bit(R5_SyncIO, &dev->flags); - tx = async_copy_data(1, wbi, dev->page, - dev->sector, tx); + if (wbi->bi_rw & REQ_DISCARD) { + memset(page_address(dev->page), 0, + STRIPE_SECTORS << 9); + set_bit(R5_Discard, &dev->flags); + } else + tx = async_copy_data(1, wbi, dev->page, + dev->sector, tx); wbi = r5_next_bio(wbi, dev->sector); } } @@ -1237,6 +1244,20 @@ ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu, pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); + for (i = 0; i < sh->disks; i++) { + if (pd_idx == i) + continue; + if (!test_bit(R5_Discard, &sh->dev[i].flags)) + break; + } + if (i >= sh->disks) { + atomic_inc(&sh->count); + memset(page_address(sh->dev[pd_idx].page), 0, + STRIPE_SECTORS << 9); + set_bit(R5_Discard, &sh->dev[pd_idx].flags); + ops_complete_reconstruct(sh); + return; + } /* check if prexor is active which means only process blocks * that are part of a read-modify-write (written) */ @@ -1281,10 +1302,28 @@ ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu, { struct async_submit_ctl submit; struct page **blocks = percpu->scribble; - int count; + int count, i; pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); + for (i = 0; i < sh->disks; i++) { + if (sh->pd_idx == i || sh->qd_idx == i) + continue; + if (!test_bit(R5_Discard, &sh->dev[i].flags)) + break; + } + if (i >= sh->disks) { + atomic_inc(&sh->count); + memset(page_address(sh->dev[sh->pd_idx].page), 0, + STRIPE_SECTORS << 9); + memset(page_address(sh->dev[sh->qd_idx].page), 0, + STRIPE_SECTORS << 9); + set_bit(R5_Discard, &sh->dev[sh->pd_idx].flags); + set_bit(R5_Discard, &sh->dev[sh->qd_idx].flags); + ops_complete_reconstruct(sh); + return; + } + count = set_syndrome_sources(blocks, sh); atomic_inc(&sh->count); @@ -4067,6 +4106,88 @@ static void release_stripe_plug(struct mddev *mddev, release_stripe(sh); } +static void make_discard_request(struct mddev *mddev, struct bio *bi) +{ + struct r5conf *conf = mddev->private; + sector_t logical_sector, last_sector; + struct stripe_head *sh; + int remaining; + int stripe_sectors; + + if (mddev->reshape_position != MaxSector) + /* Skip discard while reshape is happening */ + return; + + logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1); + last_sector = bi->bi_sector + (bi->bi_size>>9); + + bi->bi_next = NULL; + bi->bi_phys_segments = 1; /* over-loaded to count active stripes */ + + stripe_sectors = conf->chunk_sectors * + (conf->raid_disks - conf->max_degraded); + logical_sector = DIV_ROUND_UP_SECTOR_T(logical_sector, + stripe_sectors); + sector_div(last_sector, stripe_sectors); + + logical_sector *= conf->chunk_sectors; + last_sector *= conf->chunk_sectors; + + for (; logical_sector < last_sector; + logical_sector += STRIPE_SECTORS) { + DEFINE_WAIT(w); + int d; + again: + sh = get_active_stripe(conf, logical_sector, 0, 0, 0); + prepare_to_wait(&conf->wait_for_overlap, &w, + TASK_UNINTERRUPTIBLE); + spin_lock_irq(&sh->stripe_lock); + for (d = 0; d < conf->raid_disks; d++) { + if (d == sh->pd_idx || d == sh->qd_idx) + continue; + if (sh->dev[d].towrite || sh->dev[d].toread) { + set_bit(R5_Overlap, &sh->dev[d].flags); + spin_unlock_irq(&sh->stripe_lock); + release_stripe(sh); + schedule(); + goto again; + } + } + finish_wait(&conf->wait_for_overlap, &w); + for (d = 0; d < conf->raid_disks; d++) { + if (d == sh->pd_idx || d == sh->qd_idx) + continue; + sh->dev[d].towrite = bi; + set_bit(R5_OVERWRITE, &sh->dev[d].flags); + raid5_inc_bi_active_stripes(bi); + } + spin_unlock_irq(&sh->stripe_lock); + if (conf->mddev->bitmap) { + for (d = 0; + d < conf->raid_disks - conf->max_degraded; + d++) + bitmap_startwrite(mddev->bitmap, + sh->sector, + STRIPE_SECTORS, + 0); + sh->bm_seq = conf->seq_flush + 1; + set_bit(STRIPE_BIT_DELAY, &sh->state); + } + + set_bit(STRIPE_HANDLE, &sh->state); + clear_bit(STRIPE_DELAYED, &sh->state); + if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) + atomic_inc(&conf->preread_active_stripes); + release_stripe_plug(mddev, sh); + } + + remaining = raid5_dec_bi_active_stripes(bi); + if (remaining == 0) { + md_write_end(mddev); + bio_endio(bi, 0); + } +} + static void make_request(struct mddev *mddev, struct bio * bi) { struct r5conf *conf = mddev->private; @@ -4089,6 +4210,11 @@ static void make_request(struct mddev *mddev, struct bio * bi) chunk_aligned_read(mddev,bi)) return; + if (unlikely(bi->bi_rw & REQ_DISCARD)) { + make_discard_request(mddev, bi); + return; + } + logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1); last_sector = bi->bi_sector + (bi->bi_size>>9); bi->bi_next = NULL; @@ -5362,6 +5488,7 @@ static int run(struct mddev *mddev) if (mddev->queue) { int chunk_size; + bool discard_supported = true; /* read-ahead size must cover two whole stripes, which * is 2 * (datadisks) * chunksize where 'n' is the * number of raid devices @@ -5381,13 +5508,48 @@ static int run(struct mddev *mddev) blk_queue_io_min(mddev->queue, chunk_size); blk_queue_io_opt(mddev->queue, chunk_size * (conf->raid_disks - conf->max_degraded)); + /* + * We can only discard a whole stripe. It doesn't make sense to + * discard data disk but write parity disk + */ + stripe = stripe * PAGE_SIZE; + mddev->queue->limits.discard_alignment = stripe; + mddev->queue->limits.discard_granularity = stripe; + /* + * unaligned part of discard request will be ignored, so can't + * guarantee discard_zerors_data + */ + mddev->queue->limits.discard_zeroes_data = 0; rdev_for_each(rdev, mddev) { disk_stack_limits(mddev->gendisk, rdev->bdev, rdev->data_offset << 9); disk_stack_limits(mddev->gendisk, rdev->bdev, rdev->new_data_offset << 9); + /* + * discard_zeroes_data is required, otherwise data + * could be lost. Consider a scenario: discard a stripe + * (the stripe could be inconsistent if + * discard_zeroes_data is 0); write one disk of the + * stripe (the stripe could be inconsistent again + * depending on which disks are used to calculate + * parity); the disk is broken; The stripe data of this + * disk is lost. + */ + if (!blk_queue_discard(bdev_get_queue(rdev->bdev)) || + !bdev_get_queue(rdev->bdev)-> + limits.discard_zeroes_data) + discard_supported = false; } + + if (discard_supported && + mddev->queue->limits.max_discard_sectors >= stripe && + mddev->queue->limits.discard_granularity >= stripe) + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, + mddev->queue); + else + queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, + mddev->queue); } return 0; diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index a9fc24901eda..18b2c4a8a1fd 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -298,6 +298,7 @@ enum r5dev_flags { R5_WantReplace, /* We need to update the replacement, we have read * data in, and now is a good time to write it out. */ + R5_Discard, /* Discard the stripe */ }; /* -- cgit v1.2.3 From 9e44476851e91c86c98eb92b9bc27fb801f89072 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 11 Oct 2012 13:49:49 +1100 Subject: MD: raid5 avoid unnecessary zero page for trim We want to avoid zero discarded dev page, because it's useless for discard. But if we don't zero it, another read/write hit such page in the cache and will get inconsistent data. To avoid zero the page, we don't set R5_UPTODATE flag after construction is done. In this way, discard write request is still issued and finished, but read will not hit the page. If the stripe gets accessed soon, we need reread the stripe, but since the chance is low, the reread isn't a big deal. Signed-off-by: Shaohua Li Signed-off-by: NeilBrown --- drivers/md/raid5.c | 35 +++++++++++++++++------------------ 1 file changed, 17 insertions(+), 18 deletions(-) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 74dcf19cfe68..758b77296404 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -547,7 +547,7 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) rw = WRITE_FUA; else rw = WRITE; - if (test_and_clear_bit(R5_Discard, &sh->dev[i].flags)) + if (test_bit(R5_Discard, &sh->dev[i].flags)) rw |= REQ_DISCARD; } else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags)) rw = READ; @@ -1172,11 +1172,9 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) set_bit(R5_WantFUA, &dev->flags); if (wbi->bi_rw & REQ_SYNC) set_bit(R5_SyncIO, &dev->flags); - if (wbi->bi_rw & REQ_DISCARD) { - memset(page_address(dev->page), 0, - STRIPE_SECTORS << 9); + if (wbi->bi_rw & REQ_DISCARD) set_bit(R5_Discard, &dev->flags); - } else + else tx = async_copy_data(1, wbi, dev->page, dev->sector, tx); wbi = r5_next_bio(wbi, dev->sector); @@ -1194,7 +1192,7 @@ static void ops_complete_reconstruct(void *stripe_head_ref) int pd_idx = sh->pd_idx; int qd_idx = sh->qd_idx; int i; - bool fua = false, sync = false; + bool fua = false, sync = false, discard = false; pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); @@ -1202,13 +1200,15 @@ static void ops_complete_reconstruct(void *stripe_head_ref) for (i = disks; i--; ) { fua |= test_bit(R5_WantFUA, &sh->dev[i].flags); sync |= test_bit(R5_SyncIO, &sh->dev[i].flags); + discard |= test_bit(R5_Discard, &sh->dev[i].flags); } for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; if (dev->written || i == pd_idx || i == qd_idx) { - set_bit(R5_UPTODATE, &dev->flags); + if (!discard) + set_bit(R5_UPTODATE, &dev->flags); if (fua) set_bit(R5_WantFUA, &dev->flags); if (sync) @@ -1252,8 +1252,6 @@ ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu, } if (i >= sh->disks) { atomic_inc(&sh->count); - memset(page_address(sh->dev[pd_idx].page), 0, - STRIPE_SECTORS << 9); set_bit(R5_Discard, &sh->dev[pd_idx].flags); ops_complete_reconstruct(sh); return; @@ -1314,10 +1312,6 @@ ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu, } if (i >= sh->disks) { atomic_inc(&sh->count); - memset(page_address(sh->dev[sh->pd_idx].page), 0, - STRIPE_SECTORS << 9); - memset(page_address(sh->dev[sh->qd_idx].page), 0, - STRIPE_SECTORS << 9); set_bit(R5_Discard, &sh->dev[sh->pd_idx].flags); set_bit(R5_Discard, &sh->dev[sh->qd_idx].flags); ops_complete_reconstruct(sh); @@ -2775,7 +2769,8 @@ static void handle_stripe_clean_event(struct r5conf *conf, if (sh->dev[i].written) { dev = &sh->dev[i]; if (!test_bit(R5_LOCKED, &dev->flags) && - test_bit(R5_UPTODATE, &dev->flags)) { + (test_bit(R5_UPTODATE, &dev->flags) || + test_and_clear_bit(R5_Discard, &dev->flags))) { /* We can return any write requests */ struct bio *wbi, *wbi2; pr_debug("Return write for disc %d\n", i); @@ -3493,10 +3488,12 @@ static void handle_stripe(struct stripe_head *sh) if (s.written && (s.p_failed || ((test_bit(R5_Insync, &pdev->flags) && !test_bit(R5_LOCKED, &pdev->flags) - && test_bit(R5_UPTODATE, &pdev->flags)))) && + && (test_bit(R5_UPTODATE, &pdev->flags) || + test_bit(R5_Discard, &pdev->flags))))) && (s.q_failed || ((test_bit(R5_Insync, &qdev->flags) && !test_bit(R5_LOCKED, &qdev->flags) - && test_bit(R5_UPTODATE, &qdev->flags))))) + && (test_bit(R5_UPTODATE, &qdev->flags) || + test_bit(R5_Discard, &qdev->flags)))))) handle_stripe_clean_event(conf, sh, disks, &s.return_bi); /* Now we might consider reading some blocks, either to check/generate @@ -3523,9 +3520,11 @@ static void handle_stripe(struct stripe_head *sh) /* All the 'written' buffers and the parity block are ready to * be written back to disk */ - BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags)); + BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags) && + !test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)); BUG_ON(sh->qd_idx >= 0 && - !test_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags)); + !test_bit(R5_UPTODATE, &sh->dev[sh->qd_idx].flags) && + !test_bit(R5_Discard, &sh->dev[sh->qd_idx].flags)); for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; if (test_bit(R5_LOCKED, &dev->flags) && -- cgit v1.2.3 From 143c4d0573caebe0ae017097614349697e2280eb Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 11 Oct 2012 13:50:12 +1100 Subject: md/raid5: add some missing locking in handle_failed_stripe. We really should hold the stripe_lock while accessing 'toread' else we could race with add_stripe_bio and corrupt a list. Reported-by: "Jianpeng Ma" Signed-off-by: NeilBrown --- drivers/md/raid5.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 758b77296404..36c0a158730b 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -2552,8 +2552,10 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, if (!test_bit(R5_Wantfill, &sh->dev[i].flags) && (!test_bit(R5_Insync, &sh->dev[i].flags) || test_bit(R5_ReadError, &sh->dev[i].flags))) { + spin_lock_irq(&sh->stripe_lock); bi = sh->dev[i].toread; sh->dev[i].toread = NULL; + spin_unlock_irq(&sh->stripe_lock); if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) wake_up(&conf->wait_for_overlap); if (bi) s->to_read--; -- cgit v1.2.3 From b97390aec4756373168ad2976e1f117b610513ea Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 11 Oct 2012 13:50:12 +1100 Subject: md/raid5: protect debug message against NULL derefernce. The pr_debug in add_stripe_bio could race with something changing *bip, so it is best to hold the lock until after the pr_debug. Reported-by: "Jianpeng Ma" Signed-off-by: NeilBrown --- drivers/md/raid5.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 36c0a158730b..d11012604e28 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -2436,11 +2436,11 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS) set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags); } - spin_unlock_irq(&sh->stripe_lock); pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n", (unsigned long long)(*bip)->bi_sector, (unsigned long long)sh->sector, dd_idx); + spin_unlock_irq(&sh->stripe_lock); if (conf->mddev->bitmap && firstwrite) { bitmap_startwrite(conf->mddev->bitmap, sh->sector, -- cgit v1.2.3 From a7854487cd7128a30a7f4f5259de9f67d5efb95f Mon Sep 17 00:00:00 2001 From: Alexander Lyakas Date: Thu, 11 Oct 2012 13:50:12 +1100 Subject: md: When RAID5 is dirty, force reconstruct-write instead of read-modify-write. Signed-off-by: Alex Lyakas Suggested-by: Yair Hershko Signed-off-by: NeilBrown --- drivers/md/raid5.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index d11012604e28..9de8221f64ec 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -2806,12 +2806,25 @@ static void handle_stripe_dirtying(struct r5conf *conf, int disks) { int rmw = 0, rcw = 0, i; - if (conf->max_degraded == 2) { - /* RAID6 requires 'rcw' in current implementation - * Calculate the real rcw later - for now fake it + sector_t recovery_cp = conf->mddev->recovery_cp; + + /* RAID6 requires 'rcw' in current implementation. + * Otherwise, check whether resync is now happening or should start. + * If yes, then the array is dirty (after unclean shutdown or + * initial creation), so parity in some stripes might be inconsistent. + * In this case, we need to always do reconstruct-write, to ensure + * that in case of drive failure or read-error correction, we + * generate correct data from the parity. + */ + if (conf->max_degraded == 2 || + (recovery_cp < MaxSector && sh->sector >= recovery_cp)) { + /* Calculate the real rcw later - for now make it * look like rcw is cheaper */ rcw = 1; rmw = 2; + pr_debug("force RCW max_degraded=%u, recovery_cp=%llu sh->sector=%llu\n", + conf->max_degraded, (unsigned long long)recovery_cp, + (unsigned long long)sh->sector); } else for (i = disks; i--; ) { /* would I have to read this buffer for read_modify_write */ struct r5dev *dev = &sh->dev[i]; -- cgit v1.2.3 From 1ed850f356a0a422013846b5291acff08815008b Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 11 Oct 2012 13:50:13 +1100 Subject: md/raid5: make sure to_read and to_write never go negative. to_read and to_write are part of the result of analysing a stripe before handling it. Their use is to avoid some loops and tests if the values are known to be zero. Thus it is not a problem if they are a little bit larger than they should be. So decrementing them in handle_failed_stripe serves little value, and due to races it could cause some loops to be skipped incorrectly. So remove those decrements. Reported-by: "Jianpeng Ma" Signed-off-by: NeilBrown --- drivers/md/raid5.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 9de8221f64ec..ab613efbbead 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -2507,10 +2507,8 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, bi = sh->dev[i].towrite; sh->dev[i].towrite = NULL; spin_unlock_irq(&sh->stripe_lock); - if (bi) { - s->to_write--; + if (bi) bitmap_end = 1; - } if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) wake_up(&conf->wait_for_overlap); @@ -2558,7 +2556,6 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, spin_unlock_irq(&sh->stripe_lock); if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) wake_up(&conf->wait_for_overlap); - if (bi) s->to_read--; while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS) { struct bio *nextbi = -- cgit v1.2.3 From 7f7583d420231b9d09897afd57a957011b606a5b Mon Sep 17 00:00:00 2001 From: Jianpeng Ma Date: Thu, 11 Oct 2012 14:17:59 +1100 Subject: Subject: [PATCH] md:change resync_mismatches to atomic64_t to avoid races Now that multiple threads can handle stripes, it is safer to use an atomic64_t for resync_mismatches, to avoid update races. Signed-off-by: Jianpeng Ma Signed-off-by: NeilBrown --- drivers/md/md.c | 7 ++++--- drivers/md/md.h | 2 +- drivers/md/raid1.c | 2 +- drivers/md/raid10.c | 2 +- drivers/md/raid5.c | 4 ++-- 5 files changed, 9 insertions(+), 8 deletions(-) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index 85e6786653ea..7564c44b8045 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -4269,7 +4269,8 @@ static ssize_t mismatch_cnt_show(struct mddev *mddev, char *page) { return sprintf(page, "%llu\n", - (unsigned long long) mddev->resync_mismatches); + (unsigned long long) + atomic64_read(&mddev->resync_mismatches)); } static struct md_sysfs_entry md_scan_mode = @@ -5235,7 +5236,7 @@ static void md_clean(struct mddev *mddev) mddev->new_layout = 0; mddev->new_chunk_sectors = 0; mddev->curr_resync = 0; - mddev->resync_mismatches = 0; + atomic64_set(&mddev->resync_mismatches, 0); mddev->suspend_lo = mddev->suspend_hi = 0; mddev->sync_speed_min = mddev->sync_speed_max = 0; mddev->recovery = 0; @@ -7357,7 +7358,7 @@ void md_do_sync(struct md_thread *thread) * which defaults to physical size, but can be virtual size */ max_sectors = mddev->resync_max_sectors; - mddev->resync_mismatches = 0; + atomic64_set(&mddev->resync_mismatches, 0); /* we don't use the checkpoint if there's a bitmap */ if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) j = mddev->resync_min; diff --git a/drivers/md/md.h b/drivers/md/md.h index 93ada214b50e..af443ab868db 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -282,7 +282,7 @@ struct mddev { sector_t resync_max_sectors; /* may be set by personality */ - sector_t resync_mismatches; /* count of sectors where + atomic64_t resync_mismatches; /* count of sectors where * parity/replica mismatch found */ diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index e913356b257e..8034fbd6190c 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1876,7 +1876,7 @@ static int process_checks(struct r1bio *r1_bio) } else j = 0; if (j >= 0) - mddev->resync_mismatches += r1_bio->sectors; + atomic64_add(r1_bio->sectors, &mddev->resync_mismatches); if (j < 0 || (test_bit(MD_RECOVERY_CHECK, &mddev->recovery) && test_bit(BIO_UPTODATE, &sbio->bi_flags))) { /* No need to write to this device. */ diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index fb5bd607e15c..146749b277c6 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -2011,7 +2011,7 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio) break; if (j == vcnt) continue; - mddev->resync_mismatches += r10_bio->sectors; + atomic64_add(r10_bio->sectors, &mddev->resync_mismatches); if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) /* Don't fix anything. */ continue; diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index ab613efbbead..4cc6b0e398bc 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -2973,7 +2973,7 @@ static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh, */ set_bit(STRIPE_INSYNC, &sh->state); else { - conf->mddev->resync_mismatches += STRIPE_SECTORS; + atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches); if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) /* don't try to repair!! */ set_bit(STRIPE_INSYNC, &sh->state); @@ -3125,7 +3125,7 @@ static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh, */ } } else { - conf->mddev->resync_mismatches += STRIPE_SECTORS; + atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches); if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) /* don't try to repair!! */ set_bit(STRIPE_INSYNC, &sh->state); -- cgit v1.2.3 From e56108d65f8705170d238858616728359542aebb Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 11 Oct 2012 14:24:13 +1100 Subject: md/raid5: be careful not to resize_stripes too big. When a RAID5 is reshaping, conf->raid_disks is increased before mddev->delta_disks becomes zero. This can result in check_reshape calling resize_stripes with a number that is too large. This particularly happens when md_check_recovery calls ->check_reshape(). If we use ->previous_raid_disks, we don't risk this. Signed-off-by: NeilBrown --- drivers/md/raid5.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 4cc6b0e398bc..2737247d2a02 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5871,7 +5871,8 @@ static int check_reshape(struct mddev *mddev) if (!check_stripe_cache(mddev)) return -ENOSPC; - return resize_stripes(conf, conf->raid_disks + mddev->delta_disks); + return resize_stripes(conf, (conf->previous_raid_disks + + mddev->delta_disks)); } static int raid5_start_reshape(struct mddev *mddev) -- cgit v1.2.3