Diffstat (limited to 'drivers/md')
-rw-r--r-- | drivers/md/dm-bufio.c | 16
-rw-r--r-- | drivers/md/dm-cache-background-tracker.c | 5
-rw-r--r-- | drivers/md/dm-cache-policy-smq.c | 31
-rw-r--r-- | drivers/md/dm-cache-target.c | 27
-rw-r--r-- | drivers/md/dm-mpath.c | 19
-rw-r--r-- | drivers/md/dm-rq.c | 1
-rw-r--r-- | drivers/md/dm-thin-metadata.c | 4
-rw-r--r-- | drivers/md/md.c | 20
-rw-r--r-- | drivers/md/md.h | 2
-rw-r--r-- | drivers/md/persistent-data/dm-space-map-disk.c | 15
-rw-r--r-- | drivers/md/raid0.c | 116
-rw-r--r-- | drivers/md/raid1.c | 21
-rw-r--r-- | drivers/md/raid10.c | 7
-rw-r--r-- | drivers/md/raid5-cache.c | 47
-rw-r--r-- | drivers/md/raid5-log.h | 3
-rw-r--r-- | drivers/md/raid5.c | 79
16 files changed, 275 insertions, 138 deletions
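Before the patch body: the dm-bufio hunk below widens dm_bufio_retain_bytes to unsigned long and replaces the bytes-to-buffer-count division with a shift by (sectors_per_block_bits + SECTOR_SHIFT). A minimal userspace sketch of that conversion, assuming a power-of-two block size; the function and parameter names only mirror the driver's fields and are not the kernel code itself:

#include <stdio.h>

#define SECTOR_SHIFT 9	/* 512-byte sectors, as in the kernel */

/*
 * Sketch only: dm-bufio block sizes are powers of two, so
 * "retain_bytes / block_size" can be done as a right shift by
 * (sectors_per_block_bits + SECTOR_SHIFT).
 */
static unsigned long get_retain_buffers(unsigned long retain_bytes,
					unsigned sectors_per_block_bits)
{
	return retain_bytes >> (sectors_per_block_bits + SECTOR_SHIFT);
}

int main(void)
{
	/* 256 KiB retained, 2^3 sectors (4 KiB) per block -> 64 buffers */
	unsigned long retain_bytes = 256 * 1024;
	unsigned sectors_per_block_bits = 3;

	printf("%lu buffers\n",
	       get_retain_buffers(retain_bytes, sectors_per_block_bits));
	return 0;
}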
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index 5db11a405129..cd8139593ccd 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -218,7 +218,7 @@ static DEFINE_SPINLOCK(param_spinlock); * Buffers are freed after this timeout */ static unsigned dm_bufio_max_age = DM_BUFIO_DEFAULT_AGE_SECS; -static unsigned dm_bufio_retain_bytes = DM_BUFIO_DEFAULT_RETAIN_BYTES; +static unsigned long dm_bufio_retain_bytes = DM_BUFIO_DEFAULT_RETAIN_BYTES; static unsigned long dm_bufio_peak_allocated; static unsigned long dm_bufio_allocated_kmem_cache; @@ -1558,10 +1558,10 @@ static bool __try_evict_buffer(struct dm_buffer *b, gfp_t gfp) return true; } -static unsigned get_retain_buffers(struct dm_bufio_client *c) +static unsigned long get_retain_buffers(struct dm_bufio_client *c) { - unsigned retain_bytes = ACCESS_ONCE(dm_bufio_retain_bytes); - return retain_bytes / c->block_size; + unsigned long retain_bytes = ACCESS_ONCE(dm_bufio_retain_bytes); + return retain_bytes >> (c->sectors_per_block_bits + SECTOR_SHIFT); } static unsigned long __scan(struct dm_bufio_client *c, unsigned long nr_to_scan, @@ -1571,7 +1571,7 @@ static unsigned long __scan(struct dm_bufio_client *c, unsigned long nr_to_scan, struct dm_buffer *b, *tmp; unsigned long freed = 0; unsigned long count = nr_to_scan; - unsigned retain_target = get_retain_buffers(c); + unsigned long retain_target = get_retain_buffers(c); for (l = 0; l < LIST_SIZE; l++) { list_for_each_entry_safe_reverse(b, tmp, &c->lru[l], lru_list) { @@ -1794,8 +1794,8 @@ static bool older_than(struct dm_buffer *b, unsigned long age_hz) static void __evict_old_buffers(struct dm_bufio_client *c, unsigned long age_hz) { struct dm_buffer *b, *tmp; - unsigned retain_target = get_retain_buffers(c); - unsigned count; + unsigned long retain_target = get_retain_buffers(c); + unsigned long count; LIST_HEAD(write_list); dm_bufio_lock(c); @@ -1955,7 +1955,7 @@ MODULE_PARM_DESC(max_cache_size_bytes, "Size of metadata cache"); module_param_named(max_age_seconds, dm_bufio_max_age, uint, S_IRUGO | S_IWUSR); MODULE_PARM_DESC(max_age_seconds, "Max age of a buffer in seconds"); -module_param_named(retain_bytes, dm_bufio_retain_bytes, uint, S_IRUGO | S_IWUSR); +module_param_named(retain_bytes, dm_bufio_retain_bytes, ulong, S_IRUGO | S_IWUSR); MODULE_PARM_DESC(retain_bytes, "Try to keep at least this many bytes cached in memory"); module_param_named(peak_allocated_bytes, dm_bufio_peak_allocated, ulong, S_IRUGO | S_IWUSR); diff --git a/drivers/md/dm-cache-background-tracker.c b/drivers/md/dm-cache-background-tracker.c index 9b1afdfb13f0..707233891291 100644 --- a/drivers/md/dm-cache-background-tracker.c +++ b/drivers/md/dm-cache-background-tracker.c @@ -33,6 +33,11 @@ struct background_tracker *btracker_create(unsigned max_work) { struct background_tracker *b = kmalloc(sizeof(*b), GFP_KERNEL); + if (!b) { + DMERR("couldn't create background_tracker"); + return NULL; + } + b->max_work = max_work; atomic_set(&b->pending_promotes, 0); atomic_set(&b->pending_writebacks, 0); diff --git a/drivers/md/dm-cache-policy-smq.c b/drivers/md/dm-cache-policy-smq.c index 72479bd61e11..e5eb9c9b4bc8 100644 --- a/drivers/md/dm-cache-policy-smq.c +++ b/drivers/md/dm-cache-policy-smq.c @@ -1120,8 +1120,6 @@ static bool clean_target_met(struct smq_policy *mq, bool idle) * Cache entries may not be populated. So we cannot rely on the * size of the clean queue. */ - unsigned nr_clean; - if (idle) { /* * We'd like to clean everything. 
@@ -1129,18 +1127,16 @@ static bool clean_target_met(struct smq_policy *mq, bool idle) return q_size(&mq->dirty) == 0u; } - nr_clean = from_cblock(mq->cache_size) - q_size(&mq->dirty); - return (nr_clean + btracker_nr_writebacks_queued(mq->bg_work)) >= - percent_to_target(mq, CLEAN_TARGET); + /* + * If we're busy we don't worry about cleaning at all. + */ + return true; } -static bool free_target_met(struct smq_policy *mq, bool idle) +static bool free_target_met(struct smq_policy *mq) { unsigned nr_free; - if (!idle) - return true; - nr_free = from_cblock(mq->cache_size) - mq->cache_alloc.nr_allocated; return (nr_free + btracker_nr_demotions_queued(mq->bg_work)) >= percent_to_target(mq, FREE_TARGET); @@ -1190,9 +1186,9 @@ static void queue_demotion(struct smq_policy *mq) if (unlikely(WARN_ON_ONCE(!mq->migrations_allowed))) return; - e = q_peek(&mq->clean, mq->clean.nr_levels, true); + e = q_peek(&mq->clean, mq->clean.nr_levels / 2, true); if (!e) { - if (!clean_target_met(mq, false)) + if (!clean_target_met(mq, true)) queue_writeback(mq); return; } @@ -1220,7 +1216,7 @@ static void queue_promotion(struct smq_policy *mq, dm_oblock_t oblock, * We always claim to be 'idle' to ensure some demotions happen * with continuous loads. */ - if (!free_target_met(mq, true)) + if (!free_target_met(mq)) queue_demotion(mq); return; } @@ -1421,14 +1417,10 @@ static int smq_get_background_work(struct dm_cache_policy *p, bool idle, spin_lock_irqsave(&mq->lock, flags); r = btracker_issue(mq->bg_work, result); if (r == -ENODATA) { - /* find some writeback work to do */ - if (mq->migrations_allowed && !free_target_met(mq, idle)) - queue_demotion(mq); - - else if (!clean_target_met(mq, idle)) + if (!clean_target_met(mq, idle)) { queue_writeback(mq); - - r = btracker_issue(mq->bg_work, result); + r = btracker_issue(mq->bg_work, result); + } } spin_unlock_irqrestore(&mq->lock, flags); @@ -1452,6 +1444,7 @@ static void __complete_background_work(struct smq_policy *mq, clear_pending(mq, e); if (success) { e->oblock = work->oblock; + e->level = NR_CACHE_LEVELS - 1; push(mq, e); // h, q, a } else { diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 1db375f50a13..d682a0511381 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -94,6 +94,9 @@ static void iot_io_begin(struct io_tracker *iot, sector_t len) static void __iot_io_end(struct io_tracker *iot, sector_t len) { + if (!len) + return; + iot->in_flight -= len; if (!iot->in_flight) iot->idle_time = jiffies; @@ -474,7 +477,7 @@ struct cache { spinlock_t invalidation_lock; struct list_head invalidation_requests; - struct io_tracker origin_tracker; + struct io_tracker tracker; struct work_struct commit_ws; struct batcher committer; @@ -901,8 +904,7 @@ static dm_oblock_t get_bio_block(struct cache *cache, struct bio *bio) static bool accountable_bio(struct cache *cache, struct bio *bio) { - return ((bio->bi_bdev == cache->origin_dev->bdev) && - bio_op(bio) != REQ_OP_DISCARD); + return bio_op(bio) != REQ_OP_DISCARD; } static void accounted_begin(struct cache *cache, struct bio *bio) @@ -912,7 +914,7 @@ static void accounted_begin(struct cache *cache, struct bio *bio) if (accountable_bio(cache, bio)) { pb->len = bio_sectors(bio); - iot_io_begin(&cache->origin_tracker, pb->len); + iot_io_begin(&cache->tracker, pb->len); } } @@ -921,7 +923,7 @@ static void accounted_complete(struct cache *cache, struct bio *bio) size_t pb_data_size = get_per_bio_data_size(cache); struct per_bio_data *pb = get_per_bio_data(bio, 
pb_data_size); - iot_io_end(&cache->origin_tracker, pb->len); + iot_io_end(&cache->tracker, pb->len); } static void accounted_request(struct cache *cache, struct bio *bio) @@ -1716,20 +1718,19 @@ static int invalidate_start(struct cache *cache, dm_cblock_t cblock, enum busy { IDLE, - MODERATE, BUSY }; static enum busy spare_migration_bandwidth(struct cache *cache) { - bool idle = iot_idle_for(&cache->origin_tracker, HZ); + bool idle = iot_idle_for(&cache->tracker, HZ); sector_t current_volume = (atomic_read(&cache->nr_io_migrations) + 1) * cache->sectors_per_block; - if (current_volume <= cache->migration_threshold) - return idle ? IDLE : MODERATE; + if (idle && current_volume <= cache->migration_threshold) + return IDLE; else - return idle ? MODERATE : BUSY; + return BUSY; } static void inc_hit_counter(struct cache *cache, struct bio *bio) @@ -2045,8 +2046,6 @@ static void check_migrations(struct work_struct *ws) for (;;) { b = spare_migration_bandwidth(cache); - if (b == BUSY) - break; r = policy_get_background_work(cache->policy, b == IDLE, &op); if (r == -ENODATA) @@ -2717,7 +2716,7 @@ static int cache_create(struct cache_args *ca, struct cache **result) batcher_init(&cache->committer, commit_op, cache, issue_op, cache, cache->wq); - iot_init(&cache->origin_tracker); + iot_init(&cache->tracker); init_rwsem(&cache->background_work_lock); prevent_background_work(cache); @@ -2941,7 +2940,7 @@ static void cache_postsuspend(struct dm_target *ti) cancel_delayed_work(&cache->waker); flush_workqueue(cache->wq); - WARN_ON(cache->origin_tracker.in_flight); + WARN_ON(cache->tracker.in_flight); /* * If it's a flush suspend there won't be any deferred bios, so this diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 926a6bcb32c8..3df056b73b66 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -447,7 +447,7 @@ failed: * it has been invoked. 
*/ #define dm_report_EIO(m) \ -({ \ +do { \ struct mapped_device *md = dm_table_get_md((m)->ti->table); \ \ pr_debug("%s: returning EIO; QIFNP = %d; SQIFNP = %d; DNFS = %d\n", \ @@ -455,8 +455,7 @@ failed: test_bit(MPATHF_QUEUE_IF_NO_PATH, &(m)->flags), \ test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &(m)->flags), \ dm_noflush_suspending((m)->ti)); \ - -EIO; \ -}) +} while (0) /* * Map cloned requests (request-based multipath) @@ -481,7 +480,8 @@ static int multipath_clone_and_map(struct dm_target *ti, struct request *rq, if (!pgpath) { if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) return DM_MAPIO_DELAY_REQUEUE; - return dm_report_EIO(m); /* Failed */ + dm_report_EIO(m); /* Failed */ + return DM_MAPIO_KILL; } else if (test_bit(MPATHF_QUEUE_IO, &m->flags) || test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags)) { if (pg_init_all_paths(m)) @@ -558,7 +558,8 @@ static int __multipath_map_bio(struct multipath *m, struct bio *bio, struct dm_m if (!pgpath) { if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) return DM_MAPIO_REQUEUE; - return dm_report_EIO(m); + dm_report_EIO(m); + return -EIO; } mpio->pgpath = pgpath; @@ -1493,7 +1494,7 @@ static int multipath_end_io(struct dm_target *ti, struct request *clone, if (atomic_read(&m->nr_valid_paths) == 0 && !test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) { if (error == -EIO) - error = dm_report_EIO(m); + dm_report_EIO(m); /* complete with the original error */ r = DM_ENDIO_DONE; } @@ -1524,8 +1525,10 @@ static int do_end_io_bio(struct multipath *m, struct bio *clone, fail_path(mpio->pgpath); if (atomic_read(&m->nr_valid_paths) == 0 && - !test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) - return dm_report_EIO(m); + !test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) { + dm_report_EIO(m); + return -EIO; + } /* Queue for the daemon to resubmit */ dm_bio_restore(get_bio_details_from_bio(clone), clone); diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index 2af27026aa2e..b639fa7246ee 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c @@ -507,6 +507,7 @@ static int map_request(struct dm_rq_target_io *tio) case DM_MAPIO_KILL: /* The target wants to complete the I/O */ dm_kill_unmapped_request(rq, -EIO); + break; default: DMWARN("unimplemented target map return value: %d", r); BUG(); diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c index 0f0251d0d337..d31d18d9727c 100644 --- a/drivers/md/dm-thin-metadata.c +++ b/drivers/md/dm-thin-metadata.c @@ -484,11 +484,11 @@ static int __write_initial_superblock(struct dm_pool_metadata *pmd) if (r < 0) return r; - r = save_sm_roots(pmd); + r = dm_tm_pre_commit(pmd->tm); if (r < 0) return r; - r = dm_tm_pre_commit(pmd->tm); + r = save_sm_roots(pmd); if (r < 0) return r; diff --git a/drivers/md/md.c b/drivers/md/md.c index 82f798be964f..10367ffe92e3 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -8022,18 +8022,15 @@ EXPORT_SYMBOL(md_write_end); * may proceed without blocking. It is important to call this before * attempting a GFP_KERNEL allocation while holding the mddev lock. * Must be called with mddev_lock held. - * - * In the ->external case MD_SB_CHANGE_PENDING can not be cleared until mddev->lock - * is dropped, so return -EAGAIN after notifying userspace. 
*/ -int md_allow_write(struct mddev *mddev) +void md_allow_write(struct mddev *mddev) { if (!mddev->pers) - return 0; + return; if (mddev->ro) - return 0; + return; if (!mddev->pers->sync_request) - return 0; + return; spin_lock(&mddev->lock); if (mddev->in_sync) { @@ -8046,13 +8043,12 @@ int md_allow_write(struct mddev *mddev) spin_unlock(&mddev->lock); md_update_sb(mddev, 0); sysfs_notify_dirent_safe(mddev->sysfs_state); + /* wait for the dirty state to be recorded in the metadata */ + wait_event(mddev->sb_wait, + !test_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags) && + !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); } else spin_unlock(&mddev->lock); - - if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) - return -EAGAIN; - else - return 0; } EXPORT_SYMBOL_GPL(md_allow_write); diff --git a/drivers/md/md.h b/drivers/md/md.h index 4e75d121bfcc..11f15146ce51 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -665,7 +665,7 @@ extern int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, bool metadata_op); extern void md_do_sync(struct md_thread *thread); extern void md_new_event(struct mddev *mddev); -extern int md_allow_write(struct mddev *mddev); +extern void md_allow_write(struct mddev *mddev); extern void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev); extern void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors); extern int md_check_no_bitmap(struct mddev *mddev); diff --git a/drivers/md/persistent-data/dm-space-map-disk.c b/drivers/md/persistent-data/dm-space-map-disk.c index ebb280a14325..32adf6b4a9c7 100644 --- a/drivers/md/persistent-data/dm-space-map-disk.c +++ b/drivers/md/persistent-data/dm-space-map-disk.c @@ -142,10 +142,23 @@ static int sm_disk_inc_block(struct dm_space_map *sm, dm_block_t b) static int sm_disk_dec_block(struct dm_space_map *sm, dm_block_t b) { + int r; + uint32_t old_count; enum allocation_event ev; struct sm_disk *smd = container_of(sm, struct sm_disk, sm); - return sm_ll_dec(&smd->ll, b, &ev); + r = sm_ll_dec(&smd->ll, b, &ev); + if (!r && (ev == SM_FREE)) { + /* + * It's only free if it's also free in the last + * transaction. 
+ */ + r = sm_ll_lookup(&smd->old_ll, b, &old_count); + if (!r && !old_count) + smd->nr_allocated_this_transaction--; + } + + return r; } static int sm_disk_new_block(struct dm_space_map *sm, dm_block_t *b) diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 84e58596594d..d6c0bc76e837 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -385,7 +385,7 @@ static int raid0_run(struct mddev *mddev) blk_queue_max_hw_sectors(mddev->queue, mddev->chunk_sectors); blk_queue_max_write_same_sectors(mddev->queue, mddev->chunk_sectors); blk_queue_max_write_zeroes_sectors(mddev->queue, mddev->chunk_sectors); - blk_queue_max_discard_sectors(mddev->queue, mddev->chunk_sectors); + blk_queue_max_discard_sectors(mddev->queue, UINT_MAX); blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9); blk_queue_io_opt(mddev->queue, @@ -459,6 +459,95 @@ static inline int is_io_in_chunk_boundary(struct mddev *mddev, } } +static void raid0_handle_discard(struct mddev *mddev, struct bio *bio) +{ + struct r0conf *conf = mddev->private; + struct strip_zone *zone; + sector_t start = bio->bi_iter.bi_sector; + sector_t end; + unsigned int stripe_size; + sector_t first_stripe_index, last_stripe_index; + sector_t start_disk_offset; + unsigned int start_disk_index; + sector_t end_disk_offset; + unsigned int end_disk_index; + unsigned int disk; + + zone = find_zone(conf, &start); + + if (bio_end_sector(bio) > zone->zone_end) { + struct bio *split = bio_split(bio, + zone->zone_end - bio->bi_iter.bi_sector, GFP_NOIO, + mddev->bio_set); + bio_chain(split, bio); + generic_make_request(bio); + bio = split; + end = zone->zone_end; + } else + end = bio_end_sector(bio); + + if (zone != conf->strip_zone) + end = end - zone[-1].zone_end; + + /* Now start and end is the offset in zone */ + stripe_size = zone->nb_dev * mddev->chunk_sectors; + + first_stripe_index = start; + sector_div(first_stripe_index, stripe_size); + last_stripe_index = end; + sector_div(last_stripe_index, stripe_size); + + start_disk_index = (int)(start - first_stripe_index * stripe_size) / + mddev->chunk_sectors; + start_disk_offset = ((int)(start - first_stripe_index * stripe_size) % + mddev->chunk_sectors) + + first_stripe_index * mddev->chunk_sectors; + end_disk_index = (int)(end - last_stripe_index * stripe_size) / + mddev->chunk_sectors; + end_disk_offset = ((int)(end - last_stripe_index * stripe_size) % + mddev->chunk_sectors) + + last_stripe_index * mddev->chunk_sectors; + + for (disk = 0; disk < zone->nb_dev; disk++) { + sector_t dev_start, dev_end; + struct bio *discard_bio = NULL; + struct md_rdev *rdev; + + if (disk < start_disk_index) + dev_start = (first_stripe_index + 1) * + mddev->chunk_sectors; + else if (disk > start_disk_index) + dev_start = first_stripe_index * mddev->chunk_sectors; + else + dev_start = start_disk_offset; + + if (disk < end_disk_index) + dev_end = (last_stripe_index + 1) * mddev->chunk_sectors; + else if (disk > end_disk_index) + dev_end = last_stripe_index * mddev->chunk_sectors; + else + dev_end = end_disk_offset; + + if (dev_end <= dev_start) + continue; + + rdev = conf->devlist[(zone - conf->strip_zone) * + conf->strip_zone[0].nb_dev + disk]; + if (__blkdev_issue_discard(rdev->bdev, + dev_start + zone->dev_start + rdev->data_offset, + dev_end - dev_start, GFP_NOIO, 0, &discard_bio) || + !discard_bio) + continue; + bio_chain(discard_bio, bio); + if (mddev->gendisk) + trace_block_bio_remap(bdev_get_queue(rdev->bdev), + discard_bio, disk_devt(mddev->gendisk), + bio->bi_iter.bi_sector); + 
generic_make_request(discard_bio); + } + bio_endio(bio); +} + static void raid0_make_request(struct mddev *mddev, struct bio *bio) { struct strip_zone *zone; @@ -473,6 +562,11 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio) return; } + if (unlikely((bio_op(bio) == REQ_OP_DISCARD))) { + raid0_handle_discard(mddev, bio); + return; + } + bio_sector = bio->bi_iter.bi_sector; sector = bio_sector; chunk_sects = mddev->chunk_sectors; @@ -498,19 +592,13 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio) bio->bi_iter.bi_sector = sector + zone->dev_start + tmp_dev->data_offset; - if (unlikely((bio_op(bio) == REQ_OP_DISCARD) && - !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) { - /* Just ignore it */ - bio_endio(bio); - } else { - if (mddev->gendisk) - trace_block_bio_remap(bdev_get_queue(bio->bi_bdev), - bio, disk_devt(mddev->gendisk), - bio_sector); - mddev_check_writesame(mddev, bio); - mddev_check_write_zeroes(mddev, bio); - generic_make_request(bio); - } + if (mddev->gendisk) + trace_block_bio_remap(bdev_get_queue(bio->bi_bdev), + bio, disk_devt(mddev->gendisk), + bio_sector); + mddev_check_writesame(mddev, bio); + mddev_check_write_zeroes(mddev, bio); + generic_make_request(bio); } static void raid0_status(struct seq_file *seq, struct mddev *mddev) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 7ed59351fe97..af5056d56878 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -666,8 +666,11 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect break; } continue; - } else + } else { + if ((sectors > best_good_sectors) && (best_disk >= 0)) + best_disk = -1; best_good_sectors = sectors; + } if (best_disk >= 0) /* At least two disks to choose from so failfast is OK */ @@ -1529,17 +1532,16 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, plug = container_of(cb, struct raid1_plug_cb, cb); else plug = NULL; - spin_lock_irqsave(&conf->device_lock, flags); if (plug) { bio_list_add(&plug->pending, mbio); plug->pending_cnt++; } else { + spin_lock_irqsave(&conf->device_lock, flags); bio_list_add(&conf->pending_bio_list, mbio); conf->pending_count++; - } - spin_unlock_irqrestore(&conf->device_lock, flags); - if (!plug) + spin_unlock_irqrestore(&conf->device_lock, flags); md_wakeup_thread(mddev->thread); + } } r1_bio_write_done(r1_bio); @@ -3197,7 +3199,7 @@ static int raid1_reshape(struct mddev *mddev) struct r1conf *conf = mddev->private; int cnt, raid_disks; unsigned long flags; - int d, d2, err; + int d, d2; /* Cannot change chunk_size, layout, or level */ if (mddev->chunk_sectors != mddev->new_chunk_sectors || @@ -3209,11 +3211,8 @@ static int raid1_reshape(struct mddev *mddev) return -EINVAL; } - if (!mddev_is_clustered(mddev)) { - err = md_allow_write(mddev); - if (err) - return err; - } + if (!mddev_is_clustered(mddev)) + md_allow_write(mddev); raid_disks = mddev->raid_disks + mddev->delta_disks; diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 6b86a0032cf8..4343d7ff9916 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1282,17 +1282,16 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio, plug = container_of(cb, struct raid10_plug_cb, cb); else plug = NULL; - spin_lock_irqsave(&conf->device_lock, flags); if (plug) { bio_list_add(&plug->pending, mbio); plug->pending_cnt++; } else { + spin_lock_irqsave(&conf->device_lock, flags); bio_list_add(&conf->pending_bio_list, mbio); conf->pending_count++; - } - 
spin_unlock_irqrestore(&conf->device_lock, flags); - if (!plug) + spin_unlock_irqrestore(&conf->device_lock, flags); md_wakeup_thread(mddev->thread); + } } static void raid10_write_request(struct mddev *mddev, struct bio *bio, diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 26ba09282e7c..4c00bc248287 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -24,6 +24,7 @@ #include "md.h" #include "raid5.h" #include "bitmap.h" +#include "raid5-log.h" /* * metadata/data stored in disk with 4k size unit (a block) regardless @@ -622,20 +623,30 @@ static void r5l_do_submit_io(struct r5l_log *log, struct r5l_io_unit *io) __r5l_set_io_unit_state(io, IO_UNIT_IO_START); spin_unlock_irqrestore(&log->io_list_lock, flags); + /* + * In case of journal device failures, submit_bio will get error + * and calls endio, then active stripes will continue write + * process. Therefore, it is not necessary to check Faulty bit + * of journal device here. + * + * We can't check split_bio after current_bio is submitted. If + * io->split_bio is null, after current_bio is submitted, current_bio + * might already be completed and the io_unit is freed. We submit + * split_bio first to avoid the issue. + */ + if (io->split_bio) { + if (io->has_flush) + io->split_bio->bi_opf |= REQ_PREFLUSH; + if (io->has_fua) + io->split_bio->bi_opf |= REQ_FUA; + submit_bio(io->split_bio); + } + if (io->has_flush) io->current_bio->bi_opf |= REQ_PREFLUSH; if (io->has_fua) io->current_bio->bi_opf |= REQ_FUA; submit_bio(io->current_bio); - - if (!io->split_bio) - return; - - if (io->has_flush) - io->split_bio->bi_opf |= REQ_PREFLUSH; - if (io->has_fua) - io->split_bio->bi_opf |= REQ_FUA; - submit_bio(io->split_bio); } /* deferred io_unit will be dispatched here */ @@ -670,6 +681,11 @@ static void r5c_disable_writeback_async(struct work_struct *work) return; pr_info("md/raid:%s: Disabling writeback cache for degraded array.\n", mdname(mddev)); + + /* wait superblock change before suspend */ + wait_event(mddev->sb_wait, + !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); + mddev_suspend(mddev); log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH; mddev_resume(mddev); @@ -2621,8 +2637,11 @@ int r5c_try_caching_write(struct r5conf *conf, * When run in degraded mode, array is set to write-through mode. * This check helps drain pending write safely in the transition to * write-through mode. + * + * When a stripe is syncing, the write is also handled in write + * through mode. 
*/ - if (s->failed) { + if (s->failed || test_bit(STRIPE_SYNCING, &sh->state)) { r5c_make_stripe_write_out(sh); return -EAGAIN; } @@ -2825,6 +2844,9 @@ void r5c_finish_stripe_write_out(struct r5conf *conf, } r5l_append_flush_payload(log, sh->sector); + /* stripe is flused to raid disks, we can do resync now */ + if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state)) + set_bit(STRIPE_HANDLE, &sh->state); } int r5c_cache_data(struct r5l_log *log, struct stripe_head *sh) @@ -2973,7 +2995,7 @@ ioerr: return ret; } -void r5c_update_on_rdev_error(struct mddev *mddev) +void r5c_update_on_rdev_error(struct mddev *mddev, struct md_rdev *rdev) { struct r5conf *conf = mddev->private; struct r5l_log *log = conf->log; @@ -2981,7 +3003,8 @@ void r5c_update_on_rdev_error(struct mddev *mddev) if (!log) return; - if (raid5_calc_degraded(conf) > 0 && + if ((raid5_calc_degraded(conf) > 0 || + test_bit(Journal, &rdev->flags)) && conf->log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK) schedule_work(&log->disable_writeback_work); } diff --git a/drivers/md/raid5-log.h b/drivers/md/raid5-log.h index 27097101ccca..328d67aedda4 100644 --- a/drivers/md/raid5-log.h +++ b/drivers/md/raid5-log.h @@ -28,7 +28,8 @@ extern void r5c_flush_cache(struct r5conf *conf, int num); extern void r5c_check_stripe_cache_usage(struct r5conf *conf); extern void r5c_check_cached_full_stripe(struct r5conf *conf); extern struct md_sysfs_entry r5c_journal_mode; -extern void r5c_update_on_rdev_error(struct mddev *mddev); +extern void r5c_update_on_rdev_error(struct mddev *mddev, + struct md_rdev *rdev); extern bool r5c_big_stripe_cached(struct r5conf *conf, sector_t sect); extern struct dma_async_tx_descriptor * diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 2e38cfac5b1d..9c4f7659f8b1 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -103,8 +103,7 @@ static inline void unlock_device_hash_lock(struct r5conf *conf, int hash) static inline void lock_all_device_hash_locks_irq(struct r5conf *conf) { int i; - local_irq_disable(); - spin_lock(conf->hash_locks); + spin_lock_irq(conf->hash_locks); for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++) spin_lock_nest_lock(conf->hash_locks + i, conf->hash_locks); spin_lock(&conf->device_lock); @@ -114,9 +113,9 @@ static inline void unlock_all_device_hash_locks_irq(struct r5conf *conf) { int i; spin_unlock(&conf->device_lock); - for (i = NR_STRIPE_HASH_LOCKS; i; i--) - spin_unlock(conf->hash_locks + i - 1); - local_irq_enable(); + for (i = NR_STRIPE_HASH_LOCKS - 1; i; i--) + spin_unlock(conf->hash_locks + i); + spin_unlock_irq(conf->hash_locks); } /* Find first data disk in a raid6 stripe */ @@ -234,11 +233,15 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh, if (test_bit(R5_InJournal, &sh->dev[i].flags)) injournal++; /* - * When quiesce in r5c write back, set STRIPE_HANDLE for stripes with - * data in journal, so they are not released to cached lists + * In the following cases, the stripe cannot be released to cached + * lists. Therefore, we make the stripe write out and set + * STRIPE_HANDLE: + * 1. when quiesce in r5c write back; + * 2. when resync is requested fot the stripe. 
*/ - if (conf->quiesce && r5c_is_writeback(conf->log) && - !test_bit(STRIPE_HANDLE, &sh->state) && injournal != 0) { + if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state) || + (conf->quiesce && r5c_is_writeback(conf->log) && + !test_bit(STRIPE_HANDLE, &sh->state) && injournal != 0)) { if (test_bit(STRIPE_R5C_CACHING, &sh->state)) r5c_make_stripe_write_out(sh); set_bit(STRIPE_HANDLE, &sh->state); @@ -714,12 +717,11 @@ static bool is_full_stripe_write(struct stripe_head *sh) static void lock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2) { - local_irq_disable(); if (sh1 > sh2) { - spin_lock(&sh2->stripe_lock); + spin_lock_irq(&sh2->stripe_lock); spin_lock_nested(&sh1->stripe_lock, 1); } else { - spin_lock(&sh1->stripe_lock); + spin_lock_irq(&sh1->stripe_lock); spin_lock_nested(&sh2->stripe_lock, 1); } } @@ -727,8 +729,7 @@ static void lock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2) static void unlock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2) { spin_unlock(&sh1->stripe_lock); - spin_unlock(&sh2->stripe_lock); - local_irq_enable(); + spin_unlock_irq(&sh2->stripe_lock); } /* Only freshly new full stripe normal write stripe can be added to a batch list */ @@ -2312,14 +2313,12 @@ static int resize_stripes(struct r5conf *conf, int newsize) struct stripe_head *osh, *nsh; LIST_HEAD(newstripes); struct disk_info *ndisks; - int err; + int err = 0; struct kmem_cache *sc; int i; int hash, cnt; - err = md_allow_write(conf->mddev); - if (err) - return err; + md_allow_write(conf->mddev); /* Step 1 */ sc = kmem_cache_create(conf->cache_name[1-conf->active_name], @@ -2694,7 +2693,7 @@ static void raid5_error(struct mddev *mddev, struct md_rdev *rdev) bdevname(rdev->bdev, b), mdname(mddev), conf->raid_disks - mddev->degraded); - r5c_update_on_rdev_error(mddev); + r5c_update_on_rdev_error(mddev, rdev); } /* @@ -3055,6 +3054,11 @@ sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous) * When LOG_CRITICAL, stripes with injournal == 0 will be sent to * no_space_stripes list. * + * 3. during journal failure + * In journal failure, we try to flush all cached data to raid disks + * based on data in stripe cache. The array is read-only to upper + * layers, so we would skip all pending writes. + * */ static inline bool delay_towrite(struct r5conf *conf, struct r5dev *dev, @@ -3068,6 +3072,9 @@ static inline bool delay_towrite(struct r5conf *conf, if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) && s->injournal > 0) return true; + /* case 3 above */ + if (s->log_failed && s->injournal) + return true; return false; } @@ -4653,8 +4660,13 @@ static void handle_stripe(struct stripe_head *sh) if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state) && !sh->batch_head) { spin_lock(&sh->stripe_lock); - /* Cannot process 'sync' concurrently with 'discard' */ - if (!test_bit(STRIPE_DISCARD, &sh->state) && + /* + * Cannot process 'sync' concurrently with 'discard'. + * Flush data in r5cache before 'sync'. 
+ */ + if (!test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state) && + !test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state) && + !test_bit(STRIPE_DISCARD, &sh->state) && test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) { set_bit(STRIPE_SYNCING, &sh->state); clear_bit(STRIPE_INSYNC, &sh->state); @@ -4701,10 +4713,15 @@ static void handle_stripe(struct stripe_head *sh) " to_write=%d failed=%d failed_num=%d,%d\n", s.locked, s.uptodate, s.to_read, s.to_write, s.failed, s.failed_num[0], s.failed_num[1]); - /* check if the array has lost more than max_degraded devices and, + /* + * check if the array has lost more than max_degraded devices and, * if so, some requests might need to be failed. + * + * When journal device failed (log_failed), we will only process + * the stripe if there is data need write to raid disks */ - if (s.failed > conf->max_degraded || s.log_failed) { + if (s.failed > conf->max_degraded || + (s.log_failed && s.injournal == 0)) { sh->check_state = 0; sh->reconstruct_state = 0; break_stripe_batch_list(sh, 0); @@ -5277,8 +5294,10 @@ static struct stripe_head *__get_priority_stripe(struct r5conf *conf, int group) struct stripe_head *sh, *tmp; struct list_head *handle_list = NULL; struct r5worker_group *wg; - bool second_try = !r5c_is_writeback(conf->log); - bool try_loprio = test_bit(R5C_LOG_TIGHT, &conf->cache_state); + bool second_try = !r5c_is_writeback(conf->log) && + !r5l_log_disk_error(conf); + bool try_loprio = test_bit(R5C_LOG_TIGHT, &conf->cache_state) || + r5l_log_disk_error(conf); again: wg = NULL; @@ -6313,7 +6332,6 @@ int raid5_set_cache_size(struct mddev *mddev, int size) { struct r5conf *conf = mddev->private; - int err; if (size <= 16 || size > 32768) return -EINVAL; @@ -6325,10 +6343,7 @@ raid5_set_cache_size(struct mddev *mddev, int size) ; mutex_unlock(&conf->cache_size_mutex); - - err = md_allow_write(mddev); - if (err) - return err; + md_allow_write(mddev); mutex_lock(&conf->cache_size_mutex); while (size > conf->max_nr_stripes) @@ -7530,7 +7545,9 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev) * neilb: there is no locking about new writes here, * so this cannot be safe. */ - if (atomic_read(&conf->active_stripes)) { + if (atomic_read(&conf->active_stripes) || + atomic_read(&conf->r5c_cached_full_stripes) || + atomic_read(&conf->r5c_cached_partial_stripes)) { return -EBUSY; } log_exit(conf); |
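For reference, a userspace toy model of the per-disk range arithmetic introduced by raid0_handle_discard() above. The parameters (3 member disks, 8-sector chunks, a discard covering zone sectors 10..40) are invented for illustration, and the zone start and rdev data offsets added by the real code are omitted:

#include <stdio.h>

int main(void)
{
	unsigned int chunk_sectors = 8;		/* 4 KiB chunks */
	unsigned int nb_dev = 3;		/* 3 member disks */
	unsigned long long start = 10, end = 40; /* offsets within the zone */

	unsigned int stripe_size = nb_dev * chunk_sectors;
	unsigned long long first = start / stripe_size;
	unsigned long long last = end / stripe_size;
	unsigned int start_disk = (start - first * stripe_size) / chunk_sectors;
	unsigned long long start_off =
		(start - first * stripe_size) % chunk_sectors +
		first * chunk_sectors;
	unsigned int end_disk = (end - last * stripe_size) / chunk_sectors;
	unsigned long long end_off =
		(end - last * stripe_size) % chunk_sectors +
		last * chunk_sectors;

	for (unsigned int disk = 0; disk < nb_dev; disk++) {
		unsigned long long dev_start, dev_end;

		/* first and last stripes may start/end mid-chunk on one disk */
		if (disk < start_disk)
			dev_start = (first + 1) * chunk_sectors;
		else if (disk > start_disk)
			dev_start = first * chunk_sectors;
		else
			dev_start = start_off;

		if (disk < end_disk)
			dev_end = (last + 1) * chunk_sectors;
		else if (disk > end_disk)
			dev_end = last * chunk_sectors;
		else
			dev_end = end_off;

		if (dev_end <= dev_start)
			continue;	/* nothing to discard on this disk */
		printf("disk %u: discard sectors [%llu, %llu)\n",
		       disk, dev_start, dev_end);
	}
	return 0;
}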