diff options
Diffstat (limited to 'drivers/md')
-rw-r--r-- | drivers/md/bcache/alloc.c | 9 | ||||
-rw-r--r-- | drivers/md/bcache/bcache.h | 6 | ||||
-rw-r--r-- | drivers/md/bcache/bset.c | 61 | ||||
-rw-r--r-- | drivers/md/bcache/btree.c | 53 | ||||
-rw-r--r-- | drivers/md/bcache/btree.h | 2 | ||||
-rw-r--r-- | drivers/md/bcache/io.c | 12 | ||||
-rw-r--r-- | drivers/md/bcache/journal.c | 141 | ||||
-rw-r--r-- | drivers/md/bcache/journal.h | 4 | ||||
-rw-r--r-- | drivers/md/bcache/super.c | 227 | ||||
-rw-r--r-- | drivers/md/bcache/sysfs.c | 67 | ||||
-rw-r--r-- | drivers/md/bcache/util.h | 2 | ||||
-rw-r--r-- | drivers/md/bcache/writeback.c | 8 | ||||
-rw-r--r-- | drivers/md/md-bitmap.c | 20 | ||||
-rw-r--r-- | drivers/md/md.c | 129 | ||||
-rw-r--r-- | drivers/md/md.h | 23 | ||||
-rw-r--r-- | drivers/md/raid1-10.c | 30 | ||||
-rw-r--r-- | drivers/md/raid1.c | 119 | ||||
-rw-r--r-- | drivers/md/raid10.c | 86 | ||||
-rw-r--r-- | drivers/md/raid5.c | 12 |
19 files changed, 741 insertions, 270 deletions
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index f8986effcb50..6f776823b9ba 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -393,6 +393,11 @@ long bch_bucket_alloc(struct cache *ca, unsigned int reserve, bool wait) struct bucket *b; long r; + + /* No allocation if CACHE_SET_IO_DISABLE bit is set */ + if (unlikely(test_bit(CACHE_SET_IO_DISABLE, &ca->set->flags))) + return -1; + /* fastpath */ if (fifo_pop(&ca->free[RESERVE_NONE], r) || fifo_pop(&ca->free[reserve], r)) @@ -484,6 +489,10 @@ int __bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve, { int i; + /* No allocation if CACHE_SET_IO_DISABLE bit is set */ + if (unlikely(test_bit(CACHE_SET_IO_DISABLE, &c->flags))) + return -1; + lockdep_assert_held(&c->bucket_lock); BUG_ON(!n || n > c->caches_loaded || n > MAX_CACHES_PER_SET); diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index fdf75352e16a..013e35a9e317 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -705,8 +705,8 @@ struct cache_set { atomic_long_t writeback_keys_failed; atomic_long_t reclaim; + atomic_long_t reclaimed_journal_buckets; atomic_long_t flush_write; - atomic_long_t retry_flush_write; enum { ON_ERROR_UNREGISTER, @@ -726,8 +726,6 @@ struct cache_set { #define BUCKET_HASH_BITS 12 struct hlist_head bucket_hash[1 << BUCKET_HASH_BITS]; - - DECLARE_HEAP(struct btree *, flush_btree); }; struct bbio { @@ -1006,7 +1004,7 @@ int bch_flash_dev_create(struct cache_set *c, uint64_t size); int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c, uint8_t *set_uuid); void bch_cached_dev_detach(struct cached_dev *dc); -void bch_cached_dev_run(struct cached_dev *dc); +int bch_cached_dev_run(struct cached_dev *dc); void bcache_device_stop(struct bcache_device *d); void bch_cache_set_unregister(struct cache_set *c); diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index 268f1b685084..08768796b543 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -347,22 +347,19 @@ EXPORT_SYMBOL(bch_btree_keys_alloc); void bch_btree_keys_init(struct btree_keys *b, const struct btree_keys_ops *ops, bool *expensive_debug_checks) { - unsigned int i; - b->ops = ops; b->expensive_debug_checks = expensive_debug_checks; b->nsets = 0; b->last_set_unwritten = 0; - /* XXX: shouldn't be needed */ - for (i = 0; i < MAX_BSETS; i++) - b->set[i].size = 0; /* - * Second loop starts at 1 because b->keys[0]->data is the memory we - * allocated + * struct btree_keys in embedded in struct btree, and struct + * bset_tree is embedded into struct btree_keys. They are all + * initialized as 0 by kzalloc() in mca_bucket_alloc(), and + * b->set[0].data is allocated in bch_btree_keys_alloc(), so we + * don't have to initiate b->set[].size and b->set[].data here + * any more. */ - for (i = 1; i < MAX_BSETS; i++) - b->set[i].data = NULL; } EXPORT_SYMBOL(bch_btree_keys_init); @@ -970,45 +967,25 @@ static struct bset_search_iter bset_search_tree(struct bset_tree *t, unsigned int inorder, j, n = 1; do { - /* - * A bit trick here. - * If p < t->size, (int)(p - t->size) is a minus value and - * the most significant bit is set, right shifting 31 bits - * gets 1. If p >= t->size, the most significant bit is - * not set, right shifting 31 bits gets 0. - * So the following 2 lines equals to - * if (p >= t->size) - * p = 0; - * but a branch instruction is avoided. - */ unsigned int p = n << 4; - p &= ((int) (p - t->size)) >> 31; - - prefetch(&t->tree[p]); + if (p < t->size) + prefetch(&t->tree[p]); j = n; f = &t->tree[j]; - /* - * Similar bit trick, use subtract operation to avoid a branch - * instruction. - * - * n = (f->mantissa > bfloat_mantissa()) - * ? j * 2 - * : j * 2 + 1; - * - * We need to subtract 1 from f->mantissa for the sign bit trick - * to work - that's done in make_bfloat() - */ - if (likely(f->exponent != 127)) - n = j * 2 + (((unsigned int) - (f->mantissa - - bfloat_mantissa(search, f))) >> 31); - else - n = (bkey_cmp(tree_to_bkey(t, j), search) > 0) - ? j * 2 - : j * 2 + 1; + if (likely(f->exponent != 127)) { + if (f->mantissa >= bfloat_mantissa(search, f)) + n = j * 2; + else + n = j * 2 + 1; + } else { + if (bkey_cmp(tree_to_bkey(t, j), search) > 0) + n = j * 2; + else + n = j * 2 + 1; + } } while (n < t->size); inorder = to_inorder(j, t); diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 773f5fdad25f..ba434d9ac720 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -35,7 +35,7 @@ #include <linux/rcupdate.h> #include <linux/sched/clock.h> #include <linux/rculist.h> - +#include <linux/delay.h> #include <trace/events/bcache.h> /* @@ -613,6 +613,10 @@ static void mca_data_alloc(struct btree *b, struct bkey *k, gfp_t gfp) static struct btree *mca_bucket_alloc(struct cache_set *c, struct bkey *k, gfp_t gfp) { + /* + * kzalloc() is necessary here for initialization, + * see code comments in bch_btree_keys_init(). + */ struct btree *b = kzalloc(sizeof(struct btree), gfp); if (!b) @@ -655,7 +659,25 @@ static int mca_reap(struct btree *b, unsigned int min_order, bool flush) up(&b->io_mutex); } +retry: + /* + * BTREE_NODE_dirty might be cleared in btree_flush_btree() by + * __bch_btree_node_write(). To avoid an extra flush, acquire + * b->write_lock before checking BTREE_NODE_dirty bit. + */ mutex_lock(&b->write_lock); + /* + * If this btree node is selected in btree_flush_write() by journal + * code, delay and retry until the node is flushed by journal code + * and BTREE_NODE_journal_flush bit cleared by btree_flush_write(). + */ + if (btree_node_journal_flush(b)) { + pr_debug("bnode %p is flushing by journal, retry", b); + mutex_unlock(&b->write_lock); + udelay(1); + goto retry; + } + if (btree_node_dirty(b)) __bch_btree_node_write(b, &cl); mutex_unlock(&b->write_lock); @@ -778,10 +800,15 @@ void bch_btree_cache_free(struct cache_set *c) while (!list_empty(&c->btree_cache)) { b = list_first_entry(&c->btree_cache, struct btree, list); - if (btree_node_dirty(b)) + /* + * This function is called by cache_set_free(), no I/O + * request on cache now, it is unnecessary to acquire + * b->write_lock before clearing BTREE_NODE_dirty anymore. + */ + if (btree_node_dirty(b)) { btree_complete_write(b, btree_current_write(b)); - clear_bit(BTREE_NODE_dirty, &b->flags); - + clear_bit(BTREE_NODE_dirty, &b->flags); + } mca_data_free(b); } @@ -1067,11 +1094,25 @@ static void btree_node_free(struct btree *b) BUG_ON(b == b->c->root); +retry: mutex_lock(&b->write_lock); + /* + * If the btree node is selected and flushing in btree_flush_write(), + * delay and retry until the BTREE_NODE_journal_flush bit cleared, + * then it is safe to free the btree node here. Otherwise this btree + * node will be in race condition. + */ + if (btree_node_journal_flush(b)) { + mutex_unlock(&b->write_lock); + pr_debug("bnode %p journal_flush set, retry", b); + udelay(1); + goto retry; + } - if (btree_node_dirty(b)) + if (btree_node_dirty(b)) { btree_complete_write(b, btree_current_write(b)); - clear_bit(BTREE_NODE_dirty, &b->flags); + clear_bit(BTREE_NODE_dirty, &b->flags); + } mutex_unlock(&b->write_lock); diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index d1c72ef64edf..76cfd121a486 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -158,11 +158,13 @@ enum btree_flags { BTREE_NODE_io_error, BTREE_NODE_dirty, BTREE_NODE_write_idx, + BTREE_NODE_journal_flush, }; BTREE_FLAG(io_error); BTREE_FLAG(dirty); BTREE_FLAG(write_idx); +BTREE_FLAG(journal_flush); static inline struct btree_write *btree_current_write(struct btree *b) { diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c index c25097968319..4d93f07f63e5 100644 --- a/drivers/md/bcache/io.c +++ b/drivers/md/bcache/io.c @@ -58,6 +58,18 @@ void bch_count_backing_io_errors(struct cached_dev *dc, struct bio *bio) WARN_ONCE(!dc, "NULL pointer of struct cached_dev"); + /* + * Read-ahead requests on a degrading and recovering md raid + * (e.g. raid6) device might be failured immediately by md + * raid code, which is not a real hardware media failure. So + * we shouldn't count failed REQ_RAHEAD bio to dc->io_errors. + */ + if (bio->bi_opf & REQ_RAHEAD) { + pr_warn_ratelimited("%s: Read-ahead I/O failed on backing device, ignore", + dc->backing_dev_name); + return; + } + errors = atomic_add_return(1, &dc->io_errors); if (errors < dc->error_limit) pr_err("%s: IO error on backing device, unrecoverable", diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 12dae9348147..be2a2a201603 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -100,6 +100,20 @@ reread: left = ca->sb.bucket_size - offset; blocks = set_blocks(j, block_bytes(ca->set)); + /* + * Nodes in 'list' are in linear increasing order of + * i->j.seq, the node on head has the smallest (oldest) + * journal seq, the node on tail has the biggest + * (latest) journal seq. + */ + + /* + * Check from the oldest jset for last_seq. If + * i->j.seq < j->last_seq, it means the oldest jset + * in list is expired and useless, remove it from + * this list. Otherwise, j is a condidate jset for + * further following checks. + */ while (!list_empty(list)) { i = list_first_entry(list, struct journal_replay, list); @@ -109,13 +123,22 @@ reread: left = ca->sb.bucket_size - offset; kfree(i); } + /* iterate list in reverse order (from latest jset) */ list_for_each_entry_reverse(i, list, list) { if (j->seq == i->j.seq) goto next_set; + /* + * if j->seq is less than any i->j.last_seq + * in list, j is an expired and useless jset. + */ if (j->seq < i->j.last_seq) goto next_set; + /* + * 'where' points to first jset in list which + * is elder then j. + */ if (j->seq > i->j.seq) { where = &i->list; goto add; @@ -129,10 +152,12 @@ add: if (!i) return -ENOMEM; memcpy(&i->j, j, bytes); + /* Add to the location after 'where' points to */ list_add(&i->list, where); ret = 1; - ja->seq[bucket_index] = j->seq; + if (j->seq > ja->seq[bucket_index]) + ja->seq[bucket_index] = j->seq; next_set: offset += blocks * ca->sb.block_size; len -= blocks * ca->sb.block_size; @@ -268,7 +293,7 @@ bsearch: struct journal_replay, list)->j.seq; - return ret; + return 0; #undef read_bucket } @@ -391,60 +416,90 @@ err: } /* Journalling */ -#define journal_max_cmp(l, r) \ - (fifo_idx(&c->journal.pin, btree_current_write(l)->journal) < \ - fifo_idx(&(c)->journal.pin, btree_current_write(r)->journal)) -#define journal_min_cmp(l, r) \ - (fifo_idx(&c->journal.pin, btree_current_write(l)->journal) > \ - fifo_idx(&(c)->journal.pin, btree_current_write(r)->journal)) static void btree_flush_write(struct cache_set *c) { - /* - * Try to find the btree node with that references the oldest journal - * entry, best is our current candidate and is locked if non NULL: - */ - struct btree *b; - int i; + struct btree *b, *t, *btree_nodes[BTREE_FLUSH_NR]; + unsigned int i, n; + + if (c->journal.btree_flushing) + return; + + spin_lock(&c->journal.flush_write_lock); + if (c->journal.btree_flushing) { + spin_unlock(&c->journal.flush_write_lock); + return; + } + c->journal.btree_flushing = true; + spin_unlock(&c->journal.flush_write_lock); atomic_long_inc(&c->flush_write); + memset(btree_nodes, 0, sizeof(btree_nodes)); + n = 0; -retry: - spin_lock(&c->journal.lock); - if (heap_empty(&c->flush_btree)) { - for_each_cached_btree(b, c, i) - if (btree_current_write(b)->journal) { - if (!heap_full(&c->flush_btree)) - heap_add(&c->flush_btree, b, - journal_max_cmp); - else if (journal_max_cmp(b, - heap_peek(&c->flush_btree))) { - c->flush_btree.data[0] = b; - heap_sift(&c->flush_btree, 0, - journal_max_cmp); - } - } + mutex_lock(&c->bucket_lock); + list_for_each_entry_safe_reverse(b, t, &c->btree_cache, list) { + if (btree_node_journal_flush(b)) + pr_err("BUG: flush_write bit should not be set here!"); + + mutex_lock(&b->write_lock); - for (i = c->flush_btree.used / 2 - 1; i >= 0; --i) - heap_sift(&c->flush_btree, i, journal_min_cmp); + if (!btree_node_dirty(b)) { + mutex_unlock(&b->write_lock); + continue; + } + + if (!btree_current_write(b)->journal) { + mutex_unlock(&b->write_lock); + continue; + } + + set_btree_node_journal_flush(b); + + mutex_unlock(&b->write_lock); + + btree_nodes[n++] = b; + if (n == BTREE_FLUSH_NR) + break; } + mutex_unlock(&c->bucket_lock); - b = NULL; - heap_pop(&c->flush_btree, b, journal_min_cmp); - spin_unlock(&c->journal.lock); + for (i = 0; i < n; i++) { + b = btree_nodes[i]; + if (!b) { + pr_err("BUG: btree_nodes[%d] is NULL", i); + continue; + } + + /* safe to check without holding b->write_lock */ + if (!btree_node_journal_flush(b)) { + pr_err("BUG: bnode %p: journal_flush bit cleaned", b); + continue; + } - if (b) { mutex_lock(&b->write_lock); if (!btree_current_write(b)->journal) { + clear_bit(BTREE_NODE_journal_flush, &b->flags); + mutex_unlock(&b->write_lock); + pr_debug("bnode %p: written by others", b); + continue; + } + + if (!btree_node_dirty(b)) { + clear_bit(BTREE_NODE_journal_flush, &b->flags); mutex_unlock(&b->write_lock); - /* We raced */ - atomic_long_inc(&c->retry_flush_write); - goto retry; + pr_debug("bnode %p: dirty bit cleaned by others", b); + continue; } __bch_btree_node_write(b, NULL); + clear_bit(BTREE_NODE_journal_flush, &b->flags); mutex_unlock(&b->write_lock); } + + spin_lock(&c->journal.flush_write_lock); + c->journal.btree_flushing = false; + spin_unlock(&c->journal.flush_write_lock); } #define last_seq(j) ((j)->seq - fifo_used(&(j)->pin) + 1) @@ -559,6 +614,7 @@ static void journal_reclaim(struct cache_set *c) k->ptr[n++] = MAKE_PTR(0, bucket_to_sector(c, ca->sb.d[ja->cur_idx]), ca->sb.nr_this_dev); + atomic_long_inc(&c->reclaimed_journal_buckets); } if (n) { @@ -811,6 +867,10 @@ atomic_t *bch_journal(struct cache_set *c, struct journal_write *w; atomic_t *ret; + /* No journaling if CACHE_SET_IO_DISABLE set already */ + if (unlikely(test_bit(CACHE_SET_IO_DISABLE, &c->flags))) + return NULL; + if (!CACHE_SYNC(&c->sb)) return NULL; @@ -855,7 +915,6 @@ void bch_journal_free(struct cache_set *c) free_pages((unsigned long) c->journal.w[1].data, JSET_BITS); free_pages((unsigned long) c->journal.w[0].data, JSET_BITS); free_fifo(&c->journal.pin); - free_heap(&c->flush_btree); } int bch_journal_alloc(struct cache_set *c) @@ -863,6 +922,7 @@ int bch_journal_alloc(struct cache_set *c) struct journal *j = &c->journal; spin_lock_init(&j->lock); + spin_lock_init(&j->flush_write_lock); INIT_DELAYED_WORK(&j->work, journal_write_work); c->journal_delay_ms = 100; @@ -870,8 +930,7 @@ int bch_journal_alloc(struct cache_set *c) j->w[0].c = c; j->w[1].c = c; - if (!(init_heap(&c->flush_btree, 128, GFP_KERNEL)) || - !(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) || + if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) || !(j->w[0].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS)) || !(j->w[1].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS))) return -ENOMEM; diff --git a/drivers/md/bcache/journal.h b/drivers/md/bcache/journal.h index 66f0facff84b..f2ea34d5f431 100644 --- a/drivers/md/bcache/journal.h +++ b/drivers/md/bcache/journal.h @@ -103,6 +103,8 @@ struct journal_write { /* Embedded in struct cache_set */ struct journal { spinlock_t lock; + spinlock_t flush_write_lock; + bool btree_flushing; /* used when waiting because the journal was full */ struct closure_waitlist wait; struct closure io; @@ -154,6 +156,8 @@ struct journal_device { struct bio_vec bv[8]; }; +#define BTREE_FLUSH_NR 8 + #define journal_pin_cmp(c, l, r) \ (fifo_idx(&(c)->journal.pin, (l)) > fifo_idx(&(c)->journal.pin, (r))) diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 1b63ac876169..26e374fbf57c 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -40,6 +40,7 @@ static const char invalid_uuid[] = { static struct kobject *bcache_kobj; struct mutex bch_register_lock; +bool bcache_is_reboot; LIST_HEAD(bch_cache_sets); static LIST_HEAD(uncached_devices); @@ -49,6 +50,7 @@ static wait_queue_head_t unregister_wait; struct workqueue_struct *bcache_wq; struct workqueue_struct *bch_journal_wq; + #define BTREE_MAX_PAGES (256 * 1024 / PAGE_SIZE) /* limitation of partitions number on single bcache device */ #define BCACHE_MINORS 128 @@ -197,7 +199,9 @@ err: static void write_bdev_super_endio(struct bio *bio) { struct cached_dev *dc = bio->bi_private; - /* XXX: error checking */ + + if (bio->bi_status) + bch_count_backing_io_errors(dc, bio); closure_put(&dc->sb_write); } @@ -691,6 +695,7 @@ static void bcache_device_link(struct bcache_device *d, struct cache_set *c, { unsigned int i; struct cache *ca; + int ret; for_each_cache(ca, d->c, i) bd_link_disk_holder(ca->bdev, d->disk); @@ -698,9 +703,13 @@ static void bcache_device_link(struct bcache_device *d, struct cache_set *c, snprintf(d->name, BCACHEDEVNAME_SIZE, "%s%u", name, d->id); - WARN(sysfs_create_link(&d->kobj, &c->kobj, "cache") || - sysfs_create_link(&c->kobj, &d->kobj, d->name), - "Couldn't create device <-> cache set symlinks"); + ret = sysfs_create_link(&d->kobj, &c->kobj, "cache"); + if (ret < 0) + pr_err("Couldn't create device -> cache set symlink"); + + ret = sysfs_create_link(&c->kobj, &d->kobj, d->name); + if (ret < 0) + pr_err("Couldn't create cache set -> device symlink"); clear_bit(BCACHE_DEV_UNLINK_DONE, &d->flags); } @@ -908,7 +917,7 @@ static int cached_dev_status_update(void *arg) } -void bch_cached_dev_run(struct cached_dev *dc) +int bch_cached_dev_run(struct cached_dev *dc) { struct bcache_device *d = &dc->disk; char *buf = kmemdup_nul(dc->sb.label, SB_LABEL_SIZE, GFP_KERNEL); @@ -919,11 +928,19 @@ void bch_cached_dev_run(struct cached_dev *dc) NULL, }; + if (dc->io_disable) { + pr_err("I/O disabled on cached dev %s", + dc->backing_dev_name); + return -EIO; + } + if (atomic_xchg(&dc->running, 1)) { kfree(env[1]); kfree(env[2]); kfree(buf); - return; + pr_info("cached dev %s is running already", + dc->backing_dev_name); + return -EBUSY; } if (!d->c && @@ -949,8 +966,11 @@ void bch_cached_dev_run(struct cached_dev *dc) kfree(buf); if (sysfs_create_link(&d->kobj, &disk_to_dev(d->disk)->kobj, "dev") || - sysfs_create_link(&disk_to_dev(d->disk)->kobj, &d->kobj, "bcache")) - pr_debug("error creating sysfs link"); + sysfs_create_link(&disk_to_dev(d->disk)->kobj, + &d->kobj, "bcache")) { + pr_err("Couldn't create bcache dev <-> disk sysfs symlinks"); + return -ENOMEM; + } dc->status_update_thread = kthread_run(cached_dev_status_update, dc, "bcache_status_update"); @@ -959,6 +979,8 @@ void bch_cached_dev_run(struct cached_dev *dc) "continue to run without monitoring backing " "device status"); } + + return 0; } /* @@ -996,7 +1018,6 @@ static void cached_dev_detach_finish(struct work_struct *w) BUG_ON(!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags)); BUG_ON(refcount_read(&dc->count)); - mutex_lock(&bch_register_lock); if (test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)) cancel_writeback_rate_update_dwork(dc); @@ -1012,6 +1033,8 @@ static void cached_dev_detach_finish(struct work_struct *w) bch_write_bdev_super(dc, &cl); closure_sync(&cl); + mutex_lock(&bch_register_lock); + calc_cached_dev_sectors(dc->disk.c); bcache_device_detach(&dc->disk); list_move(&dc->list, &uncached_devices); @@ -1054,6 +1077,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c, uint32_t rtime = cpu_to_le32((u32)ktime_get_real_seconds()); struct uuid_entry *u; struct cached_dev *exist_dc, *t; + int ret = 0; if ((set_uuid && memcmp(set_uuid, c->sb.set_uuid, 16)) || (!set_uuid && memcmp(dc->sb.set_uuid, c->sb.set_uuid, 16))) @@ -1153,6 +1177,8 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c, down_write(&dc->writeback_lock); if (bch_cached_dev_writeback_start(dc)) { up_write(&dc->writeback_lock); + pr_err("Couldn't start writeback facilities for %s", + dc->disk.disk->disk_name); return -ENOMEM; } @@ -1163,7 +1189,22 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c, bch_sectors_dirty_init(&dc->disk); - bch_cached_dev_run(dc); + ret = bch_cached_dev_run(dc); + if (ret && (ret != -EBUSY)) { + up_write(&dc->writeback_lock); + /* + * bch_register_lock is held, bcache_device_stop() is not + * able to be directly called. The kthread and kworker + * created previously in bch_cached_dev_writeback_start() + * have to be stopped manually here. + */ + kthread_stop(dc->writeback_thread); + cancel_writeback_rate_update_dwork(dc); + pr_err("Couldn't run cached device %s", + dc->backing_dev_name); + return ret; + } + bcache_device_link(&dc->disk, c, "bdev"); atomic_inc(&c->attached_dev_nr); @@ -1190,18 +1231,16 @@ static void cached_dev_free(struct closure *cl) { struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl); - mutex_lock(&bch_register_lock); - if (test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags)) cancel_writeback_rate_update_dwork(dc); if (!IS_ERR_OR_NULL(dc->writeback_thread)) kthread_stop(dc->writeback_thread); - if (dc->writeback_write_wq) - destroy_workqueue(dc->writeback_write_wq); if (!IS_ERR_OR_NULL(dc->status_update_thread)) kthread_stop(dc->status_update_thread); + mutex_lock(&bch_register_lock); + if (atomic_read(&dc->running)) bd_unlink_disk_holder(dc->bdev, dc->disk.disk); bcache_device_free(&dc->disk); @@ -1290,6 +1329,7 @@ static int register_bdev(struct cache_sb *sb, struct page *sb_page, { const char *err = "cannot allocate memory"; struct cache_set *c; + int ret = -ENOMEM; bdevname(bdev, dc->backing_dev_name); memcpy(&dc->sb, sb, sizeof(struct cache_sb)); @@ -1319,14 +1359,18 @@ static int register_bdev(struct cache_sb *sb, struct page *sb_page, bch_cached_dev_attach(dc, c, NULL); if (BDEV_STATE(&dc->sb) == BDEV_STATE_NONE || - BDEV_STATE(&dc->sb) == BDEV_STATE_STALE) - bch_cached_dev_run(dc); + BDEV_STATE(&dc->sb) == BDEV_STATE_STALE) { + err = "failed to run cached device"; + ret = bch_cached_dev_run(dc); + if (ret) + goto err; + } return 0; err: pr_notice("error %s: %s", dc->backing_dev_name, err); bcache_device_stop(&dc->disk); - return -EIO; + return ret; } /* Flash only volumes */ @@ -1437,8 +1481,6 @@ int bch_flash_dev_create(struct cache_set *c, uint64_t size) bool bch_cached_dev_error(struct cached_dev *dc) { - struct cache_set *c; - if (!dc || test_bit(BCACHE_DEV_CLOSING, &dc->disk.flags)) return false; @@ -1449,21 +1491,6 @@ bool bch_cached_dev_error(struct cached_dev *dc) pr_err("stop %s: too many IO errors on backing device %s\n", dc->disk.disk->disk_name, dc->backing_dev_name); - /* - * If the cached device is still attached to a cache set, - * even dc->io_disable is true and no more I/O requests - * accepted, cache device internal I/O (writeback scan or - * garbage collection) may still prevent bcache device from - * being stopped. So here CACHE_SET_IO_DISABLE should be - * set to c->flags too, to make the internal I/O to cache - * device rejected and stopped immediately. - * If c is NULL, that means the bcache device is not attached - * to any cache set, then no CACHE_SET_IO_DISABLE bit to set. - */ - c = dc->disk.c; - if (c && test_and_set_bit(CACHE_SET_IO_DISABLE, &c->flags)) - pr_info("CACHE_SET_IO_DISABLE already set"); - bcache_device_stop(&dc->disk); return true; } @@ -1564,19 +1591,23 @@ static void cache_set_flush(struct closure *cl) kobject_put(&c->internal); kobject_del(&c->kobj); - if (c->gc_thread) + if (!IS_ERR_OR_NULL(c->gc_thread)) kthread_stop(c->gc_thread); if (!IS_ERR_OR_NULL(c->root)) list_add(&c->root->list, &c->btree_cache); - /* Should skip this if we're unregistering because of an error */ - list_for_each_entry(b, &c->btree_cache, list) { - mutex_lock(&b->write_lock); - if (btree_node_dirty(b)) - __bch_btree_node_write(b, NULL); - mutex_unlock(&b->write_lock); - } + /* + * Avoid flushing cached nodes if cache set is retiring + * due to too many I/O errors detected. + */ + if (!test_bit(CACHE_SET_IO_DISABLE, &c->flags)) + list_for_each_entry(b, &c->btree_cache, list) { + mutex_lock(&b->write_lock); + if (btree_node_dirty(b)) + __bch_btree_node_write(b, NULL); + mutex_unlock(&b->write_lock); + } for_each_cache(ca, c, i) if (ca->alloc_thread) @@ -1849,6 +1880,23 @@ static int run_cache_set(struct cache_set *c) if (bch_btree_check(c)) goto err; + /* + * bch_btree_check() may occupy too much system memory which + * has negative effects to user space application (e.g. data + * base) performance. Shrink the mca cache memory proactively + * here to avoid competing memory with user space workloads.. + */ + if (!c->shrinker_disabled) { + struct shrink_control sc; + + sc.gfp_mask = GFP_KERNEL; + sc.nr_to_scan = c->btree_cache_used * c->btree_pages; + /* first run to clear b->accessed tag */ + c->shrink.scan_objects(&c->shrink, &sc); + /* second run to reap non-accessed nodes */ + c->shrink.scan_objects(&c->shrink, &sc); + } + bch_journal_mark(c, &journal); bch_initial_gc_finish(c); pr_debug("btree_check() done"); @@ -1957,7 +2005,7 @@ err: } closure_sync(&cl); - /* XXX: test this, it's broken */ + bch_cache_set_error(c, "%s", err); return -EIO; @@ -2251,9 +2299,13 @@ err: static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, const char *buffer, size_t size); +static ssize_t bch_pending_bdevs_cleanup(struct kobject *k, + struct kobj_attribute *attr, + const char *buffer, size_t size); kobj_attribute_write(register, register_bcache); kobj_attribute_write(register_quiet, register_bcache); +kobj_attribute_write(pendings_cleanup, bch_pending_bdevs_cleanup); static bool bch_is_open_backing(struct block_device *bdev) { @@ -2301,6 +2353,11 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, if (!try_module_get(THIS_MODULE)) return -EBUSY; + /* For latest state of bcache_is_reboot */ + smp_mb(); + if (bcache_is_reboot) + return -EBUSY; + path = kstrndup(buffer, size, GFP_KERNEL); if (!path) goto err; @@ -2378,8 +2435,61 @@ err: goto out; } + +struct pdev { + struct list_head list; + struct cached_dev *dc; +}; + +static ssize_t bch_pending_bdevs_cleanup(struct kobject *k, + struct kobj_attribute *attr, + const char *buffer, + size_t size) +{ + LIST_HEAD(pending_devs); + ssize_t ret = size; + struct cached_dev *dc, *tdc; + struct pdev *pdev, *tpdev; + struct cache_set *c, *tc; + + mutex_lock(&bch_register_lock); + list_for_each_entry_safe(dc, tdc, &uncached_devices, list) { + pdev = kmalloc(sizeof(struct pdev), GFP_KERNEL); + if (!pdev) + break; + pdev->dc = dc; + list_add(&pdev->list, &pending_devs); + } + + list_for_each_entry_safe(pdev, tpdev, &pending_devs, list) { + list_for_each_entry_safe(c, tc, &bch_cache_sets, list) { + char *pdev_set_uuid = pdev->dc->sb.set_uuid; + char *set_uuid = c->sb.uuid; + + if (!memcmp(pdev_set_uuid, set_uuid, 16)) { + list_del(&pdev->list); + kfree(pdev); + break; + } + } + } + mutex_unlock(&bch_register_lock); + + list_for_each_entry_safe(pdev, tpdev, &pending_devs, list) { + pr_info("delete pdev %p", pdev); + list_del(&pdev->list); + bcache_device_stop(&pdev->dc->disk); + kfree(pdev); + } + + return ret; +} + static int bcache_reboot(struct notifier_block *n, unsigned long code, void *x) { + if (bcache_is_reboot) + return NOTIFY_DONE; + if (code == SYS_DOWN || code == SYS_HALT || code == SYS_POWER_OFF) { @@ -2392,19 +2502,45 @@ static int bcache_reboot(struct notifier_block *n, unsigned long code, void *x) mutex_lock(&bch_register_lock); + if (bcache_is_reboot) + goto out; + + /* New registration is rejected since now */ + bcache_is_reboot = true; + /* + * Make registering caller (if there is) on other CPU + * core know bcache_is_reboot set to true earlier + */ + smp_mb(); + if (list_empty(&bch_cache_sets) && list_empty(&uncached_devices)) goto out; + mutex_unlock(&bch_register_lock); + pr_info("Stopping all devices:"); + /* + * The reason bch_register_lock is not held to call + * bch_cache_set_stop() and bcache_device_stop() is to + * avoid potential deadlock during reboot, because cache + * set or bcache device stopping process will acqurie + * bch_register_lock too. + * + * We are safe here because bcache_is_reboot sets to + * true already, register_bcache() will reject new + * registration now. bcache_is_reboot also makes sure + * bcache_reboot() won't be re-entered on by other thread, + * so there is no race in following list iteration by + * list_for_each_entry_safe(). + */ list_for_each_entry_safe(c, tc, &bch_cache_sets, list) bch_cache_set_stop(c); list_for_each_entry_safe(dc, tdc, &uncached_devices, list) bcache_device_stop(&dc->disk); - mutex_unlock(&bch_register_lock); /* * Give an early chance for other kthreads and @@ -2496,6 +2632,7 @@ static int __init bcache_init(void) static const struct attribute *files[] = { &ksysfs_register.attr, &ksysfs_register_quiet.attr, + &ksysfs_pendings_cleanup.attr, NULL }; @@ -2531,6 +2668,8 @@ static int __init bcache_init(void) bch_debug_init(); closure_debug_init(); + bcache_is_reboot = false; + return 0; err: bcache_exit(); diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index bfb437ffb13c..9f0826712845 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -16,33 +16,31 @@ #include <linux/sort.h> #include <linux/sched/clock.h> +extern bool bcache_is_reboot; + /* Default is 0 ("writethrough") */ static const char * const bch_cache_modes[] = { "writethrough", "writeback", "writearound", - "none", - NULL + "none" }; /* Default is 0 ("auto") */ static const char * const bch_stop_on_failure_modes[] = { "auto", - "always", - NULL + "always" }; static const char * const cache_replacement_policies[] = { "lru", "fifo", - "random", - NULL + "random" }; static const char * const error_actions[] = { "unregister", - "panic", - NULL + "panic" }; write_attribute(attach); @@ -84,8 +82,8 @@ read_attribute(bset_tree_stats); read_attribute(state); read_attribute(cache_read_races); read_attribute(reclaim); +read_attribute(reclaimed_journal_buckets); read_attribute(flush_write); -read_attribute(retry_flush_write); read_attribute(writeback_keys_done); read_attribute(writeback_keys_failed); read_attribute(io_errors); @@ -180,7 +178,7 @@ SHOW(__bch_cached_dev) var_print(writeback_percent); sysfs_hprint(writeback_rate, wb ? atomic_long_read(&dc->writeback_rate.rate) << 9 : 0); - sysfs_hprint(io_errors, atomic_read(&dc->io_errors)); + sysfs_printf(io_errors, "%i", atomic_read(&dc->io_errors)); sysfs_printf(io_error_limit, "%i", dc->error_limit); sysfs_printf(io_disable, "%i", dc->io_disable); var_print(writeback_rate_update_seconds); @@ -271,6 +269,10 @@ STORE(__cached_dev) struct cache_set *c; struct kobj_uevent_env *env; + /* no user space access if system is rebooting */ + if (bcache_is_reboot) + return -EBUSY; + #define d_strtoul(var) sysfs_strtoul(var, dc->var) #define d_strtoul_nonzero(var) sysfs_strtoul_clamp(var, dc->var, 1, INT_MAX) #define d_strtoi_h(var) sysfs_hatoi(var, dc->var) @@ -329,11 +331,14 @@ STORE(__cached_dev) bch_cache_accounting_clear(&dc->accounting); if (attr == &sysfs_running && - strtoul_or_return(buf)) - bch_cached_dev_run(dc); + strtoul_or_return(buf)) { + v = bch_cached_dev_run(dc); + if (v) + return v; + } if (attr == &sysfs_cache_mode) { - v = __sysfs_match_string(bch_cache_modes, -1, buf); + v = sysfs_match_string(bch_cache_modes, buf); if (v < 0) return v; @@ -344,7 +349,7 @@ STORE(__cached_dev) } if (attr == &sysfs_stop_when_cache_set_failed) { - v = __sysfs_match_string(bch_stop_on_failure_modes, -1, buf); + v = sysfs_match_string(bch_stop_on_failure_modes, buf); if (v < 0) return v; @@ -408,6 +413,10 @@ STORE(bch_cached_dev) struct cached_dev *dc = container_of(kobj, struct cached_dev, disk.kobj); + /* no user space access if system is rebooting */ + if (bcache_is_reboot) + return -EBUSY; + mutex_lock(&bch_register_lock); size = __cached_dev_store(kobj, attr, buf, size); @@ -464,7 +473,7 @@ static struct attribute *bch_cached_dev_files[] = { &sysfs_writeback_rate_p_term_inverse, &sysfs_writeback_rate_minimum, &sysfs_writeback_rate_debug, - &sysfs_errors, + &sysfs_io_errors, &sysfs_io_error_limit, &sysfs_io_disable, &sysfs_dirty_data, @@ -511,6 +520,10 @@ STORE(__bch_flash_dev) kobj); struct uuid_entry *u = &d->c->uuids[d->id]; + /* no user space access if system is rebooting */ + if (bcache_is_reboot) + return -EBUSY; + sysfs_strtoul(data_csum, d->data_csum); if (attr == &sysfs_size) { @@ -693,12 +706,12 @@ SHOW(__bch_cache_set) sysfs_print(reclaim, atomic_long_read(&c->reclaim)); + sysfs_print(reclaimed_journal_buckets, + atomic_long_read(&c->reclaimed_journal_buckets)); + sysfs_print(flush_write, atomic_long_read(&c->flush_write)); - sysfs_print(retry_flush_write, - atomic_long_read(&c->retry_flush_write)); - sysfs_print(writeback_keys_done, atomic_long_read(&c->writeback_keys_done)); sysfs_print(writeback_keys_failed, @@ -746,6 +759,10 @@ STORE(__bch_cache_set) struct cache_set *c = container_of(kobj, struct cache_set, kobj); ssize_t v; + /* no user space access if system is rebooting */ + if (bcache_is_reboot) + return -EBUSY; + if (attr == &sysfs_unregister) bch_cache_set_unregister(c); @@ -799,7 +816,7 @@ STORE(__bch_cache_set) 0, UINT_MAX); if (attr == &sysfs_errors) { - v = __sysfs_match_string(error_actions, -1, buf); + v = sysfs_match_string(error_actions, buf); if (v < 0) return v; @@ -865,6 +882,10 @@ STORE(bch_cache_set_internal) { struct cache_set *c = container_of(kobj, struct cache_set, internal); + /* no user space access if system is rebooting */ + if (bcache_is_reboot) + return -EBUSY; + return bch_cache_set_store(&c->kobj, attr, buf, size); } @@ -914,8 +935,8 @@ static struct attribute *bch_cache_set_internal_files[] = { &sysfs_bset_tree_stats, &sysfs_cache_read_races, &sysfs_reclaim, + &sysfs_reclaimed_journal_buckets, &sysfs_flush_write, - &sysfs_retry_flush_write, &sysfs_writeback_keys_done, &sysfs_writeback_keys_failed, @@ -1050,6 +1071,10 @@ STORE(__bch_cache) struct cache *ca = container_of(kobj, struct cache, kobj); ssize_t v; + /* no user space access if system is rebooting */ + if (bcache_is_reboot) + return -EBUSY; + if (attr == &sysfs_discard) { bool v = strtoul_or_return(buf); @@ -1063,7 +1088,7 @@ STORE(__bch_cache) } if (attr == &sysfs_cache_replacement_policy) { - v = __sysfs_match_string(cache_replacement_policies, -1, buf); + v = sysfs_match_string(cache_replacement_policies, buf); if (v < 0) return v; diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h index 1fbced94e4cc..c029f7443190 100644 --- a/drivers/md/bcache/util.h +++ b/drivers/md/bcache/util.h @@ -113,8 +113,6 @@ do { \ #define heap_full(h) ((h)->used == (h)->size) -#define heap_empty(h) ((h)->used == 0) - #define DECLARE_FIFO(type, name) \ struct { \ size_t front, back, size, mask; \ diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 73f0efac2b9f..d60268fe49e1 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -122,6 +122,9 @@ static void __update_writeback_rate(struct cached_dev *dc) static bool set_at_max_writeback_rate(struct cache_set *c, struct cached_dev *dc) { + /* Don't set max writeback rate if gc is running */ + if (!c->gc_mark_valid) + return false; /* * Idle_counter is increased everytime when update_writeback_rate() is * called. If all backing devices attached to the same cache set have @@ -735,6 +738,10 @@ static int bch_writeback_thread(void *arg) } } + if (dc->writeback_write_wq) { + flush_workqueue(dc->writeback_write_wq); + destroy_workqueue(dc->writeback_write_wq); + } cached_dev_put(dc); wait_for_kthread_stop(); @@ -830,6 +837,7 @@ int bch_cached_dev_writeback_start(struct cached_dev *dc) "bcache_writeback"); if (IS_ERR(dc->writeback_thread)) { cached_dev_put(dc); + destroy_workqueue(dc->writeback_write_wq); return PTR_ERR(dc->writeback_thread); } dc->writeback_running = true; diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index c01d41198f5e..b092c7b5282f 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -1790,6 +1790,8 @@ void md_bitmap_destroy(struct mddev *mddev) return; md_bitmap_wait_behind_writes(mddev); + mempool_destroy(mddev->wb_info_pool); + mddev->wb_info_pool = NULL; mutex_lock(&mddev->bitmap_info.mutex); spin_lock(&mddev->lock); @@ -1900,10 +1902,14 @@ int md_bitmap_load(struct mddev *mddev) sector_t start = 0; sector_t sector = 0; struct bitmap *bitmap = mddev->bitmap; + struct md_rdev *rdev; if (!bitmap) goto out; + rdev_for_each(rdev, mddev) + mddev_create_wb_pool(mddev, rdev, true); + if (mddev_is_clustered(mddev)) md_cluster_ops->load_bitmaps(mddev, mddev->bitmap_info.nodes); @@ -2462,12 +2468,26 @@ static ssize_t backlog_store(struct mddev *mddev, const char *buf, size_t len) { unsigned long backlog; + unsigned long old_mwb = mddev->bitmap_info.max_write_behind; int rv = kstrtoul(buf, 10, &backlog); if (rv) return rv; if (backlog > COUNTER_MAX) return -EINVAL; mddev->bitmap_info.max_write_behind = backlog; + if (!backlog && mddev->wb_info_pool) { + /* wb_info_pool is not needed if backlog is zero */ + mempool_destroy(mddev->wb_info_pool); + mddev->wb_info_pool = NULL; + } else if (backlog && !mddev->wb_info_pool) { + /* wb_info_pool is needed since backlog is not zero */ + struct md_rdev *rdev; + + rdev_for_each(rdev, mddev) + mddev_create_wb_pool(mddev, rdev, false); + } + if (old_mwb != backlog) + md_bitmap_update_sb(mddev->bitmap); return len; } diff --git a/drivers/md/md.c b/drivers/md/md.c index 9801d540fea1..a114b05e3db4 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -37,6 +37,7 @@ */ +#include <linux/sched/mm.h> #include <linux/sched/signal.h> #include <linux/kthread.h> #include <linux/blkdev.h> @@ -124,6 +125,77 @@ static inline int speed_max(struct mddev *mddev) mddev->sync_speed_max : sysctl_speed_limit_max; } +static int rdev_init_wb(struct md_rdev *rdev) +{ + if (rdev->bdev->bd_queue->nr_hw_queues == 1) + return 0; + + spin_lock_init(&rdev->wb_list_lock); + INIT_LIST_HEAD(&rdev->wb_list); + init_waitqueue_head(&rdev->wb_io_wait); + set_bit(WBCollisionCheck, &rdev->flags); + + return 1; +} + +/* + * Create wb_info_pool if rdev is the first multi-queue device flaged + * with writemostly, also write-behind mode is enabled. + */ +void mddev_create_wb_pool(struct mddev *mddev, struct md_rdev *rdev, + bool is_suspend) +{ + if (mddev->bitmap_info.max_write_behind == 0) + return; + + if (!test_bit(WriteMostly, &rdev->flags) || !rdev_init_wb(rdev)) + return; + + if (mddev->wb_info_pool == NULL) { + unsigned int noio_flag; + + if (!is_suspend) + mddev_suspend(mddev); + noio_flag = memalloc_noio_save(); + mddev->wb_info_pool = mempool_create_kmalloc_pool(NR_WB_INFOS, + sizeof(struct wb_info)); + memalloc_noio_restore(noio_flag); + if (!mddev->wb_info_pool) + pr_err("can't alloc memory pool for writemostly\n"); + if (!is_suspend) + mddev_resume(mddev); + } +} +EXPORT_SYMBOL_GPL(mddev_create_wb_pool); + +/* + * destroy wb_info_pool if rdev is the last device flaged with WBCollisionCheck. + */ +static void mddev_destroy_wb_pool(struct mddev *mddev, struct md_rdev *rdev) +{ + if (!test_and_clear_bit(WBCollisionCheck, &rdev->flags)) + return; + + if (mddev->wb_info_pool) { + struct md_rdev *temp; + int num = 0; + + /* + * Check if other rdevs need wb_info_pool. + */ + rdev_for_each(temp, mddev) + if (temp != rdev && + test_bit(WBCollisionCheck, &temp->flags)) + num++; + if (!num) { + mddev_suspend(rdev->mddev); + mempool_destroy(mddev->wb_info_pool); + mddev->wb_info_pool = NULL; + mddev_resume(rdev->mddev); + } + } +} + static struct ctl_table_header *raid_table_header; static struct ctl_table raid_table[] = { @@ -2210,6 +2282,9 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev) rdev->mddev = mddev; pr_debug("md: bind<%s>\n", b); + if (mddev->raid_disks) + mddev_create_wb_pool(mddev, rdev, false); + if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b))) goto fail; @@ -2246,6 +2321,7 @@ static void unbind_rdev_from_array(struct md_rdev *rdev) bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk); list_del_rcu(&rdev->same_set); pr_debug("md: unbind<%s>\n", bdevname(rdev->bdev,b)); + mddev_destroy_wb_pool(rdev->mddev, rdev); rdev->mddev = NULL; sysfs_remove_link(&rdev->kobj, "block"); sysfs_put(rdev->sysfs_state); @@ -2758,8 +2834,10 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len) } } else if (cmd_match(buf, "writemostly")) { set_bit(WriteMostly, &rdev->flags); + mddev_create_wb_pool(rdev->mddev, rdev, false); err = 0; } else if (cmd_match(buf, "-writemostly")) { + mddev_destroy_wb_pool(rdev->mddev, rdev); clear_bit(WriteMostly, &rdev->flags); err = 0; } else if (cmd_match(buf, "blocked")) { @@ -3356,7 +3434,7 @@ rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page) if (!entry->show) return -EIO; if (!rdev->mddev) - return -EBUSY; + return -ENODEV; return entry->show(rdev, page); } @@ -5588,15 +5666,28 @@ int md_run(struct mddev *mddev) mddev->bitmap = bitmap; } - if (err) { - mddev_detach(mddev); - if (mddev->private) - pers->free(mddev, mddev->private); - mddev->private = NULL; - module_put(pers->owner); - md_bitmap_destroy(mddev); - goto abort; + if (err) + goto bitmap_abort; + + if (mddev->bitmap_info.max_write_behind > 0) { + bool creat_pool = false; + + rdev_for_each(rdev, mddev) { + if (test_bit(WriteMostly, &rdev->flags) && + rdev_init_wb(rdev)) + creat_pool = true; + } + if (creat_pool && mddev->wb_info_pool == NULL) { + mddev->wb_info_pool = + mempool_create_kmalloc_pool(NR_WB_INFOS, + sizeof(struct wb_info)); + if (!mddev->wb_info_pool) { + err = -ENOMEM; + goto bitmap_abort; + } + } } + if (mddev->queue) { bool nonrot = true; @@ -5639,8 +5730,7 @@ int md_run(struct mddev *mddev) spin_unlock(&mddev->lock); rdev_for_each(rdev, mddev) if (rdev->raid_disk >= 0) - if (sysfs_link_rdev(mddev, rdev)) - /* failure here is OK */; + sysfs_link_rdev(mddev, rdev); /* failure here is OK */ if (mddev->degraded && !mddev->ro) /* This ensures that recovering status is reported immediately @@ -5658,6 +5748,13 @@ int md_run(struct mddev *mddev) sysfs_notify(&mddev->kobj, NULL, "degraded"); return 0; +bitmap_abort: + mddev_detach(mddev); + if (mddev->private) + pers->free(mddev, mddev->private); + mddev->private = NULL; + module_put(pers->owner); + md_bitmap_destroy(mddev); abort: bioset_exit(&mddev->bio_set); bioset_exit(&mddev->sync_set); @@ -5826,6 +5923,8 @@ static void __md_stop_writes(struct mddev *mddev) mddev->in_sync = 1; md_update_sb(mddev, 1); } + mempool_destroy(mddev->wb_info_pool); + mddev->wb_info_pool = NULL; } void md_stop_writes(struct mddev *mddev) @@ -8198,8 +8297,7 @@ void md_do_sync(struct md_thread *thread) { struct mddev *mddev = thread->mddev; struct mddev *mddev2; - unsigned int currspeed = 0, - window; + unsigned int currspeed = 0, window; sector_t max_sectors,j, io_sectors, recovery_done; unsigned long mark[SYNC_MARKS]; unsigned long update_time; @@ -8256,7 +8354,7 @@ void md_do_sync(struct md_thread *thread) * 0 == not engaged in resync at all * 2 == checking that there is no conflict with another sync * 1 == like 2, but have yielded to allow conflicting resync to - * commense + * commence * other == active in resync - this many blocks * * Before starting a resync we must have set curr_resync to @@ -8387,7 +8485,7 @@ void md_do_sync(struct md_thread *thread) /* * Tune reconstruction: */ - window = 32*(PAGE_SIZE/512); + window = 32 * (PAGE_SIZE / 512); pr_debug("md: using %dk window, over a total of %lluk.\n", window/2, (unsigned long long)max_sectors/2); @@ -9200,7 +9298,6 @@ static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev) * perform resync with the new activated disk */ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); - } /* device faulty * We just want to do the minimum to mark the disk diff --git a/drivers/md/md.h b/drivers/md/md.h index 7c930c091193..10f98200e2f8 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -109,6 +109,14 @@ struct md_rdev { * for reporting to userspace and storing * in superblock. */ + + /* + * The members for check collision of write behind IOs. + */ + struct list_head wb_list; + spinlock_t wb_list_lock; + wait_queue_head_t wb_io_wait; + struct work_struct del_work; /* used for delayed sysfs removal */ struct kernfs_node *sysfs_state; /* handle for 'state' @@ -193,6 +201,10 @@ enum flag_bits { * it didn't fail, so don't use FailFast * any more for metadata */ + WBCollisionCheck, /* + * multiqueue device should check if there + * is collision between write behind bios. + */ }; static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors, @@ -245,6 +257,14 @@ enum mddev_sb_flags { MD_SB_NEED_REWRITE, /* metadata write needs to be repeated */ }; +#define NR_WB_INFOS 8 +/* record current range of write behind IOs */ +struct wb_info { + sector_t lo; + sector_t hi; + struct list_head list; +}; + struct mddev { void *private; struct md_personality *pers; @@ -461,6 +481,7 @@ struct mddev { */ struct work_struct flush_work; struct work_struct event_work; /* used by dm to report failure event */ + mempool_t *wb_info_pool; void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev); struct md_cluster_info *cluster_info; unsigned int good_device_nr; /* good device num within cluster raid */ @@ -709,6 +730,8 @@ extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs, extern void md_reload_sb(struct mddev *mddev, int raid_disk); extern void md_update_sb(struct mddev *mddev, int force); extern void md_kick_rdev_from_array(struct md_rdev * rdev); +extern void mddev_create_wb_pool(struct mddev *mddev, struct md_rdev *rdev, + bool is_suspend); struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr); struct md_rdev *md_find_rdev_rcu(struct mddev *mddev, dev_t dev); diff --git a/drivers/md/raid1-10.c b/drivers/md/raid1-10.c index 400001b815db..54db34163968 100644 --- a/drivers/md/raid1-10.c +++ b/drivers/md/raid1-10.c @@ -3,12 +3,42 @@ #define RESYNC_BLOCK_SIZE (64*1024) #define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE) +/* + * Number of guaranteed raid bios in case of extreme VM load: + */ +#define NR_RAID_BIOS 256 + +/* when we get a read error on a read-only array, we redirect to another + * device without failing the first device, or trying to over-write to + * correct the read error. To keep track of bad blocks on a per-bio + * level, we store IO_BLOCKED in the appropriate 'bios' pointer + */ +#define IO_BLOCKED ((struct bio *)1) +/* When we successfully write to a known bad-block, we need to remove the + * bad-block marking which must be done from process context. So we record + * the success by setting devs[n].bio to IO_MADE_GOOD + */ +#define IO_MADE_GOOD ((struct bio *)2) + +#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2) + +/* When there are this many requests queue to be written by + * the raid thread, we become 'congested' to provide back-pressure + * for writeback. + */ +static int max_queued_requests = 1024; + /* for managing resync I/O pages */ struct resync_pages { void *raid_bio; struct page *pages[RESYNC_PAGES]; }; +static void rbio_pool_free(void *rbio, void *data) +{ + kfree(rbio); +} + static inline int resync_alloc_pages(struct resync_pages *rp, gfp_t gfp_flags) { diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 2aa36e570e04..34e26834ad28 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -42,31 +42,6 @@ (1L << MD_HAS_PPL) | \ (1L << MD_HAS_MULTIPLE_PPLS)) -/* - * Number of guaranteed r1bios in case of extreme VM load: - */ -#define NR_RAID1_BIOS 256 - -/* when we get a read error on a read-only array, we redirect to another - * device without failing the first device, or trying to over-write to - * correct the read error. To keep track of bad blocks on a per-bio - * level, we store IO_BLOCKED in the appropriate 'bios' pointer - */ -#define IO_BLOCKED ((struct bio *)1) -/* When we successfully write to a known bad-block, we need to remove the - * bad-block marking which must be done from process context. So we record - * the success by setting devs[n].bio to IO_MADE_GOOD - */ -#define IO_MADE_GOOD ((struct bio *)2) - -#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2) - -/* When there are this many requests queue to be written by - * the raid1 thread, we become 'congested' to provide back-pressure - * for writeback. - */ -static int max_queued_requests = 1024; - static void allow_barrier(struct r1conf *conf, sector_t sector_nr); static void lower_barrier(struct r1conf *conf, sector_t sector_nr); @@ -75,6 +50,57 @@ static void lower_barrier(struct r1conf *conf, sector_t sector_nr); #include "raid1-10.c" +static int check_and_add_wb(struct md_rdev *rdev, sector_t lo, sector_t hi) +{ + struct wb_info *wi, *temp_wi; + unsigned long flags; + int ret = 0; + struct mddev *mddev = rdev->mddev; + + wi = mempool_alloc(mddev->wb_info_pool, GFP_NOIO); + + spin_lock_irqsave(&rdev->wb_list_lock, flags); + list_for_each_entry(temp_wi, &rdev->wb_list, list) { + /* collision happened */ + if (hi > temp_wi->lo && lo < temp_wi->hi) { + ret = -EBUSY; + break; + } + } + + if (!ret) { + wi->lo = lo; + wi->hi = hi; + list_add(&wi->list, &rdev->wb_list); + } else + mempool_free(wi, mddev->wb_info_pool); + spin_unlock_irqrestore(&rdev->wb_list_lock, flags); + + return ret; +} + +static void remove_wb(struct md_rdev *rdev, sector_t lo, sector_t hi) +{ + struct wb_info *wi; + unsigned long flags; + int found = 0; + struct mddev *mddev = rdev->mddev; + + spin_lock_irqsave(&rdev->wb_list_lock, flags); + list_for_each_entry(wi, &rdev->wb_list, list) + if (hi == wi->hi && lo == wi->lo) { + list_del(&wi->list); + mempool_free(wi, mddev->wb_info_pool); + found = 1; + break; + } + + if (!found) + WARN(1, "The write behind IO is not recorded\n"); + spin_unlock_irqrestore(&rdev->wb_list_lock, flags); + wake_up(&rdev->wb_io_wait); +} + /* * for resync bio, r1bio pointer can be retrieved from the per-bio * 'struct resync_pages'. @@ -93,11 +119,6 @@ static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data) return kzalloc(size, gfp_flags); } -static void r1bio_pool_free(void *r1_bio, void *data) -{ - kfree(r1_bio); -} - #define RESYNC_DEPTH 32 #define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9) #define RESYNC_WINDOW (RESYNC_BLOCK_SIZE * RESYNC_DEPTH) @@ -173,7 +194,7 @@ out_free_bio: kfree(rps); out_free_r1bio: - r1bio_pool_free(r1_bio, data); + rbio_pool_free(r1_bio, data); return NULL; } @@ -193,7 +214,7 @@ static void r1buf_pool_free(void *__r1_bio, void *data) /* resync pages array stored in the 1st bio's .bi_private */ kfree(rp); - r1bio_pool_free(r1bio, data); + rbio_pool_free(r1bio, data); } static void put_all_bios(struct r1conf *conf, struct r1bio *r1_bio) @@ -476,6 +497,12 @@ static void raid1_end_write_request(struct bio *bio) } if (behind) { + if (test_bit(WBCollisionCheck, &rdev->flags)) { + sector_t lo = r1_bio->sector; + sector_t hi = r1_bio->sector + r1_bio->sectors; + + remove_wb(rdev, lo, hi); + } if (test_bit(WriteMostly, &rdev->flags)) atomic_dec(&r1_bio->behind_remaining); @@ -1449,7 +1476,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, if (!r1_bio->bios[i]) continue; - if (first_clone) { /* do behind I/O ? * Not if there are too many, or cannot @@ -1474,7 +1500,16 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, mbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set); if (r1_bio->behind_master_bio) { - if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags)) + struct md_rdev *rdev = conf->mirrors[i].rdev; + + if (test_bit(WBCollisionCheck, &rdev->flags)) { + sector_t lo = r1_bio->sector; + sector_t hi = r1_bio->sector + r1_bio->sectors; + + wait_event(rdev->wb_io_wait, + check_and_add_wb(rdev, lo, hi) == 0); + } + if (test_bit(WriteMostly, &rdev->flags)) atomic_inc(&r1_bio->behind_remaining); } @@ -1729,9 +1764,8 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev) first = last = rdev->saved_raid_disk; for (mirror = first; mirror <= last; mirror++) { - p = conf->mirrors+mirror; + p = conf->mirrors + mirror; if (!p->rdev) { - if (mddev->gendisk) disk_stack_limits(mddev->gendisk, rdev->bdev, rdev->data_offset << 9); @@ -2888,7 +2922,6 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, if (read_targets == 1) bio->bi_opf &= ~MD_FAILFAST; generic_make_request(bio); - } return nr_sectors; } @@ -2947,8 +2980,8 @@ static struct r1conf *setup_conf(struct mddev *mddev) if (!conf->poolinfo) goto abort; conf->poolinfo->raid_disks = mddev->raid_disks * 2; - err = mempool_init(&conf->r1bio_pool, NR_RAID1_BIOS, r1bio_pool_alloc, - r1bio_pool_free, conf->poolinfo); + err = mempool_init(&conf->r1bio_pool, NR_RAID_BIOS, r1bio_pool_alloc, + rbio_pool_free, conf->poolinfo); if (err) goto abort; @@ -3089,7 +3122,7 @@ static int raid1_run(struct mddev *mddev) } mddev->degraded = 0; - for (i=0; i < conf->raid_disks; i++) + for (i = 0; i < conf->raid_disks; i++) if (conf->mirrors[i].rdev == NULL || !test_bit(In_sync, &conf->mirrors[i].rdev->flags) || test_bit(Faulty, &conf->mirrors[i].rdev->flags)) @@ -3124,7 +3157,7 @@ static int raid1_run(struct mddev *mddev) mddev->queue); } - ret = md_integrity_register(mddev); + ret = md_integrity_register(mddev); if (ret) { md_unregister_thread(&mddev->thread); raid1_free(mddev, conf); @@ -3232,8 +3265,8 @@ static int raid1_reshape(struct mddev *mddev) newpoolinfo->mddev = mddev; newpoolinfo->raid_disks = raid_disks * 2; - ret = mempool_init(&newpool, NR_RAID1_BIOS, r1bio_pool_alloc, - r1bio_pool_free, newpoolinfo); + ret = mempool_init(&newpool, NR_RAID_BIOS, r1bio_pool_alloc, + rbio_pool_free, newpoolinfo); if (ret) { kfree(newpoolinfo); return ret; diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index aea11476fee6..8a1354a08a1a 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -64,31 +64,6 @@ * [B A] [D C] [B A] [E C D] */ -/* - * Number of guaranteed r10bios in case of extreme VM load: - */ -#define NR_RAID10_BIOS 256 - -/* when we get a read error on a read-only array, we redirect to another - * device without failing the first device, or trying to over-write to - * correct the read error. To keep track of bad blocks on a per-bio - * level, we store IO_BLOCKED in the appropriate 'bios' pointer - */ -#define IO_BLOCKED ((struct bio *)1) -/* When we successfully write to a known bad-block, we need to remove the - * bad-block marking which must be done from process context. So we record - * the success by setting devs[n].bio to IO_MADE_GOOD - */ -#define IO_MADE_GOOD ((struct bio *)2) - -#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2) - -/* When there are this many requests queued to be written by - * the raid10 thread, we become 'congested' to provide back-pressure - * for writeback. - */ -static int max_queued_requests = 1024; - static void allow_barrier(struct r10conf *conf); static void lower_barrier(struct r10conf *conf); static int _enough(struct r10conf *conf, int previous, int ignore); @@ -123,11 +98,6 @@ static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data) return kzalloc(size, gfp_flags); } -static void r10bio_pool_free(void *r10_bio, void *data) -{ - kfree(r10_bio); -} - #define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9) /* amount of memory to reserve for resync requests */ #define RESYNC_WINDOW (1024*1024) @@ -233,7 +203,7 @@ out_free_bio: } kfree(rps); out_free_r10bio: - r10bio_pool_free(r10_bio, conf); + rbio_pool_free(r10_bio, conf); return NULL; } @@ -261,7 +231,7 @@ static void r10buf_pool_free(void *__r10_bio, void *data) /* resync pages array stored in the 1st bio's .bi_private */ kfree(rp); - r10bio_pool_free(r10bio, conf); + rbio_pool_free(r10bio, conf); } static void put_all_bios(struct r10conf *conf, struct r10bio *r10_bio) @@ -737,15 +707,19 @@ static struct md_rdev *read_balance(struct r10conf *conf, int sectors = r10_bio->sectors; int best_good_sectors; sector_t new_distance, best_dist; - struct md_rdev *best_rdev, *rdev = NULL; + struct md_rdev *best_dist_rdev, *best_pending_rdev, *rdev = NULL; int do_balance; - int best_slot; + int best_dist_slot, best_pending_slot; + bool has_nonrot_disk = false; + unsigned int min_pending; struct geom *geo = &conf->geo; raid10_find_phys(conf, r10_bio); rcu_read_lock(); - best_slot = -1; - best_rdev = NULL; + best_dist_slot = -1; + min_pending = UINT_MAX; + best_dist_rdev = NULL; + best_pending_rdev = NULL; best_dist = MaxSector; best_good_sectors = 0; do_balance = 1; @@ -767,6 +741,8 @@ static struct md_rdev *read_balance(struct r10conf *conf, sector_t first_bad; int bad_sectors; sector_t dev_sector; + unsigned int pending; + bool nonrot; if (r10_bio->devs[slot].bio == IO_BLOCKED) continue; @@ -803,8 +779,8 @@ static struct md_rdev *read_balance(struct r10conf *conf, first_bad - dev_sector; if (good_sectors > best_good_sectors) { best_good_sectors = good_sectors; - best_slot = slot; - best_rdev = rdev; + best_dist_slot = slot; + best_dist_rdev = rdev; } if (!do_balance) /* Must read from here */ @@ -817,14 +793,23 @@ static struct md_rdev *read_balance(struct r10conf *conf, if (!do_balance) break; - if (best_slot >= 0) + nonrot = blk_queue_nonrot(bdev_get_queue(rdev->bdev)); + has_nonrot_disk |= nonrot; + pending = atomic_read(&rdev->nr_pending); + if (min_pending > pending && nonrot) { + min_pending = pending; + best_pending_slot = slot; + best_pending_rdev = rdev; + } + + if (best_dist_slot >= 0) /* At least 2 disks to choose from so failfast is OK */ set_bit(R10BIO_FailFast, &r10_bio->state); /* This optimisation is debatable, and completely destroys * sequential read speed for 'far copies' arrays. So only * keep it for 'near' arrays, and review those later. */ - if (geo->near_copies > 1 && !atomic_read(&rdev->nr_pending)) + if (geo->near_copies > 1 && !pending) new_distance = 0; /* for far > 1 always use the lowest address */ @@ -833,15 +818,21 @@ static struct md_rdev *read_balance(struct r10conf *conf, else new_distance = abs(r10_bio->devs[slot].addr - conf->mirrors[disk].head_position); + if (new_distance < best_dist) { best_dist = new_distance; - best_slot = slot; - best_rdev = rdev; + best_dist_slot = slot; + best_dist_rdev = rdev; } } if (slot >= conf->copies) { - slot = best_slot; - rdev = best_rdev; + if (has_nonrot_disk) { + slot = best_pending_slot; + rdev = best_pending_rdev; + } else { + slot = best_dist_slot; + rdev = best_dist_rdev; + } } if (slot >= 0) { @@ -3675,8 +3666,8 @@ static struct r10conf *setup_conf(struct mddev *mddev) conf->geo = geo; conf->copies = copies; - err = mempool_init(&conf->r10bio_pool, NR_RAID10_BIOS, r10bio_pool_alloc, - r10bio_pool_free, conf); + err = mempool_init(&conf->r10bio_pool, NR_RAID_BIOS, r10bio_pool_alloc, + rbio_pool_free, conf); if (err) goto out; @@ -4780,8 +4771,7 @@ static int handle_reshape_read_error(struct mddev *mddev, int idx = 0; struct page **pages; - r10b = kmalloc(sizeof(*r10b) + - sizeof(struct r10dev) * conf->copies, GFP_NOIO); + r10b = kmalloc(struct_size(r10b, devs, conf->copies), GFP_NOIO); if (!r10b) { set_bit(MD_RECOVERY_INTR, &mddev->recovery); return -ENOMEM; diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index b83bce2beb66..3de4e13bde98 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5251,7 +5251,6 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio) rcu_read_unlock(); raid_bio->bi_next = (void*)rdev; bio_set_dev(align_bi, rdev->bdev); - bio_clear_flag(align_bi, BIO_SEG_VALID); if (is_badblock(rdev, align_bi->bi_iter.bi_sector, bio_sectors(align_bi), @@ -7672,7 +7671,7 @@ abort: static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev) { struct r5conf *conf = mddev->private; - int err = -EEXIST; + int ret, err = -EEXIST; int disk; struct disk_info *p; int first = 0; @@ -7687,7 +7686,14 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev) * The array is in readonly mode if journal is missing, so no * write requests running. We should be safe */ - log_init(conf, rdev, false); + ret = log_init(conf, rdev, false); + if (ret) + return ret; + + ret = r5l_start(conf->log); + if (ret) + return ret; + return 0; } if (mddev->recovery_disabled == conf->recovery_disabled) |