From 5c694129c8db6d89c9be109049a16510b2f70f6d Mon Sep 17 00:00:00 2001 From: Kumar Amit Mehta Date: Tue, 28 May 2013 00:31:15 -0700 Subject: md: bcache: io.c: fix a potential NULL pointer dereference bio_alloc_bioset returns NULL on failure. This fix adds a missing check for potential NULL pointer dereferencing. Signed-off-by: Kumar Amit Mehta Signed-off-by: Kent Overstreet --- drivers/md/bcache/io.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'drivers/md') diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c index 48efd4dea645..d285cd49104c 100644 --- a/drivers/md/bcache/io.c +++ b/drivers/md/bcache/io.c @@ -97,6 +97,8 @@ struct bio *bch_bio_split(struct bio *bio, int sectors, if (bio->bi_rw & REQ_DISCARD) { ret = bio_alloc_bioset(gfp, 1, bs); + if (!ret) + return NULL; idx = 0; goto out; } -- cgit v1.2.3 From bbc77aa7fb72e616edcb1b165c293e47c6d4d0cf Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 28 May 2013 21:53:19 -0700 Subject: bcache: fix a spurious gcc complaint, use scnprintf An old version of gcc was complaining about using a const int as the size of a stack allocated array. Which should be fine - but using ARRAY_SIZE() is better, anyways. Also, refactor the code to use scnprintf(). Signed-off-by: Kent Overstreet --- drivers/md/bcache/sysfs.c | 43 ++++++++++++++++++++++--------------------- 1 file changed, 22 insertions(+), 21 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 4d9cca47e4c6..29228b8a6ffe 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -665,12 +665,10 @@ SHOW(__bch_cache) int cmp(const void *l, const void *r) { return *((uint16_t *) r) - *((uint16_t *) l); } - /* Number of quantiles we compute */ - const unsigned nq = 31; - size_t n = ca->sb.nbuckets, i, unused, btree; uint64_t sum = 0; - uint16_t q[nq], *p, *cached; + /* Compute 31 quantiles */ + uint16_t q[31], *p, *cached; ssize_t ret; cached = p = vmalloc(ca->sb.nbuckets * sizeof(uint16_t)); @@ -703,26 +701,29 @@ SHOW(__bch_cache) if (n) do_div(sum, n); - for (i = 0; i < nq; i++) - q[i] = INITIAL_PRIO - cached[n * (i + 1) / (nq + 1)]; + for (i = 0; i < ARRAY_SIZE(q); i++) + q[i] = INITIAL_PRIO - cached[n * (i + 1) / + (ARRAY_SIZE(q) + 1)]; vfree(p); - ret = snprintf(buf, PAGE_SIZE, - "Unused: %zu%%\n" - "Metadata: %zu%%\n" - "Average: %llu\n" - "Sectors per Q: %zu\n" - "Quantiles: [", - unused * 100 / (size_t) ca->sb.nbuckets, - btree * 100 / (size_t) ca->sb.nbuckets, sum, - n * ca->sb.bucket_size / (nq + 1)); - - for (i = 0; i < nq && ret < (ssize_t) PAGE_SIZE; i++) - ret += snprintf(buf + ret, PAGE_SIZE - ret, - i < nq - 1 ? "%u " : "%u]\n", q[i]); - - buf[PAGE_SIZE - 1] = '\0'; + ret = scnprintf(buf, PAGE_SIZE, + "Unused: %zu%%\n" + "Metadata: %zu%%\n" + "Average: %llu\n" + "Sectors per Q: %zu\n" + "Quantiles: [", + unused * 100 / (size_t) ca->sb.nbuckets, + btree * 100 / (size_t) ca->sb.nbuckets, sum, + n * ca->sb.bucket_size / (ARRAY_SIZE(q) + 1)); + + for (i = 0; i < ARRAY_SIZE(q); i++) + ret += scnprintf(buf + ret, PAGE_SIZE - ret, + "%u ", q[i]); + ret--; + + ret += scnprintf(buf + ret, PAGE_SIZE - ret, "]\n"); + return ret; } -- cgit v1.2.3 From a9dd53adbb84c12f769a862ba2c80404873c2c99 Mon Sep 17 00:00:00 2001 From: Gabriel de Perthuis Date: Sat, 4 May 2013 12:19:41 +0200 Subject: bcache: Warn when a device is already registered. Signed-off-by: Gabriel de Perthuis Signed-off-by: Kent Overstreet --- drivers/md/bcache/super.c | 39 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 37 insertions(+), 2 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index f88e2b653a3f..3de5626919ef 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1786,6 +1786,36 @@ static ssize_t register_bcache(struct kobject *, struct kobj_attribute *, kobj_attribute_write(register, register_bcache); kobj_attribute_write(register_quiet, register_bcache); +static bool bch_is_open_backing(struct block_device *bdev) { + struct cache_set *c, *tc; + struct cached_dev *dc, *t; + + list_for_each_entry_safe(c, tc, &bch_cache_sets, list) + list_for_each_entry_safe(dc, t, &c->cached_devs, list) + if (dc->bdev == bdev) + return true; + list_for_each_entry_safe(dc, t, &uncached_devices, list) + if (dc->bdev == bdev) + return true; + return false; +} + +static bool bch_is_open_cache(struct block_device *bdev) { + struct cache_set *c, *tc; + struct cache *ca; + unsigned i; + + list_for_each_entry_safe(c, tc, &bch_cache_sets, list) + for_each_cache(ca, c, i) + if (ca->bdev == bdev) + return true; + return false; +} + +static bool bch_is_open(struct block_device *bdev) { + return bch_is_open_cache(bdev) || bch_is_open_backing(bdev); +} + static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, const char *buffer, size_t size) { @@ -1810,8 +1840,13 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, FMODE_READ|FMODE_WRITE|FMODE_EXCL, sb); if (IS_ERR(bdev)) { - if (bdev == ERR_PTR(-EBUSY)) - err = "device busy"; + if (bdev == ERR_PTR(-EBUSY)) { + bdev = lookup_bdev(strim(path)); + if (!IS_ERR(bdev) && bch_is_open(bdev)) + err = "device already registered"; + else + err = "device busy"; + } goto err; } -- cgit v1.2.3 From 119ba0f82839cd80eaef3e6991988f1403965d5b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 24 Apr 2013 19:01:12 -0700 Subject: bcache: Convert allocator thread to kthread Using a workqueue when we just want a single thread is a bit silly. Signed-off-by: Kent Overstreet --- drivers/md/bcache/alloc.c | 34 ++++++++++++++++++++++------------ drivers/md/bcache/bcache.h | 17 +++++++++++------ drivers/md/bcache/btree.c | 6 +++--- drivers/md/bcache/super.c | 19 +++++++------------ 4 files changed, 43 insertions(+), 33 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index 048f2947e08b..38428f46ea74 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -63,6 +63,7 @@ #include "bcache.h" #include "btree.h" +#include #include #define MAX_IN_FLIGHT_DISCARDS 8U @@ -151,7 +152,7 @@ static void discard_finish(struct work_struct *w) mutex_unlock(&ca->set->bucket_lock); closure_wake_up(&ca->set->bucket_wait); - wake_up(&ca->set->alloc_wait); + wake_up_process(ca->alloc_thread); closure_put(&ca->set->cl); } @@ -358,30 +359,26 @@ static void invalidate_buckets(struct cache *ca) #define allocator_wait(ca, cond) \ do { \ - DEFINE_WAIT(__wait); \ - \ while (1) { \ - prepare_to_wait(&ca->set->alloc_wait, \ - &__wait, TASK_INTERRUPTIBLE); \ + set_current_state(TASK_INTERRUPTIBLE); \ if (cond) \ break; \ \ mutex_unlock(&(ca)->set->bucket_lock); \ if (test_bit(CACHE_SET_STOPPING_2, &ca->set->flags)) { \ - finish_wait(&ca->set->alloc_wait, &__wait); \ - closure_return(cl); \ + closure_put(&ca->set->cl); \ + return 0; \ } \ \ schedule(); \ mutex_lock(&(ca)->set->bucket_lock); \ } \ - \ - finish_wait(&ca->set->alloc_wait, &__wait); \ + __set_current_state(TASK_RUNNING); \ } while (0) -void bch_allocator_thread(struct closure *cl) +static int bch_allocator_thread(void *arg) { - struct cache *ca = container_of(cl, struct cache, alloc); + struct cache *ca = arg; mutex_lock(&ca->set->bucket_lock); @@ -442,7 +439,7 @@ long bch_bucket_alloc(struct cache *ca, unsigned watermark, struct closure *cl) { long r = -1; again: - wake_up(&ca->set->alloc_wait); + wake_up_process(ca->alloc_thread); if (fifo_used(&ca->free) > ca->watermark[watermark] && fifo_pop(&ca->free, r)) { @@ -552,6 +549,19 @@ int bch_bucket_alloc_set(struct cache_set *c, unsigned watermark, /* Init */ +int bch_cache_allocator_start(struct cache *ca) +{ + ca->alloc_thread = kthread_create(bch_allocator_thread, + ca, "bcache_allocator"); + if (IS_ERR(ca->alloc_thread)) + return PTR_ERR(ca->alloc_thread); + + closure_get(&ca->set->cl); + wake_up_process(ca->alloc_thread); + + return 0; +} + void bch_cache_allocator_exit(struct cache *ca) { struct discard *d; diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index d3e15b42a4ab..166c8ddc0be4 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -565,8 +565,7 @@ struct cache { unsigned watermark[WATERMARK_MAX]; - struct closure alloc; - struct workqueue_struct *alloc_workqueue; + struct task_struct *alloc_thread; struct closure prio; struct prio_set *disk_buckets; @@ -703,9 +702,6 @@ struct cache_set { /* For the btree cache */ struct shrinker shrink; - /* For the allocator itself */ - wait_queue_head_t alloc_wait; - /* For the btree cache and anything allocation related */ struct mutex bucket_lock; @@ -1173,6 +1169,15 @@ static inline uint8_t bucket_disk_gen(struct bucket *b) static struct kobj_attribute ksysfs_##n = \ __ATTR(n, S_IWUSR|S_IRUSR, show, store) +static inline void wake_up_allocators(struct cache_set *c) +{ + struct cache *ca; + unsigned i; + + for_each_cache(ca, c, i) + wake_up_process(ca->alloc_thread); +} + /* Forward declarations */ void bch_writeback_queue(struct cached_dev *); @@ -1193,7 +1198,6 @@ void bch_submit_bbio(struct bio *, struct cache_set *, struct bkey *, unsigned); uint8_t bch_inc_gen(struct cache *, struct bucket *); void bch_rescale_priorities(struct cache_set *, int); bool bch_bucket_add_unused(struct cache *, struct bucket *); -void bch_allocator_thread(struct closure *); long bch_bucket_alloc(struct cache *, unsigned, struct closure *); void bch_bucket_free(struct cache_set *, struct bkey *); @@ -1244,6 +1248,7 @@ int bch_btree_cache_alloc(struct cache_set *); void bch_cached_dev_writeback_init(struct cached_dev *); void bch_moving_init_cache_set(struct cache_set *); +int bch_cache_allocator_start(struct cache *ca); void bch_cache_allocator_exit(struct cache *ca); int bch_cache_allocator_init(struct cache *ca); diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 7a5658f04e62..45b88fbffbe0 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -273,7 +273,7 @@ static void btree_complete_write(struct btree *b, struct btree_write *w) { if (w->prio_blocked && !atomic_sub_return(w->prio_blocked, &b->c->prio_blocked)) - wake_up(&b->c->alloc_wait); + wake_up_allocators(b->c); if (w->journal) { atomic_dec_bug(w->journal); @@ -984,7 +984,7 @@ static void btree_node_free(struct btree *b, struct btree_op *op) if (b->prio_blocked && !atomic_sub_return(b->prio_blocked, &b->c->prio_blocked)) - wake_up(&b->c->alloc_wait); + wake_up_allocators(b->c); b->prio_blocked = 0; @@ -1547,7 +1547,7 @@ static void bch_btree_gc(struct closure *cl) blktrace_msg_all(c, "Finished gc"); trace_bcache_gc_end(c->sb.set_uuid); - wake_up(&c->alloc_wait); + wake_up_allocators(c); continue_at(cl, bch_moving_gc, bch_gc_wq); } diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 3de5626919ef..aaeda235fc75 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1282,7 +1282,7 @@ static void cache_set_flush(struct closure *cl) /* Shut down allocator threads */ set_bit(CACHE_SET_STOPPING_2, &c->flags); - wake_up(&c->alloc_wait); + wake_up_allocators(c); bch_cache_accounting_destroy(&c->accounting); @@ -1373,7 +1373,6 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) c->btree_pages = max_t(int, c->btree_pages / 4, BTREE_MAX_PAGES); - init_waitqueue_head(&c->alloc_wait); mutex_init(&c->bucket_lock); mutex_init(&c->fill_lock); mutex_init(&c->sort_lock); @@ -1496,9 +1495,10 @@ static void run_cache_set(struct cache_set *c) */ bch_journal_next(&c->journal); + err = "error starting allocator thread"; for_each_cache(ca, c, i) - closure_call(&ca->alloc, bch_allocator_thread, - system_wq, &c->cl); + if (bch_cache_allocator_start(ca)) + goto err; /* * First place it's safe to allocate: btree_check() and @@ -1531,17 +1531,16 @@ static void run_cache_set(struct cache_set *c) bch_btree_gc_finish(c); + err = "error starting allocator thread"; for_each_cache(ca, c, i) - closure_call(&ca->alloc, bch_allocator_thread, - ca->alloc_workqueue, &c->cl); + if (bch_cache_allocator_start(ca)) + goto err; mutex_lock(&c->bucket_lock); for_each_cache(ca, c, i) bch_prio_write(ca); mutex_unlock(&c->bucket_lock); - wake_up(&c->alloc_wait); - err = "cannot allocate new UUID bucket"; if (__uuid_write(c)) goto err_unlock_gc; @@ -1673,9 +1672,6 @@ void bch_cache_release(struct kobject *kobj) bio_split_pool_free(&ca->bio_split_hook); - if (ca->alloc_workqueue) - destroy_workqueue(ca->alloc_workqueue); - free_pages((unsigned long) ca->disk_buckets, ilog2(bucket_pages(ca))); kfree(ca->prio_buckets); vfree(ca->buckets); @@ -1723,7 +1719,6 @@ static int cache_alloc(struct cache_sb *sb, struct cache *ca) !(ca->prio_buckets = kzalloc(sizeof(uint64_t) * prio_buckets(ca) * 2, GFP_KERNEL)) || !(ca->disk_buckets = alloc_bucket_pages(GFP_KERNEL, ca)) || - !(ca->alloc_workqueue = alloc_workqueue("bch_allocator", 0, 1)) || bio_split_pool_init(&ca->bio_split_hook)) return -ENOMEM; -- cgit v1.2.3 From 5794351146199b9ac67a5ab1beab82be8bfd7b5d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 25 Apr 2013 13:58:35 -0700 Subject: bcache: Refactor btree io The most significant change is that btree reads are now done synchronously, instead of asynchronously and doing the post read stuff from a workqueue. This was originally done because we can't block on IO under generic_make_request(). But - we already have a mechanism to punt cache lookups to workqueue if needed, so if we just use that we don't have to deal with the complexity of doing things asynchronously. The main benefit is this makes the locking situation saner; we can hold our write lock on the btree node until we're finished reading it, and we don't need that btree_node_read_done() flag anymore. Also, for writes, btree_write() was broken out into btree_node_write() and btree_leaf_dirty() - the old code with the boolean argument was dumb and confusing. The prio_blocked mechanism was improved a bit too, now the only counter is in struct btree_write, we don't mess with transfering a count from struct btree anymore. This required changing garbage collection to block prios at the start and unblock when it finishes, which is cleaner than what it was doing anyways (the old code had mostly the same effect, but was doing it in a convoluted way) And the btree iter btree_node_read_done() uses was converted to a real mempool. Signed-off-by: Kent Overstreet --- drivers/md/bcache/bcache.h | 5 +- drivers/md/bcache/btree.c | 274 ++++++++++++++++++++------------------------ drivers/md/bcache/btree.h | 19 +-- drivers/md/bcache/debug.c | 4 +- drivers/md/bcache/journal.c | 2 +- drivers/md/bcache/super.c | 12 +- 6 files changed, 140 insertions(+), 176 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 166c8ddc0be4..ad4957b52f10 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -819,10 +819,9 @@ struct cache_set { /* * A btree node on disk could have too many bsets for an iterator to fit - * on the stack - this is a single element mempool for btree_read_work() + * on the stack - have to dynamically allocate them */ - struct mutex fill_lock; - struct btree_iter *fill_iter; + mempool_t *fill_iter; /* * btree_sort() is a merge sort and requires temporary space - single diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 45b88fbffbe0..aaec186f7ba6 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -134,44 +134,17 @@ static uint64_t btree_csum_set(struct btree *b, struct bset *i) return crc ^ 0xffffffffffffffffULL; } -static void btree_bio_endio(struct bio *bio, int error) +void bch_btree_node_read_done(struct btree *b) { - struct closure *cl = bio->bi_private; - struct btree *b = container_of(cl, struct btree, io.cl); - - if (error) - set_btree_node_io_error(b); - - bch_bbio_count_io_errors(b->c, bio, error, (bio->bi_rw & WRITE) - ? "writing btree" : "reading btree"); - closure_put(cl); -} - -static void btree_bio_init(struct btree *b) -{ - BUG_ON(b->bio); - b->bio = bch_bbio_alloc(b->c); - - b->bio->bi_end_io = btree_bio_endio; - b->bio->bi_private = &b->io.cl; -} - -void bch_btree_read_done(struct closure *cl) -{ - struct btree *b = container_of(cl, struct btree, io.cl); - struct bset *i = b->sets[0].data; - struct btree_iter *iter = b->c->fill_iter; const char *err = "bad btree header"; - BUG_ON(b->nsets || b->written); - - bch_bbio_free(b->bio, b->c); - b->bio = NULL; + struct bset *i = b->sets[0].data; + struct btree_iter *iter; - mutex_lock(&b->c->fill_lock); + iter = mempool_alloc(b->c->fill_iter, GFP_NOWAIT); + iter->size = b->c->sb.bucket_size / b->c->sb.block_size; iter->used = 0; - if (btree_node_io_error(b) || - !i->seq) + if (!i->seq) goto err; for (; @@ -228,17 +201,8 @@ void bch_btree_read_done(struct closure *cl) if (b->written < btree_blocks(b)) bch_bset_init_next(b); out: - - mutex_unlock(&b->c->fill_lock); - - spin_lock(&b->c->btree_read_time_lock); - bch_time_stats_update(&b->c->btree_read_time, b->io_start_time); - spin_unlock(&b->c->btree_read_time_lock); - - smp_wmb(); /* read_done is our write lock */ - set_btree_node_read_done(b); - - closure_return(cl); + mempool_free(iter, b->c->fill_iter); + return; err: set_btree_node_io_error(b); bch_cache_set_error(b->c, "%s at bucket %zu, block %zu, %u keys", @@ -247,26 +211,51 @@ err: goto out; } -void bch_btree_read(struct btree *b) +static void btree_node_read_endio(struct bio *bio, int error) { - BUG_ON(b->nsets || b->written); + struct closure *cl = bio->bi_private; + closure_put(cl); +} - if (!closure_trylock(&b->io.cl, &b->c->cl)) - BUG(); +void bch_btree_node_read(struct btree *b) +{ + uint64_t start_time = local_clock(); + struct closure cl; + struct bio *bio; - b->io_start_time = local_clock(); + closure_init_stack(&cl); + pr_debug("%s", pbtree(b)); - btree_bio_init(b); - b->bio->bi_rw = REQ_META|READ_SYNC; - b->bio->bi_size = KEY_SIZE(&b->key) << 9; + bio = bch_bbio_alloc(b->c); + bio->bi_rw = REQ_META|READ_SYNC; + bio->bi_size = KEY_SIZE(&b->key) << 9; + bio->bi_end_io = btree_node_read_endio; + bio->bi_private = &cl; - bch_bio_map(b->bio, b->sets[0].data); + bch_bio_map(bio, b->sets[0].data); - pr_debug("%s", pbtree(b)); - trace_bcache_btree_read(b->bio); - bch_submit_bbio(b->bio, b->c, &b->key, 0); + trace_bcache_btree_read(bio); + bch_submit_bbio(bio, b->c, &b->key, 0); + closure_sync(&cl); - continue_at(&b->io.cl, bch_btree_read_done, system_wq); + if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) + set_btree_node_io_error(b); + + bch_bbio_free(bio, b->c); + + if (btree_node_io_error(b)) + goto err; + + bch_btree_node_read_done(b); + + spin_lock(&b->c->btree_read_time_lock); + bch_time_stats_update(&b->c->btree_read_time, start_time); + spin_unlock(&b->c->btree_read_time_lock); + + return; +err: + bch_cache_set_error(b->c, "io error reading bucket %lu", + PTR_BUCKET_NR(b->c, &b->key, 0)); } static void btree_complete_write(struct btree *b, struct btree_write *w) @@ -280,15 +269,11 @@ static void btree_complete_write(struct btree *b, struct btree_write *w) __closure_wake_up(&b->c->journal.wait); } - if (w->owner) - closure_put(w->owner); - w->prio_blocked = 0; w->journal = NULL; - w->owner = NULL; } -static void __btree_write_done(struct closure *cl) +static void __btree_node_write_done(struct closure *cl) { struct btree *b = container_of(cl, struct btree, io.cl); struct btree_write *w = btree_prev_write(b); @@ -304,7 +289,7 @@ static void __btree_write_done(struct closure *cl) closure_return(cl); } -static void btree_write_done(struct closure *cl) +static void btree_node_write_done(struct closure *cl) { struct btree *b = container_of(cl, struct btree, io.cl); struct bio_vec *bv; @@ -313,10 +298,22 @@ static void btree_write_done(struct closure *cl) __bio_for_each_segment(bv, b->bio, n, 0) __free_page(bv->bv_page); - __btree_write_done(cl); + __btree_node_write_done(cl); } -static void do_btree_write(struct btree *b) +static void btree_node_write_endio(struct bio *bio, int error) +{ + struct closure *cl = bio->bi_private; + struct btree *b = container_of(cl, struct btree, io.cl); + + if (error) + set_btree_node_io_error(b); + + bch_bbio_count_io_errors(b->c, bio, error, "writing btree"); + closure_put(cl); +} + +static void do_btree_node_write(struct btree *b) { struct closure *cl = &b->io.cl; struct bset *i = b->sets[b->nsets].data; @@ -325,7 +322,11 @@ static void do_btree_write(struct btree *b) i->version = BCACHE_BSET_VERSION; i->csum = btree_csum_set(b, i); - btree_bio_init(b); + BUG_ON(b->bio); + b->bio = bch_bbio_alloc(b->c); + + b->bio->bi_end_io = btree_node_write_endio; + b->bio->bi_private = &b->io.cl; b->bio->bi_rw = REQ_META|WRITE_SYNC; b->bio->bi_size = set_blocks(i, b->c) * block_bytes(b->c); bch_bio_map(b->bio, i); @@ -345,7 +346,7 @@ static void do_btree_write(struct btree *b) trace_bcache_btree_write(b->bio); bch_submit_bbio(b->bio, b->c, &k.key, 0); - continue_at(cl, btree_write_done, NULL); + continue_at(cl, btree_node_write_done, NULL); } else { b->bio->bi_vcnt = 0; bch_bio_map(b->bio, i); @@ -354,26 +355,30 @@ static void do_btree_write(struct btree *b) bch_submit_bbio(b->bio, b->c, &k.key, 0); closure_sync(cl); - __btree_write_done(cl); + __btree_node_write_done(cl); } } -static void __btree_write(struct btree *b) +void bch_btree_node_write(struct btree *b, struct closure *parent) { struct bset *i = b->sets[b->nsets].data; BUG_ON(current->bio_list); + BUG_ON(b->written >= btree_blocks(b)); + BUG_ON(b->written && !i->keys); + BUG_ON(b->sets->data->seq != i->seq); - closure_lock(&b->io, &b->c->cl); cancel_delayed_work(&b->work); + /* If caller isn't waiting for write, parent refcount is cache set */ + closure_lock(&b->io, parent ?: &b->c->cl); + clear_bit(BTREE_NODE_dirty, &b->flags); change_bit(BTREE_NODE_write_idx, &b->flags); bch_check_key_order(b, i); - BUG_ON(b->written && !i->keys); - do_btree_write(b); + do_btree_node_write(b); pr_debug("%s block %i keys %i", pbtree(b), b->written, i->keys); @@ -387,37 +392,31 @@ static void __btree_write(struct btree *b) bch_bset_init_next(b); } -static void btree_write_work(struct work_struct *w) +static void btree_node_write_work(struct work_struct *w) { struct btree *b = container_of(to_delayed_work(w), struct btree, work); - down_write(&b->lock); + rw_lock(true, b, b->level); if (btree_node_dirty(b)) - __btree_write(b); - up_write(&b->lock); + bch_btree_node_write(b, NULL); + rw_unlock(true, b); } -void bch_btree_write(struct btree *b, bool now, struct btree_op *op) +static void bch_btree_leaf_dirty(struct btree *b, struct btree_op *op) { struct bset *i = b->sets[b->nsets].data; struct btree_write *w = btree_current_write(b); - BUG_ON(b->written && - (b->written >= btree_blocks(b) || - i->seq != b->sets[0].data->seq || - !i->keys)); + BUG_ON(!b->written); + BUG_ON(!i->keys); - if (!btree_node_dirty(b)) { - set_btree_node_dirty(b); - queue_delayed_work(btree_io_wq, &b->work, - msecs_to_jiffies(30000)); - } + if (!btree_node_dirty(b)) + queue_delayed_work(btree_io_wq, &b->work, 30 * HZ); - w->prio_blocked += b->prio_blocked; - b->prio_blocked = 0; + set_btree_node_dirty(b); - if (op && op->journal && !b->level) { + if (op && op->journal) { if (w->journal && journal_pin_cmp(b->c, w, op)) { atomic_dec_bug(w->journal); @@ -430,23 +429,10 @@ void bch_btree_write(struct btree *b, bool now, struct btree_op *op) } } - if (current->bio_list) - return; - /* Force write if set is too big */ - if (now || - b->level || - set_bytes(i) > PAGE_SIZE - 48) { - if (op && now) { - /* Must wait on multiple writes */ - BUG_ON(w->owner); - w->owner = &op->cl; - closure_get(&op->cl); - } - - __btree_write(b); - } - BUG_ON(!b->written); + if (set_bytes(i) > PAGE_SIZE - 48 && + !current->bio_list) + bch_btree_node_write(b, NULL); } /* @@ -559,7 +545,7 @@ static struct btree *mca_bucket_alloc(struct cache_set *c, init_rwsem(&b->lock); lockdep_set_novalidate_class(&b->lock); INIT_LIST_HEAD(&b->list); - INIT_DELAYED_WORK(&b->work, btree_write_work); + INIT_DELAYED_WORK(&b->work, btree_node_write_work); b->c = c; closure_init_unlocked(&b->io); @@ -582,7 +568,7 @@ static int mca_reap(struct btree *b, struct closure *cl, unsigned min_order) BUG_ON(btree_node_dirty(b) && !b->sets[0].data); if (cl && btree_node_dirty(b)) - bch_btree_write(b, true, NULL); + bch_btree_node_write(b, NULL); if (cl) closure_wait_event_async(&b->io.wait, cl, @@ -905,6 +891,9 @@ retry: b = mca_find(c, k); if (!b) { + if (current->bio_list) + return ERR_PTR(-EAGAIN); + mutex_lock(&c->bucket_lock); b = mca_alloc(c, k, level, &op->cl); mutex_unlock(&c->bucket_lock); @@ -914,7 +903,7 @@ retry: if (IS_ERR(b)) return b; - bch_btree_read(b); + bch_btree_node_read(b); if (!write) downgrade_write(&b->lock); @@ -937,15 +926,12 @@ retry: for (; i <= b->nsets; i++) prefetch(b->sets[i].data); - if (!closure_wait_event(&b->io.wait, &op->cl, - btree_node_read_done(b))) { - rw_unlock(write, b); - b = ERR_PTR(-EAGAIN); - } else if (btree_node_io_error(b)) { + if (btree_node_io_error(b)) { rw_unlock(write, b); - b = ERR_PTR(-EIO); - } else - BUG_ON(!b->written); + return ERR_PTR(-EIO); + } + + BUG_ON(!b->written); return b; } @@ -959,7 +945,7 @@ static void btree_node_prefetch(struct cache_set *c, struct bkey *k, int level) mutex_unlock(&c->bucket_lock); if (!IS_ERR_OR_NULL(b)) { - bch_btree_read(b); + bch_btree_node_read(b); rw_unlock(true, b); } } @@ -982,12 +968,6 @@ static void btree_node_free(struct btree *b, struct btree_op *op) btree_complete_write(b, btree_current_write(b)); clear_bit(BTREE_NODE_dirty, &b->flags); - if (b->prio_blocked && - !atomic_sub_return(b->prio_blocked, &b->c->prio_blocked)) - wake_up_allocators(b->c); - - b->prio_blocked = 0; - cancel_delayed_work(&b->work); mutex_lock(&b->c->bucket_lock); @@ -1028,7 +1008,6 @@ retry: goto retry; } - set_btree_node_read_done(b); b->accessed = 1; bch_bset_init_next(b); @@ -1166,14 +1145,11 @@ static struct btree *btree_gc_alloc(struct btree *b, struct bkey *k, if (!IS_ERR_OR_NULL(n)) { swap(b, n); + __bkey_put(b->c, &b->key); memcpy(k->ptr, b->key.ptr, sizeof(uint64_t) * KEY_PTRS(&b->key)); - __bkey_put(b->c, &b->key); - atomic_inc(&b->c->prio_blocked); - b->prio_blocked++; - btree_node_free(n, op); up_write(&n->lock); } @@ -1293,14 +1269,9 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op, void write(struct btree *r) { if (!r->written) - bch_btree_write(r, true, op); - else if (btree_node_dirty(r)) { - BUG_ON(btree_current_write(r)->owner); - btree_current_write(r)->owner = writes; - closure_get(writes); - - bch_btree_write(r, true, NULL); - } + bch_btree_node_write(r, &op->cl); + else if (btree_node_dirty(r)) + bch_btree_node_write(r, writes); up_write(&r->lock); } @@ -1386,9 +1357,7 @@ static int bch_btree_gc_root(struct btree *b, struct btree_op *op, ret = btree_gc_recurse(b, op, writes, gc); if (!b->written || btree_node_dirty(b)) { - atomic_inc(&b->c->prio_blocked); - b->prio_blocked++; - bch_btree_write(b, true, n ? op : NULL); + bch_btree_node_write(b, n ? &op->cl : NULL); } if (!IS_ERR_OR_NULL(n)) { @@ -1508,8 +1477,8 @@ static void bch_btree_gc(struct closure *cl) struct gc_stat stats; struct closure writes; struct btree_op op; - uint64_t start_time = local_clock(); + trace_bcache_gc_start(c->sb.set_uuid); blktrace_msg_all(c, "Starting gc"); @@ -1520,6 +1489,8 @@ static void bch_btree_gc(struct closure *cl) btree_gc_start(c); + atomic_inc(&c->prio_blocked); + ret = btree_root(gc_root, c, &op, &writes, &stats); closure_sync(&op.cl); closure_sync(&writes); @@ -1537,6 +1508,9 @@ static void bch_btree_gc(struct closure *cl) available = bch_btree_gc_finish(c); + atomic_dec(&c->prio_blocked); + wake_up_allocators(c); + bch_time_stats_update(&c->btree_gc_time, start_time); stats.key_bytes *= sizeof(uint64_t); @@ -1544,10 +1518,9 @@ static void bch_btree_gc(struct closure *cl) stats.data <<= 9; stats.in_use = (c->nbuckets - available) * 100 / c->nbuckets; memcpy(&c->gc_stats, &stats, sizeof(struct gc_stat)); - blktrace_msg_all(c, "Finished gc"); + blktrace_msg_all(c, "Finished gc"); trace_bcache_gc_end(c->sb.set_uuid); - wake_up_allocators(c); continue_at(cl, bch_moving_gc, bch_gc_wq); } @@ -1857,7 +1830,7 @@ merged: op_type(op), pbtree(b), pkey(k)); if (b->level && !KEY_OFFSET(k)) - b->prio_blocked++; + btree_current_write(b)->prio_blocked++; pr_debug("%s for %s at %s: %s", status, op_type(op), pbtree(b), pkey(k)); @@ -1907,7 +1880,6 @@ bool bch_btree_insert_check_key(struct btree *b, struct btree_op *op, BUG_ON(op->type != BTREE_INSERT); BUG_ON(!btree_insert_key(b, op, &tmp.k)); - bch_btree_write(b, false, NULL); ret = true; out: downgrade_write(&b->lock); @@ -1967,18 +1939,18 @@ static int btree_split(struct btree *b, struct btree_op *op) bkey_copy_key(&n2->key, &b->key); bch_keylist_add(&op->keys, &n2->key); - bch_btree_write(n2, true, op); + bch_btree_node_write(n2, &op->cl); rw_unlock(true, n2); } else bch_btree_insert_keys(n1, op); bch_keylist_add(&op->keys, &n1->key); - bch_btree_write(n1, true, op); + bch_btree_node_write(n1, &op->cl); if (n3) { bkey_copy_key(&n3->key, &MAX_KEY); bch_btree_insert_keys(n3, op); - bch_btree_write(n3, true, op); + bch_btree_node_write(n3, &op->cl); closure_sync(&op->cl); bch_btree_set_root(n3); @@ -2082,8 +2054,12 @@ static int bch_btree_insert_recurse(struct btree *b, struct btree_op *op, BUG_ON(write_block(b) != b->sets[b->nsets].data); - if (bch_btree_insert_keys(b, op)) - bch_btree_write(b, false, op); + if (bch_btree_insert_keys(b, op)) { + if (!b->level) + bch_btree_leaf_dirty(b, op); + else + bch_btree_node_write(b, &op->cl); + } } return 0; diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index af4a7092a28c..809bd77847a2 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -102,7 +102,6 @@ #include "debug.h" struct btree_write { - struct closure *owner; atomic_t *journal; /* If btree_split() frees a btree node, it writes a new pointer to that @@ -142,16 +141,12 @@ struct btree { */ struct bset_tree sets[MAX_BSETS]; - /* Used to refcount bio splits, also protects b->bio */ + /* For outstanding btree writes, used as a lock - protects write_idx */ struct closure_with_waitlist io; - /* Gets transferred to w->prio_blocked - see the comment there */ - int prio_blocked; - struct list_head list; struct delayed_work work; - uint64_t io_start_time; struct btree_write writes[2]; struct bio *bio; }; @@ -164,13 +159,11 @@ static inline void set_btree_node_ ## flag(struct btree *b) \ { set_bit(BTREE_NODE_ ## flag, &b->flags); } \ enum btree_flags { - BTREE_NODE_read_done, BTREE_NODE_io_error, BTREE_NODE_dirty, BTREE_NODE_write_idx, }; -BTREE_FLAG(read_done); BTREE_FLAG(io_error); BTREE_FLAG(dirty); BTREE_FLAG(write_idx); @@ -293,9 +286,7 @@ static inline void rw_unlock(bool w, struct btree *b) #ifdef CONFIG_BCACHE_EDEBUG unsigned i; - if (w && - b->key.ptr[0] && - btree_node_read_done(b)) + if (w && b->key.ptr[0]) for (i = 0; i <= b->nsets; i++) bch_check_key_order(b, b->sets[i].data); #endif @@ -370,9 +361,9 @@ static inline bool should_split(struct btree *b) > btree_blocks(b)); } -void bch_btree_read_done(struct closure *); -void bch_btree_read(struct btree *); -void bch_btree_write(struct btree *b, bool now, struct btree_op *op); +void bch_btree_node_read(struct btree *); +void bch_btree_node_read_done(struct btree *); +void bch_btree_node_write(struct btree *, struct closure *); void bch_cannibalize_unlock(struct cache_set *, struct closure *); void bch_btree_set_root(struct btree *); diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c index 89fd5204924e..ae6096c6845d 100644 --- a/drivers/md/bcache/debug.c +++ b/drivers/md/bcache/debug.c @@ -144,7 +144,7 @@ void bch_btree_verify(struct btree *b, struct bset *new) v->written = 0; v->level = b->level; - bch_btree_read(v); + bch_btree_node_read(v); closure_wait_event(&v->io.wait, &cl, atomic_read(&b->io.cl.remaining) == -1); @@ -512,7 +512,7 @@ static ssize_t btree_fuzz(struct kobject *k, struct kobj_attribute *a, bch_btree_sort(b); fill->written = 0; - bch_btree_read_done(&fill->io.cl); + bch_btree_node_read_done(fill); if (b->sets[0].data->keys != fill->sets[0].data->keys || memcmp(b->sets[0].data->start, diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 8c8dfdcd9d4c..970d819d4350 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -384,7 +384,7 @@ out: return; found: if (btree_node_dirty(best)) - bch_btree_write(best, true, NULL); + bch_btree_node_write(best, NULL); rw_unlock(true, best); } diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index aaeda235fc75..e53f89988b08 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1255,9 +1255,10 @@ static void cache_set_free(struct closure *cl) free_pages((unsigned long) c->uuids, ilog2(bucket_pages(c))); free_pages((unsigned long) c->sort, ilog2(bucket_pages(c))); - kfree(c->fill_iter); if (c->bio_split) bioset_free(c->bio_split); + if (c->fill_iter) + mempool_destroy(c->fill_iter); if (c->bio_meta) mempool_destroy(c->bio_meta); if (c->search) @@ -1295,7 +1296,7 @@ static void cache_set_flush(struct closure *cl) /* Should skip this if we're unregistering because of an error */ list_for_each_entry(b, &c->btree_cache, list) if (btree_node_dirty(b)) - bch_btree_write(b, true, NULL); + bch_btree_node_write(b, NULL); closure_return(cl); } @@ -1374,7 +1375,6 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) BTREE_MAX_PAGES); mutex_init(&c->bucket_lock); - mutex_init(&c->fill_lock); mutex_init(&c->sort_lock); spin_lock_init(&c->sort_time_lock); closure_init_unlocked(&c->sb_write); @@ -1400,8 +1400,8 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) !(c->bio_meta = mempool_create_kmalloc_pool(2, sizeof(struct bbio) + sizeof(struct bio_vec) * bucket_pages(c))) || + !(c->fill_iter = mempool_create_kmalloc_pool(1, iter_size)) || !(c->bio_split = bioset_create(4, offsetof(struct bbio, bio))) || - !(c->fill_iter = kmalloc(iter_size, GFP_KERNEL)) || !(c->sort = alloc_bucket_pages(GFP_KERNEL, c)) || !(c->uuids = alloc_bucket_pages(GFP_KERNEL, c)) || bch_journal_alloc(c) || @@ -1409,8 +1409,6 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) bch_open_buckets_alloc(c)) goto err; - c->fill_iter->size = sb->bucket_size / sb->block_size; - c->congested_read_threshold_us = 2000; c->congested_write_threshold_us = 20000; c->error_limit = 8 << IO_ERROR_SHIFT; @@ -1551,7 +1549,7 @@ static void run_cache_set(struct cache_set *c) goto err_unlock_gc; bkey_copy_key(&c->root->key, &MAX_KEY); - bch_btree_write(c->root, true, &op); + bch_btree_node_write(c->root, &op.cl); bch_btree_set_root(c->root); rw_unlock(true, c->root); -- cgit v1.2.3 From c37511b863f36c1cc6e18440717fd4cc0e881b8a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 26 Apr 2013 15:39:55 -0700 Subject: bcache: Fix/revamp tracepoints The tracepoints were reworked to be more sensible, and fixed a null pointer deref in one of the tracepoints. Converted some of the pr_debug()s to tracepoints - this is partly a performance optimization; it used to be that with DEBUG or CONFIG_DYNAMIC_DEBUG pr_debug() was an empty macro; but at some point it was changed to an empty inline function. Some of the pr_debug() statements had rather expensive function calls as part of the arguments, so this code was getting run unnecessarily even on non debug kernels - in some fast paths, too. Signed-off-by: Kent Overstreet --- drivers/md/bcache/alloc.c | 10 +- drivers/md/bcache/bcache.h | 20 --- drivers/md/bcache/bset.h | 4 + drivers/md/bcache/btree.c | 47 +++--- drivers/md/bcache/io.c | 2 + drivers/md/bcache/journal.c | 14 +- drivers/md/bcache/movinggc.c | 12 +- drivers/md/bcache/request.c | 65 ++++---- drivers/md/bcache/request.h | 2 +- drivers/md/bcache/super.c | 2 +- drivers/md/bcache/sysfs.c | 1 + drivers/md/bcache/trace.c | 45 ++++-- drivers/md/bcache/util.h | 2 - drivers/md/bcache/writeback.c | 10 +- include/trace/events/bcache.h | 352 +++++++++++++++++++++++++++++------------- 15 files changed, 367 insertions(+), 221 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index 38428f46ea74..b54b73b9b2b7 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -65,6 +65,7 @@ #include #include +#include #define MAX_IN_FLIGHT_DISCARDS 8U @@ -351,10 +352,7 @@ static void invalidate_buckets(struct cache *ca) break; } - pr_debug("free %zu/%zu free_inc %zu/%zu unused %zu/%zu", - fifo_used(&ca->free), ca->free.size, - fifo_used(&ca->free_inc), ca->free_inc.size, - fifo_used(&ca->unused), ca->unused.size); + trace_bcache_alloc_invalidate(ca); } #define allocator_wait(ca, cond) \ @@ -473,9 +471,7 @@ again: return r; } - pr_debug("alloc failure: blocked %i free %zu free_inc %zu unused %zu", - atomic_read(&ca->set->prio_blocked), fifo_used(&ca->free), - fifo_used(&ca->free_inc), fifo_used(&ca->unused)); + trace_bcache_alloc_fail(ca); if (cl) { closure_wait(&ca->set->bucket_wait, cl); diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index ad4957b52f10..59c15e09e4dd 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -178,7 +178,6 @@ #define pr_fmt(fmt) "bcache: %s() " fmt "\n", __func__ #include -#include #include #include #include @@ -901,8 +900,6 @@ static inline unsigned local_clock_us(void) return local_clock() >> 10; } -#define MAX_BSETS 4U - #define BTREE_PRIO USHRT_MAX #define INITIAL_PRIO 32768 @@ -1107,23 +1104,6 @@ static inline void __bkey_put(struct cache_set *c, struct bkey *k) atomic_dec_bug(&PTR_BUCKET(c, k, i)->pin); } -/* Blktrace macros */ - -#define blktrace_msg(c, fmt, ...) \ -do { \ - struct request_queue *q = bdev_get_queue(c->bdev); \ - if (q) \ - blk_add_trace_msg(q, fmt, ##__VA_ARGS__); \ -} while (0) - -#define blktrace_msg_all(s, fmt, ...) \ -do { \ - struct cache *_c; \ - unsigned i; \ - for_each_cache(_c, (s), i) \ - blktrace_msg(_c, fmt, ##__VA_ARGS__); \ -} while (0) - static inline void cached_dev_put(struct cached_dev *dc) { if (atomic_dec_and_test(&dc->count)) diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h index 57a9cff41546..ae115a253d73 100644 --- a/drivers/md/bcache/bset.h +++ b/drivers/md/bcache/bset.h @@ -1,6 +1,8 @@ #ifndef _BCACHE_BSET_H #define _BCACHE_BSET_H +#include + /* * BKEYS: * @@ -142,6 +144,8 @@ /* Btree key comparison/iteration */ +#define MAX_BSETS 4U + struct btree_iter { size_t size, used; struct btree_iter_set { diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index aaec186f7ba6..218d486259a3 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -223,8 +223,9 @@ void bch_btree_node_read(struct btree *b) struct closure cl; struct bio *bio; + trace_bcache_btree_read(b); + closure_init_stack(&cl); - pr_debug("%s", pbtree(b)); bio = bch_bbio_alloc(b->c); bio->bi_rw = REQ_META|READ_SYNC; @@ -234,7 +235,6 @@ void bch_btree_node_read(struct btree *b) bch_bio_map(bio, b->sets[0].data); - trace_bcache_btree_read(bio); bch_submit_bbio(bio, b->c, &b->key, 0); closure_sync(&cl); @@ -343,7 +343,6 @@ static void do_btree_node_write(struct btree *b) memcpy(page_address(bv->bv_page), base + j * PAGE_SIZE, PAGE_SIZE); - trace_bcache_btree_write(b->bio); bch_submit_bbio(b->bio, b->c, &k.key, 0); continue_at(cl, btree_node_write_done, NULL); @@ -351,7 +350,6 @@ static void do_btree_node_write(struct btree *b) b->bio->bi_vcnt = 0; bch_bio_map(b->bio, i); - trace_bcache_btree_write(b->bio); bch_submit_bbio(b->bio, b->c, &k.key, 0); closure_sync(cl); @@ -363,10 +361,13 @@ void bch_btree_node_write(struct btree *b, struct closure *parent) { struct bset *i = b->sets[b->nsets].data; + trace_bcache_btree_write(b); + BUG_ON(current->bio_list); BUG_ON(b->written >= btree_blocks(b)); BUG_ON(b->written && !i->keys); BUG_ON(b->sets->data->seq != i->seq); + bch_check_key_order(b, i); cancel_delayed_work(&b->work); @@ -376,12 +377,8 @@ void bch_btree_node_write(struct btree *b, struct closure *parent) clear_bit(BTREE_NODE_dirty, &b->flags); change_bit(BTREE_NODE_write_idx, &b->flags); - bch_check_key_order(b, i); - do_btree_node_write(b); - pr_debug("%s block %i keys %i", pbtree(b), b->written, i->keys); - b->written += set_blocks(i, b->c); atomic_long_add(set_blocks(i, b->c) * b->c->sb.block_size, &PTR_CACHE(b->c, &b->key, 0)->btree_sectors_written); @@ -752,6 +749,8 @@ static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k, int ret = -ENOMEM; struct btree *i; + trace_bcache_btree_cache_cannibalize(c); + if (!cl) return ERR_PTR(-ENOMEM); @@ -770,7 +769,6 @@ static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k, return ERR_PTR(-EAGAIN); } - /* XXX: tracepoint */ c->try_harder = cl; c->try_harder_start = local_clock(); retry: @@ -956,13 +954,14 @@ static void btree_node_free(struct btree *b, struct btree_op *op) { unsigned i; + trace_bcache_btree_node_free(b); + /* * The BUG_ON() in btree_node_get() implies that we must have a write * lock on parent to free or even invalidate a node */ BUG_ON(op->lock <= b->level); BUG_ON(b == b->c->root); - pr_debug("bucket %s", pbtree(b)); if (btree_node_dirty(b)) btree_complete_write(b, btree_current_write(b)); @@ -1012,12 +1011,16 @@ retry: bch_bset_init_next(b); mutex_unlock(&c->bucket_lock); + + trace_bcache_btree_node_alloc(b); return b; err_free: bch_bucket_free(c, &k.key); __bkey_put(c, &k.key); err: mutex_unlock(&c->bucket_lock); + + trace_bcache_btree_node_alloc_fail(b); return b; } @@ -1254,7 +1257,7 @@ static void btree_gc_coalesce(struct btree *b, struct btree_op *op, btree_node_free(r->b, op); up_write(&r->b->lock); - pr_debug("coalesced %u nodes", nodes); + trace_bcache_btree_gc_coalesce(nodes); gc->nodes--; nodes--; @@ -1479,8 +1482,7 @@ static void bch_btree_gc(struct closure *cl) struct btree_op op; uint64_t start_time = local_clock(); - trace_bcache_gc_start(c->sb.set_uuid); - blktrace_msg_all(c, "Starting gc"); + trace_bcache_gc_start(c); memset(&stats, 0, sizeof(struct gc_stat)); closure_init_stack(&writes); @@ -1496,9 +1498,7 @@ static void bch_btree_gc(struct closure *cl) closure_sync(&writes); if (ret) { - blktrace_msg_all(c, "Stopped gc"); pr_warn("gc failed!"); - continue_at(cl, bch_btree_gc, bch_gc_wq); } @@ -1519,8 +1519,7 @@ static void bch_btree_gc(struct closure *cl) stats.in_use = (c->nbuckets - available) * 100 / c->nbuckets; memcpy(&c->gc_stats, &stats, sizeof(struct gc_stat)); - blktrace_msg_all(c, "Finished gc"); - trace_bcache_gc_end(c->sb.set_uuid); + trace_bcache_gc_end(c); continue_at(cl, bch_moving_gc, bch_gc_wq); } @@ -1901,12 +1900,11 @@ static int btree_split(struct btree *b, struct btree_op *op) split = set_blocks(n1->sets[0].data, n1->c) > (btree_blocks(b) * 4) / 5; - pr_debug("%ssplitting at %s keys %i", split ? "" : "not ", - pbtree(b), n1->sets[0].data->keys); - if (split) { unsigned keys = 0; + trace_bcache_btree_node_split(b, n1->sets[0].data->keys); + n2 = bch_btree_node_alloc(b->c, b->level, &op->cl); if (IS_ERR(n2)) goto err_free1; @@ -1941,8 +1939,11 @@ static int btree_split(struct btree *b, struct btree_op *op) bch_keylist_add(&op->keys, &n2->key); bch_btree_node_write(n2, &op->cl); rw_unlock(true, n2); - } else + } else { + trace_bcache_btree_node_compact(b, n1->sets[0].data->keys); + bch_btree_insert_keys(n1, op); + } bch_keylist_add(&op->keys, &n1->key); bch_btree_node_write(n1, &op->cl); @@ -2117,6 +2118,8 @@ void bch_btree_set_root(struct btree *b) { unsigned i; + trace_bcache_btree_set_root(b); + BUG_ON(!b->written); for (i = 0; i < KEY_PTRS(&b->key); i++) @@ -2130,7 +2133,6 @@ void bch_btree_set_root(struct btree *b) __bkey_put(b->c, &b->key); bch_journal_meta(b->c, NULL); - pr_debug("%s for %pf", pbtree(b), __builtin_return_address(0)); } /* Cache lookup */ @@ -2216,7 +2218,6 @@ static int submit_partial_cache_hit(struct btree *b, struct btree_op *op, n->bi_end_io = bch_cache_read_endio; n->bi_private = &s->cl; - trace_bcache_cache_hit(n); __bch_submit_bbio(n, b->c); } diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c index d285cd49104c..0f6d69658b61 100644 --- a/drivers/md/bcache/io.c +++ b/drivers/md/bcache/io.c @@ -9,6 +9,8 @@ #include "bset.h" #include "debug.h" +#include + static void bch_bi_idx_hack_endio(struct bio *bio, int error) { struct bio *p = bio->bi_private; diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 970d819d4350..5ca22149b749 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -9,6 +9,8 @@ #include "debug.h" #include "request.h" +#include + /* * Journal replay/recovery: * @@ -300,7 +302,8 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list, for (k = i->j.start; k < end(&i->j); k = bkey_next(k)) { - pr_debug("%s", pkey(k)); + trace_bcache_journal_replay_key(k); + bkey_copy(op->keys.top, k); bch_keylist_push(&op->keys); @@ -712,7 +715,8 @@ void bch_journal(struct closure *cl) spin_lock(&c->journal.lock); if (journal_full(&c->journal)) { - /* XXX: tracepoint */ + trace_bcache_journal_full(c); + closure_wait(&c->journal.wait, cl); journal_reclaim(c); @@ -728,13 +732,15 @@ void bch_journal(struct closure *cl) if (b * c->sb.block_size > PAGE_SECTORS << JSET_BITS || b > c->journal.blocks_free) { - /* XXX: If we were inserting so many keys that they won't fit in + trace_bcache_journal_entry_full(c); + + /* + * XXX: If we were inserting so many keys that they won't fit in * an _empty_ journal write, we'll deadlock. For now, handle * this in bch_keylist_realloc() - but something to think about. */ BUG_ON(!w->data->keys); - /* XXX: tracepoint */ BUG_ON(!closure_wait(&w->wait, cl)); closure_flush(&c->journal.io); diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c index 8589512c972e..04f6b97ffda6 100644 --- a/drivers/md/bcache/movinggc.c +++ b/drivers/md/bcache/movinggc.c @@ -9,6 +9,8 @@ #include "debug.h" #include "request.h" +#include + struct moving_io { struct keybuf_key *w; struct search s; @@ -49,9 +51,8 @@ static void write_moving_finish(struct closure *cl) while (bv-- != bio->bi_io_vec) __free_page(bv->bv_page); - pr_debug("%s %s", io->s.op.insert_collision - ? "collision moving" : "moved", - pkey(&io->w->key)); + if (io->s.op.insert_collision) + trace_bcache_gc_copy_collision(&io->w->key); bch_keybuf_del(&io->s.op.c->moving_gc_keys, io->w); @@ -94,8 +95,6 @@ static void write_moving(struct closure *cl) struct moving_io *io = container_of(s, struct moving_io, s); if (!s->error) { - trace_bcache_write_moving(&io->bio.bio); - moving_init(io); io->bio.bio.bi_sector = KEY_START(&io->w->key); @@ -122,7 +121,6 @@ static void read_moving_submit(struct closure *cl) struct moving_io *io = container_of(s, struct moving_io, s); struct bio *bio = &io->bio.bio; - trace_bcache_read_moving(bio); bch_submit_bbio(bio, s->op.c, &io->w->key, 0); continue_at(cl, write_moving, bch_gc_wq); @@ -162,7 +160,7 @@ static void read_moving(struct closure *cl) if (bch_bio_alloc_pages(bio, GFP_KERNEL)) goto err; - pr_debug("%s", pkey(&w->key)); + trace_bcache_gc_copy(&w->key); closure_call(&io->s.cl, read_moving_submit, NULL, &c->gc.cl); diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index e5ff12e52d5b..695469958c1e 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -530,10 +530,9 @@ static void bch_insert_data_loop(struct closure *cl) if (KEY_CSUM(k)) bio_csum(n, k); - pr_debug("%s", pkey(k)); + trace_bcache_cache_insert(k); bch_keylist_push(&op->keys); - trace_bcache_cache_insert(n, n->bi_sector, n->bi_bdev); n->bi_rw |= REQ_WRITE; bch_submit_bbio(n, op->c, k, 0); } while (n != bio); @@ -784,11 +783,8 @@ static void request_read_error(struct closure *cl) int i; if (s->recoverable) { - /* The cache read failed, but we can retry from the backing - * device. - */ - pr_debug("recovering at sector %llu", - (uint64_t) s->orig_bio->bi_sector); + /* Retry from the backing device: */ + trace_bcache_read_retry(s->orig_bio); s->error = 0; bv = s->bio.bio.bi_io_vec; @@ -806,7 +802,6 @@ static void request_read_error(struct closure *cl) /* XXX: invalidate cache */ - trace_bcache_read_retry(&s->bio.bio); closure_bio_submit(&s->bio.bio, &s->cl, s->d); } @@ -899,6 +894,7 @@ static void request_read_done_bh(struct closure *cl) struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); bch_mark_cache_accounting(s, !s->cache_miss, s->op.skip); + trace_bcache_read(s->orig_bio, !s->cache_miss, s->op.skip); if (s->error) continue_at_nobarrier(cl, request_read_error, bcache_wq); @@ -969,7 +965,6 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, s->cache_miss = miss; bio_get(s->op.cache_bio); - trace_bcache_cache_miss(s->orig_bio); closure_bio_submit(s->op.cache_bio, &s->cl, s->d); return ret; @@ -1040,15 +1035,15 @@ static void request_write(struct cached_dev *dc, struct search *s) if (should_writeback(dc, s->orig_bio)) s->writeback = true; + trace_bcache_write(s->orig_bio, s->writeback, s->op.skip); + if (!s->writeback) { s->op.cache_bio = bio_clone_bioset(bio, GFP_NOIO, dc->disk.bio_split); - trace_bcache_writethrough(s->orig_bio); closure_bio_submit(bio, cl, s->d); } else { s->op.cache_bio = bio; - trace_bcache_writeback(s->orig_bio); bch_writeback_add(dc, bio_sectors(bio)); } out: @@ -1058,7 +1053,6 @@ skip: s->op.skip = true; s->op.cache_bio = s->orig_bio; bio_get(s->op.cache_bio); - trace_bcache_write_skip(s->orig_bio); if ((bio->bi_rw & REQ_DISCARD) && !blk_queue_discard(bdev_get_queue(dc->bdev))) @@ -1088,9 +1082,10 @@ static void request_nodata(struct cached_dev *dc, struct search *s) /* Cached devices - read & write stuff */ -int bch_get_congested(struct cache_set *c) +unsigned bch_get_congested(struct cache_set *c) { int i; + long rand; if (!c->congested_read_threshold_us && !c->congested_write_threshold_us) @@ -1106,7 +1101,13 @@ int bch_get_congested(struct cache_set *c) i += CONGESTED_MAX; - return i <= 0 ? 1 : fract_exp_two(i, 6); + if (i > 0) + i = fract_exp_two(i, 6); + + rand = get_random_int(); + i -= bitmap_weight(&rand, BITS_PER_LONG); + + return i > 0 ? i : 1; } static void add_sequential(struct task_struct *t) @@ -1126,10 +1127,8 @@ static void check_should_skip(struct cached_dev *dc, struct search *s) { struct cache_set *c = s->op.c; struct bio *bio = &s->bio.bio; - - long rand; - int cutoff = bch_get_congested(c); unsigned mode = cache_mode(dc, bio); + unsigned sectors, congested = bch_get_congested(c); if (atomic_read(&dc->disk.detaching) || c->gc_stats.in_use > CUTOFF_CACHE_ADD || @@ -1147,17 +1146,14 @@ static void check_should_skip(struct cached_dev *dc, struct search *s) goto skip; } - if (!cutoff) { - cutoff = dc->sequential_cutoff >> 9; + if (!congested && !dc->sequential_cutoff) + goto rescale; - if (!cutoff) - goto rescale; - - if (mode == CACHE_MODE_WRITEBACK && - (bio->bi_rw & REQ_WRITE) && - (bio->bi_rw & REQ_SYNC)) - goto rescale; - } + if (!congested && + mode == CACHE_MODE_WRITEBACK && + (bio->bi_rw & REQ_WRITE) && + (bio->bi_rw & REQ_SYNC)) + goto rescale; if (dc->sequential_merge) { struct io *i; @@ -1192,12 +1188,19 @@ found: add_sequential(s->task); } - rand = get_random_int(); - cutoff -= bitmap_weight(&rand, BITS_PER_LONG); + sectors = max(s->task->sequential_io, + s->task->sequential_io_avg) >> 9; - if (cutoff <= (int) (max(s->task->sequential_io, - s->task->sequential_io_avg) >> 9)) + if (dc->sequential_cutoff && + sectors >= dc->sequential_cutoff >> 9) { + trace_bcache_bypass_sequential(s->orig_bio); goto skip; + } + + if (congested && sectors >= congested) { + trace_bcache_bypass_congested(s->orig_bio); + goto skip; + } rescale: bch_rescale_priorities(c, bio_sectors(bio)); diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h index 254d9ab5707c..57dc4784f4f4 100644 --- a/drivers/md/bcache/request.h +++ b/drivers/md/bcache/request.h @@ -30,7 +30,7 @@ struct search { }; void bch_cache_read_endio(struct bio *, int); -int bch_get_congested(struct cache_set *); +unsigned bch_get_congested(struct cache_set *); void bch_insert_data(struct closure *cl); void bch_btree_insert_async(struct closure *); void bch_cache_read_endio(struct bio *, int); diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index e53f89988b08..47bc13745068 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -11,6 +11,7 @@ #include "debug.h" #include "request.h" +#include #include #include #include @@ -543,7 +544,6 @@ void bch_prio_write(struct cache *ca) pr_debug("free %zu, free_inc %zu, unused %zu", fifo_used(&ca->free), fifo_used(&ca->free_inc), fifo_used(&ca->unused)); - blktrace_msg(ca, "Starting priorities: " buckets_free(ca)); for (i = prio_buckets(ca) - 1; i >= 0; --i) { long bucket; diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 29228b8a6ffe..f5c2d8695230 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -10,6 +10,7 @@ #include "btree.h" #include "request.h" +#include #include static const char * const cache_replacement_policies[] = { diff --git a/drivers/md/bcache/trace.c b/drivers/md/bcache/trace.c index 983f9bb411bc..7f4f38aa16ae 100644 --- a/drivers/md/bcache/trace.c +++ b/drivers/md/bcache/trace.c @@ -2,6 +2,7 @@ #include "btree.h" #include "request.h" +#include #include #define CREATE_TRACE_POINTS @@ -9,18 +10,42 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_request_start); EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_request_end); -EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_passthrough); -EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_cache_hit); -EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_cache_miss); + +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_bypass_sequential); +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_bypass_congested); + +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_read); +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_write); EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_read_retry); -EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_writethrough); -EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_writeback); -EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_write_skip); + +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_cache_insert); + +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_journal_replay_key); +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_journal_write); +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_journal_full); +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_journal_entry_full); + +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_cache_cannibalize); + EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_read); EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_write); -EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_write_dirty); -EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_read_dirty); -EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_journal_write); -EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_cache_insert); + +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_node_alloc); +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_node_alloc_fail); +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_node_free); + +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_gc_coalesce); EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_gc_start); EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_gc_end); +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_gc_copy); +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_gc_copy_collision); + +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_node_split); +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_node_compact); +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_set_root); + +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_alloc_invalidate); +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_alloc_fail); + +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_writeback); +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_writeback_collision); diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h index 577393e38c3a..e02780545f12 100644 --- a/drivers/md/bcache/util.h +++ b/drivers/md/bcache/util.h @@ -15,8 +15,6 @@ struct closure; -#include - #ifdef CONFIG_BCACHE_EDEBUG #define atomic_dec_bug(v) BUG_ON(atomic_dec_return(v) < 0) diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 2714ed3991d1..82f6d4577be2 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -10,6 +10,8 @@ #include "btree.h" #include "debug.h" +#include + static struct workqueue_struct *dirty_wq; static void read_dirty(struct closure *); @@ -236,10 +238,12 @@ static void write_dirty_finish(struct closure *cl) for (i = 0; i < KEY_PTRS(&w->key); i++) atomic_inc(&PTR_BUCKET(dc->disk.c, &w->key, i)->pin); - pr_debug("clearing %s", pkey(&w->key)); bch_btree_insert(&op, dc->disk.c); closure_sync(&op.cl); + if (op.insert_collision) + trace_bcache_writeback_collision(&w->key); + atomic_long_inc(op.insert_collision ? &dc->disk.c->writeback_keys_failed : &dc->disk.c->writeback_keys_done); @@ -275,7 +279,6 @@ static void write_dirty(struct closure *cl) io->bio.bi_bdev = io->dc->bdev; io->bio.bi_end_io = dirty_endio; - trace_bcache_write_dirty(&io->bio); closure_bio_submit(&io->bio, cl, &io->dc->disk); continue_at(cl, write_dirty_finish, dirty_wq); @@ -296,7 +299,6 @@ static void read_dirty_submit(struct closure *cl) { struct dirty_io *io = container_of(cl, struct dirty_io, cl); - trace_bcache_read_dirty(&io->bio); closure_bio_submit(&io->bio, cl, &io->dc->disk); continue_at(cl, write_dirty, dirty_wq); @@ -352,7 +354,7 @@ static void read_dirty(struct closure *cl) if (bch_bio_alloc_pages(&io->bio, GFP_KERNEL)) goto err_free; - pr_debug("%s", pkey(&w->key)); + trace_bcache_writeback(&w->key); closure_call(&io->cl, read_dirty_submit, NULL, &dc->disk.cl); diff --git a/include/trace/events/bcache.h b/include/trace/events/bcache.h index 3cc5a0b278c3..c9952b36fcea 100644 --- a/include/trace/events/bcache.h +++ b/include/trace/events/bcache.h @@ -9,9 +9,7 @@ struct search; DECLARE_EVENT_CLASS(bcache_request, - TP_PROTO(struct search *s, struct bio *bio), - TP_ARGS(s, bio), TP_STRUCT__entry( @@ -22,7 +20,6 @@ DECLARE_EVENT_CLASS(bcache_request, __field(dev_t, orig_sector ) __field(unsigned int, nr_sector ) __array(char, rwbs, 6 ) - __array(char, comm, TASK_COMM_LEN ) ), TP_fast_assign( @@ -33,36 +30,66 @@ DECLARE_EVENT_CLASS(bcache_request, __entry->orig_sector = bio->bi_sector - 16; __entry->nr_sector = bio->bi_size >> 9; blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size); - memcpy(__entry->comm, current->comm, TASK_COMM_LEN); ), - TP_printk("%d,%d %s %llu + %u [%s] (from %d,%d @ %llu)", + TP_printk("%d,%d %s %llu + %u (from %d,%d @ %llu)", MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->rwbs, - (unsigned long long)__entry->sector, - __entry->nr_sector, __entry->comm, - __entry->orig_major, __entry->orig_minor, + __entry->rwbs, (unsigned long long)__entry->sector, + __entry->nr_sector, __entry->orig_major, __entry->orig_minor, (unsigned long long)__entry->orig_sector) ); -DEFINE_EVENT(bcache_request, bcache_request_start, +DECLARE_EVENT_CLASS(bkey, + TP_PROTO(struct bkey *k), + TP_ARGS(k), - TP_PROTO(struct search *s, struct bio *bio), + TP_STRUCT__entry( + __field(u32, size ) + __field(u32, inode ) + __field(u64, offset ) + __field(bool, dirty ) + ), - TP_ARGS(s, bio) + TP_fast_assign( + __entry->inode = KEY_INODE(k); + __entry->offset = KEY_OFFSET(k); + __entry->size = KEY_SIZE(k); + __entry->dirty = KEY_DIRTY(k); + ), + + TP_printk("%u:%llu len %u dirty %u", __entry->inode, + __entry->offset, __entry->size, __entry->dirty) ); -DEFINE_EVENT(bcache_request, bcache_request_end, +DECLARE_EVENT_CLASS(btree_node, + TP_PROTO(struct btree *b), + TP_ARGS(b), + + TP_STRUCT__entry( + __field(size_t, bucket ) + ), + + TP_fast_assign( + __entry->bucket = PTR_BUCKET_NR(b->c, &b->key, 0); + ), + TP_printk("bucket %zu", __entry->bucket) +); + +/* request.c */ + +DEFINE_EVENT(bcache_request, bcache_request_start, TP_PROTO(struct search *s, struct bio *bio), + TP_ARGS(s, bio) +); +DEFINE_EVENT(bcache_request, bcache_request_end, + TP_PROTO(struct search *s, struct bio *bio), TP_ARGS(s, bio) ); DECLARE_EVENT_CLASS(bcache_bio, - TP_PROTO(struct bio *bio), - TP_ARGS(bio), TP_STRUCT__entry( @@ -70,7 +97,6 @@ DECLARE_EVENT_CLASS(bcache_bio, __field(sector_t, sector ) __field(unsigned int, nr_sector ) __array(char, rwbs, 6 ) - __array(char, comm, TASK_COMM_LEN ) ), TP_fast_assign( @@ -78,191 +104,295 @@ DECLARE_EVENT_CLASS(bcache_bio, __entry->sector = bio->bi_sector; __entry->nr_sector = bio->bi_size >> 9; blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size); - memcpy(__entry->comm, current->comm, TASK_COMM_LEN); ), - TP_printk("%d,%d %s %llu + %u [%s]", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->rwbs, - (unsigned long long)__entry->sector, - __entry->nr_sector, __entry->comm) + TP_printk("%d,%d %s %llu + %u", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, + (unsigned long long)__entry->sector, __entry->nr_sector) ); - -DEFINE_EVENT(bcache_bio, bcache_passthrough, - +DEFINE_EVENT(bcache_bio, bcache_bypass_sequential, TP_PROTO(struct bio *bio), - TP_ARGS(bio) ); -DEFINE_EVENT(bcache_bio, bcache_cache_hit, - +DEFINE_EVENT(bcache_bio, bcache_bypass_congested, TP_PROTO(struct bio *bio), - TP_ARGS(bio) ); -DEFINE_EVENT(bcache_bio, bcache_cache_miss, +TRACE_EVENT(bcache_read, + TP_PROTO(struct bio *bio, bool hit, bool bypass), + TP_ARGS(bio, hit, bypass), - TP_PROTO(struct bio *bio), + TP_STRUCT__entry( + __field(dev_t, dev ) + __field(sector_t, sector ) + __field(unsigned int, nr_sector ) + __array(char, rwbs, 6 ) + __field(bool, cache_hit ) + __field(bool, bypass ) + ), - TP_ARGS(bio) + TP_fast_assign( + __entry->dev = bio->bi_bdev->bd_dev; + __entry->sector = bio->bi_sector; + __entry->nr_sector = bio->bi_size >> 9; + blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size); + __entry->cache_hit = hit; + __entry->bypass = bypass; + ), + + TP_printk("%d,%d %s %llu + %u hit %u bypass %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->rwbs, (unsigned long long)__entry->sector, + __entry->nr_sector, __entry->cache_hit, __entry->bypass) ); -DEFINE_EVENT(bcache_bio, bcache_read_retry, +TRACE_EVENT(bcache_write, + TP_PROTO(struct bio *bio, bool writeback, bool bypass), + TP_ARGS(bio, writeback, bypass), - TP_PROTO(struct bio *bio), + TP_STRUCT__entry( + __field(dev_t, dev ) + __field(sector_t, sector ) + __field(unsigned int, nr_sector ) + __array(char, rwbs, 6 ) + __field(bool, writeback ) + __field(bool, bypass ) + ), - TP_ARGS(bio) -); + TP_fast_assign( + __entry->dev = bio->bi_bdev->bd_dev; + __entry->sector = bio->bi_sector; + __entry->nr_sector = bio->bi_size >> 9; + blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size); + __entry->writeback = writeback; + __entry->bypass = bypass; + ), -DEFINE_EVENT(bcache_bio, bcache_writethrough, + TP_printk("%d,%d %s %llu + %u hit %u bypass %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->rwbs, (unsigned long long)__entry->sector, + __entry->nr_sector, __entry->writeback, __entry->bypass) +); +DEFINE_EVENT(bcache_bio, bcache_read_retry, TP_PROTO(struct bio *bio), - TP_ARGS(bio) ); -DEFINE_EVENT(bcache_bio, bcache_writeback, +DEFINE_EVENT(bkey, bcache_cache_insert, + TP_PROTO(struct bkey *k), + TP_ARGS(k) +); - TP_PROTO(struct bio *bio), +/* Journal */ - TP_ARGS(bio) -); +DECLARE_EVENT_CLASS(cache_set, + TP_PROTO(struct cache_set *c), + TP_ARGS(c), -DEFINE_EVENT(bcache_bio, bcache_write_skip, + TP_STRUCT__entry( + __array(char, uuid, 16 ) + ), - TP_PROTO(struct bio *bio), + TP_fast_assign( + memcpy(__entry->uuid, c->sb.set_uuid, 16); + ), - TP_ARGS(bio) + TP_printk("%pU", __entry->uuid) ); -DEFINE_EVENT(bcache_bio, bcache_btree_read, - - TP_PROTO(struct bio *bio), +DEFINE_EVENT(bkey, bcache_journal_replay_key, + TP_PROTO(struct bkey *k), + TP_ARGS(k) +); - TP_ARGS(bio) +DEFINE_EVENT(cache_set, bcache_journal_full, + TP_PROTO(struct cache_set *c), + TP_ARGS(c) ); -DEFINE_EVENT(bcache_bio, bcache_btree_write, +DEFINE_EVENT(cache_set, bcache_journal_entry_full, + TP_PROTO(struct cache_set *c), + TP_ARGS(c) +); +DEFINE_EVENT(bcache_bio, bcache_journal_write, TP_PROTO(struct bio *bio), - TP_ARGS(bio) ); -DEFINE_EVENT(bcache_bio, bcache_write_dirty, +/* Btree */ - TP_PROTO(struct bio *bio), +DEFINE_EVENT(cache_set, bcache_btree_cache_cannibalize, + TP_PROTO(struct cache_set *c), + TP_ARGS(c) +); - TP_ARGS(bio) +DEFINE_EVENT(btree_node, bcache_btree_read, + TP_PROTO(struct btree *b), + TP_ARGS(b) ); -DEFINE_EVENT(bcache_bio, bcache_read_dirty, +TRACE_EVENT(bcache_btree_write, + TP_PROTO(struct btree *b), + TP_ARGS(b), - TP_PROTO(struct bio *bio), + TP_STRUCT__entry( + __field(size_t, bucket ) + __field(unsigned, block ) + __field(unsigned, keys ) + ), - TP_ARGS(bio) + TP_fast_assign( + __entry->bucket = PTR_BUCKET_NR(b->c, &b->key, 0); + __entry->block = b->written; + __entry->keys = b->sets[b->nsets].data->keys; + ), + + TP_printk("bucket %zu", __entry->bucket) ); -DEFINE_EVENT(bcache_bio, bcache_write_moving, +DEFINE_EVENT(btree_node, bcache_btree_node_alloc, + TP_PROTO(struct btree *b), + TP_ARGS(b) +); - TP_PROTO(struct bio *bio), +DEFINE_EVENT(btree_node, bcache_btree_node_alloc_fail, + TP_PROTO(struct btree *b), + TP_ARGS(b) +); - TP_ARGS(bio) +DEFINE_EVENT(btree_node, bcache_btree_node_free, + TP_PROTO(struct btree *b), + TP_ARGS(b) ); -DEFINE_EVENT(bcache_bio, bcache_read_moving, +TRACE_EVENT(bcache_btree_gc_coalesce, + TP_PROTO(unsigned nodes), + TP_ARGS(nodes), - TP_PROTO(struct bio *bio), + TP_STRUCT__entry( + __field(unsigned, nodes ) + ), - TP_ARGS(bio) -); + TP_fast_assign( + __entry->nodes = nodes; + ), -DEFINE_EVENT(bcache_bio, bcache_journal_write, + TP_printk("coalesced %u nodes", __entry->nodes) +); - TP_PROTO(struct bio *bio), +DEFINE_EVENT(cache_set, bcache_gc_start, + TP_PROTO(struct cache_set *c), + TP_ARGS(c) +); - TP_ARGS(bio) +DEFINE_EVENT(cache_set, bcache_gc_end, + TP_PROTO(struct cache_set *c), + TP_ARGS(c) ); -DECLARE_EVENT_CLASS(bcache_cache_bio, +DEFINE_EVENT(bkey, bcache_gc_copy, + TP_PROTO(struct bkey *k), + TP_ARGS(k) +); - TP_PROTO(struct bio *bio, - sector_t orig_sector, - struct block_device* orig_bdev), +DEFINE_EVENT(bkey, bcache_gc_copy_collision, + TP_PROTO(struct bkey *k), + TP_ARGS(k) +); - TP_ARGS(bio, orig_sector, orig_bdev), +DECLARE_EVENT_CLASS(btree_split, + TP_PROTO(struct btree *b, unsigned keys), + TP_ARGS(b, keys), TP_STRUCT__entry( - __field(dev_t, dev ) - __field(dev_t, orig_dev ) - __field(sector_t, sector ) - __field(sector_t, orig_sector ) - __field(unsigned int, nr_sector ) - __array(char, rwbs, 6 ) - __array(char, comm, TASK_COMM_LEN ) + __field(size_t, bucket ) + __field(unsigned, keys ) ), TP_fast_assign( - __entry->dev = bio->bi_bdev->bd_dev; - __entry->orig_dev = orig_bdev->bd_dev; - __entry->sector = bio->bi_sector; - __entry->orig_sector = orig_sector; - __entry->nr_sector = bio->bi_size >> 9; - blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size); - memcpy(__entry->comm, current->comm, TASK_COMM_LEN); + __entry->bucket = PTR_BUCKET_NR(b->c, &b->key, 0); + __entry->keys = keys; ), - TP_printk("%d,%d %s %llu + %u [%s] (from %d,%d %llu)", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->rwbs, - (unsigned long long)__entry->sector, - __entry->nr_sector, __entry->comm, - MAJOR(__entry->orig_dev), MINOR(__entry->orig_dev), - (unsigned long long)__entry->orig_sector) + TP_printk("bucket %zu keys %u", __entry->bucket, __entry->keys) ); -DEFINE_EVENT(bcache_cache_bio, bcache_cache_insert, - - TP_PROTO(struct bio *bio, - sector_t orig_sector, - struct block_device *orig_bdev), +DEFINE_EVENT(btree_split, bcache_btree_node_split, + TP_PROTO(struct btree *b, unsigned keys), + TP_ARGS(b, keys) +); - TP_ARGS(bio, orig_sector, orig_bdev) +DEFINE_EVENT(btree_split, bcache_btree_node_compact, + TP_PROTO(struct btree *b, unsigned keys), + TP_ARGS(b, keys) ); -DECLARE_EVENT_CLASS(bcache_gc, +DEFINE_EVENT(btree_node, bcache_btree_set_root, + TP_PROTO(struct btree *b), + TP_ARGS(b) +); - TP_PROTO(uint8_t *uuid), +/* Allocator */ - TP_ARGS(uuid), +TRACE_EVENT(bcache_alloc_invalidate, + TP_PROTO(struct cache *ca), + TP_ARGS(ca), TP_STRUCT__entry( - __field(uint8_t *, uuid) + __field(unsigned, free ) + __field(unsigned, free_inc ) + __field(unsigned, free_inc_size ) + __field(unsigned, unused ) ), TP_fast_assign( - __entry->uuid = uuid; + __entry->free = fifo_used(&ca->free); + __entry->free_inc = fifo_used(&ca->free_inc); + __entry->free_inc_size = ca->free_inc.size; + __entry->unused = fifo_used(&ca->unused); ), - TP_printk("%pU", __entry->uuid) + TP_printk("free %u free_inc %u/%u unused %u", __entry->free, + __entry->free_inc, __entry->free_inc_size, __entry->unused) ); +TRACE_EVENT(bcache_alloc_fail, + TP_PROTO(struct cache *ca), + TP_ARGS(ca), -DEFINE_EVENT(bcache_gc, bcache_gc_start, + TP_STRUCT__entry( + __field(unsigned, free ) + __field(unsigned, free_inc ) + __field(unsigned, unused ) + __field(unsigned, blocked ) + ), - TP_PROTO(uint8_t *uuid), + TP_fast_assign( + __entry->free = fifo_used(&ca->free); + __entry->free_inc = fifo_used(&ca->free_inc); + __entry->unused = fifo_used(&ca->unused); + __entry->blocked = atomic_read(&ca->set->prio_blocked); + ), - TP_ARGS(uuid) + TP_printk("free %u free_inc %u unused %u blocked %u", __entry->free, + __entry->free_inc, __entry->unused, __entry->blocked) ); -DEFINE_EVENT(bcache_gc, bcache_gc_end, +/* Background writeback */ - TP_PROTO(uint8_t *uuid), +DEFINE_EVENT(bkey, bcache_writeback, + TP_PROTO(struct bkey *k), + TP_ARGS(k) +); - TP_ARGS(uuid) +DEFINE_EVENT(bkey, bcache_writeback_collision, + TP_PROTO(struct bkey *k), + TP_ARGS(k) ); #endif /* _TRACE_BCACHE_H */ -- cgit v1.2.3 From 85b1492ee113486d871de7676a61f506a43ca475 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 14 May 2013 20:33:16 -0700 Subject: bcache: Rip out pkey()/pbtree() Old gcc doesnt like the struct hack, and it is kind of ugly. So finish off the work to convert pr_debug() statements to tracepoints, and delete pkey()/pbtree(). Signed-off-by: Kent Overstreet --- drivers/md/bcache/bset.c | 16 ++++++++++++---- drivers/md/bcache/btree.c | 21 ++++++--------------- drivers/md/bcache/btree.h | 7 +++++++ drivers/md/bcache/debug.c | 40 +++++++++++++++++++++++++--------------- drivers/md/bcache/debug.h | 11 ++--------- drivers/md/bcache/super.c | 5 +++-- drivers/md/bcache/trace.c | 2 ++ include/trace/events/bcache.h | 33 +++++++++++++++++++++++++++++++++ 8 files changed, 90 insertions(+), 45 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index cb4578a327b9..e9399ed7f688 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -78,6 +78,7 @@ struct bkey *bch_keylist_pop(struct keylist *l) bool __bch_ptr_invalid(struct cache_set *c, int level, const struct bkey *k) { unsigned i; + char buf[80]; if (level && (!KEY_PTRS(k) || !KEY_SIZE(k) || KEY_DIRTY(k))) goto bad; @@ -102,7 +103,8 @@ bool __bch_ptr_invalid(struct cache_set *c, int level, const struct bkey *k) return false; bad: - cache_bug(c, "spotted bad key %s: %s", pkey(k), bch_ptr_status(c, k)); + bch_bkey_to_text(buf, sizeof(buf), k); + cache_bug(c, "spotted bad key %s: %s", buf, bch_ptr_status(c, k)); return true; } @@ -162,10 +164,16 @@ bool bch_ptr_bad(struct btree *b, const struct bkey *k) #ifdef CONFIG_BCACHE_EDEBUG bug: mutex_unlock(&b->c->bucket_lock); - btree_bug(b, + + { + char buf[80]; + + bch_bkey_to_text(buf, sizeof(buf), k); + btree_bug(b, "inconsistent pointer %s: bucket %zu pin %i prio %i gen %i last_gc %i mark %llu gc_gen %i", - pkey(k), PTR_BUCKET_NR(b->c, k, i), atomic_read(&g->pin), - g->prio, g->gen, g->last_gc, GC_MARK(g), g->gc_gen); + buf, PTR_BUCKET_NR(b->c, k, i), atomic_read(&g->pin), + g->prio, g->gen, g->last_gc, GC_MARK(g), g->gc_gen); + } return true; #endif } diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 218d486259a3..53a0f4ef4e32 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -1770,7 +1770,7 @@ static bool btree_insert_key(struct btree *b, struct btree_op *op, { struct bset *i = b->sets[b->nsets].data; struct bkey *m, *prev; - const char *status = "insert"; + unsigned status = BTREE_INSERT_STATUS_INSERT; BUG_ON(bkey_cmp(k, &b->key) > 0); BUG_ON(b->level && !KEY_PTRS(k)); @@ -1803,17 +1803,17 @@ static bool btree_insert_key(struct btree *b, struct btree_op *op, goto insert; /* prev is in the tree, if we merge we're done */ - status = "back merging"; + status = BTREE_INSERT_STATUS_BACK_MERGE; if (prev && bch_bkey_try_merge(b, prev, k)) goto merged; - status = "overwrote front"; + status = BTREE_INSERT_STATUS_OVERWROTE; if (m != end(i) && KEY_PTRS(m) == KEY_PTRS(k) && !KEY_SIZE(m)) goto copy; - status = "front merge"; + status = BTREE_INSERT_STATUS_FRONT_MERGE; if (m != end(i) && bch_bkey_try_merge(b, k, m)) goto copy; @@ -1823,16 +1823,12 @@ static bool btree_insert_key(struct btree *b, struct btree_op *op, insert: shift_keys(b, m, k); copy: bkey_copy(m, k); merged: - bch_check_keys(b, "%s for %s at %s: %s", status, - op_type(op), pbtree(b), pkey(k)); - bch_check_key_order_msg(b, i, "%s for %s at %s: %s", status, - op_type(op), pbtree(b), pkey(k)); + bch_check_keys(b, "%u for %s", status, op_type(op)); if (b->level && !KEY_OFFSET(k)) btree_current_write(b)->prio_blocked++; - pr_debug("%s for %s at %s: %s", status, - op_type(op), pbtree(b), pkey(k)); + trace_bcache_btree_insert_key(b, k, op->type, status); return true; } @@ -2234,9 +2230,6 @@ int bch_btree_search_recurse(struct btree *b, struct btree_op *op) struct btree_iter iter; bch_btree_iter_init(b, &iter, &KEY(op->inode, bio->bi_sector, 0)); - pr_debug("at %s searching for %u:%llu", pbtree(b), op->inode, - (uint64_t) bio->bi_sector); - do { k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad); if (!k) { @@ -2302,8 +2295,6 @@ static int bch_btree_refill_keybuf(struct btree *b, struct btree_op *op, if (buf->key_predicate(buf, k)) { struct keybuf_key *w; - pr_debug("%s", pkey(k)); - spin_lock(&buf->lock); w = array_alloc(&buf->freelist); diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index 809bd77847a2..2b016b93cad4 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -271,6 +271,13 @@ struct btree_op { BKEY_PADDED(replace); }; +enum { + BTREE_INSERT_STATUS_INSERT, + BTREE_INSERT_STATUS_BACK_MERGE, + BTREE_INSERT_STATUS_OVERWROTE, + BTREE_INSERT_STATUS_FRONT_MERGE, +}; + void bch_btree_op_init_stack(struct btree_op *); static inline void rw_lock(bool w, struct btree *b, int level) diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c index ae6096c6845d..82e3a07771ec 100644 --- a/drivers/md/bcache/debug.c +++ b/drivers/md/bcache/debug.c @@ -47,11 +47,10 @@ const char *bch_ptr_status(struct cache_set *c, const struct bkey *k) return ""; } -struct keyprint_hack bch_pkey(const struct bkey *k) +int bch_bkey_to_text(char *buf, size_t size, const struct bkey *k) { unsigned i = 0; - struct keyprint_hack r; - char *out = r.s, *end = r.s + KEYHACK_SIZE; + char *out = buf, *end = buf + size; #define p(...) (out += scnprintf(out, end - out, __VA_ARGS__)) @@ -75,16 +74,14 @@ struct keyprint_hack bch_pkey(const struct bkey *k) if (KEY_CSUM(k)) p(" cs%llu %llx", KEY_CSUM(k), k->ptr[1]); #undef p - return r; + return out - buf; } -struct keyprint_hack bch_pbtree(const struct btree *b) +int bch_btree_to_text(char *buf, size_t size, const struct btree *b) { - struct keyprint_hack r; - - snprintf(r.s, 40, "%zu level %i/%i", PTR_BUCKET_NR(b->c, &b->key, 0), - b->level, b->c->root ? b->c->root->level : -1); - return r; + return scnprintf(buf, size, "%zu level %i/%i", + PTR_BUCKET_NR(b->c, &b->key, 0), + b->level, b->c->root ? b->c->root->level : -1); } #if defined(CONFIG_BCACHE_DEBUG) || defined(CONFIG_BCACHE_EDEBUG) @@ -100,10 +97,12 @@ static void dump_bset(struct btree *b, struct bset *i) { struct bkey *k; unsigned j; + char buf[80]; for (k = i->start; k < end(i); k = bkey_next(k)) { + bch_bkey_to_text(buf, sizeof(buf), k); printk(KERN_ERR "block %zu key %zi/%u: %s", index(i, b), - (uint64_t *) k - i->d, i->keys, pkey(k)); + (uint64_t *) k - i->d, i->keys, buf); for (j = 0; j < KEY_PTRS(k); j++) { size_t n = PTR_BUCKET_NR(b->c, k, j); @@ -252,6 +251,7 @@ static void vdump_bucket_and_panic(struct btree *b, const char *fmt, va_list args) { unsigned i; + char buf[80]; console_lock(); @@ -262,7 +262,8 @@ static void vdump_bucket_and_panic(struct btree *b, const char *fmt, console_unlock(); - panic("at %s\n", pbtree(b)); + bch_btree_to_text(buf, sizeof(buf), b); + panic("at %s\n", buf); } void bch_check_key_order_msg(struct btree *b, struct bset *i, @@ -337,6 +338,7 @@ static ssize_t bch_dump_read(struct file *file, char __user *buf, { struct dump_iterator *i = file->private_data; ssize_t ret = 0; + char kbuf[80]; while (size) { struct keybuf_key *w; @@ -359,7 +361,8 @@ static ssize_t bch_dump_read(struct file *file, char __user *buf, if (!w) break; - i->bytes = snprintf(i->buf, PAGE_SIZE, "%s\n", pkey(&w->key)); + bch_bkey_to_text(kbuf, sizeof(kbuf), &w->key); + i->bytes = snprintf(i->buf, PAGE_SIZE, "%s\n", kbuf); bch_keybuf_del(&i->keys, w); } @@ -526,10 +529,17 @@ static ssize_t btree_fuzz(struct kobject *k, struct kobj_attribute *a, k < end(i); k = bkey_next(k), l = bkey_next(l)) if (bkey_cmp(k, l) || - KEY_SIZE(k) != KEY_SIZE(l)) + KEY_SIZE(k) != KEY_SIZE(l)) { + char buf1[80]; + char buf2[80]; + + bch_bkey_to_text(buf1, sizeof(buf1), k); + bch_bkey_to_text(buf2, sizeof(buf2), l); + pr_err("key %zi differs: %s != %s", (uint64_t *) k - i->d, - pkey(k), pkey(l)); + buf1, buf2); + } for (j = 0; j < 3; j++) { pr_err("**** Set %i ****", j); diff --git a/drivers/md/bcache/debug.h b/drivers/md/bcache/debug.h index f9378a218148..1c39b5a2489b 100644 --- a/drivers/md/bcache/debug.h +++ b/drivers/md/bcache/debug.h @@ -3,15 +3,8 @@ /* Btree/bkey debug printing */ -#define KEYHACK_SIZE 80 -struct keyprint_hack { - char s[KEYHACK_SIZE]; -}; - -struct keyprint_hack bch_pkey(const struct bkey *k); -struct keyprint_hack bch_pbtree(const struct btree *b); -#define pkey(k) (&bch_pkey(k).s[0]) -#define pbtree(b) (&bch_pbtree(b).s[0]) +int bch_bkey_to_text(char *buf, size_t size, const struct bkey *k); +int bch_btree_to_text(char *buf, size_t size, const struct btree *b); #ifdef CONFIG_BCACHE_EDEBUG diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 47bc13745068..f24c2e0cbb1c 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -343,6 +343,7 @@ static void uuid_io(struct cache_set *c, unsigned long rw, struct closure *cl = &c->uuid_write.cl; struct uuid_entry *u; unsigned i; + char buf[80]; BUG_ON(!parent); closure_lock(&c->uuid_write, parent); @@ -363,8 +364,8 @@ static void uuid_io(struct cache_set *c, unsigned long rw, break; } - pr_debug("%s UUIDs at %s", rw & REQ_WRITE ? "wrote" : "read", - pkey(&c->uuid_bucket)); + bch_bkey_to_text(buf, sizeof(buf), k); + pr_debug("%s UUIDs at %s", rw & REQ_WRITE ? "wrote" : "read", buf); for (u = c->uuids; u < c->uuids + c->nr_uuids; u++) if (!bch_is_zero(u->uuid, 16)) diff --git a/drivers/md/bcache/trace.c b/drivers/md/bcache/trace.c index 7f4f38aa16ae..f7b6c197f90f 100644 --- a/drivers/md/bcache/trace.c +++ b/drivers/md/bcache/trace.c @@ -40,6 +40,8 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_gc_end); EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_gc_copy); EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_gc_copy_collision); +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_insert_key); + EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_node_split); EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_node_compact); EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_set_root); diff --git a/include/trace/events/bcache.h b/include/trace/events/bcache.h index c9952b36fcea..5ebda976ea93 100644 --- a/include/trace/events/bcache.h +++ b/include/trace/events/bcache.h @@ -305,6 +305,39 @@ DEFINE_EVENT(bkey, bcache_gc_copy_collision, TP_ARGS(k) ); +TRACE_EVENT(bcache_btree_insert_key, + TP_PROTO(struct btree *b, struct bkey *k, unsigned op, unsigned status), + TP_ARGS(b, k, op, status), + + TP_STRUCT__entry( + __field(u64, btree_node ) + __field(u32, btree_level ) + __field(u32, inode ) + __field(u64, offset ) + __field(u32, size ) + __field(u8, dirty ) + __field(u8, op ) + __field(u8, status ) + ), + + TP_fast_assign( + __entry->btree_node = PTR_BUCKET_NR(b->c, &b->key, 0); + __entry->btree_level = b->level; + __entry->inode = KEY_INODE(k); + __entry->offset = KEY_OFFSET(k); + __entry->size = KEY_SIZE(k); + __entry->dirty = KEY_DIRTY(k); + __entry->op = op; + __entry->status = status; + ), + + TP_printk("%u for %u at %llu(%u): %u:%llu len %u dirty %u", + __entry->status, __entry->op, + __entry->btree_node, __entry->btree_level, + __entry->inode, __entry->offset, + __entry->size, __entry->dirty) +); + DECLARE_EVENT_CLASS(btree_split, TP_PROTO(struct btree *b, unsigned keys), TP_ARGS(b, keys), -- cgit v1.2.3 From 6ded34d1a54c046a45db071d3cb7b37bd0a4a31f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 11 May 2013 15:59:37 -0700 Subject: bcache: Improve lazy sorting The old lazy sorting code was kind of hacky - rewrite in a way that mathematically makes more sense; the idea is that the size of the sets of keys in a btree node should increase by a more or less fixed ratio from smallest to biggest. Signed-off-by: Kent Overstreet --- drivers/md/bcache/bcache.h | 1 + drivers/md/bcache/bset.c | 40 +++++++++++++++++++++++----------------- drivers/md/bcache/super.c | 2 ++ 3 files changed, 26 insertions(+), 17 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 59c15e09e4dd..6fa5a1e33c49 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -828,6 +828,7 @@ struct cache_set { */ struct mutex sort_lock; struct bset *sort; + unsigned sort_crit_factor; /* List of buckets we're currently writing data to */ struct list_head data_buckets; diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index e9399ed7f688..a0f190ac17a4 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -1092,33 +1092,39 @@ void bch_btree_sort_into(struct btree *b, struct btree *new) new->sets->size = 0; } +#define SORT_CRIT (4096 / sizeof(uint64_t)) + void bch_btree_sort_lazy(struct btree *b) { - if (b->nsets) { - unsigned i, j, keys = 0, total; - - for (i = 0; i <= b->nsets; i++) - keys += b->sets[i].data->keys; + unsigned crit = SORT_CRIT; + int i; - total = keys; + /* Don't sort if nothing to do */ + if (!b->nsets) + goto out; - for (j = 0; j < b->nsets; j++) { - if (keys * 2 < total || - keys < 1000) { - bch_btree_sort_partial(b, j); - return; - } + /* If not a leaf node, always sort */ + if (b->level) { + bch_btree_sort(b); + return; + } - keys -= b->sets[j].data->keys; - } + for (i = b->nsets - 1; i >= 0; --i) { + crit *= b->c->sort_crit_factor; - /* Must sort if b->nsets == 3 or we'll overflow */ - if (b->nsets >= (MAX_BSETS - 1) - b->level) { - bch_btree_sort(b); + if (b->sets[i].data->keys < crit) { + bch_btree_sort_partial(b, i); return; } } + /* Sort if we'd overflow */ + if (b->nsets + 1 == MAX_BSETS) { + bch_btree_sort(b); + return; + } + +out: bset_build_written_tree(b); } diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index f24c2e0cbb1c..3c8474161e8e 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1375,6 +1375,8 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) c->btree_pages = max_t(int, c->btree_pages / 4, BTREE_MAX_PAGES); + c->sort_crit_factor = int_sqrt(c->btree_pages); + mutex_init(&c->bucket_lock); mutex_init(&c->sort_lock); spin_lock_init(&c->sort_time_lock); -- cgit v1.2.3 From 444fc0b6b167ed164e7436621a9d095e042644dd Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 11 May 2013 17:07:26 -0700 Subject: bcache: Initialize sectors_dirty when attaching Previously, dirty_data wouldn't get initialized until the first garbage collection... which was a bit of a problem for background writeback (as the PD controller keys off of it) and also confusing for users. This is also prep work for making background writeback aware of raid5/6 stripes. Signed-off-by: Kent Overstreet --- drivers/md/bcache/bcache.h | 2 +- drivers/md/bcache/btree.c | 29 +---------------------------- drivers/md/bcache/super.c | 1 + drivers/md/bcache/writeback.c | 36 ++++++++++++++++++++++++++++++++++++ 4 files changed, 39 insertions(+), 29 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 6fa5a1e33c49..d099d8894c2f 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -438,7 +438,6 @@ struct bcache_device { atomic_t detaching; atomic_long_t sectors_dirty; - unsigned long sectors_dirty_gc; unsigned long sectors_dirty_last; long sectors_dirty_derivative; @@ -1225,6 +1224,7 @@ void bch_cache_set_stop(struct cache_set *); struct cache_set *bch_cache_set_alloc(struct cache_sb *); void bch_btree_cache_free(struct cache_set *); int bch_btree_cache_alloc(struct cache_set *); +void bch_sectors_dirty_init(struct cached_dev *); void bch_cached_dev_writeback_init(struct cached_dev *); void bch_moving_init_cache_set(struct cache_set *); diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 53a0f4ef4e32..230c3a6d9be2 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -1119,11 +1119,8 @@ static int btree_gc_mark_node(struct btree *b, unsigned *keys, gc->nkeys++; gc->data += KEY_SIZE(k); - if (KEY_DIRTY(k)) { + if (KEY_DIRTY(k)) gc->dirty += KEY_SIZE(k); - if (d) - d->sectors_dirty_gc += KEY_SIZE(k); - } } for (t = b->sets; t <= &b->sets[b->nsets]; t++) @@ -1377,7 +1374,6 @@ static void btree_gc_start(struct cache_set *c) { struct cache *ca; struct bucket *b; - struct bcache_device **d; unsigned i; if (!c->gc_mark_valid) @@ -1395,12 +1391,6 @@ static void btree_gc_start(struct cache_set *c) SET_GC_MARK(b, GC_MARK_RECLAIMABLE); } - for (d = c->devices; - d < c->devices + c->nr_uuids; - d++) - if (*d) - (*d)->sectors_dirty_gc = 0; - mutex_unlock(&c->bucket_lock); } @@ -1409,7 +1399,6 @@ size_t bch_btree_gc_finish(struct cache_set *c) size_t available = 0; struct bucket *b; struct cache *ca; - struct bcache_device **d; unsigned i; mutex_lock(&c->bucket_lock); @@ -1452,22 +1441,6 @@ size_t bch_btree_gc_finish(struct cache_set *c) } } - for (d = c->devices; - d < c->devices + c->nr_uuids; - d++) - if (*d) { - unsigned long last = - atomic_long_read(&((*d)->sectors_dirty)); - long difference = (*d)->sectors_dirty_gc - last; - - pr_debug("sectors dirty off by %li", difference); - - (*d)->sectors_dirty_last += difference; - - atomic_long_set(&((*d)->sectors_dirty), - (*d)->sectors_dirty_gc); - } - mutex_unlock(&c->bucket_lock); return available; } diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 3c8474161e8e..dbfa1c38e85e 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -961,6 +961,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c) atomic_set(&dc->count, 1); if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) { + bch_sectors_dirty_init(dc); atomic_set(&dc->has_dirty, 1); atomic_inc(&dc->count); bch_writeback_queue(dc); diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 82f6d4577be2..553949eefd51 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -377,6 +377,42 @@ err: refill_dirty(cl); } +/* Init */ + +static int bch_btree_sectors_dirty_init(struct btree *b, struct btree_op *op, + struct cached_dev *dc) +{ + struct bkey *k; + struct btree_iter iter; + + bch_btree_iter_init(b, &iter, &KEY(dc->disk.id, 0, 0)); + while ((k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad))) + if (!b->level) { + if (KEY_INODE(k) > dc->disk.id) + break; + + if (KEY_DIRTY(k)) + atomic_long_add(KEY_SIZE(k), + &dc->disk.sectors_dirty); + } else { + btree(sectors_dirty_init, k, b, op, dc); + if (KEY_INODE(k) > dc->disk.id) + break; + + cond_resched(); + } + + return 0; +} + +void bch_sectors_dirty_init(struct cached_dev *dc) +{ + struct btree_op op; + + bch_btree_op_init_stack(&op); + btree_root(sectors_dirty_init, dc->disk.c, &op, dc); +} + void bch_cached_dev_writeback_init(struct cached_dev *dc) { closure_init_unlocked(&dc->writeback); -- cgit v1.2.3 From 279afbad4e54acbd61bf88a54a73af3bbfdeb5dd Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 5 Jun 2013 06:21:07 -0700 Subject: bcache: Track dirty data by stripe To make background writeback aware of raid5/6 stripes, we first need to track the amount of dirty data within each stripe - we do this by breaking up the existing sectors_dirty into per stripe atomic_ts Signed-off-by: Kent Overstreet --- drivers/md/bcache/bcache.h | 10 ++++------ drivers/md/bcache/btree.c | 20 +++++++++++++------- drivers/md/bcache/request.c | 3 ++- drivers/md/bcache/super.c | 32 ++++++++++++++++++++++++++++---- drivers/md/bcache/sysfs.c | 5 +++-- drivers/md/bcache/writeback.c | 40 ++++++++++++++++++++++++++++++++++------ drivers/md/bcache/writeback.h | 21 +++++++++++++++++++++ 7 files changed, 105 insertions(+), 26 deletions(-) create mode 100644 drivers/md/bcache/writeback.h (limited to 'drivers/md') diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index d099d8894c2f..dbddef0cdb59 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -437,7 +437,10 @@ struct bcache_device { /* If nonzero, we're detaching/unregistering from cache set */ atomic_t detaching; - atomic_long_t sectors_dirty; + uint64_t nr_stripes; + unsigned stripe_size_bits; + atomic_t *stripe_sectors_dirty; + unsigned long sectors_dirty_last; long sectors_dirty_derivative; @@ -1159,9 +1162,6 @@ static inline void wake_up_allocators(struct cache_set *c) /* Forward declarations */ -void bch_writeback_queue(struct cached_dev *); -void bch_writeback_add(struct cached_dev *, unsigned); - void bch_count_io_errors(struct cache *, int, const char *); void bch_bbio_count_io_errors(struct cache_set *, struct bio *, int, const char *); @@ -1224,8 +1224,6 @@ void bch_cache_set_stop(struct cache_set *); struct cache_set *bch_cache_set_alloc(struct cache_sb *); void bch_btree_cache_free(struct cache_set *); int bch_btree_cache_alloc(struct cache_set *); -void bch_sectors_dirty_init(struct cached_dev *); -void bch_cached_dev_writeback_init(struct cached_dev *); void bch_moving_init_cache_set(struct cache_set *); int bch_cache_allocator_start(struct cache *ca); diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 230c3a6d9be2..b93cf56260a4 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -24,6 +24,7 @@ #include "btree.h" #include "debug.h" #include "request.h" +#include "writeback.h" #include #include @@ -1599,14 +1600,14 @@ static bool fix_overlapping_extents(struct btree *b, struct btree_iter *iter, struct btree_op *op) { - void subtract_dirty(struct bkey *k, int sectors) + void subtract_dirty(struct bkey *k, uint64_t offset, int sectors) { - struct bcache_device *d = b->c->devices[KEY_INODE(k)]; - - if (KEY_DIRTY(k) && d) - atomic_long_sub(sectors, &d->sectors_dirty); + if (KEY_DIRTY(k)) + bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k), + offset, -sectors); } + uint64_t old_offset; unsigned old_size, sectors_found = 0; while (1) { @@ -1618,6 +1619,7 @@ static bool fix_overlapping_extents(struct btree *b, if (bkey_cmp(k, &START_KEY(insert)) <= 0) continue; + old_offset = KEY_START(k); old_size = KEY_SIZE(k); /* @@ -1673,7 +1675,7 @@ static bool fix_overlapping_extents(struct btree *b, struct bkey *top; - subtract_dirty(k, KEY_SIZE(insert)); + subtract_dirty(k, KEY_START(insert), KEY_SIZE(insert)); if (bkey_written(b, k)) { /* @@ -1720,7 +1722,7 @@ static bool fix_overlapping_extents(struct btree *b, } } - subtract_dirty(k, old_size - KEY_SIZE(k)); + subtract_dirty(k, old_offset, old_size - KEY_SIZE(k)); } check_failed: @@ -1796,6 +1798,10 @@ static bool btree_insert_key(struct btree *b, struct btree_op *op, insert: shift_keys(b, m, k); copy: bkey_copy(m, k); merged: + if (KEY_DIRTY(k)) + bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k), + KEY_START(k), KEY_SIZE(k)); + bch_check_keys(b, "%u for %s", status, op_type(op)); if (b->level && !KEY_OFFSET(k)) diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 695469958c1e..017c95fced8e 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -10,6 +10,7 @@ #include "btree.h" #include "debug.h" #include "request.h" +#include "writeback.h" #include #include @@ -1044,7 +1045,7 @@ static void request_write(struct cached_dev *dc, struct search *s) closure_bio_submit(bio, cl, s->d); } else { s->op.cache_bio = bio; - bch_writeback_add(dc, bio_sectors(bio)); + bch_writeback_add(dc); } out: closure_call(&s->op.cl, bch_insert_data, NULL, cl); diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index dbfa1c38e85e..8c73f0c7f28a 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -10,6 +10,7 @@ #include "btree.h" #include "debug.h" #include "request.h" +#include "writeback.h" #include #include @@ -744,13 +745,35 @@ static void bcache_device_free(struct bcache_device *d) mempool_destroy(d->unaligned_bvec); if (d->bio_split) bioset_free(d->bio_split); + if (is_vmalloc_addr(d->stripe_sectors_dirty)) + vfree(d->stripe_sectors_dirty); + else + kfree(d->stripe_sectors_dirty); closure_debug_destroy(&d->cl); } -static int bcache_device_init(struct bcache_device *d, unsigned block_size) +static int bcache_device_init(struct bcache_device *d, unsigned block_size, + sector_t sectors) { struct request_queue *q; + size_t n; + + if (!d->stripe_size_bits) + d->stripe_size_bits = 31; + + d->nr_stripes = round_up(sectors, 1 << d->stripe_size_bits) >> + d->stripe_size_bits; + + if (!d->nr_stripes || d->nr_stripes > SIZE_MAX / sizeof(atomic_t)) + return -ENOMEM; + + n = d->nr_stripes * sizeof(atomic_t); + d->stripe_sectors_dirty = n < PAGE_SIZE << 6 + ? kzalloc(n, GFP_KERNEL) + : vzalloc(n); + if (!d->stripe_sectors_dirty) + return -ENOMEM; if (!(d->bio_split = bioset_create(4, offsetof(struct bbio, bio))) || !(d->unaligned_bvec = mempool_create_kmalloc_pool(1, @@ -760,6 +783,7 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size) !(q = blk_alloc_queue(GFP_KERNEL))) return -ENOMEM; + set_capacity(d->disk, sectors); snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", bcache_minor); d->disk->major = bcache_major; @@ -1047,7 +1071,8 @@ static int cached_dev_init(struct cached_dev *dc, unsigned block_size) hlist_add_head(&io->hash, dc->io_hash + RECENT_IO); } - ret = bcache_device_init(&dc->disk, block_size); + ret = bcache_device_init(&dc->disk, block_size, + dc->bdev->bd_part->nr_sects - dc->sb.data_offset); if (ret) return ret; @@ -1146,11 +1171,10 @@ static int flash_dev_run(struct cache_set *c, struct uuid_entry *u) kobject_init(&d->kobj, &bch_flash_dev_ktype); - if (bcache_device_init(d, block_bytes(c))) + if (bcache_device_init(d, block_bytes(c), u->sectors)) goto err; bcache_device_attach(d, c, u - c->uuids); - set_capacity(d->disk, u->sectors); bch_flash_dev_request_init(d); add_disk(d->disk); diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index f5c2d8695230..cf8d91ec3238 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -9,6 +9,7 @@ #include "sysfs.h" #include "btree.h" #include "request.h" +#include "writeback.h" #include #include @@ -128,7 +129,7 @@ SHOW(__bch_cached_dev) char derivative[20]; char target[20]; bch_hprint(dirty, - atomic_long_read(&dc->disk.sectors_dirty) << 9); + bcache_dev_sectors_dirty(&dc->disk) << 9); bch_hprint(derivative, dc->writeback_rate_derivative << 9); bch_hprint(target, dc->writeback_rate_target << 9); @@ -144,7 +145,7 @@ SHOW(__bch_cached_dev) } sysfs_hprint(dirty_data, - atomic_long_read(&dc->disk.sectors_dirty) << 9); + bcache_dev_sectors_dirty(&dc->disk) << 9); var_printf(sequential_merge, "%i"); var_hprint(sequential_cutoff); diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 553949eefd51..dd815475c524 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -9,6 +9,7 @@ #include "bcache.h" #include "btree.h" #include "debug.h" +#include "writeback.h" #include @@ -38,7 +39,7 @@ static void __update_writeback_rate(struct cached_dev *dc) int change = 0; int64_t error; - int64_t dirty = atomic_long_read(&dc->disk.sectors_dirty); + int64_t dirty = bcache_dev_sectors_dirty(&dc->disk); int64_t derivative = dirty - dc->disk.sectors_dirty_last; dc->disk.sectors_dirty_last = dirty; @@ -183,10 +184,8 @@ void bch_writeback_queue(struct cached_dev *dc) } } -void bch_writeback_add(struct cached_dev *dc, unsigned sectors) +void bch_writeback_add(struct cached_dev *dc) { - atomic_long_add(sectors, &dc->disk.sectors_dirty); - if (!atomic_read(&dc->has_dirty) && !atomic_xchg(&dc->has_dirty, 1)) { atomic_inc(&dc->count); @@ -205,6 +204,34 @@ void bch_writeback_add(struct cached_dev *dc, unsigned sectors) } } +void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned inode, + uint64_t offset, int nr_sectors) +{ + struct bcache_device *d = c->devices[inode]; + unsigned stripe_size, stripe_offset; + uint64_t stripe; + + if (!d) + return; + + stripe_size = 1 << d->stripe_size_bits; + stripe = offset >> d->stripe_size_bits; + stripe_offset = offset & (stripe_size - 1); + + while (nr_sectors) { + int s = min_t(unsigned, abs(nr_sectors), + stripe_size - stripe_offset); + + if (nr_sectors < 0) + s = -s; + + atomic_add(s, d->stripe_sectors_dirty + stripe); + nr_sectors -= s; + stripe_offset = 0; + stripe++; + } +} + /* Background writeback - IO loop */ static void dirty_io_destructor(struct closure *cl) @@ -392,8 +419,9 @@ static int bch_btree_sectors_dirty_init(struct btree *b, struct btree_op *op, break; if (KEY_DIRTY(k)) - atomic_long_add(KEY_SIZE(k), - &dc->disk.sectors_dirty); + bcache_dev_sectors_dirty_add(b->c, dc->disk.id, + KEY_START(k), + KEY_SIZE(k)); } else { btree(sectors_dirty_init, k, b, op, dc); if (KEY_INODE(k) > dc->disk.id) diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h new file mode 100644 index 000000000000..5ce9771df047 --- /dev/null +++ b/drivers/md/bcache/writeback.h @@ -0,0 +1,21 @@ +#ifndef _BCACHE_WRITEBACK_H +#define _BCACHE_WRITEBACK_H + +static inline uint64_t bcache_dev_sectors_dirty(struct bcache_device *d) +{ + uint64_t i, ret = 0; + + for (i = 0; i < d->nr_stripes; i++) + ret += atomic_read(d->stripe_sectors_dirty + i); + + return ret; +} + +void bcache_dev_sectors_dirty_add(struct cache_set *, unsigned, uint64_t, int); +void bch_writeback_queue(struct cached_dev *); +void bch_writeback_add(struct cached_dev *); + +void bch_sectors_dirty_init(struct cached_dev *dc); +void bch_cached_dev_writeback_init(struct cached_dev *); + +#endif -- cgit v1.2.3 From 72c270612bd33192fa836ad0f2939af1ca218292 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 5 Jun 2013 06:24:39 -0700 Subject: bcache: Write out full stripes Now that we're tracking dirty data per stripe, we can add two optimizations for raid5/6: * If a stripe is already dirty, force writes to that stripe to writeback mode - to help build up full stripes of dirty data * When flushing dirty data, preferentially write out full stripes first if there are any. Signed-off-by: Kent Overstreet --- drivers/md/bcache/bcache.h | 3 +-- drivers/md/bcache/btree.c | 19 ++++++++++--------- drivers/md/bcache/btree.h | 9 +++++---- drivers/md/bcache/debug.c | 4 ++-- drivers/md/bcache/movinggc.c | 5 +++-- drivers/md/bcache/request.c | 23 +++++++--------------- drivers/md/bcache/sysfs.c | 8 ++++++++ drivers/md/bcache/writeback.c | 44 +++++++++++++++++++++++++++++++++++++++++-- drivers/md/bcache/writeback.h | 43 ++++++++++++++++++++++++++++++++++++++++++ 9 files changed, 121 insertions(+), 37 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index dbddef0cdb59..342ba86c6e4f 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -387,8 +387,6 @@ struct keybuf_key { typedef bool (keybuf_pred_fn)(struct keybuf *, struct bkey *); struct keybuf { - keybuf_pred_fn *key_predicate; - struct bkey last_scanned; spinlock_t lock; @@ -532,6 +530,7 @@ struct cached_dev { unsigned sequential_merge:1; unsigned verify:1; + unsigned partial_stripes_expensive:1; unsigned writeback_metadata:1; unsigned writeback_running:1; unsigned char writeback_percent; diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index b93cf56260a4..09fb8a2f43da 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -2252,7 +2252,8 @@ static inline int keybuf_nonoverlapping_cmp(struct keybuf_key *l, } static int bch_btree_refill_keybuf(struct btree *b, struct btree_op *op, - struct keybuf *buf, struct bkey *end) + struct keybuf *buf, struct bkey *end, + keybuf_pred_fn *pred) { struct btree_iter iter; bch_btree_iter_init(b, &iter, &buf->last_scanned); @@ -2271,7 +2272,7 @@ static int bch_btree_refill_keybuf(struct btree *b, struct btree_op *op, if (bkey_cmp(&buf->last_scanned, end) >= 0) break; - if (buf->key_predicate(buf, k)) { + if (pred(buf, k)) { struct keybuf_key *w; spin_lock(&buf->lock); @@ -2290,7 +2291,7 @@ static int bch_btree_refill_keybuf(struct btree *b, struct btree_op *op, if (!k) break; - btree(refill_keybuf, k, b, op, buf, end); + btree(refill_keybuf, k, b, op, buf, end, pred); /* * Might get an error here, but can't really do anything * and it'll get logged elsewhere. Just read what we @@ -2308,7 +2309,7 @@ static int bch_btree_refill_keybuf(struct btree *b, struct btree_op *op, } void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf, - struct bkey *end) + struct bkey *end, keybuf_pred_fn *pred) { struct bkey start = buf->last_scanned; struct btree_op op; @@ -2316,7 +2317,7 @@ void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf, cond_resched(); - btree_root(refill_keybuf, c, &op, buf, end); + btree_root(refill_keybuf, c, &op, buf, end, pred); closure_sync(&op.cl); pr_debug("found %s keys from %llu:%llu to %llu:%llu", @@ -2402,7 +2403,8 @@ struct keybuf_key *bch_keybuf_next(struct keybuf *buf) struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *c, struct keybuf *buf, - struct bkey *end) + struct bkey *end, + keybuf_pred_fn *pred) { struct keybuf_key *ret; @@ -2416,15 +2418,14 @@ struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *c, break; } - bch_refill_keybuf(c, buf, end); + bch_refill_keybuf(c, buf, end, pred); } return ret; } -void bch_keybuf_init(struct keybuf *buf, keybuf_pred_fn *fn) +void bch_keybuf_init(struct keybuf *buf) { - buf->key_predicate = fn; buf->last_scanned = MAX_KEY; buf->keys = RB_ROOT; diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index 2b016b93cad4..f66d69a7baf1 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -391,13 +391,14 @@ void bch_moving_gc(struct closure *); int bch_btree_check(struct cache_set *, struct btree_op *); uint8_t __bch_btree_mark_key(struct cache_set *, int, struct bkey *); -void bch_keybuf_init(struct keybuf *, keybuf_pred_fn *); -void bch_refill_keybuf(struct cache_set *, struct keybuf *, struct bkey *); +void bch_keybuf_init(struct keybuf *); +void bch_refill_keybuf(struct cache_set *, struct keybuf *, struct bkey *, + keybuf_pred_fn *); bool bch_keybuf_check_overlapping(struct keybuf *, struct bkey *, struct bkey *); void bch_keybuf_del(struct keybuf *, struct keybuf_key *); struct keybuf_key *bch_keybuf_next(struct keybuf *); -struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *, - struct keybuf *, struct bkey *); +struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *, struct keybuf *, + struct bkey *, keybuf_pred_fn *); #endif diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c index 82e3a07771ec..1c8fd319846e 100644 --- a/drivers/md/bcache/debug.c +++ b/drivers/md/bcache/debug.c @@ -357,7 +357,7 @@ static ssize_t bch_dump_read(struct file *file, char __user *buf, if (i->bytes) break; - w = bch_keybuf_next_rescan(i->c, &i->keys, &MAX_KEY); + w = bch_keybuf_next_rescan(i->c, &i->keys, &MAX_KEY, dump_pred); if (!w) break; @@ -380,7 +380,7 @@ static int bch_dump_open(struct inode *inode, struct file *file) file->private_data = i; i->c = c; - bch_keybuf_init(&i->keys, dump_pred); + bch_keybuf_init(&i->keys); i->keys.last_scanned = KEY(0, 0, 0); return 0; diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c index 04f6b97ffda6..a241e9fd4f7f 100644 --- a/drivers/md/bcache/movinggc.c +++ b/drivers/md/bcache/movinggc.c @@ -136,7 +136,8 @@ static void read_moving(struct closure *cl) /* XXX: if we error, background writeback could stall indefinitely */ while (!test_bit(CACHE_SET_STOPPING, &c->flags)) { - w = bch_keybuf_next_rescan(c, &c->moving_gc_keys, &MAX_KEY); + w = bch_keybuf_next_rescan(c, &c->moving_gc_keys, + &MAX_KEY, moving_pred); if (!w) break; @@ -248,5 +249,5 @@ void bch_moving_gc(struct closure *cl) void bch_moving_init_cache_set(struct cache_set *c) { - bch_keybuf_init(&c->moving_gc_keys, moving_pred); + bch_keybuf_init(&c->moving_gc_keys); } diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 017c95fced8e..17bd59704eba 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -22,8 +22,6 @@ #define CUTOFF_CACHE_ADD 95 #define CUTOFF_CACHE_READA 90 -#define CUTOFF_WRITEBACK 50 -#define CUTOFF_WRITEBACK_SYNC 75 struct kmem_cache *bch_search_cache; @@ -998,17 +996,6 @@ static void cached_dev_write_complete(struct closure *cl) cached_dev_bio_complete(cl); } -static bool should_writeback(struct cached_dev *dc, struct bio *bio) -{ - unsigned threshold = (bio->bi_rw & REQ_SYNC) - ? CUTOFF_WRITEBACK_SYNC - : CUTOFF_WRITEBACK; - - return !atomic_read(&dc->disk.detaching) && - cache_mode(dc, bio) == CACHE_MODE_WRITEBACK && - dc->disk.c->gc_stats.in_use < threshold; -} - static void request_write(struct cached_dev *dc, struct search *s) { struct closure *cl = &s->cl; @@ -1030,12 +1017,16 @@ static void request_write(struct cached_dev *dc, struct search *s) if (bio->bi_rw & REQ_DISCARD) goto skip; + if (should_writeback(dc, s->orig_bio, + cache_mode(dc, bio), + s->op.skip)) { + s->op.skip = false; + s->writeback = true; + } + if (s->op.skip) goto skip; - if (should_writeback(dc, s->orig_bio)) - s->writeback = true; - trace_bcache_write(s->orig_bio, s->writeback, s->op.skip); if (!s->writeback) { diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index cf8d91ec3238..70c6dff0d0cd 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -81,6 +81,9 @@ rw_attribute(writeback_rate_p_term_inverse); rw_attribute(writeback_rate_d_smooth); read_attribute(writeback_rate_debug); +read_attribute(stripe_size); +read_attribute(partial_stripes_expensive); + rw_attribute(synchronous); rw_attribute(journal_delay_ms); rw_attribute(discard); @@ -147,6 +150,9 @@ SHOW(__bch_cached_dev) sysfs_hprint(dirty_data, bcache_dev_sectors_dirty(&dc->disk) << 9); + sysfs_hprint(stripe_size, (1 << dc->disk.stripe_size_bits) << 9); + var_printf(partial_stripes_expensive, "%u"); + var_printf(sequential_merge, "%i"); var_hprint(sequential_cutoff); var_hprint(readahead); @@ -286,6 +292,8 @@ static struct attribute *bch_cached_dev_files[] = { &sysfs_writeback_rate_d_smooth, &sysfs_writeback_rate_debug, &sysfs_dirty_data, + &sysfs_stripe_size, + &sysfs_partial_stripes_expensive, &sysfs_sequential_cutoff, &sysfs_sequential_merge, &sysfs_clear_stats, diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index dd815475c524..d81ee5ccc726 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -108,6 +108,31 @@ static bool dirty_pred(struct keybuf *buf, struct bkey *k) return KEY_DIRTY(k); } +static bool dirty_full_stripe_pred(struct keybuf *buf, struct bkey *k) +{ + uint64_t stripe; + unsigned nr_sectors = KEY_SIZE(k); + struct cached_dev *dc = container_of(buf, struct cached_dev, + writeback_keys); + unsigned stripe_size = 1 << dc->disk.stripe_size_bits; + + if (!KEY_DIRTY(k)) + return false; + + stripe = KEY_START(k) >> dc->disk.stripe_size_bits; + while (1) { + if (atomic_read(dc->disk.stripe_sectors_dirty + stripe) != + stripe_size) + return false; + + if (nr_sectors <= stripe_size) + return true; + + nr_sectors -= stripe_size; + stripe++; + } +} + static void dirty_init(struct keybuf_key *w) { struct dirty_io *io = w->private; @@ -152,7 +177,22 @@ static void refill_dirty(struct closure *cl) searched_from_start = true; } - bch_refill_keybuf(dc->disk.c, buf, &end); + if (dc->partial_stripes_expensive) { + uint64_t i; + + for (i = 0; i < dc->disk.nr_stripes; i++) + if (atomic_read(dc->disk.stripe_sectors_dirty + i) == + 1 << dc->disk.stripe_size_bits) + goto full_stripes; + + goto normal_refill; +full_stripes: + bch_refill_keybuf(dc->disk.c, buf, &end, + dirty_full_stripe_pred); + } else { +normal_refill: + bch_refill_keybuf(dc->disk.c, buf, &end, dirty_pred); + } if (bkey_cmp(&buf->last_scanned, &end) >= 0 && searched_from_start) { /* Searched the entire btree - delay awhile */ @@ -446,7 +486,7 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc) closure_init_unlocked(&dc->writeback); init_rwsem(&dc->writeback_lock); - bch_keybuf_init(&dc->writeback_keys, dirty_pred); + bch_keybuf_init(&dc->writeback_keys); dc->writeback_metadata = true; dc->writeback_running = true; diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h index 5ce9771df047..c91f61bb95b6 100644 --- a/drivers/md/bcache/writeback.h +++ b/drivers/md/bcache/writeback.h @@ -1,6 +1,9 @@ #ifndef _BCACHE_WRITEBACK_H #define _BCACHE_WRITEBACK_H +#define CUTOFF_WRITEBACK 40 +#define CUTOFF_WRITEBACK_SYNC 70 + static inline uint64_t bcache_dev_sectors_dirty(struct bcache_device *d) { uint64_t i, ret = 0; @@ -11,6 +14,46 @@ static inline uint64_t bcache_dev_sectors_dirty(struct bcache_device *d) return ret; } +static inline bool bcache_dev_stripe_dirty(struct bcache_device *d, + uint64_t offset, + unsigned nr_sectors) +{ + uint64_t stripe = offset >> d->stripe_size_bits; + + while (1) { + if (atomic_read(d->stripe_sectors_dirty + stripe)) + return true; + + if (nr_sectors <= 1 << d->stripe_size_bits) + return false; + + nr_sectors -= 1 << d->stripe_size_bits; + stripe++; + } +} + +static inline bool should_writeback(struct cached_dev *dc, struct bio *bio, + unsigned cache_mode, bool would_skip) +{ + unsigned in_use = dc->disk.c->gc_stats.in_use; + + if (cache_mode != CACHE_MODE_WRITEBACK || + atomic_read(&dc->disk.detaching) || + in_use > CUTOFF_WRITEBACK_SYNC) + return false; + + if (dc->partial_stripes_expensive && + bcache_dev_stripe_dirty(&dc->disk, bio->bi_sector, + bio_sectors(bio))) + return true; + + if (would_skip) + return false; + + return bio->bi_rw & REQ_SYNC || + in_use <= CUTOFF_WRITEBACK; +} + void bcache_dev_sectors_dirty_add(struct cache_set *, unsigned, uint64_t, int); void bch_writeback_queue(struct cached_dev *); void bch_writeback_add(struct cached_dev *); -- cgit v1.2.3 From a25c32bedeff3573b53572b87bcafe10ec5b75a9 Mon Sep 17 00:00:00 2001 From: Gabriel de Perthuis Date: Fri, 7 Jun 2013 23:27:01 +0200 Subject: bcache: Send a uevent with a cached device's UUID Signed-off-by: Gabriel de Perthuis --- drivers/md/bcache/super.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 8c73f0c7f28a..a104dad2c7fa 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -825,6 +825,11 @@ static void calc_cached_dev_sectors(struct cache_set *c) void bch_cached_dev_run(struct cached_dev *dc) { struct bcache_device *d = &dc->disk; + char *env[] = { + "DRIVER=bcache", + kasprintf(GFP_KERNEL, "CACHED_UUID=%pU", dc->sb.uuid), + NULL + }; if (atomic_xchg(&dc->running, 1)) return; @@ -841,10 +846,11 @@ void bch_cached_dev_run(struct cached_dev *dc) add_disk(d->disk); bd_link_disk_holder(dc->bdev, dc->disk.disk); -#if 0 - char *env[] = { "SYMLINK=label" , NULL }; + /* won't show up in the uevent file, use udevadm monitor -e instead + * only class / kset properties are persistent */ kobject_uevent_env(&disk_to_dev(d->disk)->kobj, KOBJ_CHANGE, env); -#endif + kfree(env[1]); + if (sysfs_create_link(&d->kobj, &disk_to_dev(d->disk)->kobj, "dev") || sysfs_create_link(&disk_to_dev(d->disk)->kobj, &d->kobj, "bcache")) pr_debug("error creating sysfs link"); -- cgit v1.2.3 From ab9e14002e271eba41f7f9ab7e9b03cac4adc22d Mon Sep 17 00:00:00 2001 From: Gabriel de Perthuis Date: Sun, 9 Jun 2013 00:54:48 +0200 Subject: bcache: Send label uevents Signed-off-by: Gabriel de Perthuis Signed-off-by: Kent Overstreet --- drivers/md/bcache/super.c | 9 ++++++++- drivers/md/bcache/sysfs.c | 9 +++++++++ 2 files changed, 17 insertions(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index a104dad2c7fa..cff2d182dfb0 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -825,12 +825,18 @@ static void calc_cached_dev_sectors(struct cache_set *c) void bch_cached_dev_run(struct cached_dev *dc) { struct bcache_device *d = &dc->disk; + char buf[SB_LABEL_SIZE + 1]; char *env[] = { "DRIVER=bcache", kasprintf(GFP_KERNEL, "CACHED_UUID=%pU", dc->sb.uuid), - NULL + NULL, + NULL, }; + memcpy(buf, dc->sb.label, SB_LABEL_SIZE); + buf[SB_LABEL_SIZE] = '\0'; + env[2] = kasprintf(GFP_KERNEL, "CACHED_LABEL=%s", buf); + if (atomic_xchg(&dc->running, 1)) return; @@ -850,6 +856,7 @@ void bch_cached_dev_run(struct cached_dev *dc) * only class / kset properties are persistent */ kobject_uevent_env(&disk_to_dev(d->disk)->kobj, KOBJ_CHANGE, env); kfree(env[1]); + kfree(env[2]); if (sysfs_create_link(&d->kobj, &disk_to_dev(d->disk)->kobj, "dev") || sysfs_create_link(&disk_to_dev(d->disk)->kobj, &d->kobj, "bcache")) diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 70c6dff0d0cd..dd3f00a42729 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -178,6 +178,7 @@ STORE(__cached_dev) disk.kobj); unsigned v = size; struct cache_set *c; + struct kobj_uevent_env *env; #define d_strtoul(var) sysfs_strtoul(var, dc->var) #define d_strtoi_h(var) sysfs_hatoi(var, dc->var) @@ -222,6 +223,7 @@ STORE(__cached_dev) } if (attr == &sysfs_label) { + /* note: endlines are preserved */ memcpy(dc->sb.label, buf, SB_LABEL_SIZE); bch_write_bdev_super(dc, NULL); if (dc->disk.c) { @@ -229,6 +231,13 @@ STORE(__cached_dev) buf, SB_LABEL_SIZE); bch_uuid_write(dc->disk.c); } + env = kzalloc(sizeof(struct kobj_uevent_env), GFP_KERNEL); + add_uevent_var(env, "DRIVER=bcache"); + add_uevent_var(env, "CACHED_UUID=%pU", dc->sb.uuid), + add_uevent_var(env, "CACHED_LABEL=%s", buf); + kobject_uevent_env( + &disk_to_dev(dc->disk.disk)->kobj, KOBJ_CHANGE, env->envp); + kfree(env); } if (attr == &sysfs_attach) { -- cgit v1.2.3 From e49c7c374e7aacd1f04ecbc21d9dbbeeea4a77d6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 26 Jun 2013 17:25:38 -0700 Subject: bcache: FUA fixes Journal writes need to be marked FUA, not just REQ_FLUSH. And btree node writes have... weird ordering requirements. Signed-off-by: Kent Overstreet --- drivers/md/bcache/btree.c | 25 ++++++++++++++++++++++--- drivers/md/bcache/journal.c | 2 +- drivers/md/bcache/request.c | 13 ++++++++++++- 3 files changed, 35 insertions(+), 5 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 09fb8a2f43da..a6ad49ac5f2b 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -328,10 +328,25 @@ static void do_btree_node_write(struct btree *b) b->bio->bi_end_io = btree_node_write_endio; b->bio->bi_private = &b->io.cl; - b->bio->bi_rw = REQ_META|WRITE_SYNC; - b->bio->bi_size = set_blocks(i, b->c) * block_bytes(b->c); + b->bio->bi_rw = REQ_META|WRITE_SYNC|REQ_FUA; + b->bio->bi_size = set_blocks(i, b->c) * block_bytes(b->c); bch_bio_map(b->bio, i); + /* + * If we're appending to a leaf node, we don't technically need FUA - + * this write just needs to be persisted before the next journal write, + * which will be marked FLUSH|FUA. + * + * Similarly if we're writing a new btree root - the pointer is going to + * be in the next journal entry. + * + * But if we're writing a new btree node (that isn't a root) or + * appending to a non leaf btree node, we need either FUA or a flush + * when we write the parent with the new pointer. FUA is cheaper than a + * flush, and writes appending to leaf nodes aren't blocking anything so + * just make all btree node writes FUA to keep things sane. + */ + bkey_copy(&k.key, &b->key); SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) + bset_offset(b, i)); @@ -2092,6 +2107,9 @@ int bch_btree_insert(struct btree_op *op, struct cache_set *c) void bch_btree_set_root(struct btree *b) { unsigned i; + struct closure cl; + + closure_init_stack(&cl); trace_bcache_btree_set_root(b); @@ -2107,7 +2125,8 @@ void bch_btree_set_root(struct btree *b) b->c->root = b; __bkey_put(b->c, &b->key); - bch_journal_meta(b->c, NULL); + bch_journal_meta(b->c, &cl); + closure_sync(&cl); } /* Cache lookup */ diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 5ca22149b749..4b250667bb7f 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -620,7 +620,7 @@ static void journal_write_unlocked(struct closure *cl) bio_reset(bio); bio->bi_sector = PTR_OFFSET(k, i); bio->bi_bdev = ca->bdev; - bio->bi_rw = REQ_WRITE|REQ_SYNC|REQ_META|REQ_FLUSH; + bio->bi_rw = REQ_WRITE|REQ_SYNC|REQ_META|REQ_FLUSH|REQ_FUA; bio->bi_size = sectors << 9; bio->bi_end_io = journal_write_endio; diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 17bd59704eba..bcdf1f782c3e 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -1035,8 +1035,19 @@ static void request_write(struct cached_dev *dc, struct search *s) closure_bio_submit(bio, cl, s->d); } else { - s->op.cache_bio = bio; bch_writeback_add(dc); + + if (s->op.flush_journal) { + /* Also need to send a flush to the backing device */ + s->op.cache_bio = bio_clone_bioset(bio, GFP_NOIO, + dc->disk.bio_split); + + bio->bi_size = 0; + bio->bi_vcnt = 0; + closure_bio_submit(bio, cl, s->d); + } else { + s->op.cache_bio = bio; + } } out: closure_call(&s->op.cl, bch_insert_data, NULL, cl); -- cgit v1.2.3 From 36c9ea9837c1cb21c778781495101eaff7e5eb56 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 3 Jun 2013 13:04:56 -0700 Subject: bcache: Document shrinker reserve better Signed-off-by: Kent Overstreet --- drivers/md/bcache/btree.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'drivers/md') diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index a6ad49ac5f2b..04636a70ffaa 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -622,6 +622,13 @@ static int bch_mca_shrink(struct shrinker *shrink, struct shrink_control *sc) else if (!mutex_trylock(&c->bucket_lock)) return -1; + /* + * It's _really_ critical that we don't free too many btree nodes - we + * have to always leave ourselves a reserve. The reserve is how we + * guarantee that allocating memory for a new btree node can always + * succeed, so that inserting keys into the btree can always succeed and + * IO can always make forward progress: + */ nr /= c->btree_pages; nr = min_t(unsigned long, nr, mca_can_free(c)); -- cgit v1.2.3 From f3059a54610f6c516c0942d58b9435921768ce2d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 15 May 2013 17:13:45 -0700 Subject: bcache: Delete fuzz tester This code has rotted and it hasn't been used in ages anyways. Signed-off-by: Kent Overstreet --- drivers/md/bcache/btree.c | 4 +- drivers/md/bcache/btree.h | 2 - drivers/md/bcache/debug.c | 148 ---------------------------------------------- 3 files changed, 2 insertions(+), 152 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 04636a70ffaa..e0cca3673d0f 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -135,7 +135,7 @@ static uint64_t btree_csum_set(struct btree *b, struct bset *i) return crc ^ 0xffffffffffffffffULL; } -void bch_btree_node_read_done(struct btree *b) +static void bch_btree_node_read_done(struct btree *b) { const char *err = "bad btree header"; struct bset *i = b->sets[0].data; @@ -1834,7 +1834,7 @@ merged: return true; } -bool bch_btree_insert_keys(struct btree *b, struct btree_op *op) +static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op) { bool ret = false; struct bkey *k; diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index f66d69a7baf1..3333d3723633 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -369,7 +369,6 @@ static inline bool should_split(struct btree *b) } void bch_btree_node_read(struct btree *); -void bch_btree_node_read_done(struct btree *); void bch_btree_node_write(struct btree *, struct closure *); void bch_cannibalize_unlock(struct cache_set *, struct closure *); @@ -378,7 +377,6 @@ struct btree *bch_btree_node_alloc(struct cache_set *, int, struct closure *); struct btree *bch_btree_node_get(struct cache_set *, struct bkey *, int, struct btree_op *); -bool bch_btree_insert_keys(struct btree *, struct btree_op *); bool bch_btree_insert_check_key(struct btree *, struct btree_op *, struct bio *); int bch_btree_insert(struct btree_op *, struct cache_set *); diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c index 1c8fd319846e..ba2ceee0fdbb 100644 --- a/drivers/md/bcache/debug.c +++ b/drivers/md/bcache/debug.c @@ -412,149 +412,6 @@ void bch_debug_init_cache_set(struct cache_set *c) #endif -/* Fuzz tester has rotted: */ -#if 0 - -static ssize_t btree_fuzz(struct kobject *k, struct kobj_attribute *a, - const char *buffer, size_t size) -{ - void dump(struct btree *b) - { - struct bset *i; - - for (i = b->sets[0].data; - index(i, b) < btree_blocks(b) && - i->seq == b->sets[0].data->seq; - i = ((void *) i) + set_blocks(i, b->c) * block_bytes(b->c)) - dump_bset(b, i); - } - - struct cache_sb *sb; - struct cache_set *c; - struct btree *all[3], *b, *fill, *orig; - int j; - - struct btree_op op; - bch_btree_op_init_stack(&op); - - sb = kzalloc(sizeof(struct cache_sb), GFP_KERNEL); - if (!sb) - return -ENOMEM; - - sb->bucket_size = 128; - sb->block_size = 4; - - c = bch_cache_set_alloc(sb); - if (!c) - return -ENOMEM; - - for (j = 0; j < 3; j++) { - BUG_ON(list_empty(&c->btree_cache)); - all[j] = list_first_entry(&c->btree_cache, struct btree, list); - list_del_init(&all[j]->list); - - all[j]->key = KEY(0, 0, c->sb.bucket_size); - bkey_copy_key(&all[j]->key, &MAX_KEY); - } - - b = all[0]; - fill = all[1]; - orig = all[2]; - - while (1) { - for (j = 0; j < 3; j++) - all[j]->written = all[j]->nsets = 0; - - bch_bset_init_next(b); - - while (1) { - struct bset *i = write_block(b); - struct bkey *k = op.keys.top; - unsigned rand; - - bkey_init(k); - rand = get_random_int(); - - op.type = rand & 1 - ? BTREE_INSERT - : BTREE_REPLACE; - rand >>= 1; - - SET_KEY_SIZE(k, bucket_remainder(c, rand)); - rand >>= c->bucket_bits; - rand &= 1024 * 512 - 1; - rand += c->sb.bucket_size; - SET_KEY_OFFSET(k, rand); -#if 0 - SET_KEY_PTRS(k, 1); -#endif - bch_keylist_push(&op.keys); - bch_btree_insert_keys(b, &op); - - if (should_split(b) || - set_blocks(i, b->c) != - __set_blocks(i, i->keys + 15, b->c)) { - i->csum = csum_set(i); - - memcpy(write_block(fill), - i, set_bytes(i)); - - b->written += set_blocks(i, b->c); - fill->written = b->written; - if (b->written == btree_blocks(b)) - break; - - bch_btree_sort_lazy(b); - bch_bset_init_next(b); - } - } - - memcpy(orig->sets[0].data, - fill->sets[0].data, - btree_bytes(c)); - - bch_btree_sort(b); - fill->written = 0; - bch_btree_node_read_done(fill); - - if (b->sets[0].data->keys != fill->sets[0].data->keys || - memcmp(b->sets[0].data->start, - fill->sets[0].data->start, - b->sets[0].data->keys * sizeof(uint64_t))) { - struct bset *i = b->sets[0].data; - struct bkey *k, *l; - - for (k = i->start, - l = fill->sets[0].data->start; - k < end(i); - k = bkey_next(k), l = bkey_next(l)) - if (bkey_cmp(k, l) || - KEY_SIZE(k) != KEY_SIZE(l)) { - char buf1[80]; - char buf2[80]; - - bch_bkey_to_text(buf1, sizeof(buf1), k); - bch_bkey_to_text(buf2, sizeof(buf2), l); - - pr_err("key %zi differs: %s != %s", - (uint64_t *) k - i->d, - buf1, buf2); - } - - for (j = 0; j < 3; j++) { - pr_err("**** Set %i ****", j); - dump(all[j]); - } - panic("\n"); - } - - pr_info("fuzz complete: %i keys", b->sets[0].data->keys); - } -} - -kobj_attribute_write(fuzz, btree_fuzz); -#endif - void bch_debug_exit(void) { if (!IS_ERR_OR_NULL(debug)) @@ -564,11 +421,6 @@ void bch_debug_exit(void) int __init bch_debug_init(struct kobject *kobj) { int ret = 0; -#if 0 - ret = sysfs_create_file(kobj, &ksysfs_fuzz.attr); - if (ret) - return ret; -#endif debug = debugfs_create_dir("bcache", NULL); return ret; -- cgit v1.2.3 From 8e51e414a3c6d92ef2cc41720c67342a8e2c0bf7 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 6 Jun 2013 18:15:57 -0700 Subject: bcache: Use standard utility code Some of bcache's utility code has made it into the rest of the kernel, so drop the bcache versions. Bcache used to have a workaround for allocating from a bio set under generic_make_request() (if you allocated more than once, the bios you already allocated would get stuck on current->bio_list when you submitted, and you'd risk deadlock) - bcache would mask out __GFP_WAIT when allocating bios under generic_make_request() so that allocation could fail and it could retry from workqueue. But bio_alloc_bioset() has a workaround now, so we can drop this hack and the associated error handling. Signed-off-by: Kent Overstreet --- drivers/md/bcache/btree.c | 7 +--- drivers/md/bcache/debug.c | 2 +- drivers/md/bcache/io.c | 64 +++++++++++-------------------- drivers/md/bcache/movinggc.c | 7 ++-- drivers/md/bcache/request.c | 87 +++++++++---------------------------------- drivers/md/bcache/util.c | 17 --------- drivers/md/bcache/util.h | 4 -- drivers/md/bcache/writeback.c | 7 ++-- 8 files changed, 51 insertions(+), 144 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index e0cca3673d0f..15b58239c683 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -350,7 +350,7 @@ static void do_btree_node_write(struct btree *b) bkey_copy(&k.key, &b->key); SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) + bset_offset(b, i)); - if (!bch_bio_alloc_pages(b->bio, GFP_NOIO)) { + if (!bio_alloc_pages(b->bio, GFP_NOIO)) { int j; struct bio_vec *bv; void *base = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1)); @@ -1865,7 +1865,7 @@ bool bch_btree_insert_check_key(struct btree *b, struct btree_op *op, should_split(b)) goto out; - op->replace = KEY(op->inode, bio_end(bio), bio_sectors(bio)); + op->replace = KEY(op->inode, bio_end_sector(bio), bio_sectors(bio)); SET_KEY_PTRS(&op->replace, 1); get_random_bytes(&op->replace.ptr[0], sizeof(uint64_t)); @@ -2194,9 +2194,6 @@ static int submit_partial_cache_hit(struct btree *b, struct btree_op *op, KEY_OFFSET(k) - bio->bi_sector); n = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split); - if (!n) - return -EAGAIN; - if (n == bio) op->lookup_done = true; diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c index ba2ceee0fdbb..88e6411eab4f 100644 --- a/drivers/md/bcache/debug.c +++ b/drivers/md/bcache/debug.c @@ -199,7 +199,7 @@ void bch_data_verify(struct search *s) if (!check) return; - if (bch_bio_alloc_pages(check, GFP_NOIO)) + if (bio_alloc_pages(check, GFP_NOIO)) goto out_put; check->bi_rw = READ_SYNC; diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c index 0f6d69658b61..9056632995b1 100644 --- a/drivers/md/bcache/io.c +++ b/drivers/md/bcache/io.c @@ -68,13 +68,6 @@ static void bch_generic_make_request_hack(struct bio *bio) * The newly allocated bio will point to @bio's bi_io_vec, if the split was on a * bvec boundry; it is the caller's responsibility to ensure that @bio is not * freed before the split. - * - * If bch_bio_split() is running under generic_make_request(), it's not safe to - * allocate more than one bio from the same bio set. Therefore, if it is running - * under generic_make_request() it masks out __GFP_WAIT when doing the - * allocation. The caller must check for failure if there's any possibility of - * it being called from under generic_make_request(); it is then the caller's - * responsibility to retry from a safe context (by e.g. punting to workqueue). */ struct bio *bch_bio_split(struct bio *bio, int sectors, gfp_t gfp, struct bio_set *bs) @@ -85,15 +78,6 @@ struct bio *bch_bio_split(struct bio *bio, int sectors, BUG_ON(sectors <= 0); - /* - * If we're being called from underneath generic_make_request() and we - * already allocated any bios from this bio set, we risk deadlock if we - * use the mempool. So instead, we possibly fail and let the caller punt - * to workqueue or somesuch and retry in a safe context. - */ - if (current->bio_list) - gfp &= ~__GFP_WAIT; - if (sectors >= bio_sectors(bio)) return bio; @@ -164,17 +148,18 @@ static unsigned bch_bio_max_sectors(struct bio *bio) struct request_queue *q = bdev_get_queue(bio->bi_bdev); unsigned max_segments = min_t(unsigned, BIO_MAX_PAGES, queue_max_segments(q)); - struct bio_vec *bv, *end = bio_iovec(bio) + - min_t(int, bio_segments(bio), max_segments); if (bio->bi_rw & REQ_DISCARD) return min(ret, q->limits.max_discard_sectors); if (bio_segments(bio) > max_segments || q->merge_bvec_fn) { + struct bio_vec *bv; + int i, seg = 0; + ret = 0; - for (bv = bio_iovec(bio); bv < end; bv++) { + bio_for_each_segment(bv, bio, i) { struct bvec_merge_data bvm = { .bi_bdev = bio->bi_bdev, .bi_sector = bio->bi_sector, @@ -182,10 +167,14 @@ static unsigned bch_bio_max_sectors(struct bio *bio) .bi_rw = bio->bi_rw, }; + if (seg == max_segments) + break; + if (q->merge_bvec_fn && q->merge_bvec_fn(q, &bvm, bv) < (int) bv->bv_len) break; + seg++; ret += bv->bv_len >> 9; } } @@ -222,30 +211,10 @@ static void bch_bio_submit_split_endio(struct bio *bio, int error) closure_put(cl); } -static void __bch_bio_submit_split(struct closure *cl) -{ - struct bio_split_hook *s = container_of(cl, struct bio_split_hook, cl); - struct bio *bio = s->bio, *n; - - do { - n = bch_bio_split(bio, bch_bio_max_sectors(bio), - GFP_NOIO, s->p->bio_split); - if (!n) - continue_at(cl, __bch_bio_submit_split, system_wq); - - n->bi_end_io = bch_bio_submit_split_endio; - n->bi_private = cl; - - closure_get(cl); - bch_generic_make_request_hack(n); - } while (n != bio); - - continue_at(cl, bch_bio_submit_split_done, NULL); -} - void bch_generic_make_request(struct bio *bio, struct bio_split_pool *p) { struct bio_split_hook *s; + struct bio *n; if (!bio_has_data(bio) && !(bio->bi_rw & REQ_DISCARD)) goto submit; @@ -254,6 +223,7 @@ void bch_generic_make_request(struct bio *bio, struct bio_split_pool *p) goto submit; s = mempool_alloc(p->bio_split_hook, GFP_NOIO); + closure_init(&s->cl, NULL); s->bio = bio; s->p = p; @@ -261,8 +231,18 @@ void bch_generic_make_request(struct bio *bio, struct bio_split_pool *p) s->bi_private = bio->bi_private; bio_get(bio); - closure_call(&s->cl, __bch_bio_submit_split, NULL, NULL); - return; + do { + n = bch_bio_split(bio, bch_bio_max_sectors(bio), + GFP_NOIO, s->p->bio_split); + + n->bi_end_io = bch_bio_submit_split_endio; + n->bi_private = &s->cl; + + closure_get(&s->cl); + bch_generic_make_request_hack(n); + } while (n != bio); + + continue_at(&s->cl, bch_bio_submit_split_done, NULL); submit: bch_generic_make_request_hack(bio); } diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c index a241e9fd4f7f..1a3b4f4786c3 100644 --- a/drivers/md/bcache/movinggc.c +++ b/drivers/md/bcache/movinggc.c @@ -46,9 +46,10 @@ static void write_moving_finish(struct closure *cl) { struct moving_io *io = container_of(cl, struct moving_io, s.cl); struct bio *bio = &io->bio.bio; - struct bio_vec *bv = bio_iovec_idx(bio, bio->bi_vcnt); + struct bio_vec *bv; + int i; - while (bv-- != bio->bi_io_vec) + bio_for_each_segment_all(bv, bio, i) __free_page(bv->bv_page); if (io->s.op.insert_collision) @@ -158,7 +159,7 @@ static void read_moving(struct closure *cl) bio->bi_rw = READ; bio->bi_end_io = read_moving_endio; - if (bch_bio_alloc_pages(bio, GFP_KERNEL)) + if (bio_alloc_pages(bio, GFP_KERNEL)) goto err; trace_bcache_gc_copy(&w->key); diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index bcdf1f782c3e..b6e74d3c8faf 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -509,10 +509,6 @@ static void bch_insert_data_loop(struct closure *cl) goto err; n = bch_bio_split(bio, KEY_SIZE(k), GFP_NOIO, split); - if (!n) { - __bkey_put(op->c, k); - continue_at(cl, bch_insert_data_loop, bcache_wq); - } n->bi_end_io = bch_insert_data_endio; n->bi_private = cl; @@ -821,53 +817,13 @@ static void request_read_done(struct closure *cl) */ if (s->op.cache_bio) { - struct bio_vec *src, *dst; - unsigned src_offset, dst_offset, bytes; - void *dst_ptr; - bio_reset(s->op.cache_bio); s->op.cache_bio->bi_sector = s->cache_miss->bi_sector; s->op.cache_bio->bi_bdev = s->cache_miss->bi_bdev; s->op.cache_bio->bi_size = s->cache_bio_sectors << 9; bch_bio_map(s->op.cache_bio, NULL); - src = bio_iovec(s->op.cache_bio); - dst = bio_iovec(s->cache_miss); - src_offset = src->bv_offset; - dst_offset = dst->bv_offset; - dst_ptr = kmap(dst->bv_page); - - while (1) { - if (dst_offset == dst->bv_offset + dst->bv_len) { - kunmap(dst->bv_page); - dst++; - if (dst == bio_iovec_idx(s->cache_miss, - s->cache_miss->bi_vcnt)) - break; - - dst_offset = dst->bv_offset; - dst_ptr = kmap(dst->bv_page); - } - - if (src_offset == src->bv_offset + src->bv_len) { - src++; - if (src == bio_iovec_idx(s->op.cache_bio, - s->op.cache_bio->bi_vcnt)) - BUG(); - - src_offset = src->bv_offset; - } - - bytes = min(dst->bv_offset + dst->bv_len - dst_offset, - src->bv_offset + src->bv_len - src_offset); - - memcpy(dst_ptr + dst_offset, - page_address(src->bv_page) + src_offset, - bytes); - - src_offset += bytes; - dst_offset += bytes; - } + bio_copy_data(s->cache_miss, s->op.cache_bio); bio_put(s->cache_miss); s->cache_miss = NULL; @@ -912,9 +868,6 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, struct bio *miss; miss = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split); - if (!miss) - return -EAGAIN; - if (miss == bio) s->op.lookup_done = true; @@ -933,8 +886,9 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, reada = min(dc->readahead >> 9, sectors - bio_sectors(miss)); - if (bio_end(miss) + reada > bdev_sectors(miss->bi_bdev)) - reada = bdev_sectors(miss->bi_bdev) - bio_end(miss); + if (bio_end_sector(miss) + reada > bdev_sectors(miss->bi_bdev)) + reada = bdev_sectors(miss->bi_bdev) - + bio_end_sector(miss); } s->cache_bio_sectors = bio_sectors(miss) + reada; @@ -958,7 +912,7 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, goto out_put; bch_bio_map(s->op.cache_bio, NULL); - if (bch_bio_alloc_pages(s->op.cache_bio, __GFP_NOWARN|GFP_NOIO)) + if (bio_alloc_pages(s->op.cache_bio, __GFP_NOWARN|GFP_NOIO)) goto out_put; s->cache_miss = miss; @@ -1002,7 +956,7 @@ static void request_write(struct cached_dev *dc, struct search *s) struct bio *bio = &s->bio.bio; struct bkey start, end; start = KEY(dc->disk.id, bio->bi_sector, 0); - end = KEY(dc->disk.id, bio_end(bio), 0); + end = KEY(dc->disk.id, bio_end_sector(bio), 0); bch_keybuf_check_overlapping(&s->op.c->moving_gc_keys, &start, &end); @@ -1176,7 +1130,7 @@ found: if (i->sequential + bio->bi_size > i->sequential) i->sequential += bio->bi_size; - i->last = bio_end(bio); + i->last = bio_end_sector(bio); i->jiffies = jiffies + msecs_to_jiffies(5000); s->task->sequential_io = i->sequential; @@ -1294,30 +1248,25 @@ void bch_cached_dev_request_init(struct cached_dev *dc) static int flash_dev_cache_miss(struct btree *b, struct search *s, struct bio *bio, unsigned sectors) { + struct bio_vec *bv; + int i; + /* Zero fill bio */ - while (bio->bi_idx != bio->bi_vcnt) { - struct bio_vec *bv = bio_iovec(bio); + bio_for_each_segment(bv, bio, i) { unsigned j = min(bv->bv_len >> 9, sectors); void *p = kmap(bv->bv_page); memset(p + bv->bv_offset, 0, j << 9); kunmap(bv->bv_page); - bv->bv_len -= j << 9; - bv->bv_offset += j << 9; - - if (bv->bv_len) - return 0; - - bio->bi_sector += j; - bio->bi_size -= j << 9; - - bio->bi_idx++; - sectors -= j; + sectors -= j; } - s->op.lookup_done = true; + bio_advance(bio, min(sectors << 9, bio->bi_size)); + + if (!bio->bi_size) + s->op.lookup_done = true; return 0; } @@ -1344,8 +1293,8 @@ static void flash_dev_make_request(struct request_queue *q, struct bio *bio) closure_call(&s->op.cl, btree_read_async, NULL, cl); } else if (bio_has_data(bio) || s->op.skip) { bch_keybuf_check_overlapping(&s->op.c->moving_gc_keys, - &KEY(d->id, bio->bi_sector, 0), - &KEY(d->id, bio_end(bio), 0)); + &KEY(d->id, bio->bi_sector, 0), + &KEY(d->id, bio_end_sector(bio), 0)); s->writeback = true; s->op.cache_bio = bio; diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c index da3a99e85b1e..98eb81159a22 100644 --- a/drivers/md/bcache/util.c +++ b/drivers/md/bcache/util.c @@ -228,23 +228,6 @@ start: bv->bv_len = min_t(size_t, PAGE_SIZE - bv->bv_offset, } } -int bch_bio_alloc_pages(struct bio *bio, gfp_t gfp) -{ - int i; - struct bio_vec *bv; - - bio_for_each_segment(bv, bio, i) { - bv->bv_page = alloc_page(gfp); - if (!bv->bv_page) { - while (bv-- != bio->bi_io_vec + bio->bi_idx) - __free_page(bv->bv_page); - return -ENOMEM; - } - } - - return 0; -} - /* * Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group (Any * use permitted, subject to terms of PostgreSQL license; see.) diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h index e02780545f12..1ae2a73ad85f 100644 --- a/drivers/md/bcache/util.h +++ b/drivers/md/bcache/util.h @@ -564,12 +564,8 @@ static inline unsigned fract_exp_two(unsigned x, unsigned fract_bits) return x; } -#define bio_end(bio) ((bio)->bi_sector + bio_sectors(bio)) - void bch_bio_map(struct bio *bio, void *base); -int bch_bio_alloc_pages(struct bio *bio, gfp_t gfp); - static inline sector_t bdev_sectors(struct block_device *bdev) { return bdev->bd_inode->i_size >> 9; diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index d81ee5ccc726..22cbff551628 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -285,9 +285,10 @@ static void write_dirty_finish(struct closure *cl) struct dirty_io *io = container_of(cl, struct dirty_io, cl); struct keybuf_key *w = io->bio.bi_private; struct cached_dev *dc = io->dc; - struct bio_vec *bv = bio_iovec_idx(&io->bio, io->bio.bi_vcnt); + struct bio_vec *bv; + int i; - while (bv-- != io->bio.bi_io_vec) + bio_for_each_segment_all(bv, &io->bio, i) __free_page(bv->bv_page); /* This is kind of a dumb way of signalling errors. */ @@ -418,7 +419,7 @@ static void read_dirty(struct closure *cl) io->bio.bi_rw = READ; io->bio.bi_end_io = read_dirty_endio; - if (bch_bio_alloc_pages(&io->bio, GFP_KERNEL)) + if (bio_alloc_pages(&io->bio, GFP_KERNEL)) goto err_free; trace_bcache_writeback(&w->key); -- cgit v1.2.3 From 6aa8f1a6ca41c49721d2de4e048d3da8d06411f9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 10 Jul 2013 18:04:21 -0700 Subject: bcache: Fix a dumb race In the far-too-complicated closure code - closures can have destructors, for probably dubious reasons; they get run after the closure is no longer waiting on anything but before dropping the parent ref, intended just for freeing whatever memory the closure is embedded in. Trouble is, when remaining goes to 0 and we've got nothing more to run - we also have to unlock the closure, setting remaining to -1. If there's a destructor, that unlock isn't doing anything - nobody could be trying to lock it if we're about to free it - but if the unlock _is needed... that check for a destructor was racy. Argh. Signed-off-by: Kent Overstreet Cc: linux-stable # >= v3.10 --- drivers/md/bcache/closure.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/bcache/closure.c b/drivers/md/bcache/closure.c index bd05a9a8c7cf..9aba2017f0d1 100644 --- a/drivers/md/bcache/closure.c +++ b/drivers/md/bcache/closure.c @@ -66,16 +66,18 @@ static inline void closure_put_after_sub(struct closure *cl, int flags) } else { struct closure *parent = cl->parent; struct closure_waitlist *wait = closure_waitlist(cl); + closure_fn *destructor = cl->fn; closure_debug_destroy(cl); + smp_mb(); atomic_set(&cl->remaining, -1); if (wait) closure_wake_up(wait); - if (cl->fn) - cl->fn(cl); + if (destructor) + destructor(cl); if (parent) closure_put(parent); -- cgit v1.2.3 From d2a65ce2ac224413b291307201c5dafb03aa90d7 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 5 Jul 2013 09:05:46 +0300 Subject: bcache: check for allocation failures There is a missing NULL check after the kzalloc(). Signed-off-by: Dan Carpenter --- drivers/md/bcache/sysfs.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'drivers/md') diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index dd3f00a42729..12a2c2846f99 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -232,6 +232,8 @@ STORE(__cached_dev) bch_uuid_write(dc->disk.c); } env = kzalloc(sizeof(struct kobj_uevent_env), GFP_KERNEL); + if (!env) + return -ENOMEM; add_uevent_var(env, "DRIVER=bcache"); add_uevent_var(env, "CACHED_UUID=%pU", dc->sb.uuid), add_uevent_var(env, "CACHED_LABEL=%s", buf); -- cgit v1.2.3 From 54d12f2b4fd0f218590d1490b41a18d0e2328a9a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 10 Jul 2013 18:44:40 -0700 Subject: bcache: Advertise that flushes are supported Whoops - bcache's flush/FUA was mostly correct, but flushes get filtered out unless we say we support them... Signed-off-by: Kent Overstreet Cc: linux-stable # >= v3.10 --- drivers/md/bcache/request.c | 8 +++++++- drivers/md/bcache/super.c | 2 ++ 2 files changed, 9 insertions(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index b6e74d3c8faf..786a1a4f74d8 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -488,6 +488,12 @@ static void bch_insert_data_loop(struct closure *cl) bch_queue_gc(op->c); } + /* + * Journal writes are marked REQ_FLUSH; if the original write was a + * flush, it'll wait on the journal write. + */ + bio->bi_rw &= ~(REQ_FLUSH|REQ_FUA); + do { unsigned i; struct bkey *k; @@ -710,7 +716,7 @@ static struct search *search_alloc(struct bio *bio, struct bcache_device *d) s->task = current; s->orig_bio = bio; s->write = (bio->bi_rw & REQ_WRITE) != 0; - s->op.flush_journal = (bio->bi_rw & REQ_FLUSH) != 0; + s->op.flush_journal = (bio->bi_rw & (REQ_FLUSH|REQ_FUA)) != 0; s->op.skip = (bio->bi_rw & REQ_DISCARD) != 0; s->recoverable = 1; s->start_time = jiffies; diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index cff2d182dfb0..728fdc673f31 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -806,6 +806,8 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size, set_bit(QUEUE_FLAG_NONROT, &d->disk->queue->queue_flags); set_bit(QUEUE_FLAG_DISCARD, &d->disk->queue->queue_flags); + blk_queue_flush(q, REQ_FLUSH|REQ_FUA); + return 0; } -- cgit v1.2.3 From c9502ea4424b31728703d113fc6b30bfead14633 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 10 Jul 2013 21:25:02 -0700 Subject: bcache: Fix a sysfs splat on shutdown If we stopped a bcache device when we were already detaching (or something like that), bcache_device_unlink() would try to remove a symlink from sysfs that was already gone because the bcache dev kobject had already been removed from sysfs. So keep track of whether we've removed stuff from sysfs. Signed-off-by: Kent Overstreet Cc: linux-stable # >= v3.10 --- drivers/md/bcache/bcache.h | 1 + drivers/md/bcache/super.c | 11 ++++++++++- 2 files changed, 11 insertions(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 342ba86c6e4f..68f1ded81ae0 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -434,6 +434,7 @@ struct bcache_device { /* If nonzero, we're detaching/unregistering from cache set */ atomic_t detaching; + int flush_done; uint64_t nr_stripes; unsigned stripe_size_bits; diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 728fdc673f31..7a1dcdb2536e 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -706,7 +706,8 @@ static void bcache_device_detach(struct bcache_device *d) atomic_set(&d->detaching, 0); } - bcache_device_unlink(d); + if (!d->flush_done) + bcache_device_unlink(d); d->c->devices[d->id] = NULL; closure_put(&d->c->caching); @@ -1055,6 +1056,14 @@ static void cached_dev_flush(struct closure *cl) struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl); struct bcache_device *d = &dc->disk; + mutex_lock(&bch_register_lock); + d->flush_done = 1; + + if (d->c) + bcache_device_unlink(d); + + mutex_unlock(&bch_register_lock); + bch_cache_accounting_destroy(&dc->accounting); kobject_del(&d->kobj); -- cgit v1.2.3 From 5caa52afc5abd1396e4af720469abb5843a71eb8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 10 Jul 2013 21:03:25 -0700 Subject: bcache: Shutdown fix Stopping a cache set is supposed to make it stop attached backing devices, but somewhere along the way that code got lost. Fixing this mainly has the effect of fixing our reboot notifier. Signed-off-by: Kent Overstreet Cc: linux-stable # >= v3.10 --- drivers/md/bcache/super.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 7a1dcdb2536e..f6a62174e8f6 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1354,18 +1354,22 @@ static void cache_set_flush(struct closure *cl) static void __cache_set_unregister(struct closure *cl) { struct cache_set *c = container_of(cl, struct cache_set, caching); - struct cached_dev *dc, *t; + struct cached_dev *dc; size_t i; mutex_lock(&bch_register_lock); - if (test_bit(CACHE_SET_UNREGISTERING, &c->flags)) - list_for_each_entry_safe(dc, t, &c->cached_devs, list) - bch_cached_dev_detach(dc); - for (i = 0; i < c->nr_uuids; i++) - if (c->devices[i] && UUID_FLASH_ONLY(&c->uuids[i])) - bcache_device_stop(c->devices[i]); + if (c->devices[i]) { + if (!UUID_FLASH_ONLY(&c->uuids[i]) && + test_bit(CACHE_SET_UNREGISTERING, &c->flags)) { + dc = container_of(c->devices[i], + struct cached_dev, disk); + bch_cached_dev_detach(dc); + } else { + bcache_device_stop(c->devices[i]); + } + } mutex_unlock(&bch_register_lock); -- cgit v1.2.3 From faa5673617656ee58369a3cfe4a312cfcdc59c81 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 11 Jul 2013 22:42:14 -0700 Subject: bcache: Journal replay fix The journal replay code starts by finding something that looks like a valid journal entry, then it does a binary search over the unchecked region of the journal for the journal entries with the highest sequence numbers. Trouble is, the logic was wrong - journal_read_bucket() returns true if it found journal entries we need, but if the range of journal entries we're looking for loops around the end of the journal - in that case journal_read_bucket() could return true when it hadn't found the highest sequence number we'd seen yet, and in that case the binary search did the wrong thing. Whoops. Signed-off-by: Kent Overstreet Cc: linux-stable # >= v3.10 --- drivers/md/bcache/journal.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 4b250667bb7f..ba95ab84b2be 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -184,9 +184,14 @@ bsearch: pr_debug("starting binary search, l %u r %u", l, r); while (l + 1 < r) { + seq = list_entry(list->prev, struct journal_replay, + list)->j.seq; + m = (l + r) >> 1; + read_bucket(m); - if (read_bucket(m)) + if (seq != list_entry(list->prev, struct journal_replay, + list)->j.seq) l = m; else r = m; -- cgit v1.2.3 From 29ebf465b9050f241c4433a796a32e6c896a9dcd Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 11 Jul 2013 19:43:21 -0700 Subject: bcache: Fix GC_SECTORS_USED() calculation Part of the job of garbage collection is to add up however many sectors of live data it finds in each bucket, but that doesn't work very well if it doesn't reset GC_SECTORS_USED() when it starts. Whoops. This wouldn't have broken anything horribly, but allocation tries to preferentially reclaim buckets that are mostly empty and that's not gonna work with an incorrect GC_SECTORS_USED() value. Signed-off-by: Kent Overstreet Cc: linux-stable # >= v3.10 --- drivers/md/bcache/btree.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 15b58239c683..ee372884c405 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -1410,8 +1410,10 @@ static void btree_gc_start(struct cache_set *c) for_each_cache(ca, c, i) for_each_bucket(b, ca) { b->gc_gen = b->gen; - if (!atomic_read(&b->pin)) + if (!atomic_read(&b->pin)) { SET_GC_MARK(b, GC_MARK_RECLAIMABLE); + SET_GC_SECTORS_USED(b, 0); + } } mutex_unlock(&c->bucket_lock); -- cgit v1.2.3 From 79826c35eb99cd3c0873b8396f45fa26c87fb0b0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 10 Jul 2013 18:31:58 -0700 Subject: bcache: Allocation kthread fixes The alloc kthread should've been using try_to_freeze() - and also there was the potential for the alloc kthread to get woken up after it had shut down, which would have been bad. Signed-off-by: Kent Overstreet --- drivers/md/bcache/alloc.c | 18 ++++++++---------- drivers/md/bcache/bcache.h | 4 ---- drivers/md/bcache/super.c | 11 +++++++---- 3 files changed, 15 insertions(+), 18 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index b54b73b9b2b7..e45f5575fd4d 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -63,6 +63,7 @@ #include "bcache.h" #include "btree.h" +#include #include #include #include @@ -363,11 +364,10 @@ do { \ break; \ \ mutex_unlock(&(ca)->set->bucket_lock); \ - if (test_bit(CACHE_SET_STOPPING_2, &ca->set->flags)) { \ - closure_put(&ca->set->cl); \ + if (kthread_should_stop()) \ return 0; \ - } \ \ + try_to_freeze(); \ schedule(); \ mutex_lock(&(ca)->set->bucket_lock); \ } \ @@ -547,14 +547,12 @@ int bch_bucket_alloc_set(struct cache_set *c, unsigned watermark, int bch_cache_allocator_start(struct cache *ca) { - ca->alloc_thread = kthread_create(bch_allocator_thread, - ca, "bcache_allocator"); - if (IS_ERR(ca->alloc_thread)) - return PTR_ERR(ca->alloc_thread); - - closure_get(&ca->set->cl); - wake_up_process(ca->alloc_thread); + struct task_struct *k = kthread_run(bch_allocator_thread, + ca, "bcache_allocator"); + if (IS_ERR(k)) + return PTR_ERR(k); + ca->alloc_thread = k; return 0; } diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 68f1ded81ae0..b39f6f0b45f2 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -664,13 +664,9 @@ struct gc_stat { * CACHE_SET_STOPPING always gets set first when we're closing down a cache set; * we'll continue to run normally for awhile with CACHE_SET_STOPPING set (i.e. * flushing dirty data). - * - * CACHE_SET_STOPPING_2 gets set at the last phase, when it's time to shut down - * the allocation thread. */ #define CACHE_SET_UNREGISTERING 0 #define CACHE_SET_STOPPING 1 -#define CACHE_SET_STOPPING_2 2 struct cache_set { struct closure cl; diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index f6a62174e8f6..547c4c57b052 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -1329,11 +1330,9 @@ static void cache_set_free(struct closure *cl) static void cache_set_flush(struct closure *cl) { struct cache_set *c = container_of(cl, struct cache_set, caching); + struct cache *ca; struct btree *b; - - /* Shut down allocator threads */ - set_bit(CACHE_SET_STOPPING_2, &c->flags); - wake_up_allocators(c); + unsigned i; bch_cache_accounting_destroy(&c->accounting); @@ -1348,6 +1347,10 @@ static void cache_set_flush(struct closure *cl) if (btree_node_dirty(b)) bch_btree_node_write(b, NULL); + for_each_cache(ca, c, i) + if (ca->alloc_thread) + kthread_stop(ca->alloc_thread); + closure_return(cl); } -- cgit v1.2.3