diff options
-rw-r--r-- | drivers/md/Kconfig | 8 | ||||
-rw-r--r-- | drivers/md/Makefile | 5 | ||||
-rw-r--r-- | drivers/md/dm-cache-background-tracker.c | 238 | ||||
-rw-r--r-- | drivers/md/dm-cache-background-tracker.h | 46 | ||||
-rw-r--r-- | drivers/md/dm-cache-metadata.h | 2 | ||||
-rw-r--r-- | drivers/md/dm-cache-policy-cleaner.c | 469 | ||||
-rw-r--r-- | drivers/md/dm-cache-policy-internal.h | 76 | ||||
-rw-r--r-- | drivers/md/dm-cache-policy-smq.c | 821 | ||||
-rw-r--r-- | drivers/md/dm-cache-policy.h | 187 | ||||
-rw-r--r-- | drivers/md/dm-cache-target.c | 2469 |
10 files changed, 1922 insertions, 2399 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index b7767da50c26..982cd0626bc7 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -325,14 +325,6 @@ config DM_CACHE_SMQ of less memory utilization, improved performance and increased adaptability in the face of changing workloads. -config DM_CACHE_CLEANER - tristate "Cleaner Cache Policy (EXPERIMENTAL)" - depends on DM_CACHE - default y - ---help--- - A simple cache policy that writes back all data to the - origin. Used when decommissioning a dm-cache. - config DM_ERA tristate "Era target (EXPERIMENTAL)" depends on BLK_DEV_DM diff --git a/drivers/md/Makefile b/drivers/md/Makefile index d378b1db7852..2801b2fb452d 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -13,9 +13,9 @@ dm-log-userspace-y \ += dm-log-userspace-base.o dm-log-userspace-transfer.o dm-bio-prison-y += dm-bio-prison-v1.o dm-bio-prison-v2.o dm-thin-pool-y += dm-thin.o dm-thin-metadata.o -dm-cache-y += dm-cache-target.o dm-cache-metadata.o dm-cache-policy.o +dm-cache-y += dm-cache-target.o dm-cache-metadata.o dm-cache-policy.o \ + dm-cache-background-tracker.o dm-cache-smq-y += dm-cache-policy-smq.o -dm-cache-cleaner-y += dm-cache-policy-cleaner.o dm-era-y += dm-era-target.o dm-verity-y += dm-verity-target.o md-mod-y += md.o bitmap.o @@ -57,7 +57,6 @@ obj-$(CONFIG_DM_THIN_PROVISIONING) += dm-thin-pool.o obj-$(CONFIG_DM_VERITY) += dm-verity.o obj-$(CONFIG_DM_CACHE) += dm-cache.o obj-$(CONFIG_DM_CACHE_SMQ) += dm-cache-smq.o -obj-$(CONFIG_DM_CACHE_CLEANER) += dm-cache-cleaner.o obj-$(CONFIG_DM_ERA) += dm-era.o obj-$(CONFIG_DM_LOG_WRITES) += dm-log-writes.o diff --git a/drivers/md/dm-cache-background-tracker.c b/drivers/md/dm-cache-background-tracker.c new file mode 100644 index 000000000000..9b1afdfb13f0 --- /dev/null +++ b/drivers/md/dm-cache-background-tracker.c @@ -0,0 +1,238 @@ +/* + * Copyright (C) 2017 Red Hat. All rights reserved. + * + * This file is released under the GPL. 
+ */
+
+#include "dm-cache-background-tracker.h"
+
+/*----------------------------------------------------------------*/
+
+#define DM_MSG_PREFIX "dm-background-tracker"
+
+/*
+ * One tracked piece of background work.  An item is linked into one of
+ * the tracker's lists (queued or issued) via 'list' and into the pending
+ * tree via 'node', keyed by work.oblock.
+ */
+struct bt_work {
+	struct list_head list;
+	struct rb_node node;
+	struct policy_work work;
+};
+
+struct background_tracker {
+	unsigned max_work;
+	atomic_t pending_promotes;
+	atomic_t pending_writebacks;
+	atomic_t pending_demotes;
+
+	struct list_head issued;
+	struct list_head queued;
+	struct rb_root pending;
+
+	struct kmem_cache *work_cache;
+};
+
+/*
+ * Allocates and initialises a tracker.
+ *
+ * Returns NULL if either the tracker itself or the slab cache used for
+ * its work items cannot be allocated.
+ */
+struct background_tracker *btracker_create(unsigned max_work)
+{
+	struct background_tracker *b = kmalloc(sizeof(*b), GFP_KERNEL);
+
+	/* kmalloc() can fail; everything below dereferences b. */
+	if (!b) {
+		DMERR("couldn't create background_tracker");
+		return NULL;
+	}
+
+	b->max_work = max_work;
+	atomic_set(&b->pending_promotes, 0);
+	atomic_set(&b->pending_writebacks, 0);
+	atomic_set(&b->pending_demotes, 0);
+
+	INIT_LIST_HEAD(&b->issued);
+	INIT_LIST_HEAD(&b->queued);
+
+	b->pending = RB_ROOT;
+	b->work_cache = KMEM_CACHE(bt_work, 0);
+	if (!b->work_cache) {
+		DMERR("couldn't create mempool for background work items");
+		kfree(b);
+		b = NULL;
+	}
+
+	return b;
+}
+EXPORT_SYMBOL_GPL(btracker_create);
+
+/*
+ * Frees the tracker and its slab cache.
+ *
+ * Items still sitting on the queued list were never handed to anyone, so
+ * they are released here; otherwise the cache would be destroyed with
+ * live objects in it.  Issued items are left alone: whoever they were
+ * issued to still holds a pointer and is expected to have passed it to
+ * btracker_complete() before the tracker is destroyed.
+ */
+void btracker_destroy(struct background_tracker *b)
+{
+	struct bt_work *w, *tmp;
+
+	list_for_each_entry_safe(w, tmp, &b->queued, list) {
+		list_del(&w->list);
+		kmem_cache_free(b->work_cache, w);
+	}
+
+	kmem_cache_destroy(b->work_cache);
+	kfree(b);
+}
+EXPORT_SYMBOL_GPL(btracker_destroy);
+
+/* Three-way comparison of origin blocks: -1, 0 or 1. */
+static int cmp_oblock(dm_oblock_t lhs, dm_oblock_t rhs)
+{
+	if (from_oblock(lhs) < from_oblock(rhs))
+		return -1;
+
+	if (from_oblock(rhs) < from_oblock(lhs))
+		return 1;
+
+	return 0;
+}
+
+/*
+ * Links nw into the pending tree.  Returns false, leaving the tree
+ * untouched, if work for the same oblock is already present.
+ *
+ * Note the descent direction: a node whose oblock is smaller than the new
+ * one sends us left, so larger oblocks live in left subtrees.
+ */
+static bool __insert_pending(struct background_tracker *b,
+			     struct bt_work *nw)
+{
+	int cmp;
+	struct bt_work *w;
+	struct rb_node **new = &b->pending.rb_node, *parent = NULL;
+
+	while (*new) {
+		w = container_of(*new, struct bt_work, node);
+
+		parent = *new;
+		cmp = cmp_oblock(w->work.oblock, nw->work.oblock);
+		if (cmp < 0)
+			new = &((*new)->rb_left);
+
+		else if (cmp > 0)
+			new = &((*new)->rb_right);
+
+		else
+			/* already present */
+			return false;
+	}
+
+	rb_link_node(&nw->node, parent, new);
+	rb_insert_color(&nw->node, &b->pending);
+
+	return true;
+}
+
+/*
+ * Looks up the pending work item for oblock, or returns NULL if there is
+ * none.  Descends in the same direction as __insert_pending().
+ */
+static struct bt_work *__find_pending(struct background_tracker *b,
+				      dm_oblock_t oblock)
+{
+	int cmp;
+	struct bt_work *w;
+	struct rb_node **new = &b->pending.rb_node;
+
+	while (*new) {
+		w = container_of(*new, struct bt_work, node);
+
+		cmp = cmp_oblock(w->work.oblock, oblock);
+		if (cmp < 0)
+			new = &((*new)->rb_left);
+
+		else if (cmp > 0)
+			new = &((*new)->rb_right);
+
+		else
+			break;
+	}
+
+	/* w is only meaningful if the loop stopped on a matching node. */
+	return *new ? w : NULL;
+}
+
+
+/* Adds delta (+1 on queue, -1 on completion) to the counter for w->op. */
+static void update_stats(struct background_tracker *b, struct policy_work *w, int delta)
+{
+	switch (w->op) {
+	case POLICY_PROMOTE:
+		atomic_add(delta, &b->pending_promotes);
+		break;
+
+	case POLICY_DEMOTE:
+		atomic_add(delta, &b->pending_demotes);
+		break;
+
+	case POLICY_WRITEBACK:
+		atomic_add(delta, &b->pending_writebacks);
+		break;
+	}
+}
+
+/* Number of writebacks tracked and not yet completed (queued or issued). */
+unsigned btracker_nr_writebacks_queued(struct background_tracker *b)
+{
+	return atomic_read(&b->pending_writebacks);
+}
+EXPORT_SYMBOL_GPL(btracker_nr_writebacks_queued);
+
+/* Number of demotions tracked and not yet completed (queued or issued). */
+unsigned btracker_nr_demotions_queued(struct background_tracker *b)
+{
+	return atomic_read(&b->pending_demotes);
+}
+EXPORT_SYMBOL_GPL(btracker_nr_demotions_queued);
+
+/*
+ * True once the number of tracked, uncompleted work items of all kinds
+ * has reached the max_work limit given to btracker_create().
+ */
+static bool max_work_reached(struct background_tracker *b)
+{
+	return atomic_read(&b->pending_promotes) +
+		atomic_read(&b->pending_writebacks) +
+		atomic_read(&b->pending_demotes) >= b->max_work;
+}
+
+/*
+ * Starts tracking a copy of *work.
+ *
+ * If pwork is non-NULL the tracked copy is returned through it and placed
+ * directly on the issued list; otherwise it is placed on the queued list
+ * for a later btracker_issue().  *pwork is set to NULL on every failure.
+ *
+ * Returns 0 on success, -ENOMEM if the max_work limit has been reached or
+ * the item cannot be allocated, -EINVAL if work for the same oblock is
+ * already being tracked.
+ */
+int btracker_queue(struct background_tracker *b,
+		   struct policy_work *work,
+		   struct policy_work **pwork)
+{
+	struct bt_work *w;
+
+	if (pwork)
+		*pwork = NULL;
+
+	if (max_work_reached(b))
+		return -ENOMEM;
+
+	w = kmem_cache_alloc(b->work_cache, GFP_NOWAIT);
+	if (!w)
+		return -ENOMEM;
+
+	memcpy(&w->work, work, sizeof(*work));
+
+	if (!__insert_pending(b, w)) {
+		/*
+		 * There was a race, we'll just ignore this second
+ */ + kmem_cache_free(b->work_cache, w); + return -EINVAL; + } + + if (pwork) { + *pwork = &w->work; + list_add(&w->list, &b->issued); + } else + list_add(&w->list, &b->queued); + update_stats(b, &w->work, 1); + + return 0; +} +EXPORT_SYMBOL_GPL(btracker_queue); + +/* + * Returns -ENODATA if there's no work. + */ +int btracker_issue(struct background_tracker *b, struct policy_work **work) +{ + struct bt_work *w; + + if (list_empty(&b->queued)) + return -ENODATA; + + w = list_first_entry(&b->queued, struct bt_work, list); + list_move(&w->list, &b->issued); + *work = &w->work; + + return 0; +} +EXPORT_SYMBOL_GPL(btracker_issue); + +void btracker_complete(struct background_tracker *b, + struct policy_work *op) +{ + struct bt_work *w = container_of(op, struct bt_work, work); + + update_stats(b, &w->work, -1); + rb_erase(&w->node, &b->pending); + list_del(&w->list); + kmem_cache_free(b->work_cache, w); +} +EXPORT_SYMBOL_GPL(btracker_complete); + +bool btracker_promotion_already_present(struct background_tracker *b, + dm_oblock_t oblock) +{ + return __find_pending(b, oblock) != NULL; +} +EXPORT_SYMBOL_GPL(btracker_promotion_already_present); + +/*----------------------------------------------------------------*/ diff --git a/drivers/md/dm-cache-background-tracker.h b/drivers/md/dm-cache-background-tracker.h new file mode 100644 index 000000000000..27ab90dbc275 --- /dev/null +++ b/drivers/md/dm-cache-background-tracker.h @@ -0,0 +1,46 @@ +/* + * Copyright (C) 2017 Red Hat. All rights reserved. + * + * This file is released under the GPL. + */ + +#ifndef DM_CACHE_BACKGROUND_WORK_H +#define DM_CACHE_BACKGROUND_WORK_H + +#include <linux/vmalloc.h> +#include "dm-cache-policy.h" + +/*----------------------------------------------------------------*/ + +struct background_work; +struct background_tracker; + +/* + * FIXME: discuss lack of locking in all methods. 
+ */ +struct background_tracker *btracker_create(unsigned max_work); +void btracker_destroy(struct background_tracker *b); + +unsigned btracker_nr_writebacks_queued(struct background_tracker *b); +unsigned btracker_nr_demotions_queued(struct background_tracker *b); + +/* + * returns -EINVAL iff the work is already queued. -ENOMEM if the work + * couldn't be queued for another reason. + */ +int btracker_queue(struct background_tracker *b, + struct policy_work *work, + struct policy_work **pwork); + +/* + * Returns -ENODATA if there's no work. + */ +int btracker_issue(struct background_tracker *b, struct policy_work **work); +void btracker_complete(struct background_tracker *b, + struct policy_work *op); +bool btracker_promotion_already_present(struct background_tracker *b, + dm_oblock_t oblock); + +/*----------------------------------------------------------------*/ + +#endif diff --git a/drivers/md/dm-cache-metadata.h b/drivers/md/dm-cache-metadata.h index 4f07c08cf107..179ed5bf81a3 100644 --- a/drivers/md/dm-cache-metadata.h +++ b/drivers/md/dm-cache-metadata.h @@ -50,6 +50,8 @@ #define DM_CACHE_FEATURE_COMPAT_RO_SUPP 0UL #define DM_CACHE_FEATURE_INCOMPAT_SUPP 0UL +struct dm_cache_metadata; + /* * Reopens or creates a new, empty metadata volume. Returns an ERR_PTR on * failure. If reopening then features must match. diff --git a/drivers/md/dm-cache-policy-cleaner.c b/drivers/md/dm-cache-policy-cleaner.c deleted file mode 100644 index 2e8a8f1d8358..000000000000 --- a/drivers/md/dm-cache-policy-cleaner.c +++ /dev/null @@ -1,469 +0,0 @@ -/* - * Copyright (C) 2012 Red Hat. All rights reserved. - * - * writeback cache policy supporting flushing out dirty cache blocks. - * - * This file is released under the GPL. 
- */ - -#include "dm-cache-policy.h" -#include "dm.h" - -#include <linux/hash.h> -#include <linux/module.h> -#include <linux/slab.h> -#include <linux/vmalloc.h> - -/*----------------------------------------------------------------*/ - -#define DM_MSG_PREFIX "cache cleaner" - -/* Cache entry struct. */ -struct wb_cache_entry { - struct list_head list; - struct hlist_node hlist; - - dm_oblock_t oblock; - dm_cblock_t cblock; - bool dirty:1; - bool pending:1; -}; - -struct hash { - struct hlist_head *table; - dm_block_t hash_bits; - unsigned nr_buckets; -}; - -struct policy { - struct dm_cache_policy policy; - spinlock_t lock; - - struct list_head free; - struct list_head clean; - struct list_head clean_pending; - struct list_head dirty; - - /* - * We know exactly how many cblocks will be needed, - * so we can allocate them up front. - */ - dm_cblock_t cache_size, nr_cblocks_allocated; - struct wb_cache_entry *cblocks; - struct hash chash; -}; - -/*----------------------------------------------------------------------------*/ - -/* - * Low-level functions. - */ -static unsigned next_power(unsigned n, unsigned min) -{ - return roundup_pow_of_two(max(n, min)); -} - -static struct policy *to_policy(struct dm_cache_policy *p) -{ - return container_of(p, struct policy, policy); -} - -static struct list_head *list_pop(struct list_head *q) -{ - struct list_head *r = q->next; - - list_del(r); - - return r; -} - -/*----------------------------------------------------------------------------*/ - -/* Allocate/free various resources. */ -static int alloc_hash(struct hash *hash, unsigned elts) -{ - hash->nr_buckets = next_power(elts >> 4, 16); - hash->hash_bits = __ffs(hash->nr_buckets); - hash->table = vzalloc(sizeof(*hash->table) * hash->nr_buckets); - - return hash->table ? 
0 : -ENOMEM; -} - -static void free_hash(struct hash *hash) -{ - vfree(hash->table); -} - -static int alloc_cache_blocks_with_hash(struct policy *p, dm_cblock_t cache_size) -{ - int r = -ENOMEM; - - p->cblocks = vzalloc(sizeof(*p->cblocks) * from_cblock(cache_size)); - if (p->cblocks) { - unsigned u = from_cblock(cache_size); - - while (u--) - list_add(&p->cblocks[u].list, &p->free); - - p->nr_cblocks_allocated = 0; - - /* Cache entries hash. */ - r = alloc_hash(&p->chash, from_cblock(cache_size)); - if (r) - vfree(p->cblocks); - } - - return r; -} - -static void free_cache_blocks_and_hash(struct policy *p) -{ - free_hash(&p->chash); - vfree(p->cblocks); -} - -static struct wb_cache_entry *alloc_cache_entry(struct policy *p) -{ - struct wb_cache_entry *e; - - BUG_ON(from_cblock(p->nr_cblocks_allocated) >= from_cblock(p->cache_size)); - - e = list_entry(list_pop(&p->free), struct wb_cache_entry, list); - p->nr_cblocks_allocated = to_cblock(from_cblock(p->nr_cblocks_allocated) + 1); - - return e; -} - -/*----------------------------------------------------------------------------*/ - -/* Hash functions (lookup, insert, remove). */ -static struct wb_cache_entry *lookup_cache_entry(struct policy *p, dm_oblock_t oblock) -{ - struct hash *hash = &p->chash; - unsigned h = hash_64(from_oblock(oblock), hash->hash_bits); - struct wb_cache_entry *cur; - struct hlist_head *bucket = &hash->table[h]; - - hlist_for_each_entry(cur, bucket, hlist) { - if (cur->oblock == oblock) { - /* Move upfront bucket for faster access. 
*/ - hlist_del(&cur->hlist); - hlist_add_head(&cur->hlist, bucket); - return cur; - } - } - - return NULL; -} - -static void insert_cache_hash_entry(struct policy *p, struct wb_cache_entry *e) -{ - unsigned h = hash_64(from_oblock(e->oblock), p->chash.hash_bits); - - hlist_add_head(&e->hlist, &p->chash.table[h]); -} - -static void remove_cache_hash_entry(struct wb_cache_entry *e) -{ - hlist_del(&e->hlist); -} - -/* Public interface (see dm-cache-policy.h */ -static int wb_map(struct dm_cache_policy *pe, dm_oblock_t oblock, - bool can_block, bool can_migrate, bool discarded_oblock, - struct bio *bio, struct policy_locker *locker, - struct policy_result *result) -{ - struct policy *p = to_policy(pe); - struct wb_cache_entry *e; - unsigned long flags; - - result->op = POLICY_MISS; - - if (can_block) - spin_lock_irqsave(&p->lock, flags); - - else if (!spin_trylock_irqsave(&p->lock, flags)) - return -EWOULDBLOCK; - - e = lookup_cache_entry(p, oblock); - if (e) { - result->op = POLICY_HIT; - result->cblock = e->cblock; - - } - - spin_unlock_irqrestore(&p->lock, flags); - - return 0; -} - -static int wb_lookup(struct dm_cache_policy *pe, dm_oblock_t oblock, dm_cblock_t *cblock) -{ - int r; - struct policy *p = to_policy(pe); - struct wb_cache_entry *e; - unsigned long flags; - - if (!spin_trylock_irqsave(&p->lock, flags)) - return -EWOULDBLOCK; - - e = lookup_cache_entry(p, oblock); - if (e) { - *cblock = e->cblock; - r = 0; - - } else - r = -ENOENT; - - spin_unlock_irqrestore(&p->lock, flags); - - return r; -} - -static void __set_clear_dirty(struct dm_cache_policy *pe, dm_oblock_t oblock, bool set) -{ - struct policy *p = to_policy(pe); - struct wb_cache_entry *e; - - e = lookup_cache_entry(p, oblock); - BUG_ON(!e); - - if (set) { - if (!e->dirty) { - e->dirty = true; - list_move(&e->list, &p->dirty); - } - - } else { - if (e->dirty) { - e->pending = false; - e->dirty = false; - list_move(&e->list, &p->clean); - } - } -} - -static void wb_set_dirty(struct 
dm_cache_policy *pe, dm_oblock_t oblock) -{ - struct policy *p = to_policy(pe); - unsigned long flags; - - spin_lock_irqsave(&p->lock, flags); - __set_clear_dirty(pe, oblock, true); - spin_unlock_irqrestore(&p->lock, flags); -} - -static void wb_clear_dirty(struct dm_cache_policy *pe, dm_oblock_t oblock) -{ - struct policy *p = to_policy(pe); - unsigned long flags; - - spin_lock_irqsave(&p->lock, flags); - __set_clear_dirty(pe, oblock, false); - spin_unlock_irqrestore(&p->lock, flags); -} - -static void add_cache_entry(struct policy *p, struct wb_cache_entry *e) -{ - insert_cache_hash_entry(p, e); - if (e->dirty) - list_add(&e->list, &p->dirty); - else - list_add(&e->list, &p->clean); -} - -static int wb_load_mapping(struct dm_cache_policy *pe, - dm_oblock_t oblock, dm_cblock_t cblock, - uint32_t hint, bool hint_valid) -{ - int r; - struct policy *p = to_policy(pe); - struct wb_cache_entry *e = alloc_cache_entry(p); - - if (e) { - e->cblock = cblock; - e->oblock = oblock; - e->dirty = false; /* blocks default to clean */ - add_cache_entry(p, e); - r = 0; - - } else - r = -ENOMEM; - - return r; -} - -static void wb_destroy(struct dm_cache_policy *pe) -{ - struct policy *p = to_policy(pe); - - free_cache_blocks_and_hash(p); - kfree(p); -} - -static struct wb_cache_entry *__wb_force_remove_mapping(struct policy *p, dm_oblock_t oblock) -{ - struct wb_cache_entry *r = lookup_cache_entry(p, oblock); - - BUG_ON(!r); - - remove_cache_hash_entry(r); - list_del(&r->list); - - return r; -} - -static void wb_remove_mapping(struct dm_cache_policy *pe, dm_oblock_t oblock) -{ - struct policy *p = to_policy(pe); - struct wb_cache_entry *e; - unsigned long flags; - - spin_lock_irqsave(&p->lock, flags); - e = __wb_force_remove_mapping(p, oblock); - list_add_tail(&e->list, &p->free); - BUG_ON(!from_cblock(p->nr_cblocks_allocated)); - p->nr_cblocks_allocated = to_cblock(from_cblock(p->nr_cblocks_allocated) - 1); - spin_unlock_irqrestore(&p->lock, flags); -} - -static void 
wb_force_mapping(struct dm_cache_policy *pe, - dm_oblock_t current_oblock, dm_oblock_t oblock) -{ - struct policy *p = to_policy(pe); - struct wb_cache_entry *e; - unsigned long flags; - - spin_lock_irqsave(&p->lock, flags); - e = __wb_force_remove_mapping(p, current_oblock); - e->oblock = oblock; - add_cache_entry(p, e); - spin_unlock_irqrestore(&p->lock, flags); -} - -static struct wb_cache_entry *get_next_dirty_entry(struct policy *p) -{ - struct list_head *l; - struct wb_cache_entry *r; - - if (list_empty(&p->dirty)) - return NULL; - - l = list_pop(&p->dirty); - r = container_of(l, struct wb_cache_entry, list); - list_add(l, &p->clean_pending); - - return r; -} - -static int wb_writeback_work(struct dm_cache_policy *pe, - dm_oblock_t *oblock, - dm_cblock_t *cblock, - bool critical_only) -{ - int r = -ENOENT; - struct policy *p = to_policy(pe); - struct wb_cache_entry *e; - unsigned long flags; - - spin_lock_irqsave(&p->lock, flags); - - e = get_next_dirty_entry(p); - if (e) { - *oblock = e->oblock; - *cblock = e->cblock; - r = 0; - } - - spin_unlock_irqrestore(&p->lock, flags); - - return r; -} - -static dm_cblock_t wb_residency(struct dm_cache_policy *pe) -{ - return to_policy(pe)->nr_cblocks_allocated; -} - -/* Init the policy plugin interface function pointers. 
*/ -static void init_policy_functions(struct policy *p) -{ - p->policy.destroy = wb_destroy; - p->policy.map = wb_map; - p->policy.lookup = wb_lookup; - p->policy.set_dirty = wb_set_dirty; - p->policy.clear_dirty = wb_clear_dirty; - p->policy.load_mapping = wb_load_mapping; - p->policy.get_hint = NULL; - p->policy.remove_mapping = wb_remove_mapping; - p->policy.writeback_work = wb_writeback_work; - p->policy.force_mapping = wb_force_mapping; - p->policy.residency = wb_residency; - p->policy.tick = NULL; -} - -static struct dm_cache_policy *wb_create(dm_cblock_t cache_size, - sector_t origin_size, - sector_t cache_block_size) -{ - int r; - struct policy *p = kzalloc(sizeof(*p), GFP_KERNEL); - - if (!p) - return NULL; - - init_policy_functions(p); - INIT_LIST_HEAD(&p->free); - INIT_LIST_HEAD(&p->clean); - INIT_LIST_HEAD(&p->clean_pending); - INIT_LIST_HEAD(&p->dirty); - - p->cache_size = cache_size; - spin_lock_init(&p->lock); - - /* Allocate cache entry structs and add them to free list. 
*/ - r = alloc_cache_blocks_with_hash(p, cache_size); - if (!r) - return &p->policy; - - kfree(p); - - return NULL; -} -/*----------------------------------------------------------------------------*/ - -static struct dm_cache_policy_type wb_policy_type = { - .name = "cleaner", - .version = {1, 0, 0}, - .hint_size = 4, - .owner = THIS_MODULE, - .create = wb_create -}; - -static int __init wb_init(void) -{ - int r = dm_cache_policy_register(&wb_policy_type); - - if (r < 0) - DMERR("register failed %d", r); - else - DMINFO("version %u.%u.%u loaded", - wb_policy_type.version[0], - wb_policy_type.version[1], - wb_policy_type.version[2]); - - return r; -} - -static void __exit wb_exit(void) -{ - dm_cache_policy_unregister(&wb_policy_type); -} - -module_init(wb_init); -module_exit(wb_exit); - -MODULE_AUTHOR("Heinz Mauelshagen <dm-devel@redhat.com>"); -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("cleaner cache policy"); diff --git a/drivers/md/dm-cache-policy-internal.h b/drivers/md/dm-cache-policy-internal.h index 808ee0e2b2c4..56f0a23f698c 100644 --- a/drivers/md/dm-cache-policy-internal.h +++ b/drivers/md/dm-cache-policy-internal.h @@ -12,70 +12,65 @@ /*----------------------------------------------------------------*/ -/* - * Little inline functions that simplify calling the policy methods. 
- */ -static inline int policy_map(struct dm_cache_policy *p, dm_oblock_t oblock, - bool can_block, bool can_migrate, bool discarded_oblock, - struct bio *bio, struct policy_locker *locker, - struct policy_result *result) +static inline int policy_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock, + int data_dir, bool fast_copy, bool *background_queued) { - return p->map(p, oblock, can_block, can_migrate, discarded_oblock, bio, locker, result); + return p->lookup(p, oblock, cblock, data_dir, fast_copy, background_queued); } -static inline int policy_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock) +static inline int policy_lookup_with_work(struct dm_cache_policy *p, + dm_oblock_t oblock, dm_cblock_t *cblock, + int data_dir, bool fast_copy, + struct policy_work **work) { - BUG_ON(!p->lookup); - return p->lookup(p, oblock, cblock); -} + if (!p->lookup_with_work) { + *work = NULL; + return p->lookup(p, oblock, cblock, data_dir, fast_copy, NULL); + } -static inline void policy_set_dirty(struct dm_cache_policy *p, dm_oblock_t oblock) -{ - if (p->set_dirty) - p->set_dirty(p, oblock); + return p->lookup_with_work(p, oblock, cblock, data_dir, fast_copy, work); } -static inline void policy_clear_dirty(struct dm_cache_policy *p, dm_oblock_t oblock) +static inline int policy_get_background_work(struct dm_cache_policy *p, + bool idle, struct policy_work **result) { - if (p->clear_dirty) - p->clear_dirty(p, oblock); + return p->get_background_work(p, idle, result); } -static inline int policy_load_mapping(struct dm_cache_policy *p, - dm_oblock_t oblock, dm_cblock_t cblock, - uint32_t hint, bool hint_valid) +static inline void policy_complete_background_work(struct dm_cache_policy *p, + struct policy_work *work, + bool success) { - return p->load_mapping(p, oblock, cblock, hint, hint_valid); + return p->complete_background_work(p, work, success); } -static inline uint32_t policy_get_hint(struct dm_cache_policy *p, - dm_cblock_t 
cblock) +static inline void policy_set_dirty(struct dm_cache_policy *p, dm_cblock_t cblock) { - return p->get_hint ? p->get_hint(p, cblock) : 0; + p->set_dirty(p, cblock); } -static inline int policy_writeback_work(struct dm_cache_policy *p, - dm_oblock_t *oblock, - dm_cblock_t *cblock, - bool critical_only) +static inline void policy_clear_dirty(struct dm_cache_policy *p, dm_cblock_t cblock) { - return p->writeback_work ? p->writeback_work(p, oblock, cblock, critical_only) : -ENOENT; + p->clear_dirty(p, cblock); } -static inline void policy_remove_mapping(struct dm_cache_policy *p, dm_oblock_t oblock) +static inline int policy_load_mapping(struct dm_cache_policy *p, + dm_oblock_t oblock, dm_cblock_t cblock, + bool dirty, uint32_t hint, bool hint_valid) { - p->remove_mapping(p, oblock); + return p->load_mapping(p, oblock, cblock, dirty, hint, hint_valid); } -static inline int policy_remove_cblock(struct dm_cache_policy *p, dm_cblock_t cblock) +static inline int policy_invalidate_mapping(struct dm_cache_policy *p, + dm_cblock_t cblock) { - return p->remove_cblock(p, cblock); + return p->invalidate_mapping(p, cblock); } -static inline void policy_force_mapping(struct dm_cache_policy *p, - dm_oblock_t current_oblock, dm_oblock_t new_oblock) +static inline uint32_t policy_get_hint(struct dm_cache_policy *p, + dm_cblock_t cblock) { - return p->force_mapping(p, current_oblock, new_oblock); + return p->get_hint ? p->get_hint(p, cblock) : 0; } static inline dm_cblock_t policy_residency(struct dm_cache_policy *p) @@ -107,6 +102,11 @@ static inline int policy_set_config_value(struct dm_cache_policy *p, return p->set_config_value ? 
p->set_config_value(p, key, value) : -EINVAL; } +static inline void policy_allow_migrations(struct dm_cache_policy *p, bool allow) +{ + return p->allow_migrations(p, allow); +} + /*----------------------------------------------------------------*/ /* diff --git a/drivers/md/dm-cache-policy-smq.c b/drivers/md/dm-cache-policy-smq.c index f19c6930a67c..74436dc2122f 100644 --- a/drivers/md/dm-cache-policy-smq.c +++ b/drivers/md/dm-cache-policy-smq.c @@ -4,8 +4,9 @@ * This file is released under the GPL. */ -#include "dm-cache-policy.h" +#include "dm-cache-background-tracker.h" #include "dm-cache-policy-internal.h" +#include "dm-cache-policy.h" #include "dm.h" #include <linux/hash.h> @@ -38,10 +39,11 @@ struct entry { unsigned hash_next:28; unsigned prev:28; unsigned next:28; - unsigned level:7; + unsigned level:6; bool dirty:1; bool allocated:1; bool sentinel:1; + bool pending_work:1; dm_oblock_t oblock; }; @@ -279,14 +281,28 @@ static unsigned q_size(struct queue *q) */ static void q_push(struct queue *q, struct entry *e) { + BUG_ON(e->pending_work); + if (!e->sentinel) q->nr_elts++; l_add_tail(q->es, q->qs + e->level, e); } +static void q_push_front(struct queue *q, struct entry *e) +{ + BUG_ON(e->pending_work); + + if (!e->sentinel) + q->nr_elts++; + + l_add_head(q->es, q->qs + e->level, e); +} + static void q_push_before(struct queue *q, struct entry *old, struct entry *e) { + BUG_ON(e->pending_work); + if (!e->sentinel) q->nr_elts++; @@ -336,19 +352,6 @@ static struct entry *q_pop(struct queue *q) } /* - * Pops an entry from a level that is not past a sentinel. - */ -static struct entry *q_pop_old(struct queue *q, unsigned max_level) -{ - struct entry *e = q_peek(q, max_level, false); - - if (e) - q_del(q, e); - - return e; -} - -/* * This function assumes there is a non-sentinel entry to pop. It's only * used by redistribute, so we know this is true. It also doesn't adjust * the q->nr_elts count. 
@@ -446,45 +449,49 @@ static void q_redistribute(struct queue *q) break; e->level = level + 1u; - l_add_head(q->es, l_above, e); + l_add_tail(q->es, l_above, e); } } } -static void q_requeue_before(struct queue *q, struct entry *dest, struct entry *e, unsigned extra_levels) +static void q_requeue(struct queue *q, struct entry *e, unsigned extra_levels, + struct entry *s1, struct entry *s2) { struct entry *de; - unsigned new_level; - - q_del(q, e); + unsigned sentinels_passed = 0; + unsigned new_level = min(q->nr_levels - 1u, e->level + extra_levels); + /* try and find an entry to swap with */ if (extra_levels && (e->level < q->nr_levels - 1u)) { - new_level = min(q->nr_levels - 1u, e->level + extra_levels); - for (de = l_head(q->es, q->qs + new_level); de; de = l_next(q->es, de)) { - if (de->sentinel) - continue; + for (de = l_head(q->es, q->qs + new_level); de && de->sentinel; de = l_next(q->es, de)) + sentinels_passed++; + if (de) { q_del(q, de); de->level = e->level; + if (s1) { + switch (sentinels_passed) { + case 0: + q_push_before(q, s1, de); + break; + + case 1: + q_push_before(q, s2, de); + break; - if (dest) - q_push_before(q, dest, de); - else + default: + q_push(q, de); + } + } else q_push(q, de); - break; } - - e->level = new_level; } + q_del(q, e); + e->level = new_level; q_push(q, e); } -static void q_requeue(struct queue *q, struct entry *e, unsigned extra_levels) -{ - q_requeue_before(q, NULL, e, extra_levels); -} - /*----------------------------------------------------------------*/ #define FP_SHIFT 8 @@ -550,7 +557,7 @@ static enum performance stats_assess(struct stats *s) /*----------------------------------------------------------------*/ -struct hash_table { +struct smq_hash_table { struct entry_space *es; unsigned long long hash_bits; unsigned *buckets; @@ -560,7 +567,7 @@ struct hash_table { * All cache entries are stored in a chained hash table. To save space we * use indexing again, and only store indexes to the next entry. 
*/ -static int h_init(struct hash_table *ht, struct entry_space *es, unsigned nr_entries) +static int h_init(struct smq_hash_table *ht, struct entry_space *es, unsigned nr_entries) { unsigned i, nr_buckets; @@ -578,34 +585,34 @@ static int h_init(struct hash_table *ht, struct entry_space *es, unsigned nr_ent return 0; } -static void h_exit(struct hash_table *ht) +static void h_exit(struct smq_hash_table *ht) { vfree(ht->buckets); } -static struct entry *h_head(struct hash_table *ht, unsigned bucket) +static struct entry *h_head(struct smq_hash_table *ht, unsigned bucket) { return to_entry(ht->es, ht->buckets[bucket]); } -static struct entry *h_next(struct hash_table *ht, struct entry *e) +static struct entry *h_next(struct smq_hash_table *ht, struct entry *e) { return to_entry(ht->es, e->hash_next); } -static void __h_insert(struct hash_table *ht, unsigned bucket, struct entry *e) +static void __h_insert(struct smq_hash_table *ht, unsigned bucket, struct entry *e) { e->hash_next = ht->buckets[bucket]; ht->buckets[bucket] = to_index(ht->es, e); } -static void h_insert(struct hash_table *ht, struct entry *e) +static void h_insert(struct smq_hash_table *ht, struct entry *e) { unsigned h = hash_64(from_oblock(e->oblock), ht->hash_bits); __h_insert(ht, h, e); } -static struct entry *__h_lookup(struct hash_table *ht, unsigned h, dm_oblock_t oblock, +static struct entry *__h_lookup(struct smq_hash_table *ht, unsigned h, dm_oblock_t oblock, struct entry **prev) { struct entry *e; @@ -621,7 +628,7 @@ static struct entry *__h_lookup(struct hash_table *ht, unsigned h, dm_oblock_t o return NULL; } -static void __h_unlink(struct hash_table *ht, unsigned h, +static void __h_unlink(struct smq_hash_table *ht, unsigned h, struct entry *e, struct entry *prev) { if (prev) @@ -633,7 +640,7 @@ static void __h_unlink(struct hash_table *ht, unsigned h, /* * Also moves each entry to the front of the bucket. 
*/ -static struct entry *h_lookup(struct hash_table *ht, dm_oblock_t oblock) +static struct entry *h_lookup(struct smq_hash_table *ht, dm_oblock_t oblock) { struct entry *e, *prev; unsigned h = hash_64(from_oblock(oblock), ht->hash_bits); @@ -651,7 +658,7 @@ static struct entry *h_lookup(struct hash_table *ht, dm_oblock_t oblock) return e; } -static void h_remove(struct hash_table *ht, struct entry *e) +static void h_remove(struct smq_hash_table *ht, struct entry *e) { unsigned h = hash_64(from_oblock(e->oblock), ht->hash_bits); struct entry *prev; @@ -699,7 +706,10 @@ static void init_entry(struct entry *e) e->next = INDEXER_NULL; e->prev = INDEXER_NULL; e->level = 0u; + e->dirty = true; /* FIXME: audit */ e->allocated = true; + e->sentinel = false; + e->pending_work = false; } static struct entry *alloc_entry(struct entry_alloc *ea) @@ -762,11 +772,11 @@ static struct entry *get_entry(struct entry_alloc *ea, unsigned index) #define NR_HOTSPOT_LEVELS 64u #define NR_CACHE_LEVELS 64u -#define WRITEBACK_PERIOD (10 * HZ) -#define DEMOTE_PERIOD (60 * HZ) +#define WRITEBACK_PERIOD (10ul * HZ) +#define DEMOTE_PERIOD (60ul * HZ) #define HOTSPOT_UPDATE_PERIOD (HZ) -#define CACHE_UPDATE_PERIOD (10u * HZ) +#define CACHE_UPDATE_PERIOD (60ul * HZ) struct smq_policy { struct dm_cache_policy policy; @@ -814,8 +824,8 @@ struct smq_policy { * The hash tables allows us to quickly find an entry by origin * block. 
*/ - struct hash_table table; - struct hash_table hotspot_table; + struct smq_hash_table table; + struct smq_hash_table hotspot_table; bool current_writeback_sentinels; unsigned long next_writeback_period; @@ -828,6 +838,10 @@ struct smq_policy { unsigned long next_hotspot_period; unsigned long next_cache_period; + + struct background_tracker *bg_work; + + bool migrations_allowed; }; /*----------------------------------------------------------------*/ @@ -876,15 +890,15 @@ static void __update_demote_sentinels(struct smq_policy *mq) static void update_sentinels(struct smq_policy *mq) { if (time_after(jiffies, mq->next_writeback_period)) { - __update_writeback_sentinels(mq); mq->next_writeback_period = jiffies + WRITEBACK_PERIOD; mq->current_writeback_sentinels = !mq->current_writeback_sentinels; + __update_writeback_sentinels(mq); } if (time_after(jiffies, mq->next_demote_period)) { - __update_demote_sentinels(mq); mq->next_demote_period = jiffies + DEMOTE_PERIOD; mq->current_demote_sentinels = !mq->current_demote_sentinels; + __update_demote_sentinels(mq); } } @@ -920,55 +934,40 @@ static void sentinels_init(struct smq_policy *mq) /*----------------------------------------------------------------*/ -/* - * These methods tie together the dirty queue, clean queue and hash table. - */ -static void push_new(struct smq_policy *mq, struct entry *e) +static void del_queue(struct smq_policy *mq, struct entry *e) { - struct queue *q = e->dirty ? &mq->dirty : &mq->clean; - h_insert(&mq->table, e); - q_push(q, e); + q_del(e->dirty ? &mq->dirty : &mq->clean, e); } -static void push(struct smq_policy *mq, struct entry *e) +static void push_queue(struct smq_policy *mq, struct entry *e) { - struct entry *sentinel; - - h_insert(&mq->table, e); - - /* - * Punch this into the queue just in front of the sentinel, to - * ensure it's cleaned straight away. 
- */ - if (e->dirty) { - sentinel = writeback_sentinel(mq, e->level); - q_push_before(&mq->dirty, sentinel, e); - } else { - sentinel = demote_sentinel(mq, e->level); - q_push_before(&mq->clean, sentinel, e); - } + if (e->dirty) + q_push(&mq->dirty, e); + else + q_push(&mq->clean, e); } -/* - * Removes an entry from cache. Removes from the hash table. - */ -static void __del(struct smq_policy *mq, struct queue *q, struct entry *e) +// !h, !q, a -> h, q, a +static void push(struct smq_policy *mq, struct entry *e) { - q_del(q, e); - h_remove(&mq->table, e); + h_insert(&mq->table, e); + if (!e->pending_work) + push_queue(mq, e); } -static void del(struct smq_policy *mq, struct entry *e) +static void push_queue_front(struct smq_policy *mq, struct entry *e) { - __del(mq, e->dirty ? &mq->dirty : &mq->clean, e); + if (e->dirty) + q_push_front(&mq->dirty, e); + else + q_push_front(&mq->clean, e); } -static struct entry *pop_old(struct smq_policy *mq, struct queue *q, unsigned max_level) +static void push_front(struct smq_policy *mq, struct entry *e) { - struct entry *e = q_pop_old(q, max_level); - if (e) - h_remove(&mq->table, e); - return e; + h_insert(&mq->table, e); + if (!e->pending_work) + push_queue_front(mq, e); } static dm_cblock_t infer_cblock(struct smq_policy *mq, struct entry *e) @@ -978,16 +977,21 @@ static dm_cblock_t infer_cblock(struct smq_policy *mq, struct entry *e) static void requeue(struct smq_policy *mq, struct entry *e) { - struct entry *sentinel; + /* + * Pending work has temporarily been taken out of the queues. 
+ */ + if (e->pending_work) + return; if (!test_and_set_bit(from_cblock(infer_cblock(mq, e)), mq->cache_hit_bits)) { - if (e->dirty) { - sentinel = writeback_sentinel(mq, e->level); - q_requeue_before(&mq->dirty, sentinel, e, 1u); - } else { - sentinel = demote_sentinel(mq, e->level); - q_requeue_before(&mq->clean, sentinel, e, 1u); + if (!e->dirty) { + q_requeue(&mq->clean, e, 1u, NULL, NULL); + return; } + + q_requeue(&mq->dirty, e, 1u, + get_sentinel(&mq->writeback_sentinel_alloc, e->level, !mq->current_writeback_sentinels), + get_sentinel(&mq->writeback_sentinel_alloc, e->level, mq->current_writeback_sentinels)); } } @@ -1026,6 +1030,8 @@ static void update_promote_levels(struct smq_policy *mq) unsigned threshold_level = allocator_empty(&mq->cache_alloc) ? default_promote_level(mq) : (NR_HOTSPOT_LEVELS / 2u); + threshold_level = max(threshold_level, NR_HOTSPOT_LEVELS); + /* * If the hotspot queue is performing badly then we have little * confidence that we know which blocks to promote. So we cut down @@ -1045,7 +1051,7 @@ static void update_promote_levels(struct smq_policy *mq) } mq->read_promote_level = NR_HOTSPOT_LEVELS - threshold_level; - mq->write_promote_level = (NR_HOTSPOT_LEVELS - threshold_level) + 2u; + mq->write_promote_level = (NR_HOTSPOT_LEVELS - threshold_level); } /* @@ -1095,34 +1101,142 @@ static void end_cache_period(struct smq_policy *mq) } } -static int demote_cblock(struct smq_policy *mq, - struct policy_locker *locker, - dm_oblock_t *oblock) +/*----------------------------------------------------------------*/ + +/* + * Targets are given as a percentage. + */ +#define CLEAN_TARGET 25u +#define FREE_TARGET 25u + +static unsigned percent_to_target(struct smq_policy *mq, unsigned p) { - struct entry *demoted = q_peek(&mq->clean, mq->clean.nr_levels, false); - if (!demoted) - /* - * We could get a block from mq->dirty, but that - * would add extra latency to the triggering bio as it - * waits for the writeback. 
Better to not promote this - * time and hope there's a clean block next time this block - * is hit. - */ - return -ENOSPC; + return from_cblock(mq->cache_size) * p / 100u; +} + +static bool clean_target_met(struct smq_policy *mq, bool idle) +{ + /* + * Cache entries may not be populated. So we cannot rely on the + * size of the clean queue. + */ + unsigned nr_clean = from_cblock(mq->cache_size) - q_size(&mq->dirty); - if (locker->fn(locker, demoted->oblock)) + if (idle) /* - * We couldn't lock this block. + * We'd like to clean everything. */ - return -EBUSY; + return q_size(&mq->dirty) == 0u; + else + return (nr_clean + btracker_nr_writebacks_queued(mq->bg_work)) >= + percent_to_target(mq, CLEAN_TARGET); +} - del(mq, demoted); - *oblock = demoted->oblock; - free_entry(&mq->cache_alloc, demoted); +static bool free_target_met(struct smq_policy *mq, bool idle) +{ + unsigned nr_free = from_cblock(mq->cache_size) - + mq->cache_alloc.nr_allocated; - return 0; + if (idle) + return (nr_free + btracker_nr_demotions_queued(mq->bg_work)) >= + percent_to_target(mq, FREE_TARGET); + else + return true; } +/*----------------------------------------------------------------*/ + +static void mark_pending(struct smq_policy *mq, struct entry *e) +{ + BUG_ON(e->sentinel); + BUG_ON(!e->allocated); + BUG_ON(e->pending_work); + e->pending_work = true; +} + +static void clear_pending(struct smq_policy *mq, struct entry *e) +{ + BUG_ON(!e->pending_work); + e->pending_work = false; +} + +static void queue_writeback(struct smq_policy *mq) +{ + int r; + struct policy_work work; + struct entry *e; + + e = q_peek(&mq->dirty, mq->dirty.nr_levels, false); + if (e) { + mark_pending(mq, e); + q_del(&mq->dirty, e); + + work.op = POLICY_WRITEBACK; + work.oblock = e->oblock; + work.cblock = infer_cblock(mq, e); + + r = btracker_queue(mq->bg_work, &work, NULL); + WARN_ON_ONCE(r); // FIXME: finish, I think we have to get rid of this race. 
+ } +} + +static void queue_demotion(struct smq_policy *mq) +{ + struct policy_work work; + struct entry *e; + + if (unlikely(WARN_ON_ONCE(!mq->migrations_allowed))) + return; + + e = q_peek(&mq->clean, mq->clean.nr_levels, true); + if (!e) { + if (!clean_target_met(mq, false)) + queue_writeback(mq); + return; + } + + mark_pending(mq, e); + q_del(&mq->clean, e); + + work.op = POLICY_DEMOTE; + work.oblock = e->oblock; + work.cblock = infer_cblock(mq, e); + btracker_queue(mq->bg_work, &work, NULL); +} + +static void queue_promotion(struct smq_policy *mq, dm_oblock_t oblock, + struct policy_work **workp) +{ + struct entry *e; + struct policy_work work; + + if (!mq->migrations_allowed) + return; + + if (allocator_empty(&mq->cache_alloc)) { + if (!free_target_met(mq, false)) + queue_demotion(mq); + return; + } + + if (btracker_promotion_already_present(mq->bg_work, oblock)) + return; + + /* + * We allocate the entry now to reserve the cblock. If the + * background work is aborted we must remember to free it. + */ + e = alloc_entry(&mq->cache_alloc); + BUG_ON(!e); + e->pending_work = true; + work.op = POLICY_PROMOTE; + work.oblock = oblock; + work.cblock = infer_cblock(mq, e); + btracker_queue(mq->bg_work, &work, workp); +} + +/*----------------------------------------------------------------*/ + enum promote_result { PROMOTE_NOT, PROMOTE_TEMPORARY, @@ -1137,49 +1251,18 @@ static enum promote_result maybe_promote(bool promote) return promote ? 
PROMOTE_PERMANENT : PROMOTE_NOT; } -static enum promote_result should_promote(struct smq_policy *mq, struct entry *hs_e, struct bio *bio, - bool fast_promote) +static enum promote_result should_promote(struct smq_policy *mq, struct entry *hs_e, + int data_dir, bool fast_promote) { - if (bio_data_dir(bio) == WRITE) { + if (data_dir == WRITE) { if (!allocator_empty(&mq->cache_alloc) && fast_promote) return PROMOTE_TEMPORARY; - else - return maybe_promote(hs_e->level >= mq->write_promote_level); + return maybe_promote(hs_e->level >= mq->write_promote_level); } else return maybe_promote(hs_e->level >= mq->read_promote_level); } -static void insert_in_cache(struct smq_policy *mq, dm_oblock_t oblock, - struct policy_locker *locker, - struct policy_result *result, enum promote_result pr) -{ - int r; - struct entry *e; - - if (allocator_empty(&mq->cache_alloc)) { - result->op = POLICY_REPLACE; - r = demote_cblock(mq, locker, &result->old_oblock); - if (r) { - result->op = POLICY_MISS; - return; - } - - } else - result->op = POLICY_NEW; - - e = alloc_entry(&mq->cache_alloc); - BUG_ON(!e); - e->oblock = oblock; - - if (pr == PROMOTE_TEMPORARY) - push(mq, e); - else - push_new(mq, e); - - result->cblock = infer_cblock(mq, e); -} - static dm_oblock_t to_hblock(struct smq_policy *mq, dm_oblock_t b) { sector_t r = from_oblock(b); @@ -1187,7 +1270,7 @@ static dm_oblock_t to_hblock(struct smq_policy *mq, dm_oblock_t b) return to_oblock(r); } -static struct entry *update_hotspot_queue(struct smq_policy *mq, dm_oblock_t b, struct bio *bio) +static struct entry *update_hotspot_queue(struct smq_policy *mq, dm_oblock_t b) { unsigned hi; dm_oblock_t hb = to_hblock(mq, b); @@ -1199,7 +1282,8 @@ static struct entry *update_hotspot_queue(struct smq_policy *mq, dm_oblock_t b, hi = get_index(&mq->hotspot_alloc, e); q_requeue(&mq->hotspot, e, test_and_set_bit(hi, mq->hotspot_hit_bits) ? 
- 0u : mq->hotspot_level_jump); + 0u : mq->hotspot_level_jump, + NULL, NULL); } else { stats_miss(&mq->hotspot_stats); @@ -1225,47 +1309,6 @@ static struct entry *update_hotspot_queue(struct smq_policy *mq, dm_oblock_t b, return e; } -/* - * Looks the oblock up in the hash table, then decides whether to put in - * pre_cache, or cache etc. - */ -static int map(struct smq_policy *mq, struct bio *bio, dm_oblock_t oblock, - bool can_migrate, bool fast_promote, - struct policy_locker *locker, struct policy_result *result) -{ - struct entry *e, *hs_e; - enum promote_result pr; - - hs_e = update_hotspot_queue(mq, oblock, bio); - - e = h_lookup(&mq->table, oblock); - if (e) { - stats_level_accessed(&mq->cache_stats, e->level); - - requeue(mq, e); - result->op = POLICY_HIT; - result->cblock = infer_cblock(mq, e); - - } else { - stats_miss(&mq->cache_stats); - - pr = should_promote(mq, hs_e, bio, fast_promote); - if (pr == PROMOTE_NOT) - result->op = POLICY_MISS; - - else { - if (!can_migrate) { - result->op = POLICY_MISS; - return -EWOULDBLOCK; - } - - insert_in_cache(mq, oblock, locker, result, pr); - } - } - - return 0; -} - /*----------------------------------------------------------------*/ /* @@ -1282,6 +1325,7 @@ static void smq_destroy(struct dm_cache_policy *p) { struct smq_policy *mq = to_smq_policy(p); + btracker_destroy(mq->bg_work); h_exit(&mq->hotspot_table); h_exit(&mq->table); free_bitset(mq->hotspot_hit_bits); @@ -1290,234 +1334,247 @@ static void smq_destroy(struct dm_cache_policy *p) kfree(mq); } -static int smq_map(struct dm_cache_policy *p, dm_oblock_t oblock, - bool can_block, bool can_migrate, bool fast_promote, - struct bio *bio, struct policy_locker *locker, - struct policy_result *result) -{ - int r; - unsigned long flags; - struct smq_policy *mq = to_smq_policy(p); - - result->op = POLICY_MISS; - - spin_lock_irqsave(&mq->lock, flags); - r = map(mq, bio, oblock, can_migrate, fast_promote, locker, result); - spin_unlock_irqrestore(&mq->lock, flags); 
- - return r; -} +/*----------------------------------------------------------------*/ -static int smq_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock) +static int __lookup(struct smq_policy *mq, dm_oblock_t oblock, dm_cblock_t *cblock, + int data_dir, bool fast_copy, + struct policy_work **work, bool *background_work) { - int r; - unsigned long flags; - struct smq_policy *mq = to_smq_policy(p); - struct entry *e; + struct entry *e, *hs_e; + enum promote_result pr; + + *background_work = false; - spin_lock_irqsave(&mq->lock, flags); e = h_lookup(&mq->table, oblock); if (e) { + stats_level_accessed(&mq->cache_stats, e->level); + + requeue(mq, e); *cblock = infer_cblock(mq, e); - r = 0; - } else - r = -ENOENT; - spin_unlock_irqrestore(&mq->lock, flags); + return 0; - return r; -} + } else { + stats_miss(&mq->cache_stats); -static void __smq_set_clear_dirty(struct smq_policy *mq, dm_oblock_t oblock, bool set) -{ - struct entry *e; + /* + * The hotspot queue only gets updated with misses. 
+ */ + hs_e = update_hotspot_queue(mq, oblock); - e = h_lookup(&mq->table, oblock); - BUG_ON(!e); + pr = should_promote(mq, hs_e, data_dir, fast_copy); + if (pr != PROMOTE_NOT) { + queue_promotion(mq, oblock, work); + *background_work = true; + } - del(mq, e); - e->dirty = set; - push(mq, e); + return -ENOENT; + } } -static void smq_set_dirty(struct dm_cache_policy *p, dm_oblock_t oblock) +static int smq_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock, + int data_dir, bool fast_copy, + bool *background_work) { + int r; unsigned long flags; struct smq_policy *mq = to_smq_policy(p); spin_lock_irqsave(&mq->lock, flags); - __smq_set_clear_dirty(mq, oblock, true); + r = __lookup(mq, oblock, cblock, + data_dir, fast_copy, + NULL, background_work); spin_unlock_irqrestore(&mq->lock, flags); + + return r; } -static void smq_clear_dirty(struct dm_cache_policy *p, dm_oblock_t oblock) +static int smq_lookup_with_work(struct dm_cache_policy *p, + dm_oblock_t oblock, dm_cblock_t *cblock, + int data_dir, bool fast_copy, + struct policy_work **work) { - struct smq_policy *mq = to_smq_policy(p); + int r; + bool background_queued; unsigned long flags; + struct smq_policy *mq = to_smq_policy(p); spin_lock_irqsave(&mq->lock, flags); - __smq_set_clear_dirty(mq, oblock, false); + r = __lookup(mq, oblock, cblock, data_dir, fast_copy, work, &background_queued); spin_unlock_irqrestore(&mq->lock, flags); -} -static unsigned random_level(dm_cblock_t cblock) -{ - return hash_32(from_cblock(cblock), 9) & (NR_CACHE_LEVELS - 1); + return r; } -static int smq_load_mapping(struct dm_cache_policy *p, - dm_oblock_t oblock, dm_cblock_t cblock, - uint32_t hint, bool hint_valid) +static int smq_get_background_work(struct dm_cache_policy *p, bool idle, + struct policy_work **result) { + int r; + unsigned long flags; struct smq_policy *mq = to_smq_policy(p); - struct entry *e; - e = alloc_particular_entry(&mq->cache_alloc, from_cblock(cblock)); - e->oblock = oblock; - e->dirty = 
false; /* this gets corrected in a minute */ - e->level = hint_valid ? min(hint, NR_CACHE_LEVELS - 1) : random_level(cblock); - push(mq, e); - - return 0; -} + spin_lock_irqsave(&mq->lock, flags); + r = btracker_issue(mq->bg_work, result); + if (r == -ENODATA) { + /* find some writeback work to do */ + if (mq->migrations_allowed && !free_target_met(mq, idle)) + queue_demotion(mq); -static uint32_t smq_get_hint(struct dm_cache_policy *p, dm_cblock_t cblock) -{ - struct smq_policy *mq = to_smq_policy(p); - struct entry *e = get_entry(&mq->cache_alloc, from_cblock(cblock)); + else if (!clean_target_met(mq, idle)) + queue_writeback(mq); - if (!e->allocated) - return 0; + r = btracker_issue(mq->bg_work, result); + } + spin_unlock_irqrestore(&mq->lock, flags); - return e->level; + return r; } -static void __remove_mapping(struct smq_policy *mq, dm_oblock_t oblock) -{ - struct entry *e; +/* + * We need to clear any pending work flags that have been set, and in the + * case of promotion free the entry for the destination cblock. 
+ */ +static void __complete_background_work(struct smq_policy *mq, + struct policy_work *work, + bool success) +{ + struct entry *e = get_entry(&mq->cache_alloc, + from_cblock(work->cblock)); + + switch (work->op) { + case POLICY_PROMOTE: + // !h, !q, a + clear_pending(mq, e); + if (success) { + e->oblock = work->oblock; + push(mq, e); + // h, q, a + } else { + free_entry(&mq->cache_alloc, e); + // !h, !q, !a + } + break; - e = h_lookup(&mq->table, oblock); - BUG_ON(!e); + case POLICY_DEMOTE: + // h, !q, a + if (success) { + h_remove(&mq->table, e); + free_entry(&mq->cache_alloc, e); + // !h, !q, !a + } else { + clear_pending(mq, e); + push_queue(mq, e); + // h, q, a + } + break; - del(mq, e); - free_entry(&mq->cache_alloc, e); + case POLICY_WRITEBACK: + // h, !q, a + clear_pending(mq, e); + push_queue(mq, e); + // h, q, a + break; + } + + btracker_complete(mq->bg_work, work); } -static void smq_remove_mapping(struct dm_cache_policy *p, dm_oblock_t oblock) +static void smq_complete_background_work(struct dm_cache_policy *p, + struct policy_work *work, + bool success) { - struct smq_policy *mq = to_smq_policy(p); unsigned long flags; + struct smq_policy *mq = to_smq_policy(p); spin_lock_irqsave(&mq->lock, flags); - __remove_mapping(mq, oblock); + __complete_background_work(mq, work, success); spin_unlock_irqrestore(&mq->lock, flags); } -static int __remove_cblock(struct smq_policy *mq, dm_cblock_t cblock) +// in_hash(oblock) -> in_hash(oblock) +static void __smq_set_clear_dirty(struct smq_policy *mq, dm_cblock_t cblock, bool set) { struct entry *e = get_entry(&mq->cache_alloc, from_cblock(cblock)); - if (!e || !e->allocated) - return -ENODATA; - - del(mq, e); - free_entry(&mq->cache_alloc, e); - - return 0; + if (e->pending_work) + e->dirty = set; + else { + del_queue(mq, e); + e->dirty = set; + push_queue(mq, e); + } } -static int smq_remove_cblock(struct dm_cache_policy *p, dm_cblock_t cblock) +static void smq_set_dirty(struct dm_cache_policy *p, dm_cblock_t 
cblock) { - int r; unsigned long flags; struct smq_policy *mq = to_smq_policy(p); spin_lock_irqsave(&mq->lock, flags); - r = __remove_cblock(mq, cblock); + __smq_set_clear_dirty(mq, cblock, true); spin_unlock_irqrestore(&mq->lock, flags); - - return r; } - -#define CLEAN_TARGET_CRITICAL 5u /* percent */ - -static bool clean_target_met(struct smq_policy *mq, bool critical) +static void smq_clear_dirty(struct dm_cache_policy *p, dm_cblock_t cblock) { - if (critical) { - /* - * Cache entries may not be populated. So we're cannot rely on the - * size of the clean queue. - */ - unsigned nr_clean = from_cblock(mq->cache_size) - q_size(&mq->dirty); - unsigned target = from_cblock(mq->cache_size) * CLEAN_TARGET_CRITICAL / 100u; + struct smq_policy *mq = to_smq_policy(p); + unsigned long flags; - return nr_clean >= target; - } else - return !q_size(&mq->dirty); + spin_lock_irqsave(&mq->lock, flags); + __smq_set_clear_dirty(mq, cblock, false); + spin_unlock_irqrestore(&mq->lock, flags); } -static int __smq_writeback_work(struct smq_policy *mq, dm_oblock_t *oblock, - dm_cblock_t *cblock, bool critical_only) +static unsigned random_level(dm_cblock_t cblock) { - struct entry *e = NULL; - bool target_met = clean_target_met(mq, critical_only); - - if (critical_only) - /* - * Always try and keep the bottom level clean. - */ - e = pop_old(mq, &mq->dirty, target_met ? 1u : mq->dirty.nr_levels); + return hash_32(from_cblock(cblock), 9) & (NR_CACHE_LEVELS - 1); +} - else - e = pop_old(mq, &mq->dirty, mq->dirty.nr_levels); +static int smq_load_mapping(struct dm_cache_policy *p, + dm_oblock_t oblock, dm_cblock_t cblock, + bool dirty, uint32_t hint, bool hint_valid) +{ + struct smq_policy *mq = to_smq_policy(p); + struct entry *e; - if (!e) - return -ENODATA; + e = alloc_particular_entry(&mq->cache_alloc, from_cblock(cblock)); + e->oblock = oblock; + e->dirty = dirty; + e->level = hint_valid ? 
min(hint, NR_CACHE_LEVELS - 1) : random_level(cblock); + e->pending_work = false; - *oblock = e->oblock; - *cblock = infer_cblock(mq, e); - e->dirty = false; - push_new(mq, e); + /* + * When we load mappings we push ahead of both sentinels in order to + * allow demotions and cleaning to occur immediately. + */ + push_front(mq, e); return 0; } -static int smq_writeback_work(struct dm_cache_policy *p, dm_oblock_t *oblock, - dm_cblock_t *cblock, bool critical_only) +static int smq_invalidate_mapping(struct dm_cache_policy *p, dm_cblock_t cblock) { - int r; - unsigned long flags; struct smq_policy *mq = to_smq_policy(p); + struct entry *e = get_entry(&mq->cache_alloc, from_cblock(cblock)); - spin_lock_irqsave(&mq->lock, flags); - r = __smq_writeback_work(mq, oblock, cblock, critical_only); - spin_unlock_irqrestore(&mq->lock, flags); - - return r; -} - -static void __force_mapping(struct smq_policy *mq, - dm_oblock_t current_oblock, dm_oblock_t new_oblock) -{ - struct entry *e = h_lookup(&mq->table, current_oblock); + if (!e->allocated) + return -ENODATA; - if (e) { - del(mq, e); - e->oblock = new_oblock; - e->dirty = true; - push(mq, e); - } + // FIXME: what if this block has pending background work? 
+ del_queue(mq, e); + h_remove(&mq->table, e); + free_entry(&mq->cache_alloc, e); + return 0; } -static void smq_force_mapping(struct dm_cache_policy *p, - dm_oblock_t current_oblock, dm_oblock_t new_oblock) +static uint32_t smq_get_hint(struct dm_cache_policy *p, dm_cblock_t cblock) { - unsigned long flags; struct smq_policy *mq = to_smq_policy(p); + struct entry *e = get_entry(&mq->cache_alloc, from_cblock(cblock)); - spin_lock_irqsave(&mq->lock, flags); - __force_mapping(mq, current_oblock, new_oblock); - spin_unlock_irqrestore(&mq->lock, flags); + if (!e->allocated) + return 0; + + return e->level; } static dm_cblock_t smq_residency(struct dm_cache_policy *p) @@ -1546,6 +1603,12 @@ static void smq_tick(struct dm_cache_policy *p, bool can_block) spin_unlock_irqrestore(&mq->lock, flags); } +static void smq_allow_migrations(struct dm_cache_policy *p, bool allow) +{ + struct smq_policy *mq = to_smq_policy(p); + mq->migrations_allowed = allow; +} + /* * smq has no config values, but the old mq policy did. 
To avoid breaking * software we continue to accept these configurables for the mq policy, @@ -1590,18 +1653,18 @@ static int mq_emit_config_values(struct dm_cache_policy *p, char *result, static void init_policy_functions(struct smq_policy *mq, bool mimic_mq) { mq->policy.destroy = smq_destroy; - mq->policy.map = smq_map; mq->policy.lookup = smq_lookup; + mq->policy.lookup_with_work = smq_lookup_with_work; + mq->policy.get_background_work = smq_get_background_work; + mq->policy.complete_background_work = smq_complete_background_work; mq->policy.set_dirty = smq_set_dirty; mq->policy.clear_dirty = smq_clear_dirty; mq->policy.load_mapping = smq_load_mapping; + mq->policy.invalidate_mapping = smq_invalidate_mapping; mq->policy.get_hint = smq_get_hint; - mq->policy.remove_mapping = smq_remove_mapping; - mq->policy.remove_cblock = smq_remove_cblock; - mq->policy.writeback_work = smq_writeback_work; - mq->policy.force_mapping = smq_force_mapping; mq->policy.residency = smq_residency; mq->policy.tick = smq_tick; + mq->policy.allow_migrations = smq_allow_migrations; if (mimic_mq) { mq->policy.set_config_value = mq_set_config_value; @@ -1633,7 +1696,8 @@ static void calc_hotspot_params(sector_t origin_size, static struct dm_cache_policy *__smq_create(dm_cblock_t cache_size, sector_t origin_size, sector_t cache_block_size, - bool mimic_mq) + bool mimic_mq, + bool migrations_allowed) { unsigned i; unsigned nr_sentinels_per_queue = 2u * NR_CACHE_LEVELS; @@ -1658,11 +1722,11 @@ static struct dm_cache_policy *__smq_create(dm_cblock_t cache_size, } init_allocator(&mq->writeback_sentinel_alloc, &mq->es, 0, nr_sentinels_per_queue); - for (i = 0; i < nr_sentinels_per_queue; i++) + for (i = 0; i < nr_sentinels_per_queue; i++) get_entry(&mq->writeback_sentinel_alloc, i)->sentinel = true; init_allocator(&mq->demote_sentinel_alloc, &mq->es, nr_sentinels_per_queue, total_sentinels); - for (i = 0; i < nr_sentinels_per_queue; i++) + for (i = 0; i < nr_sentinels_per_queue; i++) 
get_entry(&mq->demote_sentinel_alloc, i)->sentinel = true; init_allocator(&mq->hotspot_alloc, &mq->es, total_sentinels, @@ -1715,8 +1779,16 @@ static struct dm_cache_policy *__smq_create(dm_cblock_t cache_size, mq->next_hotspot_period = jiffies; mq->next_cache_period = jiffies; + mq->bg_work = btracker_create(10240); /* FIXME: hard coded value */ + if (!mq->bg_work) + goto bad_btracker; + + mq->migrations_allowed = migrations_allowed; + return &mq->policy; +bad_btracker: + h_exit(&mq->hotspot_table); bad_alloc_hotspot_table: h_exit(&mq->table); bad_alloc_table: @@ -1735,21 +1807,28 @@ static struct dm_cache_policy *smq_create(dm_cblock_t cache_size, sector_t origin_size, sector_t cache_block_size) { - return __smq_create(cache_size, origin_size, cache_block_size, false); + return __smq_create(cache_size, origin_size, cache_block_size, false, true); } static struct dm_cache_policy *mq_create(dm_cblock_t cache_size, sector_t origin_size, sector_t cache_block_size) { - return __smq_create(cache_size, origin_size, cache_block_size, true); + return __smq_create(cache_size, origin_size, cache_block_size, true, true); +} + +static struct dm_cache_policy *cleaner_create(dm_cblock_t cache_size, + sector_t origin_size, + sector_t cache_block_size) +{ + return __smq_create(cache_size, origin_size, cache_block_size, false, false); } /*----------------------------------------------------------------*/ static struct dm_cache_policy_type smq_policy_type = { .name = "smq", - .version = {1, 5, 0}, + .version = {2, 0, 0}, .hint_size = 4, .owner = THIS_MODULE, .create = smq_create @@ -1757,15 +1836,23 @@ static struct dm_cache_policy_type smq_policy_type = { static struct dm_cache_policy_type mq_policy_type = { .name = "mq", - .version = {1, 5, 0}, + .version = {2, 0, 0}, .hint_size = 4, .owner = THIS_MODULE, .create = mq_create, }; +static struct dm_cache_policy_type cleaner_policy_type = { + .name = "cleaner", + .version = {2, 0, 0}, + .hint_size = 4, + .owner = THIS_MODULE, + 
.create = cleaner_create, +}; + static struct dm_cache_policy_type default_policy_type = { .name = "default", - .version = {1, 5, 0}, + .version = {2, 0, 0}, .hint_size = 4, .owner = THIS_MODULE, .create = smq_create, @@ -1785,23 +1872,36 @@ static int __init smq_init(void) r = dm_cache_policy_register(&mq_policy_type); if (r) { DMERR("register failed (as mq) %d", r); - dm_cache_policy_unregister(&smq_policy_type); - return -ENOMEM; + goto out_mq; + } + + r = dm_cache_policy_register(&cleaner_policy_type); + if (r) { + DMERR("register failed (as cleaner) %d", r); + goto out_cleaner; } r = dm_cache_policy_register(&default_policy_type); if (r) { DMERR("register failed (as default) %d", r); - dm_cache_policy_unregister(&mq_policy_type); - dm_cache_policy_unregister(&smq_policy_type); - return -ENOMEM; + goto out_default; } return 0; + +out_default: + dm_cache_policy_unregister(&cleaner_policy_type); +out_cleaner: + dm_cache_policy_unregister(&mq_policy_type); +out_mq: + dm_cache_policy_unregister(&smq_policy_type); + + return -ENOMEM; } static void __exit smq_exit(void) { + dm_cache_policy_unregister(&cleaner_policy_type); dm_cache_policy_unregister(&smq_policy_type); dm_cache_policy_unregister(&mq_policy_type); dm_cache_policy_unregister(&default_policy_type); @@ -1816,3 +1916,4 @@ MODULE_DESCRIPTION("smq cache policy"); MODULE_ALIAS("dm-cache-default"); MODULE_ALIAS("dm-cache-mq"); +MODULE_ALIAS("dm-cache-cleaner"); diff --git a/drivers/md/dm-cache-policy.h b/drivers/md/dm-cache-policy.h index aa10b1493f34..c05fc3436cef 100644 --- a/drivers/md/dm-cache-policy.h +++ b/drivers/md/dm-cache-policy.h @@ -13,183 +13,100 @@ /*----------------------------------------------------------------*/ -/* FIXME: make it clear which methods are optional. Get debug policy to - * double check this at start. - */ - /* * The cache policy makes the important decisions about which blocks get to * live on the faster cache device. 
- * - * When the core target has to remap a bio it calls the 'map' method of the - * policy. This returns an instruction telling the core target what to do. - * - * POLICY_HIT: - * That block is in the cache. Remap to the cache and carry on. - * - * POLICY_MISS: - * This block is on the origin device. Remap and carry on. - * - * POLICY_NEW: - * This block is currently on the origin device, but the policy wants to - * move it. The core should: - * - * - hold any further io to this origin block - * - copy the origin to the given cache block - * - release all the held blocks - * - remap the original block to the cache - * - * POLICY_REPLACE: - * This block is currently on the origin device. The policy wants to - * move it to the cache, with the added complication that the destination - * cache block needs a writeback first. The core should: - * - * - hold any further io to this origin block - * - hold any further io to the origin block that's being written back - * - writeback - * - copy new block to cache - * - release held blocks - * - remap bio to cache and reissue. - * - * Should the core run into trouble while processing a POLICY_NEW or - * POLICY_REPLACE instruction it will roll back the policies mapping using - * remove_mapping() or force_mapping(). These methods must not fail. This - * approach avoids having transactional semantics in the policy (ie, the - * core informing the policy when a migration is complete), and hence makes - * it easier to write new policies. - * - * In general policy methods should never block, except in the case of the - * map function when can_migrate is set. So be careful to implement using - * bounded, preallocated memory. */ enum policy_operation { - POLICY_HIT, - POLICY_MISS, - POLICY_NEW, - POLICY_REPLACE -}; - -/* - * When issuing a POLICY_REPLACE the policy needs to make a callback to - * lock the block being demoted. This doesn't need to occur during a - * writeback operation since the block remains in the cache. 
- */ -struct policy_locker; -typedef int (*policy_lock_fn)(struct policy_locker *l, dm_oblock_t oblock); - -struct policy_locker { - policy_lock_fn fn; + POLICY_PROMOTE, + POLICY_DEMOTE, + POLICY_WRITEBACK }; /* * This is the instruction passed back to the core target. */ -struct policy_result { +struct policy_work { enum policy_operation op; - dm_oblock_t old_oblock; /* POLICY_REPLACE */ - dm_cblock_t cblock; /* POLICY_HIT, POLICY_NEW, POLICY_REPLACE */ + dm_oblock_t oblock; + dm_cblock_t cblock; }; /* - * The cache policy object. Just a bunch of methods. It is envisaged that - * this structure will be embedded in a bigger, policy specific structure - * (ie. use container_of()). + * The cache policy object. It is envisaged that this structure will be + * embedded in a bigger, policy specific structure (ie. use container_of()). */ struct dm_cache_policy { - - /* - * FIXME: make it clear which methods are optional, and which may - * block. - */ - /* * Destroys this object. */ void (*destroy)(struct dm_cache_policy *p); /* - * See large comment above. - * - * oblock - the origin block we're interested in. - * - * can_block - indicates whether the current thread is allowed to - * block. -EWOULDBLOCK returned if it can't and would. - * - * can_migrate - gives permission for POLICY_NEW or POLICY_REPLACE - * instructions. If denied and the policy would have - * returned one of these instructions it should - * return -EWOULDBLOCK. + * Find the location of a block. * - * discarded_oblock - indicates whether the whole origin block is - * in a discarded state (FIXME: better to tell the - * policy about this sooner, so it can recycle that - * cache block if it wants.) - * bio - the bio that triggered this call. - * result - gets filled in with the instruction. + * Must not block. * - * May only return 0, or -EWOULDBLOCK (if !can_migrate) + * Returns 0 if in cache (cblock will be set), -ENOENT if not, < 0 for + * other errors (-EWOULDBLOCK would be typical). 
data_dir should be + * READ or WRITE. fast_copy should be set if migrating this block would + * be 'cheap' somehow (eg, discarded data). background_queued will be set + * if a migration has just been queued. */ - int (*map)(struct dm_cache_policy *p, dm_oblock_t oblock, - bool can_block, bool can_migrate, bool discarded_oblock, - struct bio *bio, struct policy_locker *locker, - struct policy_result *result); + int (*lookup)(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock, + int data_dir, bool fast_copy, bool *background_queued); /* - * Sometimes we want to see if a block is in the cache, without - * triggering any update of stats. (ie. it's not a real hit). - * - * Must not block. + * Sometimes the core target can optimise a migration, eg, the + * block may be discarded, or the bio may cover an entire block. + * In order to optimise it needs the migration immediately though + * so it knows to do something different with the bio. * - * Returns 0 if in cache, -ENOENT if not, < 0 for other errors - * (-EWOULDBLOCK would be typical). + * This method is optional (policy-internal will fallback to using + * lookup). */ - int (*lookup)(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock); - - void (*set_dirty)(struct dm_cache_policy *p, dm_oblock_t oblock); - void (*clear_dirty)(struct dm_cache_policy *p, dm_oblock_t oblock); + int (*lookup_with_work)(struct dm_cache_policy *p, + dm_oblock_t oblock, dm_cblock_t *cblock, + int data_dir, bool fast_copy, + struct policy_work **work); /* - * Called when a cache target is first created. Used to load a - * mapping from the metadata device into the policy. + * Retrieves background work. Returns -ENODATA when there's no + * background work. 
*/ - int (*load_mapping)(struct dm_cache_policy *p, dm_oblock_t oblock, - dm_cblock_t cblock, uint32_t hint, bool hint_valid); + int (*get_background_work)(struct dm_cache_policy *p, bool idle, + struct policy_work **result); /* - * Gets the hint for a given cblock. Called in a single threaded - * context. So no locking required. + * You must pass in the same work pointer that you were given, not + * a copy. */ - uint32_t (*get_hint)(struct dm_cache_policy *p, dm_cblock_t cblock); + void (*complete_background_work)(struct dm_cache_policy *p, + struct policy_work *work, + bool success); + + void (*set_dirty)(struct dm_cache_policy *p, dm_cblock_t cblock); + void (*clear_dirty)(struct dm_cache_policy *p, dm_cblock_t cblock); /* - * Override functions used on the error paths of the core target. - * They must succeed. + * Called when a cache target is first created. Used to load a + * mapping from the metadata device into the policy. */ - void (*remove_mapping)(struct dm_cache_policy *p, dm_oblock_t oblock); - void (*force_mapping)(struct dm_cache_policy *p, dm_oblock_t current_oblock, - dm_oblock_t new_oblock); + int (*load_mapping)(struct dm_cache_policy *p, dm_oblock_t oblock, + dm_cblock_t cblock, bool dirty, + uint32_t hint, bool hint_valid); /* - * This is called via the invalidate_cblocks message. It is - * possible the particular cblock has already been removed due to a - * write io in passthrough mode. In which case this should return - * -ENODATA. + * Drops the mapping, irrespective of whether it's clean or dirty. + * Returns -ENODATA if cblock is not mapped. */ - int (*remove_cblock)(struct dm_cache_policy *p, dm_cblock_t cblock); + int (*invalidate_mapping)(struct dm_cache_policy *p, dm_cblock_t cblock); /* - * Provide a dirty block to be written back by the core target. If - * critical_only is set then the policy should only provide work if - * it urgently needs it. 
- * - * Returns: - * - * 0 and @cblock,@oblock: block to write back provided - * - * -ENODATA: no dirty blocks available + * Gets the hint for a given cblock. Called in a single threaded + * context. So no locking required. */ - int (*writeback_work)(struct dm_cache_policy *p, dm_oblock_t *oblock, dm_cblock_t *cblock, - bool critical_only); + uint32_t (*get_hint)(struct dm_cache_policy *p, dm_cblock_t cblock); /* * How full is the cache? @@ -202,6 +119,8 @@ struct dm_cache_policy { * queue merging has occurred). To stop the policy being fooled by * these, the core target sends regular tick() calls to the policy. * The policy should only count an entry as hit once per tick. + * + * This method is optional. */ void (*tick)(struct dm_cache_policy *p, bool can_block); @@ -213,6 +132,8 @@ struct dm_cache_policy { int (*set_config_value)(struct dm_cache_policy *p, const char *key, const char *value); + void (*allow_migrations)(struct dm_cache_policy *p, bool allow); + /* * Book keeping ptr for the policy register, not for general use. 
*/ diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 2eaa414e1509..b7de289a10bb 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -5,7 +5,7 @@ */ #include "dm.h" -#include "dm-bio-prison-v1.h" +#include "dm-bio-prison-v2.h" #include "dm-bio-record.h" #include "dm-cache-metadata.h" @@ -15,6 +15,7 @@ #include <linux/init.h> #include <linux/mempool.h> #include <linux/module.h> +#include <linux/rwsem.h> #include <linux/slab.h> #include <linux/vmalloc.h> @@ -25,7 +26,18 @@ DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(cache_copy_throttle, /*----------------------------------------------------------------*/ -#define IOT_RESOLUTION 4 +/* + * Glossary: + * + * oblock: index of an origin block + * cblock: index of a cache block + * promotion: movement of a block from origin to cache + * demotion: movement of a block from cache to origin + * migration: movement of a block between the origin and cache device, + * either direction + */ + +/*----------------------------------------------------------------*/ struct io_tracker { spinlock_t lock; @@ -99,19 +111,178 @@ static void iot_io_end(struct io_tracker *iot, sector_t len) /*----------------------------------------------------------------*/ /* - * Glossary: - * - * oblock: index of an origin block - * cblock: index of a cache block - * promotion: movement of a block from origin to cache - * demotion: movement of a block from cache to origin - * migration: movement of a block between the origin and cache device, - * either direction + * Represents a chunk of future work. 'input' allows continuations to pass + * values between themselves, typically error values. 
*/ +struct continuation { + struct work_struct ws; + int input; +}; + +static inline void init_continuation(struct continuation *k, + void (*fn)(struct work_struct *)) +{ + INIT_WORK(&k->ws, fn); + k->input = 0; +} + +static inline void queue_continuation(struct workqueue_struct *wq, + struct continuation *k) +{ + queue_work(wq, &k->ws); +} /*----------------------------------------------------------------*/ /* + * The batcher collects together pieces of work that need a particular + * operation to occur before they can proceed (typically a commit). + */ +struct batcher { + /* + * The operation that everyone is waiting for. + */ + int (*commit_op)(void *context); + void *commit_context; + + /* + * This is how bios should be issued once the commit op is complete + * (accounted_request). + */ + void (*issue_op)(struct bio *bio, void *context); + void *issue_context; + + /* + * Queued work gets put on here after commit. + */ + struct workqueue_struct *wq; + + spinlock_t lock; + struct list_head work_items; + struct bio_list bios; + struct work_struct commit_work; + + bool commit_scheduled; +}; + +static void __commit(struct work_struct *_ws) +{ + struct batcher *b = container_of(_ws, struct batcher, commit_work); + + int r; + unsigned long flags; + struct list_head work_items; + struct work_struct *ws, *tmp; + struct continuation *k; + struct bio *bio; + struct bio_list bios; + + INIT_LIST_HEAD(&work_items); + bio_list_init(&bios); + + /* + * We have to grab these before the commit_op to avoid a race + * condition. 
+ */ + spin_lock_irqsave(&b->lock, flags); + list_splice_init(&b->work_items, &work_items); + bio_list_merge(&bios, &b->bios); + bio_list_init(&b->bios); + b->commit_scheduled = false; + spin_unlock_irqrestore(&b->lock, flags); + + r = b->commit_op(b->commit_context); + + list_for_each_entry_safe(ws, tmp, &work_items, entry) { + k = container_of(ws, struct continuation, ws); + k->input = r; + INIT_LIST_HEAD(&ws->entry); /* to avoid a WARN_ON */ + queue_work(b->wq, ws); + } + + while ((bio = bio_list_pop(&bios))) { + if (r) { + bio->bi_error = r; + bio_endio(bio); + } else + b->issue_op(bio, b->issue_context); + } +} + +static void batcher_init(struct batcher *b, + int (*commit_op)(void *), + void *commit_context, + void (*issue_op)(struct bio *bio, void *), + void *issue_context, + struct workqueue_struct *wq) +{ + b->commit_op = commit_op; + b->commit_context = commit_context; + b->issue_op = issue_op; + b->issue_context = issue_context; + b->wq = wq; + + spin_lock_init(&b->lock); + INIT_LIST_HEAD(&b->work_items); + bio_list_init(&b->bios); + INIT_WORK(&b->commit_work, __commit); + b->commit_scheduled = false; +} + +static void async_commit(struct batcher *b) +{ + queue_work(b->wq, &b->commit_work); +} + +static void continue_after_commit(struct batcher *b, struct continuation *k) +{ + unsigned long flags; + bool commit_scheduled; + + spin_lock_irqsave(&b->lock, flags); + commit_scheduled = b->commit_scheduled; + list_add_tail(&k->ws.entry, &b->work_items); + spin_unlock_irqrestore(&b->lock, flags); + + if (commit_scheduled) + async_commit(b); +} + +/* + * Bios are errored if commit failed. 
+ */ +static void issue_after_commit(struct batcher *b, struct bio *bio) +{ + unsigned long flags; + bool commit_scheduled; + + spin_lock_irqsave(&b->lock, flags); + commit_scheduled = b->commit_scheduled; + bio_list_add(&b->bios, bio); + spin_unlock_irqrestore(&b->lock, flags); + + if (commit_scheduled) + async_commit(b); +} + +/* + * Call this if some urgent work is waiting for the commit to complete. + */ +static void schedule_commit(struct batcher *b) +{ + bool immediate; + unsigned long flags; + + spin_lock_irqsave(&b->lock, flags); + immediate = !list_empty(&b->work_items) || !bio_list_empty(&b->bios); + b->commit_scheduled = true; + spin_unlock_irqrestore(&b->lock, flags); + + if (immediate) + async_commit(b); +} + +/* * There are a couple of places where we let a bio run, but want to do some * work before calling its endio function. We do this by temporarily * changing the endio fn. @@ -189,31 +360,13 @@ struct cache_stats { atomic_t write_miss; atomic_t demotion; atomic_t promotion; + atomic_t writeback; atomic_t copies_avoided; atomic_t cache_cell_clash; atomic_t commit_count; atomic_t discard_count; }; -/* - * Defines a range of cblocks, begin to (end - 1) are in the range. end is - * the one-past-the-end value. 
- */ -struct cblock_range { - dm_cblock_t begin; - dm_cblock_t end; -}; - -struct invalidation_request { - struct list_head list; - struct cblock_range *cblocks; - - atomic_t complete; - int err; - - wait_queue_head_t result_wait; -}; - struct cache { struct dm_target *ti; struct dm_target_callbacks callbacks; @@ -255,11 +408,7 @@ struct cache { spinlock_t lock; struct list_head deferred_cells; struct bio_list deferred_bios; - struct bio_list deferred_flush_bios; struct bio_list deferred_writethrough_bios; - struct list_head quiesced_migrations; - struct list_head completed_migrations; - struct list_head need_commit_migrations; sector_t migration_threshold; wait_queue_head_t migration_wait; atomic_t nr_allocated_migrations; @@ -270,9 +419,7 @@ struct cache { */ atomic_t nr_io_migrations; - wait_queue_head_t quiescing_wait; - atomic_t quiescing; - atomic_t quiescing_ack; + struct rw_semaphore quiesce_lock; /* * cache_size entries, dirty if set @@ -296,13 +443,11 @@ struct cache { struct dm_kcopyd_client *copier; struct workqueue_struct *wq; - struct work_struct worker; - + struct work_struct deferred_bio_worker; + struct work_struct deferred_writethrough_worker; + struct work_struct migration_worker; struct delayed_work waker; - unsigned long last_commit_jiffies; - - struct dm_bio_prison *prison; - struct dm_deferred_set *all_io_ds; + struct dm_bio_prison_v2 *prison; mempool_t *migration_pool; @@ -330,12 +475,17 @@ struct cache { struct list_head invalidation_requests; struct io_tracker origin_tracker; + + struct work_struct commit_ws; + struct batcher committer; + + struct rw_semaphore background_work_lock; }; struct per_bio_data { bool tick:1; unsigned req_nr:2; - struct dm_deferred_entry *all_io_entry; + struct dm_bio_prison_cell_v2 *cell; struct dm_hook_info hook_info; sector_t len; @@ -350,55 +500,64 @@ struct per_bio_data { }; struct dm_cache_migration { - struct list_head list; + struct continuation k; struct cache *cache; - unsigned long start_jiffies; - 
dm_oblock_t old_oblock; - dm_oblock_t new_oblock; - dm_cblock_t cblock; - - bool err:1; - bool discard:1; - bool writeback:1; - bool demote:1; - bool promote:1; - bool requeue_holder:1; - bool invalidate:1; + struct policy_work *op; + struct bio *overwrite_bio; + struct dm_bio_prison_cell_v2 *cell; - struct dm_bio_prison_cell *old_ocell; - struct dm_bio_prison_cell *new_ocell; + dm_cblock_t invalidate_cblock; + dm_oblock_t invalidate_oblock; }; -/* - * Processing a bio in the worker thread may require these memory - * allocations. We prealloc to avoid deadlocks (the same worker thread - * frees them back to the mempool). - */ -struct prealloc { - struct dm_cache_migration *mg; - struct dm_bio_prison_cell *cell1; - struct dm_bio_prison_cell *cell2; -}; +/*----------------------------------------------------------------*/ + +static bool writethrough_mode(struct cache_features *f) +{ + return f->io_mode == CM_IO_WRITETHROUGH; +} -static enum cache_metadata_mode get_cache_mode(struct cache *cache); +static bool writeback_mode(struct cache_features *f) +{ + return f->io_mode == CM_IO_WRITEBACK; +} -static void wake_worker(struct cache *cache) +static inline bool passthrough_mode(struct cache_features *f) { - queue_work(cache->wq, &cache->worker); + return unlikely(f->io_mode == CM_IO_PASSTHROUGH); } /*----------------------------------------------------------------*/ -static struct dm_bio_prison_cell *alloc_prison_cell(struct cache *cache) +static void wake_deferred_bio_worker(struct cache *cache) { - /* FIXME: change to use a local slab. 
*/ - return dm_bio_prison_alloc_cell(cache->prison, GFP_NOWAIT); + queue_work(cache->wq, &cache->deferred_bio_worker); } -static void free_prison_cell(struct cache *cache, struct dm_bio_prison_cell *cell) +static void wake_deferred_writethrough_worker(struct cache *cache) { - dm_bio_prison_free_cell(cache->prison, cell); + queue_work(cache->wq, &cache->deferred_writethrough_worker); +} + +static void wake_migration_worker(struct cache *cache) +{ + if (passthrough_mode(&cache->features)) + return; + + queue_work(cache->wq, &cache->migration_worker); +} + +/*----------------------------------------------------------------*/ + +static struct dm_bio_prison_cell_v2 *alloc_prison_cell(struct cache *cache) +{ + return dm_bio_prison_alloc_cell_v2(cache->prison, GFP_NOWAIT); +} + +static void free_prison_cell(struct cache *cache, struct dm_bio_prison_cell_v2 *cell) +{ + dm_bio_prison_free_cell_v2(cache->prison, cell); } static struct dm_cache_migration *alloc_migration(struct cache *cache) @@ -424,146 +583,127 @@ static void free_migration(struct dm_cache_migration *mg) mempool_free(mg, cache->migration_pool); } -static int prealloc_data_structs(struct cache *cache, struct prealloc *p) -{ - if (!p->mg) { - p->mg = alloc_migration(cache); - if (!p->mg) - return -ENOMEM; - } - - if (!p->cell1) { - p->cell1 = alloc_prison_cell(cache); - if (!p->cell1) - return -ENOMEM; - } - - if (!p->cell2) { - p->cell2 = alloc_prison_cell(cache); - if (!p->cell2) - return -ENOMEM; - } - - return 0; -} +/*----------------------------------------------------------------*/ -static void prealloc_free_structs(struct cache *cache, struct prealloc *p) +static inline dm_oblock_t oblock_succ(dm_oblock_t b) { - if (p->cell2) - free_prison_cell(cache, p->cell2); - - if (p->cell1) - free_prison_cell(cache, p->cell1); - - if (p->mg) - free_migration(p->mg); + return to_oblock(from_oblock(b) + 1ull); } -static struct dm_cache_migration *prealloc_get_migration(struct prealloc *p) +static void 
build_key(dm_oblock_t begin, dm_oblock_t end, struct dm_cell_key_v2 *key) { - struct dm_cache_migration *mg = p->mg; - - BUG_ON(!mg); - p->mg = NULL; - - return mg; + key->virtual = 0; + key->dev = 0; + key->block_begin = from_oblock(begin); + key->block_end = from_oblock(end); } /* - * You must have a cell within the prealloc struct to return. If not this - * function will BUG() rather than returning NULL. + * We have two lock levels. Level 0, which is used to prevent WRITEs, and + * level 1 which prevents *both* READs and WRITEs. */ -static struct dm_bio_prison_cell *prealloc_get_cell(struct prealloc *p) +#define WRITE_LOCK_LEVEL 0 +#define READ_WRITE_LOCK_LEVEL 1 + +static unsigned lock_level(struct bio *bio) { - struct dm_bio_prison_cell *r = NULL; + return bio_data_dir(bio) == WRITE ? + WRITE_LOCK_LEVEL : + READ_WRITE_LOCK_LEVEL; +} - if (p->cell1) { - r = p->cell1; - p->cell1 = NULL; +/*---------------------------------------------------------------- + * Per bio data + *--------------------------------------------------------------*/ - } else if (p->cell2) { - r = p->cell2; - p->cell2 = NULL; - } else - BUG(); +/* + * If using writeback, leave out struct per_bio_data's writethrough fields. + */ +#define PB_DATA_SIZE_WB (offsetof(struct per_bio_data, cache)) +#define PB_DATA_SIZE_WT (sizeof(struct per_bio_data)) - return r; +static size_t get_per_bio_data_size(struct cache *cache) +{ + return writethrough_mode(&cache->features) ? PB_DATA_SIZE_WT : PB_DATA_SIZE_WB; } -/* - * You can't have more than two cells in a prealloc struct. BUG() will be - * called if you try and overfill. 
- */ -static void prealloc_put_cell(struct prealloc *p, struct dm_bio_prison_cell *cell) +static struct per_bio_data *get_per_bio_data(struct bio *bio, size_t data_size) { - if (!p->cell2) - p->cell2 = cell; + struct per_bio_data *pb = dm_per_bio_data(bio, data_size); + BUG_ON(!pb); + return pb; +} - else if (!p->cell1) - p->cell1 = cell; +static struct per_bio_data *init_per_bio_data(struct bio *bio, size_t data_size) +{ + struct per_bio_data *pb = get_per_bio_data(bio, data_size); - else - BUG(); + pb->tick = false; + pb->req_nr = dm_bio_get_target_bio_nr(bio); + pb->cell = NULL; + pb->len = 0; + + return pb; } /*----------------------------------------------------------------*/ -static void build_key(dm_oblock_t begin, dm_oblock_t end, struct dm_cell_key *key) +static void defer_bio(struct cache *cache, struct bio *bio) { - key->virtual = 0; - key->dev = 0; - key->block_begin = from_oblock(begin); - key->block_end = from_oblock(end); -} + unsigned long flags; -/* - * The caller hands in a preallocated cell, and a free function for it. - * The cell will be freed if there's an error, or if it wasn't used because - * a cell with that key already exists. 
- */ -typedef void (*cell_free_fn)(void *context, struct dm_bio_prison_cell *cell); + spin_lock_irqsave(&cache->lock, flags); + bio_list_add(&cache->deferred_bios, bio); + spin_unlock_irqrestore(&cache->lock, flags); -static int bio_detain_range(struct cache *cache, dm_oblock_t oblock_begin, dm_oblock_t oblock_end, - struct bio *bio, struct dm_bio_prison_cell *cell_prealloc, - cell_free_fn free_fn, void *free_context, - struct dm_bio_prison_cell **cell_result) + wake_deferred_bio_worker(cache); +} + +static void defer_bios(struct cache *cache, struct bio_list *bios) { - int r; - struct dm_cell_key key; + unsigned long flags; - build_key(oblock_begin, oblock_end, &key); - r = dm_bio_detain(cache->prison, &key, bio, cell_prealloc, cell_result); - if (r) - free_fn(free_context, cell_prealloc); + spin_lock_irqsave(&cache->lock, flags); + bio_list_merge(&cache->deferred_bios, bios); + bio_list_init(bios); + spin_unlock_irqrestore(&cache->lock, flags); - return r; + wake_deferred_bio_worker(cache); } -static int bio_detain(struct cache *cache, dm_oblock_t oblock, - struct bio *bio, struct dm_bio_prison_cell *cell_prealloc, - cell_free_fn free_fn, void *free_context, - struct dm_bio_prison_cell **cell_result) +/*----------------------------------------------------------------*/ + +static bool bio_detain_shared(struct cache *cache, dm_oblock_t oblock, struct bio *bio) { + bool r; + size_t pb_size; + struct per_bio_data *pb; + struct dm_cell_key_v2 key; dm_oblock_t end = to_oblock(from_oblock(oblock) + 1ULL); - return bio_detain_range(cache, oblock, end, bio, - cell_prealloc, free_fn, free_context, cell_result); -} + struct dm_bio_prison_cell_v2 *cell_prealloc, *cell; -static int get_cell(struct cache *cache, - dm_oblock_t oblock, - struct prealloc *structs, - struct dm_bio_prison_cell **cell_result) -{ - int r; - struct dm_cell_key key; - struct dm_bio_prison_cell *cell_prealloc; + cell_prealloc = alloc_prison_cell(cache); /* FIXME: allow wait if calling from worker */ + 
if (!cell_prealloc) { + defer_bio(cache, bio); + return false; + } - cell_prealloc = prealloc_get_cell(structs); + build_key(oblock, end, &key); + r = dm_cell_get_v2(cache->prison, &key, lock_level(bio), bio, cell_prealloc, &cell); + if (!r) { + /* + * Failed to get the lock. + */ + free_prison_cell(cache, cell_prealloc); + return r; + } - build_key(oblock, to_oblock(from_oblock(oblock) + 1ULL), &key); - r = dm_get_cell(cache->prison, &key, cell_prealloc, cell_result); - if (r) - prealloc_put_cell(structs, cell_prealloc); + if (cell != cell_prealloc) + free_prison_cell(cache, cell_prealloc); + + pb_size = get_per_bio_data_size(cache); + pb = get_per_bio_data(bio, pb_size); + pb->cell = cell; return r; } @@ -575,21 +715,33 @@ static bool is_dirty(struct cache *cache, dm_cblock_t b) return test_bit(from_cblock(b), cache->dirty_bitset); } -static void set_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock) +static void set_dirty(struct cache *cache, dm_cblock_t cblock) { if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset)) { atomic_inc(&cache->nr_dirty); - policy_set_dirty(cache->policy, oblock); + policy_set_dirty(cache->policy, cblock); } } -static void clear_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock) +/* + * These two are called when setting after migrations to force the policy + * and dirty bitset to be in sync. 
+ */ +static void force_set_dirty(struct cache *cache, dm_cblock_t cblock) +{ + if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset)) + atomic_inc(&cache->nr_dirty); + policy_set_dirty(cache->policy, cblock); +} + +static void force_clear_dirty(struct cache *cache, dm_cblock_t cblock) { if (test_and_clear_bit(from_cblock(cblock), cache->dirty_bitset)) { - policy_clear_dirty(cache->policy, oblock); if (atomic_dec_return(&cache->nr_dirty) == 0) dm_table_event(cache->ti->table); } + + policy_clear_dirty(cache->policy, cblock); } /*----------------------------------------------------------------*/ @@ -628,11 +780,6 @@ static dm_dblock_t oblock_to_dblock(struct cache *cache, dm_oblock_t oblock) oblocks_per_dblock(cache))); } -static dm_oblock_t dblock_to_oblock(struct cache *cache, dm_dblock_t dblock) -{ - return to_oblock(from_dblock(dblock) * oblocks_per_dblock(cache)); -} - static void set_discard(struct cache *cache, dm_dblock_t b) { unsigned long flags; @@ -679,83 +826,6 @@ static bool is_discarded_oblock(struct cache *cache, dm_oblock_t b) return r; } -/*----------------------------------------------------------------*/ - -static void load_stats(struct cache *cache) -{ - struct dm_cache_statistics stats; - - dm_cache_metadata_get_stats(cache->cmd, &stats); - atomic_set(&cache->stats.read_hit, stats.read_hits); - atomic_set(&cache->stats.read_miss, stats.read_misses); - atomic_set(&cache->stats.write_hit, stats.write_hits); - atomic_set(&cache->stats.write_miss, stats.write_misses); -} - -static void save_stats(struct cache *cache) -{ - struct dm_cache_statistics stats; - - if (get_cache_mode(cache) >= CM_READ_ONLY) - return; - - stats.read_hits = atomic_read(&cache->stats.read_hit); - stats.read_misses = atomic_read(&cache->stats.read_miss); - stats.write_hits = atomic_read(&cache->stats.write_hit); - stats.write_misses = atomic_read(&cache->stats.write_miss); - - dm_cache_metadata_set_stats(cache->cmd, &stats); -} - 
-/*---------------------------------------------------------------- - * Per bio data - *--------------------------------------------------------------*/ - -/* - * If using writeback, leave out struct per_bio_data's writethrough fields. - */ -#define PB_DATA_SIZE_WB (offsetof(struct per_bio_data, cache)) -#define PB_DATA_SIZE_WT (sizeof(struct per_bio_data)) - -static bool writethrough_mode(struct cache_features *f) -{ - return f->io_mode == CM_IO_WRITETHROUGH; -} - -static bool writeback_mode(struct cache_features *f) -{ - return f->io_mode == CM_IO_WRITEBACK; -} - -static bool passthrough_mode(struct cache_features *f) -{ - return f->io_mode == CM_IO_PASSTHROUGH; -} - -static size_t get_per_bio_data_size(struct cache *cache) -{ - return writethrough_mode(&cache->features) ? PB_DATA_SIZE_WT : PB_DATA_SIZE_WB; -} - -static struct per_bio_data *get_per_bio_data(struct bio *bio, size_t data_size) -{ - struct per_bio_data *pb = dm_per_bio_data(bio, data_size); - BUG_ON(!pb); - return pb; -} - -static struct per_bio_data *init_per_bio_data(struct bio *bio, size_t data_size) -{ - struct per_bio_data *pb = get_per_bio_data(bio, data_size); - - pb->tick = false; - pb->req_nr = dm_bio_get_target_bio_nr(bio); - pb->all_io_entry = NULL; - pb->len = 0; - - return pb; -} - /*---------------------------------------------------------------- * Remapping *--------------------------------------------------------------*/ @@ -797,8 +867,9 @@ static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio) } static void remap_to_origin_clear_discard(struct cache *cache, struct bio *bio, - dm_oblock_t oblock) + dm_oblock_t oblock) { + // FIXME: this is called way too much. 
check_if_tick_bio_needed(cache, bio); remap_to_origin(cache, bio); if (bio_data_dir(bio) == WRITE) @@ -811,7 +882,7 @@ static void remap_to_cache_dirty(struct cache *cache, struct bio *bio, check_if_tick_bio_needed(cache, bio); remap_to_cache(cache, bio, cblock); if (bio_data_dir(bio) == WRITE) { - set_dirty(cache, oblock, cblock); + set_dirty(cache, cblock); clear_discard(cache, oblock_to_dblock(cache, oblock)); } } @@ -828,22 +899,6 @@ static dm_oblock_t get_bio_block(struct cache *cache, struct bio *bio) return to_oblock(block_nr); } -/* - * You must increment the deferred set whilst the prison cell is held. To - * encourage this, we ask for 'cell' to be passed in. - */ -static void inc_ds(struct cache *cache, struct bio *bio, - struct dm_bio_prison_cell *cell) -{ - size_t pb_data_size = get_per_bio_data_size(cache); - struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); - - BUG_ON(!cell); - BUG_ON(pb->all_io_entry); - - pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds); -} - static bool accountable_bio(struct cache *cache, struct bio *bio) { return ((bio->bi_bdev == cache->origin_dev->bdev) && @@ -875,29 +930,10 @@ static void accounted_request(struct cache *cache, struct bio *bio) generic_make_request(bio); } -static void issue(struct cache *cache, struct bio *bio) +static void issue_op(struct bio *bio, void *context) { - unsigned long flags; - - if (!op_is_flush(bio->bi_opf)) { - accounted_request(cache, bio); - return; - } - - /* - * Batch together any bios that trigger commits and then issue a - * single commit for them in do_worker(). 
- */ - spin_lock_irqsave(&cache->lock, flags); - cache->commit_requested = true; - bio_list_add(&cache->deferred_flush_bios, bio); - spin_unlock_irqrestore(&cache->lock, flags); -} - -static void inc_and_issue(struct cache *cache, struct bio *bio, struct dm_bio_prison_cell *cell) -{ - inc_ds(cache, bio, cell); - issue(cache, bio); + struct cache *cache = context; + accounted_request(cache, bio); } static void defer_writethrough_bio(struct cache *cache, struct bio *bio) @@ -908,7 +944,7 @@ static void defer_writethrough_bio(struct cache *cache, struct bio *bio) bio_list_add(&cache->deferred_writethrough_bios, bio); spin_unlock_irqrestore(&cache->lock, flags); - wake_worker(cache); + wake_deferred_writethrough_worker(cache); } static void writethrough_endio(struct bio *bio) @@ -934,6 +970,7 @@ static void writethrough_endio(struct bio *bio) } /* + * FIXME: send in parallel, huge latency as is. * When running in writethrough mode we need to send writes to clean blocks * to both the cache and origin devices. 
In future we'd like to clone the * bio and send them in parallel, but for now we're doing them in @@ -1046,12 +1083,58 @@ static void metadata_operation_failed(struct cache *cache, const char *op, int r set_cache_mode(cache, CM_READ_ONLY); } +/*----------------------------------------------------------------*/ + +static void load_stats(struct cache *cache) +{ + struct dm_cache_statistics stats; + + dm_cache_metadata_get_stats(cache->cmd, &stats); + atomic_set(&cache->stats.read_hit, stats.read_hits); + atomic_set(&cache->stats.read_miss, stats.read_misses); + atomic_set(&cache->stats.write_hit, stats.write_hits); + atomic_set(&cache->stats.write_miss, stats.write_misses); +} + +static void save_stats(struct cache *cache) +{ + struct dm_cache_statistics stats; + + if (get_cache_mode(cache) >= CM_READ_ONLY) + return; + + stats.read_hits = atomic_read(&cache->stats.read_hit); + stats.read_misses = atomic_read(&cache->stats.read_miss); + stats.write_hits = atomic_read(&cache->stats.write_hit); + stats.write_misses = atomic_read(&cache->stats.write_miss); + + dm_cache_metadata_set_stats(cache->cmd, &stats); +} + +static void update_stats(struct cache_stats *stats, enum policy_operation op) +{ + switch (op) { + case POLICY_PROMOTE: + atomic_inc(&stats->promotion); + break; + + case POLICY_DEMOTE: + atomic_inc(&stats->demotion); + break; + + case POLICY_WRITEBACK: + atomic_inc(&stats->writeback); + break; + } +} + /*---------------------------------------------------------------- * Migration processing * * Migration covers moving data from the origin device to the cache, or * vice versa. 
*--------------------------------------------------------------*/ + static void inc_io_migrations(struct cache *cache) { atomic_inc(&cache->nr_io_migrations); @@ -1067,213 +1150,109 @@ static bool discard_or_flush(struct bio *bio) return bio_op(bio) == REQ_OP_DISCARD || op_is_flush(bio->bi_opf); } -static void __cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell) -{ - if (discard_or_flush(cell->holder)) { - /* - * We have to handle these bios individually. - */ - dm_cell_release(cache->prison, cell, &cache->deferred_bios); - free_prison_cell(cache, cell); - } else - list_add_tail(&cell->user_list, &cache->deferred_cells); -} - -static void cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell, bool holder) +static void calc_discard_block_range(struct cache *cache, struct bio *bio, + dm_dblock_t *b, dm_dblock_t *e) { - unsigned long flags; - - if (!holder && dm_cell_promote_or_release(cache->prison, cell)) { - /* - * There was no prisoner to promote to holder, the - * cell has been released. 
- */ - free_prison_cell(cache, cell); - return; - } + sector_t sb = bio->bi_iter.bi_sector; + sector_t se = bio_end_sector(bio); - spin_lock_irqsave(&cache->lock, flags); - __cell_defer(cache, cell); - spin_unlock_irqrestore(&cache->lock, flags); + *b = to_dblock(dm_sector_div_up(sb, cache->discard_block_size)); - wake_worker(cache); + if (se - sb < cache->discard_block_size) + *e = *b; + else + *e = to_dblock(block_div(se, cache->discard_block_size)); } -static void cell_error_with_code(struct cache *cache, struct dm_bio_prison_cell *cell, int err) -{ - dm_cell_error(cache->prison, cell, err); - free_prison_cell(cache, cell); -} +/*----------------------------------------------------------------*/ -static void cell_requeue(struct cache *cache, struct dm_bio_prison_cell *cell) +static void prevent_background_work(struct cache *cache) { - cell_error_with_code(cache, cell, DM_ENDIO_REQUEUE); + lockdep_off(); + down_write(&cache->background_work_lock); + lockdep_on(); } -static void free_io_migration(struct dm_cache_migration *mg) +static void allow_background_work(struct cache *cache) { - struct cache *cache = mg->cache; - - dec_io_migrations(cache); - free_migration(mg); - wake_worker(cache); + lockdep_off(); + up_write(&cache->background_work_lock); + lockdep_on(); } -static void migration_failure(struct dm_cache_migration *mg) +static bool background_work_begin(struct cache *cache) { - struct cache *cache = mg->cache; - const char *dev_name = cache_device_name(cache); - - if (mg->writeback) { - DMERR_LIMIT("%s: writeback failed; couldn't copy block", dev_name); - set_dirty(cache, mg->old_oblock, mg->cblock); - cell_defer(cache, mg->old_ocell, false); - - } else if (mg->demote) { - DMERR_LIMIT("%s: demotion failed; couldn't copy block", dev_name); - policy_force_mapping(cache->policy, mg->new_oblock, mg->old_oblock); + bool r; - cell_defer(cache, mg->old_ocell, mg->promote ? 
false : true); - if (mg->promote) - cell_defer(cache, mg->new_ocell, true); - } else { - DMERR_LIMIT("%s: promotion failed; couldn't copy block", dev_name); - policy_remove_mapping(cache->policy, mg->new_oblock); - cell_defer(cache, mg->new_ocell, true); - } + lockdep_off(); + r = down_read_trylock(&cache->background_work_lock); + lockdep_on(); - free_io_migration(mg); + return r; } -static void migration_success_pre_commit(struct dm_cache_migration *mg) +static void background_work_end(struct cache *cache) { - int r; - unsigned long flags; - struct cache *cache = mg->cache; - - if (mg->writeback) { - clear_dirty(cache, mg->old_oblock, mg->cblock); - cell_defer(cache, mg->old_ocell, false); - free_io_migration(mg); - return; + lockdep_off(); + up_read(&cache->background_work_lock); + lockdep_on(); +} - } else if (mg->demote) { - r = dm_cache_remove_mapping(cache->cmd, mg->cblock); - if (r) { - DMERR_LIMIT("%s: demotion failed; couldn't update on disk metadata", - cache_device_name(cache)); - metadata_operation_failed(cache, "dm_cache_remove_mapping", r); - policy_force_mapping(cache->policy, mg->new_oblock, - mg->old_oblock); - if (mg->promote) - cell_defer(cache, mg->new_ocell, true); - free_io_migration(mg); - return; - } - } else { - r = dm_cache_insert_mapping(cache->cmd, mg->cblock, mg->new_oblock); - if (r) { - DMERR_LIMIT("%s: promotion failed; couldn't update on disk metadata", - cache_device_name(cache)); - metadata_operation_failed(cache, "dm_cache_insert_mapping", r); - policy_remove_mapping(cache->policy, mg->new_oblock); - free_io_migration(mg); - return; - } - } +/*----------------------------------------------------------------*/ - spin_lock_irqsave(&cache->lock, flags); - list_add_tail(&mg->list, &cache->need_commit_migrations); - cache->commit_requested = true; - spin_unlock_irqrestore(&cache->lock, flags); +static void quiesce(struct dm_cache_migration *mg, + void (*continuation)(struct work_struct *)) +{ + init_continuation(&mg->k, continuation); 
+ dm_cell_quiesce_v2(mg->cache->prison, mg->cell, &mg->k.ws); } -static void migration_success_post_commit(struct dm_cache_migration *mg) +static struct dm_cache_migration *ws_to_mg(struct work_struct *ws) { - unsigned long flags; - struct cache *cache = mg->cache; - - if (mg->writeback) { - DMWARN_LIMIT("%s: writeback unexpectedly triggered commit", - cache_device_name(cache)); - return; - - } else if (mg->demote) { - cell_defer(cache, mg->old_ocell, mg->promote ? false : true); - - if (mg->promote) { - mg->demote = false; - - spin_lock_irqsave(&cache->lock, flags); - list_add_tail(&mg->list, &cache->quiesced_migrations); - spin_unlock_irqrestore(&cache->lock, flags); - - } else { - if (mg->invalidate) - policy_remove_mapping(cache->policy, mg->old_oblock); - free_io_migration(mg); - } - - } else { - if (mg->requeue_holder) { - clear_dirty(cache, mg->new_oblock, mg->cblock); - cell_defer(cache, mg->new_ocell, true); - } else { - /* - * The block was promoted via an overwrite, so it's dirty. 
- */ - set_dirty(cache, mg->new_oblock, mg->cblock); - bio_endio(mg->new_ocell->holder); - cell_defer(cache, mg->new_ocell, false); - } - free_io_migration(mg); - } + struct continuation *k = container_of(ws, struct continuation, ws); + return container_of(k, struct dm_cache_migration, k); } static void copy_complete(int read_err, unsigned long write_err, void *context) { - unsigned long flags; - struct dm_cache_migration *mg = (struct dm_cache_migration *) context; - struct cache *cache = mg->cache; + struct dm_cache_migration *mg = container_of(context, struct dm_cache_migration, k); if (read_err || write_err) - mg->err = true; - - spin_lock_irqsave(&cache->lock, flags); - list_add_tail(&mg->list, &cache->completed_migrations); - spin_unlock_irqrestore(&cache->lock, flags); + mg->k.input = -EIO; - wake_worker(cache); + queue_continuation(mg->cache->wq, &mg->k); } -static void issue_copy(struct dm_cache_migration *mg) +static int copy(struct dm_cache_migration *mg, bool promote) { int r; struct dm_io_region o_region, c_region; struct cache *cache = mg->cache; - sector_t cblock = from_cblock(mg->cblock); o_region.bdev = cache->origin_dev->bdev; + o_region.sector = from_oblock(mg->op->oblock) * cache->sectors_per_block; o_region.count = cache->sectors_per_block; c_region.bdev = cache->cache_dev->bdev; - c_region.sector = cblock * cache->sectors_per_block; + c_region.sector = from_cblock(mg->op->cblock) * cache->sectors_per_block; c_region.count = cache->sectors_per_block; - if (mg->writeback || mg->demote) { - /* demote */ - o_region.sector = from_oblock(mg->old_oblock) * cache->sectors_per_block; - r = dm_kcopyd_copy(cache->copier, &c_region, 1, &o_region, 0, copy_complete, mg); - } else { - /* promote */ - o_region.sector = from_oblock(mg->new_oblock) * cache->sectors_per_block; - r = dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, mg); - } + if (promote) + r = dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, 
&mg->k); + else + r = dm_kcopyd_copy(cache->copier, &c_region, 1, &o_region, 0, copy_complete, &mg->k); - if (r < 0) { - DMERR_LIMIT("%s: issuing migration failed", cache_device_name(cache)); - migration_failure(mg); - } + return r; +} + +static void bio_drop_shared_lock(struct cache *cache, struct bio *bio) +{ + size_t pb_data_size = get_per_bio_data_size(cache); + struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); + + if (pb->cell && dm_cell_put_v2(cache->prison, pb->cell)) + free_prison_cell(cache, pb->cell); + pb->cell = NULL; } static void overwrite_endio(struct bio *bio) @@ -1282,368 +1261,475 @@ static void overwrite_endio(struct bio *bio) struct cache *cache = mg->cache; size_t pb_data_size = get_per_bio_data_size(cache); struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); - unsigned long flags; dm_unhook_bio(&pb->hook_info, bio); if (bio->bi_error) - mg->err = true; + mg->k.input = bio->bi_error; - mg->requeue_holder = false; - - spin_lock_irqsave(&cache->lock, flags); - list_add_tail(&mg->list, &cache->completed_migrations); - spin_unlock_irqrestore(&cache->lock, flags); - - wake_worker(cache); + queue_continuation(mg->cache->wq, &mg->k); } -static void issue_overwrite(struct dm_cache_migration *mg, struct bio *bio) +static void overwrite(struct dm_cache_migration *mg, + void (*continuation)(struct work_struct *)) { + struct bio *bio = mg->overwrite_bio; size_t pb_data_size = get_per_bio_data_size(mg->cache); struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); dm_hook_bio(&pb->hook_info, bio, overwrite_endio, mg); - remap_to_cache_dirty(mg->cache, bio, mg->new_oblock, mg->cblock); /* - * No need to inc_ds() here, since the cell will be held for the - * duration of the io. + * The overwrite bio is part of the copy operation, as such it does + * not set/clear discard or dirty flags. 
*/ + if (mg->op->op == POLICY_PROMOTE) + remap_to_cache(mg->cache, bio, mg->op->cblock); + else + remap_to_origin(mg->cache, bio); + + init_continuation(&mg->k, continuation); accounted_request(mg->cache, bio); } -static bool bio_writes_complete_block(struct cache *cache, struct bio *bio) +/* + * Migration steps: + * + * 1) exclusive lock preventing WRITEs + * 2) quiesce + * 3) copy or issue overwrite bio + * 4) upgrade to exclusive lock preventing READs and WRITEs + * 5) quiesce + * 6) update metadata and commit + * 7) unlock + */ +static void mg_complete(struct dm_cache_migration *mg, bool success) { - return (bio_data_dir(bio) == WRITE) && - (bio->bi_iter.bi_size == (cache->sectors_per_block << SECTOR_SHIFT)); -} + struct bio_list bios; + struct cache *cache = mg->cache; + struct policy_work *op = mg->op; + dm_cblock_t cblock = op->cblock; + + if (success) + update_stats(&cache->stats, op->op); + + switch (op->op) { + case POLICY_PROMOTE: + clear_discard(cache, oblock_to_dblock(cache, op->oblock)); + policy_complete_background_work(cache->policy, op, success); + + if (mg->overwrite_bio) { + if (success) + force_set_dirty(cache, cblock); + else + mg->overwrite_bio->bi_error = (mg->k.input ? : -EIO); + bio_endio(mg->overwrite_bio); + } else { + if (success) + force_clear_dirty(cache, cblock); + dec_io_migrations(cache); + } + break; -static void avoid_copy(struct dm_cache_migration *mg) -{ - atomic_inc(&mg->cache->stats.copies_avoided); - migration_success_pre_commit(mg); -} + case POLICY_DEMOTE: + /* + * We clear dirty here to update the nr_dirty counter. 
+ */ + if (success) + force_clear_dirty(cache, cblock); + policy_complete_background_work(cache->policy, op, success); + dec_io_migrations(cache); + break; -static void calc_discard_block_range(struct cache *cache, struct bio *bio, - dm_dblock_t *b, dm_dblock_t *e) -{ - sector_t sb = bio->bi_iter.bi_sector; - sector_t se = bio_end_sector(bio); + case POLICY_WRITEBACK: + if (success) + force_clear_dirty(cache, cblock); + policy_complete_background_work(cache->policy, op, success); + dec_io_migrations(cache); + break; + } - *b = to_dblock(dm_sector_div_up(sb, cache->discard_block_size)); + bio_list_init(&bios); + if (mg->cell) { + if (dm_cell_unlock_v2(cache->prison, mg->cell, &bios)) + free_prison_cell(cache, mg->cell); + } - if (se - sb < cache->discard_block_size) - *e = *b; - else - *e = to_dblock(block_div(se, cache->discard_block_size)); + free_migration(mg); + defer_bios(cache, &bios); + wake_migration_worker(cache); + + background_work_end(cache); } -static void issue_discard(struct dm_cache_migration *mg) +static void mg_success(struct work_struct *ws) { - dm_dblock_t b, e; - struct bio *bio = mg->new_ocell->holder; - struct cache *cache = mg->cache; - - calc_discard_block_range(cache, bio, &b, &e); - while (b != e) { - set_discard(cache, b); - b = to_dblock(from_dblock(b) + 1); - } - - bio_endio(bio); - cell_defer(cache, mg->new_ocell, false); - free_migration(mg); - wake_worker(cache); + struct dm_cache_migration *mg = ws_to_mg(ws); + mg_complete(mg, mg->k.input == 0); } -static void issue_copy_or_discard(struct dm_cache_migration *mg) +static void mg_update_metadata(struct work_struct *ws) { - bool avoid; + int r; + struct dm_cache_migration *mg = ws_to_mg(ws); struct cache *cache = mg->cache; + struct policy_work *op = mg->op; - if (mg->discard) { - issue_discard(mg); - return; - } + switch (op->op) { + case POLICY_PROMOTE: + r = dm_cache_insert_mapping(cache->cmd, op->cblock, op->oblock); + if (r) { + DMERR_LIMIT("%s: migration failed; couldn't insert 
mapping", + cache_device_name(cache)); + metadata_operation_failed(cache, "dm_cache_insert_mapping", r); - if (mg->writeback || mg->demote) - avoid = !is_dirty(cache, mg->cblock) || - is_discarded_oblock(cache, mg->old_oblock); - else { - struct bio *bio = mg->new_ocell->holder; + mg_complete(mg, false); + return; + } + mg_complete(mg, true); + break; - avoid = is_discarded_oblock(cache, mg->new_oblock); + case POLICY_DEMOTE: + r = dm_cache_remove_mapping(cache->cmd, op->cblock); + if (r) { + DMERR_LIMIT("%s: migration failed; couldn't update on disk metadata", + cache_device_name(cache)); + metadata_operation_failed(cache, "dm_cache_remove_mapping", r); - if (writeback_mode(&cache->features) && - !avoid && bio_writes_complete_block(cache, bio)) { - issue_overwrite(mg, bio); + mg_complete(mg, false); return; } - } - avoid ? avoid_copy(mg) : issue_copy(mg); + /* + * It would be nice if we only had to commit when a REQ_FLUSH + * comes through. But there's one scenario that we have to + * look out for: + * + * - vblock x in a cache block + * - demotion occurs + * - cache block gets reallocated and overwritten + * - crash + * + * When we recover, because there was no commit the cache will + * rollback to having the data for vblock x in the cache block. + * But the cache block has since been overwritten, so it'll end + * up pointing to data that was never in 'x' during the history + * of the device. + * + * To avoid this issue we require a commit as part of the + * demotion operation. + */ + init_continuation(&mg->k, mg_success); + continue_after_commit(&cache->committer, &mg->k); + schedule_commit(&cache->committer); + break; + + case POLICY_WRITEBACK: + mg_complete(mg, true); + break; + } } -static void complete_migration(struct dm_cache_migration *mg) +static void mg_update_metadata_after_copy(struct work_struct *ws) { - if (mg->err) - migration_failure(mg); + struct dm_cache_migration *mg = ws_to_mg(ws); + + /* + * Did the copy succeed? 
+ */ + if (mg->k.input) + mg_complete(mg, false); else - migration_success_pre_commit(mg); + mg_update_metadata(ws); } -static void process_migrations(struct cache *cache, struct list_head *head, - void (*fn)(struct dm_cache_migration *)) +static void mg_upgrade_lock(struct work_struct *ws) { - unsigned long flags; - struct list_head list; - struct dm_cache_migration *mg, *tmp; + int r; + struct dm_cache_migration *mg = ws_to_mg(ws); - INIT_LIST_HEAD(&list); - spin_lock_irqsave(&cache->lock, flags); - list_splice_init(head, &list); - spin_unlock_irqrestore(&cache->lock, flags); + /* + * Did the copy succeed? + */ + if (mg->k.input) + mg_complete(mg, false); - list_for_each_entry_safe(mg, tmp, &list, list) - fn(mg); -} + else { + /* + * Now we want the lock to prevent both reads and writes. + */ + r = dm_cell_lock_promote_v2(mg->cache->prison, mg->cell, + READ_WRITE_LOCK_LEVEL); + if (r < 0) + mg_complete(mg, false); -static void __queue_quiesced_migration(struct dm_cache_migration *mg) -{ - list_add_tail(&mg->list, &mg->cache->quiesced_migrations); + else if (r) + quiesce(mg, mg_update_metadata); + + else + mg_update_metadata(ws); + } } -static void queue_quiesced_migration(struct dm_cache_migration *mg) +static void mg_copy(struct work_struct *ws) { - unsigned long flags; - struct cache *cache = mg->cache; + int r; + struct dm_cache_migration *mg = ws_to_mg(ws); - spin_lock_irqsave(&cache->lock, flags); - __queue_quiesced_migration(mg); - spin_unlock_irqrestore(&cache->lock, flags); + if (mg->overwrite_bio) { + /* + * It's safe to do this here, even though it's new data + * because all IO has been locked out of the block. + * + * mg_lock_writes() already took READ_WRITE_LOCK_LEVEL + * so _not_ using mg_upgrade_lock() as continuation. 
+ */ + overwrite(mg, mg_update_metadata_after_copy); - wake_worker(cache); -} + } else { + struct cache *cache = mg->cache; + struct policy_work *op = mg->op; + bool is_policy_promote = (op->op == POLICY_PROMOTE); -static void queue_quiesced_migrations(struct cache *cache, struct list_head *work) -{ - unsigned long flags; - struct dm_cache_migration *mg, *tmp; + if ((!is_policy_promote && !is_dirty(cache, op->cblock)) || + is_discarded_oblock(cache, op->oblock)) { + mg_upgrade_lock(ws); + return; + } - spin_lock_irqsave(&cache->lock, flags); - list_for_each_entry_safe(mg, tmp, work, list) - __queue_quiesced_migration(mg); - spin_unlock_irqrestore(&cache->lock, flags); + init_continuation(&mg->k, mg_upgrade_lock); - wake_worker(cache); + r = copy(mg, is_policy_promote); + if (r) { + DMERR_LIMIT("%s: migration copy failed", cache_device_name(cache)); + mg->k.input = -EIO; + mg_complete(mg, false); + } + } } -static void check_for_quiesced_migrations(struct cache *cache, - struct per_bio_data *pb) +static int mg_lock_writes(struct dm_cache_migration *mg) { - struct list_head work; + int r; + struct dm_cell_key_v2 key; + struct cache *cache = mg->cache; + struct dm_bio_prison_cell_v2 *prealloc; - if (!pb->all_io_entry) - return; + prealloc = alloc_prison_cell(cache); + if (!prealloc) { + DMERR_LIMIT("%s: alloc_prison_cell failed", cache_device_name(cache)); + mg_complete(mg, false); + return -ENOMEM; + } + + /* + * Prevent writes to the block, but allow reads to continue. + * Unless we're using an overwrite bio, in which case we lock + * everything. + */ + build_key(mg->op->oblock, oblock_succ(mg->op->oblock), &key); + r = dm_cell_lock_v2(cache->prison, &key, + mg->overwrite_bio ? 
READ_WRITE_LOCK_LEVEL : WRITE_LOCK_LEVEL, + prealloc, &mg->cell); + if (r < 0) { + free_prison_cell(cache, prealloc); + mg_complete(mg, false); + return r; + } - INIT_LIST_HEAD(&work); - dm_deferred_entry_dec(pb->all_io_entry, &work); + if (mg->cell != prealloc) + free_prison_cell(cache, prealloc); - if (!list_empty(&work)) - queue_quiesced_migrations(cache, &work); -} + if (r == 0) + mg_copy(&mg->k.ws); + else + quiesce(mg, mg_copy); -static void quiesce_migration(struct dm_cache_migration *mg) -{ - if (!dm_deferred_set_add_work(mg->cache->all_io_ds, &mg->list)) - queue_quiesced_migration(mg); + return 0; } -static void promote(struct cache *cache, struct prealloc *structs, - dm_oblock_t oblock, dm_cblock_t cblock, - struct dm_bio_prison_cell *cell) +static int mg_start(struct cache *cache, struct policy_work *op, struct bio *bio) { - struct dm_cache_migration *mg = prealloc_get_migration(structs); + struct dm_cache_migration *mg; + + if (!background_work_begin(cache)) { + policy_complete_background_work(cache->policy, op, false); + return -EPERM; + } + + mg = alloc_migration(cache); + if (!mg) { + policy_complete_background_work(cache->policy, op, false); + background_work_end(cache); + return -ENOMEM; + } + + memset(mg, 0, sizeof(*mg)); - mg->err = false; - mg->discard = false; - mg->writeback = false; - mg->demote = false; - mg->promote = true; - mg->requeue_holder = true; - mg->invalidate = false; mg->cache = cache; - mg->new_oblock = oblock; - mg->cblock = cblock; - mg->old_ocell = NULL; - mg->new_ocell = cell; - mg->start_jiffies = jiffies; + mg->op = op; + mg->overwrite_bio = bio; - inc_io_migrations(cache); - quiesce_migration(mg); + if (!bio) + inc_io_migrations(cache); + + return mg_lock_writes(mg); } -static void writeback(struct cache *cache, struct prealloc *structs, - dm_oblock_t oblock, dm_cblock_t cblock, - struct dm_bio_prison_cell *cell) +/*---------------------------------------------------------------- + * invalidation processing + 
*--------------------------------------------------------------*/ + +static void invalidate_complete(struct dm_cache_migration *mg, bool success) { - struct dm_cache_migration *mg = prealloc_get_migration(structs); + struct bio_list bios; + struct cache *cache = mg->cache; - mg->err = false; - mg->discard = false; - mg->writeback = true; - mg->demote = false; - mg->promote = false; - mg->requeue_holder = true; - mg->invalidate = false; - mg->cache = cache; - mg->old_oblock = oblock; - mg->cblock = cblock; - mg->old_ocell = cell; - mg->new_ocell = NULL; - mg->start_jiffies = jiffies; - - inc_io_migrations(cache); - quiesce_migration(mg); -} - -static void demote_then_promote(struct cache *cache, struct prealloc *structs, - dm_oblock_t old_oblock, dm_oblock_t new_oblock, - dm_cblock_t cblock, - struct dm_bio_prison_cell *old_ocell, - struct dm_bio_prison_cell *new_ocell) -{ - struct dm_cache_migration *mg = prealloc_get_migration(structs); - - mg->err = false; - mg->discard = false; - mg->writeback = false; - mg->demote = true; - mg->promote = true; - mg->requeue_holder = true; - mg->invalidate = false; - mg->cache = cache; - mg->old_oblock = old_oblock; - mg->new_oblock = new_oblock; - mg->cblock = cblock; - mg->old_ocell = old_ocell; - mg->new_ocell = new_ocell; - mg->start_jiffies = jiffies; + bio_list_init(&bios); + if (dm_cell_unlock_v2(cache->prison, mg->cell, &bios)) + free_prison_cell(cache, mg->cell); - inc_io_migrations(cache); - quiesce_migration(mg); -} + if (!success && mg->overwrite_bio) + bio_io_error(mg->overwrite_bio); -/* - * Invalidate a cache entry. No writeback occurs; any changes in the cache - * block are thrown away. 
- */ -static void invalidate(struct cache *cache, struct prealloc *structs, - dm_oblock_t oblock, dm_cblock_t cblock, - struct dm_bio_prison_cell *cell) -{ - struct dm_cache_migration *mg = prealloc_get_migration(structs); - - mg->err = false; - mg->discard = false; - mg->writeback = false; - mg->demote = true; - mg->promote = false; - mg->requeue_holder = true; - mg->invalidate = true; - mg->cache = cache; - mg->old_oblock = oblock; - mg->cblock = cblock; - mg->old_ocell = cell; - mg->new_ocell = NULL; - mg->start_jiffies = jiffies; + free_migration(mg); + defer_bios(cache, &bios); - inc_io_migrations(cache); - quiesce_migration(mg); + background_work_end(cache); } -static void discard(struct cache *cache, struct prealloc *structs, - struct dm_bio_prison_cell *cell) +static void invalidate_completed(struct work_struct *ws) { - struct dm_cache_migration *mg = prealloc_get_migration(structs); + struct dm_cache_migration *mg = ws_to_mg(ws); + invalidate_complete(mg, !mg->k.input); +} - mg->err = false; - mg->discard = true; - mg->writeback = false; - mg->demote = false; - mg->promote = false; - mg->requeue_holder = false; - mg->invalidate = false; - mg->cache = cache; - mg->old_ocell = NULL; - mg->new_ocell = cell; - mg->start_jiffies = jiffies; +static int invalidate_cblock(struct cache *cache, dm_cblock_t cblock) +{ + int r = policy_invalidate_mapping(cache->policy, cblock); + if (!r) { + r = dm_cache_remove_mapping(cache->cmd, cblock); + if (r) { + DMERR_LIMIT("%s: invalidation failed; couldn't update on disk metadata", + cache_device_name(cache)); + metadata_operation_failed(cache, "dm_cache_remove_mapping", r); + } + + } else if (r == -ENODATA) { + /* + * Harmless, already unmapped. 
+ */ + r = 0; + + } else + DMERR("%s: policy_invalidate_mapping failed", cache_device_name(cache)); - quiesce_migration(mg); + return r; } -/*---------------------------------------------------------------- - * bio processing - *--------------------------------------------------------------*/ -static void defer_bio(struct cache *cache, struct bio *bio) +static void invalidate_remove(struct work_struct *ws) { - unsigned long flags; + int r; + struct dm_cache_migration *mg = ws_to_mg(ws); + struct cache *cache = mg->cache; - spin_lock_irqsave(&cache->lock, flags); - bio_list_add(&cache->deferred_bios, bio); - spin_unlock_irqrestore(&cache->lock, flags); + r = invalidate_cblock(cache, mg->invalidate_cblock); + if (r) { + invalidate_complete(mg, false); + return; + } - wake_worker(cache); + init_continuation(&mg->k, invalidate_completed); + continue_after_commit(&cache->committer, &mg->k); + remap_to_origin_clear_discard(cache, mg->overwrite_bio, mg->invalidate_oblock); + mg->overwrite_bio = NULL; + schedule_commit(&cache->committer); } -static void process_flush_bio(struct cache *cache, struct bio *bio) +static int invalidate_lock(struct dm_cache_migration *mg) { - size_t pb_data_size = get_per_bio_data_size(cache); - struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); + int r; + struct dm_cell_key_v2 key; + struct cache *cache = mg->cache; + struct dm_bio_prison_cell_v2 *prealloc; - BUG_ON(bio->bi_iter.bi_size); - if (!pb->req_nr) - remap_to_origin(cache, bio); - else - remap_to_cache(cache, bio, 0); + prealloc = alloc_prison_cell(cache); + if (!prealloc) { + invalidate_complete(mg, false); + return -ENOMEM; + } - /* - * REQ_PREFLUSH is not directed at any particular block so we don't - * need to inc_ds(). REQ_FUA's are split into a write + REQ_PREFLUSH - * by dm-core. 
- */ - issue(cache, bio); + build_key(mg->invalidate_oblock, oblock_succ(mg->invalidate_oblock), &key); + r = dm_cell_lock_v2(cache->prison, &key, + READ_WRITE_LOCK_LEVEL, prealloc, &mg->cell); + if (r < 0) { + free_prison_cell(cache, prealloc); + invalidate_complete(mg, false); + return r; + } + + if (mg->cell != prealloc) + free_prison_cell(cache, prealloc); + + if (r) + quiesce(mg, invalidate_remove); + + else { + /* + * We can't call invalidate_remove() directly here because we + * might still be in request context. + */ + init_continuation(&mg->k, invalidate_remove); + queue_work(cache->wq, &mg->k.ws); + } + + return 0; } -static void process_discard_bio(struct cache *cache, struct prealloc *structs, - struct bio *bio) +static int invalidate_start(struct cache *cache, dm_cblock_t cblock, + dm_oblock_t oblock, struct bio *bio) { - int r; - dm_dblock_t b, e; - struct dm_bio_prison_cell *cell_prealloc, *new_ocell; + struct dm_cache_migration *mg; - calc_discard_block_range(cache, bio, &b, &e); - if (b == e) { - bio_endio(bio); - return; + if (!background_work_begin(cache)) + return -EPERM; + + mg = alloc_migration(cache); + if (!mg) { + background_work_end(cache); + return -ENOMEM; } - cell_prealloc = prealloc_get_cell(structs); - r = bio_detain_range(cache, dblock_to_oblock(cache, b), dblock_to_oblock(cache, e), bio, cell_prealloc, - (cell_free_fn) prealloc_put_cell, - structs, &new_ocell); - if (r > 0) - return; + memset(mg, 0, sizeof(*mg)); - discard(cache, structs, new_ocell); + mg->cache = cache; + mg->overwrite_bio = bio; + mg->invalidate_cblock = cblock; + mg->invalidate_oblock = oblock; + + return invalidate_lock(mg); } -static bool spare_migration_bandwidth(struct cache *cache) +/*---------------------------------------------------------------- + * bio processing + *--------------------------------------------------------------*/ + +enum busy { + IDLE, + MODERATE, + BUSY +}; + +static enum busy spare_migration_bandwidth(struct cache *cache) { + bool idle 
= iot_idle_for(&cache->origin_tracker, HZ); sector_t current_volume = (atomic_read(&cache->nr_io_migrations) + 1) * cache->sectors_per_block; - return current_volume < cache->migration_threshold; + + if (current_volume <= cache->migration_threshold) + return idle ? IDLE : MODERATE; + else + return idle ? MODERATE : BUSY; } static void inc_hit_counter(struct cache *cache, struct bio *bio) @@ -1660,255 +1746,143 @@ static void inc_miss_counter(struct cache *cache, struct bio *bio) /*----------------------------------------------------------------*/ -struct inc_detail { - struct cache *cache; - struct bio_list bios_for_issue; - struct bio_list unhandled_bios; - bool any_writes; -}; - -static void inc_fn(void *context, struct dm_bio_prison_cell *cell) +static bool bio_writes_complete_block(struct cache *cache, struct bio *bio) { - struct bio *bio; - struct inc_detail *detail = context; - struct cache *cache = detail->cache; - - inc_ds(cache, cell->holder, cell); - if (bio_data_dir(cell->holder) == WRITE) - detail->any_writes = true; - - while ((bio = bio_list_pop(&cell->bios))) { - if (discard_or_flush(bio)) { - bio_list_add(&detail->unhandled_bios, bio); - continue; - } - - if (bio_data_dir(bio) == WRITE) - detail->any_writes = true; - - bio_list_add(&detail->bios_for_issue, bio); - inc_ds(cache, bio, cell); - } + return (bio_data_dir(bio) == WRITE) && + (bio->bi_iter.bi_size == (cache->sectors_per_block << SECTOR_SHIFT)); } -// FIXME: refactor these two -static void remap_cell_to_origin_clear_discard(struct cache *cache, - struct dm_bio_prison_cell *cell, - dm_oblock_t oblock, bool issue_holder) +static bool optimisable_bio(struct cache *cache, struct bio *bio, dm_oblock_t block) { - struct bio *bio; - unsigned long flags; - struct inc_detail detail; - - detail.cache = cache; - bio_list_init(&detail.bios_for_issue); - bio_list_init(&detail.unhandled_bios); - detail.any_writes = false; - - spin_lock_irqsave(&cache->lock, flags); - dm_cell_visit_release(cache->prison, 
inc_fn, &detail, cell); - bio_list_merge(&cache->deferred_bios, &detail.unhandled_bios); - spin_unlock_irqrestore(&cache->lock, flags); - - remap_to_origin(cache, cell->holder); - if (issue_holder) - issue(cache, cell->holder); - else - accounted_begin(cache, cell->holder); - - if (detail.any_writes) - clear_discard(cache, oblock_to_dblock(cache, oblock)); - - while ((bio = bio_list_pop(&detail.bios_for_issue))) { - remap_to_origin(cache, bio); - issue(cache, bio); - } - - free_prison_cell(cache, cell); + return writeback_mode(&cache->features) && + (is_discarded_oblock(cache, block) || bio_writes_complete_block(cache, bio)); } -static void remap_cell_to_cache_dirty(struct cache *cache, struct dm_bio_prison_cell *cell, - dm_oblock_t oblock, dm_cblock_t cblock, bool issue_holder) +static int map_bio(struct cache *cache, struct bio *bio, dm_oblock_t block, + bool *commit_needed) { - struct bio *bio; - unsigned long flags; - struct inc_detail detail; - - detail.cache = cache; - bio_list_init(&detail.bios_for_issue); - bio_list_init(&detail.unhandled_bios); - detail.any_writes = false; - - spin_lock_irqsave(&cache->lock, flags); - dm_cell_visit_release(cache->prison, inc_fn, &detail, cell); - bio_list_merge(&cache->deferred_bios, &detail.unhandled_bios); - spin_unlock_irqrestore(&cache->lock, flags); + int r, data_dir; + bool rb, background_queued; + dm_cblock_t cblock; + size_t pb_data_size = get_per_bio_data_size(cache); + struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); - remap_to_cache(cache, cell->holder, cblock); - if (issue_holder) - issue(cache, cell->holder); - else - accounted_begin(cache, cell->holder); + *commit_needed = false; - if (detail.any_writes) { - set_dirty(cache, oblock, cblock); - clear_discard(cache, oblock_to_dblock(cache, oblock)); - } - - while ((bio = bio_list_pop(&detail.bios_for_issue))) { - remap_to_cache(cache, bio, cblock); - issue(cache, bio); + rb = bio_detain_shared(cache, block, bio); + if (!rb) { + /* + * An exclusive 
lock is held for this block, so we have to + * wait. We set the commit_needed flag so the current + * transaction will be committed asap, allowing this lock + * to be dropped. + */ + *commit_needed = true; + return DM_MAPIO_SUBMITTED; } - free_prison_cell(cache, cell); -} + data_dir = bio_data_dir(bio); -/*----------------------------------------------------------------*/ - -struct old_oblock_lock { - struct policy_locker locker; - struct cache *cache; - struct prealloc *structs; - struct dm_bio_prison_cell *cell; -}; + if (optimisable_bio(cache, bio, block)) { + struct policy_work *op = NULL; -static int null_locker(struct policy_locker *locker, dm_oblock_t b) -{ - /* This should never be called */ - BUG(); - return 0; -} + r = policy_lookup_with_work(cache->policy, block, &cblock, data_dir, true, &op); + if (unlikely(r && r != -ENOENT)) { + DMERR_LIMIT("%s: policy_lookup_with_work() failed with r = %d", + cache_device_name(cache), r); + bio_io_error(bio); + return DM_MAPIO_SUBMITTED; + } -static int cell_locker(struct policy_locker *locker, dm_oblock_t b) -{ - struct old_oblock_lock *l = container_of(locker, struct old_oblock_lock, locker); - struct dm_bio_prison_cell *cell_prealloc = prealloc_get_cell(l->structs); + if (r == -ENOENT && op) { + bio_drop_shared_lock(cache, bio); + BUG_ON(op->op != POLICY_PROMOTE); + mg_start(cache, op, bio); + return DM_MAPIO_SUBMITTED; + } + } else { + r = policy_lookup(cache->policy, block, &cblock, data_dir, false, &background_queued); + if (unlikely(r && r != -ENOENT)) { + DMERR_LIMIT("%s: policy_lookup() failed with r = %d", + cache_device_name(cache), r); + bio_io_error(bio); + return DM_MAPIO_SUBMITTED; + } - return bio_detain(l->cache, b, NULL, cell_prealloc, - (cell_free_fn) prealloc_put_cell, - l->structs, &l->cell); -} + if (background_queued) + wake_migration_worker(cache); + } -static void process_cell(struct cache *cache, struct prealloc *structs, - struct dm_bio_prison_cell *new_ocell) -{ - int r; - bool 
release_cell = true; - struct bio *bio = new_ocell->holder; - dm_oblock_t block = get_bio_block(cache, bio); - struct policy_result lookup_result; - bool passthrough = passthrough_mode(&cache->features); - bool fast_promotion, can_migrate; - struct old_oblock_lock ool; - - fast_promotion = is_discarded_oblock(cache, block) || bio_writes_complete_block(cache, bio); - can_migrate = !passthrough && (fast_promotion || spare_migration_bandwidth(cache)); - - ool.locker.fn = cell_locker; - ool.cache = cache; - ool.structs = structs; - ool.cell = NULL; - r = policy_map(cache->policy, block, true, can_migrate, fast_promotion, - bio, &ool.locker, &lookup_result); - - if (r == -EWOULDBLOCK) - /* migration has been denied */ - lookup_result.op = POLICY_MISS; - - switch (lookup_result.op) { - case POLICY_HIT: - if (passthrough) { - inc_miss_counter(cache, bio); + if (r == -ENOENT) { + /* + * Miss. + */ + inc_miss_counter(cache, bio); + if (pb->req_nr == 0) { + accounted_begin(cache, bio); + remap_to_origin_clear_discard(cache, bio, block); + } else { /* - * Passthrough always maps to the origin, - * invalidating any cache blocks that are written - * to. + * This is a duplicate writethrough io that is no + * longer needed because the block has been demoted. */ + bio_endio(bio); + return DM_MAPIO_SUBMITTED; + } + } else { + /* + * Hit. + */ + inc_hit_counter(cache, bio); + /* + * Passthrough always maps to the origin, invalidating any + * cache blocks that are written to. 
+ */ + if (passthrough_mode(&cache->features)) { if (bio_data_dir(bio) == WRITE) { + bio_drop_shared_lock(cache, bio); atomic_inc(&cache->stats.demotion); - invalidate(cache, structs, block, lookup_result.cblock, new_ocell); - release_cell = false; - - } else { - /* FIXME: factor out issue_origin() */ + invalidate_start(cache, cblock, block, bio); + } else remap_to_origin_clear_discard(cache, bio, block); - inc_and_issue(cache, bio, new_ocell); - } + } else { - inc_hit_counter(cache, bio); - - if (bio_data_dir(bio) == WRITE && - writethrough_mode(&cache->features) && - !is_dirty(cache, lookup_result.cblock)) { - remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock); - inc_and_issue(cache, bio, new_ocell); - - } else { - remap_cell_to_cache_dirty(cache, new_ocell, block, lookup_result.cblock, true); - release_cell = false; - } + if (bio_data_dir(bio) == WRITE && writethrough_mode(&cache->features) && + !is_dirty(cache, cblock)) { + remap_to_origin_then_cache(cache, bio, block, cblock); + accounted_begin(cache, bio); + } else + remap_to_cache_dirty(cache, bio, block, cblock); } - - break; - - case POLICY_MISS: - inc_miss_counter(cache, bio); - remap_cell_to_origin_clear_discard(cache, new_ocell, block, true); - release_cell = false; - break; - - case POLICY_NEW: - atomic_inc(&cache->stats.promotion); - promote(cache, structs, block, lookup_result.cblock, new_ocell); - release_cell = false; - break; - - case POLICY_REPLACE: - atomic_inc(&cache->stats.demotion); - atomic_inc(&cache->stats.promotion); - demote_then_promote(cache, structs, lookup_result.old_oblock, - block, lookup_result.cblock, - ool.cell, new_ocell); - release_cell = false; - break; - - default: - DMERR_LIMIT("%s: %s: erroring bio, unknown policy op: %u", - cache_device_name(cache), __func__, - (unsigned) lookup_result.op); - bio_io_error(bio); } - if (release_cell) - cell_defer(cache, new_ocell, false); -} - -static void process_bio(struct cache *cache, struct prealloc *structs, - struct 
bio *bio) -{ - int r; - dm_oblock_t block = get_bio_block(cache, bio); - struct dm_bio_prison_cell *cell_prealloc, *new_ocell; - /* - * Check to see if that block is currently migrating. + * dm core turns FUA requests into a separate payload and FLUSH req. */ - cell_prealloc = prealloc_get_cell(structs); - r = bio_detain(cache, block, bio, cell_prealloc, - (cell_free_fn) prealloc_put_cell, - structs, &new_ocell); - if (r > 0) - return; + if (bio->bi_opf & REQ_FUA) { + /* + * issue_after_commit will call accounted_begin a second time. So + * we call accounted_complete() to avoid double accounting. + */ + accounted_complete(cache, bio); + issue_after_commit(&cache->committer, bio); + *commit_needed = true; + return DM_MAPIO_SUBMITTED; + } - process_cell(cache, structs, new_ocell); + return DM_MAPIO_REMAPPED; } -static int need_commit_due_to_time(struct cache *cache) +static bool process_bio(struct cache *cache, struct bio *bio) { - return jiffies < cache->last_commit_jiffies || - jiffies > cache->last_commit_jiffies + COMMIT_PERIOD; + bool commit_needed; + + if (map_bio(cache, bio, get_bio_block(cache, bio), &commit_needed) == DM_MAPIO_REMAPPED) + generic_make_request(bio); + + return commit_needed; } /* @@ -1929,123 +1903,88 @@ static int commit(struct cache *cache, bool clean_shutdown) return r; } -static int commit_if_needed(struct cache *cache) +/* + * Used by the batcher. 
+ */ +static int commit_op(void *context) { - int r = 0; + struct cache *cache = context; - if ((cache->commit_requested || need_commit_due_to_time(cache)) && - dm_cache_changed_this_transaction(cache->cmd)) { - r = commit(cache, false); - cache->commit_requested = false; - cache->last_commit_jiffies = jiffies; - } + if (dm_cache_changed_this_transaction(cache->cmd)) + return commit(cache, false); - return r; + return 0; } -static void process_deferred_bios(struct cache *cache) -{ - bool prealloc_used = false; - unsigned long flags; - struct bio_list bios; - struct bio *bio; - struct prealloc structs; - - memset(&structs, 0, sizeof(structs)); - bio_list_init(&bios); - - spin_lock_irqsave(&cache->lock, flags); - bio_list_merge(&bios, &cache->deferred_bios); - bio_list_init(&cache->deferred_bios); - spin_unlock_irqrestore(&cache->lock, flags); - - while (!bio_list_empty(&bios)) { - /* - * If we've got no free migration structs, and processing - * this bio might require one, we pause until there are some - * prepared mappings to process. 
- */ - prealloc_used = true; - if (prealloc_data_structs(cache, &structs)) { - spin_lock_irqsave(&cache->lock, flags); - bio_list_merge(&cache->deferred_bios, &bios); - spin_unlock_irqrestore(&cache->lock, flags); - break; - } +/*----------------------------------------------------------------*/ - bio = bio_list_pop(&bios); +static bool process_flush_bio(struct cache *cache, struct bio *bio) +{ + size_t pb_data_size = get_per_bio_data_size(cache); + struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size); - if (bio->bi_opf & REQ_PREFLUSH) - process_flush_bio(cache, bio); - else if (bio_op(bio) == REQ_OP_DISCARD) - process_discard_bio(cache, &structs, bio); - else - process_bio(cache, &structs, bio); - } + if (!pb->req_nr) + remap_to_origin(cache, bio); + else + remap_to_cache(cache, bio, 0); - if (prealloc_used) - prealloc_free_structs(cache, &structs); + issue_after_commit(&cache->committer, bio); + return true; } -static void process_deferred_cells(struct cache *cache) +static bool process_discard_bio(struct cache *cache, struct bio *bio) { - bool prealloc_used = false; - unsigned long flags; - struct dm_bio_prison_cell *cell, *tmp; - struct list_head cells; - struct prealloc structs; - - memset(&structs, 0, sizeof(structs)); - - INIT_LIST_HEAD(&cells); - - spin_lock_irqsave(&cache->lock, flags); - list_splice_init(&cache->deferred_cells, &cells); - spin_unlock_irqrestore(&cache->lock, flags); - - list_for_each_entry_safe(cell, tmp, &cells, user_list) { - /* - * If we've got no free migration structs, and processing - * this bio might require one, we pause until there are some - * prepared mappings to process. - */ - prealloc_used = true; - if (prealloc_data_structs(cache, &structs)) { - spin_lock_irqsave(&cache->lock, flags); - list_splice(&cells, &cache->deferred_cells); - spin_unlock_irqrestore(&cache->lock, flags); - break; - } + dm_dblock_t b, e; - process_cell(cache, &structs, cell); + // FIXME: do we need to lock the region? 
Or can we just assume the + // user won't be so foolish as to issue discard concurrently with + // other IO? + calc_discard_block_range(cache, bio, &b, &e); + while (b != e) { + set_discard(cache, b); + b = to_dblock(from_dblock(b) + 1); } - if (prealloc_used) - prealloc_free_structs(cache, &structs); + bio_endio(bio); + + return false; } -static void process_deferred_flush_bios(struct cache *cache, bool submit_bios) +static void process_deferred_bios(struct work_struct *ws) { + struct cache *cache = container_of(ws, struct cache, deferred_bio_worker); + unsigned long flags; + bool commit_needed = false; struct bio_list bios; struct bio *bio; bio_list_init(&bios); spin_lock_irqsave(&cache->lock, flags); - bio_list_merge(&bios, &cache->deferred_flush_bios); - bio_list_init(&cache->deferred_flush_bios); + bio_list_merge(&bios, &cache->deferred_bios); + bio_list_init(&cache->deferred_bios); spin_unlock_irqrestore(&cache->lock, flags); - /* - * These bios have already been through inc_ds() - */ - while ((bio = bio_list_pop(&bios))) - submit_bios ? 
accounted_request(cache, bio) : bio_io_error(bio); + while ((bio = bio_list_pop(&bios))) { + if (bio->bi_opf & REQ_PREFLUSH) + commit_needed = process_flush_bio(cache, bio) || commit_needed; + + else if (bio_op(bio) == REQ_OP_DISCARD) + commit_needed = process_discard_bio(cache, bio) || commit_needed; + + else + commit_needed = process_bio(cache, bio) || commit_needed; + } + + if (commit_needed) + schedule_commit(&cache->committer); } -static void process_deferred_writethrough_bios(struct cache *cache) +static void process_deferred_writethrough_bios(struct work_struct *ws) { + struct cache *cache = container_of(ws, struct cache, deferred_writethrough_worker); + unsigned long flags; struct bio_list bios; struct bio *bio; @@ -2058,153 +1997,15 @@ static void process_deferred_writethrough_bios(struct cache *cache) spin_unlock_irqrestore(&cache->lock, flags); /* - * These bios have already been through inc_ds() + * These bios have already been through accounted_begin() */ while ((bio = bio_list_pop(&bios))) - accounted_request(cache, bio); -} - -static void writeback_some_dirty_blocks(struct cache *cache) -{ - bool prealloc_used = false; - dm_oblock_t oblock; - dm_cblock_t cblock; - struct prealloc structs; - struct dm_bio_prison_cell *old_ocell; - bool busy = !iot_idle_for(&cache->origin_tracker, HZ); - - memset(&structs, 0, sizeof(structs)); - - while (spare_migration_bandwidth(cache)) { - if (policy_writeback_work(cache->policy, &oblock, &cblock, busy)) - break; /* no work to do */ - - prealloc_used = true; - if (prealloc_data_structs(cache, &structs) || - get_cell(cache, oblock, &structs, &old_ocell)) { - policy_set_dirty(cache->policy, oblock); - break; - } - - writeback(cache, &structs, oblock, cblock, old_ocell); - } - - if (prealloc_used) - prealloc_free_structs(cache, &structs); -} - -/*---------------------------------------------------------------- - * Invalidations. - * Dropping something from the cache *without* writing back. 
- *--------------------------------------------------------------*/ - -static void process_invalidation_request(struct cache *cache, struct invalidation_request *req) -{ - int r = 0; - uint64_t begin = from_cblock(req->cblocks->begin); - uint64_t end = from_cblock(req->cblocks->end); - - while (begin != end) { - r = policy_remove_cblock(cache->policy, to_cblock(begin)); - if (!r) { - r = dm_cache_remove_mapping(cache->cmd, to_cblock(begin)); - if (r) { - metadata_operation_failed(cache, "dm_cache_remove_mapping", r); - break; - } - - } else if (r == -ENODATA) { - /* harmless, already unmapped */ - r = 0; - - } else { - DMERR("%s: policy_remove_cblock failed", cache_device_name(cache)); - break; - } - - begin++; - } - - cache->commit_requested = true; - - req->err = r; - atomic_set(&req->complete, 1); - - wake_up(&req->result_wait); -} - -static void process_invalidation_requests(struct cache *cache) -{ - struct list_head list; - struct invalidation_request *req, *tmp; - - INIT_LIST_HEAD(&list); - spin_lock(&cache->invalidation_lock); - list_splice_init(&cache->invalidation_requests, &list); - spin_unlock(&cache->invalidation_lock); - - list_for_each_entry_safe (req, tmp, &list, list) - process_invalidation_request(cache, req); + generic_make_request(bio); } /*---------------------------------------------------------------- * Main worker loop *--------------------------------------------------------------*/ -static bool is_quiescing(struct cache *cache) -{ - return atomic_read(&cache->quiescing); -} - -static void ack_quiescing(struct cache *cache) -{ - if (is_quiescing(cache)) { - atomic_inc(&cache->quiescing_ack); - wake_up(&cache->quiescing_wait); - } -} - -static void wait_for_quiescing_ack(struct cache *cache) -{ - wait_event(cache->quiescing_wait, atomic_read(&cache->quiescing_ack)); -} - -static void start_quiescing(struct cache *cache) -{ - atomic_inc(&cache->quiescing); - wait_for_quiescing_ack(cache); -} - -static void stop_quiescing(struct cache *cache) 
-{ - atomic_set(&cache->quiescing, 0); - atomic_set(&cache->quiescing_ack, 0); -} - -static void wait_for_migrations(struct cache *cache) -{ - wait_event(cache->migration_wait, !atomic_read(&cache->nr_allocated_migrations)); -} - -static void stop_worker(struct cache *cache) -{ - cancel_delayed_work(&cache->waker); - flush_workqueue(cache->wq); -} - -static void requeue_deferred_cells(struct cache *cache) -{ - unsigned long flags; - struct list_head cells; - struct dm_bio_prison_cell *cell, *tmp; - - INIT_LIST_HEAD(&cells); - spin_lock_irqsave(&cache->lock, flags); - list_splice_init(&cache->deferred_cells, &cells); - spin_unlock_irqrestore(&cache->lock, flags); - - list_for_each_entry_safe(cell, tmp, &cells, user_list) - cell_requeue(cache, cell); -} static void requeue_deferred_bios(struct cache *cache) { @@ -2221,53 +2022,6 @@ static void requeue_deferred_bios(struct cache *cache) } } -static int more_work(struct cache *cache) -{ - if (is_quiescing(cache)) - return !list_empty(&cache->quiesced_migrations) || - !list_empty(&cache->completed_migrations) || - !list_empty(&cache->need_commit_migrations); - else - return !bio_list_empty(&cache->deferred_bios) || - !list_empty(&cache->deferred_cells) || - !bio_list_empty(&cache->deferred_flush_bios) || - !bio_list_empty(&cache->deferred_writethrough_bios) || - !list_empty(&cache->quiesced_migrations) || - !list_empty(&cache->completed_migrations) || - !list_empty(&cache->need_commit_migrations) || - cache->invalidate; -} - -static void do_worker(struct work_struct *ws) -{ - struct cache *cache = container_of(ws, struct cache, worker); - - do { - if (!is_quiescing(cache)) { - writeback_some_dirty_blocks(cache); - process_deferred_writethrough_bios(cache); - process_deferred_bios(cache); - process_deferred_cells(cache); - process_invalidation_requests(cache); - } - - process_migrations(cache, &cache->quiesced_migrations, issue_copy_or_discard); - process_migrations(cache, &cache->completed_migrations, 
complete_migration); - - if (commit_if_needed(cache)) { - process_deferred_flush_bios(cache, false); - process_migrations(cache, &cache->need_commit_migrations, migration_failure); - } else { - process_deferred_flush_bios(cache, true); - process_migrations(cache, &cache->need_commit_migrations, - migration_success_post_commit); - } - - ack_quiescing(cache); - - } while (more_work(cache)); -} - /* * We want to commit periodically so that not too much * unwritten metadata builds up. @@ -2275,25 +2029,39 @@ static void do_worker(struct work_struct *ws) static void do_waker(struct work_struct *ws) { struct cache *cache = container_of(to_delayed_work(ws), struct cache, waker); + policy_tick(cache->policy, true); - wake_worker(cache); + wake_migration_worker(cache); + schedule_commit(&cache->committer); queue_delayed_work(cache->wq, &cache->waker, COMMIT_PERIOD); } -/*----------------------------------------------------------------*/ - -static int is_congested(struct dm_dev *dev, int bdi_bits) +static void check_migrations(struct work_struct *ws) { - struct request_queue *q = bdev_get_queue(dev->bdev); - return bdi_congested(q->backing_dev_info, bdi_bits); -} + int r; + struct policy_work *op; + struct cache *cache = container_of(ws, struct cache, migration_worker); + enum busy b; -static int cache_is_congested(struct dm_target_callbacks *cb, int bdi_bits) -{ - struct cache *cache = container_of(cb, struct cache, callbacks); + for (;;) { + b = spare_migration_bandwidth(cache); + if (b == BUSY) + break; - return is_congested(cache->origin_dev, bdi_bits) || - is_congested(cache->cache_dev, bdi_bits); + r = policy_get_background_work(cache->policy, b == IDLE, &op); + if (r == -ENODATA) + break; + + if (r) { + DMERR_LIMIT("%s: policy_background_work failed", + cache_device_name(cache)); + break; + } + + r = mg_start(cache, op, NULL); + if (r) + break; + } } /*---------------------------------------------------------------- @@ -2310,11 +2078,8 @@ static void destroy(struct 
cache *cache) mempool_destroy(cache->migration_pool); - if (cache->all_io_ds) - dm_deferred_set_destroy(cache->all_io_ds); - if (cache->prison) - dm_bio_prison_destroy(cache->prison); + dm_bio_prison_destroy_v2(cache->prison); if (cache->wq) destroy_workqueue(cache->wq); @@ -2707,6 +2472,7 @@ static int create_cache_policy(struct cache *cache, struct cache_args *ca, return PTR_ERR(p); } cache->policy = p; + BUG_ON(!cache->policy); return 0; } @@ -2750,6 +2516,20 @@ static void set_cache_size(struct cache *cache, dm_cblock_t size) cache->cache_size = size; } +static int is_congested(struct dm_dev *dev, int bdi_bits) +{ + struct request_queue *q = bdev_get_queue(dev->bdev); + return bdi_congested(q->backing_dev_info, bdi_bits); +} + +static int cache_is_congested(struct dm_target_callbacks *cb, int bdi_bits) +{ + struct cache *cache = container_of(cb, struct cache, callbacks); + + return is_congested(cache->origin_dev, bdi_bits) || + is_congested(cache->cache_dev, bdi_bits); +} + #define DEFAULT_MIGRATION_THRESHOLD 2048 static int cache_create(struct cache_args *ca, struct cache **result) @@ -2788,7 +2568,6 @@ static int cache_create(struct cache_args *ca, struct cache **result) ca->metadata_dev = ca->origin_dev = ca->cache_dev = NULL; - /* FIXME: factor out this whole section */ origin_blocks = cache->origin_sectors = ca->origin_sectors; origin_blocks = block_div(origin_blocks, ca->block_size); cache->origin_blocks = to_oblock(origin_blocks); @@ -2854,24 +2633,18 @@ static int cache_create(struct cache_args *ca, struct cache **result) r = -EINVAL; goto bad; } + + policy_allow_migrations(cache->policy, false); } spin_lock_init(&cache->lock); INIT_LIST_HEAD(&cache->deferred_cells); bio_list_init(&cache->deferred_bios); - bio_list_init(&cache->deferred_flush_bios); bio_list_init(&cache->deferred_writethrough_bios); - INIT_LIST_HEAD(&cache->quiesced_migrations); - INIT_LIST_HEAD(&cache->completed_migrations); - INIT_LIST_HEAD(&cache->need_commit_migrations); 
atomic_set(&cache->nr_allocated_migrations, 0); atomic_set(&cache->nr_io_migrations, 0); init_waitqueue_head(&cache->migration_wait); - init_waitqueue_head(&cache->quiescing_wait); - atomic_set(&cache->quiescing, 0); - atomic_set(&cache->quiescing_ack, 0); - r = -ENOMEM; atomic_set(&cache->nr_dirty, 0); cache->dirty_bitset = alloc_bitset(from_cblock(cache->cache_size)); @@ -2900,27 +2673,23 @@ static int cache_create(struct cache_args *ca, struct cache **result) goto bad; } - cache->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM); + cache->wq = alloc_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM, 0); if (!cache->wq) { *error = "could not create workqueue for metadata object"; goto bad; } - INIT_WORK(&cache->worker, do_worker); + INIT_WORK(&cache->deferred_bio_worker, process_deferred_bios); + INIT_WORK(&cache->deferred_writethrough_worker, + process_deferred_writethrough_bios); + INIT_WORK(&cache->migration_worker, check_migrations); INIT_DELAYED_WORK(&cache->waker, do_waker); - cache->last_commit_jiffies = jiffies; - cache->prison = dm_bio_prison_create(); + cache->prison = dm_bio_prison_create_v2(cache->wq); if (!cache->prison) { *error = "could not create bio prison"; goto bad; } - cache->all_io_ds = dm_deferred_set_create(); - if (!cache->all_io_ds) { - *error = "could not create all_io deferred set"; - goto bad; - } - cache->migration_pool = mempool_create_slab_pool(MIGRATION_POOL_SIZE, migration_cache); if (!cache->migration_pool) { @@ -2947,11 +2716,15 @@ static int cache_create(struct cache_args *ca, struct cache **result) spin_lock_init(&cache->invalidation_lock); INIT_LIST_HEAD(&cache->invalidation_requests); + batcher_init(&cache->committer, commit_op, cache, + issue_op, cache, cache->wq); iot_init(&cache->origin_tracker); + init_rwsem(&cache->background_work_lock); + prevent_background_work(cache); + *result = cache; return 0; - bad: destroy(cache); return r; @@ -3009,7 +2782,6 @@ static int cache_ctr(struct dm_target *ti, unsigned 
argc, char **argv) } ti->private = cache; - out: destroy_cache_args(ca); return r; @@ -3022,17 +2794,11 @@ static int cache_map(struct dm_target *ti, struct bio *bio) struct cache *cache = ti->private; int r; - struct dm_bio_prison_cell *cell = NULL; + bool commit_needed; dm_oblock_t block = get_bio_block(cache, bio); size_t pb_data_size = get_per_bio_data_size(cache); - bool can_migrate = false; - bool fast_promotion; - struct policy_result lookup_result; - struct per_bio_data *pb = init_per_bio_data(bio, pb_data_size); - struct old_oblock_lock ool; - - ool.locker.fn = null_locker; + init_per_bio_data(bio, pb_data_size); if (unlikely(from_oblock(block) >= from_oblock(cache->origin_blocks))) { /* * This can only occur if the io goes to a partial block at @@ -3049,101 +2815,9 @@ static int cache_map(struct dm_target *ti, struct bio *bio) return DM_MAPIO_SUBMITTED; } - /* - * Check to see if that block is currently migrating. - */ - cell = alloc_prison_cell(cache); - if (!cell) { - defer_bio(cache, bio); - return DM_MAPIO_SUBMITTED; - } - - r = bio_detain(cache, block, bio, cell, - (cell_free_fn) free_prison_cell, - cache, &cell); - if (r) { - if (r < 0) - defer_bio(cache, bio); - - return DM_MAPIO_SUBMITTED; - } - - fast_promotion = is_discarded_oblock(cache, block) || bio_writes_complete_block(cache, bio); - - r = policy_map(cache->policy, block, false, can_migrate, fast_promotion, - bio, &ool.locker, &lookup_result); - if (r == -EWOULDBLOCK) { - cell_defer(cache, cell, true); - return DM_MAPIO_SUBMITTED; - - } else if (r) { - DMERR_LIMIT("%s: Unexpected return from cache replacement policy: %d", - cache_device_name(cache), r); - cell_defer(cache, cell, false); - bio_io_error(bio); - return DM_MAPIO_SUBMITTED; - } - - r = DM_MAPIO_REMAPPED; - switch (lookup_result.op) { - case POLICY_HIT: - if (passthrough_mode(&cache->features)) { - if (bio_data_dir(bio) == WRITE) { - /* - * We need to invalidate this block, so - * defer for the worker thread. 
- */ - cell_defer(cache, cell, true); - r = DM_MAPIO_SUBMITTED; - - } else { - inc_miss_counter(cache, bio); - remap_to_origin_clear_discard(cache, bio, block); - accounted_begin(cache, bio); - inc_ds(cache, bio, cell); - // FIXME: we want to remap hits or misses straight - // away rather than passing over to the worker. - cell_defer(cache, cell, false); - } - - } else { - inc_hit_counter(cache, bio); - if (bio_data_dir(bio) == WRITE && writethrough_mode(&cache->features) && - !is_dirty(cache, lookup_result.cblock)) { - remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock); - accounted_begin(cache, bio); - inc_ds(cache, bio, cell); - cell_defer(cache, cell, false); - - } else - remap_cell_to_cache_dirty(cache, cell, block, lookup_result.cblock, false); - } - break; - - case POLICY_MISS: - inc_miss_counter(cache, bio); - if (pb->req_nr != 0) { - /* - * This is a duplicate writethrough io that is no - * longer needed because the block has been demoted. - */ - bio_endio(bio); - // FIXME: remap everything as a miss - cell_defer(cache, cell, false); - r = DM_MAPIO_SUBMITTED; - - } else - remap_cell_to_origin_clear_discard(cache, cell, block, false); - break; - - default: - DMERR_LIMIT("%s: %s: erroring bio: unknown policy op: %u", - cache_device_name(cache), __func__, - (unsigned) lookup_result.op); - cell_defer(cache, cell, false); - bio_io_error(bio); - r = DM_MAPIO_SUBMITTED; - } + r = map_bio(cache, bio, block, &commit_needed); + if (commit_needed) + schedule_commit(&cache->committer); return r; } @@ -3163,7 +2837,7 @@ static int cache_end_io(struct dm_target *ti, struct bio *bio, int error) spin_unlock_irqrestore(&cache->lock, flags); } - check_for_quiesced_migrations(cache, pb); + bio_drop_shared_lock(cache, bio); accounted_complete(cache, bio); return 0; @@ -3263,12 +2937,18 @@ static void cache_postsuspend(struct dm_target *ti) { struct cache *cache = ti->private; - start_quiescing(cache); - wait_for_migrations(cache); - stop_worker(cache); + 
prevent_background_work(cache); + BUG_ON(atomic_read(&cache->nr_io_migrations)); + + cancel_delayed_work(&cache->waker); + flush_workqueue(cache->wq); + WARN_ON(cache->origin_tracker.in_flight); + + /* + * If it's a flush suspend there won't be any deferred bios, so this + * call is harmless. + */ requeue_deferred_bios(cache); - requeue_deferred_cells(cache); - stop_quiescing(cache); if (get_cache_mode(cache) == CM_WRITE) (void) sync_metadata(cache); @@ -3280,15 +2960,10 @@ static int load_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock, int r; struct cache *cache = context; - r = policy_load_mapping(cache->policy, oblock, cblock, hint, hint_valid); + r = policy_load_mapping(cache->policy, oblock, cblock, dirty, hint, hint_valid); if (r) return r; - if (dirty) - set_dirty(cache, oblock, cblock); - else - clear_dirty(cache, oblock, cblock); - return 0; } @@ -3487,6 +3162,7 @@ static void cache_resume(struct dm_target *ti) struct cache *cache = ti->private; cache->need_tick_bio = true; + allow_background_work(cache); do_waker(&cache->waker.work); } @@ -3621,10 +3297,19 @@ err: } /* + * Defines a range of cblocks, begin to (end - 1) are in the range. end is + * the one-past-the-end value. + */ +struct cblock_range { + dm_cblock_t begin; + dm_cblock_t end; +}; + +/* * A cache block range can take two forms: * * i) A single cblock, eg. '3456' - * ii) A begin and end cblock with dots between, eg. 123-234 + * ii) A begin and end cblock with a dash between, eg. 
123-234 */ static int parse_cblock_range(struct cache *cache, const char *str, struct cblock_range *result) @@ -3690,23 +3375,31 @@ static int validate_cblock_range(struct cache *cache, struct cblock_range *range return 0; } +static inline dm_cblock_t cblock_succ(dm_cblock_t b) +{ + return to_cblock(from_cblock(b) + 1); +} + static int request_invalidation(struct cache *cache, struct cblock_range *range) { - struct invalidation_request req; + int r = 0; - INIT_LIST_HEAD(&req.list); - req.cblocks = range; - atomic_set(&req.complete, 0); - req.err = 0; - init_waitqueue_head(&req.result_wait); + /* + * We don't need to do any locking here because we know we're in + * passthrough mode. There is potential for a race between an + * invalidation triggered by an io and an invalidation message. This + * is harmless; we must not worry if the policy call fails. + */ + while (range->begin != range->end) { + r = invalidate_cblock(cache, range->begin); + if (r) + return r; - spin_lock(&cache->invalidation_lock); - list_add(&req.list, &cache->invalidation_requests); - spin_unlock(&cache->invalidation_lock); - wake_worker(cache); + range->begin = cblock_succ(range->begin); + } - wait_event(req.result_wait, atomic_read(&req.complete)); - return req.err; + cache->commit_requested = true; + return r; } static int process_invalidate_cblocks_message(struct cache *cache, unsigned count, @@ -3816,7 +3509,7 @@ static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits) static struct target_type cache_target = { .name = "cache", - .version = {1, 10, 0}, + .version = {2, 0, 0}, .module = THIS_MODULE, .ctr = cache_ctr, .dtr = cache_dtr, |