30 files changed, 849 insertions, 2024 deletions
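Note on the dm-cache-metadata.c hunks further down: the WRITE_LOCK()/READ_LOCK() macros are reworked so that cmd->root_lock is acquired first and cmd->fail_io / dm_bm_is_read_only() are tested while the lock is held, with the lock dropped again on the error return. A minimal, stand-alone sketch of that take-then-check pattern follows; struct cmd_like and cmd_read_op() are illustrative names, not part of the patch.

/* Illustrative sketch only -- not from the patch.  Mirrors the reworked
 * READ_LOCK()/WRITE_LOCK() behaviour: acquire root_lock, test the
 * failure flag under the lock, and release it on the error path so a
 * caller never returns -EINVAL while still holding the semaphore.
 */
#include <linux/rwsem.h>
#include <linux/errno.h>
#include <linux/types.h>

struct cmd_like {
	struct rw_semaphore root_lock;	/* stands in for cmd->root_lock */
	bool fail_io;			/* stands in for cmd->fail_io   */
};

static int cmd_read_op(struct cmd_like *cmd)
{
	down_read(&cmd->root_lock);
	if (cmd->fail_io) {			/* checked under the lock */
		up_read(&cmd->root_lock);
		return -EINVAL;
	}

	/* ... perform the metadata read here ... */

	up_read(&cmd->root_lock);
	return 0;
}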
diff --git a/Documentation/device-mapper/cache-policies.txt b/Documentation/device-mapper/cache-policies.txt index d9246a32e673..e5062ad18717 100644 --- a/Documentation/device-mapper/cache-policies.txt +++ b/Documentation/device-mapper/cache-policies.txt @@ -28,51 +28,16 @@ Overview of supplied cache replacement policies multiqueue (mq) --------------- -This policy has been deprecated in favor of the smq policy (see below). +This policy is now an alias for smq (see below). -The multiqueue policy has three sets of 16 queues: one set for entries -waiting for the cache and another two for those in the cache (a set for -clean entries and a set for dirty entries). +The following tunables are accepted, but have no effect: -Cache entries in the queues are aged based on logical time. Entry into -the cache is based on variable thresholds and queue selection is based -on hit count on entry. The policy aims to take different cache miss -costs into account and to adjust to varying load patterns automatically. - -Message and constructor argument pairs are: 'sequential_threshold <#nr_sequential_ios>' 'random_threshold <#nr_random_ios>' 'read_promote_adjustment <value>' 'write_promote_adjustment <value>' 'discard_promote_adjustment <value>' -The sequential threshold indicates the number of contiguous I/Os -required before a stream is treated as sequential. Once a stream is -considered sequential it will bypass the cache. The random threshold -is the number of intervening non-contiguous I/Os that must be seen -before the stream is treated as random again. - -The sequential and random thresholds default to 512 and 4 respectively. - -Large, sequential I/Os are probably better left on the origin device -since spindles tend to have good sequential I/O bandwidth. The -io_tracker counts contiguous I/Os to try to spot when the I/O is in one -of these sequential modes. But there are use-cases for wanting to -promote sequential blocks to the cache (e.g. fast application startup). -If sequential threshold is set to 0 the sequential I/O detection is -disabled and sequential I/O will no longer implicitly bypass the cache. -Setting the random threshold to 0 does _not_ disable the random I/O -stream detection. - -Internally the mq policy determines a promotion threshold. If the hit -count of a block not in the cache goes above this threshold it gets -promoted to the cache. The read, write and discard promote adjustment -tunables allow you to tweak the promotion threshold by adding a small -value based on the io type. They default to 4, 8 and 1 respectively. -If you're trying to quickly warm a new cache device you may wish to -reduce these to encourage promotion. Remember to switch them back to -their defaults after the cache fills though. - Stochastic multiqueue (smq) --------------------------- diff --git a/block/blk-core.c b/block/blk-core.c index b83d29755b5a..45f4d7efbf34 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2198,7 +2198,7 @@ int blk_insert_cloned_request(struct request_queue *q, struct request *rq) if (q->mq_ops) { if (blk_queue_io_stat(q)) blk_account_io_start(rq, true); - blk_mq_insert_request(rq, false, true, true); + blk_mq_insert_request(rq, false, true, false); return 0; } diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 0a2e7273db9e..02a5345a44a6 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -249,6 +249,7 @@ config DM_DEBUG_BLOCK_STACK_TRACING block manager locking used by thin provisioning and caching. If unsure, say N. 
+ config DM_BIO_PRISON tristate depends on BLK_DEV_DM @@ -304,16 +305,6 @@ config DM_CACHE algorithms used to select which blocks are promoted, demoted, cleaned etc. It supports writeback and writethrough modes. -config DM_CACHE_MQ - tristate "MQ Cache Policy (EXPERIMENTAL)" - depends on DM_CACHE - default y - ---help--- - A cache policy that uses a multiqueue ordered by recent hit - count to select which blocks should be promoted and demoted. - This is meant to be a general purpose policy. It prioritises - reads over writes. - config DM_CACHE_SMQ tristate "Stochastic MQ Cache Policy (EXPERIMENTAL)" depends on DM_CACHE diff --git a/drivers/md/Makefile b/drivers/md/Makefile index 62a65764e8e0..52ba8dd82821 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -12,7 +12,6 @@ dm-log-userspace-y \ += dm-log-userspace-base.o dm-log-userspace-transfer.o dm-thin-pool-y += dm-thin.o dm-thin-metadata.o dm-cache-y += dm-cache-target.o dm-cache-metadata.o dm-cache-policy.o -dm-cache-mq-y += dm-cache-policy-mq.o dm-cache-smq-y += dm-cache-policy-smq.o dm-cache-cleaner-y += dm-cache-policy-cleaner.o dm-era-y += dm-era-target.o @@ -55,7 +54,6 @@ obj-$(CONFIG_DM_RAID) += dm-raid.o obj-$(CONFIG_DM_THIN_PROVISIONING) += dm-thin-pool.o obj-$(CONFIG_DM_VERITY) += dm-verity.o obj-$(CONFIG_DM_CACHE) += dm-cache.o -obj-$(CONFIG_DM_CACHE_MQ) += dm-cache-mq.o obj-$(CONFIG_DM_CACHE_SMQ) += dm-cache-smq.o obj-$(CONFIG_DM_CACHE_CLEANER) += dm-cache-cleaner.o obj-$(CONFIG_DM_ERA) += dm-era.o diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c index f6543f3a970f..27f2ef300f8b 100644 --- a/drivers/md/dm-cache-metadata.c +++ b/drivers/md/dm-cache-metadata.c @@ -867,19 +867,40 @@ static int blocks_are_unmapped_or_clean(struct dm_cache_metadata *cmd, return 0; } -#define WRITE_LOCK(cmd) \ - if (cmd->fail_io || dm_bm_is_read_only(cmd->bm)) \ +#define WRITE_LOCK(cmd) \ + down_write(&cmd->root_lock); \ + if (cmd->fail_io || dm_bm_is_read_only(cmd->bm)) { \ + up_write(&cmd->root_lock); \ return -EINVAL; \ - down_write(&cmd->root_lock) + } #define WRITE_LOCK_VOID(cmd) \ - if (cmd->fail_io || dm_bm_is_read_only(cmd->bm)) \ + down_write(&cmd->root_lock); \ + if (cmd->fail_io || dm_bm_is_read_only(cmd->bm)) { \ + up_write(&cmd->root_lock); \ return; \ - down_write(&cmd->root_lock) + } #define WRITE_UNLOCK(cmd) \ up_write(&cmd->root_lock) +#define READ_LOCK(cmd) \ + down_read(&cmd->root_lock); \ + if (cmd->fail_io || dm_bm_is_read_only(cmd->bm)) { \ + up_read(&cmd->root_lock); \ + return -EINVAL; \ + } + +#define READ_LOCK_VOID(cmd) \ + down_read(&cmd->root_lock); \ + if (cmd->fail_io || dm_bm_is_read_only(cmd->bm)) { \ + up_read(&cmd->root_lock); \ + return; \ + } + +#define READ_UNLOCK(cmd) \ + up_read(&cmd->root_lock) + int dm_cache_resize(struct dm_cache_metadata *cmd, dm_cblock_t new_cache_size) { int r; @@ -1015,22 +1036,20 @@ int dm_cache_load_discards(struct dm_cache_metadata *cmd, { int r; - down_read(&cmd->root_lock); + READ_LOCK(cmd); r = __load_discards(cmd, fn, context); - up_read(&cmd->root_lock); + READ_UNLOCK(cmd); return r; } -dm_cblock_t dm_cache_size(struct dm_cache_metadata *cmd) +int dm_cache_size(struct dm_cache_metadata *cmd, dm_cblock_t *result) { - dm_cblock_t r; + READ_LOCK(cmd); + *result = cmd->cache_blocks; + READ_UNLOCK(cmd); - down_read(&cmd->root_lock); - r = cmd->cache_blocks; - up_read(&cmd->root_lock); - - return r; + return 0; } static int __remove(struct dm_cache_metadata *cmd, dm_cblock_t cblock) @@ -1188,9 +1207,9 @@ int dm_cache_load_mappings(struct 
dm_cache_metadata *cmd, { int r; - down_read(&cmd->root_lock); + READ_LOCK(cmd); r = __load_mappings(cmd, policy, fn, context); - up_read(&cmd->root_lock); + READ_UNLOCK(cmd); return r; } @@ -1215,18 +1234,18 @@ static int __dump_mappings(struct dm_cache_metadata *cmd) void dm_cache_dump(struct dm_cache_metadata *cmd) { - down_read(&cmd->root_lock); + READ_LOCK_VOID(cmd); __dump_mappings(cmd); - up_read(&cmd->root_lock); + READ_UNLOCK(cmd); } int dm_cache_changed_this_transaction(struct dm_cache_metadata *cmd) { int r; - down_read(&cmd->root_lock); + READ_LOCK(cmd); r = cmd->changed; - up_read(&cmd->root_lock); + READ_UNLOCK(cmd); return r; } @@ -1276,9 +1295,9 @@ int dm_cache_set_dirty(struct dm_cache_metadata *cmd, void dm_cache_metadata_get_stats(struct dm_cache_metadata *cmd, struct dm_cache_statistics *stats) { - down_read(&cmd->root_lock); + READ_LOCK_VOID(cmd); *stats = cmd->stats; - up_read(&cmd->root_lock); + READ_UNLOCK(cmd); } void dm_cache_metadata_set_stats(struct dm_cache_metadata *cmd, @@ -1312,9 +1331,9 @@ int dm_cache_get_free_metadata_block_count(struct dm_cache_metadata *cmd, { int r = -EINVAL; - down_read(&cmd->root_lock); + READ_LOCK(cmd); r = dm_sm_get_nr_free(cmd->metadata_sm, result); - up_read(&cmd->root_lock); + READ_UNLOCK(cmd); return r; } @@ -1324,9 +1343,9 @@ int dm_cache_get_metadata_dev_size(struct dm_cache_metadata *cmd, { int r = -EINVAL; - down_read(&cmd->root_lock); + READ_LOCK(cmd); r = dm_sm_get_nr_blocks(cmd->metadata_sm, result); - up_read(&cmd->root_lock); + READ_UNLOCK(cmd); return r; } @@ -1417,7 +1436,13 @@ int dm_cache_write_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy * int dm_cache_metadata_all_clean(struct dm_cache_metadata *cmd, bool *result) { - return blocks_are_unmapped_or_clean(cmd, 0, cmd->cache_blocks, result); + int r; + + READ_LOCK(cmd); + r = blocks_are_unmapped_or_clean(cmd, 0, cmd->cache_blocks, result); + READ_UNLOCK(cmd); + + return r; } void dm_cache_metadata_set_read_only(struct dm_cache_metadata *cmd) @@ -1440,10 +1465,7 @@ int dm_cache_metadata_set_needs_check(struct dm_cache_metadata *cmd) struct dm_block *sblock; struct cache_disk_superblock *disk_super; - /* - * We ignore fail_io for this function. - */ - down_write(&cmd->root_lock); + WRITE_LOCK(cmd); set_bit(NEEDS_CHECK, &cmd->flags); r = superblock_lock(cmd, &sblock); @@ -1458,19 +1480,17 @@ int dm_cache_metadata_set_needs_check(struct dm_cache_metadata *cmd) dm_bm_unlock(sblock); out: - up_write(&cmd->root_lock); + WRITE_UNLOCK(cmd); return r; } -bool dm_cache_metadata_needs_check(struct dm_cache_metadata *cmd) +int dm_cache_metadata_needs_check(struct dm_cache_metadata *cmd, bool *result) { - bool needs_check; + READ_LOCK(cmd); + *result = !!test_bit(NEEDS_CHECK, &cmd->flags); + READ_UNLOCK(cmd); - down_read(&cmd->root_lock); - needs_check = !!test_bit(NEEDS_CHECK, &cmd->flags); - up_read(&cmd->root_lock); - - return needs_check; + return 0; } int dm_cache_metadata_abort(struct dm_cache_metadata *cmd) diff --git a/drivers/md/dm-cache-metadata.h b/drivers/md/dm-cache-metadata.h index 2ffee21f318d..8528744195e5 100644 --- a/drivers/md/dm-cache-metadata.h +++ b/drivers/md/dm-cache-metadata.h @@ -66,7 +66,7 @@ void dm_cache_metadata_close(struct dm_cache_metadata *cmd); * origin blocks to map to. 
*/ int dm_cache_resize(struct dm_cache_metadata *cmd, dm_cblock_t new_cache_size); -dm_cblock_t dm_cache_size(struct dm_cache_metadata *cmd); +int dm_cache_size(struct dm_cache_metadata *cmd, dm_cblock_t *result); int dm_cache_discard_bitset_resize(struct dm_cache_metadata *cmd, sector_t discard_block_size, @@ -137,7 +137,7 @@ int dm_cache_write_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy * */ int dm_cache_metadata_all_clean(struct dm_cache_metadata *cmd, bool *result); -bool dm_cache_metadata_needs_check(struct dm_cache_metadata *cmd); +int dm_cache_metadata_needs_check(struct dm_cache_metadata *cmd, bool *result); int dm_cache_metadata_set_needs_check(struct dm_cache_metadata *cmd); void dm_cache_metadata_set_read_only(struct dm_cache_metadata *cmd); void dm_cache_metadata_set_read_write(struct dm_cache_metadata *cmd); diff --git a/drivers/md/dm-cache-policy-mq.c b/drivers/md/dm-cache-policy-mq.c deleted file mode 100644 index ddb26980cd66..000000000000 --- a/drivers/md/dm-cache-policy-mq.c +++ /dev/null @@ -1,1473 +0,0 @@ -/* - * Copyright (C) 2012 Red Hat. All rights reserved. - * - * This file is released under the GPL. - */ - -#include "dm-cache-policy.h" -#include "dm.h" - -#include <linux/hash.h> -#include <linux/jiffies.h> -#include <linux/module.h> -#include <linux/mutex.h> -#include <linux/slab.h> -#include <linux/vmalloc.h> - -#define DM_MSG_PREFIX "cache-policy-mq" - -static struct kmem_cache *mq_entry_cache; - -/*----------------------------------------------------------------*/ - -static unsigned next_power(unsigned n, unsigned min) -{ - return roundup_pow_of_two(max(n, min)); -} - -/*----------------------------------------------------------------*/ - -/* - * Large, sequential ios are probably better left on the origin device since - * spindles tend to have good bandwidth. - * - * The io_tracker tries to spot when the io is in one of these sequential - * modes. - * - * Two thresholds to switch between random and sequential io mode are defaulting - * as follows and can be adjusted via the constructor and message interfaces. - */ -#define RANDOM_THRESHOLD_DEFAULT 4 -#define SEQUENTIAL_THRESHOLD_DEFAULT 512 - -enum io_pattern { - PATTERN_SEQUENTIAL, - PATTERN_RANDOM -}; - -struct io_tracker { - enum io_pattern pattern; - - unsigned nr_seq_samples; - unsigned nr_rand_samples; - unsigned thresholds[2]; - - dm_oblock_t last_end_oblock; -}; - -static void iot_init(struct io_tracker *t, - int sequential_threshold, int random_threshold) -{ - t->pattern = PATTERN_RANDOM; - t->nr_seq_samples = 0; - t->nr_rand_samples = 0; - t->last_end_oblock = 0; - t->thresholds[PATTERN_RANDOM] = random_threshold; - t->thresholds[PATTERN_SEQUENTIAL] = sequential_threshold; -} - -static enum io_pattern iot_pattern(struct io_tracker *t) -{ - return t->pattern; -} - -static void iot_update_stats(struct io_tracker *t, struct bio *bio) -{ - if (bio->bi_iter.bi_sector == from_oblock(t->last_end_oblock) + 1) - t->nr_seq_samples++; - else { - /* - * Just one non-sequential IO is enough to reset the - * counters. 
- */ - if (t->nr_seq_samples) { - t->nr_seq_samples = 0; - t->nr_rand_samples = 0; - } - - t->nr_rand_samples++; - } - - t->last_end_oblock = to_oblock(bio_end_sector(bio) - 1); -} - -static void iot_check_for_pattern_switch(struct io_tracker *t) -{ - switch (t->pattern) { - case PATTERN_SEQUENTIAL: - if (t->nr_rand_samples >= t->thresholds[PATTERN_RANDOM]) { - t->pattern = PATTERN_RANDOM; - t->nr_seq_samples = t->nr_rand_samples = 0; - } - break; - - case PATTERN_RANDOM: - if (t->nr_seq_samples >= t->thresholds[PATTERN_SEQUENTIAL]) { - t->pattern = PATTERN_SEQUENTIAL; - t->nr_seq_samples = t->nr_rand_samples = 0; - } - break; - } -} - -static void iot_examine_bio(struct io_tracker *t, struct bio *bio) -{ - iot_update_stats(t, bio); - iot_check_for_pattern_switch(t); -} - -/*----------------------------------------------------------------*/ - - -/* - * This queue is divided up into different levels. Allowing us to push - * entries to the back of any of the levels. Think of it as a partially - * sorted queue. - */ -#define NR_QUEUE_LEVELS 16u -#define NR_SENTINELS NR_QUEUE_LEVELS * 3 - -#define WRITEBACK_PERIOD HZ - -struct queue { - unsigned nr_elts; - bool current_writeback_sentinels; - unsigned long next_writeback; - struct list_head qs[NR_QUEUE_LEVELS]; - struct list_head sentinels[NR_SENTINELS]; -}; - -static void queue_init(struct queue *q) -{ - unsigned i; - - q->nr_elts = 0; - q->current_writeback_sentinels = false; - q->next_writeback = 0; - for (i = 0; i < NR_QUEUE_LEVELS; i++) { - INIT_LIST_HEAD(q->qs + i); - INIT_LIST_HEAD(q->sentinels + i); - INIT_LIST_HEAD(q->sentinels + NR_QUEUE_LEVELS + i); - INIT_LIST_HEAD(q->sentinels + (2 * NR_QUEUE_LEVELS) + i); - } -} - -static unsigned queue_size(struct queue *q) -{ - return q->nr_elts; -} - -static bool queue_empty(struct queue *q) -{ - return q->nr_elts == 0; -} - -/* - * Insert an entry to the back of the given level. - */ -static void queue_push(struct queue *q, unsigned level, struct list_head *elt) -{ - q->nr_elts++; - list_add_tail(elt, q->qs + level); -} - -static void queue_remove(struct queue *q, struct list_head *elt) -{ - q->nr_elts--; - list_del(elt); -} - -static bool is_sentinel(struct queue *q, struct list_head *h) -{ - return (h >= q->sentinels) && (h < (q->sentinels + NR_SENTINELS)); -} - -/* - * Gives us the oldest entry of the lowest popoulated level. If the first - * level is emptied then we shift down one level. - */ -static struct list_head *queue_peek(struct queue *q) -{ - unsigned level; - struct list_head *h; - - for (level = 0; level < NR_QUEUE_LEVELS; level++) - list_for_each(h, q->qs + level) - if (!is_sentinel(q, h)) - return h; - - return NULL; -} - -static struct list_head *queue_pop(struct queue *q) -{ - struct list_head *r = queue_peek(q); - - if (r) { - q->nr_elts--; - list_del(r); - } - - return r; -} - -/* - * Pops an entry from a level that is not past a sentinel. 
- */ -static struct list_head *queue_pop_old(struct queue *q) -{ - unsigned level; - struct list_head *h; - - for (level = 0; level < NR_QUEUE_LEVELS; level++) - list_for_each(h, q->qs + level) { - if (is_sentinel(q, h)) - break; - - q->nr_elts--; - list_del(h); - return h; - } - - return NULL; -} - -static struct list_head *list_pop(struct list_head *lh) -{ - struct list_head *r = lh->next; - - BUG_ON(!r); - list_del_init(r); - - return r; -} - -static struct list_head *writeback_sentinel(struct queue *q, unsigned level) -{ - if (q->current_writeback_sentinels) - return q->sentinels + NR_QUEUE_LEVELS + level; - else - return q->sentinels + 2 * NR_QUEUE_LEVELS + level; -} - -static void queue_update_writeback_sentinels(struct queue *q) -{ - unsigned i; - struct list_head *h; - - if (time_after(jiffies, q->next_writeback)) { - for (i = 0; i < NR_QUEUE_LEVELS; i++) { - h = writeback_sentinel(q, i); - list_del(h); - list_add_tail(h, q->qs + i); - } - - q->next_writeback = jiffies + WRITEBACK_PERIOD; - q->current_writeback_sentinels = !q->current_writeback_sentinels; - } -} - -/* - * Sometimes we want to iterate through entries that have been pushed since - * a certain event. We use sentinel entries on the queues to delimit these - * 'tick' events. - */ -static void queue_tick(struct queue *q) -{ - unsigned i; - - for (i = 0; i < NR_QUEUE_LEVELS; i++) { - list_del(q->sentinels + i); - list_add_tail(q->sentinels + i, q->qs + i); - } -} - -typedef void (*iter_fn)(struct list_head *, void *); -static void queue_iterate_tick(struct queue *q, iter_fn fn, void *context) -{ - unsigned i; - struct list_head *h; - - for (i = 0; i < NR_QUEUE_LEVELS; i++) { - list_for_each_prev(h, q->qs + i) { - if (is_sentinel(q, h)) - break; - - fn(h, context); - } - } -} - -/*----------------------------------------------------------------*/ - -/* - * Describes a cache entry. Used in both the cache and the pre_cache. - */ -struct entry { - struct hlist_node hlist; - struct list_head list; - dm_oblock_t oblock; - - /* - * FIXME: pack these better - */ - bool dirty:1; - unsigned hit_count; -}; - -/* - * Rather than storing the cblock in an entry, we allocate all entries in - * an array, and infer the cblock from the entry position. - * - * Free entries are linked together into a list. - */ -struct entry_pool { - struct entry *entries, *entries_end; - struct list_head free; - unsigned nr_allocated; -}; - -static int epool_init(struct entry_pool *ep, unsigned nr_entries) -{ - unsigned i; - - ep->entries = vzalloc(sizeof(struct entry) * nr_entries); - if (!ep->entries) - return -ENOMEM; - - ep->entries_end = ep->entries + nr_entries; - - INIT_LIST_HEAD(&ep->free); - for (i = 0; i < nr_entries; i++) - list_add(&ep->entries[i].list, &ep->free); - - ep->nr_allocated = 0; - - return 0; -} - -static void epool_exit(struct entry_pool *ep) -{ - vfree(ep->entries); -} - -static struct entry *alloc_entry(struct entry_pool *ep) -{ - struct entry *e; - - if (list_empty(&ep->free)) - return NULL; - - e = list_entry(list_pop(&ep->free), struct entry, list); - INIT_LIST_HEAD(&e->list); - INIT_HLIST_NODE(&e->hlist); - ep->nr_allocated++; - - return e; -} - -/* - * This assumes the cblock hasn't already been allocated. 
- */ -static struct entry *alloc_particular_entry(struct entry_pool *ep, dm_cblock_t cblock) -{ - struct entry *e = ep->entries + from_cblock(cblock); - - list_del_init(&e->list); - INIT_HLIST_NODE(&e->hlist); - ep->nr_allocated++; - - return e; -} - -static void free_entry(struct entry_pool *ep, struct entry *e) -{ - BUG_ON(!ep->nr_allocated); - ep->nr_allocated--; - INIT_HLIST_NODE(&e->hlist); - list_add(&e->list, &ep->free); -} - -/* - * Returns NULL if the entry is free. - */ -static struct entry *epool_find(struct entry_pool *ep, dm_cblock_t cblock) -{ - struct entry *e = ep->entries + from_cblock(cblock); - return !hlist_unhashed(&e->hlist) ? e : NULL; -} - -static bool epool_empty(struct entry_pool *ep) -{ - return list_empty(&ep->free); -} - -static bool in_pool(struct entry_pool *ep, struct entry *e) -{ - return e >= ep->entries && e < ep->entries_end; -} - -static dm_cblock_t infer_cblock(struct entry_pool *ep, struct entry *e) -{ - return to_cblock(e - ep->entries); -} - -/*----------------------------------------------------------------*/ - -struct mq_policy { - struct dm_cache_policy policy; - - /* protects everything */ - struct mutex lock; - dm_cblock_t cache_size; - struct io_tracker tracker; - - /* - * Entries come from two pools, one of pre-cache entries, and one - * for the cache proper. - */ - struct entry_pool pre_cache_pool; - struct entry_pool cache_pool; - - /* - * We maintain three queues of entries. The cache proper, - * consisting of a clean and dirty queue, contains the currently - * active mappings. Whereas the pre_cache tracks blocks that - * are being hit frequently and potential candidates for promotion - * to the cache. - */ - struct queue pre_cache; - struct queue cache_clean; - struct queue cache_dirty; - - /* - * Keeps track of time, incremented by the core. We use this to - * avoid attributing multiple hits within the same tick. - * - * Access to tick_protected should be done with the spin lock held. - * It's copied to tick at the start of the map function (within the - * mutex). - */ - spinlock_t tick_lock; - unsigned tick_protected; - unsigned tick; - - /* - * A count of the number of times the map function has been called - * and found an entry in the pre_cache or cache. Currently used to - * calculate the generation. - */ - unsigned hit_count; - - /* - * A generation is a longish period that is used to trigger some - * book keeping effects. eg, decrementing hit counts on entries. - * This is needed to allow the cache to evolve as io patterns - * change. - */ - unsigned generation; - unsigned generation_period; /* in lookups (will probably change) */ - - unsigned discard_promote_adjustment; - unsigned read_promote_adjustment; - unsigned write_promote_adjustment; - - /* - * The hash table allows us to quickly find an entry by origin - * block. Both pre_cache and cache entries are in here. - */ - unsigned nr_buckets; - dm_block_t hash_bits; - struct hlist_head *table; -}; - -#define DEFAULT_DISCARD_PROMOTE_ADJUSTMENT 1 -#define DEFAULT_READ_PROMOTE_ADJUSTMENT 4 -#define DEFAULT_WRITE_PROMOTE_ADJUSTMENT 8 -#define DISCOURAGE_DEMOTING_DIRTY_THRESHOLD 128 - -/*----------------------------------------------------------------*/ - -/* - * Simple hash table implementation. Should replace with the standard hash - * table that's making its way upstream. 
- */ -static void hash_insert(struct mq_policy *mq, struct entry *e) -{ - unsigned h = hash_64(from_oblock(e->oblock), mq->hash_bits); - - hlist_add_head(&e->hlist, mq->table + h); -} - -static struct entry *hash_lookup(struct mq_policy *mq, dm_oblock_t oblock) -{ - unsigned h = hash_64(from_oblock(oblock), mq->hash_bits); - struct hlist_head *bucket = mq->table + h; - struct entry *e; - - hlist_for_each_entry(e, bucket, hlist) - if (e->oblock == oblock) { - hlist_del(&e->hlist); - hlist_add_head(&e->hlist, bucket); - return e; - } - - return NULL; -} - -static void hash_remove(struct entry *e) -{ - hlist_del(&e->hlist); -} - -/*----------------------------------------------------------------*/ - -static bool any_free_cblocks(struct mq_policy *mq) -{ - return !epool_empty(&mq->cache_pool); -} - -static bool any_clean_cblocks(struct mq_policy *mq) -{ - return !queue_empty(&mq->cache_clean); -} - -/*----------------------------------------------------------------*/ - -/* - * Now we get to the meat of the policy. This section deals with deciding - * when to to add entries to the pre_cache and cache, and move between - * them. - */ - -/* - * The queue level is based on the log2 of the hit count. - */ -static unsigned queue_level(struct entry *e) -{ - return min((unsigned) ilog2(e->hit_count), NR_QUEUE_LEVELS - 1u); -} - -static bool in_cache(struct mq_policy *mq, struct entry *e) -{ - return in_pool(&mq->cache_pool, e); -} - -/* - * Inserts the entry into the pre_cache or the cache. Ensures the cache - * block is marked as allocated if necc. Inserts into the hash table. - * Sets the tick which records when the entry was last moved about. - */ -static void push(struct mq_policy *mq, struct entry *e) -{ - hash_insert(mq, e); - - if (in_cache(mq, e)) - queue_push(e->dirty ? &mq->cache_dirty : &mq->cache_clean, - queue_level(e), &e->list); - else - queue_push(&mq->pre_cache, queue_level(e), &e->list); -} - -/* - * Removes an entry from pre_cache or cache. Removes from the hash table. - */ -static void del(struct mq_policy *mq, struct entry *e) -{ - if (in_cache(mq, e)) - queue_remove(e->dirty ? &mq->cache_dirty : &mq->cache_clean, &e->list); - else - queue_remove(&mq->pre_cache, &e->list); - - hash_remove(e); -} - -/* - * Like del, except it removes the first entry in the queue (ie. the least - * recently used). - */ -static struct entry *pop(struct mq_policy *mq, struct queue *q) -{ - struct entry *e; - struct list_head *h = queue_pop(q); - - if (!h) - return NULL; - - e = container_of(h, struct entry, list); - hash_remove(e); - - return e; -} - -static struct entry *pop_old(struct mq_policy *mq, struct queue *q) -{ - struct entry *e; - struct list_head *h = queue_pop_old(q); - - if (!h) - return NULL; - - e = container_of(h, struct entry, list); - hash_remove(e); - - return e; -} - -static struct entry *peek(struct queue *q) -{ - struct list_head *h = queue_peek(q); - return h ? container_of(h, struct entry, list) : NULL; -} - -/* - * The promotion threshold is adjusted every generation. As are the counts - * of the entries. - * - * At the moment the threshold is taken by averaging the hit counts of some - * of the entries in the cache (the first 20 entries across all levels in - * ascending order, giving preference to the clean entries at each level). - * - * We can be much cleverer than this though. For example, each promotion - * could bump up the threshold helping to prevent churn. Much more to do - * here. 
- */ - -#define MAX_TO_AVERAGE 20 - -static void check_generation(struct mq_policy *mq) -{ - unsigned total = 0, nr = 0, count = 0, level; - struct list_head *head; - struct entry *e; - - if ((mq->hit_count >= mq->generation_period) && (epool_empty(&mq->cache_pool))) { - mq->hit_count = 0; - mq->generation++; - - for (level = 0; level < NR_QUEUE_LEVELS && count < MAX_TO_AVERAGE; level++) { - head = mq->cache_clean.qs + level; - list_for_each_entry(e, head, list) { - nr++; - total += e->hit_count; - - if (++count >= MAX_TO_AVERAGE) - break; - } - - head = mq->cache_dirty.qs + level; - list_for_each_entry(e, head, list) { - nr++; - total += e->hit_count; - - if (++count >= MAX_TO_AVERAGE) - break; - } - } - } -} - -/* - * Whenever we use an entry we bump up it's hit counter, and push it to the - * back to it's current level. - */ -static void requeue(struct mq_policy *mq, struct entry *e) -{ - check_generation(mq); - del(mq, e); - push(mq, e); -} - -/* - * Demote the least recently used entry from the cache to the pre_cache. - * Returns the new cache entry to use, and the old origin block it was - * mapped to. - * - * We drop the hit count on the demoted entry back to 1 to stop it bouncing - * straight back into the cache if it's subsequently hit. There are - * various options here, and more experimentation would be good: - * - * - just forget about the demoted entry completely (ie. don't insert it - into the pre_cache). - * - divide the hit count rather that setting to some hard coded value. - * - set the hit count to a hard coded value other than 1, eg, is it better - * if it goes in at level 2? - */ -static int demote_cblock(struct mq_policy *mq, - struct policy_locker *locker, dm_oblock_t *oblock) -{ - struct entry *demoted = peek(&mq->cache_clean); - - if (!demoted) - /* - * We could get a block from mq->cache_dirty, but that - * would add extra latency to the triggering bio as it - * waits for the writeback. Better to not promote this - * time and hope there's a clean block next time this block - * is hit. - */ - return -ENOSPC; - - if (locker->fn(locker, demoted->oblock)) - /* - * We couldn't lock the demoted block. - */ - return -EBUSY; - - del(mq, demoted); - *oblock = demoted->oblock; - free_entry(&mq->cache_pool, demoted); - - /* - * We used to put the demoted block into the pre-cache, but I think - * it's simpler to just let it work it's way up from zero again. - * Stops blocks flickering in and out of the cache. - */ - - return 0; -} - -/* - * Entries in the pre_cache whose hit count passes the promotion - * threshold move to the cache proper. Working out the correct - * value for the promotion_threshold is crucial to this policy. - */ -static unsigned promote_threshold(struct mq_policy *mq) -{ - struct entry *e; - - if (any_free_cblocks(mq)) - return 0; - - e = peek(&mq->cache_clean); - if (e) - return e->hit_count; - - e = peek(&mq->cache_dirty); - if (e) - return e->hit_count + DISCOURAGE_DEMOTING_DIRTY_THRESHOLD; - - /* This should never happen */ - return 0; -} - -/* - * We modify the basic promotion_threshold depending on the specific io. - * - * If the origin block has been discarded then there's no cost to copy it - * to the cache. - * - * We bias towards reads, since they can be demoted at no cost if they - * haven't been dirtied. 
- */ -static unsigned adjusted_promote_threshold(struct mq_policy *mq, - bool discarded_oblock, int data_dir) -{ - if (data_dir == READ) - return promote_threshold(mq) + mq->read_promote_adjustment; - - if (discarded_oblock && (any_free_cblocks(mq) || any_clean_cblocks(mq))) { - /* - * We don't need to do any copying at all, so give this a - * very low threshold. - */ - return mq->discard_promote_adjustment; - } - - return promote_threshold(mq) + mq->write_promote_adjustment; -} - -static bool should_promote(struct mq_policy *mq, struct entry *e, - bool discarded_oblock, int data_dir) -{ - return e->hit_count >= - adjusted_promote_threshold(mq, discarded_oblock, data_dir); -} - -static int cache_entry_found(struct mq_policy *mq, - struct entry *e, - struct policy_result *result) -{ - requeue(mq, e); - - if (in_cache(mq, e)) { - result->op = POLICY_HIT; - result->cblock = infer_cblock(&mq->cache_pool, e); - } - - return 0; -} - -/* - * Moves an entry from the pre_cache to the cache. The main work is - * finding which cache block to use. - */ -static int pre_cache_to_cache(struct mq_policy *mq, struct entry *e, - struct policy_locker *locker, - struct policy_result *result) -{ - int r; - struct entry *new_e; - - /* Ensure there's a free cblock in the cache */ - if (epool_empty(&mq->cache_pool)) { - result->op = POLICY_REPLACE; - r = demote_cblock(mq, locker, &result->old_oblock); - if (r) { - result->op = POLICY_MISS; - return 0; - } - - } else - result->op = POLICY_NEW; - - new_e = alloc_entry(&mq->cache_pool); - BUG_ON(!new_e); - - new_e->oblock = e->oblock; - new_e->dirty = false; - new_e->hit_count = e->hit_count; - - del(mq, e); - free_entry(&mq->pre_cache_pool, e); - push(mq, new_e); - - result->cblock = infer_cblock(&mq->cache_pool, new_e); - - return 0; -} - -static int pre_cache_entry_found(struct mq_policy *mq, struct entry *e, - bool can_migrate, bool discarded_oblock, - int data_dir, struct policy_locker *locker, - struct policy_result *result) -{ - int r = 0; - - if (!should_promote(mq, e, discarded_oblock, data_dir)) { - requeue(mq, e); - result->op = POLICY_MISS; - - } else if (!can_migrate) - r = -EWOULDBLOCK; - - else { - requeue(mq, e); - r = pre_cache_to_cache(mq, e, locker, result); - } - - return r; -} - -static void insert_in_pre_cache(struct mq_policy *mq, - dm_oblock_t oblock) -{ - struct entry *e = alloc_entry(&mq->pre_cache_pool); - - if (!e) - /* - * There's no spare entry structure, so we grab the least - * used one from the pre_cache. - */ - e = pop(mq, &mq->pre_cache); - - if (unlikely(!e)) { - DMWARN("couldn't pop from pre cache"); - return; - } - - e->dirty = false; - e->oblock = oblock; - e->hit_count = 1; - push(mq, e); -} - -static void insert_in_cache(struct mq_policy *mq, dm_oblock_t oblock, - struct policy_locker *locker, - struct policy_result *result) -{ - int r; - struct entry *e; - - if (epool_empty(&mq->cache_pool)) { - result->op = POLICY_REPLACE; - r = demote_cblock(mq, locker, &result->old_oblock); - if (unlikely(r)) { - result->op = POLICY_MISS; - insert_in_pre_cache(mq, oblock); - return; - } - - /* - * This will always succeed, since we've just demoted. 
- */ - e = alloc_entry(&mq->cache_pool); - BUG_ON(!e); - - } else { - e = alloc_entry(&mq->cache_pool); - result->op = POLICY_NEW; - } - - e->oblock = oblock; - e->dirty = false; - e->hit_count = 1; - push(mq, e); - - result->cblock = infer_cblock(&mq->cache_pool, e); -} - -static int no_entry_found(struct mq_policy *mq, dm_oblock_t oblock, - bool can_migrate, bool discarded_oblock, - int data_dir, struct policy_locker *locker, - struct policy_result *result) -{ - if (adjusted_promote_threshold(mq, discarded_oblock, data_dir) <= 1) { - if (can_migrate) - insert_in_cache(mq, oblock, locker, result); - else - return -EWOULDBLOCK; - } else { - insert_in_pre_cache(mq, oblock); - result->op = POLICY_MISS; - } - - return 0; -} - -/* - * Looks the oblock up in the hash table, then decides whether to put in - * pre_cache, or cache etc. - */ -static int map(struct mq_policy *mq, dm_oblock_t oblock, - bool can_migrate, bool discarded_oblock, - int data_dir, struct policy_locker *locker, - struct policy_result *result) -{ - int r = 0; - struct entry *e = hash_lookup(mq, oblock); - - if (e && in_cache(mq, e)) - r = cache_entry_found(mq, e, result); - - else if (mq->tracker.thresholds[PATTERN_SEQUENTIAL] && - iot_pattern(&mq->tracker) == PATTERN_SEQUENTIAL) - result->op = POLICY_MISS; - - else if (e) - r = pre_cache_entry_found(mq, e, can_migrate, discarded_oblock, - data_dir, locker, result); - - else - r = no_entry_found(mq, oblock, can_migrate, discarded_oblock, - data_dir, locker, result); - - if (r == -EWOULDBLOCK) - result->op = POLICY_MISS; - - return r; -} - -/*----------------------------------------------------------------*/ - -/* - * Public interface, via the policy struct. See dm-cache-policy.h for a - * description of these. - */ - -static struct mq_policy *to_mq_policy(struct dm_cache_policy *p) -{ - return container_of(p, struct mq_policy, policy); -} - -static void mq_destroy(struct dm_cache_policy *p) -{ - struct mq_policy *mq = to_mq_policy(p); - - vfree(mq->table); - epool_exit(&mq->cache_pool); - epool_exit(&mq->pre_cache_pool); - kfree(mq); -} - -static void update_pre_cache_hits(struct list_head *h, void *context) -{ - struct entry *e = container_of(h, struct entry, list); - e->hit_count++; -} - -static void update_cache_hits(struct list_head *h, void *context) -{ - struct mq_policy *mq = context; - struct entry *e = container_of(h, struct entry, list); - e->hit_count++; - mq->hit_count++; -} - -static void copy_tick(struct mq_policy *mq) -{ - unsigned long flags, tick; - - spin_lock_irqsave(&mq->tick_lock, flags); - tick = mq->tick_protected; - if (tick != mq->tick) { - queue_iterate_tick(&mq->pre_cache, update_pre_cache_hits, mq); - queue_iterate_tick(&mq->cache_dirty, update_cache_hits, mq); - queue_iterate_tick(&mq->cache_clean, update_cache_hits, mq); - mq->tick = tick; - } - - queue_tick(&mq->pre_cache); - queue_tick(&mq->cache_dirty); - queue_tick(&mq->cache_clean); - queue_update_writeback_sentinels(&mq->cache_dirty); - spin_unlock_irqrestore(&mq->tick_lock, flags); -} - -static int mq_map(struct dm_cache_policy *p, dm_oblock_t oblock, - bool can_block, bool can_migrate, bool discarded_oblock, - struct bio *bio, struct policy_locker *locker, - struct policy_result *result) -{ - int r; - struct mq_policy *mq = to_mq_policy(p); - - result->op = POLICY_MISS; - - if (can_block) - mutex_lock(&mq->lock); - else if (!mutex_trylock(&mq->lock)) - return -EWOULDBLOCK; - - copy_tick(mq); - - iot_examine_bio(&mq->tracker, bio); - r = map(mq, oblock, can_migrate, discarded_oblock, - 
bio_data_dir(bio), locker, result); - - mutex_unlock(&mq->lock); - - return r; -} - -static int mq_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock) -{ - int r; - struct mq_policy *mq = to_mq_policy(p); - struct entry *e; - - if (!mutex_trylock(&mq->lock)) - return -EWOULDBLOCK; - - e = hash_lookup(mq, oblock); - if (e && in_cache(mq, e)) { - *cblock = infer_cblock(&mq->cache_pool, e); - r = 0; - } else - r = -ENOENT; - - mutex_unlock(&mq->lock); - - return r; -} - -static void __mq_set_clear_dirty(struct mq_policy *mq, dm_oblock_t oblock, bool set) -{ - struct entry *e; - - e = hash_lookup(mq, oblock); - BUG_ON(!e || !in_cache(mq, e)); - - del(mq, e); - e->dirty = set; - push(mq, e); -} - -static void mq_set_dirty(struct dm_cache_policy *p, dm_oblock_t oblock) -{ - struct mq_policy *mq = to_mq_policy(p); - - mutex_lock(&mq->lock); - __mq_set_clear_dirty(mq, oblock, true); - mutex_unlock(&mq->lock); -} - -static void mq_clear_dirty(struct dm_cache_policy *p, dm_oblock_t oblock) -{ - struct mq_policy *mq = to_mq_policy(p); - - mutex_lock(&mq->lock); - __mq_set_clear_dirty(mq, oblock, false); - mutex_unlock(&mq->lock); -} - -static int mq_load_mapping(struct dm_cache_policy *p, - dm_oblock_t oblock, dm_cblock_t cblock, - uint32_t hint, bool hint_valid) -{ - struct mq_policy *mq = to_mq_policy(p); - struct entry *e; - - e = alloc_particular_entry(&mq->cache_pool, cblock); - e->oblock = oblock; - e->dirty = false; /* this gets corrected in a minute */ - e->hit_count = hint_valid ? hint : 1; - push(mq, e); - - return 0; -} - -static int mq_save_hints(struct mq_policy *mq, struct queue *q, - policy_walk_fn fn, void *context) -{ - int r; - unsigned level; - struct list_head *h; - struct entry *e; - - for (level = 0; level < NR_QUEUE_LEVELS; level++) - list_for_each(h, q->qs + level) { - if (is_sentinel(q, h)) - continue; - - e = container_of(h, struct entry, list); - r = fn(context, infer_cblock(&mq->cache_pool, e), - e->oblock, e->hit_count); - if (r) - return r; - } - - return 0; -} - -static int mq_walk_mappings(struct dm_cache_policy *p, policy_walk_fn fn, - void *context) -{ - struct mq_policy *mq = to_mq_policy(p); - int r = 0; - - mutex_lock(&mq->lock); - - r = mq_save_hints(mq, &mq->cache_clean, fn, context); - if (!r) - r = mq_save_hints(mq, &mq->cache_dirty, fn, context); - - mutex_unlock(&mq->lock); - - return r; -} - -static void __remove_mapping(struct mq_policy *mq, dm_oblock_t oblock) -{ - struct entry *e; - - e = hash_lookup(mq, oblock); - BUG_ON(!e || !in_cache(mq, e)); - - del(mq, e); - free_entry(&mq->cache_pool, e); -} - -static void mq_remove_mapping(struct dm_cache_policy *p, dm_oblock_t oblock) -{ - struct mq_policy *mq = to_mq_policy(p); - - mutex_lock(&mq->lock); - __remove_mapping(mq, oblock); - mutex_unlock(&mq->lock); -} - -static int __remove_cblock(struct mq_policy *mq, dm_cblock_t cblock) -{ - struct entry *e = epool_find(&mq->cache_pool, cblock); - - if (!e) - return -ENODATA; - - del(mq, e); - free_entry(&mq->cache_pool, e); - - return 0; -} - -static int mq_remove_cblock(struct dm_cache_policy *p, dm_cblock_t cblock) -{ - int r; - struct mq_policy *mq = to_mq_policy(p); - - mutex_lock(&mq->lock); - r = __remove_cblock(mq, cblock); - mutex_unlock(&mq->lock); - - return r; -} - -#define CLEAN_TARGET_PERCENTAGE 25 - -static bool clean_target_met(struct mq_policy *mq) -{ - /* - * Cache entries may not be populated. So we're cannot rely on the - * size of the clean queue. 
- */ - unsigned nr_clean = from_cblock(mq->cache_size) - queue_size(&mq->cache_dirty); - unsigned target = from_cblock(mq->cache_size) * CLEAN_TARGET_PERCENTAGE / 100; - - return nr_clean >= target; -} - -static int __mq_writeback_work(struct mq_policy *mq, dm_oblock_t *oblock, - dm_cblock_t *cblock) -{ - struct entry *e = pop_old(mq, &mq->cache_dirty); - - if (!e && !clean_target_met(mq)) - e = pop(mq, &mq->cache_dirty); - - if (!e) - return -ENODATA; - - *oblock = e->oblock; - *cblock = infer_cblock(&mq->cache_pool, e); - e->dirty = false; - push(mq, e); - - return 0; -} - -static int mq_writeback_work(struct dm_cache_policy *p, dm_oblock_t *oblock, - dm_cblock_t *cblock, bool critical_only) -{ - int r; - struct mq_policy *mq = to_mq_policy(p); - - mutex_lock(&mq->lock); - r = __mq_writeback_work(mq, oblock, cblock); - mutex_unlock(&mq->lock); - - return r; -} - -static void __force_mapping(struct mq_policy *mq, - dm_oblock_t current_oblock, dm_oblock_t new_oblock) -{ - struct entry *e = hash_lookup(mq, current_oblock); - - if (e && in_cache(mq, e)) { - del(mq, e); - e->oblock = new_oblock; - e->dirty = true; - push(mq, e); - } -} - -static void mq_force_mapping(struct dm_cache_policy *p, - dm_oblock_t current_oblock, dm_oblock_t new_oblock) -{ - struct mq_policy *mq = to_mq_policy(p); - - mutex_lock(&mq->lock); - __force_mapping(mq, current_oblock, new_oblock); - mutex_unlock(&mq->lock); -} - -static dm_cblock_t mq_residency(struct dm_cache_policy *p) -{ - dm_cblock_t r; - struct mq_policy *mq = to_mq_policy(p); - - mutex_lock(&mq->lock); - r = to_cblock(mq->cache_pool.nr_allocated); - mutex_unlock(&mq->lock); - - return r; -} - -static void mq_tick(struct dm_cache_policy *p, bool can_block) -{ - struct mq_policy *mq = to_mq_policy(p); - unsigned long flags; - - spin_lock_irqsave(&mq->tick_lock, flags); - mq->tick_protected++; - spin_unlock_irqrestore(&mq->tick_lock, flags); - - if (can_block) { - mutex_lock(&mq->lock); - copy_tick(mq); - mutex_unlock(&mq->lock); - } -} - -static int mq_set_config_value(struct dm_cache_policy *p, - const char *key, const char *value) -{ - struct mq_policy *mq = to_mq_policy(p); - unsigned long tmp; - - if (kstrtoul(value, 10, &tmp)) - return -EINVAL; - - if (!strcasecmp(key, "random_threshold")) { - mq->tracker.thresholds[PATTERN_RANDOM] = tmp; - - } else if (!strcasecmp(key, "sequential_threshold")) { - mq->tracker.thresholds[PATTERN_SEQUENTIAL] = tmp; - - } else if (!strcasecmp(key, "discard_promote_adjustment")) - mq->discard_promote_adjustment = tmp; - - else if (!strcasecmp(key, "read_promote_adjustment")) - mq->read_promote_adjustment = tmp; - - else if (!strcasecmp(key, "write_promote_adjustment")) - mq->write_promote_adjustment = tmp; - - else - return -EINVAL; - - return 0; -} - -static int mq_emit_config_values(struct dm_cache_policy *p, char *result, - unsigned maxlen, ssize_t *sz_ptr) -{ - ssize_t sz = *sz_ptr; - struct mq_policy *mq = to_mq_policy(p); - - DMEMIT("10 random_threshold %u " - "sequential_threshold %u " - "discard_promote_adjustment %u " - "read_promote_adjustment %u " - "write_promote_adjustment %u ", - mq->tracker.thresholds[PATTERN_RANDOM], - mq->tracker.thresholds[PATTERN_SEQUENTIAL], - mq->discard_promote_adjustment, - mq->read_promote_adjustment, - mq->write_promote_adjustment); - - *sz_ptr = sz; - return 0; -} - -/* Init the policy plugin interface function pointers. 
*/ -static void init_policy_functions(struct mq_policy *mq) -{ - mq->policy.destroy = mq_destroy; - mq->policy.map = mq_map; - mq->policy.lookup = mq_lookup; - mq->policy.set_dirty = mq_set_dirty; - mq->policy.clear_dirty = mq_clear_dirty; - mq->policy.load_mapping = mq_load_mapping; - mq->policy.walk_mappings = mq_walk_mappings; - mq->policy.remove_mapping = mq_remove_mapping; - mq->policy.remove_cblock = mq_remove_cblock; - mq->policy.writeback_work = mq_writeback_work; - mq->policy.force_mapping = mq_force_mapping; - mq->policy.residency = mq_residency; - mq->policy.tick = mq_tick; - mq->policy.emit_config_values = mq_emit_config_values; - mq->policy.set_config_value = mq_set_config_value; -} - -static struct dm_cache_policy *mq_create(dm_cblock_t cache_size, - sector_t origin_size, - sector_t cache_block_size) -{ - struct mq_policy *mq = kzalloc(sizeof(*mq), GFP_KERNEL); - - if (!mq) - return NULL; - - init_policy_functions(mq); - iot_init(&mq->tracker, SEQUENTIAL_THRESHOLD_DEFAULT, RANDOM_THRESHOLD_DEFAULT); - mq->cache_size = cache_size; - - if (epool_init(&mq->pre_cache_pool, from_cblock(cache_size))) { - DMERR("couldn't initialize pool of pre-cache entries"); - goto bad_pre_cache_init; - } - - if (epool_init(&mq->cache_pool, from_cblock(cache_size))) { - DMERR("couldn't initialize pool of cache entries"); - goto bad_cache_init; - } - - mq->tick_protected = 0; - mq->tick = 0; - mq->hit_count = 0; - mq->generation = 0; - mq->discard_promote_adjustment = DEFAULT_DISCARD_PROMOTE_ADJUSTMENT; - mq->read_promote_adjustment = DEFAULT_READ_PROMOTE_ADJUSTMENT; - mq->write_promote_adjustment = DEFAULT_WRITE_PROMOTE_ADJUSTMENT; - mutex_init(&mq->lock); - spin_lock_init(&mq->tick_lock); - - queue_init(&mq->pre_cache); - queue_init(&mq->cache_clean); - queue_init(&mq->cache_dirty); - - mq->generation_period = max((unsigned) from_cblock(cache_size), 1024U); - - mq->nr_buckets = next_power(from_cblock(cache_size) / 2, 16); - mq->hash_bits = __ffs(mq->nr_buckets); - mq->table = vzalloc(sizeof(*mq->table) * mq->nr_buckets); - if (!mq->table) - goto bad_alloc_table; - - return &mq->policy; - -bad_alloc_table: - epool_exit(&mq->cache_pool); -bad_cache_init: - epool_exit(&mq->pre_cache_pool); -bad_pre_cache_init: - kfree(mq); - - return NULL; -} - -/*----------------------------------------------------------------*/ - -static struct dm_cache_policy_type mq_policy_type = { - .name = "mq", - .version = {1, 4, 0}, - .hint_size = 4, - .owner = THIS_MODULE, - .create = mq_create -}; - -static int __init mq_init(void) -{ - int r; - - mq_entry_cache = kmem_cache_create("dm_mq_policy_cache_entry", - sizeof(struct entry), - __alignof__(struct entry), - 0, NULL); - if (!mq_entry_cache) - return -ENOMEM; - - r = dm_cache_policy_register(&mq_policy_type); - if (r) { - DMERR("register failed %d", r); - kmem_cache_destroy(mq_entry_cache); - return -ENOMEM; - } - - return 0; -} - -static void __exit mq_exit(void) -{ - dm_cache_policy_unregister(&mq_policy_type); - - kmem_cache_destroy(mq_entry_cache); -} - -module_init(mq_init); -module_exit(mq_exit); - -MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>"); -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("mq cache policy"); diff --git a/drivers/md/dm-cache-policy-smq.c b/drivers/md/dm-cache-policy-smq.c index 28d4586748d0..cf48a617a3a4 100644 --- a/drivers/md/dm-cache-policy-smq.c +++ b/drivers/md/dm-cache-policy-smq.c @@ -1567,8 +1567,48 @@ static void smq_tick(struct dm_cache_policy *p, bool can_block) spin_unlock_irqrestore(&mq->lock, flags); } +/* + * smq has no 
config values, but the old mq policy did. To avoid breaking + * software we continue to accept these configurables for the mq policy, + * but they have no effect. + */ +static int mq_set_config_value(struct dm_cache_policy *p, + const char *key, const char *value) +{ + unsigned long tmp; + + if (kstrtoul(value, 10, &tmp)) + return -EINVAL; + + if (!strcasecmp(key, "random_threshold") || + !strcasecmp(key, "sequential_threshold") || + !strcasecmp(key, "discard_promote_adjustment") || + !strcasecmp(key, "read_promote_adjustment") || + !strcasecmp(key, "write_promote_adjustment")) { + DMWARN("tunable '%s' no longer has any effect, mq policy is now an alias for smq", key); + return 0; + } + + return -EINVAL; +} + +static int mq_emit_config_values(struct dm_cache_policy *p, char *result, + unsigned maxlen, ssize_t *sz_ptr) +{ + ssize_t sz = *sz_ptr; + + DMEMIT("10 random_threshold 0 " + "sequential_threshold 0 " + "discard_promote_adjustment 0 " + "read_promote_adjustment 0 " + "write_promote_adjustment 0 "); + + *sz_ptr = sz; + return 0; +} + /* Init the policy plugin interface function pointers. */ -static void init_policy_functions(struct smq_policy *mq) +static void init_policy_functions(struct smq_policy *mq, bool mimic_mq) { mq->policy.destroy = smq_destroy; mq->policy.map = smq_map; @@ -1583,6 +1623,11 @@ static void init_policy_functions(struct smq_policy *mq) mq->policy.force_mapping = smq_force_mapping; mq->policy.residency = smq_residency; mq->policy.tick = smq_tick; + + if (mimic_mq) { + mq->policy.set_config_value = mq_set_config_value; + mq->policy.emit_config_values = mq_emit_config_values; + } } static bool too_many_hotspot_blocks(sector_t origin_size, @@ -1606,9 +1651,10 @@ static void calc_hotspot_params(sector_t origin_size, *hotspot_block_size /= 2u; } -static struct dm_cache_policy *smq_create(dm_cblock_t cache_size, - sector_t origin_size, - sector_t cache_block_size) +static struct dm_cache_policy *__smq_create(dm_cblock_t cache_size, + sector_t origin_size, + sector_t cache_block_size, + bool mimic_mq) { unsigned i; unsigned nr_sentinels_per_queue = 2u * NR_CACHE_LEVELS; @@ -1618,7 +1664,7 @@ static struct dm_cache_policy *smq_create(dm_cblock_t cache_size, if (!mq) return NULL; - init_policy_functions(mq); + init_policy_functions(mq, mimic_mq); mq->cache_size = cache_size; mq->cache_block_size = cache_block_size; @@ -1706,19 +1752,41 @@ bad_pool_init: return NULL; } +static struct dm_cache_policy *smq_create(dm_cblock_t cache_size, + sector_t origin_size, + sector_t cache_block_size) +{ + return __smq_create(cache_size, origin_size, cache_block_size, false); +} + +static struct dm_cache_policy *mq_create(dm_cblock_t cache_size, + sector_t origin_size, + sector_t cache_block_size) +{ + return __smq_create(cache_size, origin_size, cache_block_size, true); +} + /*----------------------------------------------------------------*/ static struct dm_cache_policy_type smq_policy_type = { .name = "smq", - .version = {1, 0, 0}, + .version = {1, 5, 0}, .hint_size = 4, .owner = THIS_MODULE, .create = smq_create }; +static struct dm_cache_policy_type mq_policy_type = { + .name = "mq", + .version = {1, 5, 0}, + .hint_size = 4, + .owner = THIS_MODULE, + .create = mq_create, +}; + static struct dm_cache_policy_type default_policy_type = { .name = "default", - .version = {1, 4, 0}, + .version = {1, 5, 0}, .hint_size = 4, .owner = THIS_MODULE, .create = smq_create, @@ -1735,9 +1803,17 @@ static int __init smq_init(void) return -ENOMEM; } + r = dm_cache_policy_register(&mq_policy_type); + 
if (r) { + DMERR("register failed (as mq) %d", r); + dm_cache_policy_unregister(&smq_policy_type); + return -ENOMEM; + } + r = dm_cache_policy_register(&default_policy_type); if (r) { DMERR("register failed (as default) %d", r); + dm_cache_policy_unregister(&mq_policy_type); dm_cache_policy_unregister(&smq_policy_type); return -ENOMEM; } @@ -1748,6 +1824,7 @@ static int __init smq_init(void) static void __exit smq_exit(void) { dm_cache_policy_unregister(&smq_policy_type); + dm_cache_policy_unregister(&mq_policy_type); dm_cache_policy_unregister(&default_policy_type); } @@ -1759,3 +1836,4 @@ MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("smq cache policy"); MODULE_ALIAS("dm-cache-default"); +MODULE_ALIAS("dm-cache-mq"); diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 5780accffa30..ee0510f9a85e 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -984,9 +984,14 @@ static void notify_mode_switch(struct cache *cache, enum cache_metadata_mode mod static void set_cache_mode(struct cache *cache, enum cache_metadata_mode new_mode) { - bool needs_check = dm_cache_metadata_needs_check(cache->cmd); + bool needs_check; enum cache_metadata_mode old_mode = get_cache_mode(cache); + if (dm_cache_metadata_needs_check(cache->cmd, &needs_check)) { + DMERR("unable to read needs_check flag, setting failure mode"); + new_mode = CM_FAIL; + } + if (new_mode == CM_WRITE && needs_check) { DMERR("%s: unable to switch cache to write mode until repaired.", cache_device_name(cache)); @@ -2771,7 +2776,7 @@ static int cache_create(struct cache_args *ca, struct cache **result) ti->split_discard_bios = false; cache->features = ca->features; - ti->per_bio_data_size = get_per_bio_data_size(cache); + ti->per_io_data_size = get_per_bio_data_size(cache); cache->callbacks.congested_fn = cache_is_congested; dm_table_add_target_callbacks(ti->table, &cache->callbacks); @@ -3510,6 +3515,7 @@ static void cache_status(struct dm_target *ti, status_type_t type, char buf[BDEVNAME_SIZE]; struct cache *cache = ti->private; dm_cblock_t residency; + bool needs_check; switch (type) { case STATUSTYPE_INFO: @@ -3583,7 +3589,9 @@ static void cache_status(struct dm_target *ti, status_type_t type, else DMEMIT("rw "); - if (dm_cache_metadata_needs_check(cache->cmd)) + r = dm_cache_metadata_needs_check(cache->cmd, &needs_check); + + if (r || needs_check) DMEMIT("needs_check "); else DMEMIT("- "); @@ -3806,7 +3814,7 @@ static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits) static struct target_type cache_target = { .name = "cache", - .version = {1, 8, 0}, + .version = {1, 9, 0}, .module = THIS_MODULE, .ctr = cache_ctr, .dtr = cache_dtr, diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 3147c8d09ea8..5c934b670154 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -1788,7 +1788,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto bad; } - cc->per_bio_data_size = ti->per_bio_data_size = + cc->per_bio_data_size = ti->per_io_data_size = ALIGN(sizeof(struct dm_crypt_io) + cc->dmreq_start + sizeof(struct dm_crypt_request) + iv_size_padding + cc->iv_size, ARCH_KMALLOC_MINALIGN); diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c index b4c356a21123..cc70871a6d29 100644 --- a/drivers/md/dm-delay.c +++ b/drivers/md/dm-delay.c @@ -204,7 +204,7 @@ out: ti->num_flush_bios = 1; ti->num_discard_bios = 1; - ti->per_bio_data_size = sizeof(struct dm_delay_info); + ti->per_io_data_size = sizeof(struct dm_delay_info); 
ti->private = dc; return 0; diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c index 09e2afcafd2d..b7341de87015 100644 --- a/drivers/md/dm-flakey.c +++ b/drivers/md/dm-flakey.c @@ -220,7 +220,7 @@ static int flakey_ctr(struct dm_target *ti, unsigned int argc, char **argv) ti->num_flush_bios = 1; ti->num_discard_bios = 1; - ti->per_bio_data_size = sizeof(struct per_bio_data); + ti->per_io_data_size = sizeof(struct per_bio_data); ti->private = fc; return 0; diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index 80a439543259..2adf81d81fca 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -1291,7 +1291,8 @@ static int table_load(struct dm_ioctl *param, size_t param_size) immutable_target_type = dm_get_immutable_target_type(md); if (immutable_target_type && - (immutable_target_type != dm_table_get_immutable_target_type(t))) { + (immutable_target_type != dm_table_get_immutable_target_type(t)) && + !dm_table_get_wildcard_target(t)) { DMWARN("can't replace immutable target type %s", immutable_target_type->name); r = -EINVAL; @@ -1303,7 +1304,7 @@ static int table_load(struct dm_ioctl *param, size_t param_size) dm_set_md_type(md, dm_table_get_type(t)); /* setup md->queue to reflect md's type (may block) */ - r = dm_setup_md_queue(md); + r = dm_setup_md_queue(md, t); if (r) { DMWARN("unable to set up device queue for new table."); goto err_unlock_md_type; diff --git a/drivers/md/dm-log-writes.c b/drivers/md/dm-log-writes.c index 624589d51c2c..608302e222af 100644 --- a/drivers/md/dm-log-writes.c +++ b/drivers/md/dm-log-writes.c @@ -475,7 +475,7 @@ static int log_writes_ctr(struct dm_target *ti, unsigned int argc, char **argv) ti->flush_supported = true; ti->num_discard_bios = 1; ti->discards_supported = true; - ti->per_bio_data_size = sizeof(struct per_bio_data); + ti->per_io_data_size = sizeof(struct per_bio_data); ti->private = lc; return 0; diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index cfa29f574c2a..677ba223e2ae 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -23,6 +23,7 @@ #include <linux/delay.h> #include <scsi/scsi_dh.h> #include <linux/atomic.h> +#include <linux/blk-mq.h> #define DM_MSG_PREFIX "multipath" #define DM_PG_INIT_DELAY_MSECS 2000 @@ -33,11 +34,12 @@ struct pgpath { struct list_head list; struct priority_group *pg; /* Owning PG */ - unsigned is_active; /* Path status */ unsigned fail_count; /* Cumulative failure count */ struct dm_path path; struct delayed_work activate_path; + + bool is_active:1; /* Path status */ }; #define path_to_pgpath(__pgp) container_of((__pgp), struct pgpath, path) @@ -53,10 +55,10 @@ struct priority_group { struct path_selector ps; unsigned pg_num; /* Reference number */ - unsigned bypassed; /* Temporarily bypass this PG? */ - unsigned nr_pgpaths; /* Number of paths in PG */ struct list_head pgpaths; + + bool bypassed:1; /* Temporarily bypass this PG? */ }; /* Multipath context */ @@ -74,21 +76,20 @@ struct multipath { wait_queue_head_t pg_init_wait; /* Wait for pg_init completion */ - unsigned pg_init_required; /* pg_init needs calling? */ unsigned pg_init_in_progress; /* Only one pg_init allowed at once */ - unsigned pg_init_delay_retry; /* Delay pg_init retry? */ unsigned nr_valid_paths; /* Total number of usable paths */ struct pgpath *current_pgpath; struct priority_group *current_pg; struct priority_group *next_pg; /* Switch to this PG if set */ - unsigned repeat_count; /* I/Os left before calling PS again */ - unsigned queue_io:1; /* Must we queue all I/O? 
*/ - unsigned queue_if_no_path:1; /* Queue I/O if last path fails? */ - unsigned saved_queue_if_no_path:1; /* Saved state during suspension */ - unsigned retain_attached_hw_handler:1; /* If there's already a hw_handler present, don't change it. */ - unsigned pg_init_disabled:1; /* pg_init is not currently allowed */ + bool queue_io:1; /* Must we queue all I/O? */ + bool queue_if_no_path:1; /* Queue I/O if last path fails? */ + bool saved_queue_if_no_path:1; /* Saved state during suspension */ + bool retain_attached_hw_handler:1; /* If there's already a hw_handler present, don't change it. */ + bool pg_init_disabled:1; /* pg_init is not currently allowed */ + bool pg_init_required:1; /* pg_init needs calling? */ + bool pg_init_delay_retry:1; /* Delay pg_init retry? */ unsigned pg_init_retries; /* Number of times to retry pg_init */ unsigned pg_init_count; /* Number of times pg_init called */ @@ -120,7 +121,6 @@ static struct kmem_cache *_mpio_cache; static struct workqueue_struct *kmultipathd, *kmpath_handlerd; static void trigger_event(struct work_struct *work); static void activate_path(struct work_struct *work); -static int __pgpath_busy(struct pgpath *pgpath); /*----------------------------------------------- @@ -132,7 +132,7 @@ static struct pgpath *alloc_pgpath(void) struct pgpath *pgpath = kzalloc(sizeof(*pgpath), GFP_KERNEL); if (pgpath) { - pgpath->is_active = 1; + pgpath->is_active = true; INIT_DELAYED_WORK(&pgpath->activate_path, activate_path); } @@ -181,25 +181,31 @@ static void free_priority_group(struct priority_group *pg, kfree(pg); } -static struct multipath *alloc_multipath(struct dm_target *ti) +static struct multipath *alloc_multipath(struct dm_target *ti, bool use_blk_mq) { struct multipath *m; - unsigned min_ios = dm_get_reserved_rq_based_ios(); m = kzalloc(sizeof(*m), GFP_KERNEL); if (m) { INIT_LIST_HEAD(&m->priority_groups); spin_lock_init(&m->lock); - m->queue_io = 1; + m->queue_io = true; m->pg_init_delay_msecs = DM_PG_INIT_DELAY_DEFAULT; INIT_WORK(&m->trigger_event, trigger_event); init_waitqueue_head(&m->pg_init_wait); mutex_init(&m->work_mutex); - m->mpio_pool = mempool_create_slab_pool(min_ios, _mpio_cache); - if (!m->mpio_pool) { - kfree(m); - return NULL; + + m->mpio_pool = NULL; + if (!use_blk_mq) { + unsigned min_ios = dm_get_reserved_rq_based_ios(); + + m->mpio_pool = mempool_create_slab_pool(min_ios, _mpio_cache); + if (!m->mpio_pool) { + kfree(m); + return NULL; + } } + m->ti = ti; ti->private = m; } @@ -222,26 +228,41 @@ static void free_multipath(struct multipath *m) kfree(m); } -static int set_mapinfo(struct multipath *m, union map_info *info) +static struct dm_mpath_io *get_mpio(union map_info *info) +{ + return info->ptr; +} + +static struct dm_mpath_io *set_mpio(struct multipath *m, union map_info *info) { struct dm_mpath_io *mpio; + if (!m->mpio_pool) { + /* Use blk-mq pdu memory requested via per_io_data_size */ + mpio = get_mpio(info); + memset(mpio, 0, sizeof(*mpio)); + return mpio; + } + mpio = mempool_alloc(m->mpio_pool, GFP_ATOMIC); if (!mpio) - return -ENOMEM; + return NULL; memset(mpio, 0, sizeof(*mpio)); info->ptr = mpio; - return 0; + return mpio; } -static void clear_mapinfo(struct multipath *m, union map_info *info) +static void clear_request_fn_mpio(struct multipath *m, union map_info *info) { - struct dm_mpath_io *mpio = info->ptr; + /* Only needed for non blk-mq (.request_fn) multipath */ + if (m->mpio_pool) { + struct dm_mpath_io *mpio = info->ptr; - info->ptr = NULL; - mempool_free(mpio, m->mpio_pool); + info->ptr = NULL; + 
mempool_free(mpio, m->mpio_pool); + } } /*----------------------------------------------- @@ -257,7 +278,7 @@ static int __pg_init_all_paths(struct multipath *m) return 0; m->pg_init_count++; - m->pg_init_required = 0; + m->pg_init_required = false; /* Check here to reset pg_init_required */ if (!m->current_pg) @@ -283,11 +304,11 @@ static void __switch_pg(struct multipath *m, struct pgpath *pgpath) /* Must we initialise the PG first, and queue I/O till it's ready? */ if (m->hw_handler_name) { - m->pg_init_required = 1; - m->queue_io = 1; + m->pg_init_required = true; + m->queue_io = true; } else { - m->pg_init_required = 0; - m->queue_io = 0; + m->pg_init_required = false; + m->queue_io = false; } m->pg_init_count = 0; @@ -298,7 +319,7 @@ static int __choose_path_in_pg(struct multipath *m, struct priority_group *pg, { struct dm_path *path; - path = pg->ps.type->select_path(&pg->ps, &m->repeat_count, nr_bytes); + path = pg->ps.type->select_path(&pg->ps, nr_bytes); if (!path) return -ENXIO; @@ -313,10 +334,10 @@ static int __choose_path_in_pg(struct multipath *m, struct priority_group *pg, static void __choose_pgpath(struct multipath *m, size_t nr_bytes) { struct priority_group *pg; - unsigned bypassed = 1; + bool bypassed = true; if (!m->nr_valid_paths) { - m->queue_io = 0; + m->queue_io = false; goto failed; } @@ -344,7 +365,7 @@ static void __choose_pgpath(struct multipath *m, size_t nr_bytes) continue; if (!__choose_path_in_pg(m, pg, nr_bytes)) { if (!bypassed) - m->pg_init_delay_retry = 1; + m->pg_init_delay_retry = true; return; } } @@ -380,7 +401,7 @@ static int __multipath_map(struct dm_target *ti, struct request *clone, union map_info *map_context, struct request *rq, struct request **__clone) { - struct multipath *m = (struct multipath *) ti->private; + struct multipath *m = ti->private; int r = DM_MAPIO_REQUEUE; size_t nr_bytes = clone ? blk_rq_bytes(clone) : blk_rq_bytes(rq); struct pgpath *pgpath; @@ -390,8 +411,7 @@ static int __multipath_map(struct dm_target *ti, struct request *clone, spin_lock_irq(&m->lock); /* Do we need to select a new pgpath? */ - if (!m->current_pgpath || - (!m->queue_io && (m->repeat_count && --m->repeat_count == 0))) + if (!m->current_pgpath || !m->queue_io) __choose_pgpath(m, nr_bytes); pgpath = m->current_pgpath; @@ -405,11 +425,11 @@ static int __multipath_map(struct dm_target *ti, struct request *clone, goto out_unlock; } - if (set_mapinfo(m, map_context) < 0) + mpio = set_mpio(m, map_context); + if (!mpio) /* ENOMEM, requeue */ goto out_unlock; - mpio = map_context->ptr; mpio->pgpath = pgpath; mpio->nr_bytes = nr_bytes; @@ -418,17 +438,24 @@ static int __multipath_map(struct dm_target *ti, struct request *clone, spin_unlock_irq(&m->lock); if (clone) { - /* Old request-based interface: allocated clone is passed in */ + /* + * Old request-based interface: allocated clone is passed in. + * Used by: .request_fn stacked on .request_fn path(s). + */ clone->q = bdev_get_queue(bdev); clone->rq_disk = bdev->bd_disk; clone->cmd_flags |= REQ_FAILFAST_TRANSPORT; } else { - /* blk-mq request-based interface */ - *__clone = blk_get_request(bdev_get_queue(bdev), - rq_data_dir(rq), GFP_ATOMIC); + /* + * blk-mq request-based interface; used by both: + * .request_fn stacked on blk-mq path(s) and + * blk-mq stacked on blk-mq path(s). 
+ */ + *__clone = blk_mq_alloc_request(bdev_get_queue(bdev), + rq_data_dir(rq), BLK_MQ_REQ_NOWAIT); if (IS_ERR(*__clone)) { /* ENOMEM, requeue */ - clear_mapinfo(m, map_context); + clear_request_fn_mpio(m, map_context); return r; } (*__clone)->bio = (*__clone)->biotail = NULL; @@ -463,14 +490,14 @@ static int multipath_clone_and_map(struct dm_target *ti, struct request *rq, static void multipath_release_clone(struct request *clone) { - blk_put_request(clone); + blk_mq_free_request(clone); } /* * If we run out of usable paths, should we queue I/O or error it? */ -static int queue_if_no_path(struct multipath *m, unsigned queue_if_no_path, - unsigned save_old_value) +static int queue_if_no_path(struct multipath *m, bool queue_if_no_path, + bool save_old_value) { unsigned long flags; @@ -776,12 +803,12 @@ static int parse_features(struct dm_arg_set *as, struct multipath *m) argc--; if (!strcasecmp(arg_name, "queue_if_no_path")) { - r = queue_if_no_path(m, 1, 0); + r = queue_if_no_path(m, true, false); continue; } if (!strcasecmp(arg_name, "retain_attached_hw_handler")) { - m->retain_attached_hw_handler = 1; + m->retain_attached_hw_handler = true; continue; } @@ -820,11 +847,12 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc, struct dm_arg_set as; unsigned pg_count = 0; unsigned next_pg_num; + bool use_blk_mq = dm_use_blk_mq(dm_table_get_md(ti->table)); as.argc = argc; as.argv = argv; - m = alloc_multipath(ti); + m = alloc_multipath(ti, use_blk_mq); if (!m) { ti->error = "can't allocate multipath"; return -EINVAL; @@ -880,6 +908,8 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc, ti->num_flush_bios = 1; ti->num_discard_bios = 1; ti->num_write_same_bios = 1; + if (use_blk_mq) + ti->per_io_data_size = sizeof(struct dm_mpath_io); return 0; @@ -917,7 +947,7 @@ static void flush_multipath_work(struct multipath *m) unsigned long flags; spin_lock_irqsave(&m->lock, flags); - m->pg_init_disabled = 1; + m->pg_init_disabled = true; spin_unlock_irqrestore(&m->lock, flags); flush_workqueue(kmpath_handlerd); @@ -926,7 +956,7 @@ static void flush_multipath_work(struct multipath *m) flush_work(&m->trigger_event); spin_lock_irqsave(&m->lock, flags); - m->pg_init_disabled = 0; + m->pg_init_disabled = false; spin_unlock_irqrestore(&m->lock, flags); } @@ -954,7 +984,7 @@ static int fail_path(struct pgpath *pgpath) DMWARN("Failing path %s.", pgpath->path.dev->name); pgpath->pg->ps.type->fail_path(&pgpath->pg->ps, &pgpath->path); - pgpath->is_active = 0; + pgpath->is_active = false; pgpath->fail_count++; m->nr_valid_paths--; @@ -987,18 +1017,13 @@ static int reinstate_path(struct pgpath *pgpath) if (pgpath->is_active) goto out; - if (!pgpath->pg->ps.type->reinstate_path) { - DMWARN("Reinstate path not supported by path selector %s", - pgpath->pg->ps.type->name); - r = -EINVAL; - goto out; - } + DMWARN("Reinstating path %s.", pgpath->path.dev->name); r = pgpath->pg->ps.type->reinstate_path(&pgpath->pg->ps, &pgpath->path); if (r) goto out; - pgpath->is_active = 1; + pgpath->is_active = true; if (!m->nr_valid_paths++) { m->current_pgpath = NULL; @@ -1045,7 +1070,7 @@ static int action_dev(struct multipath *m, struct dm_dev *dev, * Temporarily try to avoid having to use the specified PG */ static void bypass_pg(struct multipath *m, struct priority_group *pg, - int bypassed) + bool bypassed) { unsigned long flags; @@ -1078,7 +1103,7 @@ static int switch_pg_num(struct multipath *m, const char *pgstr) spin_lock_irqsave(&m->lock, flags); list_for_each_entry(pg, &m->priority_groups, 
list) { - pg->bypassed = 0; + pg->bypassed = false; if (--pgnum) continue; @@ -1096,7 +1121,7 @@ static int switch_pg_num(struct multipath *m, const char *pgstr) * Set/clear bypassed status of a PG. * PGs are numbered upwards from 1 in the order they were declared. */ -static int bypass_pg_num(struct multipath *m, const char *pgstr, int bypassed) +static int bypass_pg_num(struct multipath *m, const char *pgstr, bool bypassed) { struct priority_group *pg; unsigned pgnum; @@ -1120,17 +1145,17 @@ static int bypass_pg_num(struct multipath *m, const char *pgstr, int bypassed) /* * Should we retry pg_init immediately? */ -static int pg_init_limit_reached(struct multipath *m, struct pgpath *pgpath) +static bool pg_init_limit_reached(struct multipath *m, struct pgpath *pgpath) { unsigned long flags; - int limit_reached = 0; + bool limit_reached = false; spin_lock_irqsave(&m->lock, flags); if (m->pg_init_count <= m->pg_init_retries && !m->pg_init_disabled) - m->pg_init_required = 1; + m->pg_init_required = true; else - limit_reached = 1; + limit_reached = true; spin_unlock_irqrestore(&m->lock, flags); @@ -1143,7 +1168,7 @@ static void pg_init_done(void *data, int errors) struct priority_group *pg = pgpath->pg; struct multipath *m = pg->m; unsigned long flags; - unsigned delay_retry = 0; + bool delay_retry = false; /* device or driver problems */ switch (errors) { @@ -1166,7 +1191,7 @@ static void pg_init_done(void *data, int errors) * Probably doing something like FW upgrade on the * controller so try the other pg. */ - bypass_pg(m, pg, 1); + bypass_pg(m, pg, true); break; case SCSI_DH_RETRY: /* Wait before retrying. */ @@ -1177,6 +1202,7 @@ static void pg_init_done(void *data, int errors) fail_path(pgpath); errors = 0; break; + case SCSI_DH_DEV_OFFLINED: default: /* * We probably do not want to fail the path for a device @@ -1194,7 +1220,7 @@ static void pg_init_done(void *data, int errors) m->current_pg = NULL; } } else if (!m->pg_init_required) - pg->bypassed = 0; + pg->bypassed = false; if (--m->pg_init_in_progress) /* Activations of other paths are still on going */ @@ -1205,7 +1231,7 @@ static void pg_init_done(void *data, int errors) if (__pg_init_all_paths(m)) goto out; } - m->queue_io = 0; + m->queue_io = false; /* * Wake up any thread waiting to suspend. 
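
A recurring pattern in the multipath changes above: when the mapped device uses blk-mq, the per-request struct dm_mpath_io no longer comes from a private mempool but from the request pdu that DM core sizes via ti->per_io_data_size (set in multipath_ctr() only for the blk-mq case), so m->mpio_pool stays NULL and set_mpio()/clear_request_fn_mpio() effectively become no-ops on that path. Roughly, the allocation side collapses to the following sketch (a restatement for illustration, not the literal patch code):

static struct dm_mpath_io *multipath_get_io(struct multipath *m,
					    union map_info *info)
{
	struct dm_mpath_io *mpio;

	if (!m->mpio_pool) {
		/*
		 * blk-mq: DM core already allocated our per-I/O data as
		 * part of the request pdu and pointed info->ptr at it.
		 */
		mpio = info->ptr;
	} else {
		/* .request_fn: fall back to the per-target mempool */
		mpio = mempool_alloc(m->mpio_pool, GFP_ATOMIC);
		if (!mpio)
			return NULL;
		info->ptr = mpio;
	}

	memset(mpio, 0, sizeof(*mpio));
	return mpio;
}

On completion only the mempool-backed case needs an explicit free (clear_request_fn_mpio()); pdu memory goes away with the request itself.
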
@@ -1291,21 +1317,21 @@ static int multipath_end_io(struct dm_target *ti, struct request *clone, int error, union map_info *map_context) { struct multipath *m = ti->private; - struct dm_mpath_io *mpio = map_context->ptr; + struct dm_mpath_io *mpio = get_mpio(map_context); struct pgpath *pgpath; struct path_selector *ps; int r; BUG_ON(!mpio); - r = do_end_io(m, clone, error, mpio); + r = do_end_io(m, clone, error, mpio); pgpath = mpio->pgpath; if (pgpath) { ps = &pgpath->pg->ps; if (ps->type->end_io) ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes); } - clear_mapinfo(m, map_context); + clear_request_fn_mpio(m, map_context); return r; } @@ -1318,9 +1344,9 @@ static int multipath_end_io(struct dm_target *ti, struct request *clone, */ static void multipath_presuspend(struct dm_target *ti) { - struct multipath *m = (struct multipath *) ti->private; + struct multipath *m = ti->private; - queue_if_no_path(m, 0, 1); + queue_if_no_path(m, false, true); } static void multipath_postsuspend(struct dm_target *ti) @@ -1337,7 +1363,7 @@ static void multipath_postsuspend(struct dm_target *ti) */ static void multipath_resume(struct dm_target *ti) { - struct multipath *m = (struct multipath *) ti->private; + struct multipath *m = ti->private; unsigned long flags; spin_lock_irqsave(&m->lock, flags); @@ -1366,7 +1392,7 @@ static void multipath_status(struct dm_target *ti, status_type_t type, { int sz = 0; unsigned long flags; - struct multipath *m = (struct multipath *) ti->private; + struct multipath *m = ti->private; struct priority_group *pg; struct pgpath *p; unsigned pg_num; @@ -1474,7 +1500,7 @@ static int multipath_message(struct dm_target *ti, unsigned argc, char **argv) { int r = -EINVAL; struct dm_dev *dev; - struct multipath *m = (struct multipath *) ti->private; + struct multipath *m = ti->private; action_fn action; mutex_lock(&m->work_mutex); @@ -1486,10 +1512,10 @@ static int multipath_message(struct dm_target *ti, unsigned argc, char **argv) if (argc == 1) { if (!strcasecmp(argv[0], "queue_if_no_path")) { - r = queue_if_no_path(m, 1, 0); + r = queue_if_no_path(m, true, false); goto out; } else if (!strcasecmp(argv[0], "fail_if_no_path")) { - r = queue_if_no_path(m, 0, 0); + r = queue_if_no_path(m, false, false); goto out; } } @@ -1500,10 +1526,10 @@ static int multipath_message(struct dm_target *ti, unsigned argc, char **argv) } if (!strcasecmp(argv[0], "disable_group")) { - r = bypass_pg_num(m, argv[1], 1); + r = bypass_pg_num(m, argv[1], true); goto out; } else if (!strcasecmp(argv[0], "enable_group")) { - r = bypass_pg_num(m, argv[1], 0); + r = bypass_pg_num(m, argv[1], false); goto out; } else if (!strcasecmp(argv[0], "switch_group")) { r = switch_pg_num(m, argv[1]); @@ -1604,7 +1630,7 @@ out: return ret; } -static int __pgpath_busy(struct pgpath *pgpath) +static int pgpath_busy(struct pgpath *pgpath) { struct request_queue *q = bdev_get_queue(pgpath->path.dev->bdev); @@ -1621,7 +1647,7 @@ static int __pgpath_busy(struct pgpath *pgpath) */ static int multipath_busy(struct dm_target *ti) { - int busy = 0, has_active = 0; + bool busy = false, has_active = false; struct multipath *m = ti->private; struct priority_group *pg; struct pgpath *pgpath; @@ -1632,7 +1658,7 @@ static int multipath_busy(struct dm_target *ti) /* pg_init in progress or no paths available */ if (m->pg_init_in_progress || (!m->nr_valid_paths && m->queue_if_no_path)) { - busy = 1; + busy = true; goto out; } /* Guess which priority_group will be used at next mapping time */ @@ -1654,13 +1680,12 @@ static int 
multipath_busy(struct dm_target *ti) * If there is one non-busy active path at least, the path selector * will be able to select it. So we consider such a pg as not busy. */ - busy = 1; + busy = true; list_for_each_entry(pgpath, &pg->pgpaths, list) if (pgpath->is_active) { - has_active = 1; - - if (!__pgpath_busy(pgpath)) { - busy = 0; + has_active = true; + if (!pgpath_busy(pgpath)) { + busy = false; break; } } @@ -1671,7 +1696,7 @@ static int multipath_busy(struct dm_target *ti) * the current_pg will be changed at next mapping time. * We need to try mapping to determine it. */ - busy = 0; + busy = false; out: spin_unlock_irqrestore(&m->lock, flags); @@ -1684,7 +1709,8 @@ out: *---------------------------------------------------------------*/ static struct target_type multipath_target = { .name = "multipath", - .version = {1, 10, 0}, + .version = {1, 11, 0}, + .features = DM_TARGET_SINGLETON | DM_TARGET_IMMUTABLE, .module = THIS_MODULE, .ctr = multipath_ctr, .dtr = multipath_dtr, diff --git a/drivers/md/dm-path-selector.h b/drivers/md/dm-path-selector.h index e7d1fa8b0459..b6eb5365b1a4 100644 --- a/drivers/md/dm-path-selector.h +++ b/drivers/md/dm-path-selector.h @@ -50,13 +50,8 @@ struct path_selector_type { /* * Chooses a path for this io, if no paths are available then * NULL will be returned. - * - * repeat_count is the number of times to use the path before - * calling the function again. 0 means don't call it again unless - * the path fails. */ struct dm_path *(*select_path) (struct path_selector *ps, - unsigned *repeat_count, size_t nr_bytes); /* diff --git a/drivers/md/dm-queue-length.c b/drivers/md/dm-queue-length.c index 3941fae0de9f..23f178641794 100644 --- a/drivers/md/dm-queue-length.c +++ b/drivers/md/dm-queue-length.c @@ -23,12 +23,13 @@ #include <linux/atomic.h> #define DM_MSG_PREFIX "multipath queue-length" -#define QL_MIN_IO 128 -#define QL_VERSION "0.1.0" +#define QL_MIN_IO 1 +#define QL_VERSION "0.2.0" struct selector { struct list_head valid_paths; struct list_head failed_paths; + spinlock_t lock; }; struct path_info { @@ -45,6 +46,7 @@ static struct selector *alloc_selector(void) if (s) { INIT_LIST_HEAD(&s->valid_paths); INIT_LIST_HEAD(&s->failed_paths); + spin_lock_init(&s->lock); } return s; @@ -113,6 +115,7 @@ static int ql_add_path(struct path_selector *ps, struct dm_path *path, struct path_info *pi; unsigned repeat_count = QL_MIN_IO; char dummy; + unsigned long flags; /* * Arguments: [<repeat_count>] @@ -129,6 +132,11 @@ static int ql_add_path(struct path_selector *ps, struct dm_path *path, return -EINVAL; } + if (repeat_count > 1) { + DMWARN_LIMIT("repeat_count > 1 is deprecated, using 1 instead"); + repeat_count = 1; + } + /* Allocate the path information structure */ pi = kmalloc(sizeof(*pi), GFP_KERNEL); if (!pi) { @@ -142,7 +150,9 @@ static int ql_add_path(struct path_selector *ps, struct dm_path *path, path->pscontext = pi; + spin_lock_irqsave(&s->lock, flags); list_add_tail(&pi->list, &s->valid_paths); + spin_unlock_irqrestore(&s->lock, flags); return 0; } @@ -151,16 +161,22 @@ static void ql_fail_path(struct path_selector *ps, struct dm_path *path) { struct selector *s = ps->context; struct path_info *pi = path->pscontext; + unsigned long flags; + spin_lock_irqsave(&s->lock, flags); list_move(&pi->list, &s->failed_paths); + spin_unlock_irqrestore(&s->lock, flags); } static int ql_reinstate_path(struct path_selector *ps, struct dm_path *path) { struct selector *s = ps->context; struct path_info *pi = path->pscontext; + unsigned long flags; + 
spin_lock_irqsave(&s->lock, flags); list_move_tail(&pi->list, &s->valid_paths); + spin_unlock_irqrestore(&s->lock, flags); return 0; } @@ -168,14 +184,16 @@ static int ql_reinstate_path(struct path_selector *ps, struct dm_path *path) /* * Select a path having the minimum number of in-flight I/Os */ -static struct dm_path *ql_select_path(struct path_selector *ps, - unsigned *repeat_count, size_t nr_bytes) +static struct dm_path *ql_select_path(struct path_selector *ps, size_t nr_bytes) { struct selector *s = ps->context; struct path_info *pi = NULL, *best = NULL; + struct dm_path *ret = NULL; + unsigned long flags; + spin_lock_irqsave(&s->lock, flags); if (list_empty(&s->valid_paths)) - return NULL; + goto out; /* Change preferred (first in list) path to evenly balance. */ list_move_tail(s->valid_paths.next, &s->valid_paths); @@ -190,11 +208,12 @@ static struct dm_path *ql_select_path(struct path_selector *ps, } if (!best) - return NULL; - - *repeat_count = best->repeat_count; + goto out; - return best->path; + ret = best->path; +out: + spin_unlock_irqrestore(&s->lock, flags); + return ret; } static int ql_start_io(struct path_selector *ps, struct dm_path *path, diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index f2a363a89629..b3ccf1e0d4f2 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -1121,7 +1121,7 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv) ti->num_flush_bios = 1; ti->num_discard_bios = 1; - ti->per_bio_data_size = sizeof(struct dm_raid1_bio_record); + ti->per_io_data_size = sizeof(struct dm_raid1_bio_record); ti->discard_zeroes_data_unsupported = true; ms->kmirrord_wq = alloc_workqueue("kmirrord", WQ_MEM_RECLAIM, 0); diff --git a/drivers/md/dm-round-robin.c b/drivers/md/dm-round-robin.c index 6ab1192cdd5f..4ace1da17db8 100644 --- a/drivers/md/dm-round-robin.c +++ b/drivers/md/dm-round-robin.c @@ -17,6 +17,8 @@ #include <linux/module.h> #define DM_MSG_PREFIX "multipath round-robin" +#define RR_MIN_IO 1000 +#define RR_VERSION "1.1.0" /*----------------------------------------------------------------- * Path-handling code, paths are held in lists @@ -41,23 +43,48 @@ static void free_paths(struct list_head *paths) * Round-robin selector *---------------------------------------------------------------*/ -#define RR_MIN_IO 1000 - struct selector { struct list_head valid_paths; struct list_head invalid_paths; + spinlock_t lock; + struct dm_path * __percpu *current_path; + struct percpu_counter repeat_count; }; +static void set_percpu_current_path(struct selector *s, struct dm_path *path) +{ + int cpu; + + for_each_possible_cpu(cpu) + *per_cpu_ptr(s->current_path, cpu) = path; +} + static struct selector *alloc_selector(void) { struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL); - if (s) { - INIT_LIST_HEAD(&s->valid_paths); - INIT_LIST_HEAD(&s->invalid_paths); - } + if (!s) + return NULL; + + INIT_LIST_HEAD(&s->valid_paths); + INIT_LIST_HEAD(&s->invalid_paths); + spin_lock_init(&s->lock); + + s->current_path = alloc_percpu(struct dm_path *); + if (!s->current_path) + goto out_current_path; + set_percpu_current_path(s, NULL); + + if (percpu_counter_init(&s->repeat_count, 0, GFP_KERNEL)) + goto out_repeat_count; return s; + +out_repeat_count: + free_percpu(s->current_path); +out_current_path: + kfree(s); + return NULL;; } static int rr_create(struct path_selector *ps, unsigned argc, char **argv) @@ -74,10 +101,12 @@ static int rr_create(struct path_selector *ps, unsigned argc, char **argv) static void rr_destroy(struct 
path_selector *ps) { - struct selector *s = (struct selector *) ps->context; + struct selector *s = ps->context; free_paths(&s->valid_paths); free_paths(&s->invalid_paths); + free_percpu(s->current_path); + percpu_counter_destroy(&s->repeat_count); kfree(s); ps->context = NULL; } @@ -111,10 +140,11 @@ static int rr_status(struct path_selector *ps, struct dm_path *path, static int rr_add_path(struct path_selector *ps, struct dm_path *path, int argc, char **argv, char **error) { - struct selector *s = (struct selector *) ps->context; + struct selector *s = ps->context; struct path_info *pi; unsigned repeat_count = RR_MIN_IO; char dummy; + unsigned long flags; if (argc > 1) { *error = "round-robin ps: incorrect number of arguments"; @@ -139,42 +169,65 @@ static int rr_add_path(struct path_selector *ps, struct dm_path *path, path->pscontext = pi; + spin_lock_irqsave(&s->lock, flags); list_add_tail(&pi->list, &s->valid_paths); + spin_unlock_irqrestore(&s->lock, flags); return 0; } static void rr_fail_path(struct path_selector *ps, struct dm_path *p) { - struct selector *s = (struct selector *) ps->context; + unsigned long flags; + struct selector *s = ps->context; struct path_info *pi = p->pscontext; + spin_lock_irqsave(&s->lock, flags); + if (p == *this_cpu_ptr(s->current_path)) + set_percpu_current_path(s, NULL); + list_move(&pi->list, &s->invalid_paths); + spin_unlock_irqrestore(&s->lock, flags); } static int rr_reinstate_path(struct path_selector *ps, struct dm_path *p) { - struct selector *s = (struct selector *) ps->context; + unsigned long flags; + struct selector *s = ps->context; struct path_info *pi = p->pscontext; + spin_lock_irqsave(&s->lock, flags); list_move(&pi->list, &s->valid_paths); + spin_unlock_irqrestore(&s->lock, flags); return 0; } -static struct dm_path *rr_select_path(struct path_selector *ps, - unsigned *repeat_count, size_t nr_bytes) +static struct dm_path *rr_select_path(struct path_selector *ps, size_t nr_bytes) { - struct selector *s = (struct selector *) ps->context; + unsigned long flags; + struct selector *s = ps->context; struct path_info *pi = NULL; + struct dm_path *current_path = NULL; + + current_path = *this_cpu_ptr(s->current_path); + if (current_path) { + percpu_counter_dec(&s->repeat_count); + if (percpu_counter_read_positive(&s->repeat_count) > 0) + return current_path; + } + spin_lock_irqsave(&s->lock, flags); if (!list_empty(&s->valid_paths)) { pi = list_entry(s->valid_paths.next, struct path_info, list); list_move_tail(&pi->list, &s->valid_paths); - *repeat_count = pi->repeat_count; + percpu_counter_set(&s->repeat_count, pi->repeat_count); + set_percpu_current_path(s, pi->path); + current_path = pi->path; } + spin_unlock_irqrestore(&s->lock, flags); - return pi ? 
pi->path : NULL; + return current_path; } static struct path_selector_type rr_ps = { @@ -198,7 +251,7 @@ static int __init dm_rr_init(void) if (r < 0) DMERR("register failed %d", r); - DMINFO("version 1.0.0 loaded"); + DMINFO("version " RR_VERSION " loaded"); return r; } diff --git a/drivers/md/dm-service-time.c b/drivers/md/dm-service-time.c index 9df8f6bd6418..7b8642045c55 100644 --- a/drivers/md/dm-service-time.c +++ b/drivers/md/dm-service-time.c @@ -19,11 +19,12 @@ #define ST_MAX_RELATIVE_THROUGHPUT 100 #define ST_MAX_RELATIVE_THROUGHPUT_SHIFT 7 #define ST_MAX_INFLIGHT_SIZE ((size_t)-1 >> ST_MAX_RELATIVE_THROUGHPUT_SHIFT) -#define ST_VERSION "0.2.0" +#define ST_VERSION "0.3.0" struct selector { struct list_head valid_paths; struct list_head failed_paths; + spinlock_t lock; }; struct path_info { @@ -41,6 +42,7 @@ static struct selector *alloc_selector(void) if (s) { INIT_LIST_HEAD(&s->valid_paths); INIT_LIST_HEAD(&s->failed_paths); + spin_lock_init(&s->lock); } return s; @@ -111,6 +113,7 @@ static int st_add_path(struct path_selector *ps, struct dm_path *path, unsigned repeat_count = ST_MIN_IO; unsigned relative_throughput = 1; char dummy; + unsigned long flags; /* * Arguments: [<repeat_count> [<relative_throughput>]] @@ -134,6 +137,11 @@ static int st_add_path(struct path_selector *ps, struct dm_path *path, return -EINVAL; } + if (repeat_count > 1) { + DMWARN_LIMIT("repeat_count > 1 is deprecated, using 1 instead"); + repeat_count = 1; + } + if ((argc == 2) && (sscanf(argv[1], "%u%c", &relative_throughput, &dummy) != 1 || relative_throughput > ST_MAX_RELATIVE_THROUGHPUT)) { @@ -155,7 +163,9 @@ static int st_add_path(struct path_selector *ps, struct dm_path *path, path->pscontext = pi; + spin_lock_irqsave(&s->lock, flags); list_add_tail(&pi->list, &s->valid_paths); + spin_unlock_irqrestore(&s->lock, flags); return 0; } @@ -164,16 +174,22 @@ static void st_fail_path(struct path_selector *ps, struct dm_path *path) { struct selector *s = ps->context; struct path_info *pi = path->pscontext; + unsigned long flags; + spin_lock_irqsave(&s->lock, flags); list_move(&pi->list, &s->failed_paths); + spin_unlock_irqrestore(&s->lock, flags); } static int st_reinstate_path(struct path_selector *ps, struct dm_path *path) { struct selector *s = ps->context; struct path_info *pi = path->pscontext; + unsigned long flags; + spin_lock_irqsave(&s->lock, flags); list_move_tail(&pi->list, &s->valid_paths); + spin_unlock_irqrestore(&s->lock, flags); return 0; } @@ -255,14 +271,16 @@ static int st_compare_load(struct path_info *pi1, struct path_info *pi2, return pi2->relative_throughput - pi1->relative_throughput; } -static struct dm_path *st_select_path(struct path_selector *ps, - unsigned *repeat_count, size_t nr_bytes) +static struct dm_path *st_select_path(struct path_selector *ps, size_t nr_bytes) { struct selector *s = ps->context; struct path_info *pi = NULL, *best = NULL; + struct dm_path *ret = NULL; + unsigned long flags; + spin_lock_irqsave(&s->lock, flags); if (list_empty(&s->valid_paths)) - return NULL; + goto out; /* Change preferred (first in list) path to evenly balance. 
*/ list_move_tail(s->valid_paths.next, &s->valid_paths); @@ -272,11 +290,12 @@ static struct dm_path *st_select_path(struct path_selector *ps, best = pi; if (!best) - return NULL; - - *repeat_count = best->repeat_count; + goto out; - return best->path; + ret = best->path; +out: + spin_unlock_irqrestore(&s->lock, flags); + return ret; } static int st_start_io(struct path_selector *ps, struct dm_path *path, diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index 3766386080a4..70bb0e8b62ce 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -1105,6 +1105,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) int i; int r = -EINVAL; char *origin_path, *cow_path; + dev_t origin_dev, cow_dev; unsigned args_used, num_flush_bios = 1; fmode_t origin_mode = FMODE_READ; @@ -1135,11 +1136,19 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) ti->error = "Cannot get origin device"; goto bad_origin; } + origin_dev = s->origin->bdev->bd_dev; cow_path = argv[0]; argv++; argc--; + cow_dev = dm_get_dev_t(cow_path); + if (cow_dev && cow_dev == origin_dev) { + ti->error = "COW device cannot be the same as origin device"; + r = -EINVAL; + goto bad_cow; + } + r = dm_get_device(ti, cow_path, dm_table_get_mode(ti->table), &s->cow); if (r) { ti->error = "Cannot get COW device"; @@ -1201,7 +1210,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) ti->private = s; ti->num_flush_bios = num_flush_bios; - ti->per_bio_data_size = sizeof(struct dm_snap_tracked_chunk); + ti->per_io_data_size = sizeof(struct dm_snap_tracked_chunk); /* Add snapshot to the list of snapshots for this origin */ /* Exceptions aren't triggered till snapshot_resume() is called */ diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 061152a43730..f9e8f0bef332 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -365,6 +365,26 @@ static int upgrade_mode(struct dm_dev_internal *dd, fmode_t new_mode, } /* + * Convert the path to a device + */ +dev_t dm_get_dev_t(const char *path) +{ + dev_t uninitialized_var(dev); + struct block_device *bdev; + + bdev = lookup_bdev(path); + if (IS_ERR(bdev)) + dev = name_to_dev_t(path); + else { + dev = bdev->bd_dev; + bdput(bdev); + } + + return dev; +} +EXPORT_SYMBOL_GPL(dm_get_dev_t); + +/* * Add a device to the list, or just increment the usage count if * it's already present. 
*/ @@ -372,23 +392,15 @@ int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode, struct dm_dev **result) { int r; - dev_t uninitialized_var(dev); + dev_t dev; struct dm_dev_internal *dd; struct dm_table *t = ti->table; - struct block_device *bdev; BUG_ON(!t); - /* convert the path to a device */ - bdev = lookup_bdev(path); - if (IS_ERR(bdev)) { - dev = name_to_dev_t(path); - if (!dev) - return -ENODEV; - } else { - dev = bdev->bd_dev; - bdput(bdev); - } + dev = dm_get_dev_t(path); + if (!dev) + return -ENODEV; dd = find_device(&t->devices, dev); if (!dd) { @@ -920,6 +932,30 @@ struct target_type *dm_table_get_immutable_target_type(struct dm_table *t) return t->immutable_target_type; } +struct dm_target *dm_table_get_immutable_target(struct dm_table *t) +{ + /* Immutable target is implicitly a singleton */ + if (t->num_targets > 1 || + !dm_target_is_immutable(t->targets[0].type)) + return NULL; + + return t->targets; +} + +struct dm_target *dm_table_get_wildcard_target(struct dm_table *t) +{ + struct dm_target *uninitialized_var(ti); + unsigned i = 0; + + while (i < dm_table_get_num_targets(t)) { + ti = dm_table_get_target(t, i++); + if (dm_target_is_wildcard(ti->type)) + return ti; + } + + return NULL; +} + bool dm_table_request_based(struct dm_table *t) { return __table_type_request_based(dm_table_get_type(t)); @@ -933,7 +969,7 @@ bool dm_table_mq_request_based(struct dm_table *t) static int dm_table_alloc_md_mempools(struct dm_table *t, struct mapped_device *md) { unsigned type = dm_table_get_type(t); - unsigned per_bio_data_size = 0; + unsigned per_io_data_size = 0; struct dm_target *tgt; unsigned i; @@ -945,10 +981,10 @@ static int dm_table_alloc_md_mempools(struct dm_table *t, struct mapped_device * if (type == DM_TYPE_BIO_BASED) for (i = 0; i < t->num_targets; i++) { tgt = t->targets + i; - per_bio_data_size = max(per_bio_data_size, tgt->per_bio_data_size); + per_io_data_size = max(per_io_data_size, tgt->per_io_data_size); } - t->mempools = dm_alloc_md_mempools(md, type, t->integrity_supported, per_bio_data_size); + t->mempools = dm_alloc_md_mempools(md, type, t->integrity_supported, per_io_data_size); if (!t->mempools) return -ENOMEM; diff --git a/drivers/md/dm-target.c b/drivers/md/dm-target.c index 925ec1b15e75..a317dd884ba6 100644 --- a/drivers/md/dm-target.c +++ b/drivers/md/dm-target.c @@ -150,7 +150,8 @@ static void io_err_release_clone_rq(struct request *clone) static struct target_type error_target = { .name = "error", - .version = {1, 3, 0}, + .version = {1, 4, 0}, + .features = DM_TARGET_WILDCARD, .ctr = io_err_ctr, .dtr = io_err_dtr, .map = io_err_map, diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c index f962d6453afd..43824d73366d 100644 --- a/drivers/md/dm-thin-metadata.c +++ b/drivers/md/dm-thin-metadata.c @@ -344,7 +344,7 @@ static void subtree_dec(void *context, const void *value) memcpy(&root_le, value, sizeof(root_le)); root = le64_to_cpu(root_le); if (dm_btree_del(info, root)) - DMERR("btree delete failed\n"); + DMERR("btree delete failed"); } static int subtree_equal(void *context, const void *value1_le, const void *value2_le) @@ -1981,5 +1981,8 @@ bool dm_pool_metadata_needs_check(struct dm_pool_metadata *pmd) void dm_pool_issue_prefetches(struct dm_pool_metadata *pmd) { - dm_tm_issue_prefetches(pmd->tm); + down_read(&pmd->root_lock); + if (!pmd->fail_io) + dm_tm_issue_prefetches(pmd->tm); + up_read(&pmd->root_lock); } diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 72d91f477683..92237b6fa8cd 100644 --- 
a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -235,6 +235,7 @@ struct pool { struct pool_features pf; bool low_water_triggered:1; /* A dm event has been sent */ bool suspended:1; + bool out_of_data_space:1; struct dm_bio_prison *prison; struct dm_kcopyd_client *copier; @@ -461,9 +462,16 @@ static void cell_error_with_code(struct pool *pool, dm_bio_prison_free_cell(pool->prison, cell); } +static int get_pool_io_error_code(struct pool *pool) +{ + return pool->out_of_data_space ? -ENOSPC : -EIO; +} + static void cell_error(struct pool *pool, struct dm_bio_prison_cell *cell) { - cell_error_with_code(pool, cell, -EIO); + int error = get_pool_io_error_code(pool); + + cell_error_with_code(pool, cell, error); } static void cell_success(struct pool *pool, struct dm_bio_prison_cell *cell) @@ -622,7 +630,9 @@ static void error_retry_list_with_code(struct pool *pool, int error) static void error_retry_list(struct pool *pool) { - return error_retry_list_with_code(pool, -EIO); + int error = get_pool_io_error_code(pool); + + return error_retry_list_with_code(pool, error); } /* @@ -2419,6 +2429,7 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode) */ if (old_mode != new_mode) notify_of_pool_mode_change_to_oods(pool); + pool->out_of_data_space = true; pool->process_bio = process_bio_read_only; pool->process_discard = process_discard_bio; pool->process_cell = process_cell_read_only; @@ -2432,6 +2443,7 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode) case PM_WRITE: if (old_mode != new_mode) notify_of_pool_mode_change(pool, "write"); + pool->out_of_data_space = false; pool->pf.error_if_no_space = pt->requested_pf.error_if_no_space; dm_pool_metadata_read_write(pool->pmd); pool->process_bio = process_bio; @@ -2832,6 +2844,7 @@ static struct pool *pool_create(struct mapped_device *pool_md, INIT_LIST_HEAD(&pool->active_thins); pool->low_water_triggered = false; pool->suspended = true; + pool->out_of_data_space = false; pool->shared_read_ds = dm_deferred_set_create(); if (!pool->shared_read_ds) { @@ -3886,7 +3899,7 @@ static struct target_type pool_target = { .name = "thin-pool", .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE | DM_TARGET_IMMUTABLE, - .version = {1, 17, 0}, + .version = {1, 18, 0}, .module = THIS_MODULE, .ctr = pool_ctr, .dtr = pool_dtr, @@ -4037,7 +4050,7 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv) ti->num_flush_bios = 1; ti->flush_supported = true; - ti->per_bio_data_size = sizeof(struct dm_thin_endio_hook); + ti->per_io_data_size = sizeof(struct dm_thin_endio_hook); /* In case the pool supports discards, pass them on. 
*/ ti->discard_zeroes_data_unsupported = true; @@ -4260,7 +4273,7 @@ static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits) static struct target_type thin_target = { .name = "thin", - .version = {1, 17, 0}, + .version = {1, 18, 0}, .module = THIS_MODULE, .ctr = thin_ctr, .dtr = thin_dtr, diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c index 1cc10c4de701..459a9f8905ed 100644 --- a/drivers/md/dm-verity-fec.c +++ b/drivers/md/dm-verity-fec.c @@ -812,7 +812,7 @@ int verity_fec_ctr(struct dm_verity *v) } /* Reserve space for our per-bio data */ - ti->per_bio_data_size += sizeof(struct dm_verity_fec_io); + ti->per_io_data_size += sizeof(struct dm_verity_fec_io); return 0; } diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index 5c5d30cb6ec5..0aba34a7b3b3 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -354,7 +354,7 @@ int verity_for_bv_block(struct dm_verity *v, struct dm_verity_io *io, size_t len)) { unsigned todo = 1 << v->data_dev_block_bits; - struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_bio_data_size); + struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_io_data_size); do { int r; @@ -460,7 +460,7 @@ static int verity_verify_io(struct dm_verity_io *io) static void verity_finish_io(struct dm_verity_io *io, int error) { struct dm_verity *v = io->v; - struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_bio_data_size); + struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_io_data_size); bio->bi_end_io = io->orig_bi_end_io; bio->bi_error = error; @@ -574,7 +574,7 @@ static int verity_map(struct dm_target *ti, struct bio *bio) if (bio_data_dir(bio) == WRITE) return -EIO; - io = dm_per_bio_data(bio, ti->per_bio_data_size); + io = dm_per_bio_data(bio, ti->per_io_data_size); io->v = v; io->orig_bi_end_io = bio->bi_end_io; io->block = bio->bi_iter.bi_sector >> (v->data_dev_block_bits - SECTOR_SHIFT); @@ -1036,15 +1036,15 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv) goto bad; } - ti->per_bio_data_size = sizeof(struct dm_verity_io) + + ti->per_io_data_size = sizeof(struct dm_verity_io) + v->shash_descsize + v->digest_size * 2; r = verity_fec_ctr(v); if (r) goto bad; - ti->per_bio_data_size = roundup(ti->per_bio_data_size, - __alignof__(struct dm_verity_io)); + ti->per_io_data_size = roundup(ti->per_io_data_size, + __alignof__(struct dm_verity_io)); return 0; diff --git a/drivers/md/dm.c b/drivers/md/dm.c index dd834927bc66..be4905769a45 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -106,14 +106,6 @@ struct dm_rq_clone_bio_info { struct bio clone; }; -union map_info *dm_get_rq_mapinfo(struct request *rq) -{ - if (rq && rq->end_io_data) - return &((struct dm_rq_target_io *)rq->end_io_data)->info; - return NULL; -} -EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo); - #define MINOR_ALLOCED ((void *)-1) /* @@ -129,28 +121,18 @@ EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo); #define DMF_SUSPENDED_INTERNALLY 7 /* - * A dummy definition to make RCU happy. - * struct dm_table should never be dereferenced in this file. - */ -struct dm_table { - int undefined__; -}; - -/* * Work processed by per-device workqueue. */ struct mapped_device { struct srcu_struct io_barrier; struct mutex suspend_lock; - atomic_t holders; - atomic_t open_count; /* - * The current mapping. + * The current mapping (struct dm_table *). * Use dm_get_live_table{_fast} or take suspend_lock for * dereference. 
*/ - struct dm_table __rcu *map; + void __rcu *map; struct list_head table_devices; struct mutex table_devices_lock; @@ -158,10 +140,16 @@ struct mapped_device { unsigned long flags; struct request_queue *queue; + int numa_node_id; + unsigned type; /* Protect queue and type against concurrent access. */ struct mutex type_lock; + atomic_t holders; + atomic_t open_count; + + struct dm_target *immutable_target; struct target_type *immutable_target_type; struct gendisk *disk; @@ -175,8 +163,20 @@ struct mapped_device { atomic_t pending[2]; wait_queue_head_t wait; struct work_struct work; - struct bio_list deferred; spinlock_t deferred_lock; + struct bio_list deferred; + + /* + * Event handling. + */ + wait_queue_head_t eventq; + atomic_t event_nr; + atomic_t uevent_seq; + struct list_head uevent_list; + spinlock_t uevent_lock; /* Protect access to uevent_list */ + + /* the number of internal suspends */ + unsigned internal_suspend_count; /* * Processing queue (flush) @@ -192,32 +192,21 @@ struct mapped_device { struct bio_set *bs; /* - * Event handling. - */ - atomic_t event_nr; - wait_queue_head_t eventq; - atomic_t uevent_seq; - struct list_head uevent_list; - spinlock_t uevent_lock; /* Protect access to uevent_list */ - - /* * freeze/thaw support require holding onto a super block */ struct super_block *frozen_sb; - struct block_device *bdev; /* forced geometry settings */ struct hd_geometry geometry; + struct block_device *bdev; + /* kobject and completion */ struct dm_kobject_holder kobj_holder; /* zero-length flush that will be cloned and submitted to targets */ struct bio flush_bio; - /* the number of internal suspends */ - unsigned internal_suspend_count; - struct dm_stats stats; struct kthread_worker kworker; @@ -230,8 +219,9 @@ struct mapped_device { ktime_t last_rq_start_time; /* for blk-mq request-based DM support */ - struct blk_mq_tag_set tag_set; - bool use_blk_mq; + struct blk_mq_tag_set *tag_set; + bool use_blk_mq:1; + bool init_tio_pdu:1; }; #ifdef CONFIG_DM_MQ_DEFAULT @@ -240,10 +230,19 @@ static bool use_blk_mq = true; static bool use_blk_mq = false; #endif +#define DM_MQ_NR_HW_QUEUES 1 +#define DM_MQ_QUEUE_DEPTH 2048 +#define DM_NUMA_NODE NUMA_NO_NODE + +static unsigned dm_mq_nr_hw_queues = DM_MQ_NR_HW_QUEUES; +static unsigned dm_mq_queue_depth = DM_MQ_QUEUE_DEPTH; +static int dm_numa_node = DM_NUMA_NODE; + bool dm_use_blk_mq(struct mapped_device *md) { return md->use_blk_mq; } +EXPORT_SYMBOL_GPL(dm_use_blk_mq); /* * For mempools pre-allocation at the table loading time. 
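
dm-table.c above also exports a new dm_get_dev_t() helper that resolves a block-device path to a dev_t (via lookup_bdev(), falling back to name_to_dev_t()) without pinning the device; a return of 0 means the path could not be resolved. snapshot_ctr() uses it to reject a COW device that is the same block device as the origin before taking any reference. A hypothetical constructor fragment using the same idea (the function name and error strings are illustrative only):

static int example_ctr_check_devices(struct dm_target *ti,
				     const char *data_path,
				     const char *meta_path)
{
	dev_t data_dev = dm_get_dev_t(data_path);
	dev_t meta_dev = dm_get_dev_t(meta_path);

	/* 0 means neither lookup_bdev() nor name_to_dev_t() resolved it */
	if (!data_dev || !meta_dev) {
		ti->error = "Cannot resolve device path";
		return -ENODEV;
	}

	if (data_dev == meta_dev) {
		ti->error = "Data and metadata devices must differ";
		return -EINVAL;
	}

	return 0;
}

Because no reference is taken here, the later dm_get_device() call is still what pins the devices for the lifetime of the table.
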
@@ -277,6 +276,27 @@ static unsigned reserved_bio_based_ios = RESERVED_BIO_BASED_IOS; */ static unsigned reserved_rq_based_ios = RESERVED_REQUEST_BASED_IOS; +static int __dm_get_module_param_int(int *module_param, int min, int max) +{ + int param = ACCESS_ONCE(*module_param); + int modified_param = 0; + bool modified = true; + + if (param < min) + modified_param = min; + else if (param > max) + modified_param = max; + else + modified = false; + + if (modified) { + (void)cmpxchg(module_param, param, modified_param); + param = modified_param; + } + + return param; +} + static unsigned __dm_get_module_param(unsigned *module_param, unsigned def, unsigned max) { @@ -310,6 +330,23 @@ unsigned dm_get_reserved_rq_based_ios(void) } EXPORT_SYMBOL_GPL(dm_get_reserved_rq_based_ios); +static unsigned dm_get_blk_mq_nr_hw_queues(void) +{ + return __dm_get_module_param(&dm_mq_nr_hw_queues, 1, 32); +} + +static unsigned dm_get_blk_mq_queue_depth(void) +{ + return __dm_get_module_param(&dm_mq_queue_depth, + DM_MQ_QUEUE_DEPTH, BLK_MQ_MAX_DEPTH); +} + +static unsigned dm_get_numa_node(void) +{ + return __dm_get_module_param_int(&dm_numa_node, + DM_NUMA_NODE, num_online_nodes() - 1); +} + static int __init local_init(void) { int r = -ENOMEM; @@ -323,7 +360,7 @@ static int __init local_init(void) if (!_rq_tio_cache) goto out_free_io_cache; - _rq_cache = kmem_cache_create("dm_clone_request", sizeof(struct request), + _rq_cache = kmem_cache_create("dm_old_clone_request", sizeof(struct request), __alignof__(struct request), 0, NULL); if (!_rq_cache) goto out_free_rq_tio_cache; @@ -556,16 +593,17 @@ static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo) return dm_get_geometry(md, geo); } -static int dm_get_live_table_for_ioctl(struct mapped_device *md, - struct dm_target **tgt, struct block_device **bdev, - fmode_t *mode, int *srcu_idx) +static int dm_grab_bdev_for_ioctl(struct mapped_device *md, + struct block_device **bdev, + fmode_t *mode) { + struct dm_target *tgt; struct dm_table *map; - int r; + int srcu_idx, r; retry: r = -ENOTTY; - map = dm_get_live_table(md, srcu_idx); + map = dm_get_live_table(md, &srcu_idx); if (!map || !dm_table_get_size(map)) goto out; @@ -573,9 +611,8 @@ retry: if (dm_table_get_num_targets(map) != 1) goto out; - *tgt = dm_table_get_target(map, 0); - - if (!(*tgt)->type->prepare_ioctl) + tgt = dm_table_get_target(map, 0); + if (!tgt->type->prepare_ioctl) goto out; if (dm_suspended_md(md)) { @@ -583,14 +620,16 @@ retry: goto out; } - r = (*tgt)->type->prepare_ioctl(*tgt, bdev, mode); + r = tgt->type->prepare_ioctl(tgt, bdev, mode); if (r < 0) goto out; + bdgrab(*bdev); + dm_put_live_table(md, srcu_idx); return r; out: - dm_put_live_table(md, *srcu_idx); + dm_put_live_table(md, srcu_idx); if (r == -ENOTCONN && !fatal_signal_pending(current)) { msleep(10); goto retry; @@ -602,11 +641,9 @@ static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg) { struct mapped_device *md = bdev->bd_disk->private_data; - struct dm_target *tgt; - struct block_device *tgt_bdev = NULL; - int srcu_idx, r; + int r; - r = dm_get_live_table_for_ioctl(md, &tgt, &tgt_bdev, &mode, &srcu_idx); + r = dm_grab_bdev_for_ioctl(md, &bdev, &mode); if (r < 0) return r; @@ -621,9 +658,9 @@ static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode, goto out; } - r = __blkdev_driver_ioctl(tgt_bdev, mode, cmd, arg); + r = __blkdev_driver_ioctl(bdev, mode, cmd, arg); out: - dm_put_live_table(md, srcu_idx); + bdput(bdev); return r; } @@ -642,24 +679,24 @@ 
static void free_tio(struct mapped_device *md, struct dm_target_io *tio) bio_put(&tio->clone); } -static struct dm_rq_target_io *alloc_rq_tio(struct mapped_device *md, - gfp_t gfp_mask) +static struct dm_rq_target_io *alloc_old_rq_tio(struct mapped_device *md, + gfp_t gfp_mask) { return mempool_alloc(md->io_pool, gfp_mask); } -static void free_rq_tio(struct dm_rq_target_io *tio) +static void free_old_rq_tio(struct dm_rq_target_io *tio) { mempool_free(tio, tio->md->io_pool); } -static struct request *alloc_clone_request(struct mapped_device *md, - gfp_t gfp_mask) +static struct request *alloc_old_clone_request(struct mapped_device *md, + gfp_t gfp_mask) { return mempool_alloc(md->rq_pool, gfp_mask); } -static void free_clone_request(struct mapped_device *md, struct request *rq) +static void free_old_clone_request(struct mapped_device *md, struct request *rq) { mempool_free(rq, md->rq_pool); } @@ -827,7 +864,7 @@ int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode, mutex_lock(&md->table_devices_lock); td = find_table_device(&md->table_devices, dev, mode); if (!td) { - td = kmalloc(sizeof(*td), GFP_KERNEL); + td = kmalloc_node(sizeof(*td), GFP_KERNEL, md->numa_node_id); if (!td) { mutex_unlock(&md->table_devices_lock); return -ENOMEM; @@ -1109,12 +1146,8 @@ static void rq_completed(struct mapped_device *md, int rw, bool run_queue) * back into ->request_fn() could deadlock attempting to grab the * queue lock again. */ - if (run_queue) { - if (md->queue->mq_ops) - blk_mq_run_hw_queues(md->queue, true); - else - blk_run_queue_async(md->queue); - } + if (!md->queue->mq_ops && run_queue) + blk_run_queue_async(md->queue); /* * dm_put() must be at the end of this function. See the comment above @@ -1134,15 +1167,10 @@ static void free_rq_clone(struct request *clone) tio->ti->type->release_clone_rq(clone); else if (!md->queue->mq_ops) /* request_fn queue stacked on request_fn queue(s) */ - free_clone_request(md, clone); - /* - * NOTE: for the blk-mq queue stacked on request_fn queue(s) case: - * no need to call free_clone_request() because we leverage blk-mq by - * allocating the clone at the end of the blk-mq pdu (see: clone_rq) - */ + free_old_clone_request(md, clone); if (!md->queue->mq_ops) - free_rq_tio(tio); + free_old_rq_tio(tio); } /* @@ -1192,13 +1220,13 @@ static void dm_unprep_request(struct request *rq) if (clone) free_rq_clone(clone); else if (!tio->md->queue->mq_ops) - free_rq_tio(tio); + free_old_rq_tio(tio); } /* * Requeue the original request of a clone. 
*/ -static void old_requeue_request(struct request *rq) +static void dm_old_requeue_request(struct request *rq) { struct request_queue *q = rq->q; unsigned long flags; @@ -1209,45 +1237,57 @@ static void old_requeue_request(struct request *rq) spin_unlock_irqrestore(q->queue_lock, flags); } +static void dm_mq_requeue_request(struct request *rq) +{ + struct request_queue *q = rq->q; + unsigned long flags; + + blk_mq_requeue_request(rq); + spin_lock_irqsave(q->queue_lock, flags); + if (!blk_queue_stopped(q)) + blk_mq_kick_requeue_list(q); + spin_unlock_irqrestore(q->queue_lock, flags); +} + static void dm_requeue_original_request(struct mapped_device *md, struct request *rq) { int rw = rq_data_dir(rq); + rq_end_stats(md, rq); dm_unprep_request(rq); - rq_end_stats(md, rq); if (!rq->q->mq_ops) - old_requeue_request(rq); - else { - blk_mq_requeue_request(rq); - blk_mq_kick_requeue_list(rq->q); - } + dm_old_requeue_request(rq); + else + dm_mq_requeue_request(rq); rq_completed(md, rw, false); } -static void old_stop_queue(struct request_queue *q) +static void dm_old_stop_queue(struct request_queue *q) { unsigned long flags; - if (blk_queue_stopped(q)) + spin_lock_irqsave(q->queue_lock, flags); + if (blk_queue_stopped(q)) { + spin_unlock_irqrestore(q->queue_lock, flags); return; + } - spin_lock_irqsave(q->queue_lock, flags); blk_stop_queue(q); spin_unlock_irqrestore(q->queue_lock, flags); } -static void stop_queue(struct request_queue *q) +static void dm_stop_queue(struct request_queue *q) { if (!q->mq_ops) - old_stop_queue(q); + dm_old_stop_queue(q); else blk_mq_stop_hw_queues(q); } -static void old_start_queue(struct request_queue *q) +static void dm_old_start_queue(struct request_queue *q) { unsigned long flags; @@ -1257,12 +1297,14 @@ static void old_start_queue(struct request_queue *q) spin_unlock_irqrestore(q->queue_lock, flags); } -static void start_queue(struct request_queue *q) +static void dm_start_queue(struct request_queue *q) { if (!q->mq_ops) - old_start_queue(q); - else + dm_old_start_queue(q); + else { blk_mq_start_stopped_hw_queues(q, true); + blk_mq_kick_requeue_list(q); + } } static void dm_done(struct request *clone, int error, bool mapped) @@ -1313,7 +1355,7 @@ static void dm_softirq_done(struct request *rq) if (!rq->q->mq_ops) { blk_end_request_all(rq, tio->error); rq_completed(tio->md, rw, false); - free_rq_tio(tio); + free_old_rq_tio(tio); } else { blk_mq_end_request(rq, tio->error); rq_completed(tio->md, rw, false); @@ -1336,7 +1378,10 @@ static void dm_complete_request(struct request *rq, int error) struct dm_rq_target_io *tio = tio_from_request(rq); tio->error = error; - blk_complete_request(rq); + if (!rq->q->mq_ops) + blk_complete_request(rq); + else + blk_mq_complete_request(rq, error); } /* @@ -1352,7 +1397,7 @@ static void dm_kill_unmapped_request(struct request *rq, int error) } /* - * Called with the clone's queue lock held (for non-blk-mq) + * Called with the clone's queue lock held (in the case of .request_fn) */ static void end_clone_request(struct request *clone, int error) { @@ -1522,21 +1567,26 @@ static void bio_setup_sector(struct bio *bio, sector_t sector, unsigned len) /* * Creates a bio that consists of range of complete bvecs. 
*/ -static void clone_bio(struct dm_target_io *tio, struct bio *bio, - sector_t sector, unsigned len) +static int clone_bio(struct dm_target_io *tio, struct bio *bio, + sector_t sector, unsigned len) { struct bio *clone = &tio->clone; __bio_clone_fast(clone, bio); - if (bio_integrity(bio)) - bio_integrity_clone(clone, bio, GFP_NOIO); + if (bio_integrity(bio)) { + int r = bio_integrity_clone(clone, bio, GFP_NOIO); + if (r < 0) + return r; + } bio_advance(clone, to_bytes(sector - clone->bi_iter.bi_sector)); clone->bi_iter.bi_size = to_bytes(len); if (bio_integrity(bio)) bio_integrity_trim(clone, 0, len); + + return 0; } static struct dm_target_io *alloc_tio(struct clone_info *ci, @@ -1593,13 +1643,14 @@ static int __send_empty_flush(struct clone_info *ci) return 0; } -static void __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti, +static int __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti, sector_t sector, unsigned *len) { struct bio *bio = ci->bio; struct dm_target_io *tio; unsigned target_bio_nr; unsigned num_target_bios = 1; + int r = 0; /* * Does the target want to receive duplicate copies of the bio? @@ -1610,9 +1661,13 @@ static void __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti for (target_bio_nr = 0; target_bio_nr < num_target_bios; target_bio_nr++) { tio = alloc_tio(ci, ti, target_bio_nr); tio->len_ptr = len; - clone_bio(tio, bio, sector, *len); + r = clone_bio(tio, bio, sector, *len); + if (r < 0) + break; __map_bio(tio); } + + return r; } typedef unsigned (*get_num_bios_fn)(struct dm_target *ti); @@ -1689,6 +1744,7 @@ static int __split_and_process_non_flush(struct clone_info *ci) struct bio *bio = ci->bio; struct dm_target *ti; unsigned len; + int r; if (unlikely(bio->bi_rw & REQ_DISCARD)) return __send_discard(ci); @@ -1701,7 +1757,9 @@ static int __split_and_process_non_flush(struct clone_info *ci) len = min_t(sector_t, max_io_len(ci->sector, ti), ci->sector_count); - __clone_and_map_data_bio(ci, ti, ci->sector, &len); + r = __clone_and_map_data_bio(ci, ti, ci->sector, &len); + if (r < 0) + return r; ci->sector += len; ci->sector_count -= len; @@ -1839,28 +1897,22 @@ static int setup_clone(struct request *clone, struct request *rq, return 0; } -static struct request *clone_rq(struct request *rq, struct mapped_device *md, - struct dm_rq_target_io *tio, gfp_t gfp_mask) +static struct request *clone_old_rq(struct request *rq, struct mapped_device *md, + struct dm_rq_target_io *tio, gfp_t gfp_mask) { /* - * Do not allocate a clone if tio->clone was already set - * (see: dm_mq_queue_rq). 
+ * Create clone for use with .request_fn request_queue */ - bool alloc_clone = !tio->clone; struct request *clone; - if (alloc_clone) { - clone = alloc_clone_request(md, gfp_mask); - if (!clone) - return NULL; - } else - clone = tio->clone; + clone = alloc_old_clone_request(md, gfp_mask); + if (!clone) + return NULL; blk_rq_init(NULL, clone); if (setup_clone(clone, rq, tio, gfp_mask)) { /* -ENOMEM */ - if (alloc_clone) - free_clone_request(md, clone); + free_old_clone_request(md, clone); return NULL; } @@ -1877,29 +1929,40 @@ static void init_tio(struct dm_rq_target_io *tio, struct request *rq, tio->clone = NULL; tio->orig = rq; tio->error = 0; - memset(&tio->info, 0, sizeof(tio->info)); + /* + * Avoid initializing info for blk-mq; it passes + * target-specific data through info.ptr + * (see: dm_mq_init_request) + */ + if (!md->init_tio_pdu) + memset(&tio->info, 0, sizeof(tio->info)); if (md->kworker_task) init_kthread_work(&tio->work, map_tio_request); } -static struct dm_rq_target_io *prep_tio(struct request *rq, - struct mapped_device *md, gfp_t gfp_mask) +static struct dm_rq_target_io *dm_old_prep_tio(struct request *rq, + struct mapped_device *md, + gfp_t gfp_mask) { struct dm_rq_target_io *tio; int srcu_idx; struct dm_table *table; - tio = alloc_rq_tio(md, gfp_mask); + tio = alloc_old_rq_tio(md, gfp_mask); if (!tio) return NULL; init_tio(tio, rq, md); table = dm_get_live_table(md, &srcu_idx); + /* + * Must clone a request if this .request_fn DM device + * is stacked on .request_fn device(s). + */ if (!dm_table_mq_request_based(table)) { - if (!clone_rq(rq, md, tio, gfp_mask)) { + if (!clone_old_rq(rq, md, tio, gfp_mask)) { dm_put_live_table(md, srcu_idx); - free_rq_tio(tio); + free_old_rq_tio(tio); return NULL; } } @@ -1911,7 +1974,7 @@ static struct dm_rq_target_io *prep_tio(struct request *rq, /* * Called with the queue lock held. 
*/ -static int dm_prep_fn(struct request_queue *q, struct request *rq) +static int dm_old_prep_fn(struct request_queue *q, struct request *rq) { struct mapped_device *md = q->queuedata; struct dm_rq_target_io *tio; @@ -1921,7 +1984,7 @@ static int dm_prep_fn(struct request_queue *q, struct request *rq) return BLKPREP_KILL; } - tio = prep_tio(rq, md, GFP_ATOMIC); + tio = dm_old_prep_tio(rq, md, GFP_ATOMIC); if (!tio) return BLKPREP_DEFER; @@ -2079,12 +2142,18 @@ static bool dm_request_peeked_before_merge_deadline(struct mapped_device *md) static void dm_request_fn(struct request_queue *q) { struct mapped_device *md = q->queuedata; - int srcu_idx; - struct dm_table *map = dm_get_live_table(md, &srcu_idx); - struct dm_target *ti; + struct dm_target *ti = md->immutable_target; struct request *rq; struct dm_rq_target_io *tio; - sector_t pos; + sector_t pos = 0; + + if (unlikely(!ti)) { + int srcu_idx; + struct dm_table *map = dm_get_live_table(md, &srcu_idx); + + ti = dm_table_find_target(map, pos); + dm_put_live_table(md, srcu_idx); + } /* * For suspend, check blk_queue_stopped() and increment @@ -2095,33 +2164,21 @@ static void dm_request_fn(struct request_queue *q) while (!blk_queue_stopped(q)) { rq = blk_peek_request(q); if (!rq) - goto out; + return; /* always use block 0 to find the target for flushes for now */ pos = 0; if (!(rq->cmd_flags & REQ_FLUSH)) pos = blk_rq_pos(rq); - ti = dm_table_find_target(map, pos); - if (!dm_target_is_valid(ti)) { - /* - * Must perform setup, that rq_completed() requires, - * before calling dm_kill_unmapped_request - */ - DMERR_LIMIT("request attempted access beyond the end of device"); - dm_start_request(md, rq); - dm_kill_unmapped_request(rq, -EIO); - continue; + if ((dm_request_peeked_before_merge_deadline(md) && + md_in_flight(md) && rq->bio && rq->bio->bi_vcnt == 1 && + md->last_rq_pos == pos && md->last_rq_rw == rq_data_dir(rq)) || + (ti->type->busy && ti->type->busy(ti))) { + blk_delay_queue(q, HZ / 100); + return; } - if (dm_request_peeked_before_merge_deadline(md) && - md_in_flight(md) && rq->bio && rq->bio->bi_vcnt == 1 && - md->last_rq_pos == pos && md->last_rq_rw == rq_data_dir(rq)) - goto delay_and_out; - - if (ti->type->busy && ti->type->busy(ti)) - goto delay_and_out; - dm_start_request(md, rq); tio = tio_from_request(rq); @@ -2130,13 +2187,6 @@ static void dm_request_fn(struct request_queue *q) queue_kthread_work(&md->kworker, &tio->work); BUG_ON(!irqs_disabled()); } - - goto out; - -delay_and_out: - blk_delay_queue(q, HZ / 100); -out: - dm_put_live_table(md, srcu_idx); } static int dm_any_congested(void *congested_data, int bdi_bits) @@ -2146,19 +2196,18 @@ static int dm_any_congested(void *congested_data, int bdi_bits) struct dm_table *map; if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) { - map = dm_get_live_table_fast(md); - if (map) { + if (dm_request_based(md)) { /* - * Request-based dm cares about only own queue for - * the query about congestion status of request_queue + * With request-based DM we only need to check the + * top-level queue for congestion. 
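For reference, the renamed dm_old_prep_fn() keeps the usual .prep_rq_fn contract; the behavioural point worth spelling out is that a failed GFP_ATOMIC tio allocation defers the request instead of failing it. Annotated excerpt, with the BLKPREP_* meanings added as comments (my reading of the block layer semantics):

	/*
	 * BLKPREP_OK    - prepared, the request may be dispatched
	 * BLKPREP_KILL  - fail the request outright
	 * BLKPREP_DEFER - leave the request on the queue and retry later;
	 *                 used here when alloc_old_rq_tio(GFP_ATOMIC) fails
	 */
	tio = dm_old_prep_tio(rq, md, GFP_ATOMIC);
	if (!tio)
		return BLKPREP_DEFER;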
*/ - if (dm_request_based(md)) - r = md->queue->backing_dev_info.wb.state & - bdi_bits; - else + r = md->queue->backing_dev_info.wb.state & bdi_bits; + } else { + map = dm_get_live_table_fast(md); + if (map) r = dm_table_any_congested(map, bdi_bits); + dm_put_live_table_fast(md); } - dm_put_live_table_fast(md); } return r; @@ -2238,7 +2287,7 @@ static void dm_init_md_queue(struct mapped_device *md) md->queue->backing_dev_info.congested_data = md; } -static void dm_init_old_md_queue(struct mapped_device *md) +static void dm_init_normal_md_queue(struct mapped_device *md) { md->use_blk_mq = false; dm_init_md_queue(md); @@ -2285,10 +2334,11 @@ static void cleanup_mapped_device(struct mapped_device *md) */ static struct mapped_device *alloc_dev(int minor) { - int r; - struct mapped_device *md = kzalloc(sizeof(*md), GFP_KERNEL); + int r, numa_node_id = dm_get_numa_node(); + struct mapped_device *md; void *old_md; + md = kzalloc_node(sizeof(*md), GFP_KERNEL, numa_node_id); if (!md) { DMWARN("unable to allocate device, out of memory."); return NULL; @@ -2309,7 +2359,9 @@ static struct mapped_device *alloc_dev(int minor) if (r < 0) goto bad_io_barrier; + md->numa_node_id = numa_node_id; md->use_blk_mq = use_blk_mq; + md->init_tio_pdu = false; md->type = DM_TYPE_NONE; mutex_init(&md->suspend_lock); mutex_init(&md->type_lock); @@ -2323,13 +2375,13 @@ static struct mapped_device *alloc_dev(int minor) INIT_LIST_HEAD(&md->table_devices); spin_lock_init(&md->uevent_lock); - md->queue = blk_alloc_queue(GFP_KERNEL); + md->queue = blk_alloc_queue_node(GFP_KERNEL, numa_node_id); if (!md->queue) goto bad; dm_init_md_queue(md); - md->disk = alloc_disk(1); + md->disk = alloc_disk_node(1, numa_node_id); if (!md->disk) goto bad; @@ -2393,8 +2445,10 @@ static void free_dev(struct mapped_device *md) unlock_fs(md); cleanup_mapped_device(md); - if (md->use_blk_mq) - blk_mq_free_tag_set(&md->tag_set); + if (md->tag_set) { + blk_mq_free_tag_set(md->tag_set); + kfree(md->tag_set); + } free_table_devices(&md->table_devices); dm_stats_cleanup(&md->stats); @@ -2502,13 +2556,20 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t, * This must be done before setting the queue restrictions, * because request-based dm may be run just after the setting. 
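alloc_dev() above now places the mapped_device, its request_queue and its gendisk on a caller-selected NUMA node taken from the new dm_numa_node module parameter declared near the end of this patch. dm_get_numa_node() itself is not shown in this excerpt; the sketch below is only an assumption about what such a helper needs to guarantee, namely a value that kzalloc_node() and friends will accept:

#include <linux/module.h>
#include <linux/numa.h>
#include <linux/nodemask.h>

static int dm_numa_node = NUMA_NO_NODE;	/* module parameter from this patch */

/* Hypothetical sketch, not the implementation in this patch. */
static int dm_get_numa_node(void)
{
	int node = READ_ONCE(dm_numa_node);

	if (node < 0 || node >= MAX_NUMNODES || !node_online(node))
		return NUMA_NO_NODE;	/* no preference: fall back to any node */

	return node;
}

Because the parameter is declared S_IRUGO | S_IWUSR it should be adjustable via /sys/module/dm_mod/parameters/dm_numa_node, though only devices created afterwards would pick the new value up.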
*/ - if (dm_table_request_based(t)) - stop_queue(q); + if (dm_table_request_based(t)) { + dm_stop_queue(q); + /* + * Leverage the fact that request-based DM targets are + * immutable singletons and establish md->immutable_target + * - used to optimize both dm_request_fn and dm_mq_queue_rq + */ + md->immutable_target = dm_table_get_immutable_target(t); + } __bind_mempools(md, t); old_map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock)); - rcu_assign_pointer(md->map, t); + rcu_assign_pointer(md->map, (void *)t); md->immutable_target_type = dm_table_get_immutable_target_type(t); dm_table_set_restrictions(t, q, limits); @@ -2574,7 +2635,6 @@ void dm_set_md_type(struct mapped_device *md, unsigned type) unsigned dm_get_md_type(struct mapped_device *md) { - BUG_ON(!mutex_is_locked(&md->type_lock)); return md->type; } @@ -2594,7 +2654,7 @@ struct queue_limits *dm_get_queue_limits(struct mapped_device *md) } EXPORT_SYMBOL_GPL(dm_get_queue_limits); -static void init_rq_based_worker_thread(struct mapped_device *md) +static void dm_old_init_rq_based_worker_thread(struct mapped_device *md) { /* Initialize the request-based DM worker thread */ init_kthread_worker(&md->kworker); @@ -2603,26 +2663,22 @@ static void init_rq_based_worker_thread(struct mapped_device *md) } /* - * Fully initialize a request-based queue (->elevator, ->request_fn, etc). + * Fully initialize a .request_fn request-based queue. */ -static int dm_init_request_based_queue(struct mapped_device *md) +static int dm_old_init_request_queue(struct mapped_device *md) { - struct request_queue *q = NULL; - /* Fully initialize the queue */ - q = blk_init_allocated_queue(md->queue, dm_request_fn, NULL); - if (!q) + if (!blk_init_allocated_queue(md->queue, dm_request_fn, NULL)) return -EINVAL; /* disable dm_request_fn's merge heuristic by default */ md->seq_rq_merge_deadline_usecs = 0; - md->queue = q; - dm_init_old_md_queue(md); + dm_init_normal_md_queue(md); blk_queue_softirq_done(md->queue, dm_softirq_done); - blk_queue_prep_rq(md->queue, dm_prep_fn); + blk_queue_prep_rq(md->queue, dm_old_prep_fn); - init_rq_based_worker_thread(md); + dm_old_init_rq_based_worker_thread(md); elv_register_queue(md->queue); @@ -2642,6 +2698,11 @@ static int dm_mq_init_request(void *data, struct request *rq, */ tio->md = md; + if (md->init_tio_pdu) { + /* target-specific per-io data is immediately after the tio */ + tio->info.ptr = tio + 1; + } + return 0; } @@ -2651,28 +2712,15 @@ static int dm_mq_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq = bd->rq; struct dm_rq_target_io *tio = blk_mq_rq_to_pdu(rq); struct mapped_device *md = tio->md; - int srcu_idx; - struct dm_table *map = dm_get_live_table(md, &srcu_idx); - struct dm_target *ti; - sector_t pos; + struct dm_target *ti = md->immutable_target; - /* always use block 0 to find the target for flushes for now */ - pos = 0; - if (!(rq->cmd_flags & REQ_FLUSH)) - pos = blk_rq_pos(rq); + if (unlikely(!ti)) { + int srcu_idx; + struct dm_table *map = dm_get_live_table(md, &srcu_idx); - ti = dm_table_find_target(map, pos); - if (!dm_target_is_valid(ti)) { + ti = dm_table_find_target(map, 0); dm_put_live_table(md, srcu_idx); - DMERR_LIMIT("request attempted access beyond the end of device"); - /* - * Must perform setup, that rq_completed() requires, - * before returning BLK_MQ_RQ_QUEUE_ERROR - */ - dm_start_request(md, rq); - return BLK_MQ_RQ_QUEUE_ERROR; } - dm_put_live_table(md, srcu_idx); if (ti->type->busy && ti->type->busy(ti)) return BLK_MQ_RQ_QUEUE_BUSY; @@ -2688,20 +2736,12 @@ 
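The __bind() hunk above is what the simplified fast paths rely on: a request-based table has exactly one target, that target is immutable for the table's lifetime, and it starts at sector 0, so it can be resolved once at bind time rather than per request under the live-table SRCU lock. The consumer side, as now used by both dm_request_fn() and dm_mq_queue_rq(), boils down to:

	struct dm_target *ti = md->immutable_target;

	if (unlikely(!ti)) {
		/* no cached singleton: fall back to a single table lookup */
		int srcu_idx;
		struct dm_table *map = dm_get_live_table(md, &srcu_idx);

		ti = dm_table_find_target(map, 0);	/* sector 0 is sufficient */
		dm_put_live_table(md, srcu_idx);
	}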
static int dm_mq_queue_rq(struct blk_mq_hw_ctx *hctx, */ tio->ti = ti; - /* Clone the request if underlying devices aren't blk-mq */ - if (dm_table_get_type(map) == DM_TYPE_REQUEST_BASED) { - /* clone request is allocated at the end of the pdu */ - tio->clone = (void *)blk_mq_rq_to_pdu(rq) + sizeof(struct dm_rq_target_io); - (void) clone_rq(rq, md, tio, GFP_ATOMIC); - queue_kthread_work(&md->kworker, &tio->work); - } else { - /* Direct call is fine since .queue_rq allows allocations */ - if (map_request(tio, rq, md) == DM_MAPIO_REQUEUE) { - /* Undo dm_start_request() before requeuing */ - rq_end_stats(md, rq); - rq_completed(md, rq_data_dir(rq), false); - return BLK_MQ_RQ_QUEUE_BUSY; - } + /* Direct call is fine since .queue_rq allows allocations */ + if (map_request(tio, rq, md) == DM_MAPIO_REQUEUE) { + /* Undo dm_start_request() before requeuing */ + rq_end_stats(md, rq); + rq_completed(md, rq_data_dir(rq), false); + return BLK_MQ_RQ_QUEUE_BUSY; } return BLK_MQ_RQ_QUEUE_OK; @@ -2714,47 +2754,56 @@ static struct blk_mq_ops dm_mq_ops = { .init_request = dm_mq_init_request, }; -static int dm_init_request_based_blk_mq_queue(struct mapped_device *md) +static int dm_mq_init_request_queue(struct mapped_device *md, + struct dm_target *immutable_tgt) { - unsigned md_type = dm_get_md_type(md); struct request_queue *q; int err; - memset(&md->tag_set, 0, sizeof(md->tag_set)); - md->tag_set.ops = &dm_mq_ops; - md->tag_set.queue_depth = BLKDEV_MAX_RQ; - md->tag_set.numa_node = NUMA_NO_NODE; - md->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE; - md->tag_set.nr_hw_queues = 1; - if (md_type == DM_TYPE_REQUEST_BASED) { - /* make the memory for non-blk-mq clone part of the pdu */ - md->tag_set.cmd_size = sizeof(struct dm_rq_target_io) + sizeof(struct request); - } else - md->tag_set.cmd_size = sizeof(struct dm_rq_target_io); - md->tag_set.driver_data = md; - - err = blk_mq_alloc_tag_set(&md->tag_set); + if (dm_get_md_type(md) == DM_TYPE_REQUEST_BASED) { + DMERR("request-based dm-mq may only be stacked on blk-mq device(s)"); + return -EINVAL; + } + + md->tag_set = kzalloc_node(sizeof(struct blk_mq_tag_set), GFP_KERNEL, md->numa_node_id); + if (!md->tag_set) + return -ENOMEM; + + md->tag_set->ops = &dm_mq_ops; + md->tag_set->queue_depth = dm_get_blk_mq_queue_depth(); + md->tag_set->numa_node = md->numa_node_id; + md->tag_set->flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE; + md->tag_set->nr_hw_queues = dm_get_blk_mq_nr_hw_queues(); + md->tag_set->driver_data = md; + + md->tag_set->cmd_size = sizeof(struct dm_rq_target_io); + if (immutable_tgt && immutable_tgt->per_io_data_size) { + /* any target-specific per-io data is immediately after the tio */ + md->tag_set->cmd_size += immutable_tgt->per_io_data_size; + md->init_tio_pdu = true; + } + + err = blk_mq_alloc_tag_set(md->tag_set); if (err) - return err; + goto out_kfree_tag_set; - q = blk_mq_init_allocated_queue(&md->tag_set, md->queue); + q = blk_mq_init_allocated_queue(md->tag_set, md->queue); if (IS_ERR(q)) { err = PTR_ERR(q); goto out_tag_set; } - md->queue = q; dm_init_md_queue(md); /* backfill 'mq' sysfs registration normally done in blk_register_queue */ blk_mq_register_disk(md->disk); - if (md_type == DM_TYPE_REQUEST_BASED) - init_rq_based_worker_thread(md); - return 0; out_tag_set: - blk_mq_free_tag_set(&md->tag_set); + blk_mq_free_tag_set(md->tag_set); +out_kfree_tag_set: + kfree(md->tag_set); + return err; } @@ -2769,28 +2818,28 @@ static unsigned filter_md_type(unsigned type, struct mapped_device *md) /* * Setup the DM device's 
queue based on md's type */ -int dm_setup_md_queue(struct mapped_device *md) +int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t) { int r; unsigned md_type = filter_md_type(dm_get_md_type(md), md); switch (md_type) { case DM_TYPE_REQUEST_BASED: - r = dm_init_request_based_queue(md); + r = dm_old_init_request_queue(md); if (r) { - DMWARN("Cannot initialize queue for request-based mapped device"); + DMERR("Cannot initialize queue for request-based mapped device"); return r; } break; case DM_TYPE_MQ_REQUEST_BASED: - r = dm_init_request_based_blk_mq_queue(md); + r = dm_mq_init_request_queue(md, dm_table_get_immutable_target(t)); if (r) { - DMWARN("Cannot initialize queue for request-based blk-mq mapped device"); + DMERR("Cannot initialize queue for request-based dm-mq mapped device"); return r; } break; case DM_TYPE_BIO_BASED: - dm_init_old_md_queue(md); + dm_init_normal_md_queue(md); blk_queue_make_request(md->queue, dm_make_request); /* * DM handles splitting bios as needed. Free the bio_split bioset @@ -3133,7 +3182,7 @@ static int __dm_suspend(struct mapped_device *md, struct dm_table *map, * dm defers requests to md->wq from md->queue. */ if (dm_request_based(md)) { - stop_queue(md->queue); + dm_stop_queue(md->queue); if (md->kworker_task) flush_kthread_worker(&md->kworker); } @@ -3157,7 +3206,7 @@ static int __dm_suspend(struct mapped_device *md, struct dm_table *map, dm_queue_flush(md); if (dm_request_based(md)) - start_queue(md->queue); + dm_start_queue(md->queue); unlock_fs(md); dm_table_presuspend_undo_targets(map); @@ -3236,7 +3285,7 @@ static int __dm_resume(struct mapped_device *md, struct dm_table *map) * Request-based dm is queueing the deferred I/Os in its request_queue. */ if (dm_request_based(md)) - start_queue(md->queue); + dm_start_queue(md->queue); unlock_fs(md); @@ -3482,9 +3531,9 @@ int dm_noflush_suspending(struct dm_target *ti) EXPORT_SYMBOL_GPL(dm_noflush_suspending); struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, unsigned type, - unsigned integrity, unsigned per_bio_data_size) + unsigned integrity, unsigned per_io_data_size) { - struct dm_md_mempools *pools = kzalloc(sizeof(*pools), GFP_KERNEL); + struct dm_md_mempools *pools = kzalloc_node(sizeof(*pools), GFP_KERNEL, md->numa_node_id); struct kmem_cache *cachep = NULL; unsigned int pool_size = 0; unsigned int front_pad; @@ -3498,7 +3547,7 @@ struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, unsigned t case DM_TYPE_BIO_BASED: cachep = _io_cache; pool_size = dm_get_reserved_bio_based_ios(); - front_pad = roundup(per_bio_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone); + front_pad = roundup(per_io_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone); break; case DM_TYPE_REQUEST_BASED: cachep = _rq_tio_cache; @@ -3511,8 +3560,7 @@ struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, unsigned t if (!pool_size) pool_size = dm_get_reserved_rq_based_ios(); front_pad = offsetof(struct dm_rq_clone_bio_info, clone); - /* per_bio_data_size is not used. See __bind_mempools(). 
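With per_bio_data_size renamed to per_io_data_size, the same target-supplied size now feeds both I/O models: for bio-based tables it still becomes bioset front padding here in dm_alloc_md_mempools() (the rounded-up private area sits in front of the dm_target_io whose tail embeds the clone bio), and for dm-mq it is added to the blk-mq cmd_size as shown earlier. A hedged sketch of the target side for the bio-based case; the target name and its private struct are invented for illustration:

#include <linux/device-mapper.h>
#include <linux/jiffies.h>

struct example_per_io {			/* hypothetical per-I/O bookkeeping */
	sector_t start_sector;
	unsigned long start_jiffies;
};

static int example_ctr(struct dm_target *ti, unsigned argc, char **argv)
{
	ti->per_io_data_size = sizeof(struct example_per_io);
	return 0;
}

static int example_map(struct dm_target *ti, struct bio *bio)
{
	struct example_per_io *pio =
		dm_per_bio_data(bio, sizeof(struct example_per_io));

	pio->start_sector = bio->bi_iter.bi_sector;
	pio->start_jiffies = jiffies;

	/* a real target would remap bio->bi_bdev before returning this */
	return DM_MAPIO_REMAPPED;
}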
*/ - WARN_ON(per_bio_data_size != 0); + /* per_io_data_size is used for blk-mq pdu at queue allocation */ break; default: BUG(); @@ -3554,15 +3602,14 @@ void dm_free_md_mempools(struct dm_md_mempools *pools) } static int dm_pr_register(struct block_device *bdev, u64 old_key, u64 new_key, - u32 flags) + u32 flags) { struct mapped_device *md = bdev->bd_disk->private_data; const struct pr_ops *ops; - struct dm_target *tgt; fmode_t mode; - int srcu_idx, r; + int r; - r = dm_get_live_table_for_ioctl(md, &tgt, &bdev, &mode, &srcu_idx); + r = dm_grab_bdev_for_ioctl(md, &bdev, &mode); if (r < 0) return r; @@ -3572,20 +3619,19 @@ static int dm_pr_register(struct block_device *bdev, u64 old_key, u64 new_key, else r = -EOPNOTSUPP; - dm_put_live_table(md, srcu_idx); + bdput(bdev); return r; } static int dm_pr_reserve(struct block_device *bdev, u64 key, enum pr_type type, - u32 flags) + u32 flags) { struct mapped_device *md = bdev->bd_disk->private_data; const struct pr_ops *ops; - struct dm_target *tgt; fmode_t mode; - int srcu_idx, r; + int r; - r = dm_get_live_table_for_ioctl(md, &tgt, &bdev, &mode, &srcu_idx); + r = dm_grab_bdev_for_ioctl(md, &bdev, &mode); if (r < 0) return r; @@ -3595,7 +3641,7 @@ static int dm_pr_reserve(struct block_device *bdev, u64 key, enum pr_type type, else r = -EOPNOTSUPP; - dm_put_live_table(md, srcu_idx); + bdput(bdev); return r; } @@ -3603,11 +3649,10 @@ static int dm_pr_release(struct block_device *bdev, u64 key, enum pr_type type) { struct mapped_device *md = bdev->bd_disk->private_data; const struct pr_ops *ops; - struct dm_target *tgt; fmode_t mode; - int srcu_idx, r; + int r; - r = dm_get_live_table_for_ioctl(md, &tgt, &bdev, &mode, &srcu_idx); + r = dm_grab_bdev_for_ioctl(md, &bdev, &mode); if (r < 0) return r; @@ -3617,20 +3662,19 @@ static int dm_pr_release(struct block_device *bdev, u64 key, enum pr_type type) else r = -EOPNOTSUPP; - dm_put_live_table(md, srcu_idx); + bdput(bdev); return r; } static int dm_pr_preempt(struct block_device *bdev, u64 old_key, u64 new_key, - enum pr_type type, bool abort) + enum pr_type type, bool abort) { struct mapped_device *md = bdev->bd_disk->private_data; const struct pr_ops *ops; - struct dm_target *tgt; fmode_t mode; - int srcu_idx, r; + int r; - r = dm_get_live_table_for_ioctl(md, &tgt, &bdev, &mode, &srcu_idx); + r = dm_grab_bdev_for_ioctl(md, &bdev, &mode); if (r < 0) return r; @@ -3640,7 +3684,7 @@ static int dm_pr_preempt(struct block_device *bdev, u64 old_key, u64 new_key, else r = -EOPNOTSUPP; - dm_put_live_table(md, srcu_idx); + bdput(bdev); return r; } @@ -3648,11 +3692,10 @@ static int dm_pr_clear(struct block_device *bdev, u64 key) { struct mapped_device *md = bdev->bd_disk->private_data; const struct pr_ops *ops; - struct dm_target *tgt; fmode_t mode; - int srcu_idx, r; + int r; - r = dm_get_live_table_for_ioctl(md, &tgt, &bdev, &mode, &srcu_idx); + r = dm_grab_bdev_for_ioctl(md, &bdev, &mode); if (r < 0) return r; @@ -3662,7 +3705,7 @@ static int dm_pr_clear(struct block_device *bdev, u64 key) else r = -EOPNOTSUPP; - dm_put_live_table(md, srcu_idx); + bdput(bdev); return r; } @@ -3701,6 +3744,15 @@ MODULE_PARM_DESC(reserved_rq_based_ios, "Reserved IOs in request-based mempools" module_param(use_blk_mq, bool, S_IRUGO | S_IWUSR); MODULE_PARM_DESC(use_blk_mq, "Use block multiqueue for request-based DM devices"); +module_param(dm_mq_nr_hw_queues, uint, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(dm_mq_nr_hw_queues, "Number of hardware queues for request-based dm-mq devices"); + +module_param(dm_mq_queue_depth, uint, 
S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(dm_mq_queue_depth, "Queue depth for request-based dm-mq devices"); + +module_param(dm_numa_node, int, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(dm_numa_node, "NUMA node for DM device memory allocations"); + MODULE_DESCRIPTION(DM_NAME " driver"); MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>"); MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm.h b/drivers/md/dm.h index 7edcf97dfa5a..13a758ec0f88 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -73,6 +73,8 @@ int dm_table_resume_targets(struct dm_table *t); int dm_table_any_congested(struct dm_table *t, int bdi_bits); unsigned dm_table_get_type(struct dm_table *t); struct target_type *dm_table_get_immutable_target_type(struct dm_table *t); +struct dm_target *dm_table_get_immutable_target(struct dm_table *t); +struct dm_target *dm_table_get_wildcard_target(struct dm_table *t); bool dm_table_request_based(struct dm_table *t); bool dm_table_mq_request_based(struct dm_table *t); void dm_table_free_md_mempools(struct dm_table *t); @@ -84,7 +86,7 @@ void dm_set_md_type(struct mapped_device *md, unsigned type); unsigned dm_get_md_type(struct mapped_device *md); struct target_type *dm_get_immutable_target_type(struct mapped_device *md); -int dm_setup_md_queue(struct mapped_device *md); +int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t); /* * To check the return value from dm_table_find_target(). diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index ec1c61c87d89..0830c9e86f0d 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -124,6 +124,8 @@ struct dm_dev { char name[16]; }; +dev_t dm_get_dev_t(const char *path); + /* * Constructors should call these functions to ensure destination devices * are opened/closed correctly. @@ -190,6 +192,13 @@ struct target_type { #define dm_target_is_immutable(type) ((type)->features & DM_TARGET_IMMUTABLE) /* + * Indicates that a target may replace any target; even immutable targets. + * .map, .map_rq, .clone_and_map_rq and .release_clone_rq are all defined. + */ +#define DM_TARGET_WILDCARD 0x00000008 +#define dm_target_is_wildcard(type) ((type)->features & DM_TARGET_WILDCARD) + +/* * Some targets need to be sent the same WRITE bio severals times so * that they can send copies of it to different devices. This function * examines any supplied bio and returns the number of copies of it the @@ -231,10 +240,10 @@ struct dm_target { unsigned num_write_same_bios; /* - * The minimum number of extra bytes allocated in each bio for the - * target to use. dm_per_bio_data returns the data location. + * The minimum number of extra bytes allocated in each io for the + * target to use. */ - unsigned per_bio_data_size; + unsigned per_io_data_size; /* * If defined, this function is called to find out how many |