summaryrefslogtreecommitdiffstats
path: root/drivers/md
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2013-11-14 12:35:48 +0900
committerLinus Torvalds <torvalds@linux-foundation.org>2013-11-14 12:35:48 +0900
commit7f2dc5c4bcbff035b0d03f7aa78a182664b21e47 (patch)
treef4a5ff92305fbf70f1cba3a1ced0f492b27aab74 /drivers/md
parent82cb6acea4d10fa4080612c58deb63993f558e5a (diff)
parent7b6b2bc98c0303b7f043ad5b35906f833e56308d (diff)
downloadlinux-7f2dc5c4bcbff035b0d03f7aa78a182664b21e47.tar.bz2
Merge tag 'dm-3.13-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm
Pull device mapper changes from Mike Snitzer: "A set of device-mapper changes for 3.13. Improve reliability of buffer allocations for dm messages with a small number of arguments, a couple path group initialization fixes for dm multipath, a fix for resizing a dm array, various fixes and optimizations for dm cache, a fix for device mapper's Kconfig menu indentation. Features added include: - dm crypt support for activating legacy CBC TrueCrypt containers (useful for forensics of these old TCRYPT containers) - reduced dm-cache memory requirements for each block in the cache - basic support for shrinking a dm-cache's cache (fast) device - most notably, dm-cache support for managing cache coherency when deploying dm-cache with sophisticated origin volumes (that support hardware snapshots and/or clustering): these changes come in the form of a new passthrough operation mode and a cache block invalidation interface" * tag 'dm-3.13-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm: (32 commits) dm cache: resolve small nits and improve Documentation dm cache: add cache block invalidation support dm cache: add remove_cblock method to policy interface dm cache policy mq: reduce memory requirements dm cache metadata: check the metadata version when reading the superblock dm cache: add passthrough mode dm cache: cache shrinking support dm cache: promotion optimisation for writes dm cache: be much more aggressive about promoting writes to discarded blocks dm cache policy mq: implement writeback_work() and mq_{set,clear}_dirty() dm cache: optimize commit_if_needed dm space map disk: optimise sm_disk_dec_block MAINTAINERS: add reference to device-mapper's linux-dm.git tree dm: fix Kconfig menu indentation dm: allow remove to be deferred dm table: print error on preresume failure dm crypt: add TCW IV mode for old CBC TCRYPT containers dm crypt: properly handle extra key string in initialization dm cache: log error message if dm_kcopyd_copy() fails dm cache: use cell_defer() boolean argument consistently ...
Diffstat (limited to 'drivers/md')
-rw-r--r--drivers/md/Kconfig22
-rw-r--r--drivers/md/dm-cache-metadata.c104
-rw-r--r--drivers/md/dm-cache-metadata.h5
-rw-r--r--drivers/md/dm-cache-policy-internal.h7
-rw-r--r--drivers/md/dm-cache-policy-mq.c681
-rw-r--r--drivers/md/dm-cache-policy.c4
-rw-r--r--drivers/md/dm-cache-policy.h21
-rw-r--r--drivers/md/dm-cache-target.c687
-rw-r--r--drivers/md/dm-crypt.c214
-rw-r--r--drivers/md/dm-ioctl.c36
-rw-r--r--drivers/md/dm-mpath.c34
-rw-r--r--drivers/md/dm-table.c23
-rw-r--r--drivers/md/dm.c47
-rw-r--r--drivers/md/dm.h13
-rw-r--r--drivers/md/persistent-data/dm-array.c5
-rw-r--r--drivers/md/persistent-data/dm-space-map-disk.c18
16 files changed, 1466 insertions, 455 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 30b426ed744b..f2ccbc3b9fe4 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -297,6 +297,17 @@ config DM_MIRROR
Allow volume managers to mirror logical volumes, also
needed for live data migration tools such as 'pvmove'.
+config DM_LOG_USERSPACE
+ tristate "Mirror userspace logging"
+ depends on DM_MIRROR && NET
+ select CONNECTOR
+ ---help---
+ The userspace logging module provides a mechanism for
+ relaying the dm-dirty-log API to userspace. Log designs
+ which are more suited to userspace implementation (e.g.
+ shared storage logs) or experimental logs can be implemented
+ by leveraging this framework.
+
config DM_RAID
tristate "RAID 1/4/5/6/10 target"
depends on BLK_DEV_DM
@@ -323,17 +334,6 @@ config DM_RAID
RAID-5, RAID-6 distributes the syndromes across the drives
in one of the available parity distribution methods.
-config DM_LOG_USERSPACE
- tristate "Mirror userspace logging"
- depends on DM_MIRROR && NET
- select CONNECTOR
- ---help---
- The userspace logging module provides a mechanism for
- relaying the dm-dirty-log API to userspace. Log designs
- which are more suited to userspace implementation (e.g.
- shared storage logs) or experimental logs can be implemented
- by leveraging this framework.
-
config DM_ZERO
tristate "Zero target"
depends on BLK_DEV_DM
diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c
index 1af7255bbffb..9ef0752e8a08 100644
--- a/drivers/md/dm-cache-metadata.c
+++ b/drivers/md/dm-cache-metadata.c
@@ -20,7 +20,13 @@
#define CACHE_SUPERBLOCK_MAGIC 06142003
#define CACHE_SUPERBLOCK_LOCATION 0
-#define CACHE_VERSION 1
+
+/*
+ * defines a range of metadata versions that this module can handle.
+ */
+#define MIN_CACHE_VERSION 1
+#define MAX_CACHE_VERSION 1
+
#define CACHE_METADATA_CACHE_SIZE 64
/*
@@ -134,6 +140,18 @@ static void sb_prepare_for_write(struct dm_block_validator *v,
SUPERBLOCK_CSUM_XOR));
}
+static int check_metadata_version(struct cache_disk_superblock *disk_super)
+{
+ uint32_t metadata_version = le32_to_cpu(disk_super->version);
+ if (metadata_version < MIN_CACHE_VERSION || metadata_version > MAX_CACHE_VERSION) {
+ DMERR("Cache metadata version %u found, but only versions between %u and %u supported.",
+ metadata_version, MIN_CACHE_VERSION, MAX_CACHE_VERSION);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
static int sb_check(struct dm_block_validator *v,
struct dm_block *b,
size_t sb_block_size)
@@ -164,7 +182,7 @@ static int sb_check(struct dm_block_validator *v,
return -EILSEQ;
}
- return 0;
+ return check_metadata_version(disk_super);
}
static struct dm_block_validator sb_validator = {
@@ -198,7 +216,7 @@ static int superblock_lock(struct dm_cache_metadata *cmd,
/*----------------------------------------------------------------*/
-static int __superblock_all_zeroes(struct dm_block_manager *bm, int *result)
+static int __superblock_all_zeroes(struct dm_block_manager *bm, bool *result)
{
int r;
unsigned i;
@@ -214,10 +232,10 @@ static int __superblock_all_zeroes(struct dm_block_manager *bm, int *result)
return r;
data_le = dm_block_data(b);
- *result = 1;
+ *result = true;
for (i = 0; i < sb_block_size; i++) {
if (data_le[i] != zero) {
- *result = 0;
+ *result = false;
break;
}
}
@@ -270,7 +288,7 @@ static int __write_initial_superblock(struct dm_cache_metadata *cmd)
disk_super->flags = 0;
memset(disk_super->uuid, 0, sizeof(disk_super->uuid));
disk_super->magic = cpu_to_le64(CACHE_SUPERBLOCK_MAGIC);
- disk_super->version = cpu_to_le32(CACHE_VERSION);
+ disk_super->version = cpu_to_le32(MAX_CACHE_VERSION);
memset(disk_super->policy_name, 0, sizeof(disk_super->policy_name));
memset(disk_super->policy_version, 0, sizeof(disk_super->policy_version));
disk_super->policy_hint_size = 0;
@@ -411,7 +429,8 @@ bad:
static int __open_or_format_metadata(struct dm_cache_metadata *cmd,
bool format_device)
{
- int r, unformatted;
+ int r;
+ bool unformatted = false;
r = __superblock_all_zeroes(cmd->bm, &unformatted);
if (r)
@@ -666,19 +685,85 @@ void dm_cache_metadata_close(struct dm_cache_metadata *cmd)
kfree(cmd);
}
+/*
+ * Checks that the given cache block is either unmapped or clean.
+ */
+static int block_unmapped_or_clean(struct dm_cache_metadata *cmd, dm_cblock_t b,
+ bool *result)
+{
+ int r;
+ __le64 value;
+ dm_oblock_t ob;
+ unsigned flags;
+
+ r = dm_array_get_value(&cmd->info, cmd->root, from_cblock(b), &value);
+ if (r) {
+ DMERR("block_unmapped_or_clean failed");
+ return r;
+ }
+
+ unpack_value(value, &ob, &flags);
+ *result = !((flags & M_VALID) && (flags & M_DIRTY));
+
+ return 0;
+}
+
+static int blocks_are_unmapped_or_clean(struct dm_cache_metadata *cmd,
+ dm_cblock_t begin, dm_cblock_t end,
+ bool *result)
+{
+ int r;
+ *result = true;
+
+ while (begin != end) {
+ r = block_unmapped_or_clean(cmd, begin, result);
+ if (r)
+ return r;
+
+ if (!*result) {
+ DMERR("cache block %llu is dirty",
+ (unsigned long long) from_cblock(begin));
+ return 0;
+ }
+
+ begin = to_cblock(from_cblock(begin) + 1);
+ }
+
+ return 0;
+}
+
int dm_cache_resize(struct dm_cache_metadata *cmd, dm_cblock_t new_cache_size)
{
int r;
+ bool clean;
__le64 null_mapping = pack_value(0, 0);
down_write(&cmd->root_lock);
__dm_bless_for_disk(&null_mapping);
+
+ if (from_cblock(new_cache_size) < from_cblock(cmd->cache_blocks)) {
+ r = blocks_are_unmapped_or_clean(cmd, new_cache_size, cmd->cache_blocks, &clean);
+ if (r) {
+ __dm_unbless_for_disk(&null_mapping);
+ goto out;
+ }
+
+ if (!clean) {
+ DMERR("unable to shrink cache due to dirty blocks");
+ r = -EINVAL;
+ __dm_unbless_for_disk(&null_mapping);
+ goto out;
+ }
+ }
+
r = dm_array_resize(&cmd->info, cmd->root, from_cblock(cmd->cache_blocks),
from_cblock(new_cache_size),
&null_mapping, &cmd->root);
if (!r)
cmd->cache_blocks = new_cache_size;
cmd->changed = true;
+
+out:
up_write(&cmd->root_lock);
return r;
@@ -1182,3 +1267,8 @@ int dm_cache_save_hint(struct dm_cache_metadata *cmd, dm_cblock_t cblock,
return r;
}
+
+int dm_cache_metadata_all_clean(struct dm_cache_metadata *cmd, bool *result)
+{
+ return blocks_are_unmapped_or_clean(cmd, 0, cmd->cache_blocks, result);
+}
diff --git a/drivers/md/dm-cache-metadata.h b/drivers/md/dm-cache-metadata.h
index f45cef21f3d0..cd906f14f98d 100644
--- a/drivers/md/dm-cache-metadata.h
+++ b/drivers/md/dm-cache-metadata.h
@@ -137,6 +137,11 @@ int dm_cache_begin_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy *
int dm_cache_save_hint(struct dm_cache_metadata *cmd,
dm_cblock_t cblock, uint32_t hint);
+/*
+ * Query method. Are all the blocks in the cache clean?
+ */
+int dm_cache_metadata_all_clean(struct dm_cache_metadata *cmd, bool *result);
+
/*----------------------------------------------------------------*/
#endif /* DM_CACHE_METADATA_H */
diff --git a/drivers/md/dm-cache-policy-internal.h b/drivers/md/dm-cache-policy-internal.h
index 0928abdc49f0..2256a1f24f73 100644
--- a/drivers/md/dm-cache-policy-internal.h
+++ b/drivers/md/dm-cache-policy-internal.h
@@ -61,7 +61,12 @@ static inline int policy_writeback_work(struct dm_cache_policy *p,
static inline void policy_remove_mapping(struct dm_cache_policy *p, dm_oblock_t oblock)
{
- return p->remove_mapping(p, oblock);
+ p->remove_mapping(p, oblock);
+}
+
+static inline int policy_remove_cblock(struct dm_cache_policy *p, dm_cblock_t cblock)
+{
+ return p->remove_cblock(p, cblock);
}
static inline void policy_force_mapping(struct dm_cache_policy *p,
diff --git a/drivers/md/dm-cache-policy-mq.c b/drivers/md/dm-cache-policy-mq.c
index 4296155090b2..416b7b752a6e 100644
--- a/drivers/md/dm-cache-policy-mq.c
+++ b/drivers/md/dm-cache-policy-mq.c
@@ -26,19 +26,6 @@ static unsigned next_power(unsigned n, unsigned min)
/*----------------------------------------------------------------*/
-static unsigned long *alloc_bitset(unsigned nr_entries)
-{
- size_t s = sizeof(unsigned long) * dm_div_up(nr_entries, BITS_PER_LONG);
- return vzalloc(s);
-}
-
-static void free_bitset(unsigned long *bits)
-{
- vfree(bits);
-}
-
-/*----------------------------------------------------------------*/
-
/*
* Large, sequential ios are probably better left on the origin device since
* spindles tend to have good bandwidth.
@@ -151,6 +138,21 @@ static void queue_init(struct queue *q)
}
/*
+ * Checks to see if the queue is empty.
+ * FIXME: reduce cpu usage.
+ */
+static bool queue_empty(struct queue *q)
+{
+ unsigned i;
+
+ for (i = 0; i < NR_QUEUE_LEVELS; i++)
+ if (!list_empty(q->qs + i))
+ return false;
+
+ return true;
+}
+
+/*
* Insert an entry to the back of the given level.
*/
static void queue_push(struct queue *q, unsigned level, struct list_head *elt)
@@ -218,17 +220,116 @@ struct entry {
struct hlist_node hlist;
struct list_head list;
dm_oblock_t oblock;
- dm_cblock_t cblock; /* valid iff in_cache */
/*
* FIXME: pack these better
*/
- bool in_cache:1;
+ bool dirty:1;
unsigned hit_count;
unsigned generation;
unsigned tick;
};
+/*
+ * Rather than storing the cblock in an entry, we allocate all entries in
+ * an array, and infer the cblock from the entry position.
+ *
+ * Free entries are linked together into a list.
+ */
+struct entry_pool {
+ struct entry *entries, *entries_end;
+ struct list_head free;
+ unsigned nr_allocated;
+};
+
+static int epool_init(struct entry_pool *ep, unsigned nr_entries)
+{
+ unsigned i;
+
+ ep->entries = vzalloc(sizeof(struct entry) * nr_entries);
+ if (!ep->entries)
+ return -ENOMEM;
+
+ ep->entries_end = ep->entries + nr_entries;
+
+ INIT_LIST_HEAD(&ep->free);
+ for (i = 0; i < nr_entries; i++)
+ list_add(&ep->entries[i].list, &ep->free);
+
+ ep->nr_allocated = 0;
+
+ return 0;
+}
+
+static void epool_exit(struct entry_pool *ep)
+{
+ vfree(ep->entries);
+}
+
+static struct entry *alloc_entry(struct entry_pool *ep)
+{
+ struct entry *e;
+
+ if (list_empty(&ep->free))
+ return NULL;
+
+ e = list_entry(list_pop(&ep->free), struct entry, list);
+ INIT_LIST_HEAD(&e->list);
+ INIT_HLIST_NODE(&e->hlist);
+ ep->nr_allocated++;
+
+ return e;
+}
+
+/*
+ * This assumes the cblock hasn't already been allocated.
+ */
+static struct entry *alloc_particular_entry(struct entry_pool *ep, dm_cblock_t cblock)
+{
+ struct entry *e = ep->entries + from_cblock(cblock);
+ list_del(&e->list);
+
+ INIT_LIST_HEAD(&e->list);
+ INIT_HLIST_NODE(&e->hlist);
+ ep->nr_allocated++;
+
+ return e;
+}
+
+static void free_entry(struct entry_pool *ep, struct entry *e)
+{
+ BUG_ON(!ep->nr_allocated);
+ ep->nr_allocated--;
+ INIT_HLIST_NODE(&e->hlist);
+ list_add(&e->list, &ep->free);
+}
+
+/*
+ * Returns NULL if the entry is free.
+ */
+static struct entry *epool_find(struct entry_pool *ep, dm_cblock_t cblock)
+{
+ struct entry *e = ep->entries + from_cblock(cblock);
+ return !hlist_unhashed(&e->hlist) ? e : NULL;
+}
+
+static bool epool_empty(struct entry_pool *ep)
+{
+ return list_empty(&ep->free);
+}
+
+static bool in_pool(struct entry_pool *ep, struct entry *e)
+{
+ return e >= ep->entries && e < ep->entries_end;
+}
+
+static dm_cblock_t infer_cblock(struct entry_pool *ep, struct entry *e)
+{
+ return to_cblock(e - ep->entries);
+}
+
+/*----------------------------------------------------------------*/
+
struct mq_policy {
struct dm_cache_policy policy;
@@ -238,13 +339,22 @@ struct mq_policy {
struct io_tracker tracker;
/*
- * We maintain two queues of entries. The cache proper contains
- * the currently active mappings. Whereas the pre_cache tracks
- * blocks that are being hit frequently and potential candidates
- * for promotion to the cache.
+ * Entries come from two pools, one of pre-cache entries, and one
+ * for the cache proper.
+ */
+ struct entry_pool pre_cache_pool;
+ struct entry_pool cache_pool;
+
+ /*
+ * We maintain three queues of entries. The cache proper,
+ * consisting of a clean and dirty queue, contains the currently
+ * active mappings. Whereas the pre_cache tracks blocks that
+ * are being hit frequently and potential candidates for promotion
+ * to the cache.
*/
struct queue pre_cache;
- struct queue cache;
+ struct queue cache_clean;
+ struct queue cache_dirty;
/*
* Keeps track of time, incremented by the core. We use this to
@@ -282,25 +392,6 @@ struct mq_policy {
unsigned promote_threshold;
/*
- * We need cache_size entries for the cache, and choose to have
- * cache_size entries for the pre_cache too. One motivation for
- * using the same size is to make the hit counts directly
- * comparable between pre_cache and cache.
- */
- unsigned nr_entries;
- unsigned nr_entries_allocated;
- struct list_head free;
-
- /*
- * Cache blocks may be unallocated. We store this info in a
- * bitset.
- */
- unsigned long *allocation_bitset;
- unsigned nr_cblocks_allocated;
- unsigned find_free_nr_words;
- unsigned find_free_last_word;
-
- /*
* The hash table allows us to quickly find an entry by origin
* block. Both pre_cache and cache entries are in here.
*/
@@ -310,49 +401,6 @@ struct mq_policy {
};
/*----------------------------------------------------------------*/
-/* Free/alloc mq cache entry structures. */
-static void takeout_queue(struct list_head *lh, struct queue *q)
-{
- unsigned level;
-
- for (level = 0; level < NR_QUEUE_LEVELS; level++)
- list_splice(q->qs + level, lh);
-}
-
-static void free_entries(struct mq_policy *mq)
-{
- struct entry *e, *tmp;
-
- takeout_queue(&mq->free, &mq->pre_cache);
- takeout_queue(&mq->free, &mq->cache);
-
- list_for_each_entry_safe(e, tmp, &mq->free, list)
- kmem_cache_free(mq_entry_cache, e);
-}
-
-static int alloc_entries(struct mq_policy *mq, unsigned elts)
-{
- unsigned u = mq->nr_entries;
-
- INIT_LIST_HEAD(&mq->free);
- mq->nr_entries_allocated = 0;
-
- while (u--) {
- struct entry *e = kmem_cache_zalloc(mq_entry_cache, GFP_KERNEL);
-
- if (!e) {
- free_entries(mq);
- return -ENOMEM;
- }
-
-
- list_add(&e->list, &mq->free);
- }
-
- return 0;
-}
-
-/*----------------------------------------------------------------*/
/*
* Simple hash table implementation. Should replace with the standard hash
@@ -388,96 +436,14 @@ static void hash_remove(struct entry *e)
/*----------------------------------------------------------------*/
-/*
- * Allocates a new entry structure. The memory is allocated in one lump,
- * so we just handing it out here. Returns NULL if all entries have
- * already been allocated. Cannot fail otherwise.
- */
-static struct entry *alloc_entry(struct mq_policy *mq)
-{
- struct entry *e;
-
- if (mq->nr_entries_allocated >= mq->nr_entries) {
- BUG_ON(!list_empty(&mq->free));
- return NULL;
- }
-
- e = list_entry(list_pop(&mq->free), struct entry, list);
- INIT_LIST_HEAD(&e->list);
- INIT_HLIST_NODE(&e->hlist);
-
- mq->nr_entries_allocated++;
- return e;
-}
-
-/*----------------------------------------------------------------*/
-
-/*
- * Mark cache blocks allocated or not in the bitset.
- */
-static void alloc_cblock(struct mq_policy *mq, dm_cblock_t cblock)
-{
- BUG_ON(from_cblock(cblock) > from_cblock(mq->cache_size));
- BUG_ON(test_bit(from_cblock(cblock), mq->allocation_bitset));
-
- set_bit(from_cblock(cblock), mq->allocation_bitset);
- mq->nr_cblocks_allocated++;
-}
-
-static void free_cblock(struct mq_policy *mq, dm_cblock_t cblock)
-{
- BUG_ON(from_cblock(cblock) > from_cblock(mq->cache_size));
- BUG_ON(!test_bit(from_cblock(cblock), mq->allocation_bitset));
-
- clear_bit(from_cblock(cblock), mq->allocation_bitset);
- mq->nr_cblocks_allocated--;
-}
-
static bool any_free_cblocks(struct mq_policy *mq)
{
- return mq->nr_cblocks_allocated < from_cblock(mq->cache_size);
+ return !epool_empty(&mq->cache_pool);
}
-/*
- * Fills result out with a cache block that isn't in use, or return
- * -ENOSPC. This does _not_ mark the cblock as allocated, the caller is
- * reponsible for that.
- */
-static int __find_free_cblock(struct mq_policy *mq, unsigned begin, unsigned end,
- dm_cblock_t *result, unsigned *last_word)
+static bool any_clean_cblocks(struct mq_policy *mq)
{
- int r = -ENOSPC;
- unsigned w;
-
- for (w = begin; w < end; w++) {
- /*
- * ffz is undefined if no zero exists
- */
- if (mq->allocation_bitset[w] != ~0UL) {
- *last_word = w;
- *result = to_cblock((w * BITS_PER_LONG) + ffz(mq->allocation_bitset[w]));
- if (from_cblock(*result) < from_cblock(mq->cache_size))
- r = 0;
-
- break;
- }
- }
-
- return r;
-}
-
-static int find_free_cblock(struct mq_policy *mq, dm_cblock_t *result)
-{
- int r;
-
- if (!any_free_cblocks(mq))
- return -ENOSPC;
-
- r = __find_free_cblock(mq, mq->find_free_last_word, mq->find_free_nr_words, result, &mq->find_free_last_word);
- if (r == -ENOSPC && mq->find_free_last_word)
- r = __find_free_cblock(mq, 0, mq->find_free_last_word, result, &mq->find_free_last_word);
-
- return r;
+ return !queue_empty(&mq->cache_clean);
}
/*----------------------------------------------------------------*/
@@ -496,33 +462,35 @@ static unsigned queue_level(struct entry *e)
return min((unsigned) ilog2(e->hit_count), NR_QUEUE_LEVELS - 1u);
}
+static bool in_cache(struct mq_policy *mq, struct entry *e)
+{
+ return in_pool(&mq->cache_pool, e);
+}
+
/*
* Inserts the entry into the pre_cache or the cache. Ensures the cache
- * block is marked as allocated if necc. Inserts into the hash table. Sets the
- * tick which records when the entry was last moved about.
+ * block is marked as allocated if necc. Inserts into the hash table.
+ * Sets the tick which records when the entry was last moved about.
*/
static void push(struct mq_policy *mq, struct entry *e)
{
e->tick = mq->tick;
hash_insert(mq, e);
- if (e->in_cache) {
- alloc_cblock(mq, e->cblock);
- queue_push(&mq->cache, queue_level(e), &e->list);
- } else
+ if (in_cache(mq, e))
+ queue_push(e->dirty ? &mq->cache_dirty : &mq->cache_clean,
+ queue_level(e), &e->list);
+ else
queue_push(&mq->pre_cache, queue_level(e), &e->list);
}
/*
* Removes an entry from pre_cache or cache. Removes from the hash table.
- * Frees off the cache block if necc.
*/
static void del(struct mq_policy *mq, struct entry *e)
{
queue_remove(&e->list);
hash_remove(e);
- if (e->in_cache)
- free_cblock(mq, e->cblock);
}
/*
@@ -531,14 +499,14 @@ static void del(struct mq_policy *mq, struct entry *e)
*/
static struct entry *pop(struct mq_policy *mq, struct queue *q)
{
- struct entry *e = container_of(queue_pop(q), struct entry, list);
+ struct entry *e;
+ struct list_head *h = queue_pop(q);
- if (e) {
- hash_remove(e);
+ if (!h)
+ return NULL;
- if (e->in_cache)
- free_cblock(mq, e->cblock);
- }
+ e = container_of(h, struct entry, list);
+ hash_remove(e);
return e;
}
@@ -556,7 +524,8 @@ static bool updated_this_tick(struct mq_policy *mq, struct entry *e)
* of the entries.
*
* At the moment the threshold is taken by averaging the hit counts of some
- * of the entries in the cache (the first 20 entries of the first level).
+ * of the entries in the cache (the first 20 entries across all levels in
+ * ascending order, giving preference to the clean entries at each level).
*
* We can be much cleverer than this though. For example, each promotion
* could bump up the threshold helping to prevent churn. Much more to do
@@ -571,14 +540,21 @@ static void check_generation(struct mq_policy *mq)
struct list_head *head;
struct entry *e;
- if ((mq->hit_count >= mq->generation_period) &&
- (mq->nr_cblocks_allocated == from_cblock(mq->cache_size))) {
-
+ if ((mq->hit_count >= mq->generation_period) && (epool_empty(&mq->cache_pool))) {
mq->hit_count = 0;
mq->generation++;
for (level = 0; level < NR_QUEUE_LEVELS && count < MAX_TO_AVERAGE; level++) {
- head = mq->cache.qs + level;
+ head = mq->cache_clean.qs + level;
+ list_for_each_entry(e, head, list) {
+ nr++;
+ total += e->hit_count;
+
+ if (++count >= MAX_TO_AVERAGE)
+ break;
+ }
+
+ head = mq->cache_dirty.qs + level;
list_for_each_entry(e, head, list) {
nr++;
total += e->hit_count;
@@ -631,19 +607,30 @@ static void requeue_and_update_tick(struct mq_policy *mq, struct entry *e)
* - set the hit count to a hard coded value other than 1, eg, is it better
* if it goes in at level 2?
*/
-static dm_cblock_t demote_cblock(struct mq_policy *mq, dm_oblock_t *oblock)
+static int demote_cblock(struct mq_policy *mq, dm_oblock_t *oblock)
{
- dm_cblock_t result;
- struct entry *demoted = pop(mq, &mq->cache);
+ struct entry *demoted = pop(mq, &mq->cache_clean);
+
+ if (!demoted)
+ /*
+ * We could get a block from mq->cache_dirty, but that
+ * would add extra latency to the triggering bio as it
+ * waits for the writeback. Better to not promote this
+ * time and hope there's a clean block next time this block
+ * is hit.
+ */
+ return -ENOSPC;
- BUG_ON(!demoted);
- result = demoted->cblock;
*oblock = demoted->oblock;
- demoted->in_cache = false;
- demoted->hit_count = 1;
- push(mq, demoted);
+ free_entry(&mq->cache_pool, demoted);
+
+ /*
+ * We used to put the demoted block into the pre-cache, but I think
+ * it's simpler to just let it work it's way up from zero again.
+ * Stops blocks flickering in and out of the cache.
+ */
- return result;
+ return 0;
}
/*
@@ -662,17 +649,18 @@ static dm_cblock_t demote_cblock(struct mq_policy *mq, dm_oblock_t *oblock)
static unsigned adjusted_promote_threshold(struct mq_policy *mq,
bool discarded_oblock, int data_dir)
{
- if (discarded_oblock && any_free_cblocks(mq) && data_dir == WRITE)
+ if (data_dir == READ)
+ return mq->promote_threshold + READ_PROMOTE_THRESHOLD;
+
+ if (discarded_oblock && (any_free_cblocks(mq) || any_clean_cblocks(mq))) {
/*
* We don't need to do any copying at all, so give this a
- * very low threshold. In practice this only triggers
- * during initial population after a format.
+ * very low threshold.
*/
return DISCARDED_PROMOTE_THRESHOLD;
+ }
- return data_dir == READ ?
- (mq->promote_threshold + READ_PROMOTE_THRESHOLD) :
- (mq->promote_threshold + WRITE_PROMOTE_THRESHOLD);
+ return mq->promote_threshold + WRITE_PROMOTE_THRESHOLD;
}
static bool should_promote(struct mq_policy *mq, struct entry *e,
@@ -688,34 +676,49 @@ static int cache_entry_found(struct mq_policy *mq,
{
requeue_and_update_tick(mq, e);
- if (e->in_cache) {
+ if (in_cache(mq, e)) {
result->op = POLICY_HIT;
- result->cblock = e->cblock;
+ result->cblock = infer_cblock(&mq->cache_pool, e);
}
return 0;
}
/*
- * Moves and entry from the pre_cache to the cache. The main work is
+ * Moves an entry from the pre_cache to the cache. The main work is
* finding which cache block to use.
*/
static int pre_cache_to_cache(struct mq_policy *mq, struct entry *e,
struct policy_result *result)
{
- dm_cblock_t cblock;
+ int r;
+ struct entry *new_e;
- if (find_free_cblock(mq, &cblock) == -ENOSPC) {
+ /* Ensure there's a free cblock in the cache */
+ if (epool_empty(&mq->cache_pool)) {
result->op = POLICY_REPLACE;
- cblock = demote_cblock(mq, &result->old_oblock);
+ r = demote_cblock(mq, &result->old_oblock);
+ if (r) {
+ result->op = POLICY_MISS;
+ return 0;
+ }
} else
result->op = POLICY_NEW;
- result->cblock = e->cblock = cblock;
+ new_e = alloc_entry(&mq->cache_pool);
+ BUG_ON(!new_e);
+
+ new_e->oblock = e->oblock;
+ new_e->dirty = false;
+ new_e->hit_count = e->hit_count;
+ new_e->generation = e->generation;
+ new_e->tick = e->tick;
del(mq, e);
- e->in_cache = true;
- push(mq, e);
+ free_entry(&mq->pre_cache_pool, e);
+ push(mq, new_e);
+
+ result->cblock = infer_cblock(&mq->cache_pool, new_e);
return 0;
}
@@ -743,7 +746,7 @@ static int pre_cache_entry_found(struct mq_policy *mq, struct entry *e,
static void insert_in_pre_cache(struct mq_policy *mq,
dm_oblock_t oblock)
{
- struct entry *e = alloc_entry(mq);
+ struct entry *e = alloc_entry(&mq->pre_cache_pool);
if (!e)
/*
@@ -757,7 +760,7 @@ static void insert_in_pre_cache(struct mq_policy *mq,
return;
}
- e->in_cache = false;
+ e->dirty = false;
e->oblock = oblock;
e->hit_count = 1;
e->generation = mq->generation;
@@ -767,30 +770,36 @@ static void insert_in_pre_cache(struct mq_policy *mq,
static void insert_in_cache(struct mq_policy *mq, dm_oblock_t oblock,
struct policy_result *result)
{
+ int r;
struct entry *e;
- dm_cblock_t cblock;
- if (find_free_cblock(mq, &cblock) == -ENOSPC) {
- result->op = POLICY_MISS;
- insert_in_pre_cache(mq, oblock);
- return;
- }
+ if (epool_empty(&mq->cache_pool)) {
+ result->op = POLICY_REPLACE;
+ r = demote_cblock(mq, &result->old_oblock);
+ if (unlikely(r)) {
+ result->op = POLICY_MISS;
+ insert_in_pre_cache(mq, oblock);
+ return;
+ }
- e = alloc_entry(mq);
- if (unlikely(!e)) {
- result->op = POLICY_MISS;
- return;
+ /*
+ * This will always succeed, since we've just demoted.
+ */
+ e = alloc_entry(&mq->cache_pool);
+ BUG_ON(!e);
+
+ } else {
+ e = alloc_entry(&mq->cache_pool);
+ result->op = POLICY_NEW;
}
e->oblock = oblock;
- e->cblock = cblock;
- e->in_cache = true;
+ e->dirty = false;
e->hit_count = 1;
e->generation = mq->generation;
push(mq, e);
- result->op = POLICY_NEW;
- result->cblock = e->cblock;
+ result->cblock = infer_cblock(&mq->cache_pool, e);
}
static int no_entry_found(struct mq_policy *mq, dm_oblock_t oblock,
@@ -821,13 +830,16 @@ static int map(struct mq_policy *mq, dm_oblock_t oblock,
int r = 0;
struct entry *e = hash_lookup(mq, oblock);
- if (e && e->in_cache)
+ if (e && in_cache(mq, e))
r = cache_entry_found(mq, e, result);
+
else if (iot_pattern(&mq->tracker) == PATTERN_SEQUENTIAL)
result->op = POLICY_MISS;
+
else if (e)
r = pre_cache_entry_found(mq, e, can_migrate, discarded_oblock,
data_dir, result);
+
else
r = no_entry_found(mq, oblock, can_migrate, discarded_oblock,
data_dir, result);
@@ -854,9 +866,9 @@ static void mq_destroy(struct dm_cache_policy *p)
{
struct mq_policy *mq = to_mq_policy(p);
- free_bitset(mq->allocation_bitset);
kfree(mq->table);
- free_entries(mq);
+ epool_exit(&mq->cache_pool);
+ epool_exit(&mq->pre_cache_pool);
kfree(mq);
}
@@ -904,8 +916,8 @@ static int mq_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t
return -EWOULDBLOCK;
e = hash_lookup(mq, oblock);
- if (e && e->in_cache) {
- *cblock = e->cblock;
+ if (e && in_cache(mq, e)) {
+ *cblock = infer_cblock(&mq->cache_pool, e);
r = 0;
} else
r = -ENOENT;
@@ -915,6 +927,36 @@ static int mq_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t
return r;
}
+static void __mq_set_clear_dirty(struct mq_policy *mq, dm_oblock_t oblock, bool set)
+{
+ struct entry *e;
+
+ e = hash_lookup(mq, oblock);
+ BUG_ON(!e || !in_cache(mq, e));
+
+ del(mq, e);
+ e->dirty = set;
+ push(mq, e);
+}
+
+static void mq_set_dirty(struct dm_cache_policy *p, dm_oblock_t oblock)
+{
+ struct mq_policy *mq = to_mq_policy(p);
+
+ mutex_lock(&mq->lock);
+ __mq_set_clear_dirty(mq, oblock, true);
+ mutex_unlock(&mq->lock);
+}
+
+static void mq_clear_dirty(struct dm_cache_policy *p, dm_oblock_t oblock)
+{
+ struct mq_policy *mq = to_mq_policy(p);
+
+ mutex_lock(&mq->lock);
+ __mq_set_clear_dirty(mq, oblock, false);
+ mutex_unlock(&mq->lock);
+}
+
static int mq_load_mapping(struct dm_cache_policy *p,
dm_oblock_t oblock, dm_cblock_t cblock,
uint32_t hint, bool hint_valid)
@@ -922,13 +964,9 @@ static int mq_load_mapping(struct dm_cache_policy *p,
struct mq_policy *mq = to_mq_policy(p);
struct entry *e;
- e = alloc_entry(mq);
- if (!e)
- return -ENOMEM;
-
- e->cblock = cblock;
+ e = alloc_particular_entry(&mq->cache_pool, cblock);
e->oblock = oblock;
- e->in_cache = true;
+ e->dirty = false; /* this gets corrected in a minute */
e->hit_count = hint_valid ? hint : 1;
e->generation = mq->generation;
push(mq, e);
@@ -936,57 +974,126 @@ static int mq_load_mapping(struct dm_cache_policy *p,
return 0;
}
+static int mq_save_hints(struct mq_policy *mq, struct queue *q,
+ policy_walk_fn fn, void *context)
+{
+ int r;
+ unsigned level;
+ struct entry *e;
+
+ for (level = 0; level < NR_QUEUE_LEVELS; level++)
+ list_for_each_entry(e, q->qs + level, list) {
+ r = fn(context, infer_cblock(&mq->cache_pool, e),
+ e->oblock, e->hit_count);
+ if (r)
+ return r;
+ }
+
+ return 0;
+}
+
static int mq_walk_mappings(struct dm_cache_policy *p, policy_walk_fn fn,
void *context)
{
struct mq_policy *mq = to_mq_policy(p);
int r = 0;
- struct entry *e;
- unsigned level;
mutex_lock(&mq->lock);
- for (level = 0; level < NR_QUEUE_LEVELS; level++)
- list_for_each_entry(e, &mq->cache.qs[level], list) {
- r = fn(context, e->cblock, e->oblock, e->hit_count);
- if (r)
- goto out;
- }
+ r = mq_save_hints(mq, &mq->cache_clean, fn, context);
+ if (!r)
+ r = mq_save_hints(mq, &mq->cache_dirty, fn, context);
-out:
mutex_unlock(&mq->lock);
return r;
}
+static void __remove_mapping(struct mq_policy *mq, dm_oblock_t oblock)
+{
+ struct entry *e;
+
+ e = hash_lookup(mq, oblock);
+ BUG_ON(!e || !in_cache(mq, e));
+
+ del(mq, e);
+ free_entry(&mq->cache_pool, e);
+}
+
static void mq_remove_mapping(struct dm_cache_policy *p, dm_oblock_t oblock)
{
struct mq_policy *mq = to_mq_policy(p);
- struct entry *e;
mutex_lock(&mq->lock);
+ __remove_mapping(mq, oblock);
+ mutex_unlock(&mq->lock);
+}
- e = hash_lookup(mq, oblock);
+static int __remove_cblock(struct mq_policy *mq, dm_cblock_t cblock)
+{
+ struct entry *e = epool_find(&mq->cache_pool, cblock);
- BUG_ON(!e || !e->in_cache);
+ if (!e)
+ return -ENODATA;
del(mq, e);
- e->in_cache = false;
- push(mq, e);
+ free_entry(&mq->cache_pool, e);
+ return 0;
+}
+
+static int mq_remove_cblock(struct dm_cache_policy *p, dm_cblock_t cblock)
+{
+ int r;
+ struct mq_policy *mq = to_mq_policy(p);
+
+ mutex_lock(&mq->lock);
+ r = __remove_cblock(mq, cblock);
mutex_unlock(&mq->lock);
+
+ return r;
}
-static void force_mapping(struct mq_policy *mq,
- dm_oblock_t current_oblock, dm_oblock_t new_oblock)
+static int __mq_writeback_work(struct mq_policy *mq, dm_oblock_t *oblock,
+ dm_cblock_t *cblock)
{
- struct entry *e = hash_lookup(mq, current_oblock);
+ struct entry *e = pop(mq, &mq->cache_dirty);
- BUG_ON(!e || !e->in_cache);
+ if (!e)
+ return -ENODATA;
- del(mq, e);
- e->oblock = new_oblock;
+ *oblock = e->oblock;
+ *cblock = infer_cblock(&mq->cache_pool, e);
+ e->dirty = false;
push(mq, e);
+
+ return 0;
+}
+
+static int mq_writeback_work(struct dm_cache_policy *p, dm_oblock_t *oblock,
+ dm_cblock_t *cblock)
+{
+ int r;
+ struct mq_policy *mq = to_mq_policy(p);
+
+ mutex_lock(&mq->lock);
+ r = __mq_writeback_work(mq, oblock, cblock);
+ mutex_unlock(&mq->lock);
+
+ return r;
+}
+
+static void __force_mapping(struct mq_policy *mq,
+ dm_oblock_t current_oblock, dm_oblock_t new_oblock)
+{
+ struct entry *e = hash_lookup(mq, current_oblock);
+
+ if (e && in_cache(mq, e)) {
+ del(mq, e);
+ e->oblock = new_oblock;
+ e->dirty = true;
+ push(mq, e);
+ }
}
static void mq_force_mapping(struct dm_cache_policy *p,
@@ -995,16 +1102,20 @@ static void mq_force_mapping(struct dm_cache_policy *p,
struct mq_policy *mq = to_mq_policy(p);
mutex_lock(&mq->lock);
- force_mapping(mq, current_oblock, new_oblock);
+ __force_mapping(mq, current_oblock, new_oblock);
mutex_unlock(&mq->lock);
}
static dm_cblock_t mq_residency(struct dm_cache_policy *p)
{
+ dm_cblock_t r;
struct mq_policy *mq = to_mq_policy(p);
- /* FIXME: lock mutex, not sure we can block here */
- return to_cblock(mq->nr_cblocks_allocated);
+ mutex_lock(&mq->lock);
+ r = to_cblock(mq->cache_pool.nr_allocated);
+ mutex_unlock(&mq->lock);
+
+ return r;
}
static void mq_tick(struct dm_cache_policy *p)
@@ -1057,10 +1168,13 @@ static void init_policy_functions(struct mq_policy *mq)
mq->policy.destroy = mq_destroy;
mq->policy.map = mq_map;
mq->policy.lookup = mq_lookup;
+ mq->policy.set_dirty = mq_set_dirty;
+ mq->policy.clear_dirty = mq_clear_dirty;
mq->policy.load_mapping = mq_load_mapping;
mq->policy.walk_mappings = mq_walk_mappings;
mq->policy.remove_mapping = mq_remove_mapping;
- mq->policy.writeback_work = NULL;
+ mq->policy.remove_cblock = mq_remove_cblock;
+ mq->policy.writeback_work = mq_writeback_work;
mq->policy.force_mapping = mq_force_mapping;
mq->policy.residency = mq_residency;
mq->policy.tick = mq_tick;
@@ -1072,7 +1186,6 @@ static struct dm_cache_policy *mq_create(dm_cblock_t cache_size,
sector_t origin_size,
sector_t cache_block_size)
{
- int r;
struct mq_policy *mq = kzalloc(sizeof(*mq), GFP_KERNEL);
if (!mq)
@@ -1080,8 +1193,18 @@ static struct dm_cache_policy *mq_create(dm_cblock_t cache_size,
init_policy_functions(mq);
iot_init(&mq->tracker, SEQUENTIAL_THRESHOLD_DEFAULT, RANDOM_THRESHOLD_DEFAULT);
-
mq->cache_size = cache_size;
+
+ if (epool_init(&mq->pre_cache_pool, from_cblock(cache_size))) {
+ DMERR("couldn't initialize pool of pre-cache entries");
+ goto bad_pre_cache_init;
+ }
+
+ if (epool_init(&mq->cache_pool, from_cblock(cache_size))) {
+ DMERR("couldn't initialize pool of cache entries");
+ goto bad_cache_init;
+ }
+
mq->tick_protected = 0;
mq->tick = 0;
mq->hit_count = 0;
@@ -1089,20 +1212,12 @@ static struct dm_cache_policy *mq_create(dm_cblock_t cache_size,
mq->promote_threshold = 0;
mutex_init(&mq->lock);
spin_lock_init(&mq->tick_lock);
- mq->find_free_nr_words = dm_div_up(from_cblock(mq->cache_size), BITS_PER_LONG);
- mq->find_free_last_word = 0;
queue_init(&mq->pre_cache);
- queue_init(&mq->cache);
- mq->generation_period = max((unsigned) from_cblock(cache_size), 1024U);
+ queue_init(&mq->cache_clean);
+ queue_init(&mq->cache_dirty);
- mq->nr_entries = 2 * from_cblock(cache_size);
- r = alloc_entries(mq, mq->nr_entries);
- if (r)
- goto bad_cache_alloc;
-
- mq->nr_entries_allocated = 0;
- mq->nr_cblocks_allocated = 0;
+ mq->generation_period = max((unsigned) from_cblock(cache_size), 1024U);
mq->nr_buckets = next_power(from_cblock(cache_size) / 2, 16);
mq->hash_bits = ffs(mq->nr_buckets) - 1;
@@ -1110,17 +1225,13 @@ static struct dm_cache_policy *mq_create(dm_cblock_t cache_size,
if (!mq->table)
goto bad_alloc_table;
- mq->allocation_bitset = alloc_bitset(from_cblock(cache_size));
- if (!mq->allocation_bitset)
- goto bad_alloc_bitset;
-
return &mq->policy;
-bad_alloc_bitset:
- kfree(mq->table);
bad_alloc_table:
- free_entries(mq);
-bad_cache_alloc:
+ epool_exit(&mq->cache_pool);
+bad_cache_init:
+ epool_exit(&mq->pre_cache_pool);
+bad_pre_cache_init:
kfree(mq);
return NULL;
@@ -1130,7 +1241,7 @@ bad_cache_alloc:
static struct dm_cache_policy_type mq_policy_type = {
.name = "mq",
- .version = {1, 0, 0},
+ .version = {1, 1, 0},
.hint_size = 4,
.owner = THIS_MODULE,
.create = mq_create
@@ -1138,7 +1249,7 @@ static struct dm_cache_policy_type mq_policy_type = {
static struct dm_cache_policy_type default_policy_type = {
.name = "default",
- .version = {1, 0, 0},
+ .version = {1, 1, 0},
.hint_size = 4,
.owner = THIS_MODULE,
.create = mq_create
diff --git a/drivers/md/dm-cache-policy.c b/drivers/md/dm-cache-policy.c
index 21c03c570c06..d80057968407 100644
--- a/drivers/md/dm-cache-policy.c
+++ b/drivers/md/dm-cache-policy.c
@@ -119,13 +119,13 @@ struct dm_cache_policy *dm_cache_policy_create(const char *name,
type = get_policy(name);
if (!type) {
DMWARN("unknown policy type");
- return NULL;
+ return ERR_PTR(-EINVAL);
}
p = type->create(cache_size, origin_size, cache_block_size);
if (!p) {
put_policy(type);
- return NULL;
+ return ERR_PTR(-ENOMEM);
}
p->private = type;
diff --git a/drivers/md/dm-cache-policy.h b/drivers/md/dm-cache-policy.h
index 33369ca9614f..052c00a84a5c 100644
--- a/drivers/md/dm-cache-policy.h
+++ b/drivers/md/dm-cache-policy.h
@@ -135,9 +135,6 @@ struct dm_cache_policy {
*/
int (*lookup)(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock);
- /*
- * oblock must be a mapped block. Must not block.
- */
void (*set_dirty)(struct dm_cache_policy *p, dm_oblock_t oblock);
void (*clear_dirty)(struct dm_cache_policy *p, dm_oblock_t oblock);
@@ -159,8 +156,24 @@ struct dm_cache_policy {
void (*force_mapping)(struct dm_cache_policy *p, dm_oblock_t current_oblock,
dm_oblock_t new_oblock);
- int (*writeback_work)(struct dm_cache_policy *p, dm_oblock_t *oblock, dm_cblock_t *cblock);
+ /*
+ * This is called via the invalidate_cblocks message. It is
+ * possible the particular cblock has already been removed due to a
+ * write io in passthrough mode. In which case this should return
+ * -ENODATA.
+ */
+ int (*remove_cblock)(struct dm_cache_policy *p, dm_cblock_t cblock);
+ /*
+ * Provide a dirty block to be written back by the core target.
+ *
+ * Returns:
+ *
+ * 0 and @cblock,@oblock: block to write back provided
+ *
+ * -ENODATA: no dirty blocks available
+ */
+ int (*writeback_work)(struct dm_cache_policy *p, dm_oblock_t *oblock, dm_cblock_t *cblock);
/*
* How full is the cache?
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index 29569768ffbf..9efcf1059b99 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -61,6 +61,34 @@ static void free_bitset(unsigned long *bits)
/*----------------------------------------------------------------*/
+/*
+ * There are a couple of places where we let a bio run, but want to do some
+ * work before calling its endio function. We do this by temporarily
+ * changing the endio fn.
+ */
+struct dm_hook_info {
+ bio_end_io_t *bi_end_io;
+ void *bi_private;
+};
+
+static void dm_hook_bio(struct dm_hook_info *h, struct bio *bio,
+ bio_end_io_t *bi_end_io, void *bi_private)
+{
+ h->bi_end_io = bio->bi_end_io;
+ h->bi_private = bio->bi_private;
+
+ bio->bi_end_io = bi_end_io;
+ bio->bi_private = bi_private;
+}
+
+static void dm_unhook_bio(struct dm_hook_info *h, struct bio *bio)
+{
+ bio->bi_end_io = h->bi_end_io;
+ bio->bi_private = h->bi_private;
+}
+
+/*----------------------------------------------------------------*/
+
#define PRISON_CELLS 1024
#define MIGRATION_POOL_SIZE 128
#define COMMIT_PERIOD HZ
@@ -76,14 +104,37 @@ static void free_bitset(unsigned long *bits)
/*
* FIXME: the cache is read/write for the time being.
*/
-enum cache_mode {
+enum cache_metadata_mode {
CM_WRITE, /* metadata may be changed */
CM_READ_ONLY, /* metadata may not be changed */
};
+enum cache_io_mode {
+ /*
+ * Data is written to cached blocks only. These blocks are marked
+ * dirty. If you lose the cache device you will lose data.
+ * Potential performance increase for both reads and writes.
+ */
+ CM_IO_WRITEBACK,
+
+ /*
+ * Data is written to both cache and origin. Blocks are never
+ * dirty. Potential performance benfit for reads only.
+ */
+ CM_IO_WRITETHROUGH,
+
+ /*
+ * A degraded mode useful for various cache coherency situations
+ * (eg, rolling back snapshots). Reads and writes always go to the
+ * origin. If a write goes to a cached oblock, then the cache
+ * block is invalidated.
+ */
+ CM_IO_PASSTHROUGH
+};
+
struct cache_features {
- enum cache_mode mode;
- bool write_through:1;
+ enum cache_metadata_mode mode;
+ enum cache_io_mode io_mode;
};
struct cache_stats {
@@ -99,6 +150,25 @@ struct cache_stats {
atomic_t discard_count;
};
+/*
+ * Defines a range of cblocks, begin to (end - 1) are in the range. end is
+ * the one-past-the-end value.
+ */
+struct cblock_range {
+ dm_cblock_t begin;
+ dm_cblock_t end;
+};
+
+struct invalidation_request {
+ struct list_head list;
+ struct cblock_range *cblocks;
+
+ atomic_t complete;
+ int err;
+
+ wait_queue_head_t result_wait;
+};
+
struct cache {
struct dm_target *ti;
struct dm_target_callbacks callbacks;
@@ -148,6 +218,10 @@ struct cache {
wait_queue_head_t migration_wait;
atomic_t nr_migrations;
+ wait_queue_head_t quiescing_wait;
+ atomic_t quiescing;
+ atomic_t quiescing_ack;
+
/*
* cache_size entries, dirty if set
*/
@@ -186,7 +260,7 @@ struct cache {
bool need_tick_bio:1;
bool sized:1;
- bool quiescing:1;
+ bool invalidate:1;
bool commit_requested:1;
bool loaded_mappings:1;
bool loaded_discards:1;
@@ -197,6 +271,12 @@ struct cache {
struct cache_features features;
struct cache_stats stats;
+
+ /*
+ * Invalidation fields.
+ */
+ spinlock_t invalidation_lock;
+ struct list_head invalidation_requests;
};
struct per_bio_data {
@@ -211,7 +291,7 @@ struct per_bio_data {
*/
struct cache *cache;
dm_cblock_t cblock;
- bio_end_io_t *saved_bi_end_io;
+ struct dm_hook_info hook_info;
struct dm_bio_details bio_details;
};
@@ -228,6 +308,8 @@ struct dm_cache_migration {
bool writeback:1;
bool demote:1;
bool promote:1;
+ bool requeue_holder:1;
+ bool invalidate:1;
struct dm_bio_prison_cell *old_ocell;
struct dm_bio_prison_cell *new_ocell;
@@ -533,9 +615,24 @@ static void save_stats(struct cache *cache)
#define PB_DATA_SIZE_WB (offsetof(struct per_bio_data, cache))
#define PB_DATA_SIZE_WT (sizeof(struct per_bio_data))
+static bool writethrough_mode(struct cache_features *f)
+{
+ return f->io_mode == CM_IO_WRITETHROUGH;
+}
+
+static bool writeback_mode(struct cache_features *f)
+{
+ return f->io_mode == CM_IO_WRITEBACK;
+}
+
+static bool passthrough_mode(struct cache_features *f)
+{
+ return f->io_mode == CM_IO_PASSTHROUGH;
+}
+
static size_t get_per_bio_data_size(struct cache *cache)
{
- return cache->features.write_through ? PB_DATA_SIZE_WT : PB_DATA_SIZE_WB;
+ return writethrough_mode(&cache->features) ? PB_DATA_SIZE_WT : PB_DATA_SIZE_WB;
}
static struct per_bio_data *get_per_bio_data(struct bio *bio, size_t data_size)
@@ -605,6 +702,7 @@ static void remap_to_origin_clear_discard(struct cache *cache, struct bio *bio,
static void remap_to_cache_dirty(struct cache *cache, struct bio *bio,
dm_oblock_t oblock, dm_cblock_t cblock)
{
+ check_if_tick_bio_needed(cache, bio);
remap_to_cache(cache, bio, cblock);
if (bio_data_dir(bio) == WRITE) {
set_dirty(cache, oblock, cblock);
@@ -662,7 +760,8 @@ static void defer_writethrough_bio(struct cache *cache, struct bio *bio)
static void writethrough_endio(struct bio *bio, int err)
{
struct per_bio_data *pb = get_per_bio_data(bio, PB_DATA_SIZE_WT);
- bio->bi_end_io = pb->saved_bi_end_io;
+
+ dm_unhook_bio(&pb->hook_info, bio);
if (err) {
bio_endio(bio, err);
@@ -693,9 +792,8 @@ static void remap_to_origin_then_cache(struct cache *cache, struct bio *bio,
pb->cache = cache;
pb->cblock = cblock;
- pb->saved_bi_end_io = bio->bi_end_io;
+ dm_hook_bio(&pb->hook_info, bio, writethrough_endio, NULL);
dm_bio_record(&pb->bio_details, bio);
- bio->bi_end_io = writethrough_endio;
remap_to_origin_clear_discard(pb->cache, bio, oblock);
}
@@ -748,8 +846,9 @@ static void cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell,
static void cleanup_migration(struct dm_cache_migration *mg)
{
- dec_nr_migrations(mg->cache);
+ struct cache *cache = mg->cache;
free_migration(mg);
+ dec_nr_migrations(cache);
}
static void migration_failure(struct dm_cache_migration *mg)
@@ -765,13 +864,13 @@ static void migration_failure(struct dm_cache_migration *mg)
DMWARN_LIMIT("demotion failed; couldn't copy block");
policy_force_mapping(cache->policy, mg->new_oblock, mg->old_oblock);
- cell_defer(cache, mg->old_ocell, mg->promote ? 0 : 1);
+ cell_defer(cache, mg->old_ocell, mg->promote ? false : true);
if (mg->promote)
- cell_defer(cache, mg->new_ocell, 1);
+ cell_defer(cache, mg->new_ocell, true);
} else {
DMWARN_LIMIT("promotion failed; couldn't copy block");
policy_remove_mapping(cache->policy, mg->new_oblock);
- cell_defer(cache, mg->new_ocell, 1);
+ cell_defer(cache, mg->new_ocell, true);
}
cleanup_migration(mg);
@@ -823,7 +922,7 @@ static void migration_success_post_commit(struct dm_cache_migration *mg)
return;
} else if (mg->demote) {
- cell_defer(cache, mg->old_ocell, mg->promote ? 0 : 1);
+ cell_defer(cache, mg->old_ocell, mg->promote ? false : true);
if (mg->promote) {
mg->demote = false;
@@ -832,11 +931,19 @@ static void migration_success_post_commit(struct dm_cache_migration *mg)
list_add_tail(&mg->list, &cache->quiesced_migrations);
spin_unlock_irqrestore(&cache->lock, flags);
- } else
+ } else {
+ if (mg->invalidate)
+ policy_remove_mapping(cache->policy, mg->old_oblock);
cleanup_migration(mg);
+ }
} else {
- cell_defer(cache, mg->new_ocell, true);
+ if (mg->requeue_holder)
+ cell_defer(cache, mg->new_ocell, true);
+ else {
+ bio_endio(mg->new_ocell->holder, 0);
+ cell_defer(cache, mg->new_ocell, false);
+ }
clear_dirty(cache, mg->new_oblock, mg->cblock);
cleanup_migration(mg);
}
@@ -881,8 +988,46 @@ static void issue_copy_real(struct dm_cache_migration *mg)
r = dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, mg);
}
- if (r < 0)
+ if (r < 0) {
+ DMERR_LIMIT("issuing migration failed");
migration_failure(mg);
+ }
+}
+
+static void overwrite_endio(struct bio *bio, int err)
+{
+ struct dm_cache_migration *mg = bio->bi_private;
+ struct cache *cache = mg->cache;
+ size_t pb_data_size = get_per_bio_data_size(cache);
+ struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
+ unsigned long flags;
+
+ if (err)
+ mg->err = true;
+
+ spin_lock_irqsave(&cache->lock, flags);
+ list_add_tail(&mg->list, &cache->completed_migrations);
+ dm_unhook_bio(&pb->hook_info, bio);
+ mg->requeue_holder = false;
+ spin_unlock_irqrestore(&cache->lock, flags);
+
+ wake_worker(cache);
+}
+
+static void issue_overwrite(struct dm_cache_migration *mg, struct bio *bio)
+{
+ size_t pb_data_size = get_per_bio_data_size(mg->cache);
+ struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
+
+ dm_hook_bio(&pb->hook_info, bio, overwrite_endio, mg);
+ remap_to_cache_dirty(mg->cache, bio, mg->new_oblock, mg->cblock);
+ generic_make_request(bio);
+}
+
+static bool bio_writes_complete_block(struct cache *cache, struct bio *bio)
+{
+ return (bio_data_dir(bio) == WRITE) &&
+ (bio->bi_size == (cache->sectors_per_block << SECTOR_SHIFT));
}
static void avoid_copy(struct dm_cache_migration *mg)
@@ -899,9 +1044,17 @@ static void issue_copy(struct dm_cache_migration *mg)
if (mg->writeback || mg->demote)
avoid = !is_dirty(cache, mg->cblock) ||
is_discarded_oblock(cache, mg->old_oblock);
- else
+ else {
+ struct bio *bio = mg->new_ocell->holder;
+
avoid = is_discarded_oblock(cache, mg->new_oblock);
+ if (!avoid && bio_writes_complete_block(cache, bio)) {
+ issue_overwrite(mg, bio);
+ return;
+ }
+ }
+
avoid ? avoid_copy(mg) : issue_copy_real(mg);
}
@@ -991,6 +1144,8 @@ static void promote(struct cache *cache, struct prealloc *structs,
mg->writeback = false;
mg->demote = false;
mg->promote = true;
+ mg->requeue_holder = true;
+ mg->invalidate = false;
mg->cache = cache;
mg->new_oblock = oblock;
mg->cblock = cblock;
@@ -1012,6 +1167,8 @@ static void writeback(struct cache *cache, struct prealloc *structs,
mg->writeback = true;
mg->demote = false;
mg->promote = false;
+ mg->requeue_holder = true;
+ mg->invalidate = false;
mg->cache = cache;
mg->old_oblock = oblock;
mg->cblock = cblock;
@@ -1035,6 +1192,8 @@ static void demote_then_promote(struct cache *cache, struct prealloc *structs,
mg->writeback = false;
mg->demote = true;
mg->promote = true;
+ mg->requeue_holder = true;
+ mg->invalidate = false;
mg->cache = cache;
mg->old_oblock = old_oblock;
mg->new_oblock = new_oblock;
@@ -1047,6 +1206,33 @@ static void demote_then_promote(struct cache *cache, struct prealloc *structs,
quiesce_migration(mg);
}
+/*
+ * Invalidate a cache entry. No writeback occurs; any changes in the cache
+ * block are thrown away.
+ */
+static void invalidate(struct cache *cache, struct prealloc *structs,
+ dm_oblock_t oblock, dm_cblock_t cblock,
+ struct dm_bio_prison_cell *cell)
+{
+ struct dm_cache_migration *mg = prealloc_get_migration(structs);
+
+ mg->err = false;
+ mg->writeback = false;
+ mg->demote = true;
+ mg->promote = false;
+ mg->requeue_holder = true;
+ mg->invalidate = true;
+ mg->cache = cache;
+ mg->old_oblock = oblock;
+ mg->cblock = cblock;
+ mg->old_ocell = cell;
+ mg->new_ocell = NULL;
+ mg->start_jiffies = jiffies;
+
+ inc_nr_migrations(cache);
+ quiesce_migration(mg);
+}
+
/*----------------------------------------------------------------
* bio processing
*--------------------------------------------------------------*/
@@ -1109,13 +1295,6 @@ static bool spare_migration_bandwidth(struct cache *cache)
return current_volume < cache->migration_threshold;
}
-static bool is_writethrough_io(struct cache *cache, struct bio *bio,
- dm_cblock_t cblock)
-{
- return bio_data_dir(bio) == WRITE &&
- cache->features.write_through && !is_dirty(cache, cblock);
-}
-
static void inc_hit_counter(struct cache *cache, struct bio *bio)
{
atomic_inc(bio_data_dir(bio) == READ ?
@@ -1128,6 +1307,15 @@ static void inc_miss_counter(struct cache *cache, struct bio *bio)
&cache->stats.read_miss : &cache->stats.write_miss);
}
+static void issue_cache_bio(struct cache *cache, struct bio *bio,
+ struct per_bio_data *pb,
+ dm_oblock_t oblock, dm_cblock_t cblock)
+{
+ pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
+ remap_to_cache_dirty(cache, bio, oblock, cblock);
+ issue(cache, bio);
+}
+
static void process_bio(struct cache *cache, struct prealloc *structs,
struct bio *bio)
{
@@ -1139,7 +1327,8 @@ static void process_bio(struct cache *cache, struct prealloc *structs,
size_t pb_data_size = get_per_bio_data_size(cache);
struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
bool discarded_block = is_discarded_oblock(cache, block);
- bool can_migrate = discarded_block || spare_migration_bandwidth(cache);
+ bool passthrough = passthrough_mode(&cache->features);
+ bool can_migrate = !passthrough && (discarded_block || spare_migration_bandwidth(cache));
/*
* Check to see if that block is currently migrating.
@@ -1160,15 +1349,39 @@ static void process_bio(struct cache *cache, struct prealloc *structs,
switch (lookup_result.op) {
case POLICY_HIT:
- inc_hit_counter(cache, bio);
- pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
+ if (passthrough) {
+ inc_miss_counter(cache, bio);
- if (is_writethrough_io(cache, bio, lookup_result.cblock))
- remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock);
- else
- remap_to_cache_dirty(cache, bio, block, lookup_result.cblock);
+ /*
+ * Passthrough always maps to the origin,
+ * invalidating any cache blocks that are written
+ * to.
+ */
+
+ if (bio_data_dir(bio) == WRITE) {
+ atomic_inc(&cache->stats.demotion);
+ invalidate(cache, structs, block, lookup_result.cblock, new_ocell);
+ release_cell = false;
+
+ } else {
+ /* FIXME: factor out issue_origin() */
+ pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
+ remap_to_origin_clear_discard(cache, bio, block);
+ issue(cache, bio);
+ }
+ } else {
+ inc_hit_counter(cache, bio);
+
+ if (bio_data_dir(bio) == WRITE &&
+ writethrough_mode(&cache->features) &&
+ !is_dirty(cache, lookup_result.cblock)) {
+ pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
+ remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock);
+ issue(cache, bio);
+ } else
+ issue_cache_bio(cache, bio, pb, block, lookup_result.cblock);
+ }
- issue(cache, bio);
break;
case POLICY_MISS:
@@ -1227,15 +1440,17 @@ static int need_commit_due_to_time(struct cache *cache)
static int commit_if_needed(struct cache *cache)
{
- if (dm_cache_changed_this_transaction(cache->cmd) &&
- (cache->commit_requested || need_commit_due_to_time(cache))) {
+ int r = 0;
+
+ if ((cache->commit_requested || need_commit_due_to_time(cache)) &&
+ dm_cache_changed_this_transaction(cache->cmd)) {
atomic_inc(&cache->stats.commit_count);
- cache->last_commit_jiffies = jiffies;
cache->commit_requested = false;
- return dm_cache_commit(cache->cmd, false);
+ r = dm_cache_commit(cache->cmd, false);
+ cache->last_commit_jiffies = jiffies;
}
- return 0;
+ return r;
}
static void process_deferred_bios(struct cache *cache)
@@ -1344,36 +1559,88 @@ static void writeback_some_dirty_blocks(struct cache *cache)
}
/*----------------------------------------------------------------
- * Main worker loop
+ * Invalidations.
+ * Dropping something from the cache *without* writing back.
*--------------------------------------------------------------*/
-static void start_quiescing(struct cache *cache)
+
+static void process_invalidation_request(struct cache *cache, struct invalidation_request *req)
{
- unsigned long flags;
+ int r = 0;
+ uint64_t begin = from_cblock(req->cblocks->begin);
+ uint64_t end = from_cblock(req->cblocks->end);
- spin_lock_irqsave(&cache->lock, flags);
- cache->quiescing = 1;
- spin_unlock_irqrestore(&cache->lock, flags);
+ while (begin != end) {
+ r = policy_remove_cblock(cache->policy, to_cblock(begin));
+ if (!r) {
+ r = dm_cache_remove_mapping(cache->cmd, to_cblock(begin));
+ if (r)
+ break;
+
+ } else if (r == -ENODATA) {
+ /* harmless, already unmapped */
+ r = 0;
+
+ } else {
+ DMERR("policy_remove_cblock failed");
+ break;
+ }
+
+ begin++;
+ }
+
+ cache->commit_requested = true;
+
+ req->err = r;
+ atomic_set(&req->complete, 1);
+
+ wake_up(&req->result_wait);
}
-static void stop_quiescing(struct cache *cache)
+static void process_invalidation_requests(struct cache *cache)
{
- unsigned long flags;
+ struct list_head list;
+ struct invalidation_request *req, *tmp;
- spin_lock_irqsave(&cache->lock, flags);
- cache->quiescing = 0;
- spin_unlock_irqrestore(&cache->lock, flags);
+ INIT_LIST_HEAD(&list);
+ spin_lock(&cache->invalidation_lock);
+ list_splice_init(&cache->invalidation_requests, &list);
+ spin_unlock(&cache->invalidation_lock);
+
+ list_for_each_entry_safe (req, tmp, &list, list)
+ process_invalidation_request(cache, req);
}
+/*----------------------------------------------------------------
+ * Main worker loop
+ *--------------------------------------------------------------*/
static bool is_quiescing(struct cache *cache)
{
- int r;
- unsigned long flags;
+ return atomic_read(&cache->quiescing);
+}
- spin_lock_irqsave(&cache->lock, flags);
- r = cache->quiescing;
- spin_unlock_irqrestore(&cache->lock, flags);
+static void ack_quiescing(struct cache *cache)
+{
+ if (is_quiescing(cache)) {
+ atomic_inc(&cache->quiescing_ack);
+ wake_up(&cache->quiescing_wait);
+ }
+}
- return r;
+static void wait_for_quiescing_ack(struct cache *cache)
+{
+ wait_event(cache->quiescing_wait, atomic_read(&cache->quiescing_ack));
+}
+
+static void start_quiescing(struct cache *cache)
+{
+ atomic_inc(&cache->quiescing);
+ wait_for_quiescing_ack(cache);
+}
+
+static void stop_quiescing(struct cache *cache)
+{
+ atomic_set(&cache->quiescing, 0);
+ atomic_set(&cache->quiescing_ack, 0);
}
static void wait_for_migrations(struct cache *cache)
@@ -1412,7 +1679,8 @@ static int more_work(struct cache *cache)
!bio_list_empty(&cache->deferred_writethrough_bios) ||
!list_empty(&cache->quiesced_migrations) ||
!list_empty(&cache->completed_migrations) ||
- !list_empty(&cache->need_commit_migrations);
+ !list_empty(&cache->need_commit_migrations) ||
+ cache->invalidate;
}
static void do_worker(struct work_struct *ws)
@@ -1420,16 +1688,16 @@ static void do_worker(struct work_struct *ws)
struct cache *cache = container_of(ws, struct cache, worker);
do {
- if (!is_quiescing(cache))
+ if (!is_quiescing(cache)) {
+ writeback_some_dirty_blocks(cache);
+ process_deferred_writethrough_bios(cache);
process_deferred_bios(cache);
+ process_invalidation_requests(cache);
+ }
process_migrations(cache, &cache->quiesced_migrations, issue_copy);
process_migrations(cache, &cache->completed_migrations, complete_migration);
- writeback_some_dirty_blocks(cache);
-
- process_deferred_writethrough_bios(cache);
-
if (commit_if_needed(cache)) {
process_deferred_flush_bios(cache, false);
@@ -1442,6 +1710,9 @@ static void do_worker(struct work_struct *ws)
process_migrations(cache, &cache->need_commit_migrations,
migration_success_post_commit);
}
+
+ ack_quiescing(cache);
+
} while (more_work(cache));
}
@@ -1715,7 +1986,7 @@ static int parse_block_size(struct cache_args *ca, struct dm_arg_set *as,
static void init_features(struct cache_features *cf)
{
cf->mode = CM_WRITE;
- cf->write_through = false;
+ cf->io_mode = CM_IO_WRITEBACK;
}
static int parse_features(struct cache_args *ca, struct dm_arg_set *as,
@@ -1740,10 +2011,13 @@ static int parse_features(struct cache_args *ca, struct dm_arg_set *as,
arg = dm_shift_arg(as);
if (!strcasecmp(arg, "writeback"))
- cf->write_through = false;
+ cf->io_mode = CM_IO_WRITEBACK;
else if (!strcasecmp(arg, "writethrough"))
- cf->write_through = true;
+ cf->io_mode = CM_IO_WRITETHROUGH;
+
+ else if (!strcasecmp(arg, "passthrough"))
+ cf->io_mode = CM_IO_PASSTHROUGH;
else {
*error = "Unrecognised cache feature requested";
@@ -1872,14 +2146,15 @@ static int set_config_values(struct cache *cache, int argc, const char **argv)
static int create_cache_policy(struct cache *cache, struct cache_args *ca,
char **error)
{
- cache->policy = dm_cache_policy_create(ca->policy_name,
- cache->cache_size,
- cache->origin_sectors,
- cache->sectors_per_block);
- if (!cache->policy) {
+ struct dm_cache_policy *p = dm_cache_policy_create(ca->policy_name,
+ cache->cache_size,
+ cache->origin_sectors,
+ cache->sectors_per_block);
+ if (IS_ERR(p)) {
*error = "Error creating cache's policy";
- return -ENOMEM;
+ return PTR_ERR(p);
}
+ cache->policy = p;
return 0;
}
@@ -1995,6 +2270,22 @@ static int cache_create(struct cache_args *ca, struct cache **result)
}
cache->cmd = cmd;
+ if (passthrough_mode(&cache->features)) {
+ bool all_clean;
+
+ r = dm_cache_metadata_all_clean(cache->cmd, &all_clean);
+ if (r) {
+ *error = "dm_cache_metadata_all_clean() failed";
+ goto bad;
+ }
+
+ if (!all_clean) {
+ *error = "Cannot enter passthrough mode unless all blocks are clean";
+ r = -EINVAL;
+ goto bad;
+ }
+ }
+
spin_lock_init(&cache->lock);
bio_list_init(&cache->deferred_bios);
bio_list_init(&cache->deferred_flush_bios);
@@ -2005,6 +2296,10 @@ static int cache_create(struct cache_args *ca, struct cache **result)
atomic_set(&cache->nr_migrations, 0);
init_waitqueue_head(&cache->migration_wait);
+ init_waitqueue_head(&cache->quiescing_wait);
+ atomic_set(&cache->quiescing, 0);
+ atomic_set(&cache->quiescing_ack, 0);
+
r = -ENOMEM;
cache->nr_dirty = 0;
cache->dirty_bitset = alloc_bitset(from_cblock(cache->cache_size));
@@ -2064,7 +2359,7 @@ static int cache_create(struct cache_args *ca, struct cache **result)
cache->need_tick_bio = true;
cache->sized = false;
- cache->quiescing = false;
+ cache->invalidate = false;
cache->commit_requested = false;
cache->loaded_mappings = false;
cache->loaded_discards = false;
@@ -2078,6 +2373,9 @@ static int cache_create(struct cache_args *ca, struct cache **result)
atomic_set(&cache->stats.commit_count, 0);
atomic_set(&cache->stats.discard_count, 0);
+ spin_lock_init(&cache->invalidation_lock);
+ INIT_LIST_HEAD(&cache->invalidation_requests);
+
*result = cache;
return 0;
@@ -2207,17 +2505,37 @@ static int cache_map(struct dm_target *ti, struct bio *bio)
return DM_MAPIO_SUBMITTED;
}
+ r = DM_MAPIO_REMAPPED;
switch (lookup_result.op) {
case POLICY_HIT:
- inc_hit_counter(cache, bio);
- pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
+ if (passthrough_mode(&cache->features)) {
+ if (bio_data_dir(bio) == WRITE) {
+ /*
+ * We need to invalidate this block, so
+ * defer for the worker thread.
+ */
+ cell_defer(cache, cell, true);
+ r = DM_MAPIO_SUBMITTED;
+
+ } else {
+ pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
+ inc_miss_counter(cache, bio);
+ remap_to_origin_clear_discard(cache, bio, block);
+
+ cell_defer(cache, cell, false);
+ }
- if (is_writethrough_io(cache, bio, lookup_result.cblock))
- remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock);
- else
- remap_to_cache_dirty(cache, bio, block, lookup_result.cblock);
+ } else {
+ inc_hit_counter(cache, bio);
- cell_defer(cache, cell, false);
+ if (bio_data_dir(bio) == WRITE && writethrough_mode(&cache->features) &&
+ !is_dirty(cache, lookup_result.cblock))
+ remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock);
+ else
+ remap_to_cache_dirty(cache, bio, block, lookup_result.cblock);
+
+ cell_defer(cache, cell, false);
+ }
break;
case POLICY_MISS:
@@ -2242,10 +2560,10 @@ static int cache_map(struct dm_target *ti, struct bio *bio)
DMERR_LIMIT("%s: erroring bio: unknown policy op: %u", __func__,
(unsigned) lookup_result.op);
bio_io_error(bio);
- return DM_MAPIO_SUBMITTED;
+ r = DM_MAPIO_SUBMITTED;
}
- return DM_MAPIO_REMAPPED;
+ return r;
}
static int cache_end_io(struct dm_target *ti, struct bio *bio, int error)
@@ -2406,26 +2724,71 @@ static int load_discard(void *context, sector_t discard_block_size,
return 0;
}
+static dm_cblock_t get_cache_dev_size(struct cache *cache)
+{
+ sector_t size = get_dev_size(cache->cache_dev);
+ (void) sector_div(size, cache->sectors_per_block);
+ return to_cblock(size);
+}
+
+static bool can_resize(struct cache *cache, dm_cblock_t new_size)
+{
+ if (from_cblock(new_size) > from_cblock(cache->cache_size))
+ return true;
+
+ /*
+ * We can't drop a dirty block when shrinking the cache.
+ */
+ while (from_cblock(new_size) < from_cblock(cache->cache_size)) {
+ new_size = to_cblock(from_cblock(new_size) + 1);
+ if (is_dirty(cache, new_size)) {
+ DMERR("unable to shrink cache; cache block %llu is dirty",
+ (unsigned long long) from_cblock(new_size));
+ return false;
+ }
+ }
+
+ return true;
+}
+
+static int resize_cache_dev(struct cache *cache, dm_cblock_t new_size)
+{
+ int r;
+
+ r = dm_cache_resize(cache->cmd, cache->cache_size);
+ if (r) {
+ DMERR("could not resize cache metadata");
+ return r;
+ }
+
+ cache->cache_size = new_size;
+
+ return 0;
+}
+
static int cache_preresume(struct dm_target *ti)
{
int r = 0;
struct cache *cache = ti->private;
- sector_t actual_cache_size = get_dev_size(cache->cache_dev);
- (void) sector_div(actual_cache_size, cache->sectors_per_block);
+ dm_cblock_t csize = get_cache_dev_size(cache);
/*
* Check to see if the cache has resized.
*/
- if (from_cblock(cache->cache_size) != actual_cache_size || !cache->sized) {
- cache->cache_size = to_cblock(actual_cache_size);
-
- r = dm_cache_resize(cache->cmd, cache->cache_size);
- if (r) {
- DMERR("could not resize cache metadata");
+ if (!cache->sized) {
+ r = resize_cache_dev(cache, csize);
+ if (r)
return r;
- }
cache->sized = true;
+
+ } else if (csize != cache->cache_size) {
+ if (!can_resize(cache, csize))
+ return -EINVAL;
+
+ r = resize_cache_dev(cache, csize);
+ if (r)
+ return r;
}
if (!cache->loaded_mappings) {
@@ -2518,10 +2881,19 @@ static void cache_status(struct dm_target *ti, status_type_t type,
(unsigned long long) from_cblock(residency),
cache->nr_dirty);
- if (cache->features.write_through)
+ if (writethrough_mode(&cache->features))
DMEMIT("1 writethrough ");
- else
- DMEMIT("0 ");
+
+ else if (passthrough_mode(&cache->features))
+ DMEMIT("1 passthrough ");
+
+ else if (writeback_mode(&cache->features))
+ DMEMIT("1 writeback ");
+
+ else {
+ DMERR("internal error: unknown io mode: %d", (int) cache->features.io_mode);
+ goto err;
+ }
DMEMIT("2 migration_threshold %llu ", (unsigned long long) cache->migration_threshold);
if (sz < maxlen) {
@@ -2553,7 +2925,128 @@ err:
}
/*
- * Supports <key> <value>.
+ * A cache block range can take two forms:
+ *
+ * i) A single cblock, eg. '3456'
+ * ii) A begin and end cblock with dots between, eg. 123-234
+ */
+static int parse_cblock_range(struct cache *cache, const char *str,
+ struct cblock_range *result)
+{
+ char dummy;
+ uint64_t b, e;
+ int r;
+
+ /*
+ * Try and parse form (ii) first.
+ */
+ r = sscanf(str, "%llu-%llu%c", &b, &e, &dummy);
+ if (r < 0)
+ return r;
+
+ if (r == 2) {
+ result->begin = to_cblock(b);
+ result->end = to_cblock(e);
+ return 0;
+ }
+
+ /*
+ * That didn't work, try form (i).
+ */
+ r = sscanf(str, "%llu%c", &b, &dummy);
+ if (r < 0)
+ return r;
+
+ if (r == 1) {
+ result->begin = to_cblock(b);
+ result->end = to_cblock(from_cblock(result->begin) + 1u);
+ return 0;
+ }
+
+ DMERR("invalid cblock range '%s'", str);
+ return -EINVAL;
+}
+
+static int validate_cblock_range(struct cache *cache, struct cblock_range *range)
+{
+ uint64_t b = from_cblock(range->begin);
+ uint64_t e = from_cblock(range->end);
+ uint64_t n = from_cblock(cache->cache_size);
+
+ if (b >= n) {
+ DMERR("begin cblock out of range: %llu >= %llu", b, n);
+ return -EINVAL;
+ }
+
+ if (e > n) {
+ DMERR("end cblock out of range: %llu > %llu", e, n);
+ return -EINVAL;
+ }
+
+ if (b >= e) {
+ DMERR("invalid cblock range: %llu >= %llu", b, e);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int request_invalidation(struct cache *cache, struct cblock_range *range)
+{
+ struct invalidation_request req;
+
+ INIT_LIST_HEAD(&req.list);
+ req.cblocks = range;
+ atomic_set(&req.complete, 0);
+ req.err = 0;
+ init_waitqueue_head(&req.result_wait);
+
+ spin_lock(&cache->invalidation_lock);
+ list_add(&req.list, &cache->invalidation_requests);
+ spin_unlock(&cache->invalidation_lock);
+ wake_worker(cache);
+
+ wait_event(req.result_wait, atomic_read(&req.complete));
+ return req.err;
+}
+
+static int process_invalidate_cblocks_message(struct cache *cache, unsigned count,
+ const char **cblock_ranges)
+{
+ int r = 0;
+ unsigned i;
+ struct cblock_range range;
+
+ if (!passthrough_mode(&cache->features)) {
+ DMERR("cache has to be in passthrough mode for invalidation");
+ return -EPERM;
+ }
+
+ for (i = 0; i < count; i++) {
+ r = parse_cblock_range(cache, cblock_ranges[i], &range);
+ if (r)
+ break;
+
+ r = validate_cblock_range(cache, &range);
+ if (r)
+ break;
+
+ /*
+ * Pass begin and end origin blocks to the worker and wake it.
+ */
+ r = request_invalidation(cache, &range);
+ if (r)
+ break;
+ }
+
+ return r;
+}
+
+/*
+ * Supports
+ * "<key> <value>"
+ * and
+ * "invalidate_cblocks [(<begin>)|(<begin>-<end>)]*
*
* The key migration_threshold is supported by the cache target core.
*/
@@ -2561,6 +3054,12 @@ static int cache_message(struct dm_target *ti, unsigned argc, char **argv)
{
struct cache *cache = ti->private;
+ if (!argc)
+ return -EINVAL;
+
+ if (!strcasecmp(argv[0], "invalidate_cblocks"))
+ return process_invalidate_cblocks_message(cache, argc - 1, (const char **) argv + 1);
+
if (argc != 2)
return -EINVAL;
@@ -2630,7 +3129,7 @@ static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits)
static struct target_type cache_target = {
.name = "cache",
- .version = {1, 1, 1},
+ .version = {1, 2, 0},
.module = THIS_MODULE,
.ctr = cache_ctr,
.dtr = cache_dtr,
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 0fce0bc1a957..50ea7ed24dce 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -2,6 +2,7 @@
* Copyright (C) 2003 Christophe Saout <christophe@saout.de>
* Copyright (C) 2004 Clemens Fruhwirth <clemens@endorphin.org>
* Copyright (C) 2006-2009 Red Hat, Inc. All rights reserved.
+ * Copyright (C) 2013 Milan Broz <gmazyland@gmail.com>
*
* This file is released under the GPL.
*/
@@ -98,6 +99,13 @@ struct iv_lmk_private {
u8 *seed;
};
+#define TCW_WHITENING_SIZE 16
+struct iv_tcw_private {
+ struct crypto_shash *crc32_tfm;
+ u8 *iv_seed;
+ u8 *whitening;
+};
+
/*
* Crypt: maps a linear range of a block device
* and encrypts / decrypts at the same time.
@@ -139,6 +147,7 @@ struct crypt_config {
struct iv_essiv_private essiv;
struct iv_benbi_private benbi;
struct iv_lmk_private lmk;
+ struct iv_tcw_private tcw;
} iv_gen_private;
sector_t iv_offset;
unsigned int iv_size;
@@ -171,7 +180,8 @@ struct crypt_config {
unsigned long flags;
unsigned int key_size;
- unsigned int key_parts;
+ unsigned int key_parts; /* independent parts in key buffer */
+ unsigned int key_extra_size; /* additional keys length */
u8 key[0];
};
@@ -230,6 +240,16 @@ static struct crypto_ablkcipher *any_tfm(struct crypt_config *cc)
* version 3: the same as version 2 with additional IV seed
* (it uses 65 keys, last key is used as IV seed)
*
+ * tcw: Compatible implementation of the block chaining mode used
+ * by the TrueCrypt device encryption system (prior to version 4.1).
+ * For more info see: http://www.truecrypt.org
+ * It operates on full 512 byte sectors and uses CBC
+ * with an IV derived from initial key and the sector number.
+ * In addition, whitening value is applied on every sector, whitening
+ * is calculated from initial key, sector number and mixed using CRC32.
+ * Note that this encryption scheme is vulnerable to watermarking attacks
+ * and should be used for old compatible containers access only.
+ *
* plumb: unimplemented, see:
* http://article.gmane.org/gmane.linux.kernel.device-mapper.dm-crypt/454
*/
@@ -530,7 +550,7 @@ static int crypt_iv_lmk_one(struct crypt_config *cc, u8 *iv,
char ctx[crypto_shash_descsize(lmk->hash_tfm)];
} sdesc;
struct md5_state md5state;
- u32 buf[4];
+ __le32 buf[4];
int i, r;
sdesc.desc.tfm = lmk->hash_tfm;
@@ -608,6 +628,153 @@ static int crypt_iv_lmk_post(struct crypt_config *cc, u8 *iv,
return r;
}
+static void crypt_iv_tcw_dtr(struct crypt_config *cc)
+{
+ struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw;
+
+ kzfree(tcw->iv_seed);
+ tcw->iv_seed = NULL;
+ kzfree(tcw->whitening);
+ tcw->whitening = NULL;
+
+ if (tcw->crc32_tfm && !IS_ERR(tcw->crc32_tfm))
+ crypto_free_shash(tcw->crc32_tfm);
+ tcw->crc32_tfm = NULL;
+}
+
+static int crypt_iv_tcw_ctr(struct crypt_config *cc, struct dm_target *ti,
+ const char *opts)
+{
+ struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw;
+
+ if (cc->key_size <= (cc->iv_size + TCW_WHITENING_SIZE)) {
+ ti->error = "Wrong key size for TCW";
+ return -EINVAL;
+ }
+
+ tcw->crc32_tfm = crypto_alloc_shash("crc32", 0, 0);
+ if (IS_ERR(tcw->crc32_tfm)) {
+ ti->error = "Error initializing CRC32 in TCW";
+ return PTR_ERR(tcw->crc32_tfm);
+ }
+
+ tcw->iv_seed = kzalloc(cc->iv_size, GFP_KERNEL);
+ tcw->whitening = kzalloc(TCW_WHITENING_SIZE, GFP_KERNEL);
+ if (!tcw->iv_seed || !tcw->whitening) {
+ crypt_iv_tcw_dtr(cc);
+ ti->error = "Error allocating seed storage in TCW";
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static int crypt_iv_tcw_init(struct crypt_config *cc)
+{
+ struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw;
+ int key_offset = cc->key_size - cc->iv_size - TCW_WHITENING_SIZE;
+
+ memcpy(tcw->iv_seed, &cc->key[key_offset], cc->iv_size);
+ memcpy(tcw->whitening, &cc->key[key_offset + cc->iv_size],
+ TCW_WHITENING_SIZE);
+
+ return 0;
+}
+
+static int crypt_iv_tcw_wipe(struct crypt_config *cc)
+{
+ struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw;
+
+ memset(tcw->iv_seed, 0, cc->iv_size);
+ memset(tcw->whitening, 0, TCW_WHITENING_SIZE);
+
+ return 0;
+}
+
+static int crypt_iv_tcw_whitening(struct crypt_config *cc,
+ struct dm_crypt_request *dmreq,
+ u8 *data)
+{
+ struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw;
+ u64 sector = cpu_to_le64((u64)dmreq->iv_sector);
+ u8 buf[TCW_WHITENING_SIZE];
+ struct {
+ struct shash_desc desc;
+ char ctx[crypto_shash_descsize(tcw->crc32_tfm)];
+ } sdesc;
+ int i, r;
+
+ /* xor whitening with sector number */
+ memcpy(buf, tcw->whitening, TCW_WHITENING_SIZE);
+ crypto_xor(buf, (u8 *)&sector, 8);
+ crypto_xor(&buf[8], (u8 *)&sector, 8);
+
+ /* calculate crc32 for every 32bit part and xor it */
+ sdesc.desc.tfm = tcw->crc32_tfm;
+ sdesc.desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
+ for (i = 0; i < 4; i++) {
+ r = crypto_shash_init(&sdesc.desc);
+ if (r)
+ goto out;
+ r = crypto_shash_update(&sdesc.desc, &buf[i * 4], 4);
+ if (r)
+ goto out;
+ r = crypto_shash_final(&sdesc.desc, &buf[i * 4]);
+ if (r)
+ goto out;
+ }
+ crypto_xor(&buf[0], &buf[12], 4);
+ crypto_xor(&buf[4], &buf[8], 4);
+
+ /* apply whitening (8 bytes) to whole sector */
+ for (i = 0; i < ((1 << SECTOR_SHIFT) / 8); i++)
+ crypto_xor(data + i * 8, buf, 8);
+out:
+ memset(buf, 0, sizeof(buf));
+ return r;
+}
+
+static int crypt_iv_tcw_gen(struct crypt_config *cc, u8 *iv,
+ struct dm_crypt_request *dmreq)
+{
+ struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw;
+ u64 sector = cpu_to_le64((u64)dmreq->iv_sector);
+ u8 *src;
+ int r = 0;
+
+ /* Remove whitening from ciphertext */
+ if (bio_data_dir(dmreq->ctx->bio_in) != WRITE) {
+ src = kmap_atomic(sg_page(&dmreq->sg_in));
+ r = crypt_iv_tcw_whitening(cc, dmreq, src + dmreq->sg_in.offset);
+ kunmap_atomic(src);
+ }
+
+ /* Calculate IV */
+ memcpy(iv, tcw->iv_seed, cc->iv_size);
+ crypto_xor(iv, (u8 *)&sector, 8);
+ if (cc->iv_size > 8)
+ crypto_xor(&iv[8], (u8 *)&sector, cc->iv_size - 8);
+
+ return r;
+}
+
+static int crypt_iv_tcw_post(struct crypt_config *cc, u8 *iv,
+ struct dm_crypt_request *dmreq)
+{
+ u8 *dst;
+ int r;
+
+ if (bio_data_dir(dmreq->ctx->bio_in) != WRITE)
+ return 0;
+
+ /* Apply whitening on ciphertext */
+ dst = kmap_atomic(sg_page(&dmreq->sg_out));
+ r = crypt_iv_tcw_whitening(cc, dmreq, dst + dmreq->sg_out.offset);
+ kunmap_atomic(dst);
+
+ return r;
+}
+
static struct crypt_iv_operations crypt_iv_plain_ops = {
.generator = crypt_iv_plain_gen
};
@@ -643,6 +810,15 @@ static struct crypt_iv_operations crypt_iv_lmk_ops = {
.post = crypt_iv_lmk_post
};
+static struct crypt_iv_operations crypt_iv_tcw_ops = {
+ .ctr = crypt_iv_tcw_ctr,
+ .dtr = crypt_iv_tcw_dtr,
+ .init = crypt_iv_tcw_init,
+ .wipe = crypt_iv_tcw_wipe,
+ .generator = crypt_iv_tcw_gen,
+ .post = crypt_iv_tcw_post
+};
+
static void crypt_convert_init(struct crypt_config *cc,
struct convert_context *ctx,
struct bio *bio_out, struct bio *bio_in,
@@ -1274,9 +1450,12 @@ static int crypt_alloc_tfms(struct crypt_config *cc, char *ciphermode)
static int crypt_setkey_allcpus(struct crypt_config *cc)
{
- unsigned subkey_size = cc->key_size >> ilog2(cc->tfms_count);
+ unsigned subkey_size;
int err = 0, i, r;
+ /* Ignore extra keys (which are used for IV etc) */
+ subkey_size = (cc->key_size - cc->key_extra_size) >> ilog2(cc->tfms_count);
+
for (i = 0; i < cc->tfms_count; i++) {
r = crypto_ablkcipher_setkey(cc->tfms[i],
cc->key + (i * subkey_size),
@@ -1409,6 +1588,7 @@ static int crypt_ctr_cipher(struct dm_target *ti,
return -EINVAL;
}
cc->key_parts = cc->tfms_count;
+ cc->key_extra_size = 0;
cc->cipher = kstrdup(cipher, GFP_KERNEL);
if (!cc->cipher)
@@ -1460,13 +1640,6 @@ static int crypt_ctr_cipher(struct dm_target *ti,
goto bad;
}
- /* Initialize and set key */
- ret = crypt_set_key(cc, key);
- if (ret < 0) {
- ti->error = "Error decoding and setting key";
- goto bad;
- }
-
/* Initialize IV */
cc->iv_size = crypto_ablkcipher_ivsize(any_tfm(cc));
if (cc->iv_size)
@@ -1493,18 +1666,33 @@ static int crypt_ctr_cipher(struct dm_target *ti,
cc->iv_gen_ops = &crypt_iv_null_ops;
else if (strcmp(ivmode, "lmk") == 0) {
cc->iv_gen_ops = &crypt_iv_lmk_ops;
- /* Version 2 and 3 is recognised according
+ /*
+ * Version 2 and 3 is recognised according
* to length of provided multi-key string.
* If present (version 3), last key is used as IV seed.
+ * All keys (including IV seed) are always the same size.
*/
- if (cc->key_size % cc->key_parts)
+ if (cc->key_size % cc->key_parts) {
cc->key_parts++;
+ cc->key_extra_size = cc->key_size / cc->key_parts;
+ }
+ } else if (strcmp(ivmode, "tcw") == 0) {
+ cc->iv_gen_ops = &crypt_iv_tcw_ops;
+ cc->key_parts += 2; /* IV + whitening */
+ cc->key_extra_size = cc->iv_size + TCW_WHITENING_SIZE;
} else {
ret = -EINVAL;
ti->error = "Invalid IV mode";
goto bad;
}
+ /* Initialize and set key */
+ ret = crypt_set_key(cc, key);
+ if (ret < 0) {
+ ti->error = "Error decoding and setting key";
+ goto bad;
+ }
+
/* Allocate IV */
if (cc->iv_gen_ops && cc->iv_gen_ops->ctr) {
ret = cc->iv_gen_ops->ctr(cc, ti, ivopts);
@@ -1817,7 +2005,7 @@ static int crypt_iterate_devices(struct dm_target *ti,
static struct target_type crypt_target = {
.name = "crypt",
- .version = {1, 12, 1},
+ .version = {1, 13, 0},
.module = THIS_MODULE,
.ctr = crypt_ctr,
.dtr = crypt_dtr,
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index afe08146f73e..51521429fb59 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -57,7 +57,7 @@ struct vers_iter {
static struct list_head _name_buckets[NUM_BUCKETS];
static struct list_head _uuid_buckets[NUM_BUCKETS];
-static void dm_hash_remove_all(int keep_open_devices);
+static void dm_hash_remove_all(bool keep_open_devices, bool mark_deferred, bool only_deferred);
/*
* Guards access to both hash tables.
@@ -86,7 +86,7 @@ static int dm_hash_init(void)
static void dm_hash_exit(void)
{
- dm_hash_remove_all(0);
+ dm_hash_remove_all(false, false, false);
}
/*-----------------------------------------------------------------
@@ -276,7 +276,7 @@ static struct dm_table *__hash_remove(struct hash_cell *hc)
return table;
}
-static void dm_hash_remove_all(int keep_open_devices)
+static void dm_hash_remove_all(bool keep_open_devices, bool mark_deferred, bool only_deferred)
{
int i, dev_skipped;
struct hash_cell *hc;
@@ -293,7 +293,8 @@ retry:
md = hc->md;
dm_get(md);
- if (keep_open_devices && dm_lock_for_deletion(md)) {
+ if (keep_open_devices &&
+ dm_lock_for_deletion(md, mark_deferred, only_deferred)) {
dm_put(md);
dev_skipped++;
continue;
@@ -450,6 +451,11 @@ static struct mapped_device *dm_hash_rename(struct dm_ioctl *param,
return md;
}
+void dm_deferred_remove(void)
+{
+ dm_hash_remove_all(true, false, true);
+}
+
/*-----------------------------------------------------------------
* Implementation of the ioctl commands
*---------------------------------------------------------------*/
@@ -461,7 +467,7 @@ typedef int (*ioctl_fn)(struct dm_ioctl *param, size_t param_size);
static int remove_all(struct dm_ioctl *param, size_t param_size)
{
- dm_hash_remove_all(1);
+ dm_hash_remove_all(true, !!(param->flags & DM_DEFERRED_REMOVE), false);
param->data_size = 0;
return 0;
}
@@ -683,6 +689,9 @@ static void __dev_status(struct mapped_device *md, struct dm_ioctl *param)
if (dm_suspended_md(md))
param->flags |= DM_SUSPEND_FLAG;
+ if (dm_test_deferred_remove_flag(md))
+ param->flags |= DM_DEFERRED_REMOVE;
+
param->dev = huge_encode_dev(disk_devt(disk));
/*
@@ -832,8 +841,13 @@ static int dev_remove(struct dm_ioctl *param, size_t param_size)
/*
* Ensure the device is not open and nothing further can open it.
*/
- r = dm_lock_for_deletion(md);
+ r = dm_lock_for_deletion(md, !!(param->flags & DM_DEFERRED_REMOVE), false);
if (r) {
+ if (r == -EBUSY && param->flags & DM_DEFERRED_REMOVE) {
+ up_write(&_hash_lock);
+ dm_put(md);
+ return 0;
+ }
DMDEBUG_LIMIT("unable to remove open device %s", hc->name);
up_write(&_hash_lock);
dm_put(md);
@@ -848,6 +862,8 @@ static int dev_remove(struct dm_ioctl *param, size_t param_size)
dm_table_destroy(t);
}
+ param->flags &= ~DM_DEFERRED_REMOVE;
+
if (!dm_kobject_uevent(md, KOBJ_REMOVE, param->event_nr))
param->flags |= DM_UEVENT_GENERATED_FLAG;
@@ -1469,6 +1485,14 @@ static int message_for_md(struct mapped_device *md, unsigned argc, char **argv,
if (**argv != '@')
return 2; /* no '@' prefix, deliver to target */
+ if (!strcasecmp(argv[0], "@cancel_deferred_remove")) {
+ if (argc != 1) {
+ DMERR("Invalid arguments for @cancel_deferred_remove");
+ return -EINVAL;
+ }
+ return dm_cancel_deferred_remove(md);
+ }
+
r = dm_stats_message(md, argc, argv, result, maxlen);
if (r < 2)
return r;
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index de570a558764..6eb9dc9ef8f3 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -87,6 +87,7 @@ struct multipath {
unsigned queue_if_no_path:1; /* Queue I/O if last path fails? */
unsigned saved_queue_if_no_path:1; /* Saved state during suspension */
unsigned retain_attached_hw_handler:1; /* If there's already a hw_handler present, don't change it. */
+ unsigned pg_init_disabled:1; /* pg_init is not currently allowed */
unsigned pg_init_retries; /* Number of times to retry pg_init */
unsigned pg_init_count; /* Number of times pg_init called */
@@ -390,13 +391,16 @@ static int map_io(struct multipath *m, struct request *clone,
if (was_queued)
m->queue_size--;
- if ((pgpath && m->queue_io) ||
- (!pgpath && m->queue_if_no_path)) {
+ if (m->pg_init_required) {
+ if (!m->pg_init_in_progress)
+ queue_work(kmultipathd, &m->process_queued_ios);
+ r = DM_MAPIO_REQUEUE;
+ } else if ((pgpath && m->queue_io) ||
+ (!pgpath && m->queue_if_no_path)) {
/* Queue for the daemon to resubmit */
list_add_tail(&clone->queuelist, &m->queued_ios);
m->queue_size++;
- if ((m->pg_init_required && !m->pg_init_in_progress) ||
- !m->queue_io)
+ if (!m->queue_io)
queue_work(kmultipathd, &m->process_queued_ios);
pgpath = NULL;
r = DM_MAPIO_SUBMITTED;
@@ -497,7 +501,8 @@ static void process_queued_ios(struct work_struct *work)
(!pgpath && !m->queue_if_no_path))
must_queue = 0;
- if (m->pg_init_required && !m->pg_init_in_progress && pgpath)
+ if (m->pg_init_required && !m->pg_init_in_progress && pgpath &&
+ !m->pg_init_disabled)
__pg_init_all_paths(m);
spin_unlock_irqrestore(&m->lock, flags);
@@ -942,10 +947,20 @@ static void multipath_wait_for_pg_init_completion(struct multipath *m)
static void flush_multipath_work(struct multipath *m)
{
+ unsigned long flags;
+
+ spin_lock_irqsave(&m->lock, flags);
+ m->pg_init_disabled = 1;
+ spin_unlock_irqrestore(&m->lock, flags);
+
flush_workqueue(kmpath_handlerd);
multipath_wait_for_pg_init_completion(m);
flush_workqueue(kmultipathd);
flush_work(&m->trigger_event);
+
+ spin_lock_irqsave(&m->lock, flags);
+ m->pg_init_disabled = 0;
+ spin_unlock_irqrestore(&m->lock, flags);
}
static void multipath_dtr(struct dm_target *ti)
@@ -1164,7 +1179,7 @@ static int pg_init_limit_reached(struct multipath *m, struct pgpath *pgpath)
spin_lock_irqsave(&m->lock, flags);
- if (m->pg_init_count <= m->pg_init_retries)
+ if (m->pg_init_count <= m->pg_init_retries && !m->pg_init_disabled)
m->pg_init_required = 1;
else
limit_reached = 1;
@@ -1665,6 +1680,11 @@ static int multipath_busy(struct dm_target *ti)
spin_lock_irqsave(&m->lock, flags);
+ /* pg_init in progress, requeue until done */
+ if (m->pg_init_in_progress) {
+ busy = 1;
+ goto out;
+ }
/* Guess which priority_group will be used at next mapping time */
if (unlikely(!m->current_pgpath && m->next_pg))
pg = m->next_pg;
@@ -1714,7 +1734,7 @@ out:
*---------------------------------------------------------------*/
static struct target_type multipath_target = {
.name = "multipath",
- .version = {1, 5, 1},
+ .version = {1, 6, 0},
.module = THIS_MODULE,
.ctr = multipath_ctr,
.dtr = multipath_dtr,
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 8f8783533ac7..465f08ca62b1 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -545,14 +545,28 @@ static int adjoin(struct dm_table *table, struct dm_target *ti)
/*
* Used to dynamically allocate the arg array.
+ *
+ * We do first allocation with GFP_NOIO because dm-mpath and dm-thin must
+ * process messages even if some device is suspended. These messages have a
+ * small fixed number of arguments.
+ *
+ * On the other hand, dm-switch needs to process bulk data using messages and
+ * excessive use of GFP_NOIO could cause trouble.
*/
static char **realloc_argv(unsigned *array_size, char **old_argv)
{
char **argv;
unsigned new_size;
+ gfp_t gfp;
- new_size = *array_size ? *array_size * 2 : 64;
- argv = kmalloc(new_size * sizeof(*argv), GFP_KERNEL);
+ if (*array_size) {
+ new_size = *array_size * 2;
+ gfp = GFP_KERNEL;
+ } else {
+ new_size = 8;
+ gfp = GFP_NOIO;
+ }
+ argv = kmalloc(new_size * sizeof(*argv), gfp);
if (argv) {
memcpy(argv, old_argv, *array_size * sizeof(*argv));
*array_size = new_size;
@@ -1548,8 +1562,11 @@ int dm_table_resume_targets(struct dm_table *t)
continue;
r = ti->type->preresume(ti);
- if (r)
+ if (r) {
+ DMERR("%s: %s: preresume failed, error = %d",
+ dm_device_name(t->md), ti->type->name, r);
return r;
+ }
}
for (i = 0; i < t->num_targets; i++) {
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index b3e26c7d1417..0704c523a76b 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -49,6 +49,11 @@ static unsigned int _major = 0;
static DEFINE_IDR(_minor_idr);
static DEFINE_SPINLOCK(_minor_lock);
+
+static void do_deferred_remove(struct work_struct *w);
+
+static DECLARE_WORK(deferred_remove_work, do_deferred_remove);
+
/*
* For bio-based dm.
* One of these is allocated per bio.
@@ -116,6 +121,7 @@ EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo);
#define DMF_DELETING 4
#define DMF_NOFLUSH_SUSPENDING 5
#define DMF_MERGE_IS_OPTIONAL 6
+#define DMF_DEFERRED_REMOVE 7
/*
* A dummy definition to make RCU happy.
@@ -299,6 +305,8 @@ out_free_io_cache:
static void local_exit(void)
{
+ flush_scheduled_work();
+
kmem_cache_destroy(_rq_tio_cache);
kmem_cache_destroy(_io_cache);
unregister_blkdev(_major, _name);
@@ -404,7 +412,10 @@ static void dm_blk_close(struct gendisk *disk, fmode_t mode)
spin_lock(&_minor_lock);
- atomic_dec(&md->open_count);
+ if (atomic_dec_and_test(&md->open_count) &&
+ (test_bit(DMF_DEFERRED_REMOVE, &md->flags)))
+ schedule_work(&deferred_remove_work);
+
dm_put(md);
spin_unlock(&_minor_lock);
@@ -418,14 +429,18 @@ int dm_open_count(struct mapped_device *md)
/*
* Guarantees nothing is using the device before it's deleted.
*/
-int dm_lock_for_deletion(struct mapped_device *md)
+int dm_lock_for_deletion(struct mapped_device *md, bool mark_deferred, bool only_deferred)
{
int r = 0;
spin_lock(&_minor_lock);
- if (dm_open_count(md))
+ if (dm_open_count(md)) {
r = -EBUSY;
+ if (mark_deferred)
+ set_bit(DMF_DEFERRED_REMOVE, &md->flags);
+ } else if (only_deferred && !test_bit(DMF_DEFERRED_REMOVE, &md->flags))
+ r = -EEXIST;
else
set_bit(DMF_DELETING, &md->flags);
@@ -434,6 +449,27 @@ int dm_lock_for_deletion(struct mapped_device *md)
return r;
}
+int dm_cancel_deferred_remove(struct mapped_device *md)
+{
+ int r = 0;
+
+ spin_lock(&_minor_lock);
+
+ if (test_bit(DMF_DELETING, &md->flags))
+ r = -EBUSY;
+ else
+ clear_bit(DMF_DEFERRED_REMOVE, &md->flags);
+
+ spin_unlock(&_minor_lock);
+
+ return r;
+}
+
+static void do_deferred_remove(struct work_struct *w)
+{
+ dm_deferred_remove();
+}
+
sector_t dm_get_size(struct mapped_device *md)
{
return get_capacity(md->disk);
@@ -2894,6 +2930,11 @@ int dm_suspended_md(struct mapped_device *md)
return test_bit(DMF_SUSPENDED, &md->flags);
}
+int dm_test_deferred_remove_flag(struct mapped_device *md)
+{
+ return test_bit(DMF_DEFERRED_REMOVE, &md->flags);
+}
+
int dm_suspended(struct dm_target *ti)
{
return dm_suspended_md(dm_table_get_md(ti->table));
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index 1d1ad7b7e527..c57ba550f69e 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -129,6 +129,16 @@ int dm_deleting_md(struct mapped_device *md);
int dm_suspended_md(struct mapped_device *md);
/*
+ * Test if the device is scheduled for deferred remove.
+ */
+int dm_test_deferred_remove_flag(struct mapped_device *md);
+
+/*
+ * Try to remove devices marked for deferred removal.
+ */
+void dm_deferred_remove(void);
+
+/*
* The device-mapper can be driven through one of two interfaces;
* ioctl or filesystem, depending which patch you have applied.
*/
@@ -158,7 +168,8 @@ void dm_stripe_exit(void);
void dm_destroy(struct mapped_device *md);
void dm_destroy_immediate(struct mapped_device *md);
int dm_open_count(struct mapped_device *md);
-int dm_lock_for_deletion(struct mapped_device *md);
+int dm_lock_for_deletion(struct mapped_device *md, bool mark_deferred, bool only_deferred);
+int dm_cancel_deferred_remove(struct mapped_device *md);
int dm_request_based(struct mapped_device *md);
sector_t dm_get_size(struct mapped_device *md);
struct dm_stats *dm_get_stats(struct mapped_device *md);
diff --git a/drivers/md/persistent-data/dm-array.c b/drivers/md/persistent-data/dm-array.c
index 172147eb1d40..af96e24ec328 100644
--- a/drivers/md/persistent-data/dm-array.c
+++ b/drivers/md/persistent-data/dm-array.c
@@ -509,15 +509,18 @@ static int grow_add_tail_block(struct resize *resize)
static int grow_needs_more_blocks(struct resize *resize)
{
int r;
+ unsigned old_nr_blocks = resize->old_nr_full_blocks;
if (resize->old_nr_entries_in_last_block > 0) {
+ old_nr_blocks++;
+
r = grow_extend_tail_block(resize, resize->max_entries);
if (r)
return r;
}
r = insert_full_ablocks(resize->info, resize->size_of_block,
- resize->old_nr_full_blocks,
+ old_nr_blocks,
resize->new_nr_full_blocks,
resize->max_entries, resize->value,
&resize->root);
diff --git a/drivers/md/persistent-data/dm-space-map-disk.c b/drivers/md/persistent-data/dm-space-map-disk.c
index e735a6d5a793..cfbf9617e465 100644
--- a/drivers/md/persistent-data/dm-space-map-disk.c
+++ b/drivers/md/persistent-data/dm-space-map-disk.c
@@ -140,26 +140,10 @@ static int sm_disk_inc_block(struct dm_space_map *sm, dm_block_t b)
static int sm_disk_dec_block(struct dm_space_map *sm, dm_block_t b)
{
- int r;
- uint32_t old_count;
enum allocation_event ev;
struct sm_disk *smd = container_of(sm, struct sm_disk, sm);
- r = sm_ll_dec(&smd->ll, b, &ev);
- if (!r && (ev == SM_FREE)) {
- /*
- * It's only free if it's also free in the last
- * transaction.
- */
- r = sm_ll_lookup(&smd->old_ll, b, &old_count);
- if (r)
- return r;
-
- if (!old_count)
- smd->nr_allocated_this_transaction--;
- }
-
- return r;
+ return sm_ll_dec(&smd->ll, b, &ev);
}
static int sm_disk_new_block(struct dm_space_map *sm, dm_block_t *b)