From 414dd67d50a6b9a11af23bbb68e8fae13d726c8b Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Wed, 20 Mar 2013 17:21:25 +0000 Subject: dm cache: avoid 64 bit division on 32 bit Squash various 32bit link errors. >> on i386: >> drivers/built-in.o: In function `is_discarded_oblock': >> dm-cache-target.c:(.text+0x1ea28e): undefined reference to `__udivdi3' ... Reported-by: Randy Dunlap Signed-off-by: Joe Thornber Signed-off-by: Alasdair G Kergon --- drivers/md/dm-cache-target.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) (limited to 'drivers/md/dm-cache-target.c') diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 0f4e84b15c30..5ad227f0cea3 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -158,7 +158,7 @@ struct cache { /* * origin_blocks entries, discarded if set. */ - sector_t discard_block_size; /* a power of 2 times sectors per block */ + uint32_t discard_block_size; /* a power of 2 times sectors per block */ dm_dblock_t discard_nr_blocks; unsigned long *discard_bitset; @@ -412,17 +412,24 @@ static bool block_size_is_power_of_two(struct cache *cache) return cache->sectors_per_block_shift >= 0; } +static dm_block_t block_div(dm_block_t b, uint32_t n) +{ + do_div(b, n); + + return b; +} + static dm_dblock_t oblock_to_dblock(struct cache *cache, dm_oblock_t oblock) { - sector_t discard_blocks = cache->discard_block_size; + uint32_t discard_blocks = cache->discard_block_size; dm_block_t b = from_oblock(oblock); if (!block_size_is_power_of_two(cache)) - (void) sector_div(discard_blocks, cache->sectors_per_block); + discard_blocks = discard_blocks / cache->sectors_per_block; else discard_blocks >>= cache->sectors_per_block_shift; - (void) sector_div(b, discard_blocks); + b = block_div(b, discard_blocks); return to_dblock(b); } @@ -1002,7 +1009,7 @@ static void process_discard_bio(struct cache *cache, struct bio *bio) dm_block_t end_block = bio->bi_sector + bio_sectors(bio); dm_block_t b; - (void) sector_div(end_block, cache->discard_block_size); + end_block = block_div(end_block, cache->discard_block_size); for (b = start_block; b < end_block; b++) set_discard(cache, to_dblock(b)); @@ -1835,7 +1842,7 @@ static int cache_create(struct cache_args *ca, struct cache **result) /* FIXME: factor out this whole section */ origin_blocks = cache->origin_sectors = ca->origin_sectors; - (void) sector_div(origin_blocks, ca->block_size); + origin_blocks = block_div(origin_blocks, ca->block_size); cache->origin_blocks = to_oblock(origin_blocks); cache->sectors_per_block = ca->block_size; @@ -1848,7 +1855,7 @@ static int cache_create(struct cache_args *ca, struct cache **result) dm_block_t cache_size = ca->cache_sectors; cache->sectors_per_block_shift = -1; - (void) sector_div(cache_size, ca->block_size); + cache_size = block_div(cache_size, ca->block_size); cache->cache_size = to_cblock(cache_size); } else { cache->sectors_per_block_shift = __ffs(ca->block_size); -- cgit v1.2.3 From 617a0b89da4898d4cc990c9eb4bc9c0591c538a5 Mon Sep 17 00:00:00 2001 From: Heinz Mauelshagen Date: Wed, 20 Mar 2013 17:21:26 +0000 Subject: dm cache: detect cache_create failure Return error if cache_create() fails. A missing return check made cache_ctr continue even after an error in cache_create() resulting in the cache object being destroyed. So a simple failure like an odd number of cache policy config value arguments would result in an oops. Signed-off-by: Heinz Mauelshagen Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- drivers/md/dm-cache-target.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'drivers/md/dm-cache-target.c') diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 5ad227f0cea3..76cc910557f0 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -2009,6 +2009,8 @@ static int cache_ctr(struct dm_target *ti, unsigned argc, char **argv) goto out; r = cache_create(ca, &cache); + if (r) + goto out; r = copy_ctr_args(cache, argc - 3, (const char **)argv + 3); if (r) { -- cgit v1.2.3 From b978440b8db901aba0c4cd38c7c841c9b5cd9a7e Mon Sep 17 00:00:00 2001 From: Heinz Mauelshagen Date: Wed, 20 Mar 2013 17:21:26 +0000 Subject: dm cache: avoid calling policy destructor twice on error If the cache policy's config values are not able to be set we must set the policy to NULL after destroying it in create_cache_policy() so we don't attempt to destroy it a second time later. Signed-off-by: Heinz Mauelshagen Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- drivers/md/dm-cache-target.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'drivers/md/dm-cache-target.c') diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 76cc910557f0..79ac8603644d 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -1763,8 +1763,11 @@ static int create_cache_policy(struct cache *cache, struct cache_args *ca, } r = set_config_values(cache->policy, ca->policy_argc, ca->policy_argv); - if (r) + if (r) { + *error = "Error setting cache policy's config values"; dm_cache_policy_destroy(cache->policy); + cache->policy = NULL; + } return r; } -- cgit v1.2.3 From e2e74d617eadc15f601983270c4f4a6935c5a943 Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Wed, 20 Mar 2013 17:21:27 +0000 Subject: dm cache: fix race in writethrough implementation We have found a race in the optimisation used in the dm cache writethrough implementation. Currently, dm core sends the cache target two bios, one for the origin device and one for the cache device and these are processed in parallel. This patch avoids the race by changing the code back to a simpler (slower) implementation which processes the two writes in series, one after the other, until we can develop a complete fix for the problem. When the cache is in writethrough mode it needs to send WRITE bios to both the origin and cache devices. Previously we've been implementing this by having dm core query the cache target on every write to find out how many copies of the bio it wants. The cache will ask for two bios if the block is in the cache, and one otherwise. Then main problem with this is it's racey. At the time this check is made the bio hasn't yet been submitted and so isn't being taken into account when quiescing a block for migration (promotion or demotion). This means a single bio may be submitted when two were needed because the block has since been promoted to the cache (catastrophic), or two bios where only one is needed (harmless). I really don't want to start entering bios into the quiescing system (deferred_set) in the get_num_write_bios callback. Instead this patch simplifies things; only one bio is submitted by the core, this is first written to the origin and then the cache device in series. Obviously this will have a latency impact. deferred_writethrough_bios is introduced to record bios that must be later issued to the cache device from the worker thread. This deferred submission, after the origin bio completes, is required given that we're in interrupt context (writethrough_endio). Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- drivers/md/dm-cache-target.c | 138 +++++++++++++++++++++++++++---------------- 1 file changed, 88 insertions(+), 50 deletions(-) (limited to 'drivers/md/dm-cache-target.c') diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 79ac8603644d..ff267db60025 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -142,6 +142,7 @@ struct cache { spinlock_t lock; struct bio_list deferred_bios; struct bio_list deferred_flush_bios; + struct bio_list deferred_writethrough_bios; struct list_head quiesced_migrations; struct list_head completed_migrations; struct list_head need_commit_migrations; @@ -199,6 +200,11 @@ struct per_bio_data { bool tick:1; unsigned req_nr:2; struct dm_deferred_entry *all_io_entry; + + /* writethrough fields */ + struct cache *cache; + dm_cblock_t cblock; + bio_end_io_t *saved_bi_end_io; }; struct dm_cache_migration { @@ -616,6 +622,56 @@ static void issue(struct cache *cache, struct bio *bio) spin_unlock_irqrestore(&cache->lock, flags); } +static void defer_writethrough_bio(struct cache *cache, struct bio *bio) +{ + unsigned long flags; + + spin_lock_irqsave(&cache->lock, flags); + bio_list_add(&cache->deferred_writethrough_bios, bio); + spin_unlock_irqrestore(&cache->lock, flags); + + wake_worker(cache); +} + +static void writethrough_endio(struct bio *bio, int err) +{ + struct per_bio_data *pb = get_per_bio_data(bio); + bio->bi_end_io = pb->saved_bi_end_io; + + if (err) { + bio_endio(bio, err); + return; + } + + remap_to_cache(pb->cache, bio, pb->cblock); + + /* + * We can't issue this bio directly, since we're in interrupt + * context. So it get's put on a bio list for processing by the + * worker thread. + */ + defer_writethrough_bio(pb->cache, bio); +} + +/* + * When running in writethrough mode we need to send writes to clean blocks + * to both the cache and origin devices. In future we'd like to clone the + * bio and send them in parallel, but for now we're doing them in + * series as this is easier. + */ +static void remap_to_origin_then_cache(struct cache *cache, struct bio *bio, + dm_oblock_t oblock, dm_cblock_t cblock) +{ + struct per_bio_data *pb = get_per_bio_data(bio); + + pb->cache = cache; + pb->cblock = cblock; + pb->saved_bi_end_io = bio->bi_end_io; + bio->bi_end_io = writethrough_endio; + + remap_to_origin_clear_discard(pb->cache, bio, oblock); +} + /*---------------------------------------------------------------- * Migration processing * @@ -1077,14 +1133,9 @@ static void process_bio(struct cache *cache, struct prealloc *structs, inc_hit_counter(cache, bio); pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds); - if (is_writethrough_io(cache, bio, lookup_result.cblock)) { - /* - * No need to mark anything dirty in write through mode. - */ - pb->req_nr == 0 ? - remap_to_cache(cache, bio, lookup_result.cblock) : - remap_to_origin_clear_discard(cache, bio, block); - } else + if (is_writethrough_io(cache, bio, lookup_result.cblock)) + remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock); + else remap_to_cache_dirty(cache, bio, block, lookup_result.cblock); issue(cache, bio); @@ -1093,17 +1144,8 @@ static void process_bio(struct cache *cache, struct prealloc *structs, case POLICY_MISS: inc_miss_counter(cache, bio); pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds); - - if (pb->req_nr != 0) { - /* - * This is a duplicate writethrough io that is no - * longer needed because the block has been demoted. - */ - bio_endio(bio, 0); - } else { - remap_to_origin_clear_discard(cache, bio, block); - issue(cache, bio); - } + remap_to_origin_clear_discard(cache, bio, block); + issue(cache, bio); break; case POLICY_NEW: @@ -1224,6 +1266,23 @@ static void process_deferred_flush_bios(struct cache *cache, bool submit_bios) submit_bios ? generic_make_request(bio) : bio_io_error(bio); } +static void process_deferred_writethrough_bios(struct cache *cache) +{ + unsigned long flags; + struct bio_list bios; + struct bio *bio; + + bio_list_init(&bios); + + spin_lock_irqsave(&cache->lock, flags); + bio_list_merge(&bios, &cache->deferred_writethrough_bios); + bio_list_init(&cache->deferred_writethrough_bios); + spin_unlock_irqrestore(&cache->lock, flags); + + while ((bio = bio_list_pop(&bios))) + generic_make_request(bio); +} + static void writeback_some_dirty_blocks(struct cache *cache) { int r = 0; @@ -1320,6 +1379,7 @@ static int more_work(struct cache *cache) else return !bio_list_empty(&cache->deferred_bios) || !bio_list_empty(&cache->deferred_flush_bios) || + !bio_list_empty(&cache->deferred_writethrough_bios) || !list_empty(&cache->quiesced_migrations) || !list_empty(&cache->completed_migrations) || !list_empty(&cache->need_commit_migrations); @@ -1338,6 +1398,8 @@ static void do_worker(struct work_struct *ws) writeback_some_dirty_blocks(cache); + process_deferred_writethrough_bios(cache); + if (commit_if_needed(cache)) { process_deferred_flush_bios(cache, false); @@ -1803,8 +1865,6 @@ static sector_t calculate_discard_block_size(sector_t cache_block_size, #define DEFAULT_MIGRATION_THRESHOLD (2048 * 100) -static unsigned cache_num_write_bios(struct dm_target *ti, struct bio *bio); - static int cache_create(struct cache_args *ca, struct cache **result) { int r = 0; @@ -1831,9 +1891,6 @@ static int cache_create(struct cache_args *ca, struct cache **result) memcpy(&cache->features, &ca->features, sizeof(cache->features)); - if (cache->features.write_through) - ti->num_write_bios = cache_num_write_bios; - cache->callbacks.congested_fn = cache_is_congested; dm_table_add_target_callbacks(ti->table, &cache->callbacks); @@ -1883,6 +1940,7 @@ static int cache_create(struct cache_args *ca, struct cache **result) spin_lock_init(&cache->lock); bio_list_init(&cache->deferred_bios); bio_list_init(&cache->deferred_flush_bios); + bio_list_init(&cache->deferred_writethrough_bios); INIT_LIST_HEAD(&cache->quiesced_migrations); INIT_LIST_HEAD(&cache->completed_migrations); INIT_LIST_HEAD(&cache->need_commit_migrations); @@ -2028,20 +2086,6 @@ out: return r; } -static unsigned cache_num_write_bios(struct dm_target *ti, struct bio *bio) -{ - int r; - struct cache *cache = ti->private; - dm_oblock_t block = get_bio_block(cache, bio); - dm_cblock_t cblock; - - r = policy_lookup(cache->policy, block, &cblock); - if (r < 0) - return 2; /* assume the worst */ - - return (!r && !is_dirty(cache, cblock)) ? 2 : 1; -} - static int cache_map(struct dm_target *ti, struct bio *bio) { struct cache *cache = ti->private; @@ -2109,18 +2153,12 @@ static int cache_map(struct dm_target *ti, struct bio *bio) inc_hit_counter(cache, bio); pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds); - if (is_writethrough_io(cache, bio, lookup_result.cblock)) { - /* - * No need to mark anything dirty in write through mode. - */ - pb->req_nr == 0 ? - remap_to_cache(cache, bio, lookup_result.cblock) : - remap_to_origin_clear_discard(cache, bio, block); - cell_defer(cache, cell, false); - } else { + if (is_writethrough_io(cache, bio, lookup_result.cblock)) + remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock); + else remap_to_cache_dirty(cache, bio, block, lookup_result.cblock); - cell_defer(cache, cell, false); - } + + cell_defer(cache, cell, false); break; case POLICY_MISS: @@ -2547,7 +2585,7 @@ static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits) static struct target_type cache_target = { .name = "cache", - .version = {1, 0, 0}, + .version = {1, 1, 0}, .module = THIS_MODULE, .ctr = cache_ctr, .dtr = cache_dtr, -- cgit v1.2.3 From ea2dd8c1ed0becee9812cf0840a9cd553ed398fe Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Wed, 20 Mar 2013 17:21:28 +0000 Subject: dm cache: policy ignore hints if generated by different version When reading the dm cache metadata from disk, ignore the policy hints unless they were generated by the same major version number of the same policy module. The hints are considered to be private data belonging to the specific module that generated them and there is no requirement for them to make sense to different versions of the policy that generated them. Policy modules are all required to work fine if no previous hints are supplied (or if existing hints are lost). Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- drivers/md/dm-cache-metadata.c | 47 +++++++++++++++++++++++++++++++++--------- drivers/md/dm-cache-metadata.h | 2 +- drivers/md/dm-cache-target.c | 3 +-- 3 files changed, 39 insertions(+), 13 deletions(-) (limited to 'drivers/md/dm-cache-target.c') diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c index 74213d1f1db5..83e995fece88 100644 --- a/drivers/md/dm-cache-metadata.c +++ b/drivers/md/dm-cache-metadata.c @@ -863,18 +863,43 @@ struct thunk { bool hints_valid; }; +static bool policy_unchanged(struct dm_cache_metadata *cmd, + struct dm_cache_policy *policy) +{ + const char *policy_name = dm_cache_policy_get_name(policy); + const unsigned *policy_version = dm_cache_policy_get_version(policy); + size_t policy_hint_size = dm_cache_policy_get_hint_size(policy); + + /* + * Ensure policy names match. + */ + if (strncmp(cmd->policy_name, policy_name, sizeof(cmd->policy_name))) + return false; + + /* + * Ensure policy major versions match. + */ + if (cmd->policy_version[0] != policy_version[0]) + return false; + + /* + * Ensure policy hint sizes match. + */ + if (cmd->policy_hint_size != policy_hint_size) + return false; + + return true; +} + static bool hints_array_initialized(struct dm_cache_metadata *cmd) { return cmd->hint_root && cmd->policy_hint_size; } static bool hints_array_available(struct dm_cache_metadata *cmd, - const char *policy_name) + struct dm_cache_policy *policy) { - bool policy_names_match = !strncmp(cmd->policy_name, policy_name, - sizeof(cmd->policy_name)); - - return cmd->clean_when_opened && policy_names_match && + return cmd->clean_when_opened && policy_unchanged(cmd, policy) && hints_array_initialized(cmd); } @@ -908,7 +933,8 @@ static int __load_mapping(void *context, uint64_t cblock, void *leaf) return r; } -static int __load_mappings(struct dm_cache_metadata *cmd, const char *policy_name, +static int __load_mappings(struct dm_cache_metadata *cmd, + struct dm_cache_policy *policy, load_mapping_fn fn, void *context) { struct thunk thunk; @@ -918,18 +944,19 @@ static int __load_mappings(struct dm_cache_metadata *cmd, const char *policy_nam thunk.cmd = cmd; thunk.respect_dirty_flags = cmd->clean_when_opened; - thunk.hints_valid = hints_array_available(cmd, policy_name); + thunk.hints_valid = hints_array_available(cmd, policy); return dm_array_walk(&cmd->info, cmd->root, __load_mapping, &thunk); } -int dm_cache_load_mappings(struct dm_cache_metadata *cmd, const char *policy_name, +int dm_cache_load_mappings(struct dm_cache_metadata *cmd, + struct dm_cache_policy *policy, load_mapping_fn fn, void *context) { int r; down_read(&cmd->root_lock); - r = __load_mappings(cmd, policy_name, fn, context); + r = __load_mappings(cmd, policy, fn, context); up_read(&cmd->root_lock); return r; @@ -1085,7 +1112,7 @@ static int begin_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy *po (strlen(policy_name) > sizeof(cmd->policy_name) - 1)) return -EINVAL; - if (strcmp(cmd->policy_name, policy_name)) { + if (!policy_unchanged(cmd, policy)) { strncpy(cmd->policy_name, policy_name, sizeof(cmd->policy_name)); memcpy(cmd->policy_version, policy_version, sizeof(cmd->policy_version)); diff --git a/drivers/md/dm-cache-metadata.h b/drivers/md/dm-cache-metadata.h index 135864ea0eee..f45cef21f3d0 100644 --- a/drivers/md/dm-cache-metadata.h +++ b/drivers/md/dm-cache-metadata.h @@ -89,7 +89,7 @@ typedef int (*load_mapping_fn)(void *context, dm_oblock_t oblock, dm_cblock_t cblock, bool dirty, uint32_t hint, bool hint_valid); int dm_cache_load_mappings(struct dm_cache_metadata *cmd, - const char *policy_name, + struct dm_cache_policy *policy, load_mapping_fn fn, void *context); diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index ff267db60025..66120bd46d15 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -2369,8 +2369,7 @@ static int cache_preresume(struct dm_target *ti) } if (!cache->loaded_mappings) { - r = dm_cache_load_mappings(cache->cmd, - dm_cache_policy_get_name(cache->policy), + r = dm_cache_load_mappings(cache->cmd, cache->policy, load_mapping, cache); if (r) { DMERR("could not load cache mappings"); -- cgit v1.2.3