Diffstat (limited to 'drivers/md')
-rw-r--r--  drivers/md/bcache/bcache.h    | 20
-rw-r--r--  drivers/md/bcache/btree.c     |  5
-rw-r--r--  drivers/md/bcache/btree.h     | 18
-rw-r--r--  drivers/md/bcache/debug.c     |  3
-rw-r--r--  drivers/md/bcache/journal.c   |  2
-rw-r--r--  drivers/md/bcache/request.c   |  6
-rw-r--r--  drivers/md/bcache/super.c     | 48
-rw-r--r--  drivers/md/bcache/sysfs.c     | 61
-rw-r--r--  drivers/md/bcache/writeback.c | 30
-rw-r--r--  drivers/md/bcache/writeback.h | 12
-rw-r--r--  drivers/md/dm-core.h          |  5
-rw-r--r--  drivers/md/dm-rq.c            |  7
-rw-r--r--  drivers/md/dm-table.c         |  4
-rw-r--r--  drivers/md/dm.c               | 79
-rw-r--r--  drivers/md/md.c               |  7
-rw-r--r--  drivers/md/raid0.c            |  2
16 files changed, 231 insertions, 78 deletions
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index b61b83bbcfff..fdf75352e16a 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -627,6 +627,20 @@ struct cache_set {
struct bkey gc_done;
/*
+ * For automatic garbage collection after writeback completes, this
+ * variable is used as a bit field:
+ * - 0000 0001b (BCH_ENABLE_AUTO_GC): enable gc after writeback
+ * - 0000 0010b (BCH_DO_AUTO_GC): do gc after writeback
+ * This is an optimization for write requests that follow a completed
+ * writeback, where the read hit rate drops because clean data on the
+ * cache was discarded. Unless the user explicitly sets it via sysfs,
+ * it won't be enabled.
+ */
+#define BCH_ENABLE_AUTO_GC 1
+#define BCH_DO_AUTO_GC 2
+ uint8_t gc_after_writeback;
+
+ /*
* The allocation code needs gc_mark in struct bucket to be correct, but
* it's not while a gc is in progress. Protected by bucket_lock.
*/
@@ -658,7 +672,11 @@ struct cache_set {
/*
* A btree node on disk could have too many bsets for an iterator to fit
- * on the stack - have to dynamically allocate them
+ * on the stack - have to dynamically allocate them.
+ * bch_cache_set_alloc() will make sure the pool can allocate iterators
+ * with enough room to hold
+ * (sb.bucket_size / sb.block_size)
+ * btree_iter_sets, which is more than the static MAX_BSETS.
*/
mempool_t fill_iter;
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 3f4211b5cd33..23cb1dc7296b 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -207,6 +207,11 @@ void bch_btree_node_read_done(struct btree *b)
struct bset *i = btree_bset_first(b);
struct btree_iter *iter;
+ /*
+ * c->fill_iter can allocate an iterator with more memory space
+ * than the static MAX_BSETS.
+ * See the comment around cache_set->fill_iter.
+ */
iter = mempool_alloc(&b->c->fill_iter, GFP_NOIO);
iter->size = b->c->sb.bucket_size / b->c->sb.block_size;
iter->used = 0;
diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h
index a68d6c55783b..d1c72ef64edf 100644
--- a/drivers/md/bcache/btree.h
+++ b/drivers/md/bcache/btree.h
@@ -266,6 +266,24 @@ static inline void wake_up_gc(struct cache_set *c)
wake_up(&c->gc_wait);
}
+static inline void force_wake_up_gc(struct cache_set *c)
+{
+ /*
+ * The garbage collection thread only works when sectors_to_gc < 0,
+ * so calling wake_up_gc() won't start the gc thread if sectors_to_gc
+ * is not a negative value.
+ * Therefore sectors_to_gc is set to -1 here, before waking up the
+ * gc thread by calling wake_up_gc(). Then gc_should_run() gets a
+ * chance to permit the gc thread to run. "A chance" because before
+ * gc_should_run() is reached, c->sectors_to_gc may still be set to
+ * another positive value. So this routine doesn't guarantee that
+ * the gc thread will be woken up to run.
+ */
+ atomic_set(&c->sectors_to_gc, -1);
+ wake_up_gc(c);
+}
+
#define MAP_DONE 0
#define MAP_CONTINUE 1
diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c
index 8f448b9c96a1..8b123be05254 100644
--- a/drivers/md/bcache/debug.c
+++ b/drivers/md/bcache/debug.c
@@ -249,8 +249,7 @@ void bch_debug_init_cache_set(struct cache_set *c)
void bch_debug_exit(void)
{
- if (!IS_ERR_OR_NULL(bcache_debug))
- debugfs_remove_recursive(bcache_debug);
+ debugfs_remove_recursive(bcache_debug);
}
void __init bch_debug_init(void)
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index 522c7426f3a0..b2fd412715b1 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -663,7 +663,7 @@ static void journal_write_unlocked(struct closure *cl)
REQ_SYNC|REQ_META|REQ_PREFLUSH|REQ_FUA);
bch_bio_map(bio, w->data);
- trace_bcache_journal_write(bio);
+ trace_bcache_journal_write(bio, w->data->keys);
bio_list_add(&list, bio);
SET_PTR_OFFSET(k, i, PTR_OFFSET(k, i) + sectors);
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 3bf35914bb57..15070412a32e 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -311,11 +311,11 @@ err:
* data is written it calls bch_journal, and after the keys have been added to
* the next journal write they're inserted into the btree.
*
- * It inserts the data in s->cache_bio; bi_sector is used for the key offset,
+ * It inserts the data in op->bio; bi_sector is used for the key offset,
* and op->inode is used for the key inode.
*
- * If s->bypass is true, instead of inserting the data it invalidates the
- * region of the cache represented by s->cache_bio and op->inode.
+ * If op->bypass is true, instead of inserting the data it invalidates the
+ * region of the cache represented by op->bio and op->inode.
*/
void bch_data_insert(struct closure *cl)
{
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 7bbd670a5a84..4dee119c3664 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -25,8 +25,8 @@
#include <linux/reboot.h>
#include <linux/sysfs.h>
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>");
+unsigned int bch_cutoff_writeback;
+unsigned int bch_cutoff_writeback_sync;
static const char bcache_magic[] = {
0xc6, 0x85, 0x73, 0xf6, 0x4e, 0x1a, 0x45, 0xca,
@@ -1510,8 +1510,7 @@ static void cache_set_free(struct closure *cl)
struct cache *ca;
unsigned int i;
- if (!IS_ERR_OR_NULL(c->debug))
- debugfs_remove(c->debug);
+ debugfs_remove(c->debug);
bch_open_buckets_free(c);
bch_btree_cache_free(c);
@@ -2424,6 +2423,32 @@ static void bcache_exit(void)
mutex_destroy(&bch_register_lock);
}
+/* Check and fixup module parameters */
+static void check_module_parameters(void)
+{
+ if (bch_cutoff_writeback_sync == 0)
+ bch_cutoff_writeback_sync = CUTOFF_WRITEBACK_SYNC;
+ else if (bch_cutoff_writeback_sync > CUTOFF_WRITEBACK_SYNC_MAX) {
+ pr_warn("set bch_cutoff_writeback_sync (%u) to max value %u",
+ bch_cutoff_writeback_sync, CUTOFF_WRITEBACK_SYNC_MAX);
+ bch_cutoff_writeback_sync = CUTOFF_WRITEBACK_SYNC_MAX;
+ }
+
+ if (bch_cutoff_writeback == 0)
+ bch_cutoff_writeback = CUTOFF_WRITEBACK;
+ else if (bch_cutoff_writeback > CUTOFF_WRITEBACK_MAX) {
+ pr_warn("set bch_cutoff_writeback (%u) to max value %u",
+ bch_cutoff_writeback, CUTOFF_WRITEBACK_MAX);
+ bch_cutoff_writeback = CUTOFF_WRITEBACK_MAX;
+ }
+
+ if (bch_cutoff_writeback > bch_cutoff_writeback_sync) {
+ pr_warn("set bch_cutoff_writeback (%u) to %u",
+ bch_cutoff_writeback, bch_cutoff_writeback_sync);
+ bch_cutoff_writeback = bch_cutoff_writeback_sync;
+ }
+}
+
static int __init bcache_init(void)
{
static const struct attribute *files[] = {
@@ -2432,6 +2457,8 @@ static int __init bcache_init(void)
NULL
};
+ check_module_parameters();
+
mutex_init(&bch_register_lock);
init_waitqueue_head(&unregister_wait);
register_reboot_notifier(&reboot);
@@ -2468,5 +2495,18 @@ err:
return -ENOMEM;
}
+/*
+ * Module hooks
+ */
module_exit(bcache_exit);
module_init(bcache_init);
+
+module_param(bch_cutoff_writeback, uint, 0);
+MODULE_PARM_DESC(bch_cutoff_writeback, "threshold to cutoff writeback");
+
+module_param(bch_cutoff_writeback_sync, uint, 0);
+MODULE_PARM_DESC(bch_cutoff_writeback_sync, "hard threshold to cutoff writeback");
+
+MODULE_DESCRIPTION("Bcache: a Linux block layer cache");
+MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>");
+MODULE_LICENSE("GPL");
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index 26f035a0c5b9..557a8a3270a1 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -16,7 +16,7 @@
#include <linux/sort.h>
#include <linux/sched/clock.h>
-/* Default is -1; we skip past it for struct cached_dev's cache mode */
+/* Default is 0 ("writethrough") */
static const char * const bch_cache_modes[] = {
"writethrough",
"writeback",
@@ -25,7 +25,7 @@ static const char * const bch_cache_modes[] = {
NULL
};
-/* Default is -1; we skip past it for stop_when_cache_set_failed */
+/* Default is 0 ("auto") */
static const char * const bch_stop_on_failure_modes[] = {
"auto",
"always",
@@ -88,6 +88,8 @@ read_attribute(writeback_keys_done);
read_attribute(writeback_keys_failed);
read_attribute(io_errors);
read_attribute(congested);
+read_attribute(cutoff_writeback);
+read_attribute(cutoff_writeback_sync);
rw_attribute(congested_read_threshold_us);
rw_attribute(congested_write_threshold_us);
@@ -128,6 +130,7 @@ rw_attribute(expensive_debug_checks);
rw_attribute(cache_replacement_policy);
rw_attribute(btree_shrinker_disabled);
rw_attribute(copy_gc_enabled);
+rw_attribute(gc_after_writeback);
rw_attribute(size);
static ssize_t bch_snprint_string_list(char *buf,
@@ -264,7 +267,8 @@ STORE(__cached_dev)
d_strtoul(writeback_running);
d_strtoul(writeback_delay);
- sysfs_strtoul_clamp(writeback_percent, dc->writeback_percent, 0, 40);
+ sysfs_strtoul_clamp(writeback_percent, dc->writeback_percent,
+ 0, bch_cutoff_writeback);
if (attr == &sysfs_writeback_rate) {
ssize_t ret;
@@ -384,8 +388,25 @@ STORE(bch_cached_dev)
mutex_lock(&bch_register_lock);
size = __cached_dev_store(kobj, attr, buf, size);
- if (attr == &sysfs_writeback_running)
- bch_writeback_queue(dc);
+ if (attr == &sysfs_writeback_running) {
+ /* dc->writeback_running changed in __cached_dev_store() */
+ if (IS_ERR_OR_NULL(dc->writeback_thread)) {
+ /*
+ * reject setting it to 1 via sysfs if writeback
+ * kthread is not created yet.
+ */
+ if (dc->writeback_running) {
+ dc->writeback_running = false;
+ pr_err("%s: failed to run non-existent writeback thread",
+ dc->disk.disk->disk_name);
+ }
+ } else
+ /*
+ * writeback kthread will check if dc->writeback_running
+ * is true or false.
+ */
+ bch_writeback_queue(dc);
+ }
if (attr == &sysfs_writeback_percent)
if (!test_and_set_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags))
@@ -668,6 +689,9 @@ SHOW(__bch_cache_set)
sysfs_print(congested_write_threshold_us,
c->congested_write_threshold_us);
+ sysfs_print(cutoff_writeback, bch_cutoff_writeback);
+ sysfs_print(cutoff_writeback_sync, bch_cutoff_writeback_sync);
+
sysfs_print(active_journal_entries, fifo_used(&c->journal.pin));
sysfs_printf(verify, "%i", c->verify);
sysfs_printf(key_merging_disabled, "%i", c->key_merging_disabled);
@@ -676,6 +700,7 @@ SHOW(__bch_cache_set)
sysfs_printf(gc_always_rewrite, "%i", c->gc_always_rewrite);
sysfs_printf(btree_shrinker_disabled, "%i", c->shrinker_disabled);
sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled);
+ sysfs_printf(gc_after_writeback, "%i", c->gc_after_writeback);
sysfs_printf(io_disable, "%i",
test_bit(CACHE_SET_IO_DISABLE, &c->flags));
@@ -725,21 +750,8 @@ STORE(__bch_cache_set)
bch_cache_accounting_clear(&c->accounting);
}
- if (attr == &sysfs_trigger_gc) {
- /*
- * Garbage collection thread only works when sectors_to_gc < 0,
- * when users write to sysfs entry trigger_gc, most of time
- * they want to forcibly triger gargage collection. Here -1 is
- * set to c->sectors_to_gc, to make gc_should_run() give a
- * chance to permit gc thread to run. "give a chance" means
- * before going into gc_should_run(), there is still chance
- * that c->sectors_to_gc being set to other positive value. So
- * writing sysfs entry trigger_gc won't always make sure gc
- * thread takes effect.
- */
- atomic_set(&c->sectors_to_gc, -1);
- wake_up_gc(c);
- }
+ if (attr == &sysfs_trigger_gc)
+ force_wake_up_gc(c);
if (attr == &sysfs_prune_cache) {
struct shrink_control sc;
@@ -789,6 +801,12 @@ STORE(__bch_cache_set)
sysfs_strtoul(gc_always_rewrite, c->gc_always_rewrite);
sysfs_strtoul(btree_shrinker_disabled, c->shrinker_disabled);
sysfs_strtoul(copy_gc_enabled, c->copy_gc_enabled);
+ /*
+ * Writing gc_after_writeback here may clear an already-set
+ * BCH_DO_AUTO_GC; that doesn't matter because the flag will be
+ * set again at the next opportunity.
+ */
+ sysfs_strtoul_clamp(gc_after_writeback, c->gc_after_writeback, 0, 1);
return size;
}
@@ -869,7 +887,10 @@ static struct attribute *bch_cache_set_internal_files[] = {
&sysfs_gc_always_rewrite,
&sysfs_btree_shrinker_disabled,
&sysfs_copy_gc_enabled,
+ &sysfs_gc_after_writeback,
&sysfs_io_disable,
+ &sysfs_cutoff_writeback,
+ &sysfs_cutoff_writeback_sync,
NULL
};
KTYPE(bch_cache_set_internal);
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index 08c3a9f9676c..73f0efac2b9f 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -17,6 +17,15 @@
#include <linux/sched/clock.h>
#include <trace/events/bcache.h>
+static void update_gc_after_writeback(struct cache_set *c)
+{
+ if (c->gc_after_writeback != (BCH_ENABLE_AUTO_GC) ||
+ c->gc_stats.in_use < BCH_AUTO_GC_DIRTY_THRESHOLD)
+ return;
+
+ c->gc_after_writeback |= BCH_DO_AUTO_GC;
+}
+
/* Rate limiting */
static uint64_t __calc_target_rate(struct cached_dev *dc)
{
@@ -191,6 +200,7 @@ static void update_writeback_rate(struct work_struct *work)
if (!set_at_max_writeback_rate(c, dc)) {
down_read(&dc->writeback_lock);
__update_writeback_rate(dc);
+ update_gc_after_writeback(c);
up_read(&dc->writeback_lock);
}
}
@@ -689,6 +699,23 @@ static int bch_writeback_thread(void *arg)
up_write(&dc->writeback_lock);
break;
}
+
+ /*
+ * When the dirty data rate is high (e.g. 50%+), there might
+ * be heavy bucket fragmentation after writeback
+ * finishes, which hurts subsequent write performance.
+ * If users really care about write performance they
+ * may set BCH_ENABLE_AUTO_GC via sysfs; then when
+ * BCH_DO_AUTO_GC is set, the garbage collection thread
+ * will be woken up here. After moving gc, the shrunk
+ * btree and the discarded SSD space of free buckets may
+ * help subsequent write requests.
+ */
+ if (c->gc_after_writeback ==
+ (BCH_ENABLE_AUTO_GC|BCH_DO_AUTO_GC)) {
+ c->gc_after_writeback &= ~BCH_DO_AUTO_GC;
+ force_wake_up_gc(c);
+ }
}
up_write(&dc->writeback_lock);
@@ -777,7 +804,7 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc)
bch_keybuf_init(&dc->writeback_keys);
dc->writeback_metadata = true;
- dc->writeback_running = true;
+ dc->writeback_running = false;
dc->writeback_percent = 10;
dc->writeback_delay = 30;
atomic_long_set(&dc->writeback_rate.rate, 1024);
@@ -805,6 +832,7 @@ int bch_cached_dev_writeback_start(struct cached_dev *dc)
cached_dev_put(dc);
return PTR_ERR(dc->writeback_thread);
}
+ dc->writeback_running = true;
WARN_ON(test_and_set_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags));
schedule_delayed_work(&dc->writeback_rate_update,
diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h
index d2b9fdbc8994..6a743d3bb338 100644
--- a/drivers/md/bcache/writeback.h
+++ b/drivers/md/bcache/writeback.h
@@ -5,12 +5,17 @@
#define CUTOFF_WRITEBACK 40
#define CUTOFF_WRITEBACK_SYNC 70
+#define CUTOFF_WRITEBACK_MAX 70
+#define CUTOFF_WRITEBACK_SYNC_MAX 90
+
#define MAX_WRITEBACKS_IN_PASS 5
#define MAX_WRITESIZE_IN_PASS 5000 /* *512b */
#define WRITEBACK_RATE_UPDATE_SECS_MAX 60
#define WRITEBACK_RATE_UPDATE_SECS_DEFAULT 5
+#define BCH_AUTO_GC_DIRTY_THRESHOLD 50
+
/*
* 14 (16384ths) is chosen here as something that each backing device
* should be a reasonable fraction of the share, and not to blow up
@@ -53,6 +58,9 @@ static inline bool bcache_dev_stripe_dirty(struct cached_dev *dc,
}
}
+extern unsigned int bch_cutoff_writeback;
+extern unsigned int bch_cutoff_writeback_sync;
+
static inline bool should_writeback(struct cached_dev *dc, struct bio *bio,
unsigned int cache_mode, bool would_skip)
{
@@ -60,7 +68,7 @@ static inline bool should_writeback(struct cached_dev *dc, struct bio *bio,
if (cache_mode != CACHE_MODE_WRITEBACK ||
test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) ||
- in_use > CUTOFF_WRITEBACK_SYNC)
+ in_use > bch_cutoff_writeback_sync)
return false;
if (dc->partial_stripes_expensive &&
@@ -73,7 +81,7 @@ static inline bool should_writeback(struct cached_dev *dc, struct bio *bio,
return (op_is_sync(bio->bi_opf) ||
bio->bi_opf & (REQ_META|REQ_PRIO) ||
- in_use <= CUTOFF_WRITEBACK);
+ in_use <= bch_cutoff_writeback);
}
static inline void bch_writeback_queue(struct cached_dev *dc)
diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h
index 224d44503a06..95c6d86ab5e8 100644
--- a/drivers/md/dm-core.h
+++ b/drivers/md/dm-core.h
@@ -65,7 +65,6 @@ struct mapped_device {
*/
struct work_struct work;
wait_queue_head_t wait;
- atomic_t pending[2];
spinlock_t deferred_lock;
struct bio_list deferred;
@@ -107,9 +106,6 @@ struct mapped_device {
struct block_device *bdev;
- /* zero-length flush that will be cloned and submitted to targets */
- struct bio flush_bio;
-
struct dm_stats stats;
/* for blk-mq request-based DM support */
@@ -119,7 +115,6 @@ struct mapped_device {
struct srcu_struct io_barrier;
};
-int md_in_flight(struct mapped_device *md);
void disable_write_same(struct mapped_device *md);
void disable_write_zeroes(struct mapped_device *md);
diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
index 7cd36e4d1310..4e06be4f0a62 100644
--- a/drivers/md/dm-rq.c
+++ b/drivers/md/dm-rq.c
@@ -43,7 +43,7 @@ static unsigned dm_get_blk_mq_queue_depth(void)
int dm_request_based(struct mapped_device *md)
{
- return queue_is_rq_based(md->queue);
+ return queue_is_mq(md->queue);
}
void dm_start_queue(struct request_queue *q)
@@ -130,10 +130,8 @@ static void rq_end_stats(struct mapped_device *md, struct request *orig)
*/
static void rq_completed(struct mapped_device *md, int rw, bool run_queue)
{
- atomic_dec(&md->pending[rw]);
-
/* nudge anyone waiting on suspend queue */
- if (!md_in_flight(md))
+ if (unlikely(waitqueue_active(&md->wait)))
wake_up(&md->wait);
/*
@@ -436,7 +434,6 @@ ssize_t dm_attr_rq_based_seq_io_merge_deadline_store(struct mapped_device *md,
static void dm_start_request(struct mapped_device *md, struct request *orig)
{
blk_mq_start_request(orig);
- atomic_inc(&md->pending[rq_data_dir(orig)]);
if (unlikely(dm_stats_used(&md->stats))) {
struct dm_rq_target_io *tio = tio_from_request(orig);
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 9038c302d5c2..844f7d0f2ef8 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -919,12 +919,12 @@ static int device_is_rq_based(struct dm_target *ti, struct dm_dev *dev,
struct request_queue *q = bdev_get_queue(dev->bdev);
struct verify_rq_based_data *v = data;
- if (q->mq_ops)
+ if (queue_is_mq(q))
v->mq_count++;
else
v->sq_count++;
- return queue_is_rq_based(q);
+ return queue_is_mq(q);
}
static int dm_table_determine_type(struct dm_table *t)
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 63a7c416b224..a4a06982ed91 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -646,26 +646,38 @@ static void free_tio(struct dm_target_io *tio)
bio_put(&tio->clone);
}
-int md_in_flight(struct mapped_device *md)
+static bool md_in_flight_bios(struct mapped_device *md)
{
- return atomic_read(&md->pending[READ]) +
- atomic_read(&md->pending[WRITE]);
+ int cpu;
+ struct hd_struct *part = &dm_disk(md)->part0;
+ long sum = 0;
+
+ for_each_possible_cpu(cpu) {
+ sum += part_stat_local_read_cpu(part, in_flight[0], cpu);
+ sum += part_stat_local_read_cpu(part, in_flight[1], cpu);
+ }
+
+ return sum != 0;
+}
+
+static bool md_in_flight(struct mapped_device *md)
+{
+ if (queue_is_mq(md->queue))
+ return blk_mq_queue_inflight(md->queue);
+ else
+ return md_in_flight_bios(md);
}
static void start_io_acct(struct dm_io *io)
{
struct mapped_device *md = io->md;
struct bio *bio = io->orig_bio;
- int rw = bio_data_dir(bio);
io->start_time = jiffies;
generic_start_io_acct(md->queue, bio_op(bio), bio_sectors(bio),
&dm_disk(md)->part0);
- atomic_set(&dm_disk(md)->part0.in_flight[rw],
- atomic_inc_return(&md->pending[rw]));
-
if (unlikely(dm_stats_used(&md->stats)))
dm_stats_account_io(&md->stats, bio_data_dir(bio),
bio->bi_iter.bi_sector, bio_sectors(bio),
@@ -677,8 +689,6 @@ static void end_io_acct(struct dm_io *io)
struct mapped_device *md = io->md;
struct bio *bio = io->orig_bio;
unsigned long duration = jiffies - io->start_time;
- int pending;
- int rw = bio_data_dir(bio);
generic_end_io_acct(md->queue, bio_op(bio), &dm_disk(md)->part0,
io->start_time);
@@ -688,16 +698,8 @@ static void end_io_acct(struct dm_io *io)
bio->bi_iter.bi_sector, bio_sectors(bio),
true, duration, &io->stats_aux);
- /*
- * After this is decremented the bio must not be touched if it is
- * a flush.
- */
- pending = atomic_dec_return(&md->pending[rw]);
- atomic_set(&dm_disk(md)->part0.in_flight[rw], pending);
- pending += atomic_read(&md->pending[rw^0x1]);
-
/* nudge anyone waiting on suspend queue */
- if (!pending)
+ if (unlikely(waitqueue_active(&md->wait)))
wake_up(&md->wait);
}
@@ -1417,10 +1419,21 @@ static int __send_empty_flush(struct clone_info *ci)
unsigned target_nr = 0;
struct dm_target *ti;
+ /*
+ * Empty flush uses a statically initialized bio, as the base for
+ * cloning. However, blkg association requires that a bdev is
+ * associated with a gendisk, which doesn't happen until the bdev is
+ * opened. So, blkg association is done at issue time of the flush
+ * rather than when the device is created in alloc_dev().
+ */
+ bio_set_dev(ci->bio, ci->io->md->bdev);
+
BUG_ON(bio_has_data(ci->bio));
while ((ti = dm_table_get_target(ci->map, target_nr++)))
__send_duplicate_bios(ci, ti, ti->num_flush_bios, NULL);
+ bio_disassociate_blkg(ci->bio);
+
return 0;
}
@@ -1598,7 +1611,16 @@ static blk_qc_t __split_and_process_bio(struct mapped_device *md,
init_clone_info(&ci, md, map, bio);
if (bio->bi_opf & REQ_PREFLUSH) {
- ci.bio = &ci.io->md->flush_bio;
+ struct bio flush_bio;
+
+ /*
+ * Use an on-stack bio for this, it's safe since we don't
+ * need to reference it after submit. It's just used as
+ * the basis for the clone(s).
+ */
+ bio_init(&flush_bio, NULL, 0);
+ flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC;
+ ci.bio = &flush_bio;
ci.sector_count = 0;
error = __send_empty_flush(&ci);
/* dec_pending submits any data associated with flush */
@@ -1654,7 +1676,16 @@ static blk_qc_t __process_bio(struct mapped_device *md,
init_clone_info(&ci, md, map, bio);
if (bio->bi_opf & REQ_PREFLUSH) {
- ci.bio = &ci.io->md->flush_bio;
+ struct bio flush_bio;
+
+ /*
+ * Use an on-stack bio for this, it's safe since we don't
+ * need to reference it after submit. It's just used as
+ * the basis for the clone(s).
+ */
+ bio_init(&flush_bio, NULL, 0);
+ flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC;
+ ci.bio = &flush_bio;
ci.sector_count = 0;
error = __send_empty_flush(&ci);
/* dec_pending submits any data associated with flush */
@@ -1898,7 +1929,7 @@ static struct mapped_device *alloc_dev(int minor)
INIT_LIST_HEAD(&md->table_devices);
spin_lock_init(&md->uevent_lock);
- md->queue = blk_alloc_queue_node(GFP_KERNEL, numa_node_id, NULL);
+ md->queue = blk_alloc_queue_node(GFP_KERNEL, numa_node_id);
if (!md->queue)
goto bad;
md->queue->queuedata = md;
@@ -1908,8 +1939,6 @@ static struct mapped_device *alloc_dev(int minor)
if (!md->disk)
goto bad;
- atomic_set(&md->pending[0], 0);
- atomic_set(&md->pending[1], 0);
init_waitqueue_head(&md->wait);
INIT_WORK(&md->work, dm_wq_work);
init_waitqueue_head(&md->eventq);
@@ -1940,10 +1969,6 @@ static struct mapped_device *alloc_dev(int minor)
if (!md->bdev)
goto bad;
- bio_init(&md->flush_bio, NULL, 0);
- bio_set_dev(&md->flush_bio, md->bdev);
- md->flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC;
-
dm_stats_init(&md->stats);
/* Populate the mapping, nobody knows we exist yet */
diff --git a/drivers/md/md.c b/drivers/md/md.c
index fc488cb30a94..9a0a1e0934d5 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -334,7 +334,6 @@ static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio)
const int sgrp = op_stat_group(bio_op(bio));
struct mddev *mddev = q->queuedata;
unsigned int sectors;
- int cpu;
blk_queue_split(q, &bio);
@@ -359,9 +358,9 @@ static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio)
md_handle_request(mddev, bio);
- cpu = part_stat_lock();
- part_stat_inc(cpu, &mddev->gendisk->part0, ios[sgrp]);
- part_stat_add(cpu, &mddev->gendisk->part0, sectors[sgrp], sectors);
+ part_stat_lock();
+ part_stat_inc(&mddev->gendisk->part0, ios[sgrp]);
+ part_stat_add(&mddev->gendisk->part0, sectors[sgrp], sectors);
part_stat_unlock();
return BLK_QC_T_NONE;
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index ac1cffd2a09b..f3fb5bb8c82a 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -542,7 +542,7 @@ static void raid0_handle_discard(struct mddev *mddev, struct bio *bio)
!discard_bio)
continue;
bio_chain(discard_bio, bio);
- bio_clone_blkcg_association(discard_bio, bio);
+ bio_clone_blkg_association(discard_bio, bio);
if (mddev->gendisk)
trace_block_bio_remap(bdev_get_queue(rdev->bdev),
discard_bio, disk_devt(mddev->gendisk),