summaryrefslogtreecommitdiffstats
path: root/drivers/md
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/md')
-rw-r--r--drivers/md/bcache/bcache.h6
-rw-r--r--drivers/md/bcache/btree.c49
-rw-r--r--drivers/md/bcache/debug.c6
-rw-r--r--drivers/md/bcache/journal.c1
-rw-r--r--drivers/md/bcache/request.c16
-rw-r--r--drivers/md/bcache/super.c50
-rw-r--r--drivers/md/bcache/writeback.c34
-rw-r--r--drivers/md/bcache/writeback.h19
-rw-r--r--drivers/md/dm.c6
-rw-r--r--drivers/md/md.c12
10 files changed, 139 insertions, 60 deletions
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index d6bf294f3907..872ef4d67711 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -474,6 +474,7 @@ struct cache {
struct gc_stat {
size_t nodes;
+ size_t nodes_pre;
size_t key_bytes;
size_t nkeys;
@@ -525,6 +526,7 @@ struct cache_set {
unsigned devices_max_used;
struct list_head cached_devs;
uint64_t cached_dev_sectors;
+ atomic_long_t flash_dev_dirty_sectors;
struct closure caching;
struct closure sb_write;
@@ -603,6 +605,10 @@ struct cache_set {
*/
atomic_t rescale;
/*
+ * used by GC to tell whether any front-side I/O is in flight
+ */
+ atomic_t search_inflight;
+ /*
* When we invalidate buckets, we use both the priority and the amount
* of good data to determine which buckets to reuse first - to weight
* those together consistently we keep track of the smallest nonzero
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 547c9eedc2f4..475008fbbaab 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -90,6 +90,9 @@
#define MAX_NEED_GC 64
#define MAX_SAVE_PRIO 72
+#define MAX_GC_TIMES 100
+#define MIN_GC_NODES 100
+#define GC_SLEEP_MS 100
#define PTR_DIRTY_BIT (((uint64_t) 1 << 36))
@@ -1520,6 +1523,32 @@ static unsigned btree_gc_count_keys(struct btree *b)
return ret;
}
+static size_t btree_gc_min_nodes(struct cache_set *c)
+{
+ size_t min_nodes;
+
+ /*
+ * Incremental GC sleeps for 100ms whenever front-side
+ * I/O arrives. With many btree nodes, processing only
+ * a constant number (100) of nodes per pass would make
+ * GC last a long time; front-side I/O would then run
+ * out of buckets (no new bucket can be allocated
+ * during GC) and be blocked again.
+ * So the number of nodes processed per pass is not
+ * constant but scales with the btree size: GC is
+ * divided into a constant number (MAX_GC_TIMES) of
+ * passes, so a large btree processes more nodes per
+ * pass and a small one fewer (but never fewer than
+ * MIN_GC_NODES).
+ */
+ min_nodes = c->gc_stats.nodes / MAX_GC_TIMES;
+ if (min_nodes < MIN_GC_NODES)
+ min_nodes = MIN_GC_NODES;
+
+ return min_nodes;
+}
+
+
static int btree_gc_recurse(struct btree *b, struct btree_op *op,
struct closure *writes, struct gc_stat *gc)
{
@@ -1585,6 +1614,13 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op,
memmove(r + 1, r, sizeof(r[0]) * (GC_MERGE_NODES - 1));
r->b = NULL;
+ if (atomic_read(&b->c->search_inflight) &&
+ gc->nodes >= gc->nodes_pre + btree_gc_min_nodes(b->c)) {
+ gc->nodes_pre = gc->nodes;
+ ret = -EAGAIN;
+ break;
+ }
+
if (need_resched()) {
ret = -EAGAIN;
break;
@@ -1753,7 +1789,10 @@ static void bch_btree_gc(struct cache_set *c)
closure_sync(&writes);
cond_resched();
- if (ret && ret != -EAGAIN)
+ if (ret == -EAGAIN)
+ schedule_timeout_interruptible(msecs_to_jiffies
+ (GC_SLEEP_MS));
+ else if (ret)
pr_warn("gc failed!");
} while (ret && !test_bit(CACHE_SET_IO_DISABLE, &c->flags));
@@ -1834,8 +1873,14 @@ static int bch_btree_check_recurse(struct btree *b, struct btree_op *op)
do {
k = bch_btree_iter_next_filter(&iter, &b->keys,
bch_ptr_bad);
- if (k)
+ if (k) {
btree_node_prefetch(b, k);
+ /*
+ * initialize c->gc_stats.nodes
+ * for incremental GC
+ */
+ b->c->gc_stats.nodes++;
+ }
if (p)
ret = btree(check_recurse, p, b, op);
diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c
index d030ce3025a6..04d146711950 100644
--- a/drivers/md/bcache/debug.c
+++ b/drivers/md/bcache/debug.c
@@ -110,11 +110,15 @@ void bch_data_verify(struct cached_dev *dc, struct bio *bio)
struct bio_vec bv, cbv;
struct bvec_iter iter, citer = { 0 };
- check = bio_clone_kmalloc(bio, GFP_NOIO);
+ check = bio_kmalloc(GFP_NOIO, bio_segments(bio));
if (!check)
return;
+ check->bi_disk = bio->bi_disk;
check->bi_opf = REQ_OP_READ;
+ check->bi_iter.bi_sector = bio->bi_iter.bi_sector;
+ check->bi_iter.bi_size = bio->bi_iter.bi_size;
+ bch_bio_map(check, NULL);
if (bch_bio_alloc_pages(check, GFP_NOIO))
goto out_put;
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index 18f1b5239620..10748c626a1d 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -828,6 +828,7 @@ void bch_journal_free(struct cache_set *c)
free_pages((unsigned long) c->journal.w[1].data, JSET_BITS);
free_pages((unsigned long) c->journal.w[0].data, JSET_BITS);
free_fifo(&c->journal.pin);
+ free_heap(&c->flush_btree);
}
int bch_journal_alloc(struct cache_set *c)
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index ae67f5fa8047..43af905920f5 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -667,8 +667,7 @@ static void backing_request_endio(struct bio *bio)
static void bio_complete(struct search *s)
{
if (s->orig_bio) {
- generic_end_io_acct(s->d->disk->queue,
- bio_data_dir(s->orig_bio),
+ generic_end_io_acct(s->d->disk->queue, bio_op(s->orig_bio),
&s->d->disk->part0, s->start_time);
trace_bcache_request_end(s->d, s->orig_bio);
@@ -702,6 +701,8 @@ static void search_free(struct closure *cl)
{
struct search *s = container_of(cl, struct search, cl);
+ atomic_dec(&s->d->c->search_inflight);
+
if (s->iop.bio)
bio_put(s->iop.bio);
@@ -719,6 +720,7 @@ static inline struct search *search_alloc(struct bio *bio,
closure_init(&s->cl, NULL);
do_bio_hook(s, bio, request_endio);
+ atomic_inc(&d->c->search_inflight);
s->orig_bio = bio;
s->cache_miss = NULL;
@@ -1062,8 +1064,7 @@ static void detached_dev_end_io(struct bio *bio)
bio->bi_end_io = ddip->bi_end_io;
bio->bi_private = ddip->bi_private;
- generic_end_io_acct(ddip->d->disk->queue,
- bio_data_dir(bio),
+ generic_end_io_acct(ddip->d->disk->queue, bio_op(bio),
&ddip->d->disk->part0, ddip->start_time);
if (bio->bi_status) {
@@ -1120,7 +1121,7 @@ static blk_qc_t cached_dev_make_request(struct request_queue *q,
}
atomic_set(&dc->backing_idle, 0);
- generic_start_io_acct(q, rw, bio_sectors(bio), &d->disk->part0);
+ generic_start_io_acct(q, bio_op(bio), bio_sectors(bio), &d->disk->part0);
bio_set_dev(bio, dc->bdev);
bio->bi_iter.bi_sector += dc->sb.data_offset;
@@ -1229,7 +1230,6 @@ static blk_qc_t flash_dev_make_request(struct request_queue *q,
struct search *s;
struct closure *cl;
struct bcache_device *d = bio->bi_disk->private_data;
- int rw = bio_data_dir(bio);
if (unlikely(d->c && test_bit(CACHE_SET_IO_DISABLE, &d->c->flags))) {
bio->bi_status = BLK_STS_IOERR;
@@ -1237,7 +1237,7 @@ static blk_qc_t flash_dev_make_request(struct request_queue *q,
return BLK_QC_T_NONE;
}
- generic_start_io_acct(q, rw, bio_sectors(bio), &d->disk->part0);
+ generic_start_io_acct(q, bio_op(bio), bio_sectors(bio), &d->disk->part0);
s = search_alloc(bio, d);
cl = &s->cl;
@@ -1254,7 +1254,7 @@ static blk_qc_t flash_dev_make_request(struct request_queue *q,
flash_dev_nodata,
bcache_wq);
return BLK_QC_T_NONE;
- } else if (rw) {
+ } else if (bio_data_dir(bio)) {
bch_keybuf_check_overlapping(&s->iop.c->moving_gc_keys,
&KEY(d->id, bio->bi_iter.bi_sector, 0),
&KEY(d->id, bio_end_sector(bio), 0));
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index fa4058e43202..e0a92104ca23 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -181,7 +181,7 @@ static const char *read_super(struct cache_sb *sb, struct block_device *bdev,
goto err;
}
- sb->last_mount = get_seconds();
+ sb->last_mount = (u32)ktime_get_real_seconds();
err = NULL;
get_page(bh->b_page);
@@ -701,7 +701,7 @@ static void bcache_device_detach(struct bcache_device *d)
SET_UUID_FLASH_ONLY(u, 0);
memcpy(u->uuid, invalid_uuid, 16);
- u->invalidated = cpu_to_le32(get_seconds());
+ u->invalidated = cpu_to_le32((u32)ktime_get_real_seconds());
bch_uuid_write(d->c);
}
@@ -796,11 +796,12 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size,
return idx;
if (bioset_init(&d->bio_split, 4, offsetof(struct bbio, bio),
- BIOSET_NEED_BVECS|BIOSET_NEED_RESCUER) ||
- !(d->disk = alloc_disk(BCACHE_MINORS))) {
- ida_simple_remove(&bcache_device_idx, idx);
- return -ENOMEM;
- }
+ BIOSET_NEED_BVECS|BIOSET_NEED_RESCUER))
+ goto err;
+
+ d->disk = alloc_disk(BCACHE_MINORS);
+ if (!d->disk)
+ goto err;
set_capacity(d->disk, sectors);
snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", idx);
@@ -834,6 +835,11 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size,
blk_queue_write_cache(q, true, true);
return 0;
+
+err:
+ ida_simple_remove(&bcache_device_idx, idx);
+ return -ENOMEM;
+
}
/* Cached device */
@@ -1027,7 +1033,7 @@ void bch_cached_dev_detach(struct cached_dev *dc)
int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c,
uint8_t *set_uuid)
{
- uint32_t rtime = cpu_to_le32(get_seconds());
+ uint32_t rtime = cpu_to_le32((u32)ktime_get_real_seconds());
struct uuid_entry *u;
struct cached_dev *exist_dc, *t;
@@ -1070,7 +1076,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c,
(BDEV_STATE(&dc->sb) == BDEV_STATE_STALE ||
BDEV_STATE(&dc->sb) == BDEV_STATE_NONE)) {
memcpy(u->uuid, invalid_uuid, 16);
- u->invalidated = cpu_to_le32(get_seconds());
+ u->invalidated = cpu_to_le32((u32)ktime_get_real_seconds());
u = NULL;
}
@@ -1311,6 +1317,8 @@ static void flash_dev_free(struct closure *cl)
{
struct bcache_device *d = container_of(cl, struct bcache_device, cl);
mutex_lock(&bch_register_lock);
+ atomic_long_sub(bcache_dev_sectors_dirty(d),
+ &d->c->flash_dev_dirty_sectors);
bcache_device_free(d);
mutex_unlock(&bch_register_lock);
kobject_put(&d->kobj);
@@ -1390,7 +1398,7 @@ int bch_flash_dev_create(struct cache_set *c, uint64_t size)
get_random_bytes(u->uuid, 16);
memset(u->label, 0, 32);
- u->first_reg = u->last_reg = cpu_to_le32(get_seconds());
+ u->first_reg = u->last_reg = cpu_to_le32((u32)ktime_get_real_seconds());
SET_UUID_FLASH_ONLY(u, 1);
u->sectors = size >> 9;
@@ -1894,7 +1902,7 @@ static void run_cache_set(struct cache_set *c)
goto err;
closure_sync(&cl);
- c->sb.last_mount = get_seconds();
+ c->sb.last_mount = (u32)ktime_get_real_seconds();
bcache_write_super(c);
list_for_each_entry_safe(dc, t, &uncached_devices, list)
@@ -2163,8 +2171,12 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
if (!try_module_get(THIS_MODULE))
return -EBUSY;
- if (!(path = kstrndup(buffer, size, GFP_KERNEL)) ||
- !(sb = kmalloc(sizeof(struct cache_sb), GFP_KERNEL)))
+ path = kstrndup(buffer, size, GFP_KERNEL);
+ if (!path)
+ goto err;
+
+ sb = kmalloc(sizeof(struct cache_sb), GFP_KERNEL);
+ if (!sb)
goto err;
err = "failed to open device";
@@ -2324,9 +2336,15 @@ static int __init bcache_init(void)
return bcache_major;
}
- if (!(bcache_wq = alloc_workqueue("bcache", WQ_MEM_RECLAIM, 0)) ||
- !(bcache_kobj = kobject_create_and_add("bcache", fs_kobj)) ||
- bch_request_init() ||
+ bcache_wq = alloc_workqueue("bcache", WQ_MEM_RECLAIM, 0);
+ if (!bcache_wq)
+ goto err;
+
+ bcache_kobj = kobject_create_and_add("bcache", fs_kobj);
+ if (!bcache_kobj)
+ goto err;
+
+ if (bch_request_init() ||
bch_debug_init(bcache_kobj) || closure_debug_init() ||
sysfs_create_files(bcache_kobj, files))
goto err;
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index ad45ebe1a74b..912e969fedba 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -27,7 +27,7 @@ static uint64_t __calc_target_rate(struct cached_dev *dc)
* flash-only devices
*/
uint64_t cache_sectors = c->nbuckets * c->sb.bucket_size -
- bcache_flash_devs_sectors_dirty(c);
+ atomic_long_read(&c->flash_dev_dirty_sectors);
/*
* Unfortunately there is no control of global dirty data. If the
@@ -476,6 +476,9 @@ void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned inode,
if (!d)
return;
+ if (UUID_FLASH_ONLY(&c->uuids[inode]))
+ atomic_long_add(nr_sectors, &c->flash_dev_dirty_sectors);
+
stripe = offset_to_stripe(d, offset);
stripe_offset = offset & (d->stripe_size - 1);
@@ -673,10 +676,14 @@ static int bch_writeback_thread(void *arg)
}
/* Init */
+#define INIT_KEYS_EACH_TIME 500000
+#define INIT_KEYS_SLEEP_MS 100
struct sectors_dirty_init {
struct btree_op op;
unsigned inode;
+ size_t count;
+ struct bkey start;
};
static int sectors_dirty_init_fn(struct btree_op *_op, struct btree *b,
@@ -691,18 +698,37 @@ static int sectors_dirty_init_fn(struct btree_op *_op, struct btree *b,
bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k),
KEY_START(k), KEY_SIZE(k));
+ op->count++;
+ if (atomic_read(&b->c->search_inflight) &&
+ !(op->count % INIT_KEYS_EACH_TIME)) {
+ bkey_copy_key(&op->start, k);
+ return -EAGAIN;
+ }
+
return MAP_CONTINUE;
}
void bch_sectors_dirty_init(struct bcache_device *d)
{
struct sectors_dirty_init op;
+ int ret;
bch_btree_op_init(&op.op, -1);
op.inode = d->id;
-
- bch_btree_map_keys(&op.op, d->c, &KEY(op.inode, 0, 0),
- sectors_dirty_init_fn, 0);
+ op.count = 0;
+ op.start = KEY(op.inode, 0, 0);
+
+ do {
+ ret = bch_btree_map_keys(&op.op, d->c, &op.start,
+ sectors_dirty_init_fn, 0);
+ if (ret == -EAGAIN)
+ schedule_timeout_interruptible(
+ msecs_to_jiffies(INIT_KEYS_SLEEP_MS));
+ else if (ret < 0) {
+ pr_warn("sectors dirty init failed, ret=%d!", ret);
+ break;
+ }
+ } while (ret == -EAGAIN);
}
void bch_cached_dev_writeback_init(struct cached_dev *dc)
diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h
index 610fb01de629..3745d7004c47 100644
--- a/drivers/md/bcache/writeback.h
+++ b/drivers/md/bcache/writeback.h
@@ -28,25 +28,6 @@ static inline uint64_t bcache_dev_sectors_dirty(struct bcache_device *d)
return ret;
}
-static inline uint64_t bcache_flash_devs_sectors_dirty(struct cache_set *c)
-{
- uint64_t i, ret = 0;
-
- mutex_lock(&bch_register_lock);
-
- for (i = 0; i < c->devices_max_used; i++) {
- struct bcache_device *d = c->devices[i];
-
- if (!d || !UUID_FLASH_ONLY(&c->uuids[i]))
- continue;
- ret += bcache_dev_sectors_dirty(d);
- }
-
- mutex_unlock(&bch_register_lock);
-
- return ret;
-}
-
static inline unsigned offset_to_stripe(struct bcache_device *d,
uint64_t offset)
{
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index b0dd7027848b..20f7e4ef5342 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -609,7 +609,8 @@ static void start_io_acct(struct dm_io *io)
io->start_time = jiffies;
- generic_start_io_acct(md->queue, rw, bio_sectors(bio), &dm_disk(md)->part0);
+ generic_start_io_acct(md->queue, bio_op(bio), bio_sectors(bio),
+ &dm_disk(md)->part0);
atomic_set(&dm_disk(md)->part0.in_flight[rw],
atomic_inc_return(&md->pending[rw]));
@@ -628,7 +629,8 @@ static void end_io_acct(struct dm_io *io)
int pending;
int rw = bio_data_dir(bio);
- generic_end_io_acct(md->queue, rw, &dm_disk(md)->part0, io->start_time);
+ generic_end_io_acct(md->queue, bio_op(bio), &dm_disk(md)->part0,
+ io->start_time);
if (unlikely(dm_stats_used(&md->stats)))
dm_stats_account_io(&md->stats, bio_data_dir(bio),
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 994aed2f9dff..cb4eb5faa519 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -204,10 +204,6 @@ static int start_readonly;
*/
static bool create_on_open = true;
-/* bio_clone_mddev
- * like bio_clone_bioset, but with a local bio set
- */
-
struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
struct mddev *mddev)
{
@@ -335,6 +331,7 @@ EXPORT_SYMBOL(md_handle_request);
static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio)
{
const int rw = bio_data_dir(bio);
+ const int sgrp = op_stat_group(bio_op(bio));
struct mddev *mddev = q->queuedata;
unsigned int sectors;
int cpu;
@@ -363,8 +360,8 @@ static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio)
md_handle_request(mddev, bio);
cpu = part_stat_lock();
- part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]);
- part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], sectors);
+ part_stat_inc(cpu, &mddev->gendisk->part0, ios[sgrp]);
+ part_stat_add(cpu, &mddev->gendisk->part0, sectors[sgrp], sectors);
part_stat_unlock();
return BLK_QC_T_NONE;
@@ -8046,8 +8043,7 @@ static int is_mddev_idle(struct mddev *mddev, int init)
rcu_read_lock();
rdev_for_each_rcu(rdev, mddev) {
struct gendisk *disk = rdev->bdev->bd_contains->bd_disk;
- curr_events = (int)part_stat_read(&disk->part0, sectors[0]) +
- (int)part_stat_read(&disk->part0, sectors[1]) -
+ curr_events = (int)part_stat_read_accum(&disk->part0, sectors) -
atomic_read(&disk->sync_io);
/* sync IO will cause sync_io to increase before the disk_stats
* as sync_io is counted when a request starts, and