summaryrefslogtreecommitdiffstats
path: root/drivers
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2012-03-28 12:55:04 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2012-03-28 12:55:04 -0700
commit89e5d6f0d979f6e7dc2bbb1ebd9e239217e2e952 (patch)
tree1126044004b73df905a6183430376f1d97c3b6c9 /drivers
parent516e77977085c9c50703fabb5dc61bd57a8cc1d0 (diff)
parenta4ffc152198efba2ed9e6eac0eb97f17bfebce85 (diff)
downloadlinux-89e5d6f0d979f6e7dc2bbb1ebd9e239217e2e952.tar.bz2
Merge tag 'dm-3.4-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/agk/linux-dm
Pull device-mapper changes for 3.4 from Alasdair Kergon: - Update thin provisioning to support read-only external snapshot origins and discards. - A new target, dm verity, for device content validation. - Mark dm uevent and dm raid as no-longer-experimental. - Miscellaneous other fixes and clean-ups. * tag 'dm-3.4-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/agk/linux-dm: (27 commits) dm: add verity target dm bufio: prefetch dm thin: add pool target flags to control discard dm thin: support discards dm thin: prepare to support discard dm thin: use dm_target_offset dm thin: support read only external snapshot origins dm thin: relax hard limit on the maximum size of a metadata device dm persistent data: remove space map ref_count entries if redundant dm thin: commit outstanding data every second dm: reject trailing characters in sccanf input dm raid: handle failed devices during start up dm thin metadata: pass correct space map to dm_sm_root_size dm persistent data: remove redundant value_size arg from value_ptr dm mpath: detect invalid map_context dm: clear bi_end_io on remapping failure dm table: simplify call to free_devices dm thin: correct comments dm raid: no longer experimental dm uevent: no longer experimental ...
Diffstat (limited to 'drivers')
-rw-r--r--drivers/md/Kconfig28
-rw-r--r--drivers/md/Makefile1
-rw-r--r--drivers/md/dm-bufio.c108
-rw-r--r--drivers/md/dm-bufio.h8
-rw-r--r--drivers/md/dm-crypt.c46
-rw-r--r--drivers/md/dm-delay.c9
-rw-r--r--drivers/md/dm-exception-store.c2
-rw-r--r--drivers/md/dm-flakey.c3
-rw-r--r--drivers/md/dm-ioctl.c5
-rw-r--r--drivers/md/dm-linear.c3
-rw-r--r--drivers/md/dm-log.c3
-rw-r--r--drivers/md/dm-mpath.c52
-rw-r--r--drivers/md/dm-queue-length.c3
-rw-r--r--drivers/md/dm-raid.c53
-rw-r--r--drivers/md/dm-raid1.c12
-rw-r--r--drivers/md/dm-round-robin.c3
-rw-r--r--drivers/md/dm-service-time.c5
-rw-r--r--drivers/md/dm-stripe.c3
-rw-r--r--drivers/md/dm-table.c9
-rw-r--r--drivers/md/dm-thin-metadata.c5
-rw-r--r--drivers/md/dm-thin-metadata.h13
-rw-r--r--drivers/md/dm-thin.c680
-rw-r--r--drivers/md/dm-verity.c913
-rw-r--r--drivers/md/dm.c1
-rw-r--r--drivers/md/persistent-data/dm-btree-internal.h7
-rw-r--r--drivers/md/persistent-data/dm-btree-remove.c202
-rw-r--r--drivers/md/persistent-data/dm-btree.c27
-rw-r--r--drivers/md/persistent-data/dm-space-map-common.c3
28 files changed, 1831 insertions, 376 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index faa4741df6d3..10f122a3a856 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -277,8 +277,8 @@ config DM_MIRROR
needed for live data migration tools such as 'pvmove'.
config DM_RAID
- tristate "RAID 1/4/5/6 target (EXPERIMENTAL)"
- depends on BLK_DEV_DM && EXPERIMENTAL
+ tristate "RAID 1/4/5/6 target"
+ depends on BLK_DEV_DM
select MD_RAID1
select MD_RAID456
select BLK_DEV_MD
@@ -359,8 +359,8 @@ config DM_DELAY
If unsure, say N.
config DM_UEVENT
- bool "DM uevents (EXPERIMENTAL)"
- depends on BLK_DEV_DM && EXPERIMENTAL
+ bool "DM uevents"
+ depends on BLK_DEV_DM
---help---
Generate udev events for DM events.
@@ -370,4 +370,24 @@ config DM_FLAKEY
---help---
A target that intermittently fails I/O for debugging purposes.
+config DM_VERITY
+ tristate "Verity target support (EXPERIMENTAL)"
+ depends on BLK_DEV_DM && EXPERIMENTAL
+ select CRYPTO
+ select CRYPTO_HASH
+ select DM_BUFIO
+ ---help---
+ This device-mapper target creates a read-only device that
+ transparently validates the data on one underlying device against
+ a pre-generated tree of cryptographic checksums stored on a second
+ device.
+
+ You'll need to activate the digests you're going to use in the
+ cryptoapi configuration.
+
+ To compile this code as a module, choose M here: the module will
+ be called dm-verity.
+
+ If unsure, say N.
+
endif # MD
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 046860c7a166..8b2e0dffe82e 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -42,6 +42,7 @@ obj-$(CONFIG_DM_LOG_USERSPACE) += dm-log-userspace.o
obj-$(CONFIG_DM_ZERO) += dm-zero.o
obj-$(CONFIG_DM_RAID) += dm-raid.o
obj-$(CONFIG_DM_THIN_PROVISIONING) += dm-thin-pool.o
+obj-$(CONFIG_DM_VERITY) += dm-verity.o
ifeq ($(CONFIG_DM_UEVENT),y)
dm-mod-objs += dm-uevent.o
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index b6e58c7b6df5..cc06a1e52423 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -578,7 +578,7 @@ static void write_endio(struct bio *bio, int error)
struct dm_buffer *b = container_of(bio, struct dm_buffer, bio);
b->write_error = error;
- if (error) {
+ if (unlikely(error)) {
struct dm_bufio_client *c = b->c;
(void)cmpxchg(&c->async_write_error, 0, error);
}
@@ -697,13 +697,20 @@ static void __wait_for_free_buffer(struct dm_bufio_client *c)
dm_bufio_lock(c);
}
+enum new_flag {
+ NF_FRESH = 0,
+ NF_READ = 1,
+ NF_GET = 2,
+ NF_PREFETCH = 3
+};
+
/*
* Allocate a new buffer. If the allocation is not possible, wait until
* some other thread frees a buffer.
*
* May drop the lock and regain it.
*/
-static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client *c)
+static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client *c, enum new_flag nf)
{
struct dm_buffer *b;
@@ -726,6 +733,9 @@ static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client
return b;
}
+ if (nf == NF_PREFETCH)
+ return NULL;
+
if (!list_empty(&c->reserved_buffers)) {
b = list_entry(c->reserved_buffers.next,
struct dm_buffer, lru_list);
@@ -743,9 +753,12 @@ static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client
}
}
-static struct dm_buffer *__alloc_buffer_wait(struct dm_bufio_client *c)
+static struct dm_buffer *__alloc_buffer_wait(struct dm_bufio_client *c, enum new_flag nf)
{
- struct dm_buffer *b = __alloc_buffer_wait_no_callback(c);
+ struct dm_buffer *b = __alloc_buffer_wait_no_callback(c, nf);
+
+ if (!b)
+ return NULL;
if (c->alloc_callback)
c->alloc_callback(b);
@@ -865,32 +878,23 @@ static struct dm_buffer *__find(struct dm_bufio_client *c, sector_t block)
* Getting a buffer
*--------------------------------------------------------------*/
-enum new_flag {
- NF_FRESH = 0,
- NF_READ = 1,
- NF_GET = 2
-};
-
static struct dm_buffer *__bufio_new(struct dm_bufio_client *c, sector_t block,
- enum new_flag nf, struct dm_buffer **bp,
- int *need_submit)
+ enum new_flag nf, int *need_submit)
{
struct dm_buffer *b, *new_b = NULL;
*need_submit = 0;
b = __find(c, block);
- if (b) {
- b->hold_count++;
- __relink_lru(b, test_bit(B_DIRTY, &b->state) ||
- test_bit(B_WRITING, &b->state));
- return b;
- }
+ if (b)
+ goto found_buffer;
if (nf == NF_GET)
return NULL;
- new_b = __alloc_buffer_wait(c);
+ new_b = __alloc_buffer_wait(c, nf);
+ if (!new_b)
+ return NULL;
/*
* We've had a period where the mutex was unlocked, so need to
@@ -899,10 +903,7 @@ static struct dm_buffer *__bufio_new(struct dm_bufio_client *c, sector_t block,
b = __find(c, block);
if (b) {
__free_buffer_wake(new_b);
- b->hold_count++;
- __relink_lru(b, test_bit(B_DIRTY, &b->state) ||
- test_bit(B_WRITING, &b->state));
- return b;
+ goto found_buffer;
}
__check_watermark(c);
@@ -922,6 +923,24 @@ static struct dm_buffer *__bufio_new(struct dm_bufio_client *c, sector_t block,
*need_submit = 1;
return b;
+
+found_buffer:
+ if (nf == NF_PREFETCH)
+ return NULL;
+ /*
+ * Note: it is essential that we don't wait for the buffer to be
+ * read if dm_bufio_get function is used. Both dm_bufio_get and
+ * dm_bufio_prefetch can be used in the driver request routine.
+ * If the user called both dm_bufio_prefetch and dm_bufio_get on
+ * the same buffer, it would deadlock if we waited.
+ */
+ if (nf == NF_GET && unlikely(test_bit(B_READING, &b->state)))
+ return NULL;
+
+ b->hold_count++;
+ __relink_lru(b, test_bit(B_DIRTY, &b->state) ||
+ test_bit(B_WRITING, &b->state));
+ return b;
}
/*
@@ -956,10 +975,10 @@ static void *new_read(struct dm_bufio_client *c, sector_t block,
struct dm_buffer *b;
dm_bufio_lock(c);
- b = __bufio_new(c, block, nf, bp, &need_submit);
+ b = __bufio_new(c, block, nf, &need_submit);
dm_bufio_unlock(c);
- if (!b || IS_ERR(b))
+ if (!b)
return b;
if (need_submit)
@@ -1005,13 +1024,47 @@ void *dm_bufio_new(struct dm_bufio_client *c, sector_t block,
}
EXPORT_SYMBOL_GPL(dm_bufio_new);
+void dm_bufio_prefetch(struct dm_bufio_client *c,
+ sector_t block, unsigned n_blocks)
+{
+ struct blk_plug plug;
+
+ blk_start_plug(&plug);
+ dm_bufio_lock(c);
+
+ for (; n_blocks--; block++) {
+ int need_submit;
+ struct dm_buffer *b;
+ b = __bufio_new(c, block, NF_PREFETCH, &need_submit);
+ if (unlikely(b != NULL)) {
+ dm_bufio_unlock(c);
+
+ if (need_submit)
+ submit_io(b, READ, b->block, read_endio);
+ dm_bufio_release(b);
+
+ dm_bufio_cond_resched();
+
+ if (!n_blocks)
+ goto flush_plug;
+ dm_bufio_lock(c);
+ }
+
+ }
+
+ dm_bufio_unlock(c);
+
+flush_plug:
+ blk_finish_plug(&plug);
+}
+EXPORT_SYMBOL_GPL(dm_bufio_prefetch);
+
void dm_bufio_release(struct dm_buffer *b)
{
struct dm_bufio_client *c = b->c;
dm_bufio_lock(c);
- BUG_ON(test_bit(B_READING, &b->state));
BUG_ON(!b->hold_count);
b->hold_count--;
@@ -1024,6 +1077,7 @@ void dm_bufio_release(struct dm_buffer *b)
* invalid buffer.
*/
if ((b->read_error || b->write_error) &&
+ !test_bit(B_READING, &b->state) &&
!test_bit(B_WRITING, &b->state) &&
!test_bit(B_DIRTY, &b->state)) {
__unlink_buffer(b);
@@ -1041,6 +1095,8 @@ void dm_bufio_mark_buffer_dirty(struct dm_buffer *b)
dm_bufio_lock(c);
+ BUG_ON(test_bit(B_READING, &b->state));
+
if (!test_and_set_bit(B_DIRTY, &b->state))
__relink_lru(b, LIST_DIRTY);
diff --git a/drivers/md/dm-bufio.h b/drivers/md/dm-bufio.h
index 5c4c3a04e381..b142946a9e32 100644
--- a/drivers/md/dm-bufio.h
+++ b/drivers/md/dm-bufio.h
@@ -63,6 +63,14 @@ void *dm_bufio_new(struct dm_bufio_client *c, sector_t block,
struct dm_buffer **bp);
/*
+ * Prefetch the specified blocks to the cache.
+ * The function starts to read the blocks and returns without waiting for
+ * I/O to finish.
+ */
+void dm_bufio_prefetch(struct dm_bufio_client *c,
+ sector_t block, unsigned n_blocks);
+
+/*
* Release a reference obtained with dm_bufio_{read,get,new}. The data
* pointer and dm_buffer pointer is no longer valid after this call.
*/
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index db6b51639cee..3f06df59fd82 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -176,7 +176,6 @@ struct crypt_config {
#define MIN_IOS 16
#define MIN_POOL_PAGES 32
-#define MIN_BIO_PAGES 8
static struct kmem_cache *_crypt_io_pool;
@@ -848,12 +847,11 @@ static struct bio *crypt_alloc_buffer(struct dm_crypt_io *io, unsigned size,
}
/*
- * if additional pages cannot be allocated without waiting,
- * return a partially allocated bio, the caller will then try
- * to allocate additional bios while submitting this partial bio
+ * If additional pages cannot be allocated without waiting,
+ * return a partially-allocated bio. The caller will then try
+ * to allocate more bios while submitting this partial bio.
*/
- if (i == (MIN_BIO_PAGES - 1))
- gfp_mask = (gfp_mask | __GFP_NOWARN) & ~__GFP_WAIT;
+ gfp_mask = (gfp_mask | __GFP_NOWARN) & ~__GFP_WAIT;
len = (size > PAGE_SIZE) ? PAGE_SIZE : size;
@@ -1046,16 +1044,14 @@ static void kcryptd_queue_io(struct dm_crypt_io *io)
queue_work(cc->io_queue, &io->work);
}
-static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io,
- int error, int async)
+static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io, int async)
{
struct bio *clone = io->ctx.bio_out;
struct crypt_config *cc = io->target->private;
- if (unlikely(error < 0)) {
+ if (unlikely(io->error < 0)) {
crypt_free_buffer_pages(cc, clone);
bio_put(clone);
- io->error = -EIO;
crypt_dec_pending(io);
return;
}
@@ -1106,12 +1102,16 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io)
sector += bio_sectors(clone);
crypt_inc_pending(io);
+
r = crypt_convert(cc, &io->ctx);
+ if (r < 0)
+ io->error = -EIO;
+
crypt_finished = atomic_dec_and_test(&io->ctx.pending);
/* Encryption was already finished, submit io now */
if (crypt_finished) {
- kcryptd_crypt_write_io_submit(io, r, 0);
+ kcryptd_crypt_write_io_submit(io, 0);
/*
* If there was an error, do not try next fragments.
@@ -1162,11 +1162,8 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io)
crypt_dec_pending(io);
}
-static void kcryptd_crypt_read_done(struct dm_crypt_io *io, int error)
+static void kcryptd_crypt_read_done(struct dm_crypt_io *io)
{
- if (unlikely(error < 0))
- io->error = -EIO;
-
crypt_dec_pending(io);
}
@@ -1181,9 +1178,11 @@ static void kcryptd_crypt_read_convert(struct dm_crypt_io *io)
io->sector);
r = crypt_convert(cc, &io->ctx);
+ if (r < 0)
+ io->error = -EIO;
if (atomic_dec_and_test(&io->ctx.pending))
- kcryptd_crypt_read_done(io, r);
+ kcryptd_crypt_read_done(io);
crypt_dec_pending(io);
}
@@ -1204,15 +1203,18 @@ static void kcryptd_async_done(struct crypto_async_request *async_req,
if (!error && cc->iv_gen_ops && cc->iv_gen_ops->post)
error = cc->iv_gen_ops->post(cc, iv_of_dmreq(cc, dmreq), dmreq);
+ if (error < 0)
+ io->error = -EIO;
+
mempool_free(req_of_dmreq(cc, dmreq), cc->req_pool);
if (!atomic_dec_and_test(&ctx->pending))
return;
if (bio_data_dir(io->base_bio) == READ)
- kcryptd_crypt_read_done(io, error);
+ kcryptd_crypt_read_done(io);
else
- kcryptd_crypt_write_io_submit(io, error, 1);
+ kcryptd_crypt_write_io_submit(io, 1);
}
static void kcryptd_crypt(struct work_struct *work)
@@ -1413,6 +1415,7 @@ static int crypt_ctr_cipher(struct dm_target *ti,
char *tmp, *cipher, *chainmode, *ivmode, *ivopts, *keycount;
char *cipher_api = NULL;
int cpu, ret = -EINVAL;
+ char dummy;
/* Convert to crypto api definition? */
if (strchr(cipher_in, '(')) {
@@ -1434,7 +1437,7 @@ static int crypt_ctr_cipher(struct dm_target *ti,
if (!keycount)
cc->tfms_count = 1;
- else if (sscanf(keycount, "%u", &cc->tfms_count) != 1 ||
+ else if (sscanf(keycount, "%u%c", &cc->tfms_count, &dummy) != 1 ||
!is_power_of_2(cc->tfms_count)) {
ti->error = "Bad cipher key count specification";
return -EINVAL;
@@ -1579,6 +1582,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
int ret;
struct dm_arg_set as;
const char *opt_string;
+ char dummy;
static struct dm_arg _args[] = {
{0, 1, "Invalid number of feature args"},
@@ -1636,7 +1640,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
}
ret = -EINVAL;
- if (sscanf(argv[2], "%llu", &tmpll) != 1) {
+ if (sscanf(argv[2], "%llu%c", &tmpll, &dummy) != 1) {
ti->error = "Invalid iv_offset sector";
goto bad;
}
@@ -1647,7 +1651,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
goto bad;
}
- if (sscanf(argv[4], "%llu", &tmpll) != 1) {
+ if (sscanf(argv[4], "%llu%c", &tmpll, &dummy) != 1) {
ti->error = "Invalid device sector";
goto bad;
}
diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c
index f18375dcedd9..2dc22dddb2ae 100644
--- a/drivers/md/dm-delay.c
+++ b/drivers/md/dm-delay.c
@@ -131,6 +131,7 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
struct delay_c *dc;
unsigned long long tmpll;
+ char dummy;
if (argc != 3 && argc != 6) {
ti->error = "requires exactly 3 or 6 arguments";
@@ -145,13 +146,13 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv)
dc->reads = dc->writes = 0;
- if (sscanf(argv[1], "%llu", &tmpll) != 1) {
+ if (sscanf(argv[1], "%llu%c", &tmpll, &dummy) != 1) {
ti->error = "Invalid device sector";
goto bad;
}
dc->start_read = tmpll;
- if (sscanf(argv[2], "%u", &dc->read_delay) != 1) {
+ if (sscanf(argv[2], "%u%c", &dc->read_delay, &dummy) != 1) {
ti->error = "Invalid delay";
goto bad;
}
@@ -166,13 +167,13 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv)
if (argc == 3)
goto out;
- if (sscanf(argv[4], "%llu", &tmpll) != 1) {
+ if (sscanf(argv[4], "%llu%c", &tmpll, &dummy) != 1) {
ti->error = "Invalid write device sector";
goto bad_dev_read;
}
dc->start_write = tmpll;
- if (sscanf(argv[5], "%u", &dc->write_delay) != 1) {
+ if (sscanf(argv[5], "%u%c", &dc->write_delay, &dummy) != 1) {
ti->error = "Invalid write delay";
goto bad_dev_read;
}
diff --git a/drivers/md/dm-exception-store.c b/drivers/md/dm-exception-store.c
index 042e71996569..aa70f7d43a1a 100644
--- a/drivers/md/dm-exception-store.c
+++ b/drivers/md/dm-exception-store.c
@@ -283,7 +283,7 @@ int dm_exception_store_init(void)
return 0;
persistent_fail:
- dm_persistent_snapshot_exit();
+ dm_transient_snapshot_exit();
transient_fail:
return r;
}
diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c
index b280c433e4a0..ac49c01f1a44 100644
--- a/drivers/md/dm-flakey.c
+++ b/drivers/md/dm-flakey.c
@@ -160,6 +160,7 @@ static int flakey_ctr(struct dm_target *ti, unsigned int argc, char **argv)
unsigned long long tmpll;
struct dm_arg_set as;
const char *devname;
+ char dummy;
as.argc = argc;
as.argv = argv;
@@ -178,7 +179,7 @@ static int flakey_ctr(struct dm_target *ti, unsigned int argc, char **argv)
devname = dm_shift_arg(&as);
- if (sscanf(dm_shift_arg(&as), "%llu", &tmpll) != 1) {
+ if (sscanf(dm_shift_arg(&as), "%llu%c", &tmpll, &dummy) != 1) {
ti->error = "Invalid device sector";
goto bad;
}
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index 1ce84ed0b765..a1a3e6df17b8 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -880,6 +880,7 @@ static int dev_set_geometry(struct dm_ioctl *param, size_t param_size)
struct hd_geometry geometry;
unsigned long indata[4];
char *geostr = (char *) param + param->data_start;
+ char dummy;
md = find_device(param);
if (!md)
@@ -891,8 +892,8 @@ static int dev_set_geometry(struct dm_ioctl *param, size_t param_size)
goto out;
}
- x = sscanf(geostr, "%lu %lu %lu %lu", indata,
- indata + 1, indata + 2, indata + 3);
+ x = sscanf(geostr, "%lu %lu %lu %lu%c", indata,
+ indata + 1, indata + 2, indata + 3, &dummy);
if (x != 4) {
DMWARN("Unable to interpret geometry settings.");
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c
index 9728839f844a..3639eeab6042 100644
--- a/drivers/md/dm-linear.c
+++ b/drivers/md/dm-linear.c
@@ -29,6 +29,7 @@ static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
struct linear_c *lc;
unsigned long long tmp;
+ char dummy;
if (argc != 2) {
ti->error = "Invalid argument count";
@@ -41,7 +42,7 @@ static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv)
return -ENOMEM;
}
- if (sscanf(argv[1], "%llu", &tmp) != 1) {
+ if (sscanf(argv[1], "%llu%c", &tmp, &dummy) != 1) {
ti->error = "dm-linear: Invalid device sector";
goto bad;
}
diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c
index 3b52bb72bd1f..65ebaebf502b 100644
--- a/drivers/md/dm-log.c
+++ b/drivers/md/dm-log.c
@@ -369,6 +369,7 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti,
unsigned int region_count;
size_t bitset_size, buf_size;
int r;
+ char dummy;
if (argc < 1 || argc > 2) {
DMWARN("wrong number of arguments to dirty region log");
@@ -387,7 +388,7 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti,
}
}
- if (sscanf(argv[0], "%u", &region_size) != 1 ||
+ if (sscanf(argv[0], "%u%c", &region_size, &dummy) != 1 ||
!_check_region_size(ti, region_size)) {
DMWARN("invalid region size %s", argv[0]);
return -EINVAL;
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 801d92d237cf..922a3385eead 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -226,6 +226,27 @@ static void free_multipath(struct multipath *m)
kfree(m);
}
+static int set_mapinfo(struct multipath *m, union map_info *info)
+{
+ struct dm_mpath_io *mpio;
+
+ mpio = mempool_alloc(m->mpio_pool, GFP_ATOMIC);
+ if (!mpio)
+ return -ENOMEM;
+
+ memset(mpio, 0, sizeof(*mpio));
+ info->ptr = mpio;
+
+ return 0;
+}
+
+static void clear_mapinfo(struct multipath *m, union map_info *info)
+{
+ struct dm_mpath_io *mpio = info->ptr;
+
+ info->ptr = NULL;
+ mempool_free(mpio, m->mpio_pool);
+}
/*-----------------------------------------------
* Path selection
@@ -341,13 +362,14 @@ static int __must_push_back(struct multipath *m)
}
static int map_io(struct multipath *m, struct request *clone,
- struct dm_mpath_io *mpio, unsigned was_queued)
+ union map_info *map_context, unsigned was_queued)
{
int r = DM_MAPIO_REMAPPED;
size_t nr_bytes = blk_rq_bytes(clone);
unsigned long flags;
struct pgpath *pgpath;
struct block_device *bdev;
+ struct dm_mpath_io *mpio = map_context->ptr;
spin_lock_irqsave(&m->lock, flags);
@@ -423,7 +445,6 @@ static void dispatch_queued_ios(struct multipath *m)
{
int r;
unsigned long flags;
- struct dm_mpath_io *mpio;
union map_info *info;
struct request *clone, *n;
LIST_HEAD(cl);
@@ -436,16 +457,15 @@ static void dispatch_queued_ios(struct multipath *m)
list_del_init(&clone->queuelist);
info = dm_get_rq_mapinfo(clone);
- mpio = info->ptr;
- r = map_io(m, clone, mpio, 1);
+ r = map_io(m, clone, info, 1);
if (r < 0) {
- mempool_free(mpio, m->mpio_pool);
+ clear_mapinfo(m, info);
dm_kill_unmapped_request(clone, r);
} else if (r == DM_MAPIO_REMAPPED)
dm_dispatch_request(clone);
else if (r == DM_MAPIO_REQUEUE) {
- mempool_free(mpio, m->mpio_pool);
+ clear_mapinfo(m, info);
dm_requeue_unmapped_request(clone);
}
}
@@ -908,20 +928,16 @@ static int multipath_map(struct dm_target *ti, struct request *clone,
union map_info *map_context)
{
int r;
- struct dm_mpath_io *mpio;
struct multipath *m = (struct multipath *) ti->private;
- mpio = mempool_alloc(m->mpio_pool, GFP_ATOMIC);
- if (!mpio)
+ if (set_mapinfo(m, map_context) < 0)
/* ENOMEM, requeue */
return DM_MAPIO_REQUEUE;
- memset(mpio, 0, sizeof(*mpio));
- map_context->ptr = mpio;
clone->cmd_flags |= REQ_FAILFAST_TRANSPORT;
- r = map_io(m, clone, mpio, 0);
+ r = map_io(m, clone, map_context, 0);
if (r < 0 || r == DM_MAPIO_REQUEUE)
- mempool_free(mpio, m->mpio_pool);
+ clear_mapinfo(m, map_context);
return r;
}
@@ -1054,8 +1070,9 @@ static int switch_pg_num(struct multipath *m, const char *pgstr)
struct priority_group *pg;
unsigned pgnum;
unsigned long flags;
+ char dummy;
- if (!pgstr || (sscanf(pgstr, "%u", &pgnum) != 1) || !pgnum ||
+ if (!pgstr || (sscanf(pgstr, "%u%c", &pgnum, &dummy) != 1) || !pgnum ||
(pgnum > m->nr_priority_groups)) {
DMWARN("invalid PG number supplied to switch_pg_num");
return -EINVAL;
@@ -1085,8 +1102,9 @@ static int bypass_pg_num(struct multipath *m, const char *pgstr, int bypassed)
{
struct priority_group *pg;
unsigned pgnum;
+ char dummy;
- if (!pgstr || (sscanf(pgstr, "%u", &pgnum) != 1) || !pgnum ||
+ if (!pgstr || (sscanf(pgstr, "%u%c", &pgnum, &dummy) != 1) || !pgnum ||
(pgnum > m->nr_priority_groups)) {
DMWARN("invalid PG number supplied to bypass_pg");
return -EINVAL;
@@ -1261,13 +1279,15 @@ static int multipath_end_io(struct dm_target *ti, struct request *clone,
struct path_selector *ps;
int r;
+ BUG_ON(!mpio);
+
r = do_end_io(m, clone, error, mpio);
if (pgpath) {
ps = &pgpath->pg->ps;
if (ps->type->end_io)
ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes);
}
- mempool_free(mpio, m->mpio_pool);
+ clear_mapinfo(m, map_context);
return r;
}
diff --git a/drivers/md/dm-queue-length.c b/drivers/md/dm-queue-length.c
index 03a837aa5ce6..3941fae0de9f 100644
--- a/drivers/md/dm-queue-length.c
+++ b/drivers/md/dm-queue-length.c
@@ -112,6 +112,7 @@ static int ql_add_path(struct path_selector *ps, struct dm_path *path,
struct selector *s = ps->context;
struct path_info *pi;
unsigned repeat_count = QL_MIN_IO;
+ char dummy;
/*
* Arguments: [<repeat_count>]
@@ -123,7 +124,7 @@ static int ql_add_path(struct path_selector *ps, struct dm_path *path,
return -EINVAL;
}
- if ((argc == 1) && (sscanf(argv[0], "%u", &repeat_count) != 1)) {
+ if ((argc == 1) && (sscanf(argv[0], "%u%c", &repeat_count, &dummy) != 1)) {
*error = "queue-length ps: invalid repeat count";
return -EINVAL;
}
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index c5a875d7b882..b0ba52459ed7 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -604,7 +604,9 @@ static int read_disk_sb(struct md_rdev *rdev, int size)
return 0;
if (!sync_page_io(rdev, 0, size, rdev->sb_page, READ, 1)) {
- DMERR("Failed to read device superblock");
+ DMERR("Failed to read superblock of device at position %d",
+ rdev->raid_disk);
+ set_bit(Faulty, &rdev->flags);
return -EINVAL;
}
@@ -855,9 +857,25 @@ static int super_validate(struct mddev *mddev, struct md_rdev *rdev)
static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
{
int ret;
+ unsigned redundancy = 0;
+ struct raid_dev *dev;
struct md_rdev *rdev, *freshest;
struct mddev *mddev = &rs->md;
+ switch (rs->raid_type->level) {
+ case 1:
+ redundancy = rs->md.raid_disks - 1;
+ break;
+ case 4:
+ case 5:
+ case 6:
+ redundancy = rs->raid_type->parity_devs;
+ break;
+ default:
+ ti->error = "Unknown RAID type";
+ return -EINVAL;
+ }
+
freshest = NULL;
rdev_for_each(rdev, mddev) {
if (!rdev->meta_bdev)
@@ -872,6 +890,37 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
case 0:
break;
default:
+ dev = container_of(rdev, struct raid_dev, rdev);
+ if (redundancy--) {
+ if (dev->meta_dev)
+ dm_put_device(ti, dev->meta_dev);
+
+ dev->meta_dev = NULL;
+ rdev->meta_bdev = NULL;
+
+ if (rdev->sb_page)
+ put_page(rdev->sb_page);
+
+ rdev->sb_page = NULL;
+
+ rdev->sb_loaded = 0;
+
+ /*
+ * We might be able to salvage the data device
+ * even though the meta device has failed. For
+ * now, we behave as though '- -' had been
+ * set for this device in the table.
+ */
+ if (dev->data_dev)
+ dm_put_device(ti, dev->data_dev);
+
+ dev->data_dev = NULL;
+ rdev->bdev = NULL;
+
+ list_del(&rdev->same_set);
+
+ continue;
+ }
ti->error = "Failed to load superblock";
return ret;
}
@@ -1214,7 +1263,7 @@ static void raid_resume(struct dm_target *ti)
static struct target_type raid_target = {
.name = "raid",
- .version = {1, 1, 0},
+ .version = {1, 2, 0},
.module = THIS_MODULE,
.ctr = raid_ctr,
.dtr = raid_dtr,
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 9bfd057be686..d039de8322f0 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -924,8 +924,9 @@ static int get_mirror(struct mirror_set *ms, struct dm_target *ti,
unsigned int mirror, char **argv)
{
unsigned long long offset;
+ char dummy;
- if (sscanf(argv[1], "%llu", &offset) != 1) {
+ if (sscanf(argv[1], "%llu%c", &offset, &dummy) != 1) {
ti->error = "Invalid offset";
return -EINVAL;
}
@@ -953,13 +954,14 @@ static struct dm_dirty_log *create_dirty_log(struct dm_target *ti,
{
unsigned param_count;
struct dm_dirty_log *dl;
+ char dummy;
if (argc < 2) {
ti->error = "Insufficient mirror log arguments";
return NULL;
}
- if (sscanf(argv[1], "%u", &param_count) != 1) {
+ if (sscanf(argv[1], "%u%c", &param_count, &dummy) != 1) {
ti->error = "Invalid mirror log argument count";
return NULL;
}
@@ -986,13 +988,14 @@ static int parse_features(struct mirror_set *ms, unsigned argc, char **argv,
{
unsigned num_features;
struct dm_target *ti = ms->ti;
+ char dummy;
*args_used = 0;
if (!argc)
return 0;
- if (sscanf(argv[0], "%u", &num_features) != 1) {
+ if (sscanf(argv[0], "%u%c", &num_features, &dummy) != 1) {
ti->error = "Invalid number of features";
return -EINVAL;
}
@@ -1036,6 +1039,7 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
unsigned int nr_mirrors, m, args_used;
struct mirror_set *ms;
struct dm_dirty_log *dl;
+ char dummy;
dl = create_dirty_log(ti, argc, argv, &args_used);
if (!dl)
@@ -1044,7 +1048,7 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
argv += args_used;
argc -= args_used;
- if (!argc || sscanf(argv[0], "%u", &nr_mirrors) != 1 ||
+ if (!argc || sscanf(argv[0], "%u%c", &nr_mirrors, &dummy) != 1 ||
nr_mirrors < 2 || nr_mirrors > DM_KCOPYD_MAX_REGIONS + 1) {
ti->error = "Invalid number of mirrors";
dm_dirty_log_destroy(dl);
diff --git a/drivers/md/dm-round-robin.c b/drivers/md/dm-round-robin.c
index 27f1d423b76c..6ab1192cdd5f 100644
--- a/drivers/md/dm-round-robin.c
+++ b/drivers/md/dm-round-robin.c
@@ -114,6 +114,7 @@ static int rr_add_path(struct path_selector *ps, struct dm_path *path,
struct selector *s = (struct selector *) ps->context;
struct path_info *pi;
unsigned repeat_count = RR_MIN_IO;
+ char dummy;
if (argc > 1) {
*error = "round-robin ps: incorrect number of arguments";
@@ -121,7 +122,7 @@ static int rr_add_path(struct path_selector *ps, struct dm_path *path,
}
/* First path argument is number of I/Os before switching path */
- if ((argc == 1) && (sscanf(argv[0], "%u", &repeat_count) != 1)) {
+ if ((argc == 1) && (sscanf(argv[0], "%u%c", &repeat_count, &dummy) != 1)) {
*error = "round-robin ps: invalid repeat count";
return -EINVAL;
}
diff --git a/drivers/md/dm-service-time.c b/drivers/md/dm-service-time.c
index 59883bd78214..9df8f6bd6418 100644
--- a/drivers/md/dm-service-time.c
+++ b/drivers/md/dm-service-time.c
@@ -110,6 +110,7 @@ static int st_add_path(struct path_selector *ps, struct dm_path *path,
struct path_info *pi;
unsigned repeat_count = ST_MIN_IO;
unsigned relative_throughput = 1;
+ char dummy;
/*
* Arguments: [<repeat_count> [<relative_throughput>]]
@@ -128,13 +129,13 @@ static int st_add_path(struct path_selector *ps, struct dm_path *path,
return -EINVAL;
}
- if (argc && (sscanf(argv[0], "%u", &repeat_count) != 1)) {
+ if (argc && (sscanf(argv[0], "%u%c", &repeat_count, &dummy) != 1)) {
*error = "service-time ps: invalid repeat count";
return -EINVAL;
}
if ((argc == 2) &&
- (sscanf(argv[1], "%u", &relative_throughput) != 1 ||
+ (sscanf(argv[1], "%u%c", &relative_throughput, &dummy) != 1 ||
relative_throughput > ST_MAX_RELATIVE_THROUGHPUT)) {
*error = "service-time ps: invalid relative_throughput value";
return -EINVAL;
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index 3d80cf0c152d..35c94ff24ad5 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -75,8 +75,9 @@ static int get_stripe(struct dm_target *ti, struct stripe_c *sc,
unsigned int stripe, char **argv)
{
unsigned long long start;
+ char dummy;
- if (sscanf(argv[1], "%llu", &start) != 1)
+ if (sscanf(argv[1], "%llu%c", &start, &dummy) != 1)
return -EINVAL;
if (dm_get_device(ti, argv[0], dm_table_get_mode(ti->table),
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 63cc54289aff..2e227fbf1622 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -268,8 +268,7 @@ void dm_table_destroy(struct dm_table *t)
vfree(t->highs);
/* free the device list */
- if (t->devices.next != &t->devices)
- free_devices(&t->devices);
+ free_devices(&t->devices);
dm_free_md_mempools(t->mempools);
@@ -464,10 +463,11 @@ int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode,
struct dm_dev_internal *dd;
unsigned int major, minor;
struct dm_table *t = ti->table;
+ char dummy;
BUG_ON(!t);
- if (sscanf(path, "%u:%u", &major, &minor) == 2) {
+ if (sscanf(path, "%u:%u%c", &major, &minor, &dummy) == 2) {
/* Extract the major/minor numbers */
dev = MKDEV(major, minor);
if (MAJOR(dev) != major || MINOR(dev) != minor)
@@ -842,9 +842,10 @@ static int validate_next_arg(struct dm_arg *arg, struct dm_arg_set *arg_set,
unsigned *value, char **error, unsigned grouped)
{
const char *arg_str = dm_shift_arg(arg_set);
+ char dummy;
if (!arg_str ||
- (sscanf(arg_str, "%u", value) != 1) ||
+ (sscanf(arg_str, "%u%c", value, &dummy) != 1) ||
(*value < arg->min) ||
(*value > arg->max) ||
(grouped && arg_set->argc < *value)) {
diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c
index 237571af77fd..737d38865b69 100644
--- a/drivers/md/dm-thin-metadata.c
+++ b/drivers/md/dm-thin-metadata.c
@@ -614,7 +614,7 @@ static int __commit_transaction(struct dm_pool_metadata *pmd)
if (r < 0)
goto out;
- r = dm_sm_root_size(pmd->metadata_sm, &data_len);
+ r = dm_sm_root_size(pmd->data_sm, &data_len);
if (r < 0)
goto out;
@@ -713,6 +713,9 @@ struct dm_pool_metadata *dm_pool_metadata_open(struct block_device *bdev,
if (r)
goto bad;
+ if (bdev_size > THIN_METADATA_MAX_SECTORS)
+ bdev_size = THIN_METADATA_MAX_SECTORS;
+
disk_super = dm_block_data(sblock);
disk_super->magic = cpu_to_le64(THIN_SUPERBLOCK_MAGIC);
disk_super->version = cpu_to_le32(THIN_VERSION);
diff --git a/drivers/md/dm-thin-metadata.h b/drivers/md/dm-thin-metadata.h
index 859c16896877..ed4725e67c96 100644
--- a/drivers/md/dm-thin-metadata.h
+++ b/drivers/md/dm-thin-metadata.h
@@ -11,6 +11,19 @@
#define THIN_METADATA_BLOCK_SIZE 4096
+/*
+ * The metadata device is currently limited in size.
+ *
+ * We have one block of index, which can hold 255 index entries. Each
+ * index entry contains allocation info about 16k metadata blocks.
+ */
+#define THIN_METADATA_MAX_SECTORS (255 * (1 << 14) * (THIN_METADATA_BLOCK_SIZE / (1 << SECTOR_SHIFT)))
+
+/*
+ * A metadata device larger than 16GB triggers a warning.
+ */
+#define THIN_METADATA_MAX_SECTORS_WARNING (16 * (1024 * 1024 * 1024 >> SECTOR_SHIFT))
+
/*----------------------------------------------------------------*/
struct dm_pool_metadata;
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index c3087575fef0..213ae32a0fc4 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -23,6 +23,7 @@
#define DEFERRED_SET_SIZE 64
#define MAPPING_POOL_SIZE 1024
#define PRISON_CELLS 1024
+#define COMMIT_PERIOD HZ
/*
* The block size of the device holding pool data must be
@@ -32,16 +33,6 @@
#define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT)
/*
- * The metadata device is currently limited in size. The limitation is
- * checked lower down in dm-space-map-metadata, but we also check it here
- * so we can fail early.
- *
- * We have one block of index, which can hold 255 index entries. Each
- * index entry contains allocation info about 16k metadata blocks.
- */
-#define METADATA_DEV_MAX_SECTORS (255 * (1 << 14) * (THIN_METADATA_BLOCK_SIZE / (1 << SECTOR_SHIFT)))
-
-/*
* Device id is restricted to 24 bits.
*/
#define MAX_DEV_ID ((1 << 24) - 1)
@@ -72,7 +63,7 @@
* missed out if the io covers the block. (schedule_copy).
*
* iv) insert the new mapping into the origin's btree
- * (process_prepared_mappings). This act of inserting breaks some
+ * (process_prepared_mapping). This act of inserting breaks some
* sharing of btree nodes between the two devices. Breaking sharing only
* effects the btree of that specific device. Btrees for the other
* devices that share the block never change. The btree for the origin
@@ -124,7 +115,7 @@ struct cell {
struct hlist_node list;
struct bio_prison *prison;
struct cell_key key;
- unsigned count;
+ struct bio *holder;
struct bio_list bios;
};
@@ -220,54 +211,59 @@ static struct cell *__search_bucket(struct hlist_head *bucket,
* This may block if a new cell needs allocating. You must ensure that
* cells will be unlocked even if the calling thread is blocked.
*
- * Returns the number of entries in the cell prior to the new addition
- * or < 0 on failure.
+ * Returns 1 if the cell was already held, 0 if @inmate is the new holder.
*/
static int bio_detain(struct bio_prison *prison, struct cell_key *key,
struct bio *inmate, struct cell **ref)
{
- int r;
+ int r = 1;
unsigned long flags;
uint32_t hash = hash_key(prison, key);
- struct cell *uninitialized_var(cell), *cell2 = NULL;
+ struct cell *cell, *cell2;
BUG_ON(hash > prison->nr_buckets);
spin_lock_irqsave(&prison->lock, flags);
+
cell = __search_bucket(prison->cells + hash, key);
+ if (cell) {
+ bio_list_add(&cell->bios, inmate);
+ goto out;
+ }
- if (!cell) {
- /*
- * Allocate a new cell
- */
- spin_unlock_irqrestore(&prison->lock, flags);
- cell2 = mempool_alloc(prison->cell_pool, GFP_NOIO);
- spin_lock_irqsave(&prison->lock, flags);
+ /*
+ * Allocate a new cell
+ */
+ spin_unlock_irqrestore(&prison->lock, flags);
+ cell2 = mempool_alloc(prison->cell_pool, GFP_NOIO);
+ spin_lock_irqsave(&prison->lock, flags);
- /*
- * We've been unlocked, so we have to double check that
- * nobody else has inserted this cell in the meantime.
- */
- cell = __search_bucket(prison->cells + hash, key);
+ /*
+ * We've been unlocked, so we have to double check that
+ * nobody else has inserted this cell in the meantime.
+ */
+ cell = __search_bucket(prison->cells + hash, key);
+ if (cell) {
+ mempool_free(cell2, prison->cell_pool);
+ bio_list_add(&cell->bios, inmate);
+ goto out;
+ }
- if (!cell) {
- cell = cell2;
- cell2 = NULL;
+ /*
+ * Use new cell.
+ */
+ cell = cell2;
- cell->prison = prison;
- memcpy(&cell->key, key, sizeof(cell->key));
- cell->count = 0;
- bio_list_init(&cell->bios);
- hlist_add_head(&cell->list, prison->cells + hash);
- }
- }
+ cell->prison = prison;
+ memcpy(&cell->key, key, sizeof(cell->key));
+ cell->holder = inmate;
+ bio_list_init(&cell->bios);
+ hlist_add_head(&cell->list, prison->cells + hash);
- r = cell->count++;
- bio_list_add(&cell->bios, inmate);
- spin_unlock_irqrestore(&prison->lock, flags);
+ r = 0;
- if (cell2)
- mempool_free(cell2, prison->cell_pool);
+out:
+ spin_unlock_irqrestore(&prison->lock, flags);
*ref = cell;
@@ -283,8 +279,8 @@ static void __cell_release(struct cell *cell, struct bio_list *inmates)
hlist_del(&cell->list);
- if (inmates)
- bio_list_merge(inmates, &cell->bios);
+ bio_list_add(inmates, cell->holder);
+ bio_list_merge(inmates, &cell->bios);
mempool_free(cell, prison->cell_pool);
}
@@ -305,22 +301,44 @@ static void cell_release(struct cell *cell, struct bio_list *bios)
* bio may be in the cell. This function releases the cell, and also does
* a sanity check.
*/
+static void __cell_release_singleton(struct cell *cell, struct bio *bio)
+{
+ hlist_del(&cell->list);
+ BUG_ON(cell->holder != bio);
+ BUG_ON(!bio_list_empty(&cell->bios));
+}
+
static void cell_release_singleton(struct cell *cell, struct bio *bio)
{
- struct bio_prison *prison = cell->prison;
- struct bio_list bios;
- struct bio *b;
unsigned long flags;
-
- bio_list_init(&bios);
+ struct bio_prison *prison = cell->prison;
spin_lock_irqsave(&prison->lock, flags);
- __cell_release(cell, &bios);
+ __cell_release_singleton(cell, bio);
spin_unlock_irqrestore(&prison->lock, flags);
+}
+
+/*
+ * Sometimes we don't want the holder, just the additional bios.
+ */
+static void __cell_release_no_holder(struct cell *cell, struct bio_list *inmates)
+{
+ struct bio_prison *prison = cell->prison;
+
+ hlist_del(&cell->list);
+ bio_list_merge(inmates, &cell->bios);
- b = bio_list_pop(&bios);
- BUG_ON(b != bio);
- BUG_ON(!bio_list_empty(&bios));
+ mempool_free(cell, prison->cell_pool);
+}
+
+static void cell_release_no_holder(struct cell *cell, struct bio_list *inmates)
+{
+ unsigned long flags;
+ struct bio_prison *prison = cell->prison;
+
+ spin_lock_irqsave(&prison->lock, flags);
+ __cell_release_no_holder(cell, inmates);
+ spin_unlock_irqrestore(&prison->lock, flags);
}
static void cell_error(struct cell *cell)
@@ -471,6 +489,13 @@ static void build_virtual_key(struct dm_thin_device *td, dm_block_t b,
* devices.
*/
struct new_mapping;
+
+struct pool_features {
+ unsigned zero_new_blocks:1;
+ unsigned discard_enabled:1;
+ unsigned discard_passdown:1;
+};
+
struct pool {
struct list_head list;
struct dm_target *ti; /* Only set if a pool target is bound */
@@ -484,7 +509,7 @@ struct pool {
dm_block_t offset_mask;
dm_block_t low_water_blocks;
- unsigned zero_new_blocks:1;
+ struct pool_features pf;
unsigned low_water_triggered:1; /* A dm event has been sent */
unsigned no_free_space:1; /* A -ENOSPC warning has been issued */
@@ -493,17 +518,21 @@ struct pool {
struct workqueue_struct *wq;
struct work_struct worker;
+ struct delayed_work waker;
unsigned ref_count;
+ unsigned long last_commit_jiffies;
spinlock_t lock;
struct bio_list deferred_bios;
struct bio_list deferred_flush_bios;
struct list_head prepared_mappings;
+ struct list_head prepared_discards;
struct bio_list retry_on_resume_list;
- struct deferred_set ds; /* FIXME: move to thin_c */
+ struct deferred_set shared_read_ds;
+ struct deferred_set all_io_ds;
struct new_mapping *next_mapping;
mempool_t *mapping_pool;
@@ -521,7 +550,7 @@ struct pool_c {
struct dm_target_callbacks callbacks;
dm_block_t low_water_blocks;
- unsigned zero_new_blocks:1;
+ struct pool_features pf;
};
/*
@@ -529,6 +558,7 @@ struct pool_c {
*/
struct thin_c {
struct dm_dev *pool_dev;
+ struct dm_dev *origin_dev;
dm_thin_id dev_id;
struct pool *pool;
@@ -597,6 +627,13 @@ static struct pool *__pool_table_lookup_metadata_dev(struct block_device *md_dev
/*----------------------------------------------------------------*/
+struct endio_hook {
+ struct thin_c *tc;
+ struct deferred_entry *shared_read_entry;
+ struct deferred_entry *all_io_entry;
+ struct new_mapping *overwrite_mapping;
+};
+
static void __requeue_bio_list(struct thin_c *tc, struct bio_list *master)
{
struct bio *bio;
@@ -607,7 +644,8 @@ static void __requeue_bio_list(struct thin_c *tc, struct bio_list *master)
bio_list_init(master);
while ((bio = bio_list_pop(&bios))) {
- if (dm_get_mapinfo(bio)->ptr == tc)
+ struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
+ if (h->tc == tc)
bio_endio(bio, DM_ENDIO_REQUEUE);
else
bio_list_add(master, bio);
@@ -646,14 +684,16 @@ static void remap(struct thin_c *tc, struct bio *bio, dm_block_t block)
(bio->bi_sector & pool->offset_mask);
}
-static void remap_and_issue(struct thin_c *tc, struct bio *bio,
- dm_block_t block)
+static void remap_to_origin(struct thin_c *tc, struct bio *bio)
+{
+ bio->bi_bdev = tc->origin_dev->bdev;
+}
+
+static void issue(struct thin_c *tc, struct bio *bio)
{
struct pool *pool = tc->pool;
unsigned long flags;
- remap(tc, bio, block);
-
/*
* Batch together any FUA/FLUSH bios we find and then issue
* a single commit for them in process_deferred_bios().
@@ -666,6 +706,19 @@ static void remap_and_issue(struct thin_c *tc, struct bio *bio,
generic_make_request(bio);
}
+static void remap_to_origin_and_issue(struct thin_c *tc, struct bio *bio)
+{
+ remap_to_origin(tc, bio);
+ issue(tc, bio);
+}
+
+static void remap_and_issue(struct thin_c *tc, struct bio *bio,
+ dm_block_t block)
+{
+ remap(tc, bio, block);
+ issue(tc, bio);
+}
+
/*
* wake_worker() is used when new work is queued and when pool_resume is
* ready to continue deferred IO processing.
@@ -680,21 +733,17 @@ static void wake_worker(struct pool *pool)
/*
* Bio endio functions.
*/
-struct endio_hook {
- struct thin_c *tc;
- bio_end_io_t *saved_bi_end_io;
- struct deferred_entry *entry;
-};
-
struct new_mapping {
struct list_head list;
- int prepared;
+ unsigned quiesced:1;
+ unsigned prepared:1;
+ unsigned pass_discard:1;
struct thin_c *tc;
dm_block_t virt_block;
dm_block_t data_block;
- struct cell *cell;
+ struct cell *cell, *cell2;
int err;
/*
@@ -711,7 +760,7 @@ static void __maybe_add_mapping(struct new_mapping *m)
{
struct pool *pool = m->tc->pool;
- if (list_empty(&m->list) && m->prepared) {
+ if (m->quiesced && m->prepared) {
list_add(&m->list, &pool->prepared_mappings);
wake_worker(pool);
}
@@ -734,7 +783,8 @@ static void copy_complete(int read_err, unsigned long write_err, void *context)
static void overwrite_endio(struct bio *bio, int err)
{
unsigned long flags;
- struct new_mapping *m = dm_get_mapinfo(bio)->ptr;
+ struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
+ struct new_mapping *m = h->overwrite_mapping;
struct pool *pool = m->tc->pool;
m->err = err;
@@ -745,31 +795,6 @@ static void overwrite_endio(struct bio *bio, int err)
spin_unlock_irqrestore(&pool->lock, flags);
}
-static void shared_read_endio(struct bio *bio, int err)
-{
- struct list_head mappings;
- struct new_mapping *m, *tmp;
- struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
- unsigned long flags;
- struct pool *pool = h->tc->pool;
-
- bio->bi_end_io = h->saved_bi_end_io;
- bio_endio(bio, err);
-
- INIT_LIST_HEAD(&mappings);
- ds_dec(h->entry, &mappings);
-
- spin_lock_irqsave(&pool->lock, flags);
- list_for_each_entry_safe(m, tmp, &mappings, list) {
- list_del(&m->list);
- INIT_LIST_HEAD(&m->list);
- __maybe_add_mapping(m);
- }
- spin_unlock_irqrestore(&pool->lock, flags);
-
- mempool_free(h, pool->endio_hook_pool);
-}
-
/*----------------------------------------------------------------*/
/*
@@ -800,21 +825,16 @@ static void cell_defer(struct thin_c *tc, struct cell *cell,
* Same as cell_defer above, except it omits one particular detainee,
* a write bio that covers the block and has already been processed.
*/
-static void cell_defer_except(struct thin_c *tc, struct cell *cell,
- struct bio *exception)
+static void cell_defer_except(struct thin_c *tc, struct cell *cell)
{
struct bio_list bios;
- struct bio *bio;
struct pool *pool = tc->pool;
unsigned long flags;
bio_list_init(&bios);
- cell_release(cell, &bios);
spin_lock_irqsave(&pool->lock, flags);
- while ((bio = bio_list_pop(&bios)))
- if (bio != exception)
- bio_list_add(&pool->deferred_bios, bio);
+ cell_release_no_holder(cell, &pool->deferred_bios);
spin_unlock_irqrestore(&pool->lock, flags);
wake_worker(pool);
@@ -854,7 +874,7 @@ static void process_prepared_mapping(struct new_mapping *m)
* the bios in the cell.
*/
if (bio) {
- cell_defer_except(tc, m->cell, bio);
+ cell_defer_except(tc, m->cell);
bio_endio(bio, 0);
} else
cell_defer(tc, m->cell, m->data_block);
@@ -863,7 +883,30 @@ static void process_prepared_mapping(struct new_mapping *m)
mempool_free(m, tc->pool->mapping_pool);
}
-static void process_prepared_mappings(struct pool *pool)
+static void process_prepared_discard(struct new_mapping *m)
+{
+ int r;
+ struct thin_c *tc = m->tc;
+
+ r = dm_thin_remove_block(tc->td, m->virt_block);
+ if (r)
+ DMERR("dm_thin_remove_block() failed");
+
+ /*
+ * Pass the discard down to the underlying device?
+ */
+ if (m->pass_discard)
+ remap_and_issue(tc, m->bio, m->data_block);
+ else
+ bio_endio(m->bio, 0);
+
+ cell_defer_except(tc, m->cell);
+ cell_defer_except(tc, m->cell2);
+ mempool_free(m, tc->pool->mapping_pool);
+}
+
+static void process_prepared(struct pool *pool, struct list_head *head,
+ void (*fn)(struct new_mapping *))
{
unsigned long flags;
struct list_head maps;
@@ -871,21 +914,27 @@ static void process_prepared_mappings(struct pool *pool)
INIT_LIST_HEAD(&maps);
spin_lock_irqsave(&pool->lock, flags);
- list_splice_init(&pool->prepared_mappings, &maps);
+ list_splice_init(head, &maps);
spin_unlock_irqrestore(&pool->lock, flags);
list_for_each_entry_safe(m, tmp, &maps, list)
- process_prepared_mapping(m);
+ fn(m);
}
/*
* Deferred bio jobs.
*/
-static int io_overwrites_block(struct pool *pool, struct bio *bio)
+static int io_overlaps_block(struct pool *pool, struct bio *bio)
{
- return ((bio_data_dir(bio) == WRITE) &&
- !(bio->bi_sector & pool->offset_mask)) &&
+ return !(bio->bi_sector & pool->offset_mask) &&
(bio->bi_size == (pool->sectors_per_block << SECTOR_SHIFT));
+
+}
+
+static int io_overwrites_block(struct pool *pool, struct bio *bio)
+{
+ return (bio_data_dir(bio) == WRITE) &&
+ io_overlaps_block(pool, bio);
}
static void save_and_set_endio(struct bio *bio, bio_end_io_t **save,
@@ -917,7 +966,8 @@ static struct new_mapping *get_next_mapping(struct pool *pool)
}
static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
- dm_block_t data_origin, dm_block_t data_dest,
+ struct dm_dev *origin, dm_block_t data_origin,
+ dm_block_t data_dest,
struct cell *cell, struct bio *bio)
{
int r;
@@ -925,6 +975,7 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
struct new_mapping *m = get_next_mapping(pool);
INIT_LIST_HEAD(&m->list);
+ m->quiesced = 0;
m->prepared = 0;
m->tc = tc;
m->virt_block = virt_block;
@@ -933,7 +984,8 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
m->err = 0;
m->bio = NULL;
- ds_add_work(&pool->ds, &m->list);
+ if (!ds_add_work(&pool->shared_read_ds, &m->list))
+ m->quiesced = 1;
/*
* IO to pool_dev remaps to the pool target's data_dev.
@@ -942,14 +994,15 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
* bio immediately. Otherwise we use kcopyd to clone the data first.
*/
if (io_overwrites_block(pool, bio)) {
+ struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
+ h->overwrite_mapping = m;
m->bio = bio;
save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
- dm_get_mapinfo(bio)->ptr = m;
remap_and_issue(tc, bio, data_dest);
} else {
struct dm_io_region from, to;
- from.bdev = tc->pool_dev->bdev;
+ from.bdev = origin->bdev;
from.sector = data_origin * pool->sectors_per_block;
from.count = pool->sectors_per_block;
@@ -967,6 +1020,22 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
}
}
+static void schedule_internal_copy(struct thin_c *tc, dm_block_t virt_block,
+ dm_block_t data_origin, dm_block_t data_dest,
+ struct cell *cell, struct bio *bio)
+{
+ schedule_copy(tc, virt_block, tc->pool_dev,
+ data_origin, data_dest, cell, bio);
+}
+
+static void schedule_external_copy(struct thin_c *tc, dm_block_t virt_block,
+ dm_block_t data_dest,
+ struct cell *cell, struct bio *bio)
+{
+ schedule_copy(tc, virt_block, tc->origin_dev,
+ virt_block, data_dest, cell, bio);
+}
+
static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
dm_block_t data_block, struct cell *cell,
struct bio *bio)
@@ -975,6 +1044,7 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
struct new_mapping *m = get_next_mapping(pool);
INIT_LIST_HEAD(&m->list);
+ m->quiesced = 1;
m->prepared = 0;
m->tc = tc;
m->virt_block = virt_block;
@@ -988,13 +1058,14 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
* zeroing pre-existing data, we can issue the bio immediately.
* Otherwise we use kcopyd to zero the data first.
*/
- if (!pool->zero_new_blocks)
+ if (!pool->pf.zero_new_blocks)
process_prepared_mapping(m);
else if (io_overwrites_block(pool, bio)) {
+ struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
+ h->overwrite_mapping = m;
m->bio = bio;
save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
- dm_get_mapinfo(bio)->ptr = m;
remap_and_issue(tc, bio, data_block);
} else {
@@ -1081,7 +1152,8 @@ static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
*/
static void retry_on_resume(struct bio *bio)
{
- struct thin_c *tc = dm_get_mapinfo(bio)->ptr;
+ struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
+ struct thin_c *tc = h->tc;
struct pool *pool = tc->pool;
unsigned long flags;
@@ -1102,6 +1174,86 @@ static void no_space(struct cell *cell)
retry_on_resume(bio);
}
+static void process_discard(struct thin_c *tc, struct bio *bio)
+{
+ int r;
+ struct pool *pool = tc->pool;
+ struct cell *cell, *cell2;
+ struct cell_key key, key2;
+ dm_block_t block = get_bio_block(tc, bio);
+ struct dm_thin_lookup_result lookup_result;
+ struct new_mapping *m;
+
+ build_virtual_key(tc->td, block, &key);
+ if (bio_detain(tc->pool->prison, &key, bio, &cell))
+ return;
+
+ r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
+ switch (r) {
+ case 0:
+ /*
+ * Check nobody is fiddling with this pool block. This can
+ * happen if someone's in the process of breaking sharing
+ * on this block.
+ */
+ build_data_key(tc->td, lookup_result.block, &key2);
+ if (bio_detain(tc->pool->prison, &key2, bio, &cell2)) {
+ cell_release_singleton(cell, bio);
+ break;
+ }
+
+ if (io_overlaps_block(pool, bio)) {
+ /*
+ * IO may still be going to the destination block. We must
+ * quiesce before we can do the removal.
+ */
+ m = get_next_mapping(pool);
+ m->tc = tc;
+ m->pass_discard = (!lookup_result.shared) & pool->pf.discard_passdown;
+ m->virt_block = block;
+ m->data_block = lookup_result.block;
+ m->cell = cell;
+ m->cell2 = cell2;
+ m->err = 0;
+ m->bio = bio;
+
+ if (!ds_add_work(&pool->all_io_ds, &m->list)) {
+ list_add(&m->list, &pool->prepared_discards);
+ wake_worker(pool);
+ }
+ } else {
+ /*
+ * This path is hit if people are ignoring
+ * limits->discard_granularity. It ignores any
+ * part of the discard that is in a subsequent
+ * block.
+ */
+ sector_t offset = bio->bi_sector - (block << pool->block_shift);
+ unsigned remaining = (pool->sectors_per_block - offset) << 9;
+ bio->bi_size = min(bio->bi_size, remaining);
+
+ cell_release_singleton(cell, bio);
+ cell_release_singleton(cell2, bio);
+ remap_and_issue(tc, bio, lookup_result.block);
+ }
+ break;
+
+ case -ENODATA:
+ /*
+ * It isn't provisioned, just forget it.
+ */
+ cell_release_singleton(cell, bio);
+ bio_endio(bio, 0);
+ break;
+
+ default:
+ DMERR("discard: find block unexpectedly returned %d", r);
+ cell_release_singleton(cell, bio);
+ bio_io_error(bio);
+ break;
+ }
+}
+
static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block,
struct cell_key *key,
struct dm_thin_lookup_result *lookup_result,
@@ -1113,8 +1265,8 @@ static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block,
r = alloc_data_block(tc, &data_block);
switch (r) {
case 0:
- schedule_copy(tc, block, lookup_result->block,
- data_block, cell, bio);
+ schedule_internal_copy(tc, block, lookup_result->block,
+ data_block, cell, bio);
break;
case -ENOSPC:
@@ -1147,13 +1299,9 @@ static void process_shared_bio(struct thin_c *tc, struct bio *bio,
if (bio_data_dir(bio) == WRITE)
break_sharing(tc, bio, block, &key, lookup_result, cell);
else {
- struct endio_hook *h;
- h = mempool_alloc(pool->endio_hook_pool, GFP_NOIO);
+ struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
- h->tc = tc;
- h->entry = ds_inc(&pool->ds);
- save_and_set_endio(bio, &h->saved_bi_end_io, shared_read_endio);
- dm_get_mapinfo(bio)->ptr = h;
+ h->shared_read_entry = ds_inc(&pool->shared_read_ds);
cell_release_singleton(cell, bio);
remap_and_issue(tc, bio, lookup_result->block);
@@ -1188,7 +1336,10 @@ static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block
r = alloc_data_block(tc, &data_block);
switch (r) {
case 0:
- schedule_zero(tc, block, data_block, cell, bio);
+ if (tc->origin_dev)
+ schedule_external_copy(tc, block, data_block, cell, bio);
+ else
+ schedule_zero(tc, block, data_block, cell, bio);
break;
case -ENOSPC:
@@ -1239,16 +1390,27 @@ static void process_bio(struct thin_c *tc, struct bio *bio)
break;
case -ENODATA:
- provision_block(tc, bio, block, cell);
+ if (bio_data_dir(bio) == READ && tc->origin_dev) {
+ cell_release_singleton(cell, bio);
+ remap_to_origin_and_issue(tc, bio);
+ } else
+ provision_block(tc, bio, block, cell);
break;
default:
DMERR("dm_thin_find_block() failed, error = %d", r);
+ cell_release_singleton(cell, bio);
bio_io_error(bio);
break;
}
}
+static int need_commit_due_to_time(struct pool *pool)
+{
+ return jiffies < pool->last_commit_jiffies ||
+ jiffies > pool->last_commit_jiffies + COMMIT_PERIOD;
+}
+
static void process_deferred_bios(struct pool *pool)
{
unsigned long flags;
@@ -1264,7 +1426,9 @@ static void process_deferred_bios(struct pool *pool)
spin_unlock_irqrestore(&pool->lock, flags);
while ((bio = bio_list_pop(&bios))) {
- struct thin_c *tc = dm_get_mapinfo(bio)->ptr;
+ struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
+ struct thin_c *tc = h->tc;
+
/*
* If we've got no free new_mapping structs, and processing
* this bio might require one, we pause until there are some
@@ -1277,7 +1441,11 @@ static void process_deferred_bios(struct pool *pool)
break;
}
- process_bio(tc, bio);
+
+ if (bio->bi_rw & REQ_DISCARD)
+ process_discard(tc, bio);
+ else
+ process_bio(tc, bio);
}
/*
@@ -1290,7 +1458,7 @@ static void process_deferred_bios(struct pool *pool)
bio_list_init(&pool->deferred_flush_bios);
spin_unlock_irqrestore(&pool->lock, flags);
- if (bio_list_empty(&bios))
+ if (bio_list_empty(&bios) && !need_commit_due_to_time(pool))
return;
r = dm_pool_commit_metadata(pool->pmd);
@@ -1301,6 +1469,7 @@ static void process_deferred_bios(struct pool *pool)
bio_io_error(bio);
return;
}
+ pool->last_commit_jiffies = jiffies;
while ((bio = bio_list_pop(&bios)))
generic_make_request(bio);
@@ -1310,10 +1479,22 @@ static void do_worker(struct work_struct *ws)
{
struct pool *pool = container_of(ws, struct pool, worker);
- process_prepared_mappings(pool);
+ process_prepared(pool, &pool->prepared_mappings, process_prepared_mapping);
+ process_prepared(pool, &pool->prepared_discards, process_prepared_discard);
process_deferred_bios(pool);
}
+/*
+ * We want to commit periodically so that not too much
+ * unwritten data builds up.
+ */
+static void do_waker(struct work_struct *ws)
+{
+ struct pool *pool = container_of(to_delayed_work(ws), struct pool, waker);
+ wake_worker(pool);
+ queue_delayed_work(pool->wq, &pool->waker, COMMIT_PERIOD);
+}
+
/*----------------------------------------------------------------*/
/*
@@ -1335,6 +1516,19 @@ static void thin_defer_bio(struct thin_c *tc, struct bio *bio)
wake_worker(pool);
}
+static struct endio_hook *thin_hook_bio(struct thin_c *tc, struct bio *bio)
+{
+ struct pool *pool = tc->pool;
+ struct endio_hook *h = mempool_alloc(pool->endio_hook_pool, GFP_NOIO);
+
+ h->tc = tc;
+ h->shared_read_entry = NULL;
+ h->all_io_entry = bio->bi_rw & REQ_DISCARD ? NULL : ds_inc(&pool->all_io_ds);
+ h->overwrite_mapping = NULL;
+
+ return h;
+}
+
/*
* Non-blocking function called from the thin target's map function.
*/
@@ -1347,12 +1541,8 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio,
struct dm_thin_device *td = tc->td;
struct dm_thin_lookup_result result;
- /*
- * Save the thin context for easy access from the deferred bio later.
- */
- map_context->ptr = tc;
-
- if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) {
+ map_context->ptr = thin_hook_bio(tc, bio);
+ if (bio->bi_rw & (REQ_DISCARD | REQ_FLUSH | REQ_FUA)) {
thin_defer_bio(tc, bio);
return DM_MAPIO_SUBMITTED;
}
@@ -1434,7 +1624,7 @@ static int bind_control_target(struct pool *pool, struct dm_target *ti)
pool->ti = ti;
pool->low_water_blocks = pt->low_water_blocks;
- pool->zero_new_blocks = pt->zero_new_blocks;
+ pool->pf = pt->pf;
return 0;
}
@@ -1448,6 +1638,14 @@ static void unbind_control_target(struct pool *pool, struct dm_target *ti)
/*----------------------------------------------------------------
* Pool creation
*--------------------------------------------------------------*/
+/* Initialize pool features. */
+static void pool_features_init(struct pool_features *pf)
+{
+ pf->zero_new_blocks = 1;
+ pf->discard_enabled = 1;
+ pf->discard_passdown = 1;
+}
+
static void __pool_destroy(struct pool *pool)
{
__pool_table_remove(pool);
@@ -1495,7 +1693,7 @@ static struct pool *pool_create(struct mapped_device *pool_md,
pool->block_shift = ffs(block_size) - 1;
pool->offset_mask = block_size - 1;
pool->low_water_blocks = 0;
- pool->zero_new_blocks = 1;
+ pool_features_init(&pool->pf);
pool->prison = prison_create(PRISON_CELLS);
if (!pool->prison) {
*error = "Error creating pool's bio prison";
@@ -1523,14 +1721,17 @@ static struct pool *pool_create(struct mapped_device *pool_md,
}
INIT_WORK(&pool->worker, do_worker);
+ INIT_DELAYED_WORK(&pool->waker, do_waker);
spin_lock_init(&pool->lock);
bio_list_init(&pool->deferred_bios);
bio_list_init(&pool->deferred_flush_bios);
INIT_LIST_HEAD(&pool->prepared_mappings);
+ INIT_LIST_HEAD(&pool->prepared_discards);
pool->low_water_triggered = 0;
pool->no_free_space = 0;
bio_list_init(&pool->retry_on_resume_list);
- ds_init(&pool->ds);
+ ds_init(&pool->shared_read_ds);
+ ds_init(&pool->all_io_ds);
pool->next_mapping = NULL;
pool->mapping_pool =
@@ -1549,6 +1750,7 @@ static struct pool *pool_create(struct mapped_device *pool_md,
goto bad_endio_hook_pool;
}
pool->ref_count = 1;
+ pool->last_commit_jiffies = jiffies;
pool->pool_md = pool_md;
pool->md_dev = metadata_dev;
__pool_table_insert(pool);
@@ -1588,7 +1790,8 @@ static void __pool_dec(struct pool *pool)
static struct pool *__pool_find(struct mapped_device *pool_md,
struct block_device *metadata_dev,
- unsigned long block_size, char **error)
+ unsigned long block_size, char **error,
+ int *created)
{
struct pool *pool = __pool_table_lookup_metadata_dev(metadata_dev);
@@ -1604,8 +1807,10 @@ static struct pool *__pool_find(struct mapped_device *pool_md,
return ERR_PTR(-EINVAL);
__pool_inc(pool);
- } else
+ } else {
pool = pool_create(pool_md, metadata_dev, block_size, error);
+ *created = 1;
+ }
}
return pool;
@@ -1629,10 +1834,6 @@ static void pool_dtr(struct dm_target *ti)
mutex_unlock(&dm_thin_pool_table.mutex);
}
-struct pool_features {
- unsigned zero_new_blocks:1;
-};
-
static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf,
struct dm_target *ti)
{
@@ -1641,7 +1842,7 @@ static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf,
const char *arg_name;
static struct dm_arg _args[] = {
- {0, 1, "Invalid number of pool feature arguments"},
+ {0, 3, "Invalid number of pool feature arguments"},
};
/*
@@ -1661,6 +1862,12 @@ static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf,
if (!strcasecmp(arg_name, "skip_block_zeroing")) {
pf->zero_new_blocks = 0;
continue;
+ } else if (!strcasecmp(arg_name, "ignore_discard")) {
+ pf->discard_enabled = 0;
+ continue;
+ } else if (!strcasecmp(arg_name, "no_discard_passdown")) {
+ pf->discard_passdown = 0;
+ continue;
}
ti->error = "Unrecognised pool feature requested";
@@ -1678,10 +1885,12 @@ static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf,
*
* Optional feature arguments are:
* skip_block_zeroing: skips the zeroing of newly-provisioned blocks.
+ * ignore_discard: disable discard
+ * no_discard_passdown: don't pass discards down to the data device
*/
static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
{
- int r;
+ int r, pool_created = 0;
struct pool_c *pt;
struct pool *pool;
struct pool_features pf;
@@ -1691,6 +1900,7 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
dm_block_t low_water_blocks;
struct dm_dev *metadata_dev;
sector_t metadata_dev_size;
+ char b[BDEVNAME_SIZE];
/*
* FIXME Remove validation from scope of lock.
@@ -1712,11 +1922,9 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
}
metadata_dev_size = i_size_read(metadata_dev->bdev->bd_inode) >> SECTOR_SHIFT;
- if (metadata_dev_size > METADATA_DEV_MAX_SECTORS) {
- ti->error = "Metadata device is too large";
- r = -EINVAL;
- goto out_metadata;
- }
+ if (metadata_dev_size > THIN_METADATA_MAX_SECTORS_WARNING)
+ DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.",
+ bdevname(metadata_dev->bdev, b), THIN_METADATA_MAX_SECTORS);
r = dm_get_device(ti, argv[1], FMODE_READ | FMODE_WRITE, &data_dev);
if (r) {
@@ -1742,8 +1950,7 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
/*
* Set default pool features.
*/
- memset(&pf, 0, sizeof(pf));
- pf.zero_new_blocks = 1;
+ pool_features_init(&pf);
dm_consume_args(&as, 4);
r = parse_pool_features(&as, &pf, ti);
@@ -1757,20 +1964,58 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
}
pool = __pool_find(dm_table_get_md(ti->table), metadata_dev->bdev,
- block_size, &ti->error);
+ block_size, &ti->error, &pool_created);
if (IS_ERR(pool)) {
r = PTR_ERR(pool);
goto out_free_pt;
}
+ /*
+ * 'pool_created' reflects whether this is the first table load.
+ * Top level discard support is not allowed to be changed after
+ * initial load. This would require a pool reload to trigger thin
+ * device changes.
+ */
+ if (!pool_created && pf.discard_enabled != pool->pf.discard_enabled) {
+ ti->error = "Discard support cannot be disabled once enabled";
+ r = -EINVAL;
+ goto out_flags_changed;
+ }
+
+ /*
+ * If discard_passdown was enabled verify that the data device
+ * supports discards. Disable discard_passdown if not; otherwise
+ * -EOPNOTSUPP will be returned.
+ */
+ if (pf.discard_passdown) {
+ struct request_queue *q = bdev_get_queue(data_dev->bdev);
+ if (!q || !blk_queue_discard(q)) {
+ DMWARN("Discard unsupported by data device: Disabling discard passdown.");
+ pf.discard_passdown = 0;
+ }
+ }
+
pt->pool = pool;
pt->ti = ti;
pt->metadata_dev = metadata_dev;
pt->data_dev = data_dev;
pt->low_water_blocks = low_water_blocks;
- pt->zero_new_blocks = pf.zero_new_blocks;
+ pt->pf = pf;
ti->num_flush_requests = 1;
- ti->num_discard_requests = 0;
+ /*
+ * Only need to enable discards if the pool should pass
+ * them down to the data device. The thin device's discard
+ * processing will cause mappings to be removed from the btree.
+ */
+ if (pf.discard_enabled && pf.discard_passdown) {
+ ti->num_discard_requests = 1;
+ /*
+ * Setting 'discards_supported' circumvents the normal
+ * stacking of discard limits (this keeps the pool and
+ * thin devices' discard limits consistent).
+ */
+ ti->discards_supported = 1;
+ }
ti->private = pt;
pt->callbacks.congested_fn = pool_is_congested;
@@ -1780,6 +2025,8 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
return 0;
+out_flags_changed:
+ __pool_dec(pool);
out_free_pt:
kfree(pt);
out:
@@ -1878,7 +2125,7 @@ static void pool_resume(struct dm_target *ti)
__requeue_bios(pool);
spin_unlock_irqrestore(&pool->lock, flags);
- wake_worker(pool);
+ do_waker(&pool->waker.work);
}
static void pool_postsuspend(struct dm_target *ti)
@@ -1887,6 +2134,7 @@ static void pool_postsuspend(struct dm_target *ti)
struct pool_c *pt = ti->private;
struct pool *pool = pt->pool;
+ cancel_delayed_work(&pool->waker);
flush_workqueue(pool->wq);
r = dm_pool_commit_metadata(pool->pmd);
@@ -2067,7 +2315,7 @@ static int pool_message(struct dm_target *ti, unsigned argc, char **argv)
static int pool_status(struct dm_target *ti, status_type_t type,
char *result, unsigned maxlen)
{
- int r;
+ int r, count;
unsigned sz = 0;
uint64_t transaction_id;
dm_block_t nr_free_blocks_data;
@@ -2130,10 +2378,19 @@ static int pool_status(struct dm_target *ti, status_type_t type,
(unsigned long)pool->sectors_per_block,
(unsigned long long)pt->low_water_blocks);
- DMEMIT("%u ", !pool->zero_new_blocks);
+ count = !pool->pf.zero_new_blocks + !pool->pf.discard_enabled +
+ !pool->pf.discard_passdown;
+ DMEMIT("%u ", count);
- if (!pool->zero_new_blocks)
+ if (!pool->pf.zero_new_blocks)
DMEMIT("skip_block_zeroing ");
+
+ if (!pool->pf.discard_enabled)
+ DMEMIT("ignore_discard ");
+
+ if (!pool->pf.discard_passdown)
+ DMEMIT("no_discard_passdown ");
+
break;
}
@@ -2162,6 +2419,21 @@ static int pool_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
}
+static void set_discard_limits(struct pool *pool, struct queue_limits *limits)
+{
+ /*
+ * FIXME: these limits may be incompatible with the pool's data device
+ */
+ limits->max_discard_sectors = pool->sectors_per_block;
+
+ /*
+ * This is just a hint, and not enforced. We have to cope with
+ * bios that overlap 2 blocks.
+ */
+ limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT;
+ limits->discard_zeroes_data = pool->pf.zero_new_blocks;
+}
+
static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
{
struct pool_c *pt = ti->private;
@@ -2169,13 +2441,15 @@ static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
blk_limits_io_min(limits, 0);
blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT);
+ if (pool->pf.discard_enabled)
+ set_discard_limits(pool, limits);
}
static struct target_type pool_target = {
.name = "thin-pool",
.features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
DM_TARGET_IMMUTABLE,
- .version = {1, 0, 0},
+ .version = {1, 1, 0},
.module = THIS_MODULE,
.ctr = pool_ctr,
.dtr = pool_dtr,
@@ -2202,6 +2476,8 @@ static void thin_dtr(struct dm_target *ti)
__pool_dec(tc->pool);
dm_pool_close_thin_device(tc->td);
dm_put_device(ti, tc->pool_dev);
+ if (tc->origin_dev)
+ dm_put_device(ti, tc->origin_dev);
kfree(tc);
mutex_unlock(&dm_thin_pool_table.mutex);
@@ -2210,21 +2486,25 @@ static void thin_dtr(struct dm_target *ti)
/*
* Thin target parameters:
*
- * <pool_dev> <dev_id>
+ * <pool_dev> <dev_id> [origin_dev]
*
* pool_dev: the path to the pool (eg, /dev/mapper/my_pool)
* dev_id: the internal device identifier
+ * origin_dev: a device external to the pool that should act as the origin
+ *
+ * If the pool device has discards disabled, they get disabled for the thin
+ * device as well.
*/
static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
{
int r;
struct thin_c *tc;
- struct dm_dev *pool_dev;
+ struct dm_dev *pool_dev, *origin_dev;
struct mapped_device *pool_md;
mutex_lock(&dm_thin_pool_table.mutex);
- if (argc != 2) {
+ if (argc != 2 && argc != 3) {
ti->error = "Invalid argument count";
r = -EINVAL;
goto out_unlock;
@@ -2237,6 +2517,15 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
goto out_unlock;
}
+ if (argc == 3) {
+ r = dm_get_device(ti, argv[2], FMODE_READ, &origin_dev);
+ if (r) {
+ ti->error = "Error opening origin device";
+ goto bad_origin_dev;
+ }
+ tc->origin_dev = origin_dev;
+ }
+
r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &pool_dev);
if (r) {
ti->error = "Error opening pool device";
@@ -2273,8 +2562,12 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
ti->split_io = tc->pool->sectors_per_block;
ti->num_flush_requests = 1;
- ti->num_discard_requests = 0;
- ti->discards_supported = 0;
+
+ /* In case the pool supports discards, pass them on. */
+ if (tc->pool->pf.discard_enabled) {
+ ti->discards_supported = 1;
+ ti->num_discard_requests = 1;
+ }
dm_put(pool_md);
@@ -2289,6 +2582,9 @@ bad_pool_lookup:
bad_common:
dm_put_device(ti, tc->pool_dev);
bad_pool_dev:
+ if (tc->origin_dev)
+ dm_put_device(ti, tc->origin_dev);
+bad_origin_dev:
kfree(tc);
out_unlock:
mutex_unlock(&dm_thin_pool_table.mutex);
@@ -2299,11 +2595,46 @@ out_unlock:
static int thin_map(struct dm_target *ti, struct bio *bio,
union map_info *map_context)
{
- bio->bi_sector -= ti->begin;
+ bio->bi_sector = dm_target_offset(ti, bio->bi_sector);
return thin_bio_map(ti, bio, map_context);
}
+static int thin_endio(struct dm_target *ti,
+ struct bio *bio, int err,
+ union map_info *map_context)
+{
+ unsigned long flags;
+ struct endio_hook *h = map_context->ptr;
+ struct list_head work;
+ struct new_mapping *m, *tmp;
+ struct pool *pool = h->tc->pool;
+
+ if (h->shared_read_entry) {
+ INIT_LIST_HEAD(&work);
+ ds_dec(h->shared_read_entry, &work);
+
+ spin_lock_irqsave(&pool->lock, flags);
+ list_for_each_entry_safe(m, tmp, &work, list) {
+ list_del(&m->list);
+ m->quiesced = 1;
+ __maybe_add_mapping(m);
+ }
+ spin_unlock_irqrestore(&pool->lock, flags);
+ }
+
+ if (h->all_io_entry) {
+ INIT_LIST_HEAD(&work);
+ ds_dec(h->all_io_entry, &work);
+ list_for_each_entry_safe(m, tmp, &work, list)
+ list_add(&m->list, &pool->prepared_discards);
+ }
+
+ mempool_free(h, pool->endio_hook_pool);
+
+ return 0;
+}
+
static void thin_postsuspend(struct dm_target *ti)
{
if (dm_noflush_suspending(ti))
@@ -2347,6 +2678,8 @@ static int thin_status(struct dm_target *ti, status_type_t type,
DMEMIT("%s %lu",
format_dev_t(buf, tc->pool_dev->bdev->bd_dev),
(unsigned long) tc->dev_id);
+ if (tc->origin_dev)
+ DMEMIT(" %s", format_dev_t(buf, tc->origin_dev->bdev->bd_dev));
break;
}
}
@@ -2377,18 +2710,21 @@ static int thin_iterate_devices(struct dm_target *ti,
static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits)
{
struct thin_c *tc = ti->private;
+ struct pool *pool = tc->pool;
blk_limits_io_min(limits, 0);
- blk_limits_io_opt(limits, tc->pool->sectors_per_block << SECTOR_SHIFT);
+ blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT);
+ set_discard_limits(pool, limits);
}
static struct target_type thin_target = {
.name = "thin",
- .version = {1, 0, 0},
+ .version = {1, 1, 0},
.module = THIS_MODULE,
.ctr = thin_ctr,
.dtr = thin_dtr,
.map = thin_map,
+ .end_io = thin_endio,
.postsuspend = thin_postsuspend,
.status = thin_status,
.iterate_devices = thin_iterate_devices,
diff --git a/drivers/md/dm-verity.c b/drivers/md/dm-verity.c
new file mode 100644
index 000000000000..fa365d39b612
--- /dev/null
+++ b/drivers/md/dm-verity.c
@@ -0,0 +1,913 @@
+/*
+ * Copyright (C) 2012 Red Hat, Inc.
+ *
+ * Author: Mikulas Patocka <mpatocka@redhat.com>
+ *
+ * Based on Chromium dm-verity driver (C) 2011 The Chromium OS Authors
+ *
+ * This file is released under the GPLv2.
+ *
+ * In the file "/sys/module/dm_verity/parameters/prefetch_cluster" you can set
+ * default prefetch value. Data are read in "prefetch_cluster" chunks from the
+ * hash device. Setting this greatly improves performance when data and hash
+ * are on the same disk on different partitions on devices with poor random
+ * access behavior.
+ */
+
+#include "dm-bufio.h"
+
+#include <linux/module.h>
+#include <linux/device-mapper.h>
+#include <crypto/hash.h>
+
+#define DM_MSG_PREFIX "verity"
+
+#define DM_VERITY_IO_VEC_INLINE 16
+#define DM_VERITY_MEMPOOL_SIZE 4
+#define DM_VERITY_DEFAULT_PREFETCH_SIZE 262144
+
+#define DM_VERITY_MAX_LEVELS 63
+
+static unsigned dm_verity_prefetch_cluster = DM_VERITY_DEFAULT_PREFETCH_SIZE;
+
+module_param_named(prefetch_cluster, dm_verity_prefetch_cluster, uint, S_IRUGO | S_IWUSR);
+
+struct dm_verity {
+ struct dm_dev *data_dev;
+ struct dm_dev *hash_dev;
+ struct dm_target *ti;
+ struct dm_bufio_client *bufio;
+ char *alg_name;
+ struct crypto_shash *tfm;
+ u8 *root_digest; /* digest of the root block */
+ u8 *salt; /* salt: its size is salt_size */
+ unsigned salt_size;
+ sector_t data_start; /* data offset in 512-byte sectors */
+ sector_t hash_start; /* hash start in blocks */
+ sector_t data_blocks; /* the number of data blocks */
+ sector_t hash_blocks; /* the number of hash blocks */
+ unsigned char data_dev_block_bits; /* log2(data blocksize) */
+ unsigned char hash_dev_block_bits; /* log2(hash blocksize) */
+ unsigned char hash_per_block_bits; /* log2(hashes in hash block) */
+ unsigned char levels; /* the number of tree levels */
+ unsigned char version;
+ unsigned digest_size; /* digest size for the current hash algorithm */
+ unsigned shash_descsize;/* the size of temporary space for crypto */
+ int hash_failed; /* set to 1 if hash of any block failed */
+
+ mempool_t *io_mempool; /* mempool of struct dm_verity_io */
+ mempool_t *vec_mempool; /* mempool of bio vector */
+
+ struct workqueue_struct *verify_wq;
+
+ /* starting blocks for each tree level. 0 is the lowest level. */
+ sector_t hash_level_block[DM_VERITY_MAX_LEVELS];
+};
+
+struct dm_verity_io {
+ struct dm_verity *v;
+ struct bio *bio;
+
+ /* original values of bio->bi_end_io and bio->bi_private */
+ bio_end_io_t *orig_bi_end_io;
+ void *orig_bi_private;
+
+ sector_t block;
+ unsigned n_blocks;
+
+ /* saved bio vector */
+ struct bio_vec *io_vec;
+ unsigned io_vec_size;
+
+ struct work_struct work;
+
+ /* A space for short vectors; longer vectors are allocated separately. */
+ struct bio_vec io_vec_inline[DM_VERITY_IO_VEC_INLINE];
+
+ /*
+ * Three variably-size fields follow this struct:
+ *
+ * u8 hash_desc[v->shash_descsize];
+ * u8 real_digest[v->digest_size];
+ * u8 want_digest[v->digest_size];
+ *
+ * To access them use: io_hash_desc(), io_real_digest() and io_want_digest().
+ */
+};
+
+static struct shash_desc *io_hash_desc(struct dm_verity *v, struct dm_verity_io *io)
+{
+ return (struct shash_desc *)(io + 1);
+}
+
+static u8 *io_real_digest(struct dm_verity *v, struct dm_verity_io *io)
+{
+ return (u8 *)(io + 1) + v->shash_descsize;
+}
+
+static u8 *io_want_digest(struct dm_verity *v, struct dm_verity_io *io)
+{
+ return (u8 *)(io + 1) + v->shash_descsize + v->digest_size;
+}
+
+/*
+ * Auxiliary structure appended to each dm-bufio buffer. If the value
+ * hash_verified is nonzero, hash of the block has been verified.
+ *
+ * The variable hash_verified is set to 0 when allocating the buffer, then
+ * it can be changed to 1 and it is never reset to 0 again.
+ *
+ * There is no lock around this value, a race condition can at worst cause
+ * that multiple processes verify the hash of the same buffer simultaneously
+ * and write 1 to hash_verified simultaneously.
+ * This condition is harmless, so we don't need locking.
+ */
+struct buffer_aux {
+ int hash_verified;
+};
+
+/*
+ * Initialize struct buffer_aux for a freshly created buffer.
+ */
+static void dm_bufio_alloc_callback(struct dm_buffer *buf)
+{
+ struct buffer_aux *aux = dm_bufio_get_aux_data(buf);
+
+ aux->hash_verified = 0;
+}
+
+/*
+ * Translate input sector number to the sector number on the target device.
+ */
+static sector_t verity_map_sector(struct dm_verity *v, sector_t bi_sector)
+{
+ return v->data_start + dm_target_offset(v->ti, bi_sector);
+}
+
+/*
+ * Return hash position of a specified block at a specified tree level
+ * (0 is the lowest level).
+ * The lowest "hash_per_block_bits"-bits of the result denote hash position
+ * inside a hash block. The remaining bits denote location of the hash block.
+ */
+static sector_t verity_position_at_level(struct dm_verity *v, sector_t block,
+ int level)
+{
+ return block >> (level * v->hash_per_block_bits);
+}
+
+static void verity_hash_at_level(struct dm_verity *v, sector_t block, int level,
+ sector_t *hash_block, unsigned *offset)
+{
+ sector_t position = verity_position_at_level(v, block, level);
+ unsigned idx;
+
+ *hash_block = v->hash_level_block[level] + (position >> v->hash_per_block_bits);
+
+ if (!offset)
+ return;
+
+ idx = position & ((1 << v->hash_per_block_bits) - 1);
+ if (!v->version)
+ *offset = idx * v->digest_size;
+ else
+ *offset = idx << (v->hash_dev_block_bits - v->hash_per_block_bits);
+}
+
+/*
+ * Verify hash of a metadata block pertaining to the specified data block
+ * ("block" argument) at a specified level ("level" argument).
+ *
+ * On successful return, io_want_digest(v, io) contains the hash value for
+ * a lower tree level or for the data block (if we're at the lowest leve).
+ *
+ * If "skip_unverified" is true, unverified buffer is skipped and 1 is returned.
+ * If "skip_unverified" is false, unverified buffer is hashed and verified
+ * against current value of io_want_digest(v, io).
+ */
+static int verity_verify_level(struct dm_verity_io *io, sector_t block,
+ int level, bool skip_unverified)
+{
+ struct dm_verity *v = io->v;
+ struct dm_buffer *buf;
+ struct buffer_aux *aux;
+ u8 *data;
+ int r;
+ sector_t hash_block;
+ unsigned offset;
+
+ verity_hash_at_level(v, block, level, &hash_block, &offset);
+
+ data = dm_bufio_read(v->bufio, hash_block, &buf);
+ if (unlikely(IS_ERR(data)))
+ return PTR_ERR(data);
+
+ aux = dm_bufio_get_aux_data(buf);
+
+ if (!aux->hash_verified) {
+ struct shash_desc *desc;
+ u8 *result;
+
+ if (skip_unverified) {
+ r = 1;
+ goto release_ret_r;
+ }
+
+ desc = io_hash_desc(v, io);
+ desc->tfm = v->tfm;
+ desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP;
+ r = crypto_shash_init(desc);
+ if (r < 0) {
+ DMERR("crypto_shash_init failed: %d", r);
+ goto release_ret_r;
+ }
+
+ if (likely(v->version >= 1)) {
+ r = crypto_shash_update(desc, v->salt, v->salt_size);
+ if (r < 0) {
+ DMERR("crypto_shash_update failed: %d", r);
+ goto release_ret_r;
+ }
+ }
+
+ r = crypto_shash_update(desc, data, 1 << v->hash_dev_block_bits);
+ if (r < 0) {
+ DMERR("crypto_shash_update failed: %d", r);
+ goto release_ret_r;
+ }
+
+ if (!v->version) {
+ r = crypto_shash_update(desc, v->salt, v->salt_size);
+ if (r < 0) {
+ DMERR("crypto_shash_update failed: %d", r);
+ goto release_ret_r;
+ }
+ }
+
+ result = io_real_digest(v, io);
+ r = crypto_shash_final(desc, result);
+ if (r < 0) {
+ DMERR("crypto_shash_final failed: %d", r);
+ goto release_ret_r;
+ }
+ if (unlikely(memcmp(result, io_want_digest(v, io), v->digest_size))) {
+ DMERR_LIMIT("metadata block %llu is corrupted",
+ (unsigned long long)hash_block);
+ v->hash_failed = 1;
+ r = -EIO;
+ goto release_ret_r;
+ } else
+ aux->hash_verified = 1;
+ }
+
+ data += offset;
+
+ memcpy(io_want_digest(v, io), data, v->digest_size);
+
+ dm_bufio_release(buf);
+ return 0;
+
+release_ret_r:
+ dm_bufio_release(buf);
+
+ return r;
+}
+
+/*
+ * Verify one "dm_verity_io" structure.
+ */
+static int verity_verify_io(struct dm_verity_io *io)
+{
+ struct dm_verity *v = io->v;
+ unsigned b;
+ int i;
+ unsigned vector = 0, offset = 0;
+
+ for (b = 0; b < io->n_blocks; b++) {
+ struct shash_desc *desc;
+ u8 *result;
+ int r;
+ unsigned todo;
+
+ if (likely(v->levels)) {
+ /*
+ * First, we try to get the requested hash for
+ * the current block. If the hash block itself is
+ * verified, zero is returned. If it isn't, this
+ * function returns 0 and we fall back to whole
+ * chain verification.
+ */
+ int r = verity_verify_level(io, io->block + b, 0, true);
+ if (likely(!r))
+ goto test_block_hash;
+ if (r < 0)
+ return r;
+ }
+
+ memcpy(io_want_digest(v, io), v->root_digest, v->digest_size);
+
+ for (i = v->levels - 1; i >= 0; i--) {
+ int r = verity_verify_level(io, io->block + b, i, false);
+ if (unlikely(r))
+ return r;
+ }
+
+test_block_hash:
+ desc = io_hash_desc(v, io);
+ desc->tfm = v->tfm;
+ desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP;
+ r = crypto_shash_init(desc);
+ if (r < 0) {
+ DMERR("crypto_shash_init failed: %d", r);
+ return r;
+ }
+
+ if (likely(v->version >= 1)) {
+ r = crypto_shash_update(desc, v->salt, v->salt_size);
+ if (r < 0) {
+ DMERR("crypto_shash_update failed: %d", r);
+ return r;
+ }
+ }
+
+ todo = 1 << v->data_dev_block_bits;
+ do {
+ struct bio_vec *bv;
+ u8 *page;
+ unsigned len;
+
+ BUG_ON(vector >= io->io_vec_size);
+ bv = &io->io_vec[vector];
+ page = kmap_atomic(bv->bv_page);
+ len = bv->bv_len - offset;
+ if (likely(len >= todo))
+ len = todo;
+ r = crypto_shash_update(desc,
+ page + bv->bv_offset + offset, len);
+ kunmap_atomic(page);
+ if (r < 0) {
+ DMERR("crypto_shash_update failed: %d", r);
+ return r;
+ }
+ offset += len;
+ if (likely(offset == bv->bv_len)) {
+ offset = 0;
+ vector++;
+ }
+ todo -= len;
+ } while (todo);
+
+ if (!v->version) {
+ r = crypto_shash_update(desc, v->salt, v->salt_size);
+ if (r < 0) {
+ DMERR("crypto_shash_update failed: %d", r);
+ return r;
+ }
+ }
+
+ result = io_real_digest(v, io);
+ r = crypto_shash_final(desc, result);
+ if (r < 0) {
+ DMERR("crypto_shash_final failed: %d", r);
+ return r;
+ }
+ if (unlikely(memcmp(result, io_want_digest(v, io), v->digest_size))) {
+ DMERR_LIMIT("data block %llu is corrupted",
+ (unsigned long long)(io->block + b));
+ v->hash_failed = 1;
+ return -EIO;
+ }
+ }
+ BUG_ON(vector != io->io_vec_size);
+ BUG_ON(offset);
+
+ return 0;
+}
+
+/*
+ * End one "io" structure with a given error.
+ */
+static void verity_finish_io(struct dm_verity_io *io, int error)
+{
+ struct bio *bio = io->bio;
+ struct dm_verity *v = io->v;
+
+ bio->bi_end_io = io->orig_bi_end_io;
+ bio->bi_private = io->orig_bi_private;
+
+ if (io->io_vec != io->io_vec_inline)
+ mempool_free(io->io_vec, v->vec_mempool);
+
+ mempool_free(io, v->io_mempool);
+
+ bio_endio(bio, error);
+}
+
+static void verity_work(struct work_struct *w)
+{
+ struct dm_verity_io *io = container_of(w, struct dm_verity_io, work);
+
+ verity_finish_io(io, verity_verify_io(io));
+}
+
+static void verity_end_io(struct bio *bio, int error)
+{
+ struct dm_verity_io *io = bio->bi_private;
+
+ if (error) {
+ verity_finish_io(io, error);
+ return;
+ }
+
+ INIT_WORK(&io->work, verity_work);
+ queue_work(io->v->verify_wq, &io->work);
+}
+
+/*
+ * Prefetch buffers for the specified io.
+ * The root buffer is not prefetched, it is assumed that it will be cached
+ * all the time.
+ */
+static void verity_prefetch_io(struct dm_verity *v, struct dm_verity_io *io)
+{
+ int i;
+
+ for (i = v->levels - 2; i >= 0; i--) {
+ sector_t hash_block_start;
+ sector_t hash_block_end;
+ verity_hash_at_level(v, io->block, i, &hash_block_start, NULL);
+ verity_hash_at_level(v, io->block + io->n_blocks - 1, i, &hash_block_end, NULL);
+ if (!i) {
+ unsigned cluster = *(volatile unsigned *)&dm_verity_prefetch_cluster;
+
+ cluster >>= v->data_dev_block_bits;
+ if (unlikely(!cluster))
+ goto no_prefetch_cluster;
+
+ if (unlikely(cluster & (cluster - 1)))
+ cluster = 1 << (fls(cluster) - 1);
+
+ hash_block_start &= ~(sector_t)(cluster - 1);
+ hash_block_end |= cluster - 1;
+ if (unlikely(hash_block_end >= v->hash_blocks))
+ hash_block_end = v->hash_blocks - 1;
+ }
+no_prefetch_cluster:
+ dm_bufio_prefetch(v->bufio, hash_block_start,
+ hash_block_end - hash_block_start + 1);
+ }
+}
+
+/*
+ * Bio map function. It allocates dm_verity_io structure and bio vector and
+ * fills them. Then it issues prefetches and the I/O.
+ */
+static int verity_map(struct dm_target *ti, struct bio *bio,
+ union map_info *map_context)
+{
+ struct dm_verity *v = ti->private;
+ struct dm_verity_io *io;
+
+ bio->bi_bdev = v->data_dev->bdev;
+ bio->bi_sector = verity_map_sector(v, bio->bi_sector);
+
+ if (((unsigned)bio->bi_sector | bio_sectors(bio)) &
+ ((1 << (v->data_dev_block_bits - SECTOR_SHIFT)) - 1)) {
+ DMERR_LIMIT("unaligned io");
+ return -EIO;
+ }
+
+ if ((bio->bi_sector + bio_sectors(bio)) >>
+ (v->data_dev_block_bits - SECTOR_SHIFT) > v->data_blocks) {
+ DMERR_LIMIT("io out of range");
+ return -EIO;
+ }
+
+ if (bio_data_dir(bio) == WRITE)
+ return -EIO;
+
+ io = mempool_alloc(v->io_mempool, GFP_NOIO);
+ io->v = v;
+ io->bio = bio;
+ io->orig_bi_end_io = bio->bi_end_io;
+ io->orig_bi_private = bio->bi_private;
+ io->block = bio->bi_sector >> (v->data_dev_block_bits - SECTOR_SHIFT);
+ io->n_blocks = bio->bi_size >> v->data_dev_block_bits;
+
+ bio->bi_end_io = verity_end_io;
+ bio->bi_private = io;
+ io->io_vec_size = bio->bi_vcnt - bio->bi_idx;
+ if (io->io_vec_size < DM_VERITY_IO_VEC_INLINE)
+ io->io_vec = io->io_vec_inline;
+ else
+ io->io_vec = mempool_alloc(v->vec_mempool, GFP_NOIO);
+ memcpy(io->io_vec, bio_iovec(bio),
+ io->io_vec_size * sizeof(struct bio_vec));
+
+ verity_prefetch_io(v, io);
+
+ generic_make_request(bio);
+
+ return DM_MAPIO_SUBMITTED;
+}
+
+/*
+ * Status: V (valid) or C (corruption found)
+ */
+static int verity_status(struct dm_target *ti, status_type_t type,
+ char *result, unsigned maxlen)
+{
+ struct dm_verity *v = ti->private;
+ unsigned sz = 0;
+ unsigned x;
+
+ switch (type) {
+ case STATUSTYPE_INFO:
+ DMEMIT("%c", v->hash_failed ? 'C' : 'V');
+ break;
+ case STATUSTYPE_TABLE:
+ DMEMIT("%u %s %s %u %u %llu %llu %s ",
+ v->version,
+ v->data_dev->name,
+ v->hash_dev->name,
+ 1 << v->data_dev_block_bits,
+ 1 << v->hash_dev_block_bits,
+ (unsigned long long)v->data_blocks,
+ (unsigned long long)v->hash_start,
+ v->alg_name
+ );
+ for (x = 0; x < v->digest_size; x++)
+ DMEMIT("%02x", v->root_digest[x]);
+ DMEMIT(" ");
+ if (!v->salt_size)
+ DMEMIT("-");
+ else
+ for (x = 0; x < v->salt_size; x++)
+ DMEMIT("%02x", v->salt[x]);
+ break;
+ }
+
+ return 0;
+}
+
+static int verity_ioctl(struct dm_target *ti, unsigned cmd,
+ unsigned long arg)
+{
+ struct dm_verity *v = ti->private;
+ int r = 0;
+
+ if (v->data_start ||
+ ti->len != i_size_read(v->data_dev->bdev->bd_inode) >> SECTOR_SHIFT)
+ r = scsi_verify_blk_ioctl(NULL, cmd);
+
+ return r ? : __blkdev_driver_ioctl(v->data_dev->bdev, v->data_dev->mode,
+ cmd, arg);
+}
+
+static int verity_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
+ struct bio_vec *biovec, int max_size)
+{
+ struct dm_verity *v = ti->private;
+ struct request_queue *q = bdev_get_queue(v->data_dev->bdev);
+
+ if (!q->merge_bvec_fn)
+ return max_size;
+
+ bvm->bi_bdev = v->data_dev->bdev;
+ bvm->bi_sector = verity_map_sector(v, bvm->bi_sector);
+
+ return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
+}
+
+static int verity_iterate_devices(struct dm_target *ti,
+ iterate_devices_callout_fn fn, void *data)
+{
+ struct dm_verity *v = ti->private;
+
+ return fn(ti, v->data_dev, v->data_start, ti->len, data);
+}
+
+static void verity_io_hints(struct dm_target *ti, struct queue_limits *limits)
+{
+ struct dm_verity *v = ti->private;
+
+ if (limits->logical_block_size < 1 << v->data_dev_block_bits)
+ limits->logical_block_size = 1 << v->data_dev_block_bits;
+
+ if (limits->physical_block_size < 1 << v->data_dev_block_bits)
+ limits->physical_block_size = 1 << v->data_dev_block_bits;
+
+ blk_limits_io_min(limits, limits->logical_block_size);
+}
+
+static void verity_dtr(struct dm_target *ti)
+{
+ struct dm_verity *v = ti->private;
+
+ if (v->verify_wq)
+ destroy_workqueue(v->verify_wq);
+
+ if (v->vec_mempool)
+ mempool_destroy(v->vec_mempool);
+
+ if (v->io_mempool)
+ mempool_destroy(v->io_mempool);
+
+ if (v->bufio)
+ dm_bufio_client_destroy(v->bufio);
+
+ kfree(v->salt);
+ kfree(v->root_digest);
+
+ if (v->tfm)
+ crypto_free_shash(v->tfm);
+
+ kfree(v->alg_name);
+
+ if (v->hash_dev)
+ dm_put_device(ti, v->hash_dev);
+
+ if (v->data_dev)
+ dm_put_device(ti, v->data_dev);
+
+ kfree(v);
+}
+
+/*
+ * Target parameters:
+ * <version> The current format is version 1.
+ * Vsn 0 is compatible with original Chromium OS releases.
+ * <data device>
+ * <hash device>
+ * <data block size>
+ * <hash block size>
+ * <the number of data blocks>
+ * <hash start block>
+ * <algorithm>
+ * <digest>
+ * <salt> Hex string or "-" if no salt.
+ */
+static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv)
+{
+ struct dm_verity *v;
+ unsigned num;
+ unsigned long long num_ll;
+ int r;
+ int i;
+ sector_t hash_position;
+ char dummy;
+
+ v = kzalloc(sizeof(struct dm_verity), GFP_KERNEL);
+ if (!v) {
+ ti->error = "Cannot allocate verity structure";
+ return -ENOMEM;
+ }
+ ti->private = v;
+ v->ti = ti;
+
+ if ((dm_table_get_mode(ti->table) & ~FMODE_READ)) {
+ ti->error = "Device must be readonly";
+ r = -EINVAL;
+ goto bad;
+ }
+
+ if (argc != 10) {
+ ti->error = "Invalid argument count: exactly 10 arguments required";
+ r = -EINVAL;
+ goto bad;
+ }
+
+ if (sscanf(argv[0], "%d%c", &num, &dummy) != 1 ||
+ num < 0 || num > 1) {
+ ti->error = "Invalid version";
+ r = -EINVAL;
+ goto bad;
+ }
+ v->version = num;
+
+ r = dm_get_device(ti, argv[1], FMODE_READ, &v->data_dev);
+ if (r) {
+ ti->error = "Data device lookup failed";
+ goto bad;
+ }
+
+ r = dm_get_device(ti, argv[2], FMODE_READ, &v->hash_dev);
+ if (r) {
+ ti->error = "Data device lookup failed";
+ goto bad;
+ }
+
+ if (sscanf(argv[3], "%u%c", &num, &dummy) != 1 ||
+ !num || (num & (num - 1)) ||
+ num < bdev_logical_block_size(v->data_dev->bdev) ||
+ num > PAGE_SIZE) {
+ ti->error = "Invalid data device block size";
+ r = -EINVAL;
+ goto bad;
+ }
+ v->data_dev_block_bits = ffs(num) - 1;
+
+ if (sscanf(argv[4], "%u%c", &num, &dummy) != 1 ||
+ !num || (num & (num - 1)) ||
+ num < bdev_logical_block_size(v->hash_dev->bdev) ||
+ num > INT_MAX) {
+ ti->error = "Invalid hash device block size";
+ r = -EINVAL;
+ goto bad;
+ }
+ v->hash_dev_block_bits = ffs(num) - 1;
+
+ if (sscanf(argv[5], "%llu%c", &num_ll, &dummy) != 1 ||
+ num_ll << (v->data_dev_block_bits - SECTOR_SHIFT) !=
+ (sector_t)num_ll << (v->data_dev_block_bits - SECTOR_SHIFT)) {
+ ti->error = "Invalid data blocks";
+ r = -EINVAL;
+ goto bad;
+ }
+ v->data_blocks = num_ll;
+
+ if (ti->len > (v->data_blocks << (v->data_dev_block_bits - SECTOR_SHIFT))) {
+ ti->error = "Data device is too small";
+ r = -EINVAL;
+ goto bad;
+ }
+
+ if (sscanf(argv[6], "%llu%c", &num_ll, &dummy) != 1 ||
+ num_ll << (v->hash_dev_block_bits - SECTOR_SHIFT) !=
+ (sector_t)num_ll << (v->hash_dev_block_bits - SECTOR_SHIFT)) {
+ ti->error = "Invalid hash start";
+ r = -EINVAL;
+ goto bad;
+ }
+ v->hash_start = num_ll;
+
+ v->alg_name = kstrdup(argv[7], GFP_KERNEL);
+ if (!v->alg_name) {
+ ti->error = "Cannot allocate algorithm name";
+ r = -ENOMEM;
+ goto bad;
+ }
+
+ v->tfm = crypto_alloc_shash(v->alg_name, 0, 0);
+ if (IS_ERR(v->tfm)) {
+ ti->error = "Cannot initialize hash function";
+ r = PTR_ERR(v->tfm);
+ v->tfm = NULL;
+ goto bad;
+ }
+ v->digest_size = crypto_shash_digestsize(v->tfm);
+ if ((1 << v->hash_dev_block_bits) < v->digest_size * 2) {
+ ti->error = "Digest size too big";
+ r = -EINVAL;
+ goto bad;
+ }
+ v->shash_descsize =
+ sizeof(struct shash_desc) + crypto_shash_descsize(v->tfm);
+
+ v->root_digest = kmalloc(v->digest_size, GFP_KERNEL);
+ if (!v->root_digest) {
+ ti->error = "Cannot allocate root digest";
+ r = -ENOMEM;
+ goto bad;
+ }
+ if (strlen(argv[8]) != v->digest_size * 2 ||
+ hex2bin(v->root_digest, argv[8], v->digest_size)) {
+ ti->error = "Invalid root digest";
+ r = -EINVAL;
+ goto bad;
+ }
+
+ if (strcmp(argv[9], "-")) {
+ v->salt_size = strlen(argv[9]) / 2;
+ v->salt = kmalloc(v->salt_size, GFP_KERNEL);
+ if (!v->salt) {
+ ti->error = "Cannot allocate salt";
+ r = -ENOMEM;
+ goto bad;
+ }
+ if (strlen(argv[9]) != v->salt_size * 2 ||
+ hex2bin(v->salt, argv[9], v->salt_size)) {
+ ti->error = "Invalid salt";
+ r = -EINVAL;
+ goto bad;
+ }
+ }
+
+ v->hash_per_block_bits =
+ fls((1 << v->hash_dev_block_bits) / v->digest_size) - 1;
+
+ v->levels = 0;
+ if (v->data_blocks)
+ while (v->hash_per_block_bits * v->levels < 64 &&
+ (unsigned long long)(v->data_blocks - 1) >>
+ (v->hash_per_block_bits * v->levels))
+ v->levels++;
+
+ if (v->levels > DM_VERITY_MAX_LEVELS) {
+ ti->error = "Too many tree levels";
+ r = -E2BIG;
+ goto bad;
+ }
+
+ hash_position = v->hash_start;
+ for (i = v->levels - 1; i >= 0; i--) {
+ sector_t s;
+ v->hash_level_block[i] = hash_position;
+ s = verity_position_at_level(v, v->data_blocks, i);
+ s = (s >> v->hash_per_block_bits) +
+ !!(s & ((1 << v->hash_per_block_bits) - 1));
+ if (hash_position + s < hash_position) {
+ ti->error = "Hash device offset overflow";
+ r = -E2BIG;
+ goto bad;
+ }
+ hash_position += s;
+ }
+ v->hash_blocks = hash_position;
+
+ v->bufio = dm_bufio_client_create(v->hash_dev->bdev,
+ 1 << v->hash_dev_block_bits, 1, sizeof(struct buffer_aux),
+ dm_bufio_alloc_callback, NULL);
+ if (IS_ERR(v->bufio)) {
+ ti->error = "Cannot initialize dm-bufio";
+ r = PTR_ERR(v->bufio);
+ v->bufio = NULL;
+ goto bad;
+ }
+
+ if (dm_bufio_get_device_size(v->bufio) < v->hash_blocks) {
+ ti->error = "Hash device is too small";
+ r = -E2BIG;
+ goto bad;
+ }
+
+ v->io_mempool = mempool_create_kmalloc_pool(DM_VERITY_MEMPOOL_SIZE,
+ sizeof(struct dm_verity_io) + v->shash_descsize + v->digest_size * 2);
+ if (!v->io_mempool) {
+ ti->error = "Cannot allocate io mempool";
+ r = -ENOMEM;
+ goto bad;
+ }
+
+ v->vec_mempool = mempool_create_kmalloc_pool(DM_VERITY_MEMPOOL_SIZE,
+ BIO_MAX_PAGES * sizeof(struct bio_vec));
+ if (!v->vec_mempool) {
+ ti->error = "Cannot allocate vector mempool";
+ r = -ENOMEM;
+ goto bad;
+ }
+
+ /* WQ_UNBOUND greatly improves performance when running on ramdisk */
+ v->verify_wq = alloc_workqueue("kverityd", WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM | WQ_UNBOUND, num_online_cpus());
+ if (!v->verify_wq) {
+ ti->error = "Cannot allocate workqueue";
+ r = -ENOMEM;
+ goto bad;
+ }
+
+ return 0;
+
+bad:
+ verity_dtr(ti);
+
+ return r;
+}
+
+static struct target_type verity_target = {
+ .name = "verity",
+ .version = {1, 0, 0},
+ .module = THIS_MODULE,
+ .ctr = verity_ctr,
+ .dtr = verity_dtr,
+ .map = verity_map,
+ .status = verity_status,
+ .ioctl = verity_ioctl,
+ .merge = verity_merge,
+ .iterate_devices = verity_iterate_devices,
+ .io_hints = verity_io_hints,
+};
+
+static int __init dm_verity_init(void)
+{
+ int r;
+
+ r = dm_register_target(&verity_target);
+ if (r < 0)
+ DMERR("register failed %d", r);
+
+ return r;
+}
+
+static void __exit dm_verity_exit(void)
+{
+ dm_unregister_target(&verity_target);
+}
+
+module_init(dm_verity_init);
+module_exit(dm_verity_exit);
+
+MODULE_AUTHOR("Mikulas Patocka <mpatocka@redhat.com>");
+MODULE_AUTHOR("Mandeep Baines <msb@chromium.org>");
+MODULE_AUTHOR("Will Drewry <wad@chromium.org>");
+MODULE_DESCRIPTION(DM_NAME " target for transparent disk integrity checking");
+MODULE_LICENSE("GPL");
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index b89c548ec3f8..e24143cc2040 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1016,6 +1016,7 @@ static void __map_bio(struct dm_target *ti, struct bio *clone,
/*
* Store bio_set for cleanup.
*/
+ clone->bi_end_io = NULL;
clone->bi_private = md->bs;
bio_put(clone);
free_tio(md, tio);
diff --git a/drivers/md/persistent-data/dm-btree-internal.h b/drivers/md/persistent-data/dm-btree-internal.h
index d279c768f8f1..5709bfeab1e8 100644
--- a/drivers/md/persistent-data/dm-btree-internal.h
+++ b/drivers/md/persistent-data/dm-btree-internal.h
@@ -108,12 +108,9 @@ static inline void *value_base(struct node *n)
return &n->keys[le32_to_cpu(n->header.max_entries)];
}
-/*
- * FIXME: Now that value size is stored in node we don't need the third parm.
- */
-static inline void *value_ptr(struct node *n, uint32_t index, size_t value_size)
+static inline void *value_ptr(struct node *n, uint32_t index)
{
- BUG_ON(value_size != le32_to_cpu(n->header.value_size));
+ uint32_t value_size = le32_to_cpu(n->header.value_size);
return value_base(n) + (value_size * index);
}
diff --git a/drivers/md/persistent-data/dm-btree-remove.c b/drivers/md/persistent-data/dm-btree-remove.c
index 023fbc2d389e..aa71e2359a07 100644
--- a/drivers/md/persistent-data/dm-btree-remove.c
+++ b/drivers/md/persistent-data/dm-btree-remove.c
@@ -61,20 +61,20 @@ static void node_shift(struct node *n, int shift)
if (shift < 0) {
shift = -shift;
BUG_ON(shift > nr_entries);
- BUG_ON((void *) key_ptr(n, shift) >= value_ptr(n, shift, value_size));
+ BUG_ON((void *) key_ptr(n, shift) >= value_ptr(n, shift));
memmove(key_ptr(n, 0),
key_ptr(n, shift),
(nr_entries - shift) * sizeof(__le64));
- memmove(value_ptr(n, 0, value_size),
- value_ptr(n, shift, value_size),
+ memmove(value_ptr(n, 0),
+ value_ptr(n, shift),
(nr_entries - shift) * value_size);
} else {
BUG_ON(nr_entries + shift > le32_to_cpu(n->header.max_entries));
memmove(key_ptr(n, shift),
key_ptr(n, 0),
nr_entries * sizeof(__le64));
- memmove(value_ptr(n, shift, value_size),
- value_ptr(n, 0, value_size),
+ memmove(value_ptr(n, shift),
+ value_ptr(n, 0),
nr_entries * value_size);
}
}
@@ -91,16 +91,16 @@ static void node_copy(struct node *left, struct node *right, int shift)
memcpy(key_ptr(left, nr_left),
key_ptr(right, 0),
shift * sizeof(__le64));
- memcpy(value_ptr(left, nr_left, value_size),
- value_ptr(right, 0, value_size),
+ memcpy(value_ptr(left, nr_left),
+ value_ptr(right, 0),
shift * value_size);
} else {
BUG_ON(shift > le32_to_cpu(right->header.max_entries));
memcpy(key_ptr(right, 0),
key_ptr(left, nr_left - shift),
shift * sizeof(__le64));
- memcpy(value_ptr(right, 0, value_size),
- value_ptr(left, nr_left - shift, value_size),
+ memcpy(value_ptr(right, 0),
+ value_ptr(left, nr_left - shift),
shift * value_size);
}
}
@@ -120,26 +120,17 @@ static void delete_at(struct node *n, unsigned index)
key_ptr(n, index + 1),
nr_to_copy * sizeof(__le64));
- memmove(value_ptr(n, index, value_size),
- value_ptr(n, index + 1, value_size),
+ memmove(value_ptr(n, index),
+ value_ptr(n, index + 1),
nr_to_copy * value_size);
}
n->header.nr_entries = cpu_to_le32(nr_entries - 1);
}
-static unsigned del_threshold(struct node *n)
-{
- return le32_to_cpu(n->header.max_entries) / 3;
-}
-
static unsigned merge_threshold(struct node *n)
{
- /*
- * The extra one is because we know we're potentially going to
- * delete an entry.
- */
- return 2 * (le32_to_cpu(n->header.max_entries) / 3) + 1;
+ return le32_to_cpu(n->header.max_entries) / 3;
}
struct child {
@@ -175,7 +166,7 @@ static int init_child(struct dm_btree_info *info, struct node *parent,
if (inc)
inc_children(info->tm, result->n, &le64_type);
- *((__le64 *) value_ptr(parent, index, sizeof(__le64))) =
+ *((__le64 *) value_ptr(parent, index)) =
cpu_to_le64(dm_block_location(result->block));
return 0;
@@ -188,6 +179,15 @@ static int exit_child(struct dm_btree_info *info, struct child *c)
static void shift(struct node *left, struct node *right, int count)
{
+ uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
+ uint32_t nr_right = le32_to_cpu(right->header.nr_entries);
+ uint32_t max_entries = le32_to_cpu(left->header.max_entries);
+ uint32_t r_max_entries = le32_to_cpu(right->header.max_entries);
+
+ BUG_ON(max_entries != r_max_entries);
+ BUG_ON(nr_left - count > max_entries);
+ BUG_ON(nr_right + count > max_entries);
+
if (!count)
return;
@@ -199,13 +199,8 @@ static void shift(struct node *left, struct node *right, int count)
node_shift(right, count);
}
- left->header.nr_entries =
- cpu_to_le32(le32_to_cpu(left->header.nr_entries) - count);
- BUG_ON(le32_to_cpu(left->header.nr_entries) > le32_to_cpu(left->header.max_entries));
-
- right->header.nr_entries =
- cpu_to_le32(le32_to_cpu(right->header.nr_entries) + count);
- BUG_ON(le32_to_cpu(right->header.nr_entries) > le32_to_cpu(right->header.max_entries));
+ left->header.nr_entries = cpu_to_le32(nr_left - count);
+ right->header.nr_entries = cpu_to_le32(nr_right + count);
}
static void __rebalance2(struct dm_btree_info *info, struct node *parent,
@@ -215,8 +210,9 @@ static void __rebalance2(struct dm_btree_info *info, struct node *parent,
struct node *right = r->n;
uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
uint32_t nr_right = le32_to_cpu(right->header.nr_entries);
+ unsigned threshold = 2 * merge_threshold(left) + 1;
- if (nr_left + nr_right <= merge_threshold(left)) {
+ if (nr_left + nr_right < threshold) {
/*
* Merge
*/
@@ -234,9 +230,6 @@ static void __rebalance2(struct dm_btree_info *info, struct node *parent,
* Rebalance.
*/
unsigned target_left = (nr_left + nr_right) / 2;
- unsigned shift_ = nr_left - target_left;
- BUG_ON(le32_to_cpu(left->header.max_entries) <= nr_left - shift_);
- BUG_ON(le32_to_cpu(right->header.max_entries) <= nr_right + shift_);
shift(left, right, nr_left - target_left);
*key_ptr(parent, r->index) = right->keys[0];
}
@@ -272,6 +265,84 @@ static int rebalance2(struct shadow_spine *s, struct dm_btree_info *info,
return exit_child(info, &right);
}
+/*
+ * We dump as many entries from center as possible into left, then the rest
+ * in right, then rebalance2. This wastes some cpu, but I want something
+ * simple atm.
+ */
+static void delete_center_node(struct dm_btree_info *info, struct node *parent,
+ struct child *l, struct child *c, struct child *r,
+ struct node *left, struct node *center, struct node *right,
+ uint32_t nr_left, uint32_t nr_center, uint32_t nr_right)
+{
+ uint32_t max_entries = le32_to_cpu(left->header.max_entries);
+ unsigned shift = min(max_entries - nr_left, nr_center);
+
+ BUG_ON(nr_left + shift > max_entries);
+ node_copy(left, center, -shift);
+ left->header.nr_entries = cpu_to_le32(nr_left + shift);
+
+ if (shift != nr_center) {
+ shift = nr_center - shift;
+ BUG_ON((nr_right + shift) > max_entries);
+ node_shift(right, shift);
+ node_copy(center, right, shift);
+ right->header.nr_entries = cpu_to_le32(nr_right + shift);
+ }
+ *key_ptr(parent, r->index) = right->keys[0];
+
+ delete_at(parent, c->index);
+ r->index--;
+
+ dm_tm_dec(info->tm, dm_block_location(c->block));
+ __rebalance2(info, parent, l, r);
+}
+
+/*
+ * Redistributes entries among 3 sibling nodes.
+ */
+static void redistribute3(struct dm_btree_info *info, struct node *parent,
+ struct child *l, struct child *c, struct child *r,
+ struct node *left, struct node *center, struct node *right,
+ uint32_t nr_left, uint32_t nr_center, uint32_t nr_right)
+{
+ int s;
+ uint32_t max_entries = le32_to_cpu(left->header.max_entries);
+ unsigned target = (nr_left + nr_center + nr_right) / 3;
+ BUG_ON(target > max_entries);
+
+ if (nr_left < nr_right) {
+ s = nr_left - target;
+
+ if (s < 0 && nr_center < -s) {
+ /* not enough in central node */
+ shift(left, center, nr_center);
+ s = nr_center - target;
+ shift(left, right, s);
+ nr_right += s;
+ } else
+ shift(left, center, s);
+
+ shift(center, right, target - nr_right);
+
+ } else {
+ s = target - nr_right;
+ if (s > 0 && nr_center < s) {
+ /* not enough in central node */
+ shift(center, right, nr_center);
+ s = target - nr_center;
+ shift(left, right, s);
+ nr_left -= s;
+ } else
+ shift(center, right, s);
+
+ shift(left, center, nr_left - target);
+ }
+
+ *key_ptr(parent, c->index) = center->keys[0];
+ *key_ptr(parent, r->index) = right->keys[0];
+}
+
static void __rebalance3(struct dm_btree_info *info, struct node *parent,
struct child *l, struct child *c, struct child *r)
{
@@ -282,62 +353,18 @@ static void __rebalance3(struct dm_btree_info *info, struct node *parent,
uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
uint32_t nr_center = le32_to_cpu(center->header.nr_entries);
uint32_t nr_right = le32_to_cpu(right->header.nr_entries);
- uint32_t max_entries = le32_to_cpu(left->header.max_entries);
- unsigned target;
+ unsigned threshold = merge_threshold(left) * 4 + 1;
BUG_ON(left->header.max_entries != center->header.max_entries);
BUG_ON(center->header.max_entries != right->header.max_entries);
- if (((nr_left + nr_center + nr_right) / 2) < merge_threshold(center)) {
- /*
- * Delete center node:
- *
- * We dump as many entries from center as possible into
- * left, then the rest in right, then rebalance2. This
- * wastes some cpu, but I want something simple atm.
- */
- unsigned shift = min(max_entries - nr_left, nr_center);
-
- BUG_ON(nr_left + shift > max_entries);
- node_copy(left, center, -shift);
- left->header.nr_entries = cpu_to_le32(nr_left + shift);
-
- if (shift != nr_center) {
- shift = nr_center - shift;
- BUG_ON((nr_right + shift) >= max_entries);
- node_shift(right, shift);
- node_copy(center, right, shift);
- right->header.nr_entries = cpu_to_le32(nr_right + shift);
- }
- *key_ptr(parent, r->index) = right->keys[0];
-
- delete_at(parent, c->index);
- r->index--;
-
- dm_tm_dec(info->tm, dm_block_location(c->block));
- __rebalance2(info, parent, l, r);
-
- return;
- }
-
- /*
- * Rebalance
- */
- target = (nr_left + nr_center + nr_right) / 3;
- BUG_ON(target > max_entries);
-
- /*
- * Adjust the left node
- */
- shift(left, center, nr_left - target);
-
- /*
- * Adjust the right node
- */
- shift(center, right, target - nr_right);
- *key_ptr(parent, c->index) = center->keys[0];
- *key_ptr(parent, r->index) = right->keys[0];
+ if ((nr_left + nr_center + nr_right) < threshold)
+ delete_center_node(info, parent, l, c, r, left, center, right,
+ nr_left, nr_center, nr_right);
+ else
+ redistribute3(info, parent, l, c, r, left, center, right,
+ nr_left, nr_center, nr_right);
}
static int rebalance3(struct shadow_spine *s, struct dm_btree_info *info,
@@ -441,9 +468,6 @@ static int rebalance_children(struct shadow_spine *s,
if (r)
return r;
- if (child_entries > del_threshold(n))
- return 0;
-
has_left_sibling = i > 0;
has_right_sibling = i < (le32_to_cpu(n->header.nr_entries) - 1);
@@ -496,7 +520,7 @@ static int remove_raw(struct shadow_spine *s, struct dm_btree_info *info,
*/
if (shadow_has_parent(s)) {
__le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));
- memcpy(value_ptr(dm_block_data(shadow_parent(s)), i, sizeof(__le64)),
+ memcpy(value_ptr(dm_block_data(shadow_parent(s)), i),
&location, sizeof(__le64));
}
@@ -553,7 +577,7 @@ int dm_btree_remove(struct dm_btree_info *info, dm_block_t root,
if (info->value_type.dec)
info->value_type.dec(info->value_type.context,
- value_ptr(n, index, info->value_type.size));
+ value_ptr(n, index));
delete_at(n, index);
}
diff --git a/drivers/md/persistent-data/dm-btree.c b/drivers/md/persistent-data/dm-btree.c
index bd1e7ffbe26c..d12b2cc51f1a 100644
--- a/drivers/md/persistent-data/dm-btree.c
+++ b/drivers/md/persistent-data/dm-btree.c
@@ -74,8 +74,7 @@ void inc_children(struct dm_transaction_manager *tm, struct node *n,
dm_tm_inc(tm, value64(n, i));
else if (vt->inc)
for (i = 0; i < nr_entries; i++)
- vt->inc(vt->context,
- value_ptr(n, i, vt->size));
+ vt->inc(vt->context, value_ptr(n, i));
}
static int insert_at(size_t value_size, struct node *node, unsigned index,
@@ -281,7 +280,7 @@ int dm_btree_del(struct dm_btree_info *info, dm_block_t root)
for (i = 0; i < f->nr_children; i++)
info->value_type.dec(info->value_type.context,
- value_ptr(f->n, i, info->value_type.size));
+ value_ptr(f->n, i));
}
f->current_child = f->nr_children;
}
@@ -320,7 +319,7 @@ static int btree_lookup_raw(struct ro_spine *s, dm_block_t block, uint64_t key,
} while (!(flags & LEAF_NODE));
*result_key = le64_to_cpu(ro_node(s)->keys[i]);
- memcpy(v, value_ptr(ro_node(s), i, value_size), value_size);
+ memcpy(v, value_ptr(ro_node(s), i), value_size);
return 0;
}
@@ -432,7 +431,7 @@ static int btree_split_sibling(struct shadow_spine *s, dm_block_t root,
size = le32_to_cpu(ln->header.flags) & INTERNAL_NODE ?
sizeof(uint64_t) : s->info->value_type.size;
- memcpy(value_ptr(rn, 0, size), value_ptr(ln, nr_left, size),
+ memcpy(value_ptr(rn, 0), value_ptr(ln, nr_left),
size * nr_right);
/*
@@ -443,7 +442,7 @@ static int btree_split_sibling(struct shadow_spine *s, dm_block_t root,
pn = dm_block_data(parent);
location = cpu_to_le64(dm_block_location(left));
__dm_bless_for_disk(&location);
- memcpy_disk(value_ptr(pn, parent_index, sizeof(__le64)),
+ memcpy_disk(value_ptr(pn, parent_index),
&location, sizeof(__le64));
location = cpu_to_le64(dm_block_location(right));
@@ -529,8 +528,8 @@ static int btree_split_beneath(struct shadow_spine *s, uint64_t key)
size = le32_to_cpu(pn->header.flags) & INTERNAL_NODE ?
sizeof(__le64) : s->info->value_type.size;
- memcpy(value_ptr(ln, 0, size), value_ptr(pn, 0, size), nr_left * size);
- memcpy(value_ptr(rn, 0, size), value_ptr(pn, nr_left, size),
+ memcpy(value_ptr(ln, 0), value_ptr(pn, 0), nr_left * size);
+ memcpy(value_ptr(rn, 0), value_ptr(pn, nr_left),
nr_right * size);
/* new_parent should just point to l and r now */
@@ -545,12 +544,12 @@ static int btree_split_beneath(struct shadow_spine *s, uint64_t key)
val = cpu_to_le64(dm_block_location(left));
__dm_bless_for_disk(&val);
pn->keys[0] = ln->keys[0];
- memcpy_disk(value_ptr(pn, 0, sizeof(__le64)), &val, sizeof(__le64));
+ memcpy_disk(value_ptr(pn, 0), &val, sizeof(__le64));
val = cpu_to_le64(dm_block_location(right));
__dm_bless_for_disk(&val);
pn->keys[1] = rn->keys[0];
- memcpy_disk(value_ptr(pn, 1, sizeof(__le64)), &val, sizeof(__le64));
+ memcpy_disk(value_ptr(pn, 1), &val, sizeof(__le64));
/*
* rejig the spine. This is ugly, since it knows too
@@ -595,7 +594,7 @@ static int btree_insert_raw(struct shadow_spine *s, dm_block_t root,
__le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));
__dm_bless_for_disk(&location);
- memcpy_disk(value_ptr(dm_block_data(shadow_parent(s)), i, sizeof(uint64_t)),
+ memcpy_disk(value_ptr(dm_block_data(shadow_parent(s)), i),
&location, sizeof(__le64));
}
@@ -710,12 +709,12 @@ static int insert(struct dm_btree_info *info, dm_block_t root,
(!info->value_type.equal ||
!info->value_type.equal(
info->value_type.context,
- value_ptr(n, index, info->value_type.size),
+ value_ptr(n, index),
value))) {
info->value_type.dec(info->value_type.context,
- value_ptr(n, index, info->value_type.size));
+ value_ptr(n, index));
}
- memcpy_disk(value_ptr(n, index, info->value_type.size),
+ memcpy_disk(value_ptr(n, index),
value, info->value_type.size);
}
diff --git a/drivers/md/persistent-data/dm-space-map-common.c b/drivers/md/persistent-data/dm-space-map-common.c
index df2494c06cdc..ff3beed6ad2d 100644
--- a/drivers/md/persistent-data/dm-space-map-common.c
+++ b/drivers/md/persistent-data/dm-space-map-common.c
@@ -405,8 +405,6 @@ int sm_ll_insert(struct ll_disk *ll, dm_block_t b,
if (r < 0)
return r;
-#if 0
- /* FIXME: dm_btree_remove doesn't handle this yet */
if (old > 2) {
r = dm_btree_remove(&ll->ref_count_info,
ll->ref_count_root,
@@ -414,7 +412,6 @@ int sm_ll_insert(struct ll_disk *ll, dm_block_t b,
if (r)
return r;
}
-#endif
} else {
__le32 le_rc = cpu_to_le32(ref_count);