diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2015-04-18 08:14:18 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2015-04-18 08:14:18 -0400 |
commit | afad97eee47c1f1f242202e2473929b4ef5d9f43 (patch) | |
tree | 31f68d70760234b582a28bd3f64311ff5307b7b1 | |
parent | 04b7fe6a4a231871ef681bc95e08fe66992f7b1f (diff) | |
parent | 44c144f9c8e8fbd73ede2848da8253b3aae42ec2 (diff) | |
download | linux-afad97eee47c1f1f242202e2473929b4ef5d9f43.tar.bz2 |
Merge tag 'dm-4.1-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm
Pull device mapper updates from Mike Snitzer:
- the most extensive changes this cycle are the DM core improvements to
add full blk-mq support to request-based DM.
- disabled by default but user can opt-in with CONFIG_DM_MQ_DEFAULT
- depends on some blk-mq changes from Jens' for-4.1/core branch so
that explains why this pull is built on linux-block.git
- update DM to use name_to_dev_t() rather than open-coding a less
capable device parser.
- includes a couple small improvements to name_to_dev_t() that offer
stricter constraints that DM's code provided.
- improvements to the dm-cache "mq" cache replacement policy.
- a DM crypt crypt_ctr() error path fix and an async crypto deadlock
fix
- a small efficiency improvement for DM crypt decryption by leveraging
immutable biovecs
- add error handling modes for corrupted blocks to DM verity
- a new "log-writes" DM target from Josef Bacik that is meant for file
system developers to test file system integrity at particular points
in the life of a file system
- a few DM log userspace cleanups and fixes
- a few Documentation fixes (for thin, cache, crypt and switch)
* tag 'dm-4.1-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm: (34 commits)
dm crypt: fix missing error code return from crypt_ctr error path
dm crypt: fix deadlock when async crypto algorithm returns -EBUSY
dm crypt: leverage immutable biovecs when decrypting on read
dm crypt: update URLs to new cryptsetup project page
dm: add log writes target
dm table: use bool function return values of true/false not 1/0
dm verity: add error handling modes for corrupted blocks
dm thin: remove stale 'trim' message documentation
dm delay: use msecs_to_jiffies for time conversion
dm log userspace base: fix compile warning
dm log userspace transfer: match wait_for_completion_timeout return type
dm table: fall back to getting device using name_to_dev_t()
init: stricter checking of major:minor root= values
init: export name_to_dev_t and mark name argument as const
dm: add 'use_blk_mq' module param and expose in per-device ro sysfs attr
dm: optimize dm_mq_queue_rq to _not_ use kthread if using pure blk-mq
dm: add full blk-mq support to request-based DM
dm: impose configurable deadline for dm_request_fn's merge heuristic
dm sysfs: introduce ability to add writable attributes
dm: don't start current request if it would've merged with the previous
...
-rw-r--r-- | Documentation/ABI/testing/sysfs-block-dm | 22 | ||||
-rw-r--r-- | Documentation/device-mapper/dm-crypt.txt | 4 | ||||
-rw-r--r-- | Documentation/device-mapper/log-writes.txt | 140 | ||||
-rw-r--r-- | Documentation/device-mapper/switch.txt | 4 | ||||
-rw-r--r-- | Documentation/device-mapper/thin-provisioning.txt | 3 | ||||
-rw-r--r-- | Documentation/device-mapper/verity.txt | 21 | ||||
-rw-r--r-- | drivers/md/Kconfig | 27 | ||||
-rw-r--r-- | drivers/md/Makefile | 1 | ||||
-rw-r--r-- | drivers/md/dm-cache-policy-mq.c | 251 | ||||
-rw-r--r-- | drivers/md/dm-crypt.c | 25 | ||||
-rw-r--r-- | drivers/md/dm-delay.c | 2 | ||||
-rw-r--r-- | drivers/md/dm-log-userspace-base.c | 91 | ||||
-rw-r--r-- | drivers/md/dm-log-userspace-transfer.c | 5 | ||||
-rw-r--r-- | drivers/md/dm-log-writes.c | 825 | ||||
-rw-r--r-- | drivers/md/dm-mpath.c | 6 | ||||
-rw-r--r-- | drivers/md/dm-sysfs.c | 43 | ||||
-rw-r--r-- | drivers/md/dm-table.c | 71 | ||||
-rw-r--r-- | drivers/md/dm-verity.c | 147 | ||||
-rw-r--r-- | drivers/md/dm.c | 556 | ||||
-rw-r--r-- | drivers/md/dm.h | 10 | ||||
-rw-r--r-- | include/linux/device-mapper.h | 5 | ||||
-rw-r--r-- | include/linux/mount.h | 2 | ||||
-rw-r--r-- | include/uapi/linux/dm-ioctl.h | 4 | ||||
-rw-r--r-- | init/do_mounts.c | 6 |
24 files changed, 1928 insertions, 343 deletions
diff --git a/Documentation/ABI/testing/sysfs-block-dm b/Documentation/ABI/testing/sysfs-block-dm index 87ca5691e29b..f9f2339b9a0a 100644 --- a/Documentation/ABI/testing/sysfs-block-dm +++ b/Documentation/ABI/testing/sysfs-block-dm @@ -23,3 +23,25 @@ Description: Device-mapper device suspend state. Contains the value 1 while the device is suspended. Otherwise it contains 0. Read-only attribute. Users: util-linux, device-mapper udev rules + +What: /sys/block/dm-<num>/dm/rq_based_seq_io_merge_deadline +Date: March 2015 +KernelVersion: 4.1 +Contact: dm-devel@redhat.com +Description: Allow control over how long a request that is a + reasonable merge candidate can be queued on the request + queue. The resolution of this deadline is in + microseconds (ranging from 1 to 100000 usecs). + Setting this attribute to 0 (the default) will disable + request-based DM's merge heuristic and associated extra + accounting. This attribute is not applicable to + bio-based DM devices so it will only ever report 0 for + them. + +What: /sys/block/dm-<num>/dm/use_blk_mq +Date: March 2015 +KernelVersion: 4.1 +Contact: dm-devel@redhat.com +Description: Request-based Device-mapper blk-mq I/O path mode. + Contains the value 1 if the device is using blk-mq. + Otherwise it contains 0. Read-only attribute. diff --git a/Documentation/device-mapper/dm-crypt.txt b/Documentation/device-mapper/dm-crypt.txt index ad697781f9ac..692171fe9da0 100644 --- a/Documentation/device-mapper/dm-crypt.txt +++ b/Documentation/device-mapper/dm-crypt.txt @@ -5,7 +5,7 @@ Device-Mapper's "crypt" target provides transparent encryption of block devices using the kernel crypto API. For a more detailed description of supported parameters see: -http://code.google.com/p/cryptsetup/wiki/DMCrypt +https://gitlab.com/cryptsetup/cryptsetup/wikis/DMCrypt Parameters: <cipher> <key> <iv_offset> <device path> \ <offset> [<#opt_params> <opt_params>] @@ -80,7 +80,7 @@ Example scripts =============== LUKS (Linux Unified Key Setup) is now the preferred way to set up disk encryption with dm-crypt using the 'cryptsetup' utility, see -http://code.google.com/p/cryptsetup/ +https://gitlab.com/cryptsetup/cryptsetup [[ #!/bin/sh diff --git a/Documentation/device-mapper/log-writes.txt b/Documentation/device-mapper/log-writes.txt new file mode 100644 index 000000000000..c10f30c9b534 --- /dev/null +++ b/Documentation/device-mapper/log-writes.txt @@ -0,0 +1,140 @@ +dm-log-writes +============= + +This target takes 2 devices, one to pass all IO to normally, and one to log all +of the write operations to. This is intended for file system developers wishing +to verify the integrity of metadata or data as the file system is written to. +There is a log_write_entry written for every WRITE request and the target is +able to take arbitrary data from userspace to insert into the log. The data +that is in the WRITE requests is copied into the log to make the replay happen +exactly as it happened originally. + +Log Ordering +============ + +We log things in order of completion once we are sure the write is no longer in +cache. This means that normal WRITE requests are not actually logged until the +next REQ_FLUSH request. This is to make it easier for userspace to replay the +log in a way that correlates to what is on disk and not what is in cache, to +make it easier to detect improper waiting/flushing. + +This works by attaching all WRITE requests to a list once the write completes. +Once we see a REQ_FLUSH request we splice this list onto the request and once +the FLUSH request completes we log all of the WRITEs and then the FLUSH. Only +completed WRITEs, at the time the REQ_FLUSH is issued, are added in order to +simulate the worst case scenario with regard to power failures. Consider the +following example (W means write, C means complete): + +W1,W2,W3,C3,C2,Wflush,C1,Cflush + +The log would show the following + +W3,W2,flush,W1.... + +Again this is to simulate what is actually on disk, this allows us to detect +cases where a power failure at a particular point in time would create an +inconsistent file system. + +Any REQ_FUA requests bypass this flushing mechanism and are logged as soon as +they complete as those requests will obviously bypass the device cache. + +Any REQ_DISCARD requests are treated like WRITE requests. Otherwise we would +have all the DISCARD requests, and then the WRITE requests and then the FLUSH +request. Consider the following example: + +WRITE block 1, DISCARD block 1, FLUSH + +If we logged DISCARD when it completed, the replay would look like this + +DISCARD 1, WRITE 1, FLUSH + +which isn't quite what happened and wouldn't be caught during the log replay. + +Target interface +================ + +i) Constructor + + log-writes <dev_path> <log_dev_path> + + dev_path : Device that all of the IO will go to normally. + log_dev_path : Device where the log entries are written to. + +ii) Status + + <#logged entries> <highest allocated sector> + + #logged entries : Number of logged entries + highest allocated sector : Highest allocated sector + +iii) Messages + + mark <description> + + You can use a dmsetup message to set an arbitrary mark in a log. + For example say you want to fsck a file system after every + write, but first you need to replay up to the mkfs to make sure + we're fsck'ing something reasonable, you would do something like + this: + + mkfs.btrfs -f /dev/mapper/log + dmsetup message log 0 mark mkfs + <run test> + + This would allow you to replay the log up to the mkfs mark and + then replay from that point on doing the fsck check in the + interval that you want. + + Every log has a mark at the end labeled "dm-log-writes-end". + +Userspace component +=================== + +There is a userspace tool that will replay the log for you in various ways. +It can be found here: https://github.com/josefbacik/log-writes + +Example usage +============= + +Say you want to test fsync on your file system. You would do something like +this: + +TABLE="0 $(blockdev --getsz /dev/sdb) log-writes /dev/sdb /dev/sdc" +dmsetup create log --table "$TABLE" +mkfs.btrfs -f /dev/mapper/log +dmsetup message log 0 mark mkfs + +mount /dev/mapper/log /mnt/btrfs-test +<some test that does fsync at the end> +dmsetup message log 0 mark fsync +md5sum /mnt/btrfs-test/foo +umount /mnt/btrfs-test + +dmsetup remove log +replay-log --log /dev/sdc --replay /dev/sdb --end-mark fsync +mount /dev/sdb /mnt/btrfs-test +md5sum /mnt/btrfs-test/foo +<verify md5sum's are correct> + +Another option is to do a complicated file system operation and verify the file +system is consistent during the entire operation. You could do this with: + +TABLE="0 $(blockdev --getsz /dev/sdb) log-writes /dev/sdb /dev/sdc" +dmsetup create log --table "$TABLE" +mkfs.btrfs -f /dev/mapper/log +dmsetup message log 0 mark mkfs + +mount /dev/mapper/log /mnt/btrfs-test +<fsstress to dirty the fs> +btrfs filesystem balance /mnt/btrfs-test +umount /mnt/btrfs-test +dmsetup remove log + +replay-log --log /dev/sdc --replay /dev/sdb --end-mark mkfs +btrfsck /dev/sdb +replay-log --log /dev/sdc --replay /dev/sdb --start-mark mkfs \ + --fsck "btrfsck /dev/sdb" --check fua + +And that will replay the log until it sees a FUA request, run the fsck command +and if the fsck passes it will replay to the next FUA, until it is completed or +the fsck command exists abnormally. diff --git a/Documentation/device-mapper/switch.txt b/Documentation/device-mapper/switch.txt index 8897d0494838..424835e57f27 100644 --- a/Documentation/device-mapper/switch.txt +++ b/Documentation/device-mapper/switch.txt @@ -47,8 +47,8 @@ consume far too much memory. Using this device-mapper switch target we can now build a two-layer device hierarchy: - Upper Tier – Determine which array member the I/O should be sent to. - Lower Tier – Load balance amongst paths to a particular member. + Upper Tier - Determine which array member the I/O should be sent to. + Lower Tier - Load balance amongst paths to a particular member. The lower tier consists of a single dm multipath device for each member. Each of these multipath devices contains the set of paths directly to diff --git a/Documentation/device-mapper/thin-provisioning.txt b/Documentation/device-mapper/thin-provisioning.txt index 2f5173500bd9..4f67578b2954 100644 --- a/Documentation/device-mapper/thin-provisioning.txt +++ b/Documentation/device-mapper/thin-provisioning.txt @@ -380,9 +380,6 @@ then you'll have no access to blocks mapped beyond the end. If you load a target that is bigger than before, then extra blocks will be provisioned as and when needed. -If you wish to reduce the size of your thin device and potentially -regain some space then send the 'trim' message to the pool. - ii) Status <nr mapped sectors> <highest mapped sector> diff --git a/Documentation/device-mapper/verity.txt b/Documentation/device-mapper/verity.txt index 9884681535ee..e15bc1a0fb98 100644 --- a/Documentation/device-mapper/verity.txt +++ b/Documentation/device-mapper/verity.txt @@ -11,6 +11,7 @@ Construction Parameters <data_block_size> <hash_block_size> <num_data_blocks> <hash_start_block> <algorithm> <digest> <salt> + [<#opt_params> <opt_params>] <version> This is the type of the on-disk hash format. @@ -62,6 +63,22 @@ Construction Parameters <salt> The hexadecimal encoding of the salt value. +<#opt_params> + Number of optional parameters. If there are no optional parameters, + the optional paramaters section can be skipped or #opt_params can be zero. + Otherwise #opt_params is the number of following arguments. + + Example of optional parameters section: + 1 ignore_corruption + +ignore_corruption + Log corrupted blocks, but allow read operations to proceed normally. + +restart_on_corruption + Restart the system when a corrupted block is discovered. This option is + not compatible with ignore_corruption and requires user space support to + avoid restart loops. + Theory of operation =================== @@ -125,7 +142,7 @@ block boundary) are the hash blocks which are stored a depth at a time The full specification of kernel parameters and on-disk metadata format is available at the cryptsetup project's wiki page - http://code.google.com/p/cryptsetup/wiki/DMVerity + https://gitlab.com/cryptsetup/cryptsetup/wikis/DMVerity Status ====== @@ -142,7 +159,7 @@ Set up a device: A command line tool veritysetup is available to compute or verify the hash tree or activate the kernel device. This is available from -the cryptsetup upstream repository http://code.google.com/p/cryptsetup/ +the cryptsetup upstream repository https://gitlab.com/cryptsetup/cryptsetup/ (as a libcryptsetup extension). Create hash on the device: diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 63e05e32b462..6ddc983417d5 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -196,6 +196,17 @@ config BLK_DEV_DM If unsure, say N. +config DM_MQ_DEFAULT + bool "request-based DM: use blk-mq I/O path by default" + depends on BLK_DEV_DM + ---help--- + This option enables the blk-mq based I/O path for request-based + DM devices by default. With the option the dm_mod.use_blk_mq + module/boot option defaults to Y, without it to N, but it can + still be overriden either way. + + If unsure say N. + config DM_DEBUG bool "Device mapper debugging support" depends on BLK_DEV_DM @@ -432,4 +443,20 @@ config DM_SWITCH If unsure, say N. +config DM_LOG_WRITES + tristate "Log writes target support" + depends on BLK_DEV_DM + ---help--- + This device-mapper target takes two devices, one device to use + normally, one to log all write operations done to the first device. + This is for use by file system developers wishing to verify that + their fs is writing a consitent file system at all times by allowing + them to replay the log in a variety of ways and to check the + contents. + + To compile this code as a module, choose M here: the module will + be called dm-log-writes. + + If unsure, say N. + endif # MD diff --git a/drivers/md/Makefile b/drivers/md/Makefile index a2da532b1c2b..1863feaa5846 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -55,6 +55,7 @@ obj-$(CONFIG_DM_CACHE) += dm-cache.o obj-$(CONFIG_DM_CACHE_MQ) += dm-cache-mq.o obj-$(CONFIG_DM_CACHE_CLEANER) += dm-cache-cleaner.o obj-$(CONFIG_DM_ERA) += dm-era.o +obj-$(CONFIG_DM_LOG_WRITES) += dm-log-writes.o ifeq ($(CONFIG_DM_UEVENT),y) dm-mod-objs += dm-uevent.o diff --git a/drivers/md/dm-cache-policy-mq.c b/drivers/md/dm-cache-policy-mq.c index 13f547a4eeb6..3ddd1162334d 100644 --- a/drivers/md/dm-cache-policy-mq.c +++ b/drivers/md/dm-cache-policy-mq.c @@ -8,6 +8,7 @@ #include "dm.h" #include <linux/hash.h> +#include <linux/jiffies.h> #include <linux/module.h> #include <linux/mutex.h> #include <linux/slab.h> @@ -124,32 +125,41 @@ static void iot_examine_bio(struct io_tracker *t, struct bio *bio) * sorted queue. */ #define NR_QUEUE_LEVELS 16u +#define NR_SENTINELS NR_QUEUE_LEVELS * 3 + +#define WRITEBACK_PERIOD HZ struct queue { + unsigned nr_elts; + bool current_writeback_sentinels; + unsigned long next_writeback; struct list_head qs[NR_QUEUE_LEVELS]; + struct list_head sentinels[NR_SENTINELS]; }; static void queue_init(struct queue *q) { unsigned i; - for (i = 0; i < NR_QUEUE_LEVELS; i++) + q->nr_elts = 0; + q->current_writeback_sentinels = false; + q->next_writeback = 0; + for (i = 0; i < NR_QUEUE_LEVELS; i++) { INIT_LIST_HEAD(q->qs + i); + INIT_LIST_HEAD(q->sentinels + i); + INIT_LIST_HEAD(q->sentinels + NR_QUEUE_LEVELS + i); + INIT_LIST_HEAD(q->sentinels + (2 * NR_QUEUE_LEVELS) + i); + } } -/* - * Checks to see if the queue is empty. - * FIXME: reduce cpu usage. - */ -static bool queue_empty(struct queue *q) +static unsigned queue_size(struct queue *q) { - unsigned i; - - for (i = 0; i < NR_QUEUE_LEVELS; i++) - if (!list_empty(q->qs + i)) - return false; + return q->nr_elts; +} - return true; +static bool queue_empty(struct queue *q) +{ + return q->nr_elts == 0; } /* @@ -157,24 +167,19 @@ static bool queue_empty(struct queue *q) */ static void queue_push(struct queue *q, unsigned level, struct list_head *elt) { + q->nr_elts++; list_add_tail(elt, q->qs + level); } -static void queue_remove(struct list_head *elt) +static void queue_remove(struct queue *q, struct list_head *elt) { + q->nr_elts--; list_del(elt); } -/* - * Shifts all regions down one level. This has no effect on the order of - * the queue. - */ -static void queue_shift_down(struct queue *q) +static bool is_sentinel(struct queue *q, struct list_head *h) { - unsigned level; - - for (level = 1; level < NR_QUEUE_LEVELS; level++) - list_splice_init(q->qs + level, q->qs + level - 1); + return (h >= q->sentinels) && (h < (q->sentinels + NR_SENTINELS)); } /* @@ -184,10 +189,12 @@ static void queue_shift_down(struct queue *q) static struct list_head *queue_peek(struct queue *q) { unsigned level; + struct list_head *h; for (level = 0; level < NR_QUEUE_LEVELS; level++) - if (!list_empty(q->qs + level)) - return q->qs[level].next; + list_for_each(h, q->qs + level) + if (!is_sentinel(q, h)) + return h; return NULL; } @@ -197,16 +204,34 @@ static struct list_head *queue_pop(struct queue *q) struct list_head *r = queue_peek(q); if (r) { + q->nr_elts--; list_del(r); - - /* have we just emptied the bottom level? */ - if (list_empty(q->qs)) - queue_shift_down(q); } return r; } +/* + * Pops an entry from a level that is not past a sentinel. + */ +static struct list_head *queue_pop_old(struct queue *q) +{ + unsigned level; + struct list_head *h; + + for (level = 0; level < NR_QUEUE_LEVELS; level++) + list_for_each(h, q->qs + level) { + if (is_sentinel(q, h)) + break; + + q->nr_elts--; + list_del(h); + return h; + } + + return NULL; +} + static struct list_head *list_pop(struct list_head *lh) { struct list_head *r = lh->next; @@ -217,6 +242,62 @@ static struct list_head *list_pop(struct list_head *lh) return r; } +static struct list_head *writeback_sentinel(struct queue *q, unsigned level) +{ + if (q->current_writeback_sentinels) + return q->sentinels + NR_QUEUE_LEVELS + level; + else + return q->sentinels + 2 * NR_QUEUE_LEVELS + level; +} + +static void queue_update_writeback_sentinels(struct queue *q) +{ + unsigned i; + struct list_head *h; + + if (time_after(jiffies, q->next_writeback)) { + for (i = 0; i < NR_QUEUE_LEVELS; i++) { + h = writeback_sentinel(q, i); + list_del(h); + list_add_tail(h, q->qs + i); + } + + q->next_writeback = jiffies + WRITEBACK_PERIOD; + q->current_writeback_sentinels = !q->current_writeback_sentinels; + } +} + +/* + * Sometimes we want to iterate through entries that have been pushed since + * a certain event. We use sentinel entries on the queues to delimit these + * 'tick' events. + */ +static void queue_tick(struct queue *q) +{ + unsigned i; + + for (i = 0; i < NR_QUEUE_LEVELS; i++) { + list_del(q->sentinels + i); + list_add_tail(q->sentinels + i, q->qs + i); + } +} + +typedef void (*iter_fn)(struct list_head *, void *); +static void queue_iterate_tick(struct queue *q, iter_fn fn, void *context) +{ + unsigned i; + struct list_head *h; + + for (i = 0; i < NR_QUEUE_LEVELS; i++) { + list_for_each_prev(h, q->qs + i) { + if (is_sentinel(q, h)) + break; + + fn(h, context); + } + } +} + /*----------------------------------------------------------------*/ /* @@ -232,8 +313,6 @@ struct entry { */ bool dirty:1; unsigned hit_count; - unsigned generation; - unsigned tick; }; /* @@ -481,7 +560,6 @@ static bool in_cache(struct mq_policy *mq, struct entry *e) */ static void push(struct mq_policy *mq, struct entry *e) { - e->tick = mq->tick; hash_insert(mq, e); if (in_cache(mq, e)) @@ -496,7 +574,11 @@ static void push(struct mq_policy *mq, struct entry *e) */ static void del(struct mq_policy *mq, struct entry *e) { - queue_remove(&e->list); + if (in_cache(mq, e)) + queue_remove(e->dirty ? &mq->cache_dirty : &mq->cache_clean, &e->list); + else + queue_remove(&mq->pre_cache, &e->list); + hash_remove(e); } @@ -518,18 +600,24 @@ static struct entry *pop(struct mq_policy *mq, struct queue *q) return e; } -static struct entry *peek(struct queue *q) +static struct entry *pop_old(struct mq_policy *mq, struct queue *q) { - struct list_head *h = queue_peek(q); - return h ? container_of(h, struct entry, list) : NULL; + struct entry *e; + struct list_head *h = queue_pop_old(q); + + if (!h) + return NULL; + + e = container_of(h, struct entry, list); + hash_remove(e); + + return e; } -/* - * Has this entry already been updated? - */ -static bool updated_this_tick(struct mq_policy *mq, struct entry *e) +static struct entry *peek(struct queue *q) { - return mq->tick == e->tick; + struct list_head *h = queue_peek(q); + return h ? container_of(h, struct entry, list) : NULL; } /* @@ -583,20 +671,9 @@ static void check_generation(struct mq_policy *mq) * Whenever we use an entry we bump up it's hit counter, and push it to the * back to it's current level. */ -static void requeue_and_update_tick(struct mq_policy *mq, struct entry *e) +static void requeue(struct mq_policy *mq, struct entry *e) { - if (updated_this_tick(mq, e)) - return; - - e->hit_count++; - mq->hit_count++; check_generation(mq); - - /* generation adjustment, to stop the counts increasing forever. */ - /* FIXME: divide? */ - /* e->hit_count -= min(e->hit_count - 1, mq->generation - e->generation); */ - e->generation = mq->generation; - del(mq, e); push(mq, e); } @@ -703,7 +780,7 @@ static int cache_entry_found(struct mq_policy *mq, struct entry *e, struct policy_result *result) { - requeue_and_update_tick(mq, e); + requeue(mq, e); if (in_cache(mq, e)) { result->op = POLICY_HIT; @@ -740,8 +817,6 @@ static int pre_cache_to_cache(struct mq_policy *mq, struct entry *e, new_e->oblock = e->oblock; new_e->dirty = false; new_e->hit_count = e->hit_count; - new_e->generation = e->generation; - new_e->tick = e->tick; del(mq, e); free_entry(&mq->pre_cache_pool, e); @@ -757,18 +832,16 @@ static int pre_cache_entry_found(struct mq_policy *mq, struct entry *e, int data_dir, struct policy_result *result) { int r = 0; - bool updated = updated_this_tick(mq, e); - if ((!discarded_oblock && updated) || - !should_promote(mq, e, discarded_oblock, data_dir)) { - requeue_and_update_tick(mq, e); + if (!should_promote(mq, e, discarded_oblock, data_dir)) { + requeue(mq, e); result->op = POLICY_MISS; } else if (!can_migrate) r = -EWOULDBLOCK; else { - requeue_and_update_tick(mq, e); + requeue(mq, e); r = pre_cache_to_cache(mq, e, result); } @@ -795,7 +868,6 @@ static void insert_in_pre_cache(struct mq_policy *mq, e->dirty = false; e->oblock = oblock; e->hit_count = 1; - e->generation = mq->generation; push(mq, e); } @@ -828,7 +900,6 @@ static void insert_in_cache(struct mq_policy *mq, dm_oblock_t oblock, e->oblock = oblock; e->dirty = false; e->hit_count = 1; - e->generation = mq->generation; push(mq, e); result->cblock = infer_cblock(&mq->cache_pool, e); @@ -905,12 +976,37 @@ static void mq_destroy(struct dm_cache_policy *p) kfree(mq); } +static void update_pre_cache_hits(struct list_head *h, void *context) +{ + struct entry *e = container_of(h, struct entry, list); + e->hit_count++; +} + +static void update_cache_hits(struct list_head *h, void *context) +{ + struct mq_policy *mq = context; + struct entry *e = container_of(h, struct entry, list); + e->hit_count++; + mq->hit_count++; +} + static void copy_tick(struct mq_policy *mq) { - unsigned long flags; + unsigned long flags, tick; spin_lock_irqsave(&mq->tick_lock, flags); - mq->tick = mq->tick_protected; + tick = mq->tick_protected; + if (tick != mq->tick) { + queue_iterate_tick(&mq->pre_cache, update_pre_cache_hits, mq); + queue_iterate_tick(&mq->cache_dirty, update_cache_hits, mq); + queue_iterate_tick(&mq->cache_clean, update_cache_hits, mq); + mq->tick = tick; + } + + queue_tick(&mq->pre_cache); + queue_tick(&mq->cache_dirty); + queue_tick(&mq->cache_clean); + queue_update_writeback_sentinels(&mq->cache_dirty); spin_unlock_irqrestore(&mq->tick_lock, flags); } @@ -1001,7 +1097,6 @@ static int mq_load_mapping(struct dm_cache_policy *p, e->oblock = oblock; e->dirty = false; /* this gets corrected in a minute */ e->hit_count = hint_valid ? hint : 1; - e->generation = mq->generation; push(mq, e); return 0; @@ -1012,10 +1107,15 @@ static int mq_save_hints(struct mq_policy *mq, struct queue *q, { int r; unsigned level; + struct list_head *h; struct entry *e; for (level = 0; level < NR_QUEUE_LEVELS; level++) - list_for_each_entry(e, q->qs + level, list) { + list_for_each(h, q->qs + level) { + if (is_sentinel(q, h)) + continue; + + e = container_of(h, struct entry, list); r = fn(context, infer_cblock(&mq->cache_pool, e), e->oblock, e->hit_count); if (r) @@ -1087,10 +1187,27 @@ static int mq_remove_cblock(struct dm_cache_policy *p, dm_cblock_t cblock) return r; } +#define CLEAN_TARGET_PERCENTAGE 25 + +static bool clean_target_met(struct mq_policy *mq) +{ + /* + * Cache entries may not be populated. So we're cannot rely on the + * size of the clean queue. + */ + unsigned nr_clean = from_cblock(mq->cache_size) - queue_size(&mq->cache_dirty); + unsigned target = from_cblock(mq->cache_size) * CLEAN_TARGET_PERCENTAGE / 100; + + return nr_clean >= target; +} + static int __mq_writeback_work(struct mq_policy *mq, dm_oblock_t *oblock, dm_cblock_t *cblock) { - struct entry *e = pop(mq, &mq->cache_dirty); + struct entry *e = pop_old(mq, &mq->cache_dirty); + + if (!e && !clean_target_met(mq)) + e = pop(mq, &mq->cache_dirty); if (!e) return -ENODATA; diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 713a96237a80..9eeea196328a 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -228,7 +228,7 @@ static struct crypto_ablkcipher *any_tfm(struct crypt_config *cc) * * tcw: Compatible implementation of the block chaining mode used * by the TrueCrypt device encryption system (prior to version 4.1). - * For more info see: http://www.truecrypt.org + * For more info see: https://gitlab.com/cryptsetup/cryptsetup/wikis/TrueCryptOnDiskFormat * It operates on full 512 byte sectors and uses CBC * with an IV derived from initial key and the sector number. * In addition, whitening value is applied on every sector, whitening @@ -925,11 +925,10 @@ static int crypt_convert(struct crypt_config *cc, switch (r) { /* async */ + case -EINPROGRESS: case -EBUSY: wait_for_completion(&ctx->restart); reinit_completion(&ctx->restart); - /* fall through*/ - case -EINPROGRESS: ctx->req = NULL; ctx->cc_sector++; continue; @@ -1124,15 +1123,15 @@ static void clone_init(struct dm_crypt_io *io, struct bio *clone) static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp) { struct crypt_config *cc = io->cc; - struct bio *base_bio = io->base_bio; struct bio *clone; /* - * The block layer might modify the bvec array, so always - * copy the required bvecs because we need the original - * one in order to decrypt the whole bio data *afterwards*. + * We need the original biovec array in order to decrypt + * the whole bio data *afterwards* -- thanks to immutable + * biovecs we don't need to worry about the block layer + * modifying the biovec array; so leverage bio_clone_fast(). */ - clone = bio_clone_bioset(base_bio, gfp, cc->bs); + clone = bio_clone_fast(io->base_bio, gfp, cc->bs); if (!clone) return 1; @@ -1346,10 +1345,8 @@ static void kcryptd_async_done(struct crypto_async_request *async_req, struct dm_crypt_io *io = container_of(ctx, struct dm_crypt_io, ctx); struct crypt_config *cc = io->cc; - if (error == -EINPROGRESS) { - complete(&ctx->restart); + if (error == -EINPROGRESS) return; - } if (!error && cc->iv_gen_ops && cc->iv_gen_ops->post) error = cc->iv_gen_ops->post(cc, iv_of_dmreq(cc, dmreq), dmreq); @@ -1360,12 +1357,15 @@ static void kcryptd_async_done(struct crypto_async_request *async_req, crypt_free_req(cc, req_of_dmreq(cc, dmreq), io->base_bio); if (!atomic_dec_and_test(&ctx->cc_pending)) - return; + goto done; if (bio_data_dir(io->base_bio) == READ) kcryptd_crypt_read_done(io); else kcryptd_crypt_write_io_submit(io, 1); +done: + if (!completion_done(&ctx->restart)) + complete(&ctx->restart); } static void kcryptd_crypt(struct work_struct *work) @@ -1816,6 +1816,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) if (ret) goto bad; + ret = -EINVAL; while (opt_params--) { opt_string = dm_shift_arg(&as); if (!opt_string) { diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c index 42c3a27a14cc..57b6a1901c91 100644 --- a/drivers/md/dm-delay.c +++ b/drivers/md/dm-delay.c @@ -236,7 +236,7 @@ static int delay_bio(struct delay_c *dc, int delay, struct bio *bio) delayed = dm_per_bio_data(bio, sizeof(struct dm_delay_info)); delayed->context = dc; - delayed->expires = expires = jiffies + (delay * HZ / 1000); + delayed->expires = expires = jiffies + msecs_to_jiffies(delay); mutex_lock(&delayed_bios_lock); diff --git a/drivers/md/dm-log-userspace-base.c b/drivers/md/dm-log-userspace-base.c index 03177ca0b009..058256d2eeea 100644 --- a/drivers/md/dm-log-userspace-base.c +++ b/drivers/md/dm-log-userspace-base.c @@ -17,7 +17,9 @@ #define DM_LOG_USERSPACE_VSN "1.3.0" -struct flush_entry { +#define FLUSH_ENTRY_POOL_SIZE 16 + +struct dm_dirty_log_flush_entry { int type; region_t region; struct list_head list; @@ -34,22 +36,14 @@ struct flush_entry { struct log_c { struct dm_target *ti; struct dm_dev *log_dev; - uint32_t region_size; - region_t region_count; - uint64_t luid; - char uuid[DM_UUID_LEN]; char *usr_argv_str; uint32_t usr_argc; - /* - * in_sync_hint gets set when doing is_remote_recovering. It - * represents the first region that needs recovery. IOW, the - * first zero bit of sync_bits. This can be useful for to limit - * traffic for calls like is_remote_recovering and get_resync_work, - * but be take care in its use for anything else. - */ - uint64_t in_sync_hint; + uint32_t region_size; + region_t region_count; + uint64_t luid; + char uuid[DM_UUID_LEN]; /* * Mark and clear requests are held until a flush is issued @@ -62,6 +56,15 @@ struct log_c { struct list_head clear_list; /* + * in_sync_hint gets set when doing is_remote_recovering. It + * represents the first region that needs recovery. IOW, the + * first zero bit of sync_bits. This can be useful for to limit + * traffic for calls like is_remote_recovering and get_resync_work, + * but be take care in its use for anything else. + */ + uint64_t in_sync_hint; + + /* * Workqueue for flush of clear region requests. */ struct workqueue_struct *dmlog_wq; @@ -72,19 +75,11 @@ struct log_c { * Combine userspace flush and mark requests for efficiency. */ uint32_t integrated_flush; -}; - -static mempool_t *flush_entry_pool; -static void *flush_entry_alloc(gfp_t gfp_mask, void *pool_data) -{ - return kmalloc(sizeof(struct flush_entry), gfp_mask); -} + mempool_t *flush_entry_pool; +}; -static void flush_entry_free(void *element, void *pool_data) -{ - kfree(element); -} +static struct kmem_cache *_flush_entry_cache; static int userspace_do_request(struct log_c *lc, const char *uuid, int request_type, char *data, size_t data_size, @@ -254,6 +249,14 @@ static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti, goto out; } + lc->flush_entry_pool = mempool_create_slab_pool(FLUSH_ENTRY_POOL_SIZE, + _flush_entry_cache); + if (!lc->flush_entry_pool) { + DMERR("Failed to create flush_entry_pool"); + r = -ENOMEM; + goto out; + } + /* * Send table string and get back any opened device. */ @@ -310,6 +313,8 @@ static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti, out: kfree(devices_rdata); if (r) { + if (lc->flush_entry_pool) + mempool_destroy(lc->flush_entry_pool); kfree(lc); kfree(ctr_str); } else { @@ -338,6 +343,8 @@ static void userspace_dtr(struct dm_dirty_log *log) if (lc->log_dev) dm_put_device(lc->ti, lc->log_dev); + mempool_destroy(lc->flush_entry_pool); + kfree(lc->usr_argv_str); kfree(lc); @@ -461,7 +468,7 @@ static int userspace_in_sync(struct dm_dirty_log *log, region_t region, static int flush_one_by_one(struct log_c *lc, struct list_head *flush_list) { int r = 0; - struct flush_entry *fe; + struct dm_dirty_log_flush_entry *fe; list_for_each_entry(fe, flush_list, list) { r = userspace_do_request(lc, lc->uuid, fe->type, @@ -481,7 +488,7 @@ static int flush_by_group(struct log_c *lc, struct list_head *flush_list, int r = 0; int count; uint32_t type = 0; - struct flush_entry *fe, *tmp_fe; + struct dm_dirty_log_flush_entry *fe, *tmp_fe; LIST_HEAD(tmp_list); uint64_t group[MAX_FLUSH_GROUP_COUNT]; @@ -563,7 +570,8 @@ static int userspace_flush(struct dm_dirty_log *log) LIST_HEAD(clear_list); int mark_list_is_empty; int clear_list_is_empty; - struct flush_entry *fe, *tmp_fe; + struct dm_dirty_log_flush_entry *fe, *tmp_fe; + mempool_t *flush_entry_pool = lc->flush_entry_pool; spin_lock_irqsave(&lc->flush_lock, flags); list_splice_init(&lc->mark_list, &mark_list); @@ -643,10 +651,10 @@ static void userspace_mark_region(struct dm_dirty_log *log, region_t region) { unsigned long flags; struct log_c *lc = log->context; - struct flush_entry *fe; + struct dm_dirty_log_flush_entry *fe; /* Wait for an allocation, but _never_ fail */ - fe = mempool_alloc(flush_entry_pool, GFP_NOIO); + fe = mempool_alloc(lc->flush_entry_pool, GFP_NOIO); BUG_ON(!fe); spin_lock_irqsave(&lc->flush_lock, flags); @@ -672,7 +680,7 @@ static void userspace_clear_region(struct dm_dirty_log *log, region_t region) { unsigned long flags; struct log_c *lc = log->context; - struct flush_entry *fe; + struct dm_dirty_log_flush_entry *fe; /* * If we fail to allocate, we skip the clearing of @@ -680,7 +688,7 @@ static void userspace_clear_region(struct dm_dirty_log *log, region_t region) * to cause the region to be resync'ed when the * device is activated next time. */ - fe = mempool_alloc(flush_entry_pool, GFP_ATOMIC); + fe = mempool_alloc(lc->flush_entry_pool, GFP_ATOMIC); if (!fe) { DMERR("Failed to allocate memory to clear region."); return; @@ -733,7 +741,6 @@ static int userspace_get_resync_work(struct dm_dirty_log *log, region_t *region) static void userspace_set_region_sync(struct dm_dirty_log *log, region_t region, int in_sync) { - int r; struct log_c *lc = log->context; struct { region_t r; @@ -743,12 +750,12 @@ static void userspace_set_region_sync(struct dm_dirty_log *log, pkg.r = region; pkg.i = (int64_t)in_sync; - r = userspace_do_request(lc, lc->uuid, DM_ULOG_SET_REGION_SYNC, - (char *)&pkg, sizeof(pkg), NULL, NULL); + (void) userspace_do_request(lc, lc->uuid, DM_ULOG_SET_REGION_SYNC, + (char *)&pkg, sizeof(pkg), NULL, NULL); /* * It would be nice to be able to report failures. - * However, it is easy emough to detect and resolve. + * However, it is easy enough to detect and resolve. */ return; } @@ -886,18 +893,16 @@ static int __init userspace_dirty_log_init(void) { int r = 0; - flush_entry_pool = mempool_create(100, flush_entry_alloc, - flush_entry_free, NULL); - - if (!flush_entry_pool) { - DMWARN("Unable to create flush_entry_pool: No memory."); + _flush_entry_cache = KMEM_CACHE(dm_dirty_log_flush_entry, 0); + if (!_flush_entry_cache) { + DMWARN("Unable to create flush_entry_cache: No memory."); return -ENOMEM; } r = dm_ulog_tfr_init(); if (r) { DMWARN("Unable to initialize userspace log communications"); - mempool_destroy(flush_entry_pool); + kmem_cache_destroy(_flush_entry_cache); return r; } @@ -905,7 +910,7 @@ static int __init userspace_dirty_log_init(void) if (r) { DMWARN("Couldn't register userspace dirty log type"); dm_ulog_tfr_exit(); - mempool_destroy(flush_entry_pool); + kmem_cache_destroy(_flush_entry_cache); return r; } @@ -917,7 +922,7 @@ static void __exit userspace_dirty_log_exit(void) { dm_dirty_log_type_unregister(&_userspace_type); dm_ulog_tfr_exit(); - mempool_destroy(flush_entry_pool); + kmem_cache_destroy(_flush_entry_cache); DMINFO("version " DM_LOG_USERSPACE_VSN " unloaded"); return; diff --git a/drivers/md/dm-log-userspace-transfer.c b/drivers/md/dm-log-userspace-transfer.c index 39ad9664d397..fdf8ec304f8d 100644 --- a/drivers/md/dm-log-userspace-transfer.c +++ b/drivers/md/dm-log-userspace-transfer.c @@ -172,6 +172,7 @@ int dm_consult_userspace(const char *uuid, uint64_t luid, int request_type, char *rdata, size_t *rdata_size) { int r = 0; + unsigned long tmo; size_t dummy = 0; int overhead_size = sizeof(struct dm_ulog_request) + sizeof(struct cn_msg); struct dm_ulog_request *tfr = prealloced_ulog_tfr; @@ -236,11 +237,11 @@ resend: goto out; } - r = wait_for_completion_timeout(&(pkg.complete), DM_ULOG_RETRY_TIMEOUT); + tmo = wait_for_completion_timeout(&(pkg.complete), DM_ULOG_RETRY_TIMEOUT); spin_lock(&receiving_list_lock); list_del_init(&(pkg.list)); spin_unlock(&receiving_list_lock); - if (!r) { + if (!tmo) { DMWARN("[%s] Request timed out: [%u/%u] - retrying", (strlen(uuid) > 8) ? (uuid + (strlen(uuid) - 8)) : (uuid), diff --git a/drivers/md/dm-log-writes.c b/drivers/md/dm-log-writes.c new file mode 100644 index 000000000000..93e08446a87d --- /dev/null +++ b/drivers/md/dm-log-writes.c @@ -0,0 +1,825 @@ +/* + * Copyright (C) 2014 Facebook. All rights reserved. + * + * This file is released under the GPL. + */ + +#include <linux/device-mapper.h> + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/blkdev.h> +#include <linux/bio.h> +#include <linux/slab.h> +#include <linux/kthread.h> +#include <linux/freezer.h> + +#define DM_MSG_PREFIX "log-writes" + +/* + * This target will sequentially log all writes to the target device onto the + * log device. This is helpful for replaying writes to check for fs consistency + * at all times. This target provides a mechanism to mark specific events to + * check data at a later time. So for example you would: + * + * write data + * fsync + * dmsetup message /dev/whatever mark mymark + * unmount /mnt/test + * + * Then replay the log up to mymark and check the contents of the replay to + * verify it matches what was written. + * + * We log writes only after they have been flushed, this makes the log describe + * close to the order in which the data hits the actual disk, not its cache. So + * for example the following sequence (W means write, C means complete) + * + * Wa,Wb,Wc,Cc,Ca,FLUSH,FUAd,Cb,CFLUSH,CFUAd + * + * Would result in the log looking like this: + * + * c,a,flush,fuad,b,<other writes>,<next flush> + * + * This is meant to help expose problems where file systems do not properly wait + * on data being written before invoking a FLUSH. FUA bypasses cache so once it + * completes it is added to the log as it should be on disk. + * + * We treat DISCARDs as if they don't bypass cache so that they are logged in + * order of completion along with the normal writes. If we didn't do it this + * way we would process all the discards first and then write all the data, when + * in fact we want to do the data and the discard in the order that they + * completed. + */ +#define LOG_FLUSH_FLAG (1 << 0) +#define LOG_FUA_FLAG (1 << 1) +#define LOG_DISCARD_FLAG (1 << 2) +#define LOG_MARK_FLAG (1 << 3) + +#define WRITE_LOG_VERSION 1 +#define WRITE_LOG_MAGIC 0x6a736677736872 + +/* + * The disk format for this is braindead simple. + * + * At byte 0 we have our super, followed by the following sequence for + * nr_entries: + * + * [ 1 sector ][ entry->nr_sectors ] + * [log_write_entry][ data written ] + * + * The log_write_entry takes up a full sector so we can have arbitrary length + * marks and it leaves us room for extra content in the future. + */ + +/* + * Basic info about the log for userspace. + */ +struct log_write_super { + __le64 magic; + __le64 version; + __le64 nr_entries; + __le32 sectorsize; +}; + +/* + * sector - the sector we wrote. + * nr_sectors - the number of sectors we wrote. + * flags - flags for this log entry. + * data_len - the size of the data in this log entry, this is for private log + * entry stuff, the MARK data provided by userspace for example. + */ +struct log_write_entry { + __le64 sector; + __le64 nr_sectors; + __le64 flags; + __le64 data_len; +}; + +struct log_writes_c { + struct dm_dev *dev; + struct dm_dev *logdev; + u64 logged_entries; + u32 sectorsize; + atomic_t io_blocks; + atomic_t pending_blocks; + sector_t next_sector; + sector_t end_sector; + bool logging_enabled; + bool device_supports_discard; + spinlock_t blocks_lock; + struct list_head unflushed_blocks; + struct list_head logging_blocks; + wait_queue_head_t wait; + struct task_struct *log_kthread; +}; + +struct pending_block { + int vec_cnt; + u64 flags; + sector_t sector; + sector_t nr_sectors; + char *data; + u32 datalen; + struct list_head list; + struct bio_vec vecs[0]; +}; + +struct per_bio_data { + struct pending_block *block; +}; + +static void put_pending_block(struct log_writes_c *lc) +{ + if (atomic_dec_and_test(&lc->pending_blocks)) { + smp_mb__after_atomic(); + if (waitqueue_active(&lc->wait)) + wake_up(&lc->wait); + } +} + +static void put_io_block(struct log_writes_c *lc) +{ + if (atomic_dec_and_test(&lc->io_blocks)) { + smp_mb__after_atomic(); + if (waitqueue_active(&lc->wait)) + wake_up(&lc->wait); + } +} + +static void log_end_io(struct bio *bio, int err) +{ + struct log_writes_c *lc = bio->bi_private; + struct bio_vec *bvec; + int i; + + if (err) { + unsigned long flags; + + DMERR("Error writing log block, error=%d", err); + spin_lock_irqsave(&lc->blocks_lock, flags); + lc->logging_enabled = false; + spin_unlock_irqrestore(&lc->blocks_lock, flags); + } + + bio_for_each_segment_all(bvec, bio, i) + __free_page(bvec->bv_page); + + put_io_block(lc); + bio_put(bio); +} + +/* + * Meant to be called if there is an error, it will free all the pages + * associated with the block. + */ +static void free_pending_block(struct log_writes_c *lc, + struct pending_block *block) +{ + int i; + + for (i = 0; i < block->vec_cnt; i++) { + if (block->vecs[i].bv_page) + __free_page(block->vecs[i].bv_page); + } + kfree(block->data); + kfree(block); + put_pending_block(lc); +} + +static int write_metadata(struct log_writes_c *lc, void *entry, + size_t entrylen, void *data, size_t datalen, + sector_t sector) +{ + struct bio *bio; + struct page *page; + void *ptr; + size_t ret; + + bio = bio_alloc(GFP_KERNEL, 1); + if (!bio) { + DMERR("Couldn't alloc log bio"); + goto error; + } + bio->bi_iter.bi_size = 0; + bio->bi_iter.bi_sector = sector; + bio->bi_bdev = lc->logdev->bdev; + bio->bi_end_io = log_end_io; + bio->bi_private = lc; + set_bit(BIO_UPTODATE, &bio->bi_flags); + + page = alloc_page(GFP_KERNEL); + if (!page) { + DMERR("Couldn't alloc log page"); + bio_put(bio); + goto error; + } + + ptr = kmap_atomic(page); + memcpy(ptr, entry, entrylen); + if (datalen) + memcpy(ptr + entrylen, data, datalen); + memset(ptr + entrylen + datalen, 0, + lc->sectorsize - entrylen - datalen); + kunmap_atomic(ptr); + + ret = bio_add_page(bio, page, lc->sectorsize, 0); + if (ret != lc->sectorsize) { + DMERR("Couldn't add page to the log block"); + goto error_bio; + } + submit_bio(WRITE, bio); + return 0; +error_bio: + bio_put(bio); + __free_page(page); +error: + put_io_block(lc); + return -1; +} + +static int log_one_block(struct log_writes_c *lc, + struct pending_block *block, sector_t sector) +{ + struct bio *bio; + struct log_write_entry entry; + size_t ret; + int i; + + entry.sector = cpu_to_le64(block->sector); + entry.nr_sectors = cpu_to_le64(block->nr_sectors); + entry.flags = cpu_to_le64(block->flags); + entry.data_len = cpu_to_le64(block->datalen); + if (write_metadata(lc, &entry, sizeof(entry), block->data, + block->datalen, sector)) { + free_pending_block(lc, block); + return -1; + } + + if (!block->vec_cnt) + goto out; + sector++; + + bio = bio_alloc(GFP_KERNEL, block->vec_cnt); + if (!bio) { + DMERR("Couldn't alloc log bio"); + goto error; + } + atomic_inc(&lc->io_blocks); + bio->bi_iter.bi_size = 0; + bio->bi_iter.bi_sector = sector; + bio->bi_bdev = lc->logdev->bdev; + bio->bi_end_io = log_end_io; + bio->bi_private = lc; + set_bit(BIO_UPTODATE, &bio->bi_flags); + + for (i = 0; i < block->vec_cnt; i++) { + /* + * The page offset is always 0 because we allocate a new page + * for every bvec in the original bio for simplicity sake. + */ + ret = bio_add_page(bio, block->vecs[i].bv_page, + block->vecs[i].bv_len, 0); + if (ret != block->vecs[i].bv_len) { + atomic_inc(&lc->io_blocks); + submit_bio(WRITE, bio); + bio = bio_alloc(GFP_KERNEL, block->vec_cnt - i); + if (!bio) { + DMERR("Couldn't alloc log bio"); + goto error; + } + bio->bi_iter.bi_size = 0; + bio->bi_iter.bi_sector = sector; + bio->bi_bdev = lc->logdev->bdev; + bio->bi_end_io = log_end_io; + bio->bi_private = lc; + set_bit(BIO_UPTODATE, &bio->bi_flags); + + ret = bio_add_page(bio, block->vecs[i].bv_page, + block->vecs[i].bv_len, 0); + if (ret != block->vecs[i].bv_len) { + DMERR("Couldn't add page on new bio?"); + bio_put(bio); + goto error; + } + } + sector += block->vecs[i].bv_len >> SECTOR_SHIFT; + } + submit_bio(WRITE, bio); +out: + kfree(block->data); + kfree(block); + put_pending_block(lc); + return 0; +error: + free_pending_block(lc, block); + put_io_block(lc); + return -1; +} + +static int log_super(struct log_writes_c *lc) +{ + struct log_write_super super; + + super.magic = cpu_to_le64(WRITE_LOG_MAGIC); + super.version = cpu_to_le64(WRITE_LOG_VERSION); + super.nr_entries = cpu_to_le64(lc->logged_entries); + super.sectorsize = cpu_to_le32(lc->sectorsize); + + if (write_metadata(lc, &super, sizeof(super), NULL, 0, 0)) { + DMERR("Couldn't write super"); + return -1; + } + + return 0; +} + +static inline sector_t logdev_last_sector(struct log_writes_c *lc) +{ + return i_size_read(lc->logdev->bdev->bd_inode) >> SECTOR_SHIFT; +} + +static int log_writes_kthread(void *arg) +{ + struct log_writes_c *lc = (struct log_writes_c *)arg; + sector_t sector = 0; + + while (!kthread_should_stop()) { + bool super = false; + bool logging_enabled; + struct pending_block *block = NULL; + int ret; + + spin_lock_irq(&lc->blocks_lock); + if (!list_empty(&lc->logging_blocks)) { + block = list_first_entry(&lc->logging_blocks, + struct pending_block, list); + list_del_init(&block->list); + if (!lc->logging_enabled) + goto next; + + sector = lc->next_sector; + if (block->flags & LOG_DISCARD_FLAG) + lc->next_sector++; + else + lc->next_sector += block->nr_sectors + 1; + + /* + * Apparently the size of the device may not be known + * right away, so handle this properly. + */ + if (!lc->end_sector) + lc->end_sector = logdev_last_sector(lc); + if (lc->end_sector && + lc->next_sector >= lc->end_sector) { + DMERR("Ran out of space on the logdev"); + lc->logging_enabled = false; + goto next; + } + lc->logged_entries++; + atomic_inc(&lc->io_blocks); + + super = (block->flags & (LOG_FUA_FLAG | LOG_MARK_FLAG)); + if (super) + atomic_inc(&lc->io_blocks); + } +next: + logging_enabled = lc->logging_enabled; + spin_unlock_irq(&lc->blocks_lock); + if (block) { + if (logging_enabled) { + ret = log_one_block(lc, block, sector); + if (!ret && super) + ret = log_super(lc); + if (ret) { + spin_lock_irq(&lc->blocks_lock); + lc->logging_enabled = false; + spin_unlock_irq(&lc->blocks_lock); + } + } else + free_pending_block(lc, block); + continue; + } + + if (!try_to_freeze()) { + set_current_state(TASK_INTERRUPTIBLE); + if (!kthread_should_stop() && + !atomic_read(&lc->pending_blocks)) + schedule(); + __set_current_state(TASK_RUNNING); + } + } + return 0; +} + +/* + * Construct a log-writes mapping: + * log-writes <dev_path> <log_dev_path> + */ +static int log_writes_ctr(struct dm_target *ti, unsigned int argc, char **argv) +{ + struct log_writes_c *lc; + struct dm_arg_set as; + const char *devname, *logdevname; + + as.argc = argc; + as.argv = argv; + + if (argc < 2) { + ti->error = "Invalid argument count"; + return -EINVAL; + } + + lc = kzalloc(sizeof(struct log_writes_c), GFP_KERNEL); + if (!lc) { + ti->error = "Cannot allocate context"; + return -ENOMEM; + } + spin_lock_init(&lc->blocks_lock); + INIT_LIST_HEAD(&lc->unflushed_blocks); + INIT_LIST_HEAD(&lc->logging_blocks); + init_waitqueue_head(&lc->wait); + lc->sectorsize = 1 << SECTOR_SHIFT; + atomic_set(&lc->io_blocks, 0); + atomic_set(&lc->pending_blocks, 0); + + devname = dm_shift_arg(&as); + if (dm_get_device(ti, devname, dm_table_get_mode(ti->table), &lc->dev)) { + ti->error = "Device lookup failed"; + goto bad; + } + + logdevname = dm_shift_arg(&as); + if (dm_get_device(ti, logdevname, dm_table_get_mode(ti->table), &lc->logdev)) { + ti->error = "Log device lookup failed"; + dm_put_device(ti, lc->dev); + goto bad; + } + + lc->log_kthread = kthread_run(log_writes_kthread, lc, "log-write"); + if (!lc->log_kthread) { + ti->error = "Couldn't alloc kthread"; + dm_put_device(ti, lc->dev); + dm_put_device(ti, lc->logdev); + goto bad; + } + + /* We put the super at sector 0, start logging at sector 1 */ + lc->next_sector = 1; + lc->logging_enabled = true; + lc->end_sector = logdev_last_sector(lc); + lc->device_supports_discard = true; + + ti->num_flush_bios = 1; + ti->flush_supported = true; + ti->num_discard_bios = 1; + ti->discards_supported = true; + ti->per_bio_data_size = sizeof(struct per_bio_data); + ti->private = lc; + return 0; + +bad: + kfree(lc); + return -EINVAL; +} + +static int log_mark(struct log_writes_c *lc, char *data) +{ + struct pending_block *block; + size_t maxsize = lc->sectorsize - sizeof(struct log_write_entry); + + block = kzalloc(sizeof(struct pending_block), GFP_KERNEL); + if (!block) { + DMERR("Error allocating pending block"); + return -ENOMEM; + } + + block->data = kstrndup(data, maxsize, GFP_KERNEL); + if (!block->data) { + DMERR("Error copying mark data"); + kfree(block); + return -ENOMEM; + } + atomic_inc(&lc->pending_blocks); + block->datalen = strlen(block->data); + block->flags |= LOG_MARK_FLAG; + spin_lock_irq(&lc->blocks_lock); + list_add_tail(&block->list, &lc->logging_blocks); + spin_unlock_irq(&lc->blocks_lock); + wake_up_process(lc->log_kthread); + return 0; +} + +static void log_writes_dtr(struct dm_target *ti) +{ + struct log_writes_c *lc = ti->private; + + spin_lock_irq(&lc->blocks_lock); + list_splice_init(&lc->unflushed_blocks, &lc->logging_blocks); + spin_unlock_irq(&lc->blocks_lock); + + /* + * This is just nice to have since it'll update the super to include the + * unflushed blocks, if it fails we don't really care. + */ + log_mark(lc, "dm-log-writes-end"); + wake_up_process(lc->log_kthread); + wait_event(lc->wait, !atomic_read(&lc->io_blocks) && + !atomic_read(&lc->pending_blocks)); + kthread_stop(lc->log_kthread); + + WARN_ON(!list_empty(&lc->logging_blocks)); + WARN_ON(!list_empty(&lc->unflushed_blocks)); + dm_put_device(ti, lc->dev); + dm_put_device(ti, lc->logdev); + kfree(lc); +} + +static void normal_map_bio(struct dm_target *ti, struct bio *bio) +{ + struct log_writes_c *lc = ti->private; + + bio->bi_bdev = lc->dev->bdev; +} + +static int log_writes_map(struct dm_target *ti, struct bio *bio) +{ + struct log_writes_c *lc = ti->private; + struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data)); + struct pending_block *block; + struct bvec_iter iter; + struct bio_vec bv; + size_t alloc_size; + int i = 0; + bool flush_bio = (bio->bi_rw & REQ_FLUSH); + bool fua_bio = (bio->bi_rw & REQ_FUA); + bool discard_bio = (bio->bi_rw & REQ_DISCARD); + + pb->block = NULL; + + /* Don't bother doing anything if logging has been disabled */ + if (!lc->logging_enabled) + goto map_bio; + + /* + * Map reads as normal. + */ + if (bio_data_dir(bio) == READ) + goto map_bio; + + /* No sectors and not a flush? Don't care */ + if (!bio_sectors(bio) && !flush_bio) + goto map_bio; + + /* + * Discards will have bi_size set but there's no actual data, so just + * allocate the size of the pending block. + */ + if (discard_bio) + alloc_size = sizeof(struct pending_block); + else + alloc_size = sizeof(struct pending_block) + sizeof(struct bio_vec) * bio_segments(bio); + + block = kzalloc(alloc_size, GFP_NOIO); + if (!block) { + DMERR("Error allocating pending block"); + spin_lock_irq(&lc->blocks_lock); + lc->logging_enabled = false; + spin_unlock_irq(&lc->blocks_lock); + return -ENOMEM; + } + INIT_LIST_HEAD(&block->list); + pb->block = block; + atomic_inc(&lc->pending_blocks); + + if (flush_bio) + block->flags |= LOG_FLUSH_FLAG; + if (fua_bio) + block->flags |= LOG_FUA_FLAG; + if (discard_bio) + block->flags |= LOG_DISCARD_FLAG; + + block->sector = bio->bi_iter.bi_sector; + block->nr_sectors = bio_sectors(bio); + + /* We don't need the data, just submit */ + if (discard_bio) { + WARN_ON(flush_bio || fua_bio); + if (lc->device_supports_discard) + goto map_bio; + bio_endio(bio, 0); + return DM_MAPIO_SUBMITTED; + } + + /* Flush bio, splice the unflushed blocks onto this list and submit */ + if (flush_bio && !bio_sectors(bio)) { + spin_lock_irq(&lc->blocks_lock); + list_splice_init(&lc->unflushed_blocks, &block->list); + spin_unlock_irq(&lc->blocks_lock); + goto map_bio; + } + + /* + * We will write this bio somewhere else way later so we need to copy + * the actual contents into new pages so we know the data will always be + * there. + * + * We do this because this could be a bio from O_DIRECT in which case we + * can't just hold onto the page until some later point, we have to + * manually copy the contents. + */ + bio_for_each_segment(bv, bio, iter) { + struct page *page; + void *src, *dst; + + page = alloc_page(GFP_NOIO); + if (!page) { + DMERR("Error allocing page"); + free_pending_block(lc, block); + spin_lock_irq(&lc->blocks_lock); + lc->logging_enabled = false; + spin_unlock_irq(&lc->blocks_lock); + return -ENOMEM; + } + + src = kmap_atomic(bv.bv_page); + dst = kmap_atomic(page); + memcpy(dst, src + bv.bv_offset, bv.bv_len); + kunmap_atomic(dst); + kunmap_atomic(src); + block->vecs[i].bv_page = page; + block->vecs[i].bv_len = bv.bv_len; + block->vec_cnt++; + i++; + } + + /* Had a flush with data in it, weird */ + if (flush_bio) { + spin_lock_irq(&lc->blocks_lock); + list_splice_init(&lc->unflushed_blocks, &block->list); + spin_unlock_irq(&lc->blocks_lock); + } +map_bio: + normal_map_bio(ti, bio); + return DM_MAPIO_REMAPPED; +} + +static int normal_end_io(struct dm_target *ti, struct bio *bio, int error) +{ + struct log_writes_c *lc = ti->private; + struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data)); + + if (bio_data_dir(bio) == WRITE && pb->block) { + struct pending_block *block = pb->block; + unsigned long flags; + + spin_lock_irqsave(&lc->blocks_lock, flags); + if (block->flags & LOG_FLUSH_FLAG) { + list_splice_tail_init(&block->list, &lc->logging_blocks); + list_add_tail(&block->list, &lc->logging_blocks); + wake_up_process(lc->log_kthread); + } else if (block->flags & LOG_FUA_FLAG) { + list_add_tail(&block->list, &lc->logging_blocks); + wake_up_process(lc->log_kthread); + } else + list_add_tail(&block->list, &lc->unflushed_blocks); + spin_unlock_irqrestore(&lc->blocks_lock, flags); + } + + return error; +} + +/* + * INFO format: <logged entries> <highest allocated sector> + */ +static void log_writes_status(struct dm_target *ti, status_type_t type, + unsigned status_flags, char *result, + unsigned maxlen) +{ + unsigned sz = 0; + struct log_writes_c *lc = ti->private; + + switch (type) { + case STATUSTYPE_INFO: + DMEMIT("%llu %llu", lc->logged_entries, + (unsigned long long)lc->next_sector - 1); + if (!lc->logging_enabled) + DMEMIT(" logging_disabled"); + break; + + case STATUSTYPE_TABLE: + DMEMIT("%s %s", lc->dev->name, lc->logdev->name); + break; + } +} + +static int log_writes_ioctl(struct dm_target *ti, unsigned int cmd, + unsigned long arg) +{ + struct log_writes_c *lc = ti->private; + struct dm_dev *dev = lc->dev; + int r = 0; + + /* + * Only pass ioctls through if the device sizes match exactly. + */ + if (ti->len != i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT) + r = scsi_verify_blk_ioctl(NULL, cmd); + + return r ? : __blkdev_driver_ioctl(dev->bdev, dev->mode, cmd, arg); +} + +static int log_writes_merge(struct dm_target *ti, struct bvec_merge_data *bvm, + struct bio_vec *biovec, int max_size) +{ + struct log_writes_c *lc = ti->private; + struct request_queue *q = bdev_get_queue(lc->dev->bdev); + + if (!q->merge_bvec_fn) + return max_size; + + bvm->bi_bdev = lc->dev->bdev; + bvm->bi_sector = dm_target_offset(ti, bvm->bi_sector); + + return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); +} + +static int log_writes_iterate_devices(struct dm_target *ti, + iterate_devices_callout_fn fn, + void *data) +{ + struct log_writes_c *lc = ti->private; + + return fn(ti, lc->dev, 0, ti->len, data); +} + +/* + * Messages supported: + * mark <mark data> - specify the marked data. + */ +static int log_writes_message(struct dm_target *ti, unsigned argc, char **argv) +{ + int r = -EINVAL; + struct log_writes_c *lc = ti->private; + + if (argc != 2) { + DMWARN("Invalid log-writes message arguments, expect 2 arguments, got %d", argc); + return r; + } + + if (!strcasecmp(argv[0], "mark")) + r = log_mark(lc, argv[1]); + else + DMWARN("Unrecognised log writes target message received: %s", argv[0]); + + return r; +} + +static void log_writes_io_hints(struct dm_target *ti, struct queue_limits *limits) +{ + struct log_writes_c *lc = ti->private; + struct request_queue *q = bdev_get_queue(lc->dev->bdev); + + if (!q || !blk_queue_discard(q)) { + lc->device_supports_discard = false; + limits->discard_granularity = 1 << SECTOR_SHIFT; + limits->max_discard_sectors = (UINT_MAX >> SECTOR_SHIFT); + } +} + +static struct target_type log_writes_target = { + .name = "log-writes", + .version = {1, 0, 0}, + .module = THIS_MODULE, + .ctr = log_writes_ctr, + .dtr = log_writes_dtr, + .map = log_writes_map, + .end_io = normal_end_io, + .status = log_writes_status, + .ioctl = log_writes_ioctl, + .merge = log_writes_merge, + .message = log_writes_message, + .iterate_devices = log_writes_iterate_devices, + .io_hints = log_writes_io_hints, +}; + +static int __init dm_log_writes_init(void) +{ + int r = dm_register_target(&log_writes_target); + + if (r < 0) + DMERR("register failed %d", r); + + return r; +} + +static void __exit dm_log_writes_exit(void) +{ + dm_unregister_target(&log_writes_target); +} + +module_init(dm_log_writes_init); +module_exit(dm_log_writes_exit); + +MODULE_DESCRIPTION(DM_NAME " log writes target"); +MODULE_AUTHOR("Josef Bacik <jbacik@fb.com>"); +MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index d376dc87716e..63953477a07c 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -428,7 +428,7 @@ static int __multipath_map(struct dm_target *ti, struct request *clone, } else { /* blk-mq request-based interface */ *__clone = blk_get_request(bdev_get_queue(bdev), - rq_data_dir(rq), GFP_KERNEL); + rq_data_dir(rq), GFP_ATOMIC); if (IS_ERR(*__clone)) /* ENOMEM, requeue */ return r; @@ -1627,7 +1627,7 @@ static int __pgpath_busy(struct pgpath *pgpath) { struct request_queue *q = bdev_get_queue(pgpath->path.dev->bdev); - return dm_underlying_device_busy(q); + return blk_lld_busy(q); } /* @@ -1703,7 +1703,7 @@ out: *---------------------------------------------------------------*/ static struct target_type multipath_target = { .name = "multipath", - .version = {1, 8, 0}, + .version = {1, 9, 0}, .module = THIS_MODULE, .ctr = multipath_ctr, .dtr = multipath_dtr, diff --git a/drivers/md/dm-sysfs.c b/drivers/md/dm-sysfs.c index c62c5ab6aed5..7e818f5f1dc4 100644 --- a/drivers/md/dm-sysfs.c +++ b/drivers/md/dm-sysfs.c @@ -11,7 +11,7 @@ struct dm_sysfs_attr { struct attribute attr; ssize_t (*show)(struct mapped_device *, char *); - ssize_t (*store)(struct mapped_device *, char *); + ssize_t (*store)(struct mapped_device *, const char *, size_t count); }; #define DM_ATTR_RO(_name) \ @@ -39,6 +39,31 @@ static ssize_t dm_attr_show(struct kobject *kobj, struct attribute *attr, return ret; } +#define DM_ATTR_RW(_name) \ +struct dm_sysfs_attr dm_attr_##_name = \ + __ATTR(_name, S_IRUGO | S_IWUSR, dm_attr_##_name##_show, dm_attr_##_name##_store) + +static ssize_t dm_attr_store(struct kobject *kobj, struct attribute *attr, + const char *page, size_t count) +{ + struct dm_sysfs_attr *dm_attr; + struct mapped_device *md; + ssize_t ret; + + dm_attr = container_of(attr, struct dm_sysfs_attr, attr); + if (!dm_attr->store) + return -EIO; + + md = dm_get_from_kobject(kobj); + if (!md) + return -EINVAL; + + ret = dm_attr->store(md, page, count); + dm_put(md); + + return ret; +} + static ssize_t dm_attr_name_show(struct mapped_device *md, char *buf) { if (dm_copy_name_and_uuid(md, buf, NULL)) @@ -64,25 +89,33 @@ static ssize_t dm_attr_suspended_show(struct mapped_device *md, char *buf) return strlen(buf); } +static ssize_t dm_attr_use_blk_mq_show(struct mapped_device *md, char *buf) +{ + sprintf(buf, "%d\n", dm_use_blk_mq(md)); + + return strlen(buf); +} + static DM_ATTR_RO(name); static DM_ATTR_RO(uuid); static DM_ATTR_RO(suspended); +static DM_ATTR_RO(use_blk_mq); +static DM_ATTR_RW(rq_based_seq_io_merge_deadline); static struct attribute *dm_attrs[] = { &dm_attr_name.attr, &dm_attr_uuid.attr, &dm_attr_suspended.attr, + &dm_attr_use_blk_mq.attr, + &dm_attr_rq_based_seq_io_merge_deadline.attr, NULL, }; static const struct sysfs_ops dm_sysfs_ops = { .show = dm_attr_show, + .store = dm_attr_store, }; -/* - * dm kobject is embedded in mapped_device structure - * no need to define release function here - */ static struct kobj_type dm_ktype = { .sysfs_ops = &dm_sysfs_ops, .default_attrs = dm_attrs, diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 6554d9148927..d9b00b8565c6 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -18,6 +18,8 @@ #include <linux/mutex.h> #include <linux/delay.h> #include <linux/atomic.h> +#include <linux/blk-mq.h> +#include <linux/mount.h> #define DM_MSG_PREFIX "table" @@ -372,23 +374,18 @@ int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode, int r; dev_t uninitialized_var(dev); struct dm_dev_internal *dd; - unsigned int major, minor; struct dm_table *t = ti->table; - char dummy; + struct block_device *bdev; BUG_ON(!t); - if (sscanf(path, "%u:%u%c", &major, &minor, &dummy) == 2) { - /* Extract the major/minor numbers */ - dev = MKDEV(major, minor); - if (MAJOR(dev) != major || MINOR(dev) != minor) - return -EOVERFLOW; + /* convert the path to a device */ + bdev = lookup_bdev(path); + if (IS_ERR(bdev)) { + dev = name_to_dev_t(path); + if (!dev) + return -ENODEV; } else { - /* convert the path to a device */ - struct block_device *bdev = lookup_bdev(path); - - if (IS_ERR(bdev)) - return PTR_ERR(bdev); dev = bdev->bd_dev; bdput(bdev); } @@ -939,7 +936,7 @@ bool dm_table_mq_request_based(struct dm_table *t) return dm_table_get_type(t) == DM_TYPE_MQ_REQUEST_BASED; } -static int dm_table_alloc_md_mempools(struct dm_table *t) +static int dm_table_alloc_md_mempools(struct dm_table *t, struct mapped_device *md) { unsigned type = dm_table_get_type(t); unsigned per_bio_data_size = 0; @@ -957,7 +954,7 @@ static int dm_table_alloc_md_mempools(struct dm_table *t) per_bio_data_size = max(per_bio_data_size, tgt->per_bio_data_size); } - t->mempools = dm_alloc_md_mempools(type, t->integrity_supported, per_bio_data_size); + t->mempools = dm_alloc_md_mempools(md, type, t->integrity_supported, per_bio_data_size); if (!t->mempools) return -ENOMEM; @@ -1127,7 +1124,7 @@ int dm_table_complete(struct dm_table *t) return r; } - r = dm_table_alloc_md_mempools(t); + r = dm_table_alloc_md_mempools(t, t->md); if (r) DMERR("unable to allocate mempools"); @@ -1339,14 +1336,14 @@ static bool dm_table_supports_flush(struct dm_table *t, unsigned flush) continue; if (ti->flush_supported) - return 1; + return true; if (ti->type->iterate_devices && ti->type->iterate_devices(ti, device_flush_capable, &flush)) - return 1; + return true; } - return 0; + return false; } static bool dm_table_discard_zeroes_data(struct dm_table *t) @@ -1359,10 +1356,10 @@ static bool dm_table_discard_zeroes_data(struct dm_table *t) ti = dm_table_get_target(t, i++); if (ti->discard_zeroes_data_unsupported) - return 0; + return false; } - return 1; + return true; } static int device_is_nonrot(struct dm_target *ti, struct dm_dev *dev, @@ -1408,10 +1405,10 @@ static bool dm_table_all_devices_attribute(struct dm_table *t, if (!ti->type->iterate_devices || !ti->type->iterate_devices(ti, func, NULL)) - return 0; + return false; } - return 1; + return true; } static int device_not_write_same_capable(struct dm_target *ti, struct dm_dev *dev, @@ -1468,14 +1465,14 @@ static bool dm_table_supports_discards(struct dm_table *t) continue; if (ti->discards_supported) - return 1; + return true; if (ti->type->iterate_devices && ti->type->iterate_devices(ti, device_discard_capable, NULL)) - return 1; + return true; } - return 0; + return false; } void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, @@ -1677,20 +1674,6 @@ int dm_table_any_congested(struct dm_table *t, int bdi_bits) return r; } -int dm_table_any_busy_target(struct dm_table *t) -{ - unsigned i; - struct dm_target *ti; - - for (i = 0; i < t->num_targets; i++) { - ti = t->targets + i; - if (ti->type->busy && ti->type->busy(ti)) - return 1; - } - - return 0; -} - struct mapped_device *dm_table_get_md(struct dm_table *t) { return t->md; @@ -1709,9 +1692,13 @@ void dm_table_run_md_queue_async(struct dm_table *t) md = dm_table_get_md(t); queue = dm_get_md_queue(md); if (queue) { - spin_lock_irqsave(queue->queue_lock, flags); - blk_run_queue_async(queue); - spin_unlock_irqrestore(queue->queue_lock, flags); + if (queue->mq_ops) + blk_mq_run_hw_queues(queue, true); + else { + spin_lock_irqsave(queue->queue_lock, flags); + blk_run_queue_async(queue); + spin_unlock_irqrestore(queue->queue_lock, flags); + } } } EXPORT_SYMBOL(dm_table_run_md_queue_async); diff --git a/drivers/md/dm-verity.c b/drivers/md/dm-verity.c index 7a7bab8947ae..66616db33e6f 100644 --- a/drivers/md/dm-verity.c +++ b/drivers/md/dm-verity.c @@ -18,20 +18,39 @@ #include <linux/module.h> #include <linux/device-mapper.h> +#include <linux/reboot.h> #include <crypto/hash.h> #define DM_MSG_PREFIX "verity" +#define DM_VERITY_ENV_LENGTH 42 +#define DM_VERITY_ENV_VAR_NAME "DM_VERITY_ERR_BLOCK_NR" + #define DM_VERITY_IO_VEC_INLINE 16 #define DM_VERITY_MEMPOOL_SIZE 4 #define DM_VERITY_DEFAULT_PREFETCH_SIZE 262144 #define DM_VERITY_MAX_LEVELS 63 +#define DM_VERITY_MAX_CORRUPTED_ERRS 100 + +#define DM_VERITY_OPT_LOGGING "ignore_corruption" +#define DM_VERITY_OPT_RESTART "restart_on_corruption" static unsigned dm_verity_prefetch_cluster = DM_VERITY_DEFAULT_PREFETCH_SIZE; module_param_named(prefetch_cluster, dm_verity_prefetch_cluster, uint, S_IRUGO | S_IWUSR); +enum verity_mode { + DM_VERITY_MODE_EIO, + DM_VERITY_MODE_LOGGING, + DM_VERITY_MODE_RESTART +}; + +enum verity_block_type { + DM_VERITY_BLOCK_TYPE_DATA, + DM_VERITY_BLOCK_TYPE_METADATA +}; + struct dm_verity { struct dm_dev *data_dev; struct dm_dev *hash_dev; @@ -54,6 +73,8 @@ struct dm_verity { unsigned digest_size; /* digest size for the current hash algorithm */ unsigned shash_descsize;/* the size of temporary space for crypto */ int hash_failed; /* set to 1 if hash of any block failed */ + enum verity_mode mode; /* mode for handling verification errors */ + unsigned corrupted_errs;/* Number of errors for corrupted blocks */ mempool_t *vec_mempool; /* mempool of bio vector */ @@ -175,6 +196,57 @@ static void verity_hash_at_level(struct dm_verity *v, sector_t block, int level, } /* + * Handle verification errors. + */ +static int verity_handle_err(struct dm_verity *v, enum verity_block_type type, + unsigned long long block) +{ + char verity_env[DM_VERITY_ENV_LENGTH]; + char *envp[] = { verity_env, NULL }; + const char *type_str = ""; + struct mapped_device *md = dm_table_get_md(v->ti->table); + + /* Corruption should be visible in device status in all modes */ + v->hash_failed = 1; + + if (v->corrupted_errs >= DM_VERITY_MAX_CORRUPTED_ERRS) + goto out; + + v->corrupted_errs++; + + switch (type) { + case DM_VERITY_BLOCK_TYPE_DATA: + type_str = "data"; + break; + case DM_VERITY_BLOCK_TYPE_METADATA: + type_str = "metadata"; + break; + default: + BUG(); + } + + DMERR("%s: %s block %llu is corrupted", v->data_dev->name, type_str, + block); + + if (v->corrupted_errs == DM_VERITY_MAX_CORRUPTED_ERRS) + DMERR("%s: reached maximum errors", v->data_dev->name); + + snprintf(verity_env, DM_VERITY_ENV_LENGTH, "%s=%d,%llu", + DM_VERITY_ENV_VAR_NAME, type, block); + + kobject_uevent_env(&disk_to_dev(dm_disk(md))->kobj, KOBJ_CHANGE, envp); + +out: + if (v->mode == DM_VERITY_MODE_LOGGING) + return 0; + + if (v->mode == DM_VERITY_MODE_RESTART) + kernel_restart("dm-verity device corrupted"); + + return 1; +} + +/* * Verify hash of a metadata block pertaining to the specified data block * ("block" argument) at a specified level ("level" argument). * @@ -251,11 +323,11 @@ static int verity_verify_level(struct dm_verity_io *io, sector_t block, goto release_ret_r; } if (unlikely(memcmp(result, io_want_digest(v, io), v->digest_size))) { - DMERR_LIMIT("metadata block %llu is corrupted", - (unsigned long long)hash_block); - v->hash_failed = 1; - r = -EIO; - goto release_ret_r; + if (verity_handle_err(v, DM_VERITY_BLOCK_TYPE_METADATA, + hash_block)) { + r = -EIO; + goto release_ret_r; + } } else aux->hash_verified = 1; } @@ -367,10 +439,9 @@ test_block_hash: return r; } if (unlikely(memcmp(result, io_want_digest(v, io), v->digest_size))) { - DMERR_LIMIT("data block %llu is corrupted", - (unsigned long long)(io->block + b)); - v->hash_failed = 1; - return -EIO; + if (verity_handle_err(v, DM_VERITY_BLOCK_TYPE_DATA, + io->block + b)) + return -EIO; } } @@ -546,6 +617,19 @@ static void verity_status(struct dm_target *ti, status_type_t type, else for (x = 0; x < v->salt_size; x++) DMEMIT("%02x", v->salt[x]); + if (v->mode != DM_VERITY_MODE_EIO) { + DMEMIT(" 1 "); + switch (v->mode) { + case DM_VERITY_MODE_LOGGING: + DMEMIT(DM_VERITY_OPT_LOGGING); + break; + case DM_VERITY_MODE_RESTART: + DMEMIT(DM_VERITY_OPT_RESTART); + break; + default: + BUG(); + } + } break; } } @@ -647,13 +731,19 @@ static void verity_dtr(struct dm_target *ti) static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv) { struct dm_verity *v; - unsigned num; + struct dm_arg_set as; + const char *opt_string; + unsigned int num, opt_params; unsigned long long num_ll; int r; int i; sector_t hash_position; char dummy; + static struct dm_arg _args[] = { + {0, 1, "Invalid number of feature args"}, + }; + v = kzalloc(sizeof(struct dm_verity), GFP_KERNEL); if (!v) { ti->error = "Cannot allocate verity structure"; @@ -668,8 +758,8 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv) goto bad; } - if (argc != 10) { - ti->error = "Invalid argument count: exactly 10 arguments required"; + if (argc < 10) { + ti->error = "Not enough arguments"; r = -EINVAL; goto bad; } @@ -790,6 +880,39 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv) } } + argv += 10; + argc -= 10; + + /* Optional parameters */ + if (argc) { + as.argc = argc; + as.argv = argv; + + r = dm_read_arg_group(_args, &as, &opt_params, &ti->error); + if (r) + goto bad; + + while (opt_params) { + opt_params--; + opt_string = dm_shift_arg(&as); + if (!opt_string) { + ti->error = "Not enough feature arguments"; + r = -EINVAL; + goto bad; + } + + if (!strcasecmp(opt_string, DM_VERITY_OPT_LOGGING)) + v->mode = DM_VERITY_MODE_LOGGING; + else if (!strcasecmp(opt_string, DM_VERITY_OPT_RESTART)) + v->mode = DM_VERITY_MODE_RESTART; + else { + ti->error = "Invalid feature arguments"; + r = -EINVAL; + goto bad; + } + } + } + v->hash_per_block_bits = __fls((1 << v->hash_dev_block_bits) / v->digest_size); diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 8001fe9e3434..f8c7ca3e8947 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -21,6 +21,9 @@ #include <linux/delay.h> #include <linux/wait.h> #include <linux/kthread.h> +#include <linux/ktime.h> +#include <linux/elevator.h> /* for rq_end_sector() */ +#include <linux/blk-mq.h> #include <trace/events/block.h> @@ -216,8 +219,29 @@ struct mapped_device { struct kthread_worker kworker; struct task_struct *kworker_task; + + /* for request-based merge heuristic in dm_request_fn() */ + unsigned seq_rq_merge_deadline_usecs; + int last_rq_rw; + sector_t last_rq_pos; + ktime_t last_rq_start_time; + + /* for blk-mq request-based DM support */ + struct blk_mq_tag_set tag_set; + bool use_blk_mq; }; +#ifdef CONFIG_DM_MQ_DEFAULT +static bool use_blk_mq = true; +#else +static bool use_blk_mq = false; +#endif + +bool dm_use_blk_mq(struct mapped_device *md) +{ + return md->use_blk_mq; +} + /* * For mempools pre-allocation at the table loading time. */ @@ -250,35 +274,35 @@ static unsigned reserved_bio_based_ios = RESERVED_BIO_BASED_IOS; */ static unsigned reserved_rq_based_ios = RESERVED_REQUEST_BASED_IOS; -static unsigned __dm_get_reserved_ios(unsigned *reserved_ios, +static unsigned __dm_get_module_param(unsigned *module_param, unsigned def, unsigned max) { - unsigned ios = ACCESS_ONCE(*reserved_ios); - unsigned modified_ios = 0; + unsigned param = ACCESS_ONCE(*module_param); + unsigned modified_param = 0; - if (!ios) - modified_ios = def; - else if (ios > max) - modified_ios = max; + if (!param) + modified_param = def; + else if (param > max) + modified_param = max; - if (modified_ios) { - (void)cmpxchg(reserved_ios, ios, modified_ios); - ios = modified_ios; + if (modified_param) { + (void)cmpxchg(module_param, param, modified_param); + param = modified_param; } - return ios; + return param; } unsigned dm_get_reserved_bio_based_ios(void) { - return __dm_get_reserved_ios(&reserved_bio_based_ios, + return __dm_get_module_param(&reserved_bio_based_ios, RESERVED_BIO_BASED_IOS, RESERVED_MAX_IOS); } EXPORT_SYMBOL_GPL(dm_get_reserved_bio_based_ios); unsigned dm_get_reserved_rq_based_ios(void) { - return __dm_get_reserved_ios(&reserved_rq_based_ios, + return __dm_get_module_param(&reserved_rq_based_ios, RESERVED_REQUEST_BASED_IOS, RESERVED_MAX_IOS); } EXPORT_SYMBOL_GPL(dm_get_reserved_rq_based_ios); @@ -1017,6 +1041,11 @@ static void end_clone_bio(struct bio *clone, int error) blk_update_request(tio->orig, 0, nr_bytes); } +static struct dm_rq_target_io *tio_from_request(struct request *rq) +{ + return (rq->q->mq_ops ? blk_mq_rq_to_pdu(rq) : rq->special); +} + /* * Don't touch any member of the md after calling this function because * the md may be freed in dm_put() at the end of this function. @@ -1024,10 +1053,13 @@ static void end_clone_bio(struct bio *clone, int error) */ static void rq_completed(struct mapped_device *md, int rw, bool run_queue) { + int nr_requests_pending; + atomic_dec(&md->pending[rw]); /* nudge anyone waiting on suspend queue */ - if (!md_in_flight(md)) + nr_requests_pending = md_in_flight(md); + if (!nr_requests_pending) wake_up(&md->wait); /* @@ -1036,8 +1068,13 @@ static void rq_completed(struct mapped_device *md, int rw, bool run_queue) * back into ->request_fn() could deadlock attempting to grab the * queue lock again. */ - if (run_queue) - blk_run_queue_async(md->queue); + if (run_queue) { + if (md->queue->mq_ops) + blk_mq_run_hw_queues(md->queue, true); + else if (!nr_requests_pending || + (nr_requests_pending >= md->queue->nr_congestion_on)) + blk_run_queue_async(md->queue); + } /* * dm_put() must be at the end of this function. See the comment above @@ -1048,13 +1085,18 @@ static void rq_completed(struct mapped_device *md, int rw, bool run_queue) static void free_rq_clone(struct request *clone) { struct dm_rq_target_io *tio = clone->end_io_data; + struct mapped_device *md = tio->md; blk_rq_unprep_clone(clone); - if (clone->q && clone->q->mq_ops) + + if (clone->q->mq_ops) tio->ti->type->release_clone_rq(clone); - else - free_clone_request(tio->md, clone); - free_rq_tio(tio); + else if (!md->queue->mq_ops) + /* request_fn queue stacked on request_fn queue(s) */ + free_clone_request(md, clone); + + if (!md->queue->mq_ops) + free_rq_tio(tio); } /* @@ -1083,17 +1125,22 @@ static void dm_end_request(struct request *clone, int error) } free_rq_clone(clone); - blk_end_request_all(rq, error); + if (!rq->q->mq_ops) + blk_end_request_all(rq, error); + else + blk_mq_end_request(rq, error); rq_completed(md, rw, true); } static void dm_unprep_request(struct request *rq) { - struct dm_rq_target_io *tio = rq->special; + struct dm_rq_target_io *tio = tio_from_request(rq); struct request *clone = tio->clone; - rq->special = NULL; - rq->cmd_flags &= ~REQ_DONTPREP; + if (!rq->q->mq_ops) { + rq->special = NULL; + rq->cmd_flags &= ~REQ_DONTPREP; + } if (clone) free_rq_clone(clone); @@ -1102,18 +1149,29 @@ static void dm_unprep_request(struct request *rq) /* * Requeue the original request of a clone. */ -static void dm_requeue_unmapped_original_request(struct mapped_device *md, - struct request *rq) +static void old_requeue_request(struct request *rq) { - int rw = rq_data_dir(rq); struct request_queue *q = rq->q; unsigned long flags; - dm_unprep_request(rq); - spin_lock_irqsave(q->queue_lock, flags); blk_requeue_request(q, rq); spin_unlock_irqrestore(q->queue_lock, flags); +} + +static void dm_requeue_unmapped_original_request(struct mapped_device *md, + struct request *rq) +{ + int rw = rq_data_dir(rq); + + dm_unprep_request(rq); + + if (!rq->q->mq_ops) + old_requeue_request(rq); + else { + blk_mq_requeue_request(rq); + blk_mq_kick_requeue_list(rq->q); + } rq_completed(md, rw, false); } @@ -1125,35 +1183,44 @@ static void dm_requeue_unmapped_request(struct request *clone) dm_requeue_unmapped_original_request(tio->md, tio->orig); } -static void __stop_queue(struct request_queue *q) -{ - blk_stop_queue(q); -} - -static void stop_queue(struct request_queue *q) +static void old_stop_queue(struct request_queue *q) { unsigned long flags; + if (blk_queue_stopped(q)) + return; + spin_lock_irqsave(q->queue_lock, flags); - __stop_queue(q); + blk_stop_queue(q); spin_unlock_irqrestore(q->queue_lock, flags); } -static void __start_queue(struct request_queue *q) +static void stop_queue(struct request_queue *q) { - if (blk_queue_stopped(q)) - blk_start_queue(q); + if (!q->mq_ops) + old_stop_queue(q); + else + blk_mq_stop_hw_queues(q); } -static void start_queue(struct request_queue *q) +static void old_start_queue(struct request_queue *q) { unsigned long flags; spin_lock_irqsave(q->queue_lock, flags); - __start_queue(q); + if (blk_queue_stopped(q)) + blk_start_queue(q); spin_unlock_irqrestore(q->queue_lock, flags); } +static void start_queue(struct request_queue *q) +{ + if (!q->mq_ops) + old_start_queue(q); + else + blk_mq_start_stopped_hw_queues(q, true); +} + static void dm_done(struct request *clone, int error, bool mapped) { int r = error; @@ -1192,13 +1259,20 @@ static void dm_done(struct request *clone, int error, bool mapped) static void dm_softirq_done(struct request *rq) { bool mapped = true; - struct dm_rq_target_io *tio = rq->special; + struct dm_rq_target_io *tio = tio_from_request(rq); struct request *clone = tio->clone; + int rw; if (!clone) { - blk_end_request_all(rq, tio->error); - rq_completed(tio->md, rq_data_dir(rq), false); - free_rq_tio(tio); + rw = rq_data_dir(rq); + if (!rq->q->mq_ops) { + blk_end_request_all(rq, tio->error); + rq_completed(tio->md, rw, false); + free_rq_tio(tio); + } else { + blk_mq_end_request(rq, tio->error); + rq_completed(tio->md, rw, false); + } return; } @@ -1214,7 +1288,7 @@ static void dm_softirq_done(struct request *rq) */ static void dm_complete_request(struct request *rq, int error) { - struct dm_rq_target_io *tio = rq->special; + struct dm_rq_target_io *tio = tio_from_request(rq); tio->error = error; blk_complete_request(rq); @@ -1233,7 +1307,7 @@ static void dm_kill_unmapped_request(struct request *rq, int error) } /* - * Called with the clone's queue lock held + * Called with the clone's queue lock held (for non-blk-mq) */ static void end_clone_request(struct request *clone, int error) { @@ -1693,7 +1767,7 @@ out: * The request function that just remaps the bio built up by * dm_merge_bvec. */ -static void _dm_request(struct request_queue *q, struct bio *bio) +static void dm_make_request(struct request_queue *q, struct bio *bio) { int rw = bio_data_dir(bio); struct mapped_device *md = q->queuedata; @@ -1725,16 +1799,6 @@ int dm_request_based(struct mapped_device *md) return blk_queue_stackable(md->queue); } -static void dm_request(struct request_queue *q, struct bio *bio) -{ - struct mapped_device *md = q->queuedata; - - if (dm_request_based(md)) - blk_queue_bio(q, bio); - else - _dm_request(q, bio); -} - static void dm_dispatch_clone_request(struct request *clone, struct request *rq) { int r; @@ -1787,15 +1851,25 @@ static int setup_clone(struct request *clone, struct request *rq, static struct request *clone_rq(struct request *rq, struct mapped_device *md, struct dm_rq_target_io *tio, gfp_t gfp_mask) { - struct request *clone = alloc_clone_request(md, gfp_mask); + /* + * Do not allocate a clone if tio->clone was already set + * (see: dm_mq_queue_rq). + */ + bool alloc_clone = !tio->clone; + struct request *clone; - if (!clone) - return NULL; + if (alloc_clone) { + clone = alloc_clone_request(md, gfp_mask); + if (!clone) + return NULL; + } else + clone = tio->clone; blk_rq_init(NULL, clone); if (setup_clone(clone, rq, tio, gfp_mask)) { /* -ENOMEM */ - free_clone_request(md, clone); + if (alloc_clone) + free_clone_request(md, clone); return NULL; } @@ -1804,6 +1878,19 @@ static struct request *clone_rq(struct request *rq, struct mapped_device *md, static void map_tio_request(struct kthread_work *work); +static void init_tio(struct dm_rq_target_io *tio, struct request *rq, + struct mapped_device *md) +{ + tio->md = md; + tio->ti = NULL; + tio->clone = NULL; + tio->orig = rq; + tio->error = 0; + memset(&tio->info, 0, sizeof(tio->info)); + if (md->kworker_task) + init_kthread_work(&tio->work, map_tio_request); +} + static struct dm_rq_target_io *prep_tio(struct request *rq, struct mapped_device *md, gfp_t gfp_mask) { @@ -1815,13 +1902,7 @@ static struct dm_rq_target_io *prep_tio(struct request *rq, if (!tio) return NULL; - tio->md = md; - tio->ti = NULL; - tio->clone = NULL; - tio->orig = rq; - tio->error = 0; - memset(&tio->info, 0, sizeof(tio->info)); - init_kthread_work(&tio->work, map_tio_request); + init_tio(tio, rq, md); table = dm_get_live_table(md, &srcu_idx); if (!dm_table_mq_request_based(table)) { @@ -1865,11 +1946,11 @@ static int dm_prep_fn(struct request_queue *q, struct request *rq) * DM_MAPIO_REQUEUE : the original request needs to be requeued * < 0 : the request was completed due to failure */ -static int map_request(struct dm_target *ti, struct request *rq, +static int map_request(struct dm_rq_target_io *tio, struct request *rq, struct mapped_device *md) { int r; - struct dm_rq_target_io *tio = rq->special; + struct dm_target *ti = tio->ti; struct request *clone = NULL; if (tio->clone) { @@ -1884,7 +1965,7 @@ static int map_request(struct dm_target *ti, struct request *rq, } if (IS_ERR(clone)) return DM_MAPIO_REQUEUE; - if (setup_clone(clone, rq, tio, GFP_KERNEL)) { + if (setup_clone(clone, rq, tio, GFP_ATOMIC)) { /* -ENOMEM */ ti->type->release_clone_rq(clone); return DM_MAPIO_REQUEUE; @@ -1925,15 +2006,24 @@ static void map_tio_request(struct kthread_work *work) struct request *rq = tio->orig; struct mapped_device *md = tio->md; - if (map_request(tio->ti, rq, md) == DM_MAPIO_REQUEUE) + if (map_request(tio, rq, md) == DM_MAPIO_REQUEUE) dm_requeue_unmapped_original_request(md, rq); } static void dm_start_request(struct mapped_device *md, struct request *orig) { - blk_start_request(orig); + if (!orig->q->mq_ops) + blk_start_request(orig); + else + blk_mq_start_request(orig); atomic_inc(&md->pending[rq_data_dir(orig)]); + if (md->seq_rq_merge_deadline_usecs) { + md->last_rq_pos = rq_end_sector(orig); + md->last_rq_rw = rq_data_dir(orig); + md->last_rq_start_time = ktime_get(); + } + /* * Hold the md reference here for the in-flight I/O. * We can't rely on the reference count by device opener, @@ -1944,6 +2034,45 @@ static void dm_start_request(struct mapped_device *md, struct request *orig) dm_get(md); } +#define MAX_SEQ_RQ_MERGE_DEADLINE_USECS 100000 + +ssize_t dm_attr_rq_based_seq_io_merge_deadline_show(struct mapped_device *md, char *buf) +{ + return sprintf(buf, "%u\n", md->seq_rq_merge_deadline_usecs); +} + +ssize_t dm_attr_rq_based_seq_io_merge_deadline_store(struct mapped_device *md, + const char *buf, size_t count) +{ + unsigned deadline; + + if (!dm_request_based(md) || md->use_blk_mq) + return count; + + if (kstrtouint(buf, 10, &deadline)) + return -EINVAL; + + if (deadline > MAX_SEQ_RQ_MERGE_DEADLINE_USECS) + deadline = MAX_SEQ_RQ_MERGE_DEADLINE_USECS; + + md->seq_rq_merge_deadline_usecs = deadline; + + return count; +} + +static bool dm_request_peeked_before_merge_deadline(struct mapped_device *md) +{ + ktime_t kt_deadline; + + if (!md->seq_rq_merge_deadline_usecs) + return false; + + kt_deadline = ns_to_ktime((u64)md->seq_rq_merge_deadline_usecs * NSEC_PER_USEC); + kt_deadline = ktime_add_safe(md->last_rq_start_time, kt_deadline); + + return !ktime_after(ktime_get(), kt_deadline); +} + /* * q->request_fn for request-based dm. * Called with the queue lock held. @@ -1967,7 +2096,7 @@ static void dm_request_fn(struct request_queue *q) while (!blk_queue_stopped(q)) { rq = blk_peek_request(q); if (!rq) - goto delay_and_out; + goto out; /* always use block 0 to find the target for flushes for now */ pos = 0; @@ -1986,12 +2115,17 @@ static void dm_request_fn(struct request_queue *q) continue; } + if (dm_request_peeked_before_merge_deadline(md) && + md_in_flight(md) && rq->bio && rq->bio->bi_vcnt == 1 && + md->last_rq_pos == pos && md->last_rq_rw == rq_data_dir(rq)) + goto delay_and_out; + if (ti->type->busy && ti->type->busy(ti)) goto delay_and_out; dm_start_request(md, rq); - tio = rq->special; + tio = tio_from_request(rq); /* Establish tio->ti before queuing work (map_tio_request) */ tio->ti = ti; queue_kthread_work(&md->kworker, &tio->work); @@ -2001,33 +2135,11 @@ static void dm_request_fn(struct request_queue *q) goto out; delay_and_out: - blk_delay_queue(q, HZ / 10); + blk_delay_queue(q, HZ / 100); out: dm_put_live_table(md, srcu_idx); } -int dm_underlying_device_busy(struct request_queue *q) -{ - return blk_lld_busy(q); -} -EXPORT_SYMBOL_GPL(dm_underlying_device_busy); - -static int dm_lld_busy(struct request_queue *q) -{ - int r; - struct mapped_device *md = q->queuedata; - struct dm_table *map = dm_get_live_table_fast(md); - - if (!map || test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) - r = 1; - else - r = dm_table_any_busy_target(map); - - dm_put_live_table_fast(md); - - return r; -} - static int dm_any_congested(void *congested_data, int bdi_bits) { int r = bdi_bits; @@ -2110,7 +2222,7 @@ static void dm_init_md_queue(struct mapped_device *md) { /* * Request-based dm devices cannot be stacked on top of bio-based dm - * devices. The type of this dm device has not been decided yet. + * devices. The type of this dm device may not have been decided yet. * The type is decided at the first table loading time. * To prevent problematic device stacking, clear the queue flag * for request stacking support until then. @@ -2118,13 +2230,21 @@ static void dm_init_md_queue(struct mapped_device *md) * This queue is new, so no concurrency on the queue_flags. */ queue_flag_clear_unlocked(QUEUE_FLAG_STACKABLE, md->queue); +} + +static void dm_init_old_md_queue(struct mapped_device *md) +{ + md->use_blk_mq = false; + dm_init_md_queue(md); + /* + * Initialize aspects of queue that aren't relevant for blk-mq + */ md->queue->queuedata = md; md->queue->backing_dev_info.congested_fn = dm_any_congested; md->queue->backing_dev_info.congested_data = md; - blk_queue_make_request(md->queue, dm_request); + blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY); - blk_queue_merge_bvec(md->queue, dm_merge_bvec); } /* @@ -2156,6 +2276,7 @@ static struct mapped_device *alloc_dev(int minor) if (r < 0) goto bad_io_barrier; + md->use_blk_mq = use_blk_mq; md->type = DM_TYPE_NONE; mutex_init(&md->suspend_lock); mutex_init(&md->type_lock); @@ -2267,6 +2388,8 @@ static void free_dev(struct mapped_device *md) del_gendisk(md->disk); put_disk(md->disk); blk_cleanup_queue(md->queue); + if (md->use_blk_mq) + blk_mq_free_tag_set(&md->tag_set); bdput(md->bdev); free_minor(minor); @@ -2278,7 +2401,7 @@ static void __bind_mempools(struct mapped_device *md, struct dm_table *t) { struct dm_md_mempools *p = dm_table_get_md_mempools(t); - if (md->io_pool && md->bs) { + if (md->bs) { /* The md already has necessary mempools. */ if (dm_table_get_type(t) == DM_TYPE_BIO_BASED) { /* @@ -2310,7 +2433,7 @@ static void __bind_mempools(struct mapped_device *md, struct dm_table *t) p->bs = NULL; out: - /* mempool bind completed, now no need any mempools in the table */ + /* mempool bind completed, no longer need any mempools in the table */ dm_table_free_md_mempools(t); } @@ -2357,7 +2480,7 @@ int dm_queue_merge_is_compulsory(struct request_queue *q) if (!q->merge_bvec_fn) return 0; - if (q->make_request_fn == dm_request) { + if (q->make_request_fn == dm_make_request) { dev_md = q->queuedata; if (test_bit(DMF_MERGE_IS_OPTIONAL, &dev_md->flags)) return 0; @@ -2426,7 +2549,7 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t, * This must be done before setting the queue restrictions, * because request-based dm may be run just after the setting. */ - if (dm_table_request_based(t) && !blk_queue_stopped(q)) + if (dm_table_request_based(t)) stop_queue(q); __bind_mempools(md, t); @@ -2508,14 +2631,6 @@ unsigned dm_get_md_type(struct mapped_device *md) return md->type; } -static bool dm_md_type_request_based(struct mapped_device *md) -{ - unsigned table_type = dm_get_md_type(md); - - return (table_type == DM_TYPE_REQUEST_BASED || - table_type == DM_TYPE_MQ_REQUEST_BASED); -} - struct target_type *dm_get_immutable_target_type(struct mapped_device *md) { return md->immutable_target_type; @@ -2532,6 +2647,14 @@ struct queue_limits *dm_get_queue_limits(struct mapped_device *md) } EXPORT_SYMBOL_GPL(dm_get_queue_limits); +static void init_rq_based_worker_thread(struct mapped_device *md) +{ + /* Initialize the request-based DM worker thread */ + init_kthread_worker(&md->kworker); + md->kworker_task = kthread_run(kthread_worker_fn, &md->kworker, + "kdmwork-%s", dm_device_name(md)); +} + /* * Fully initialize a request-based queue (->elevator, ->request_fn, etc). */ @@ -2540,27 +2663,160 @@ static int dm_init_request_based_queue(struct mapped_device *md) struct request_queue *q = NULL; if (md->queue->elevator) - return 1; + return 0; /* Fully initialize the queue */ q = blk_init_allocated_queue(md->queue, dm_request_fn, NULL); if (!q) - return 0; + return -EINVAL; + + /* disable dm_request_fn's merge heuristic by default */ + md->seq_rq_merge_deadline_usecs = 0; md->queue = q; - dm_init_md_queue(md); + dm_init_old_md_queue(md); blk_queue_softirq_done(md->queue, dm_softirq_done); blk_queue_prep_rq(md->queue, dm_prep_fn); - blk_queue_lld_busy(md->queue, dm_lld_busy); - /* Also initialize the request-based DM worker thread */ - init_kthread_worker(&md->kworker); - md->kworker_task = kthread_run(kthread_worker_fn, &md->kworker, - "kdmwork-%s", dm_device_name(md)); + init_rq_based_worker_thread(md); elv_register_queue(md->queue); - return 1; + return 0; +} + +static int dm_mq_init_request(void *data, struct request *rq, + unsigned int hctx_idx, unsigned int request_idx, + unsigned int numa_node) +{ + struct mapped_device *md = data; + struct dm_rq_target_io *tio = blk_mq_rq_to_pdu(rq); + + /* + * Must initialize md member of tio, otherwise it won't + * be available in dm_mq_queue_rq. + */ + tio->md = md; + + return 0; +} + +static int dm_mq_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *bd) +{ + struct request *rq = bd->rq; + struct dm_rq_target_io *tio = blk_mq_rq_to_pdu(rq); + struct mapped_device *md = tio->md; + int srcu_idx; + struct dm_table *map = dm_get_live_table(md, &srcu_idx); + struct dm_target *ti; + sector_t pos; + + /* always use block 0 to find the target for flushes for now */ + pos = 0; + if (!(rq->cmd_flags & REQ_FLUSH)) + pos = blk_rq_pos(rq); + + ti = dm_table_find_target(map, pos); + if (!dm_target_is_valid(ti)) { + dm_put_live_table(md, srcu_idx); + DMERR_LIMIT("request attempted access beyond the end of device"); + /* + * Must perform setup, that rq_completed() requires, + * before returning BLK_MQ_RQ_QUEUE_ERROR + */ + dm_start_request(md, rq); + return BLK_MQ_RQ_QUEUE_ERROR; + } + dm_put_live_table(md, srcu_idx); + + if (ti->type->busy && ti->type->busy(ti)) + return BLK_MQ_RQ_QUEUE_BUSY; + + dm_start_request(md, rq); + + /* Init tio using md established in .init_request */ + init_tio(tio, rq, md); + + /* + * Establish tio->ti before queuing work (map_tio_request) + * or making direct call to map_request(). + */ + tio->ti = ti; + + /* Clone the request if underlying devices aren't blk-mq */ + if (dm_table_get_type(map) == DM_TYPE_REQUEST_BASED) { + /* clone request is allocated at the end of the pdu */ + tio->clone = (void *)blk_mq_rq_to_pdu(rq) + sizeof(struct dm_rq_target_io); + if (!clone_rq(rq, md, tio, GFP_ATOMIC)) + return BLK_MQ_RQ_QUEUE_BUSY; + queue_kthread_work(&md->kworker, &tio->work); + } else { + /* Direct call is fine since .queue_rq allows allocations */ + if (map_request(tio, rq, md) == DM_MAPIO_REQUEUE) + dm_requeue_unmapped_original_request(md, rq); + } + + return BLK_MQ_RQ_QUEUE_OK; +} + +static struct blk_mq_ops dm_mq_ops = { + .queue_rq = dm_mq_queue_rq, + .map_queue = blk_mq_map_queue, + .complete = dm_softirq_done, + .init_request = dm_mq_init_request, +}; + +static int dm_init_request_based_blk_mq_queue(struct mapped_device *md) +{ + unsigned md_type = dm_get_md_type(md); + struct request_queue *q; + int err; + + memset(&md->tag_set, 0, sizeof(md->tag_set)); + md->tag_set.ops = &dm_mq_ops; + md->tag_set.queue_depth = BLKDEV_MAX_RQ; + md->tag_set.numa_node = NUMA_NO_NODE; + md->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE; + md->tag_set.nr_hw_queues = 1; + if (md_type == DM_TYPE_REQUEST_BASED) { + /* make the memory for non-blk-mq clone part of the pdu */ + md->tag_set.cmd_size = sizeof(struct dm_rq_target_io) + sizeof(struct request); + } else + md->tag_set.cmd_size = sizeof(struct dm_rq_target_io); + md->tag_set.driver_data = md; + + err = blk_mq_alloc_tag_set(&md->tag_set); + if (err) + return err; + + q = blk_mq_init_allocated_queue(&md->tag_set, md->queue); + if (IS_ERR(q)) { + err = PTR_ERR(q); + goto out_tag_set; + } + md->queue = q; + dm_init_md_queue(md); + + /* backfill 'mq' sysfs registration normally done in blk_register_queue */ + blk_mq_register_disk(md->disk); + + if (md_type == DM_TYPE_REQUEST_BASED) + init_rq_based_worker_thread(md); + + return 0; + +out_tag_set: + blk_mq_free_tag_set(&md->tag_set); + return err; +} + +static unsigned filter_md_type(unsigned type, struct mapped_device *md) +{ + if (type == DM_TYPE_BIO_BASED) + return type; + + return !md->use_blk_mq ? DM_TYPE_REQUEST_BASED : DM_TYPE_MQ_REQUEST_BASED; } /* @@ -2568,9 +2824,29 @@ static int dm_init_request_based_queue(struct mapped_device *md) */ int dm_setup_md_queue(struct mapped_device *md) { - if (dm_md_type_request_based(md) && !dm_init_request_based_queue(md)) { - DMWARN("Cannot initialize queue for request-based mapped device"); - return -EINVAL; + int r; + unsigned md_type = filter_md_type(dm_get_md_type(md), md); + + switch (md_type) { + case DM_TYPE_REQUEST_BASED: + r = dm_init_request_based_queue(md); + if (r) { + DMWARN("Cannot initialize queue for request-based mapped device"); + return r; + } + break; + case DM_TYPE_MQ_REQUEST_BASED: + r = dm_init_request_based_blk_mq_queue(md); + if (r) { + DMWARN("Cannot initialize queue for request-based blk-mq mapped device"); + return r; + } + break; + case DM_TYPE_BIO_BASED: + dm_init_old_md_queue(md); + blk_queue_make_request(md->queue, dm_make_request); + blk_queue_merge_bvec(md->queue, dm_merge_bvec); + break; } return 0; @@ -2654,7 +2930,7 @@ static void __dm_destroy(struct mapped_device *md, bool wait) set_bit(DMF_FREEING, &md->flags); spin_unlock(&_minor_lock); - if (dm_request_based(md)) + if (dm_request_based(md) && md->kworker_task) flush_kthread_worker(&md->kworker); /* @@ -2908,7 +3184,8 @@ static int __dm_suspend(struct mapped_device *md, struct dm_table *map, */ if (dm_request_based(md)) { stop_queue(md->queue); - flush_kthread_worker(&md->kworker); + if (md->kworker_task) + flush_kthread_worker(&md->kworker); } flush_workqueue(md->wq); @@ -3206,6 +3483,7 @@ struct gendisk *dm_disk(struct mapped_device *md) { return md->disk; } +EXPORT_SYMBOL_GPL(dm_disk); struct kobject *dm_kobject(struct mapped_device *md) { @@ -3253,16 +3531,19 @@ int dm_noflush_suspending(struct dm_target *ti) } EXPORT_SYMBOL_GPL(dm_noflush_suspending); -struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity, unsigned per_bio_data_size) +struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, unsigned type, + unsigned integrity, unsigned per_bio_data_size) { struct dm_md_mempools *pools = kzalloc(sizeof(*pools), GFP_KERNEL); - struct kmem_cache *cachep; + struct kmem_cache *cachep = NULL; unsigned int pool_size = 0; unsigned int front_pad; if (!pools) return NULL; + type = filter_md_type(type, md); + switch (type) { case DM_TYPE_BIO_BASED: cachep = _io_cache; @@ -3270,13 +3551,13 @@ struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity, u front_pad = roundup(per_bio_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone); break; case DM_TYPE_REQUEST_BASED: + cachep = _rq_tio_cache; pool_size = dm_get_reserved_rq_based_ios(); pools->rq_pool = mempool_create_slab_pool(pool_size, _rq_cache); if (!pools->rq_pool) goto out; /* fall through to setup remaining rq-based pools */ case DM_TYPE_MQ_REQUEST_BASED: - cachep = _rq_tio_cache; if (!pool_size) pool_size = dm_get_reserved_rq_based_ios(); front_pad = offsetof(struct dm_rq_clone_bio_info, clone); @@ -3284,12 +3565,14 @@ struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity, u WARN_ON(per_bio_data_size != 0); break; default: - goto out; + BUG(); } - pools->io_pool = mempool_create_slab_pool(pool_size, cachep); - if (!pools->io_pool) - goto out; + if (cachep) { + pools->io_pool = mempool_create_slab_pool(pool_size, cachep); + if (!pools->io_pool) + goto out; + } pools->bs = bioset_create_nobvec(pool_size, front_pad); if (!pools->bs) @@ -3346,6 +3629,9 @@ MODULE_PARM_DESC(reserved_bio_based_ios, "Reserved IOs in bio-based mempools"); module_param(reserved_rq_based_ios, uint, S_IRUGO | S_IWUSR); MODULE_PARM_DESC(reserved_rq_based_ios, "Reserved IOs in request-based mempools"); +module_param(use_blk_mq, bool, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(use_blk_mq, "Use block multiqueue for request-based DM devices"); + MODULE_DESCRIPTION(DM_NAME " driver"); MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>"); MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm.h b/drivers/md/dm.h index 59f53e79db82..6123c2bf9150 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -70,7 +70,6 @@ void dm_table_presuspend_undo_targets(struct dm_table *t); void dm_table_postsuspend_targets(struct dm_table *t); int dm_table_resume_targets(struct dm_table *t); int dm_table_any_congested(struct dm_table *t, int bdi_bits); -int dm_table_any_busy_target(struct dm_table *t); unsigned dm_table_get_type(struct dm_table *t); struct target_type *dm_table_get_immutable_target_type(struct dm_table *t); bool dm_table_request_based(struct dm_table *t); @@ -212,6 +211,8 @@ int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action, void dm_internal_suspend(struct mapped_device *md); void dm_internal_resume(struct mapped_device *md); +bool dm_use_blk_mq(struct mapped_device *md); + int dm_io_init(void); void dm_io_exit(void); @@ -221,7 +222,8 @@ void dm_kcopyd_exit(void); /* * Mempool operations */ -struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity, unsigned per_bio_data_size); +struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, unsigned type, + unsigned integrity, unsigned per_bio_data_size); void dm_free_md_mempools(struct dm_md_mempools *pools); /* @@ -235,4 +237,8 @@ static inline bool dm_message_test_buffer_overflow(char *result, unsigned maxlen return !maxlen || strlen(result) + 1 >= maxlen; } +ssize_t dm_attr_rq_based_seq_io_merge_deadline_show(struct mapped_device *md, char *buf); +ssize_t dm_attr_rq_based_seq_io_merge_deadline_store(struct mapped_device *md, + const char *buf, size_t count); + #endif diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index fd23978d93fe..51cc1deb7af3 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -605,9 +605,4 @@ static inline unsigned long to_bytes(sector_t n) return (n << SECTOR_SHIFT); } -/*----------------------------------------------------------------- - * Helper for block layer and dm core operations - *---------------------------------------------------------------*/ -int dm_underlying_device_busy(struct request_queue *q); - #endif /* _LINUX_DEVICE_MAPPER_H */ diff --git a/include/linux/mount.h b/include/linux/mount.h index c2c561dc0114..bca086d62b1a 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -92,6 +92,6 @@ extern struct vfsmount *vfs_kern_mount(struct file_system_type *type, extern void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list); extern void mark_mounts_for_expiry(struct list_head *mounts); -extern dev_t name_to_dev_t(char *name); +extern dev_t name_to_dev_t(const char *name); #endif /* _LINUX_MOUNT_H */ diff --git a/include/uapi/linux/dm-ioctl.h b/include/uapi/linux/dm-ioctl.h index 889f3a5b7b18..eac8c3641f39 100644 --- a/include/uapi/linux/dm-ioctl.h +++ b/include/uapi/linux/dm-ioctl.h @@ -267,9 +267,9 @@ enum { #define DM_DEV_SET_GEOMETRY _IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl) #define DM_VERSION_MAJOR 4 -#define DM_VERSION_MINOR 30 +#define DM_VERSION_MINOR 31 #define DM_VERSION_PATCHLEVEL 0 -#define DM_VERSION_EXTRA "-ioctl (2014-12-22)" +#define DM_VERSION_EXTRA "-ioctl (2015-3-12)" /* Status bits */ #define DM_READONLY_FLAG (1 << 0) /* In/Out */ diff --git a/init/do_mounts.c b/init/do_mounts.c index eb410083e8e0..8369ffa5f33d 100644 --- a/init/do_mounts.c +++ b/init/do_mounts.c @@ -207,7 +207,7 @@ done: * bangs. */ -dev_t name_to_dev_t(char *name) +dev_t name_to_dev_t(const char *name) { char s[32]; char *p; @@ -226,8 +226,9 @@ dev_t name_to_dev_t(char *name) if (strncmp(name, "/dev/", 5) != 0) { unsigned maj, min; + char dummy; - if (sscanf(name, "%u:%u", &maj, &min) == 2) { + if (sscanf(name, "%u:%u%c", &maj, &min, &dummy) == 2) { res = MKDEV(maj, min); if (maj != MAJOR(res) || min != MINOR(res)) goto fail; @@ -286,6 +287,7 @@ fail: done: return res; } +EXPORT_SYMBOL_GPL(name_to_dev_t); static int __init root_dev_setup(char *line) { |