summary | refs | log | tree | commit | diff | stats
path: root/drivers/md/dm.c
diff options
context:
space:
mode:
author Linus Torvalds <torvalds@linux-foundation.org> 2021-06-30 18:19:39 -0700
committer Linus Torvalds <torvalds@linux-foundation.org> 2021-06-30 18:19:39 -0700
commit 2cfa582be80081fb8db02d4d9b44bff34b82ac54 (patch)
tree 2faf8db8426b389ca8c9ed76065c688431bb7eb9 /drivers/md/dm.c
parent dbe69e43372212527abf48609aba7fc39a6daa27 (diff)
parent 5c0de3d72f8c05678ed769bea24e98128f7ab570 (diff)
download linux-2cfa582be80081fb8db02d4d9b44bff34b82ac54.tar.bz2
Merge tag 'for-5.14/dm-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm
Pull device mapper updates from Mike Snitzer: - Various DM persistent-data library improvements and fixes that benefit both the DM thinp and cache targets. - A few small DM kcopyd efficiency improvements. - Significant zoned related block core, DM core and DM zoned target changes that culminate with adding zoned append emulation (which is required to properly fix DM crypt's zoned support). - Various DM writecache target changes that improve efficiency. Adds an optional "metadata_only" feature that only promotes bios flagged with REQ_META. But the most significant improvement is writecache's ability to pause writeback, for a configurable time, if/when the working set is larger than the cache (and the cache is full) -- this ensures performance is no worse than the slower origin device. * tag 'for-5.14/dm-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm: (35 commits) dm writecache: make writeback pause configurable dm writecache: pause writeback if cache full and origin being written directly dm io tracker: factor out IO tracker dm btree remove: assign new_root only when removal succeeds dm zone: fix dm_revalidate_zones() memory allocation dm ps io affinity: remove redundant continue statement dm writecache: add optional "metadata_only" parameter dm writecache: add "cleaner" and "max_age" to Documentation dm writecache: write at least 4k when committing dm writecache: flush origin device when writing and cache is full dm writecache: have ssd writeback wait if the kcopyd workqueue is busy dm writecache: use list_move instead of list_del/list_add in writecache_writeback() dm writecache: commit just one block, not a full page dm writecache: remove unused gfp_t argument from wc_add_block() dm crypt: Fix zoned block device support dm: introduce zone append emulation dm: rearrange core declarations for extended use from dm-zone.c block: introduce BIO_ZONE_WRITE_LOCKED bio flag block: introduce bio zone helpers block: improve handling of all
zones reset operation ...
Diffstat (limited to 'drivers/md/dm.c')
-rw-r--r-- drivers/md/dm.c 208
1 files changed, 55 insertions, 153 deletions
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index a57aba553ebb..2c5f9e585211 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -74,38 +74,6 @@ struct clone_info {
unsigned sector_count;
};
-/*
- * One of these is allocated per clone bio.
- */
-#define DM_TIO_MAGIC 7282014
-struct dm_target_io {
- unsigned magic;
- struct dm_io *io;
- struct dm_target *ti;
- unsigned target_bio_nr;
- unsigned *len_ptr;
- bool inside_dm_io;
- struct bio clone;
-};
-
-/*
- * One of these is allocated per original bio.
- * It contains the first clone used for that original.
- */
-#define DM_IO_MAGIC 5191977
-struct dm_io {
- unsigned magic;
- struct mapped_device *md;
- blk_status_t status;
- atomic_t io_count;
- struct bio *orig_bio;
- unsigned long start_time;
- spinlock_t endio_lock;
- struct dm_stats_aux stats_aux;
- /* last member of dm_target_io is 'struct bio' */
- struct dm_target_io tio;
-};
-
#define DM_TARGET_IO_BIO_OFFSET (offsetof(struct dm_target_io, clone))
#define DM_IO_BIO_OFFSET \
(offsetof(struct dm_target_io, clone) + offsetof(struct dm_io, tio))
@@ -137,19 +105,6 @@ EXPORT_SYMBOL_GPL(dm_bio_get_target_bio_nr);
#define MINOR_ALLOCED ((void *)-1)
-/*
- * Bits for the md->flags field.
- */
-#define DMF_BLOCK_IO_FOR_SUSPEND 0
-#define DMF_SUSPENDED 1
-#define DMF_FROZEN 2
-#define DMF_FREEING 3
-#define DMF_DELETING 4
-#define DMF_NOFLUSH_SUSPENDING 5
-#define DMF_DEFERRED_REMOVE 6
-#define DMF_SUSPENDED_INTERNALLY 7
-#define DMF_POST_SUSPENDING 8
-
#define DM_NUMA_NODE NUMA_NO_NODE
static int dm_numa_node = DM_NUMA_NODE;
@@ -444,84 +399,6 @@ static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
return dm_get_geometry(md, geo);
}
-#ifdef CONFIG_BLK_DEV_ZONED
-int dm_report_zones_cb(struct blk_zone *zone, unsigned int idx, void *data)
-{
- struct dm_report_zones_args *args = data;
- sector_t sector_diff = args->tgt->begin - args->start;
-
- /*
- * Ignore zones beyond the target range.
- */
- if (zone->start >= args->start + args->tgt->len)
- return 0;
-
- /*
- * Remap the start sector and write pointer position of the zone
- * to match its position in the target range.
- */
- zone->start += sector_diff;
- if (zone->type != BLK_ZONE_TYPE_CONVENTIONAL) {
- if (zone->cond == BLK_ZONE_COND_FULL)
- zone->wp = zone->start + zone->len;
- else if (zone->cond == BLK_ZONE_COND_EMPTY)
- zone->wp = zone->start;
- else
- zone->wp += sector_diff;
- }
-
- args->next_sector = zone->start + zone->len;
- return args->orig_cb(zone, args->zone_idx++, args->orig_data);
-}
-EXPORT_SYMBOL_GPL(dm_report_zones_cb);
-
-static int dm_blk_report_zones(struct gendisk *disk, sector_t sector,
- unsigned int nr_zones, report_zones_cb cb, void *data)
-{
- struct mapped_device *md = disk->private_data;
- struct dm_table *map;
- int srcu_idx, ret;
- struct dm_report_zones_args args = {
- .next_sector = sector,
- .orig_data = data,
- .orig_cb = cb,
- };
-
- if (dm_suspended_md(md))
- return -EAGAIN;
-
- map = dm_get_live_table(md, &srcu_idx);
- if (!map) {
- ret = -EIO;
- goto out;
- }
-
- do {
- struct dm_target *tgt;
-
- tgt = dm_table_find_target(map, args.next_sector);
- if (WARN_ON_ONCE(!tgt->type->report_zones)) {
- ret = -EIO;
- goto out;
- }
-
- args.tgt = tgt;
- ret = tgt->type->report_zones(tgt, &args,
- nr_zones - args.zone_idx);
- if (ret < 0)
- goto out;
- } while (args.zone_idx < nr_zones &&
- args.next_sector < get_capacity(disk));
-
- ret = args.zone_idx;
-out:
- dm_put_live_table(md, srcu_idx);
- return ret;
-}
-#else
-#define dm_blk_report_zones NULL
-#endif /* CONFIG_BLK_DEV_ZONED */
-
static int dm_prepare_ioctl(struct mapped_device *md, int *srcu_idx,
struct block_device **bdev)
{
@@ -903,7 +780,7 @@ static int __noflush_suspending(struct mapped_device *md)
* Decrements the number of outstanding ios that a bio has been
* cloned into, completing the original io if necc.
*/
-static void dec_pending(struct dm_io *io, blk_status_t error)
+void dm_io_dec_pending(struct dm_io *io, blk_status_t error)
{
unsigned long flags;
blk_status_t io_error;
@@ -919,22 +796,27 @@ static void dec_pending(struct dm_io *io, blk_status_t error)
}
if (atomic_dec_and_test(&io->io_count)) {
+ bio = io->orig_bio;
if (io->status == BLK_STS_DM_REQUEUE) {
/*
* Target requested pushing back the I/O.
*/
spin_lock_irqsave(&md->deferred_lock, flags);
- if (__noflush_suspending(md))
+ if (__noflush_suspending(md) &&
+ !WARN_ON_ONCE(dm_is_zone_write(md, bio))) {
/* NOTE early return due to BLK_STS_DM_REQUEUE below */
- bio_list_add_head(&md->deferred, io->orig_bio);
- else
- /* noflush suspend was interrupted. */
+ bio_list_add_head(&md->deferred, bio);
+ } else {
+ /*
+ * noflush suspend was interrupted or this is
+ * a write to a zoned target.
+ */
io->status = BLK_STS_IOERR;
+ }
spin_unlock_irqrestore(&md->deferred_lock, flags);
}
io_error = io->status;
- bio = io->orig_bio;
end_io_acct(io);
free_io(md, io);
@@ -994,7 +876,6 @@ static void clone_endio(struct bio *bio)
struct dm_io *io = tio->io;
struct mapped_device *md = tio->io->md;
dm_endio_fn endio = tio->ti->type->end_io;
- struct bio *orig_bio = io->orig_bio;
struct request_queue *q = bio->bi_bdev->bd_disk->queue;
if (unlikely(error == BLK_STS_TARGET)) {
@@ -1009,23 +890,22 @@ static void clone_endio(struct bio *bio)
disable_write_zeroes(md);
}
- /*
- * For zone-append bios get offset in zone of the written
- * sector and add that to the original bio sector pos.
- */
- if (bio_op(orig_bio) == REQ_OP_ZONE_APPEND) {
- sector_t written_sector = bio->bi_iter.bi_sector;
- struct request_queue *q = orig_bio->bi_bdev->bd_disk->queue;
- u64 mask = (u64)blk_queue_zone_sectors(q) - 1;
-
- orig_bio->bi_iter.bi_sector += written_sector & mask;
- }
+ if (blk_queue_is_zoned(q))
+ dm_zone_endio(io, bio);
if (endio) {
int r = endio(tio->ti, bio, &error);
switch (r) {
case DM_ENDIO_REQUEUE:
- error = BLK_STS_DM_REQUEUE;
+ /*
+ * Requeuing writes to a sequential zone of a zoned
+ * target will break the sequential write pattern:
+ * fail such IO.
+ */
+ if (WARN_ON_ONCE(dm_is_zone_write(md, bio)))
+ error = BLK_STS_IOERR;
+ else
+ error = BLK_STS_DM_REQUEUE;
fallthrough;
case DM_ENDIO_DONE:
break;
@@ -1044,7 +924,7 @@ static void clone_endio(struct bio *bio)
}
free_tio(tio);
- dec_pending(io, error);
+ dm_io_dec_pending(io, error);
}
/*
@@ -1237,8 +1117,8 @@ static int dm_dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff,
/*
* A target may call dm_accept_partial_bio only from the map routine. It is
- * allowed for all bio types except REQ_PREFLUSH, REQ_OP_ZONE_RESET,
- * REQ_OP_ZONE_OPEN, REQ_OP_ZONE_CLOSE and REQ_OP_ZONE_FINISH.
+ * allowed for all bio types except REQ_PREFLUSH, REQ_OP_ZONE_* zone management
+ * operations and REQ_OP_ZONE_APPEND (zone append writes).
*
* dm_accept_partial_bio informs the dm that the target only wants to process
* additional n_sectors sectors of the bio and the rest of the data should be
@@ -1268,9 +1148,13 @@ void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors)
{
struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
unsigned bi_size = bio->bi_iter.bi_size >> SECTOR_SHIFT;
+
BUG_ON(bio->bi_opf & REQ_PREFLUSH);
+ BUG_ON(op_is_zone_mgmt(bio_op(bio)));
+ BUG_ON(bio_op(bio) == REQ_OP_ZONE_APPEND);
BUG_ON(bi_size > *tio->len_ptr);
BUG_ON(n_sectors > bi_size);
+
*tio->len_ptr -= bi_size - n_sectors;
bio->bi_iter.bi_size = n_sectors << SECTOR_SHIFT;
}
@@ -1308,7 +1192,7 @@ static blk_qc_t __map_bio(struct dm_target_io *tio)
* anything, the target has assumed ownership of
* this io.
*/
- atomic_inc(&io->io_count);
+ dm_io_inc_pending(io);
sector = clone->bi_iter.bi_sector;
if (unlikely(swap_bios_limit(ti, clone))) {
@@ -1319,7 +1203,16 @@ static blk_qc_t __map_bio(struct dm_target_io *tio)
down(&md->swap_bios_semaphore);
}
- r = ti->type->map(ti, clone);
+ /*
+ * Check if the IO needs a special mapping due to zone append emulation
+ * on zoned target. In this case, dm_zone_map_bio() calls the target
+ * map operation.
+ */
+ if (dm_emulate_zone_append(io->md))
+ r = dm_zone_map_bio(tio);
+ else
+ r = ti->type->map(ti, clone);
+
switch (r) {
case DM_MAPIO_SUBMITTED:
break;
@@ -1334,7 +1227,7 @@ static blk_qc_t __map_bio(struct dm_target_io *tio)
up(&md->swap_bios_semaphore);
}
free_tio(tio);
- dec_pending(io, BLK_STS_IOERR);
+ dm_io_dec_pending(io, BLK_STS_IOERR);
break;
case DM_MAPIO_REQUEUE:
if (unlikely(swap_bios_limit(ti, clone))) {
@@ -1342,7 +1235,7 @@ static blk_qc_t __map_bio(struct dm_target_io *tio)
up(&md->swap_bios_semaphore);
}
free_tio(tio);
- dec_pending(io, BLK_STS_DM_REQUEUE);
+ dm_io_dec_pending(io, BLK_STS_DM_REQUEUE);
break;
default:
DMWARN("unimplemented target map return value: %d", r);
@@ -1631,7 +1524,7 @@ static blk_qc_t __split_and_process_bio(struct mapped_device *md,
if (bio->bi_opf & REQ_PREFLUSH) {
error = __send_empty_flush(&ci);
- /* dec_pending submits any data associated with flush */
+ /* dm_io_dec_pending submits any data associated with flush */
} else if (op_is_zone_mgmt(bio_op(bio))) {
ci.bio = bio;
ci.sector_count = 0;
@@ -1672,7 +1565,7 @@ static blk_qc_t __split_and_process_bio(struct mapped_device *md,
}
/* drop the extra reference count */
- dec_pending(ci.io, errno_to_blk_status(error));
+ dm_io_dec_pending(ci.io, errno_to_blk_status(error));
return ret;
}
@@ -1817,6 +1710,7 @@ static void cleanup_mapped_device(struct mapped_device *md)
mutex_destroy(&md->swap_bios_lock);
dm_mq_cleanup_mapped_device(md);
+ dm_cleanup_zoned_dev(md);
}
/*
@@ -2060,11 +1954,16 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
goto out;
}
+ ret = dm_table_set_restrictions(t, q, limits);
+ if (ret) {
+ old_map = ERR_PTR(ret);
+ goto out;
+ }
+
old_map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
rcu_assign_pointer(md->map, (void *)t);
md->immutable_target_type = dm_table_get_immutable_target_type(t);
- dm_table_set_restrictions(t, q, limits);
if (old_map)
dm_sync_table(md);
@@ -2183,7 +2082,10 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t)
DMERR("Cannot calculate initial queue limits");
return r;
}
- dm_table_set_restrictions(t, md->queue, &limits);
+ r = dm_table_set_restrictions(t, md->queue, &limits);
+ if (r)
+ return r;
+
blk_register_queue(md->disk);
return 0;