summaryrefslogtreecommitdiffstats
path: root/block
diff options
context:
space:
mode:
Diffstat (limited to 'block')
-rw-r--r--block/bio-integrity.c8
-rw-r--r--block/bio.c28
-rw-r--r--block/blk-cgroup.c66
-rw-r--r--block/blk-core.c6
-rw-r--r--block/blk-mq.c2
-rw-r--r--block/blk-mq.h32
-rw-r--r--block/blk-throttle.c9
-rw-r--r--block/blk-zoned.c69
8 files changed, 175 insertions, 45 deletions
diff --git a/block/bio-integrity.c b/block/bio-integrity.c
index 4db620849515..fb95dbb21dd8 100644
--- a/block/bio-integrity.c
+++ b/block/bio-integrity.c
@@ -276,8 +276,12 @@ bool bio_integrity_prep(struct bio *bio)
ret = bio_integrity_add_page(bio, virt_to_page(buf),
bytes, offset);
- if (ret == 0)
- return false;
+ if (ret == 0) {
+ printk(KERN_ERR "could not attach integrity payload\n");
+ kfree(buf);
+ status = BLK_STS_RESOURCE;
+ goto err_end_io;
+ }
if (ret < bytes)
break;
diff --git a/block/bio.c b/block/bio.c
index 29cd6cf4da51..299a0e7651ec 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -16,6 +16,7 @@
#include <linux/workqueue.h>
#include <linux/cgroup.h>
#include <linux/blk-cgroup.h>
+#include <linux/highmem.h>
#include <trace/events/block.h>
#include "blk.h"
@@ -1441,8 +1442,22 @@ void bio_unmap_user(struct bio *bio)
bio_put(bio);
}
+static void bio_invalidate_vmalloc_pages(struct bio *bio)
+{
+#ifdef ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE
+ if (bio->bi_private && !op_is_write(bio_op(bio))) {
+ unsigned long i, len = 0;
+
+ for (i = 0; i < bio->bi_vcnt; i++)
+ len += bio->bi_io_vec[i].bv_len;
+ invalidate_kernel_vmap_range(bio->bi_private, len);
+ }
+#endif
+}
+
static void bio_map_kern_endio(struct bio *bio)
{
+ bio_invalidate_vmalloc_pages(bio);
bio_put(bio);
}
@@ -1463,6 +1478,8 @@ struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len,
unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
unsigned long start = kaddr >> PAGE_SHIFT;
const int nr_pages = end - start;
+ bool is_vmalloc = is_vmalloc_addr(data);
+ struct page *page;
int offset, i;
struct bio *bio;
@@ -1470,6 +1487,11 @@ struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len,
if (!bio)
return ERR_PTR(-ENOMEM);
+ if (is_vmalloc) {
+ flush_kernel_vmap_range(data, len);
+ bio->bi_private = data;
+ }
+
offset = offset_in_page(kaddr);
for (i = 0; i < nr_pages; i++) {
unsigned int bytes = PAGE_SIZE - offset;
@@ -1480,7 +1502,11 @@ struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len,
if (bytes > len)
bytes = len;
- if (bio_add_pc_page(q, bio, virt_to_page(data), bytes,
+ if (!is_vmalloc)
+ page = virt_to_page(data);
+ else
+ page = vmalloc_to_page(data);
+ if (bio_add_pc_page(q, bio, page, bytes,
offset) < bytes) {
/* we don't support partial mappings */
bio_put(bio);
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 53b7bd4c7000..24ed26957367 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -29,6 +29,7 @@
#include <linux/ctype.h>
#include <linux/blk-cgroup.h>
#include <linux/tracehook.h>
+#include <linux/psi.h>
#include "blk.h"
#define MAX_KEY_LEN 100
@@ -47,12 +48,14 @@ struct blkcg blkcg_root;
EXPORT_SYMBOL_GPL(blkcg_root);
struct cgroup_subsys_state * const blkcg_root_css = &blkcg_root.css;
+EXPORT_SYMBOL_GPL(blkcg_root_css);
static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];
static LIST_HEAD(all_blkcgs); /* protected by blkcg_pol_mutex */
static bool blkcg_debug_stats = false;
+static struct workqueue_struct *blkcg_punt_bio_wq;
static bool blkcg_policy_enabled(struct request_queue *q,
const struct blkcg_policy *pol)
@@ -87,6 +90,8 @@ static void __blkg_release(struct rcu_head *rcu)
{
struct blkcg_gq *blkg = container_of(rcu, struct blkcg_gq, rcu_head);
+ WARN_ON(!bio_list_empty(&blkg->async_bios));
+
/* release the blkcg and parent blkg refs this blkg has been holding */
css_put(&blkg->blkcg->css);
if (blkg->parent)
@@ -112,6 +117,23 @@ static void blkg_release(struct percpu_ref *ref)
call_rcu(&blkg->rcu_head, __blkg_release);
}
+static void blkg_async_bio_workfn(struct work_struct *work)
+{
+ struct blkcg_gq *blkg = container_of(work, struct blkcg_gq,
+ async_bio_work);
+ struct bio_list bios = BIO_EMPTY_LIST;
+ struct bio *bio;
+
+ /* as long as there are pending bios, @blkg can't go away */
+ spin_lock_bh(&blkg->async_bio_lock);
+ bio_list_merge(&bios, &blkg->async_bios);
+ bio_list_init(&blkg->async_bios);
+ spin_unlock_bh(&blkg->async_bio_lock);
+
+ while ((bio = bio_list_pop(&bios)))
+ submit_bio(bio);
+}
+
/**
* blkg_alloc - allocate a blkg
* @blkcg: block cgroup the new blkg is associated with
@@ -140,6 +162,9 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
blkg->q = q;
INIT_LIST_HEAD(&blkg->q_node);
+ spin_lock_init(&blkg->async_bio_lock);
+ bio_list_init(&blkg->async_bios);
+ INIT_WORK(&blkg->async_bio_work, blkg_async_bio_workfn);
blkg->blkcg = blkcg;
for (i = 0; i < BLKCG_MAX_POLS; i++) {
@@ -1526,6 +1551,25 @@ out_unlock:
}
EXPORT_SYMBOL_GPL(blkcg_policy_unregister);
+bool __blkcg_punt_bio_submit(struct bio *bio)
+{
+ struct blkcg_gq *blkg = bio->bi_blkg;
+
+ /* consume the flag first */
+ bio->bi_opf &= ~REQ_CGROUP_PUNT;
+
+ /* never bounce for the root cgroup */
+ if (!blkg->parent)
+ return false;
+
+ spin_lock_bh(&blkg->async_bio_lock);
+ bio_list_add(&blkg->async_bios, bio);
+ spin_unlock_bh(&blkg->async_bio_lock);
+
+ queue_work(blkcg_punt_bio_wq, &blkg->async_bio_work);
+ return true;
+}
+
/*
* Scale the accumulated delay based on how long it has been since we updated
* the delay. We only call this when we are adding delay, in case it's been a
@@ -1587,6 +1631,7 @@ static void blkcg_scale_delay(struct blkcg_gq *blkg, u64 now)
*/
static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay)
{
+ unsigned long pflags;
u64 now = ktime_to_ns(ktime_get());
u64 exp;
u64 delay_nsec = 0;
@@ -1613,11 +1658,8 @@ static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay)
*/
delay_nsec = min_t(u64, delay_nsec, 250 * NSEC_PER_MSEC);
- /*
- * TODO: the use_memdelay flag is going to be for the upcoming psi stuff
- * that hasn't landed upstream yet. Once that stuff is in place we need
- * to do a psi_memstall_enter/leave if memdelay is set.
- */
+ if (use_memdelay)
+ psi_memstall_enter(&pflags);
exp = ktime_add_ns(now, delay_nsec);
tok = io_schedule_prepare();
@@ -1627,6 +1669,9 @@ static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay)
break;
} while (!fatal_signal_pending(current));
io_schedule_finish(tok);
+
+ if (use_memdelay)
+ psi_memstall_leave(&pflags);
}
/**
@@ -1726,5 +1771,16 @@ void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta)
atomic64_add(delta, &blkg->delay_nsec);
}
+static int __init blkcg_init(void)
+{
+ blkcg_punt_bio_wq = alloc_workqueue("blkcg_punt_bio",
+ WQ_MEM_RECLAIM | WQ_FREEZABLE |
+ WQ_UNBOUND | WQ_SYSFS, 0);
+ if (!blkcg_punt_bio_wq)
+ return -ENOMEM;
+ return 0;
+}
+subsys_initcall(blkcg_init);
+
module_param(blkcg_debug_stats, bool, 0644);
MODULE_PARM_DESC(blkcg_debug_stats, "True if you want debug stats, false if not");
diff --git a/block/blk-core.c b/block/blk-core.c
index 5d1fc8e17dd1..d0cc6e14d2f0 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -117,6 +117,7 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
rq->internal_tag = -1;
rq->start_time_ns = ktime_get_ns();
rq->part = NULL;
+ refcount_set(&rq->ref, 1);
}
EXPORT_SYMBOL(blk_rq_init);
@@ -687,7 +688,7 @@ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
struct request *rq;
struct list_head *plug_list;
- plug = current->plug;
+ plug = blk_mq_plug(q, bio);
if (!plug)
return false;
@@ -1127,6 +1128,9 @@ EXPORT_SYMBOL_GPL(direct_make_request);
*/
blk_qc_t submit_bio(struct bio *bio)
{
+ if (blkcg_punt_bio_submit(bio))
+ return BLK_QC_T_NONE;
+
/*
* If it's a regular read/write or a barrier with data attached,
* go through the normal accounting stuff before submission.
diff --git a/block/blk-mq.c b/block/blk-mq.c
index e5ef40c603ca..b038ec680e84 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1973,7 +1973,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
blk_mq_bio_to_request(rq, bio, nr_segs);
- plug = current->plug;
+ plug = blk_mq_plug(q, bio);
if (unlikely(is_flush_fua)) {
/* bypass scheduler for flush rq */
blk_insert_flush(rq);
diff --git a/block/blk-mq.h b/block/blk-mq.h
index f4bf5161333e..32c62c64e6c2 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -233,4 +233,36 @@ static inline void blk_mq_clear_mq_map(struct blk_mq_queue_map *qmap)
qmap->mq_map[cpu] = 0;
}
+/*
+ * blk_mq_plug() - Get caller context plug
+ * @q: request queue
+ * @bio : the bio being submitted by the caller context
+ *
+ * Plugging, by design, may delay the insertion of BIOs into the elevator in
+ * order to increase BIO merging opportunities. This however can cause BIO
+ * insertion order to change from the order in which submit_bio() is being
+ * executed in the case of multiple contexts concurrently issuing BIOs to a
+ * device, even if these context are synchronized to tightly control BIO issuing
+ * order. While this is not a problem with regular block devices, this ordering
+ * change can cause write BIO failures with zoned block devices as these
+ * require sequential write patterns to zones. Prevent this from happening by
+ * ignoring the plug state of a BIO issuing context if the target request queue
+ * is for a zoned block device and the BIO to plug is a write operation.
+ *
+ * Return current->plug if the bio can be plugged and NULL otherwise
+ */
+static inline struct blk_plug *blk_mq_plug(struct request_queue *q,
+ struct bio *bio)
+{
+ /*
+ * For regular block devices or read operations, use the context plug
+ * which may be NULL if blk_start_plug() was not executed.
+ */
+ if (!blk_queue_is_zoned(q) || !op_is_write(bio_op(bio)))
+ return current->plug;
+
+ /* Zoned block device write operation case: do not plug the BIO */
+ return NULL;
+}
+
#endif
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 9ea7c0ecad10..8ab6c8153223 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -881,13 +881,10 @@ static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio,
unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
u64 tmp;
- jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];
-
- /* Slice has just started. Consider one slice interval */
- if (!jiffy_elapsed)
- jiffy_elapsed_rnd = tg->td->throtl_slice;
+ jiffy_elapsed = jiffies - tg->slice_start[rw];
- jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, tg->td->throtl_slice);
+ /* Round up to the next throttle slice, wait time must be nonzero */
+ jiffy_elapsed_rnd = roundup(jiffy_elapsed + 1, tg->td->throtl_slice);
/*
* jiffy_elapsed_rnd should not be a big value as minimum iops can be
diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index ae7e91bd0618..6c503824ba3f 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -14,6 +14,9 @@
#include <linux/rbtree.h>
#include <linux/blkdev.h>
#include <linux/blk-mq.h>
+#include <linux/mm.h>
+#include <linux/vmalloc.h>
+#include <linux/sched/mm.h>
#include "blk.h"
@@ -70,7 +73,7 @@ EXPORT_SYMBOL_GPL(__blk_req_zone_write_unlock);
static inline unsigned int __blkdev_nr_zones(struct request_queue *q,
sector_t nr_sectors)
{
- unsigned long zone_sectors = blk_queue_zone_sectors(q);
+ sector_t zone_sectors = blk_queue_zone_sectors(q);
return (nr_sectors + zone_sectors - 1) >> ilog2(zone_sectors);
}
@@ -117,8 +120,7 @@ static bool blkdev_report_zone(struct block_device *bdev, struct blk_zone *rep)
}
static int blk_report_zones(struct gendisk *disk, sector_t sector,
- struct blk_zone *zones, unsigned int *nr_zones,
- gfp_t gfp_mask)
+ struct blk_zone *zones, unsigned int *nr_zones)
{
struct request_queue *q = disk->queue;
unsigned int z = 0, n, nrz = *nr_zones;
@@ -127,8 +129,7 @@ static int blk_report_zones(struct gendisk *disk, sector_t sector,
while (z < nrz && sector < capacity) {
n = nrz - z;
- ret = disk->fops->report_zones(disk, sector, &zones[z], &n,
- gfp_mask);
+ ret = disk->fops->report_zones(disk, sector, &zones[z], &n);
if (ret)
return ret;
if (!n)
@@ -149,17 +150,18 @@ static int blk_report_zones(struct gendisk *disk, sector_t sector,
* @sector: Sector from which to report zones
* @zones: Array of zone structures where to return the zones information
* @nr_zones: Number of zone structures in the zone array
- * @gfp_mask: Memory allocation flags (for bio_alloc)
*
* Description:
* Get zone information starting from the zone containing @sector.
* The number of zone information reported may be less than the number
* requested by @nr_zones. The number of zones actually reported is
* returned in @nr_zones.
+ * The caller must use memalloc_noXX_save/restore() calls to control
+ * memory allocations done within this function (zone array and command
+ * buffer allocation by the device driver).
*/
int blkdev_report_zones(struct block_device *bdev, sector_t sector,
- struct blk_zone *zones, unsigned int *nr_zones,
- gfp_t gfp_mask)
+ struct blk_zone *zones, unsigned int *nr_zones)
{
struct request_queue *q = bdev_get_queue(bdev);
unsigned int i, nrz;
@@ -184,7 +186,7 @@ int blkdev_report_zones(struct block_device *bdev, sector_t sector,
nrz = min(*nr_zones,
__blkdev_nr_zones(q, bdev->bd_part->nr_sects - sector));
ret = blk_report_zones(bdev->bd_disk, get_start_sect(bdev) + sector,
- zones, &nrz, gfp_mask);
+ zones, &nrz);
if (ret)
return ret;
@@ -305,9 +307,7 @@ int blkdev_report_zones_ioctl(struct block_device *bdev, fmode_t mode,
if (!zones)
return -ENOMEM;
- ret = blkdev_report_zones(bdev, rep.sector,
- zones, &rep.nr_zones,
- GFP_KERNEL);
+ ret = blkdev_report_zones(bdev, rep.sector, zones, &rep.nr_zones);
if (ret)
goto out;
@@ -373,22 +373,25 @@ static inline unsigned long *blk_alloc_zone_bitmap(int node,
* Allocate an array of struct blk_zone to get nr_zones zone information.
* The allocated array may be smaller than nr_zones.
*/
-static struct blk_zone *blk_alloc_zones(int node, unsigned int *nr_zones)
+static struct blk_zone *blk_alloc_zones(unsigned int *nr_zones)
{
- size_t size = *nr_zones * sizeof(struct blk_zone);
- struct page *page;
- int order;
-
- for (order = get_order(size); order >= 0; order--) {
- page = alloc_pages_node(node, GFP_NOIO | __GFP_ZERO, order);
- if (page) {
- *nr_zones = min_t(unsigned int, *nr_zones,
- (PAGE_SIZE << order) / sizeof(struct blk_zone));
- return page_address(page);
- }
+ struct blk_zone *zones;
+ size_t nrz = min(*nr_zones, BLK_ZONED_REPORT_MAX_ZONES);
+
+ /*
+ * GFP_KERNEL here is meaningless as the caller task context has
+ * the PF_MEMALLOC_NOIO flag set in blk_revalidate_disk_zones()
+ * with memalloc_noio_save().
+ */
+ zones = kvcalloc(nrz, sizeof(struct blk_zone), GFP_KERNEL);
+ if (!zones) {
+ *nr_zones = 0;
+ return NULL;
}
- return NULL;
+ *nr_zones = nrz;
+
+ return zones;
}
void blk_queue_free_zone_bitmaps(struct request_queue *q)
@@ -415,6 +418,7 @@ int blk_revalidate_disk_zones(struct gendisk *disk)
unsigned long *seq_zones_wlock = NULL, *seq_zones_bitmap = NULL;
unsigned int i, rep_nr_zones = 0, z = 0, nrz;
struct blk_zone *zones = NULL;
+ unsigned int noio_flag;
sector_t sector = 0;
int ret = 0;
@@ -427,6 +431,12 @@ int blk_revalidate_disk_zones(struct gendisk *disk)
return 0;
}
+ /*
+ * Ensure that all memory allocations in this context are done as
+ * if GFP_NOIO was specified.
+ */
+ noio_flag = memalloc_noio_save();
+
if (!blk_queue_is_zoned(q) || !nr_zones) {
nr_zones = 0;
goto update;
@@ -443,13 +453,13 @@ int blk_revalidate_disk_zones(struct gendisk *disk)
/* Get zone information and initialize seq_zones_bitmap */
rep_nr_zones = nr_zones;
- zones = blk_alloc_zones(q->node, &rep_nr_zones);
+ zones = blk_alloc_zones(&rep_nr_zones);
if (!zones)
goto out;
while (z < nr_zones) {
nrz = min(nr_zones - z, rep_nr_zones);
- ret = blk_report_zones(disk, sector, zones, &nrz, GFP_NOIO);
+ ret = blk_report_zones(disk, sector, zones, &nrz);
if (ret)
goto out;
if (!nrz)
@@ -480,8 +490,9 @@ update:
blk_mq_unfreeze_queue(q);
out:
- free_pages((unsigned long)zones,
- get_order(rep_nr_zones * sizeof(struct blk_zone)));
+ memalloc_noio_restore(noio_flag);
+
+ kvfree(zones);
kfree(seq_zones_wlock);
kfree(seq_zones_bitmap);