Diffstat (limited to 'block')
-rw-r--r--   block/bio-integrity.c |  8
-rw-r--r--   block/bio.c           | 28
-rw-r--r--   block/blk-cgroup.c    | 66
-rw-r--r--   block/blk-core.c      |  6
-rw-r--r--   block/blk-mq.c        |  2
-rw-r--r--   block/blk-mq.h        | 32
-rw-r--r--   block/blk-throttle.c  |  9
-rw-r--r--   block/blk-zoned.c     | 69
8 files changed, 175 insertions, 45 deletions
diff --git a/block/bio-integrity.c b/block/bio-integrity.c
index 4db620849515..fb95dbb21dd8 100644
--- a/block/bio-integrity.c
+++ b/block/bio-integrity.c
@@ -276,8 +276,12 @@ bool bio_integrity_prep(struct bio *bio)
 		ret = bio_integrity_add_page(bio, virt_to_page(buf),
 					     bytes, offset);
 
-		if (ret == 0)
-			return false;
+		if (ret == 0) {
+			printk(KERN_ERR "could not attach integrity payload\n");
+			kfree(buf);
+			status = BLK_STS_RESOURCE;
+			goto err_end_io;
+		}
 
 		if (ret < bytes)
 			break;
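The hunk above fixes a memory leak: when bio_integrity_add_page() failed, bio_integrity_prep() returned false without freeing the just-allocated payload buffer. The fix frees the buffer and fails the bio with BLK_STS_RESOURCE through the function's existing error path. A minimal sketch of that cleanup shape, with attach_payload() as a hypothetical stand-in for the real call:

/* Illustrative sketch only, not the full bio_integrity_prep(). */
static bool integrity_prep_sketch(struct bio *bio, void *buf)
{
	blk_status_t status;

	if (attach_payload(bio, buf) == 0) {	/* hypothetical helper */
		printk(KERN_ERR "could not attach integrity payload\n");
		kfree(buf);			/* don't leak the buffer */
		status = BLK_STS_RESOURCE;
		goto err_end_io;
	}
	return true;

err_end_io:
	bio->bi_status = status;		/* fail the bio instead of */
	bio_endio(bio);				/* silently dropping it */
	return false;
}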
diff --git a/block/bio.c b/block/bio.c
index 29cd6cf4da51..299a0e7651ec 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -16,6 +16,7 @@
 #include <linux/workqueue.h>
 #include <linux/cgroup.h>
 #include <linux/blk-cgroup.h>
+#include <linux/highmem.h>
 
 #include <trace/events/block.h>
 #include "blk.h"
@@ -1441,8 +1442,22 @@ void bio_unmap_user(struct bio *bio)
 	bio_put(bio);
 }
 
+static void bio_invalidate_vmalloc_pages(struct bio *bio)
+{
+#ifdef ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE
+	if (bio->bi_private && !op_is_write(bio_op(bio))) {
+		unsigned long i, len = 0;
+
+		for (i = 0; i < bio->bi_vcnt; i++)
+			len += bio->bi_io_vec[i].bv_len;
+		invalidate_kernel_vmap_range(bio->bi_private, len);
+	}
+#endif
+}
+
 static void bio_map_kern_endio(struct bio *bio)
 {
+	bio_invalidate_vmalloc_pages(bio);
 	bio_put(bio);
 }
 
@@ -1463,6 +1478,8 @@ struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len,
 	unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
 	unsigned long start = kaddr >> PAGE_SHIFT;
 	const int nr_pages = end - start;
+	bool is_vmalloc = is_vmalloc_addr(data);
+	struct page *page;
 	int offset, i;
 	struct bio *bio;
 
@@ -1470,6 +1487,11 @@ struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len,
 	if (!bio)
 		return ERR_PTR(-ENOMEM);
 
+	if (is_vmalloc) {
+		flush_kernel_vmap_range(data, len);
+		bio->bi_private = data;
+	}
+
 	offset = offset_in_page(kaddr);
 	for (i = 0; i < nr_pages; i++) {
 		unsigned int bytes = PAGE_SIZE - offset;
@@ -1480,7 +1502,11 @@ struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len,
 		if (bytes > len)
 			bytes = len;
 
-		if (bio_add_pc_page(q, bio, virt_to_page(data), bytes,
+		if (!is_vmalloc)
+			page = virt_to_page(data);
+		else
+			page = vmalloc_to_page(data);
+		if (bio_add_pc_page(q, bio, page, bytes,
 				    offset) < bytes) {
 			/* we don't support partial mappings */
 			bio_put(bio);
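These bio.c changes let bio_map_kern() map vmalloc addresses. The key constraint: virt_to_page() is only valid for linearly mapped memory, while vmalloc pages must be looked up through the kernel page tables, and on architectures with aliasing data caches the vmap alias has to be flushed before the I/O and invalidated after a read. A small sketch of the translation choice the mapping loop now makes:

#include <linux/mm.h>
#include <linux/vmalloc.h>

/* Sketch: resolve a kernel virtual address to its struct page. */
static struct page *kern_addr_to_page(void *data)
{
	if (is_vmalloc_addr(data))
		return vmalloc_to_page(data);	/* walk the kernel page tables */
	return virt_to_page(data);		/* linear-map arithmetic */
}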
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 53b7bd4c7000..24ed26957367 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -29,6 +29,7 @@
 #include <linux/ctype.h>
 #include <linux/blk-cgroup.h>
 #include <linux/tracehook.h>
+#include <linux/psi.h>
 #include "blk.h"
 
 #define MAX_KEY_LEN 100
@@ -47,12 +48,14 @@ struct blkcg blkcg_root;
 EXPORT_SYMBOL_GPL(blkcg_root);
 
 struct cgroup_subsys_state * const blkcg_root_css = &blkcg_root.css;
+EXPORT_SYMBOL_GPL(blkcg_root_css);
 
 static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];
 
 static LIST_HEAD(all_blkcgs);		/* protected by blkcg_pol_mutex */
 
 static bool blkcg_debug_stats = false;
+static struct workqueue_struct *blkcg_punt_bio_wq;
 
 static bool blkcg_policy_enabled(struct request_queue *q,
 				 const struct blkcg_policy *pol)
@@ -87,6 +90,8 @@ static void __blkg_release(struct rcu_head *rcu)
 {
 	struct blkcg_gq *blkg = container_of(rcu, struct blkcg_gq, rcu_head);
 
+	WARN_ON(!bio_list_empty(&blkg->async_bios));
+
 	/* release the blkcg and parent blkg refs this blkg has been holding */
 	css_put(&blkg->blkcg->css);
 	if (blkg->parent)
@@ -112,6 +117,23 @@ static void blkg_release(struct percpu_ref *ref)
 	call_rcu(&blkg->rcu_head, __blkg_release);
 }
 
+static void blkg_async_bio_workfn(struct work_struct *work)
+{
+	struct blkcg_gq *blkg = container_of(work, struct blkcg_gq,
+					     async_bio_work);
+	struct bio_list bios = BIO_EMPTY_LIST;
+	struct bio *bio;
+
+	/* as long as there are pending bios, @blkg can't go away */
+	spin_lock_bh(&blkg->async_bio_lock);
+	bio_list_merge(&bios, &blkg->async_bios);
+	bio_list_init(&blkg->async_bios);
+	spin_unlock_bh(&blkg->async_bio_lock);
+
+	while ((bio = bio_list_pop(&bios)))
+		submit_bio(bio);
+}
+
 /**
  * blkg_alloc - allocate a blkg
  * @blkcg: block cgroup the new blkg is associated with
@@ -140,6 +162,9 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
 
 	blkg->q = q;
 	INIT_LIST_HEAD(&blkg->q_node);
+	spin_lock_init(&blkg->async_bio_lock);
+	bio_list_init(&blkg->async_bios);
+	INIT_WORK(&blkg->async_bio_work, blkg_async_bio_workfn);
 	blkg->blkcg = blkcg;
 
 	for (i = 0; i < BLKCG_MAX_POLS; i++) {
@@ -1526,6 +1551,25 @@ out_unlock:
 }
 EXPORT_SYMBOL_GPL(blkcg_policy_unregister);
 
+bool __blkcg_punt_bio_submit(struct bio *bio)
+{
+	struct blkcg_gq *blkg = bio->bi_blkg;
+
+	/* consume the flag first */
+	bio->bi_opf &= ~REQ_CGROUP_PUNT;
+
+	/* never bounce for the root cgroup */
+	if (!blkg->parent)
+		return false;
+
+	spin_lock_bh(&blkg->async_bio_lock);
+	bio_list_add(&blkg->async_bios, bio);
+	spin_unlock_bh(&blkg->async_bio_lock);
+
+	queue_work(blkcg_punt_bio_wq, &blkg->async_bio_work);
+	return true;
+}
+
 /*
  * Scale the accumulated delay based on how long it has been since we updated
  * the delay. We only call this when we are adding delay, in case it's been a
@@ -1587,6 +1631,7 @@ static void blkcg_scale_delay(struct blkcg_gq *blkg, u64 now)
  */
 static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay)
 {
+	unsigned long pflags;
 	u64 now = ktime_to_ns(ktime_get());
 	u64 exp;
 	u64 delay_nsec = 0;
@@ -1613,11 +1658,8 @@ static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay)
 	 */
 	delay_nsec = min_t(u64, delay_nsec, 250 * NSEC_PER_MSEC);
 
-	/*
-	 * TODO: the use_memdelay flag is going to be for the upcoming psi stuff
-	 * that hasn't landed upstream yet. Once that stuff is in place we need
-	 * to do a psi_memstall_enter/leave if memdelay is set.
-	 */
+	if (use_memdelay)
+		psi_memstall_enter(&pflags);
 
 	exp = ktime_add_ns(now, delay_nsec);
 	tok = io_schedule_prepare();
@@ -1627,6 +1669,9 @@ static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay)
 			break;
 	} while (!fatal_signal_pending(current));
 	io_schedule_finish(tok);
+
+	if (use_memdelay)
+		psi_memstall_leave(&pflags);
 }
 
 /**
@@ -1726,5 +1771,16 @@ void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta)
 	atomic64_add(delta, &blkg->delay_nsec);
 }
 
+static int __init blkcg_init(void)
+{
+	blkcg_punt_bio_wq = alloc_workqueue("blkcg_punt_bio",
+					    WQ_MEM_RECLAIM | WQ_FREEZABLE |
+					    WQ_UNBOUND | WQ_SYSFS, 0);
+	if (!blkcg_punt_bio_wq)
+		return -ENOMEM;
+	return 0;
+}
+subsys_initcall(blkcg_init);
+
 module_param(blkcg_debug_stats, bool, 0644);
 MODULE_PARM_DESC(blkcg_debug_stats, "True if you want debug stats, false if not");
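The blk-cgroup.c additions implement bio punting: a bio flagged with REQ_CGROUP_PUNT is parked on its blkg's async_bios list and resubmitted from the blkcg_punt_bio workqueue, so the actual submission runs in a context charged to the bio's cgroup rather than the punting task. The fast-path check lives in an inline wrapper in the blk-cgroup header; a sketch of its shape, reproduced here from memory:

/* Sketch of the fast-path wrapper around __blkcg_punt_bio_submit(). */
static inline bool blkcg_punt_bio_submit(struct bio *bio)
{
	if (bio->bi_opf & REQ_CGROUP_PUNT)
		return __blkcg_punt_bio_submit(bio);	/* punted to workqueue */
	return false;					/* submit inline */
}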
diff --git a/block/blk-core.c b/block/blk-core.c
index 5d1fc8e17dd1..d0cc6e14d2f0 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -117,6 +117,7 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
 	rq->internal_tag = -1;
 	rq->start_time_ns = ktime_get_ns();
 	rq->part = NULL;
+	refcount_set(&rq->ref, 1);
 }
 EXPORT_SYMBOL(blk_rq_init);
 
@@ -687,7 +688,7 @@ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
 	struct request *rq;
 	struct list_head *plug_list;
 
-	plug = current->plug;
+	plug = blk_mq_plug(q, bio);
 	if (!plug)
 		return false;
 
@@ -1127,6 +1128,9 @@ EXPORT_SYMBOL_GPL(direct_make_request);
  */
 blk_qc_t submit_bio(struct bio *bio)
 {
+	if (blkcg_punt_bio_submit(bio))
+		return BLK_QC_T_NONE;
+
 	/*
 	 * If it's a regular read/write or a barrier with data attached,
 	 * go through the normal accounting stuff before submission.
diff --git a/block/blk-mq.c b/block/blk-mq.c
index e5ef40c603ca..b038ec680e84 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1973,7 +1973,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
 
 	blk_mq_bio_to_request(rq, bio, nr_segs);
 
-	plug = current->plug;
+	plug = blk_mq_plug(q, bio);
 	if (unlikely(is_flush_fua)) {
 		/* bypass scheduler for flush rq */
 		blk_insert_flush(rq);
diff --git a/block/blk-mq.h b/block/blk-mq.h
index f4bf5161333e..32c62c64e6c2 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -233,4 +233,36 @@ static inline void blk_mq_clear_mq_map(struct blk_mq_queue_map *qmap)
 		qmap->mq_map[cpu] = 0;
 }
 
+/*
+ * blk_mq_plug() - Get caller context plug
+ * @q: request queue
+ * @bio: the bio being submitted by the caller context
+ *
+ * Plugging, by design, may delay the insertion of BIOs into the elevator in
+ * order to increase BIO merging opportunities. This however can cause BIO
+ * insertion order to change from the order in which submit_bio() is being
+ * executed in the case of multiple contexts concurrently issuing BIOs to a
+ * device, even if these contexts are synchronized to tightly control BIO
+ * issuing order. While this is not a problem with regular block devices,
+ * this ordering change can cause write BIO failures with zoned block devices
+ * as these require sequential write patterns to zones. Prevent this from
+ * happening by ignoring the plug state of a BIO issuing context if the
+ * target request queue is for a zoned block device and the BIO to plug is a
+ * write operation.
+ *
+ * Return current->plug if the bio can be plugged and NULL otherwise
+ */
+static inline struct blk_plug *blk_mq_plug(struct request_queue *q,
+					   struct bio *bio)
+{
+	/*
+	 * For regular block devices or read operations, use the context plug
+	 * which may be NULL if blk_start_plug() was not executed.
+	 */
+	if (!blk_queue_is_zoned(q) || !op_is_write(bio_op(bio)))
+		return current->plug;
+
+	/* Zoned block device write operation case: do not plug the BIO */
+	return NULL;
+}
+
 #endif
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 9ea7c0ecad10..8ab6c8153223 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -881,13 +881,10 @@ static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio,
 	unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
 	u64 tmp;
 
-	jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];
-
-	/* Slice has just started. Consider one slice interval */
-	if (!jiffy_elapsed)
-		jiffy_elapsed_rnd = tg->td->throtl_slice;
+	jiffy_elapsed = jiffies - tg->slice_start[rw];
 
-	jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, tg->td->throtl_slice);
+	/* Round up to the next throttle slice, wait time must be nonzero */
+	jiffy_elapsed_rnd = roundup(jiffy_elapsed + 1, tg->td->throtl_slice);
 
 	/*
 	 * jiffy_elapsed_rnd should not be a big value as minimum iops can be
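blk_mq_plug() above replaces the direct current->plug dereferences in blk_attempt_plug_merge() and blk_mq_make_request(), so writes to zoned devices skip plugging and keep their issue order. A sketch of how a submission path consumes it (example_submit() is hypothetical):

/* Sketch of a submission-path call site after this change. */
static void example_submit(struct request_queue *q, struct bio *bio)
{
	struct blk_plug *plug = blk_mq_plug(q, bio);

	if (plug) {
		/* Batching is safe: regular device, or a read. */
	} else {
		/* Zoned write (or no plug): issue immediately, in order. */
	}
}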
diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index ae7e91bd0618..6c503824ba3f 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -14,6 +14,9 @@
 #include <linux/rbtree.h>
 #include <linux/blkdev.h>
 #include <linux/blk-mq.h>
+#include <linux/mm.h>
+#include <linux/vmalloc.h>
+#include <linux/sched/mm.h>
 
 #include "blk.h"
 
@@ -70,7 +73,7 @@ EXPORT_SYMBOL_GPL(__blk_req_zone_write_unlock);
 static inline unsigned int __blkdev_nr_zones(struct request_queue *q,
 					     sector_t nr_sectors)
 {
-	unsigned long zone_sectors = blk_queue_zone_sectors(q);
+	sector_t zone_sectors = blk_queue_zone_sectors(q);
 
 	return (nr_sectors + zone_sectors - 1) >> ilog2(zone_sectors);
 }
@@ -117,8 +120,7 @@ static bool blkdev_report_zone(struct block_device *bdev, struct blk_zone *rep)
 }
 
 static int blk_report_zones(struct gendisk *disk, sector_t sector,
-			    struct blk_zone *zones, unsigned int *nr_zones,
-			    gfp_t gfp_mask)
+			    struct blk_zone *zones, unsigned int *nr_zones)
 {
 	struct request_queue *q = disk->queue;
 	unsigned int z = 0, n, nrz = *nr_zones;
@@ -127,8 +129,7 @@ static int blk_report_zones(struct gendisk *disk, sector_t sector,
 
 	while (z < nrz && sector < capacity) {
 		n = nrz - z;
-		ret = disk->fops->report_zones(disk, sector, &zones[z], &n,
-					       gfp_mask);
+		ret = disk->fops->report_zones(disk, sector, &zones[z], &n);
 		if (ret)
 			return ret;
 		if (!n)
@@ -149,17 +150,18 @@ static int blk_report_zones(struct gendisk *disk, sector_t sector,
  * @sector:	Sector from which to report zones
  * @zones:	Array of zone structures where to return the zones information
  * @nr_zones:	Number of zone structures in the zone array
- * @gfp_mask:	Memory allocation flags (for bio_alloc)
  *
  * Description:
  *    Get zone information starting from the zone containing @sector.
  *    The number of zone information reported may be less than the number
  *    requested by @nr_zones. The number of zones actually reported is
  *    returned in @nr_zones.
+ *    The caller must use memalloc_noXX_save/restore() calls to control
+ *    memory allocations done within this function (zone array and command
+ *    buffer allocation by the device driver).
  */
 int blkdev_report_zones(struct block_device *bdev, sector_t sector,
-			struct blk_zone *zones, unsigned int *nr_zones,
-			gfp_t gfp_mask)
+			struct blk_zone *zones, unsigned int *nr_zones)
 {
 	struct request_queue *q = bdev_get_queue(bdev);
 	unsigned int i, nrz;
@@ -184,7 +186,7 @@ int blkdev_report_zones(struct block_device *bdev, sector_t sector,
 	nrz = min(*nr_zones,
 		  __blkdev_nr_zones(q, bdev->bd_part->nr_sects - sector));
 	ret = blk_report_zones(bdev->bd_disk, get_start_sect(bdev) + sector,
-			       zones, &nrz, gfp_mask);
+			       zones, &nrz);
 	if (ret)
 		return ret;
 
@@ -305,9 +307,7 @@ int blkdev_report_zones_ioctl(struct block_device *bdev, fmode_t mode,
 	if (!zones)
 		return -ENOMEM;
 
-	ret = blkdev_report_zones(bdev, rep.sector,
-				  zones, &rep.nr_zones,
-				  GFP_KERNEL);
+	ret = blkdev_report_zones(bdev, rep.sector, zones, &rep.nr_zones);
 	if (ret)
 		goto out;
 
@@ -373,22 +373,25 @@ static inline unsigned long *blk_alloc_zone_bitmap(int node,
  * Allocate an array of struct blk_zone to get nr_zones zone information.
  * The allocated array may be smaller than nr_zones.
  */
-static struct blk_zone *blk_alloc_zones(int node, unsigned int *nr_zones)
+static struct blk_zone *blk_alloc_zones(unsigned int *nr_zones)
 {
-	size_t size = *nr_zones * sizeof(struct blk_zone);
-	struct page *page;
-	int order;
-
-	for (order = get_order(size); order >= 0; order--) {
-		page = alloc_pages_node(node, GFP_NOIO | __GFP_ZERO, order);
-		if (page) {
-			*nr_zones = min_t(unsigned int, *nr_zones,
-					  (PAGE_SIZE << order) / sizeof(struct blk_zone));
-			return page_address(page);
-		}
+	struct blk_zone *zones;
+	size_t nrz = min(*nr_zones, BLK_ZONED_REPORT_MAX_ZONES);
+
+	/*
+	 * GFP_KERNEL here is meaningless as the caller task context has
+	 * the PF_MEMALLOC_NOIO flag set in blk_revalidate_disk_zones()
+	 * with memalloc_noio_save().
+	 */
+	zones = kvcalloc(nrz, sizeof(struct blk_zone), GFP_KERNEL);
+	if (!zones) {
+		*nr_zones = 0;
+		return NULL;
 	}
-	return NULL;
+
+	*nr_zones = nrz;
+
+	return zones;
 }
 
 void blk_queue_free_zone_bitmaps(struct request_queue *q)
@@ -415,6 +418,7 @@ int blk_revalidate_disk_zones(struct gendisk *disk)
 	unsigned long *seq_zones_wlock = NULL, *seq_zones_bitmap = NULL;
 	unsigned int i, rep_nr_zones = 0, z = 0, nrz;
 	struct blk_zone *zones = NULL;
+	unsigned int noio_flag;
 	sector_t sector = 0;
 	int ret = 0;
 
@@ -427,6 +431,12 @@ int blk_revalidate_disk_zones(struct gendisk *disk)
 		return 0;
 	}
 
+	/*
+	 * Ensure that all memory allocations in this context are done as
+	 * if GFP_NOIO was specified.
+	 */
+	noio_flag = memalloc_noio_save();
+
 	if (!blk_queue_is_zoned(q) || !nr_zones) {
 		nr_zones = 0;
 		goto update;
@@ -443,13 +453,13 @@ int blk_revalidate_disk_zones(struct gendisk *disk)
 
 	/* Get zone information and initialize seq_zones_bitmap */
 	rep_nr_zones = nr_zones;
-	zones = blk_alloc_zones(q->node, &rep_nr_zones);
+	zones = blk_alloc_zones(&rep_nr_zones);
 	if (!zones)
 		goto out;
 
 	while (z < nr_zones) {
 		nrz = min(nr_zones - z, rep_nr_zones);
-		ret = blk_report_zones(disk, sector, zones, &nrz, GFP_NOIO);
+		ret = blk_report_zones(disk, sector, zones, &nrz);
 		if (ret)
 			goto out;
 		if (!nrz)
@@ -480,8 +490,9 @@ update:
 	blk_mq_unfreeze_queue(q);
 
 out:
-	free_pages((unsigned long)zones,
-		   get_order(rep_nr_zones * sizeof(struct blk_zone)));
+	memalloc_noio_restore(noio_flag);
+
+	kvfree(zones);
 	kfree(seq_zones_wlock);
 	kfree(seq_zones_bitmap);
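The blk-zoned.c rework drops the gfp_mask plumbing in favor of scoped allocation masking: blk_revalidate_disk_zones() brackets the whole revalidation with memalloc_noio_save()/memalloc_noio_restore(), so every allocation underneath, including the kvcalloc(..., GFP_KERNEL) in blk_alloc_zones() and any driver-side command buffers, implicitly behaves as GFP_NOIO. A sketch of the pattern, with do_report_work() as a hypothetical stand-in for the allocation and reporting loop:

#include <linux/sched/mm.h>

/* Sketch: scoped GFP_NOIO instead of threading gfp_mask everywhere. */
static int revalidate_sketch(struct gendisk *disk)
{
	unsigned int noio_flag;
	int ret;

	noio_flag = memalloc_noio_save();	/* GFP_KERNEL now acts as GFP_NOIO */
	ret = do_report_work(disk);		/* hypothetical: alloc + report zones */
	memalloc_noio_restore(noio_flag);

	return ret;
}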