diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2019-07-15 21:20:52 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2019-07-15 21:20:52 -0700 |
commit | 9637d517347e80ee2fe1c5d8ce45ba1b88d8b5cd (patch) | |
tree | 3cee2a1d8b3c6ea466924517307a1f98ada1e92f | |
parent | 273cbf61c3ddee9574ef1f4959b9bc6db5b24271 (diff) | |
parent | 787c79d6393fc028887cc1b6066915f0b094e92f (diff) | |
download | linux-9637d517347e80ee2fe1c5d8ce45ba1b88d8b5cd.tar.bz2 |
Merge tag 'for-linus-20190715' of git://git.kernel.dk/linux-block
Pull more block updates from Jens Axboe:
"A later pull request with some followup items. I had some vacation
coming up to the merge window, so certain items were delayed a
bit. This pull request also contains fixes that came in within the
last few days of the merge window, which I didn't want to push right
before sending you a pull request.
This contains:
- NVMe pull request, mostly fixes, but also a few minor items on the
feature side that were timing constrained (Christoph et al)
- Report zones fixes (Damien)
- Removal of dead code (Damien)
- Turn on cgroup psi memstall (Josef)
- block cgroup MAINTAINERS entry (Konstantin)
- Flush init fix (Josef)
- blk-throttle low iops timing fix (Konstantin)
- nbd resize fixes (Mike)
- nbd 0 blocksize crash fix (Xiubo)
- block integrity error leak fix (Wenwen)
- blk-cgroup writeback and priority inheritance fixes (Tejun)"
* tag 'for-linus-20190715' of git://git.kernel.dk/linux-block: (42 commits)
MAINTAINERS: add entry for block io cgroup
null_blk: fixup ->report_zones() for !CONFIG_BLK_DEV_ZONED
block: Limit zone array allocation size
sd_zbc: Fix report zones buffer allocation
block: Kill gfp_t argument of blkdev_report_zones()
block: Allow mapping of vmalloc-ed buffers
block/bio-integrity: fix a memory leak bug
nvme: fix NULL deref for fabrics options
nbd: add netlink reconfigure resize support
nbd: fix crash when the blksize is zero
block: Disable write plugging for zoned block devices
block: Fix elevator name declaration
block: Remove unused definitions
nvme: fix regression upon hot device removal and insertion
blk-throttle: fix zero wait time for iops throttled group
block: Fix potential overflow in blk_report_zones()
blkcg: implement REQ_CGROUP_PUNT
blkcg, writeback: Implement wbc_blkcg_css()
blkcg, writeback: Add wbc->no_cgroup_owner
blkcg, writeback: Rename wbc_account_io() to wbc_account_cgroup_owner()
...
50 files changed, 660 insertions, 210 deletions
diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index a9548de56ac9..8269e869cb1e 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -2124,7 +2124,7 @@ following two functions. a queue (device) has been associated with the bio and before submission. - wbc_account_io(@wbc, @page, @bytes) + wbc_account_cgroup_owner(@wbc, @page, @bytes) Should be called for each data segment being written out. While this function doesn't care exactly when it's called during the writeback session, it's the easiest and most diff --git a/Documentation/block/biodoc.txt b/Documentation/block/biodoc.txt index 31c177663ed5..5a4a799fe61b 100644 --- a/Documentation/block/biodoc.txt +++ b/Documentation/block/biodoc.txt @@ -843,11 +843,6 @@ elevator_latter_req_fn These return the request before or after the elevator_completed_req_fn called when a request is completed. -elevator_may_queue_fn returns true if the scheduler wants to allow the - current context to queue a new request even if - it is over the queue limit. This must be used - very carefully!! - elevator_set_req_fn elevator_put_req_fn Must be used to allocate and free any elevator specific storage for a request. 
diff --git a/MAINTAINERS b/MAINTAINERS index 4aee3a1de331..6debe6829716 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4183,6 +4183,19 @@ S: Maintained F: mm/memcontrol.c F: mm/swap_cgroup.c +CONTROL GROUP - BLOCK IO CONTROLLER (BLKIO) +M: Tejun Heo <tj@kernel.org> +M: Jens Axboe <axboe@kernel.dk> +L: cgroups@vger.kernel.org +L: linux-block@vger.kernel.org +T: git git://git.kernel.dk/linux-block +F: Documentation/cgroup-v1/blkio-controller.rst +F: block/blk-cgroup.c +F: include/linux/blk-cgroup.h +F: block/blk-throttle.c +F: block/blk-iolatency.c +F: block/bfq-cgroup.c + CORETEMP HARDWARE MONITORING DRIVER M: Fenghua Yu <fenghua.yu@intel.com> L: linux-hwmon@vger.kernel.org diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 4db620849515..fb95dbb21dd8 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -276,8 +276,12 @@ bool bio_integrity_prep(struct bio *bio) ret = bio_integrity_add_page(bio, virt_to_page(buf), bytes, offset); - if (ret == 0) - return false; + if (ret == 0) { + printk(KERN_ERR "could not attach integrity payload\n"); + kfree(buf); + status = BLK_STS_RESOURCE; + goto err_end_io; + } if (ret < bytes) break; diff --git a/block/bio.c b/block/bio.c index 29cd6cf4da51..299a0e7651ec 100644 --- a/block/bio.c +++ b/block/bio.c @@ -16,6 +16,7 @@ #include <linux/workqueue.h> #include <linux/cgroup.h> #include <linux/blk-cgroup.h> +#include <linux/highmem.h> #include <trace/events/block.h> #include "blk.h" @@ -1441,8 +1442,22 @@ void bio_unmap_user(struct bio *bio) bio_put(bio); } +static void bio_invalidate_vmalloc_pages(struct bio *bio) +{ +#ifdef ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE + if (bio->bi_private && !op_is_write(bio_op(bio))) { + unsigned long i, len = 0; + + for (i = 0; i < bio->bi_vcnt; i++) + len += bio->bi_io_vec[i].bv_len; + invalidate_kernel_vmap_range(bio->bi_private, len); + } +#endif +} + static void bio_map_kern_endio(struct bio *bio) { + bio_invalidate_vmalloc_pages(bio); bio_put(bio); } @@ -1463,6 +1478,8 @@ 
struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len, unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; unsigned long start = kaddr >> PAGE_SHIFT; const int nr_pages = end - start; + bool is_vmalloc = is_vmalloc_addr(data); + struct page *page; int offset, i; struct bio *bio; @@ -1470,6 +1487,11 @@ struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len, if (!bio) return ERR_PTR(-ENOMEM); + if (is_vmalloc) { + flush_kernel_vmap_range(data, len); + bio->bi_private = data; + } + offset = offset_in_page(kaddr); for (i = 0; i < nr_pages; i++) { unsigned int bytes = PAGE_SIZE - offset; @@ -1480,7 +1502,11 @@ struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len, if (bytes > len) bytes = len; - if (bio_add_pc_page(q, bio, virt_to_page(data), bytes, + if (!is_vmalloc) + page = virt_to_page(data); + else + page = vmalloc_to_page(data); + if (bio_add_pc_page(q, bio, page, bytes, offset) < bytes) { /* we don't support partial mappings */ bio_put(bio); diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 53b7bd4c7000..24ed26957367 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -29,6 +29,7 @@ #include <linux/ctype.h> #include <linux/blk-cgroup.h> #include <linux/tracehook.h> +#include <linux/psi.h> #include "blk.h" #define MAX_KEY_LEN 100 @@ -47,12 +48,14 @@ struct blkcg blkcg_root; EXPORT_SYMBOL_GPL(blkcg_root); struct cgroup_subsys_state * const blkcg_root_css = &blkcg_root.css; +EXPORT_SYMBOL_GPL(blkcg_root_css); static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS]; static LIST_HEAD(all_blkcgs); /* protected by blkcg_pol_mutex */ static bool blkcg_debug_stats = false; +static struct workqueue_struct *blkcg_punt_bio_wq; static bool blkcg_policy_enabled(struct request_queue *q, const struct blkcg_policy *pol) @@ -87,6 +90,8 @@ static void __blkg_release(struct rcu_head *rcu) { struct blkcg_gq *blkg = container_of(rcu, struct blkcg_gq, rcu_head); + 
WARN_ON(!bio_list_empty(&blkg->async_bios)); + /* release the blkcg and parent blkg refs this blkg has been holding */ css_put(&blkg->blkcg->css); if (blkg->parent) @@ -112,6 +117,23 @@ static void blkg_release(struct percpu_ref *ref) call_rcu(&blkg->rcu_head, __blkg_release); } +static void blkg_async_bio_workfn(struct work_struct *work) +{ + struct blkcg_gq *blkg = container_of(work, struct blkcg_gq, + async_bio_work); + struct bio_list bios = BIO_EMPTY_LIST; + struct bio *bio; + + /* as long as there are pending bios, @blkg can't go away */ + spin_lock_bh(&blkg->async_bio_lock); + bio_list_merge(&bios, &blkg->async_bios); + bio_list_init(&blkg->async_bios); + spin_unlock_bh(&blkg->async_bio_lock); + + while ((bio = bio_list_pop(&bios))) + submit_bio(bio); +} + /** * blkg_alloc - allocate a blkg * @blkcg: block cgroup the new blkg is associated with @@ -140,6 +162,9 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q, blkg->q = q; INIT_LIST_HEAD(&blkg->q_node); + spin_lock_init(&blkg->async_bio_lock); + bio_list_init(&blkg->async_bios); + INIT_WORK(&blkg->async_bio_work, blkg_async_bio_workfn); blkg->blkcg = blkcg; for (i = 0; i < BLKCG_MAX_POLS; i++) { @@ -1526,6 +1551,25 @@ out_unlock: } EXPORT_SYMBOL_GPL(blkcg_policy_unregister); +bool __blkcg_punt_bio_submit(struct bio *bio) +{ + struct blkcg_gq *blkg = bio->bi_blkg; + + /* consume the flag first */ + bio->bi_opf &= ~REQ_CGROUP_PUNT; + + /* never bounce for the root cgroup */ + if (!blkg->parent) + return false; + + spin_lock_bh(&blkg->async_bio_lock); + bio_list_add(&blkg->async_bios, bio); + spin_unlock_bh(&blkg->async_bio_lock); + + queue_work(blkcg_punt_bio_wq, &blkg->async_bio_work); + return true; +} + /* * Scale the accumulated delay based on how long it has been since we updated * the delay. 
We only call this when we are adding delay, in case it's been a @@ -1587,6 +1631,7 @@ static void blkcg_scale_delay(struct blkcg_gq *blkg, u64 now) */ static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay) { + unsigned long pflags; u64 now = ktime_to_ns(ktime_get()); u64 exp; u64 delay_nsec = 0; @@ -1613,11 +1658,8 @@ static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay) */ delay_nsec = min_t(u64, delay_nsec, 250 * NSEC_PER_MSEC); - /* - * TODO: the use_memdelay flag is going to be for the upcoming psi stuff - * that hasn't landed upstream yet. Once that stuff is in place we need - * to do a psi_memstall_enter/leave if memdelay is set. - */ + if (use_memdelay) + psi_memstall_enter(&pflags); exp = ktime_add_ns(now, delay_nsec); tok = io_schedule_prepare(); @@ -1627,6 +1669,9 @@ static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay) break; } while (!fatal_signal_pending(current)); io_schedule_finish(tok); + + if (use_memdelay) + psi_memstall_leave(&pflags); } /** @@ -1726,5 +1771,16 @@ void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta) atomic64_add(delta, &blkg->delay_nsec); } +static int __init blkcg_init(void) +{ + blkcg_punt_bio_wq = alloc_workqueue("blkcg_punt_bio", + WQ_MEM_RECLAIM | WQ_FREEZABLE | + WQ_UNBOUND | WQ_SYSFS, 0); + if (!blkcg_punt_bio_wq) + return -ENOMEM; + return 0; +} +subsys_initcall(blkcg_init); + module_param(blkcg_debug_stats, bool, 0644); MODULE_PARM_DESC(blkcg_debug_stats, "True if you want debug stats, false if not"); diff --git a/block/blk-core.c b/block/blk-core.c index 5d1fc8e17dd1..d0cc6e14d2f0 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -117,6 +117,7 @@ void blk_rq_init(struct request_queue *q, struct request *rq) rq->internal_tag = -1; rq->start_time_ns = ktime_get_ns(); rq->part = NULL; + refcount_set(&rq->ref, 1); } EXPORT_SYMBOL(blk_rq_init); @@ -687,7 +688,7 @@ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, 
struct request *rq; struct list_head *plug_list; - plug = current->plug; + plug = blk_mq_plug(q, bio); if (!plug) return false; @@ -1127,6 +1128,9 @@ EXPORT_SYMBOL_GPL(direct_make_request); */ blk_qc_t submit_bio(struct bio *bio) { + if (blkcg_punt_bio_submit(bio)) + return BLK_QC_T_NONE; + /* * If it's a regular read/write or a barrier with data attached, * go through the normal accounting stuff before submission. diff --git a/block/blk-mq.c b/block/blk-mq.c index e5ef40c603ca..b038ec680e84 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1973,7 +1973,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) blk_mq_bio_to_request(rq, bio, nr_segs); - plug = current->plug; + plug = blk_mq_plug(q, bio); if (unlikely(is_flush_fua)) { /* bypass scheduler for flush rq */ blk_insert_flush(rq); diff --git a/block/blk-mq.h b/block/blk-mq.h index f4bf5161333e..32c62c64e6c2 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -233,4 +233,36 @@ static inline void blk_mq_clear_mq_map(struct blk_mq_queue_map *qmap) qmap->mq_map[cpu] = 0; } +/* + * blk_mq_plug() - Get caller context plug + * @q: request queue + * @bio : the bio being submitted by the caller context + * + * Plugging, by design, may delay the insertion of BIOs into the elevator in + * order to increase BIO merging opportunities. This however can cause BIO + * insertion order to change from the order in which submit_bio() is being + * executed in the case of multiple contexts concurrently issuing BIOs to a + * device, even if these context are synchronized to tightly control BIO issuing + * order. While this is not a problem with regular block devices, this ordering + * change can cause write BIO failures with zoned block devices as these + * require sequential write patterns to zones. Prevent this from happening by + * ignoring the plug state of a BIO issuing context if the target request queue + * is for a zoned block device and the BIO to plug is a write operation. 
+ * + * Return current->plug if the bio can be plugged and NULL otherwise + */ +static inline struct blk_plug *blk_mq_plug(struct request_queue *q, + struct bio *bio) +{ + /* + * For regular block devices or read operations, use the context plug + * which may be NULL if blk_start_plug() was not executed. + */ + if (!blk_queue_is_zoned(q) || !op_is_write(bio_op(bio))) + return current->plug; + + /* Zoned block device write operation case: do not plug the BIO */ + return NULL; +} + #endif diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 9ea7c0ecad10..8ab6c8153223 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -881,13 +881,10 @@ static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio, unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd; u64 tmp; - jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw]; - - /* Slice has just started. Consider one slice interval */ - if (!jiffy_elapsed) - jiffy_elapsed_rnd = tg->td->throtl_slice; + jiffy_elapsed = jiffies - tg->slice_start[rw]; - jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, tg->td->throtl_slice); + /* Round up to the next throttle slice, wait time must be nonzero */ + jiffy_elapsed_rnd = roundup(jiffy_elapsed + 1, tg->td->throtl_slice); /* * jiffy_elapsed_rnd should not be a big value as minimum iops can be diff --git a/block/blk-zoned.c b/block/blk-zoned.c index ae7e91bd0618..6c503824ba3f 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -14,6 +14,9 @@ #include <linux/rbtree.h> #include <linux/blkdev.h> #include <linux/blk-mq.h> +#include <linux/mm.h> +#include <linux/vmalloc.h> +#include <linux/sched/mm.h> #include "blk.h" @@ -70,7 +73,7 @@ EXPORT_SYMBOL_GPL(__blk_req_zone_write_unlock); static inline unsigned int __blkdev_nr_zones(struct request_queue *q, sector_t nr_sectors) { - unsigned long zone_sectors = blk_queue_zone_sectors(q); + sector_t zone_sectors = blk_queue_zone_sectors(q); return (nr_sectors + zone_sectors - 1) >> 
ilog2(zone_sectors); } @@ -117,8 +120,7 @@ static bool blkdev_report_zone(struct block_device *bdev, struct blk_zone *rep) } static int blk_report_zones(struct gendisk *disk, sector_t sector, - struct blk_zone *zones, unsigned int *nr_zones, - gfp_t gfp_mask) + struct blk_zone *zones, unsigned int *nr_zones) { struct request_queue *q = disk->queue; unsigned int z = 0, n, nrz = *nr_zones; @@ -127,8 +129,7 @@ static int blk_report_zones(struct gendisk *disk, sector_t sector, while (z < nrz && sector < capacity) { n = nrz - z; - ret = disk->fops->report_zones(disk, sector, &zones[z], &n, - gfp_mask); + ret = disk->fops->report_zones(disk, sector, &zones[z], &n); if (ret) return ret; if (!n) @@ -149,17 +150,18 @@ static int blk_report_zones(struct gendisk *disk, sector_t sector, * @sector: Sector from which to report zones * @zones: Array of zone structures where to return the zones information * @nr_zones: Number of zone structures in the zone array - * @gfp_mask: Memory allocation flags (for bio_alloc) * * Description: * Get zone information starting from the zone containing @sector. * The number of zone information reported may be less than the number * requested by @nr_zones. The number of zones actually reported is * returned in @nr_zones. + * The caller must use memalloc_noXX_save/restore() calls to control + * memory allocations done within this function (zone array and command + * buffer allocation by the device driver). 
*/ int blkdev_report_zones(struct block_device *bdev, sector_t sector, - struct blk_zone *zones, unsigned int *nr_zones, - gfp_t gfp_mask) + struct blk_zone *zones, unsigned int *nr_zones) { struct request_queue *q = bdev_get_queue(bdev); unsigned int i, nrz; @@ -184,7 +186,7 @@ int blkdev_report_zones(struct block_device *bdev, sector_t sector, nrz = min(*nr_zones, __blkdev_nr_zones(q, bdev->bd_part->nr_sects - sector)); ret = blk_report_zones(bdev->bd_disk, get_start_sect(bdev) + sector, - zones, &nrz, gfp_mask); + zones, &nrz); if (ret) return ret; @@ -305,9 +307,7 @@ int blkdev_report_zones_ioctl(struct block_device *bdev, fmode_t mode, if (!zones) return -ENOMEM; - ret = blkdev_report_zones(bdev, rep.sector, - zones, &rep.nr_zones, - GFP_KERNEL); + ret = blkdev_report_zones(bdev, rep.sector, zones, &rep.nr_zones); if (ret) goto out; @@ -373,22 +373,25 @@ static inline unsigned long *blk_alloc_zone_bitmap(int node, * Allocate an array of struct blk_zone to get nr_zones zone information. * The allocated array may be smaller than nr_zones. */ -static struct blk_zone *blk_alloc_zones(int node, unsigned int *nr_zones) +static struct blk_zone *blk_alloc_zones(unsigned int *nr_zones) { - size_t size = *nr_zones * sizeof(struct blk_zone); - struct page *page; - int order; - - for (order = get_order(size); order >= 0; order--) { - page = alloc_pages_node(node, GFP_NOIO | __GFP_ZERO, order); - if (page) { - *nr_zones = min_t(unsigned int, *nr_zones, - (PAGE_SIZE << order) / sizeof(struct blk_zone)); - return page_address(page); - } + struct blk_zone *zones; + size_t nrz = min(*nr_zones, BLK_ZONED_REPORT_MAX_ZONES); + + /* + * GFP_KERNEL here is meaningless as the caller task context has + * the PF_MEMALLOC_NOIO flag set in blk_revalidate_disk_zones() + * with memalloc_noio_save(). 
+ */ + zones = kvcalloc(nrz, sizeof(struct blk_zone), GFP_KERNEL); + if (!zones) { + *nr_zones = 0; + return NULL; } - return NULL; + *nr_zones = nrz; + + return zones; } void blk_queue_free_zone_bitmaps(struct request_queue *q) @@ -415,6 +418,7 @@ int blk_revalidate_disk_zones(struct gendisk *disk) unsigned long *seq_zones_wlock = NULL, *seq_zones_bitmap = NULL; unsigned int i, rep_nr_zones = 0, z = 0, nrz; struct blk_zone *zones = NULL; + unsigned int noio_flag; sector_t sector = 0; int ret = 0; @@ -427,6 +431,12 @@ int blk_revalidate_disk_zones(struct gendisk *disk) return 0; } + /* + * Ensure that all memory allocations in this context are done as + * if GFP_NOIO was specified. + */ + noio_flag = memalloc_noio_save(); + if (!blk_queue_is_zoned(q) || !nr_zones) { nr_zones = 0; goto update; @@ -443,13 +453,13 @@ int blk_revalidate_disk_zones(struct gendisk *disk) /* Get zone information and initialize seq_zones_bitmap */ rep_nr_zones = nr_zones; - zones = blk_alloc_zones(q->node, &rep_nr_zones); + zones = blk_alloc_zones(&rep_nr_zones); if (!zones) goto out; while (z < nr_zones) { nrz = min(nr_zones - z, rep_nr_zones); - ret = blk_report_zones(disk, sector, zones, &nrz, GFP_NOIO); + ret = blk_report_zones(disk, sector, zones, &nrz); if (ret) goto out; if (!nrz) @@ -480,8 +490,9 @@ update: blk_mq_unfreeze_queue(q); out: - free_pages((unsigned long)zones, - get_order(rep_nr_zones * sizeof(struct blk_zone))); + memalloc_noio_restore(noio_flag); + + kvfree(zones); kfree(seq_zones_wlock); kfree(seq_zones_bitmap); diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 3a9bca3aa093..9bcde2325893 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -134,6 +134,8 @@ static struct dentry *nbd_dbg_dir; #define NBD_MAGIC 0x68797548 +#define NBD_DEF_BLKSIZE 1024 + static unsigned int nbds_max = 16; static int max_part = 16; static struct workqueue_struct *recv_workqueue; @@ -1236,6 +1238,14 @@ static void nbd_clear_sock_ioctl(struct nbd_device *nbd, 
nbd_config_put(nbd); } +static bool nbd_is_valid_blksize(unsigned long blksize) +{ + if (!blksize || !is_power_of_2(blksize) || blksize < 512 || + blksize > PAGE_SIZE) + return false; + return true; +} + /* Must be called with config_lock held */ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, unsigned int cmd, unsigned long arg) @@ -1251,8 +1261,9 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, case NBD_SET_SOCK: return nbd_add_socket(nbd, arg, false); case NBD_SET_BLKSIZE: - if (!arg || !is_power_of_2(arg) || arg < 512 || - arg > PAGE_SIZE) + if (!arg) + arg = NBD_DEF_BLKSIZE; + if (!nbd_is_valid_blksize(arg)) return -EINVAL; nbd_size_set(nbd, arg, div_s64(config->bytesize, arg)); @@ -1332,7 +1343,7 @@ static struct nbd_config *nbd_alloc_config(void) atomic_set(&config->recv_threads, 0); init_waitqueue_head(&config->recv_wq); init_waitqueue_head(&config->conn_wait); - config->blksize = 1024; + config->blksize = NBD_DEF_BLKSIZE; atomic_set(&config->live_connections, 0); try_module_get(THIS_MODULE); return config; @@ -1673,6 +1684,30 @@ nbd_device_policy[NBD_DEVICE_ATTR_MAX + 1] = { [NBD_DEVICE_CONNECTED] = { .type = NLA_U8 }, }; +static int nbd_genl_size_set(struct genl_info *info, struct nbd_device *nbd) +{ + struct nbd_config *config = nbd->config; + u64 bsize = config->blksize; + u64 bytes = config->bytesize; + + if (info->attrs[NBD_ATTR_SIZE_BYTES]) + bytes = nla_get_u64(info->attrs[NBD_ATTR_SIZE_BYTES]); + + if (info->attrs[NBD_ATTR_BLOCK_SIZE_BYTES]) { + bsize = nla_get_u64(info->attrs[NBD_ATTR_BLOCK_SIZE_BYTES]); + if (!bsize) + bsize = NBD_DEF_BLKSIZE; + if (!nbd_is_valid_blksize(bsize)) { + printk(KERN_ERR "Invalid block size %llu\n", bsize); + return -EINVAL; + } + } + + if (bytes != config->bytesize || bsize != config->blksize) + nbd_size_set(nbd, bsize, div64_u64(bytes, bsize)); + return 0; +} + static int nbd_genl_connect(struct sk_buff *skb, struct genl_info *info) { struct nbd_device *nbd = 
NULL; @@ -1760,16 +1795,10 @@ again: refcount_set(&nbd->config_refs, 1); set_bit(NBD_BOUND, &config->runtime_flags); - if (info->attrs[NBD_ATTR_SIZE_BYTES]) { - u64 bytes = nla_get_u64(info->attrs[NBD_ATTR_SIZE_BYTES]); - nbd_size_set(nbd, config->blksize, - div64_u64(bytes, config->blksize)); - } - if (info->attrs[NBD_ATTR_BLOCK_SIZE_BYTES]) { - u64 bsize = - nla_get_u64(info->attrs[NBD_ATTR_BLOCK_SIZE_BYTES]); - nbd_size_set(nbd, bsize, div64_u64(config->bytesize, bsize)); - } + ret = nbd_genl_size_set(info, nbd); + if (ret) + goto out; + if (info->attrs[NBD_ATTR_TIMEOUT]) { u64 timeout = nla_get_u64(info->attrs[NBD_ATTR_TIMEOUT]); nbd->tag_set.timeout = timeout * HZ; @@ -1938,6 +1967,10 @@ static int nbd_genl_reconfigure(struct sk_buff *skb, struct genl_info *info) goto out; } + ret = nbd_genl_size_set(info, nbd); + if (ret) + goto out; + if (info->attrs[NBD_ATTR_TIMEOUT]) { u64 timeout = nla_get_u64(info->attrs[NBD_ATTR_TIMEOUT]); nbd->tag_set.timeout = timeout * HZ; diff --git a/drivers/block/null_blk.h b/drivers/block/null_blk.h index 34b22d6523ba..a1b9929bd911 100644 --- a/drivers/block/null_blk.h +++ b/drivers/block/null_blk.h @@ -89,8 +89,7 @@ struct nullb { int null_zone_init(struct nullb_device *dev); void null_zone_exit(struct nullb_device *dev); int null_zone_report(struct gendisk *disk, sector_t sector, - struct blk_zone *zones, unsigned int *nr_zones, - gfp_t gfp_mask); + struct blk_zone *zones, unsigned int *nr_zones); void null_zone_write(struct nullb_cmd *cmd, sector_t sector, unsigned int nr_sectors); void null_zone_reset(struct nullb_cmd *cmd, sector_t sector); @@ -103,7 +102,7 @@ static inline int null_zone_init(struct nullb_device *dev) static inline void null_zone_exit(struct nullb_device *dev) {} static inline int null_zone_report(struct gendisk *disk, sector_t sector, struct blk_zone *zones, - unsigned int *nr_zones, gfp_t gfp_mask) + unsigned int *nr_zones) { return -EOPNOTSUPP; } diff --git a/drivers/block/null_blk_zoned.c 
b/drivers/block/null_blk_zoned.c index fca0c97ff1aa..cb28d93f2bd1 100644 --- a/drivers/block/null_blk_zoned.c +++ b/drivers/block/null_blk_zoned.c @@ -67,8 +67,7 @@ void null_zone_exit(struct nullb_device *dev) } int null_zone_report(struct gendisk *disk, sector_t sector, - struct blk_zone *zones, unsigned int *nr_zones, - gfp_t gfp_mask) + struct blk_zone *zones, unsigned int *nr_zones) { struct nullb *nullb = disk->private_data; struct nullb_device *dev = nullb->dev; diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c index a9bc518156f2..2900fbde89b3 100644 --- a/drivers/md/dm-flakey.c +++ b/drivers/md/dm-flakey.c @@ -461,15 +461,14 @@ static int flakey_prepare_ioctl(struct dm_target *ti, struct block_device **bdev #ifdef CONFIG_BLK_DEV_ZONED static int flakey_report_zones(struct dm_target *ti, sector_t sector, - struct blk_zone *zones, unsigned int *nr_zones, - gfp_t gfp_mask) + struct blk_zone *zones, unsigned int *nr_zones) { struct flakey_c *fc = ti->private; int ret; /* Do report and remap it */ ret = blkdev_report_zones(fc->dev->bdev, flakey_map_sector(ti, sector), - zones, nr_zones, gfp_mask); + zones, nr_zones); if (ret != 0) return ret; diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c index ad980a38fb1e..ecefe6703736 100644 --- a/drivers/md/dm-linear.c +++ b/drivers/md/dm-linear.c @@ -137,15 +137,14 @@ static int linear_prepare_ioctl(struct dm_target *ti, struct block_device **bdev #ifdef CONFIG_BLK_DEV_ZONED static int linear_report_zones(struct dm_target *ti, sector_t sector, - struct blk_zone *zones, unsigned int *nr_zones, - gfp_t gfp_mask) + struct blk_zone *zones, unsigned int *nr_zones) { struct linear_c *lc = (struct linear_c *) ti->private; int ret; /* Do report and remap it */ ret = blkdev_report_zones(lc->dev->bdev, linear_map_sector(ti, sector), - zones, nr_zones, gfp_mask); + zones, nr_zones); if (ret != 0) return ret; diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index 
d8334cd45d7c..9faf3e49c7af 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -8,6 +8,7 @@ #include <linux/module.h> #include <linux/crc32.h> +#include <linux/sched/mm.h> #define DM_MSG_PREFIX "zoned metadata" @@ -1162,8 +1163,7 @@ static int dmz_init_zones(struct dmz_metadata *zmd) while (sector < dev->capacity) { /* Get zone information */ nr_blkz = DMZ_REPORT_NR_ZONES; - ret = blkdev_report_zones(dev->bdev, sector, blkz, - &nr_blkz, GFP_KERNEL); + ret = blkdev_report_zones(dev->bdev, sector, blkz, &nr_blkz); if (ret) { dmz_dev_err(dev, "Report zones failed %d", ret); goto out; @@ -1201,12 +1201,20 @@ out: static int dmz_update_zone(struct dmz_metadata *zmd, struct dm_zone *zone) { unsigned int nr_blkz = 1; + unsigned int noio_flag; struct blk_zone blkz; int ret; - /* Get zone information from disk */ + /* + * Get zone information from disk. Since blkdev_report_zones() uses + * GFP_KERNEL by default for memory allocations, set the per-task + * PF_MEMALLOC_NOIO flag so that all allocations are done as if + * GFP_NOIO was specified. + */ + noio_flag = memalloc_noio_save(); ret = blkdev_report_zones(zmd->dev->bdev, dmz_start_sect(zmd, zone), - &blkz, &nr_blkz, GFP_NOIO); + &blkz, &nr_blkz); + memalloc_noio_restore(noio_flag); if (!nr_blkz) ret = -EIO; if (ret) { diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 5475081dcbd6..61f1152b74e9 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -441,8 +441,7 @@ static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo) } static int dm_blk_report_zones(struct gendisk *disk, sector_t sector, - struct blk_zone *zones, unsigned int *nr_zones, - gfp_t gfp_mask) + struct blk_zone *zones, unsigned int *nr_zones) { #ifdef CONFIG_BLK_DEV_ZONED struct mapped_device *md = disk->private_data; @@ -480,8 +479,7 @@ static int dm_blk_report_zones(struct gendisk *disk, sector_t sector, * So there is no need to loop here trying to fill the entire array * of zones. 
*/ - ret = tgt->type->report_zones(tgt, sector, zones, - nr_zones, gfp_mask); + ret = tgt->type->report_zones(tgt, sector, zones, nr_zones); out: dm_put_live_table(md, srcu_idx); diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index b2dd4e391f5c..cc09b81fc7f4 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -11,6 +11,7 @@ #include <linux/hdreg.h> #include <linux/kernel.h> #include <linux/module.h> +#include <linux/backing-dev.h> #include <linux/list_sort.h> #include <linux/slab.h> #include <linux/types.h> @@ -1626,6 +1627,7 @@ static void nvme_update_disk_info(struct gendisk *disk, { sector_t capacity = le64_to_cpu(id->nsze) << (ns->lba_shift - 9); unsigned short bs = 1 << ns->lba_shift; + u32 atomic_bs, phys_bs, io_opt; if (ns->lba_shift > PAGE_SHIFT) { /* unsupported block size, set capacity to 0 later */ @@ -1634,9 +1636,37 @@ static void nvme_update_disk_info(struct gendisk *disk, blk_mq_freeze_queue(disk->queue); blk_integrity_unregister(disk); + if (id->nabo == 0) { + /* + * Bit 1 indicates whether NAWUPF is defined for this namespace + * and whether it should be used instead of AWUPF. If NAWUPF == + * 0 then AWUPF must be used instead. + */ + if (id->nsfeat & (1 << 1) && id->nawupf) + atomic_bs = (1 + le16_to_cpu(id->nawupf)) * bs; + else + atomic_bs = (1 + ns->ctrl->subsys->awupf) * bs; + } else { + atomic_bs = bs; + } + phys_bs = bs; + io_opt = bs; + if (id->nsfeat & (1 << 4)) { + /* NPWG = Namespace Preferred Write Granularity */ + phys_bs *= 1 + le16_to_cpu(id->npwg); + /* NOWS = Namespace Optimal Write Size */ + io_opt *= 1 + le16_to_cpu(id->nows); + } + blk_queue_logical_block_size(disk->queue, bs); - blk_queue_physical_block_size(disk->queue, bs); - blk_queue_io_min(disk->queue, bs); + /* + * Linux filesystems assume writing a single physical block is + * an atomic operation. Hence limit the physical block size to the + * value of the Atomic Write Unit Power Fail parameter. 
+ */ + blk_queue_physical_block_size(disk->queue, min(phys_bs, atomic_bs)); + blk_queue_io_min(disk->queue, phys_bs); + blk_queue_io_opt(disk->queue, io_opt); if (ns->ms && !ns->ext && (ns->ctrl->ops->flags & NVME_F_METADATA_SUPPORTED)) @@ -2386,8 +2416,8 @@ static bool nvme_validate_cntlid(struct nvme_subsystem *subsys, lockdep_assert_held(&nvme_subsystems_lock); list_for_each_entry(tmp, &subsys->ctrls, subsys_entry) { - if (ctrl->state == NVME_CTRL_DELETING || - ctrl->state == NVME_CTRL_DEAD) + if (tmp->state == NVME_CTRL_DELETING || + tmp->state == NVME_CTRL_DEAD) continue; if (tmp->cntlid == ctrl->cntlid) { @@ -2433,6 +2463,7 @@ static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) memcpy(subsys->firmware_rev, id->fr, sizeof(subsys->firmware_rev)); subsys->vendor_id = le16_to_cpu(id->vid); subsys->cmic = id->cmic; + subsys->awupf = le16_to_cpu(id->awupf); #ifdef CONFIG_NVME_MULTIPATH subsys->iopolicy = NVME_IOPOLICY_NUMA; #endif @@ -3274,6 +3305,10 @@ static int nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) goto out_free_ns; } + if (ctrl->opts && ctrl->opts->data_digest) + ns->queue->backing_dev_info->capabilities + |= BDI_CAP_STABLE_WRITES; + blk_queue_flag_set(QUEUE_FLAG_NONROT, ns->queue); if (ctrl->ops->flags & NVME_F_PCI_P2PDMA) blk_queue_flag_set(QUEUE_FLAG_PCI_P2PDMA, ns->queue); diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index dcb2b799966f..232d8094091b 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -204,6 +204,9 @@ static DEFINE_IDA(nvme_fc_ctrl_cnt); static struct workqueue_struct *nvme_fc_wq; +static bool nvme_fc_waiting_to_unload; +static DECLARE_COMPLETION(nvme_fc_unload_proceed); + /* * These items are short-term. They will eventually be moved into * a generic FC class. See comments in module init. 
@@ -229,6 +232,8 @@ nvme_fc_free_lport(struct kref *ref) /* remove from transport list */ spin_lock_irqsave(&nvme_fc_lock, flags); list_del(&lport->port_list); + if (nvme_fc_waiting_to_unload && list_empty(&nvme_fc_lport_list)) + complete(&nvme_fc_unload_proceed); spin_unlock_irqrestore(&nvme_fc_lock, flags); ida_simple_remove(&nvme_fc_local_port_cnt, lport->localport.port_num); @@ -3457,11 +3462,51 @@ out_destroy_wq: return ret; } +static void +nvme_fc_delete_controllers(struct nvme_fc_rport *rport) +{ + struct nvme_fc_ctrl *ctrl; + + spin_lock(&rport->lock); + list_for_each_entry(ctrl, &rport->ctrl_list, ctrl_list) { + dev_warn(ctrl->ctrl.device, + "NVME-FC{%d}: transport unloading: deleting ctrl\n", + ctrl->cnum); + nvme_delete_ctrl(&ctrl->ctrl); + } + spin_unlock(&rport->lock); +} + +static void +nvme_fc_cleanup_for_unload(void) +{ + struct nvme_fc_lport *lport; + struct nvme_fc_rport *rport; + + list_for_each_entry(lport, &nvme_fc_lport_list, port_list) { + list_for_each_entry(rport, &lport->endp_list, endp_list) { + nvme_fc_delete_controllers(rport); + } + } +} + static void __exit nvme_fc_exit_module(void) { - /* sanity check - all lports should be removed */ - if (!list_empty(&nvme_fc_lport_list)) - pr_warn("%s: localport list not empty\n", __func__); + unsigned long flags; + bool need_cleanup = false; + + spin_lock_irqsave(&nvme_fc_lock, flags); + nvme_fc_waiting_to_unload = true; + if (!list_empty(&nvme_fc_lport_list)) { + need_cleanup = true; + nvme_fc_cleanup_for_unload(); + } + spin_unlock_irqrestore(&nvme_fc_lock, flags); + if (need_cleanup) { + pr_info("%s: waiting for ctlr deletes\n", __func__); + wait_for_completion(&nvme_fc_unload_proceed); + pr_info("%s: ctrl deletes complete\n", __func__); + } nvmf_unregister_transport(&nvme_fc_transport); diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 499acf07d61a..a9a927677970 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -123,14 +123,20 @@ 
void nvme_mpath_clear_current_path(struct nvme_ns *ns) } } +static bool nvme_path_is_disabled(struct nvme_ns *ns) +{ + return ns->ctrl->state != NVME_CTRL_LIVE || + test_bit(NVME_NS_ANA_PENDING, &ns->flags) || + test_bit(NVME_NS_REMOVING, &ns->flags); +} + static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head, int node) { int found_distance = INT_MAX, fallback_distance = INT_MAX, distance; struct nvme_ns *found = NULL, *fallback = NULL, *ns; list_for_each_entry_rcu(ns, &head->list, siblings) { - if (ns->ctrl->state != NVME_CTRL_LIVE || - test_bit(NVME_NS_ANA_PENDING, &ns->flags)) + if (nvme_path_is_disabled(ns)) continue; if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_NUMA) @@ -178,14 +184,16 @@ static struct nvme_ns *nvme_round_robin_path(struct nvme_ns_head *head, { struct nvme_ns *ns, *found, *fallback = NULL; - if (list_is_singular(&head->list)) + if (list_is_singular(&head->list)) { + if (nvme_path_is_disabled(old)) + return NULL; return old; + } for (ns = nvme_next_ns(head, old); ns != old; ns = nvme_next_ns(head, ns)) { - if (ns->ctrl->state != NVME_CTRL_LIVE || - test_bit(NVME_NS_ANA_PENDING, &ns->flags)) + if (nvme_path_is_disabled(ns)) continue; if (ns->ana_state == NVME_ANA_OPTIMIZED) { diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index ea45d7d393ad..716a876119c8 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -283,6 +283,7 @@ struct nvme_subsystem { char firmware_rev[8]; u8 cmic; u16 vendor_id; + u16 awupf; /* 0's based awupf value. 
*/ struct ida ns_ida; #ifdef CONFIG_NVME_MULTIPATH enum nvme_iopolicy iopolicy; diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 189352081994..bb970ca82517 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1439,11 +1439,15 @@ static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq, if (qid && dev->cmb_use_sqes && (dev->cmbsz & NVME_CMBSZ_SQS)) { nvmeq->sq_cmds = pci_alloc_p2pmem(pdev, SQ_SIZE(depth)); - nvmeq->sq_dma_addr = pci_p2pmem_virt_to_bus(pdev, - nvmeq->sq_cmds); - if (nvmeq->sq_dma_addr) { - set_bit(NVMEQ_SQ_CMB, &nvmeq->flags); - return 0; + if (nvmeq->sq_cmds) { + nvmeq->sq_dma_addr = pci_p2pmem_virt_to_bus(pdev, + nvmeq->sq_cmds); + if (nvmeq->sq_dma_addr) { + set_bit(NVMEQ_SQ_CMB, &nvmeq->flags); + return 0; + } + + pci_free_p2pmem(pdev, nvmeq->sq_cmds, SQ_SIZE(depth)); } } @@ -2250,7 +2254,9 @@ static int nvme_dev_add(struct nvme_dev *dev) if (!dev->ctrl.tagset) { dev->tagset.ops = &nvme_mq_ops; dev->tagset.nr_hw_queues = dev->online_queues - 1; - dev->tagset.nr_maps = 2; /* default + read */ + dev->tagset.nr_maps = 1; /* default */ + if (dev->io_queues[HCTX_TYPE_READ]) + dev->tagset.nr_maps++; if (dev->io_queues[HCTX_TYPE_POLL]) dev->tagset.nr_maps++; dev->tagset.timeout = NVME_IO_TIMEOUT; @@ -2289,8 +2295,7 @@ static int nvme_pci_enable(struct nvme_dev *dev) pci_set_master(pdev); - if (dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(64)) && - dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(32))) + if (dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(64))) goto disable; if (readl(dev->bar + NVME_REG_CSTS) == -1) { @@ -2498,7 +2503,8 @@ static void nvme_reset_work(struct work_struct *work) * Limit the max command size to prevent iod->sg allocations going * over a single page. 
*/ - dev->ctrl.max_hw_sectors = NVME_MAX_KB_SZ << 1; + dev->ctrl.max_hw_sectors = min_t(u32, + NVME_MAX_KB_SZ << 1, dma_max_mapping_size(dev->dev) >> 9); dev->ctrl.max_segments = NVME_MAX_SEGS; /* @@ -2923,7 +2929,7 @@ static int nvme_simple_resume(struct device *dev) return 0; } -const struct dev_pm_ops nvme_dev_pm_ops = { +static const struct dev_pm_ops nvme_dev_pm_ops = { .suspend = nvme_suspend, .resume = nvme_resume, .freeze = nvme_simple_suspend, diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 08a2501b9357..606b13d35d16 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -860,7 +860,14 @@ static int nvme_tcp_try_send_data(struct nvme_tcp_request *req) else flags |= MSG_MORE; - ret = kernel_sendpage(queue->sock, page, offset, len, flags); + /* can't zcopy slab pages */ + if (unlikely(PageSlab(page))) { + ret = sock_no_sendpage(queue->sock, page, offset, len, + flags); + } else { + ret = kernel_sendpage(queue->sock, page, offset, len, + flags); + } if (ret <= 0) return ret; diff --git a/drivers/nvme/host/trace.c b/drivers/nvme/host/trace.c index f01ad0fd60bb..9778eb0406b3 100644 --- a/drivers/nvme/host/trace.c +++ b/drivers/nvme/host/trace.c @@ -7,6 +7,17 @@ #include <asm/unaligned.h> #include "trace.h" +static const char *nvme_trace_delete_sq(struct trace_seq *p, u8 *cdw10) +{ + const char *ret = trace_seq_buffer_ptr(p); + u16 sqid = get_unaligned_le16(cdw10); + + trace_seq_printf(p, "sqid=%u", sqid); + trace_seq_putc(p, 0); + + return ret; +} + static const char *nvme_trace_create_sq(struct trace_seq *p, u8 *cdw10) { const char *ret = trace_seq_buffer_ptr(p); @@ -23,6 +34,17 @@ static const char *nvme_trace_create_sq(struct trace_seq *p, u8 *cdw10) return ret; } +static const char *nvme_trace_delete_cq(struct trace_seq *p, u8 *cdw10) +{ + const char *ret = trace_seq_buffer_ptr(p); + u16 cqid = get_unaligned_le16(cdw10); + + trace_seq_printf(p, "cqid=%u", cqid); + trace_seq_putc(p, 0); + + return ret; +} + static const 
char *nvme_trace_create_cq(struct trace_seq *p, u8 *cdw10) { const char *ret = trace_seq_buffer_ptr(p); @@ -107,8 +129,12 @@ const char *nvme_trace_parse_admin_cmd(struct trace_seq *p, u8 opcode, u8 *cdw10) { switch (opcode) { + case nvme_admin_delete_sq: + return nvme_trace_delete_sq(p, cdw10); case nvme_admin_create_sq: return nvme_trace_create_sq(p, cdw10); + case nvme_admin_delete_cq: + return nvme_trace_delete_cq(p, cdw10); case nvme_admin_create_cq: return nvme_trace_create_cq(p, cdw10); case nvme_admin_identify: @@ -178,7 +204,7 @@ static const char *nvme_trace_fabrics_common(struct trace_seq *p, u8 *spc) { const char *ret = trace_seq_buffer_ptr(p); - trace_seq_printf(p, "spcecific=%*ph", 24, spc); + trace_seq_printf(p, "specific=%*ph", 24, spc); trace_seq_putc(p, 0); return ret; } diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index 9f72d515fc4b..4dc12ea52f23 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -442,6 +442,9 @@ static void nvmet_execute_identify_ns(struct nvmet_req *req) break; } + if (ns->bdev) + nvmet_bdev_set_limits(ns->bdev, id); + /* * We just provide a single LBA format that matches what the * underlying device reports. 
diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c index 08dd5af357f7..cd52b9f15376 100644 --- a/drivers/nvme/target/configfs.c +++ b/drivers/nvme/target/configfs.c @@ -588,8 +588,10 @@ static struct config_group *nvmet_ns_make(struct config_group *group, goto out; ret = -EINVAL; - if (nsid == 0 || nsid == NVME_NSID_ALL) + if (nsid == 0 || nsid == NVME_NSID_ALL) { + pr_err("invalid nsid %#x", nsid); goto out; + } ret = -ENOMEM; ns = nvmet_ns_alloc(subsys, nsid); diff --git a/drivers/nvme/target/fcloop.c b/drivers/nvme/target/fcloop.c index b8c1cc54a0db..b50b53db3746 100644 --- a/drivers/nvme/target/fcloop.c +++ b/drivers/nvme/target/fcloop.c @@ -434,7 +434,7 @@ fcloop_fcp_recv_work(struct work_struct *work) int ret = 0; bool aborted = false; - spin_lock(&tfcp_req->reqlock); + spin_lock_irq(&tfcp_req->reqlock); switch (tfcp_req->inistate) { case INI_IO_START: tfcp_req->inistate = INI_IO_ACTIVE; @@ -443,11 +443,11 @@ fcloop_fcp_recv_work(struct work_struct *work) aborted = true; break; default: - spin_unlock(&tfcp_req->reqlock); + spin_unlock_irq(&tfcp_req->reqlock); WARN_ON(1); return; } - spin_unlock(&tfcp_req->reqlock); + spin_unlock_irq(&tfcp_req->reqlock); if (unlikely(aborted)) ret = -ECANCELED; @@ -469,7 +469,7 @@ fcloop_fcp_abort_recv_work(struct work_struct *work) struct nvmefc_fcp_req *fcpreq; bool completed = false; - spin_lock(&tfcp_req->reqlock); + spin_lock_irq(&tfcp_req->reqlock); fcpreq = tfcp_req->fcpreq; switch (tfcp_req->inistate) { case INI_IO_ABORTED: @@ -478,11 +478,11 @@ fcloop_fcp_abort_recv_work(struct work_struct *work) completed = true; break; default: - spin_unlock(&tfcp_req->reqlock); + spin_unlock_irq(&tfcp_req->reqlock); WARN_ON(1); return; } - spin_unlock(&tfcp_req->reqlock); + spin_unlock_irq(&tfcp_req->reqlock); if (unlikely(completed)) { /* remove reference taken in original abort downcall */ @@ -494,9 +494,9 @@ fcloop_fcp_abort_recv_work(struct work_struct *work) 
nvmet_fc_rcv_fcp_abort(tfcp_req->tport->targetport, &tfcp_req->tgt_fcp_req); - spin_lock(&tfcp_req->reqlock); + spin_lock_irq(&tfcp_req->reqlock); tfcp_req->fcpreq = NULL; - spin_unlock(&tfcp_req->reqlock); + spin_unlock_irq(&tfcp_req->reqlock); fcloop_call_host_done(fcpreq, tfcp_req, -ECANCELED); /* call_host_done releases reference for abort downcall */ @@ -513,10 +513,10 @@ fcloop_tgt_fcprqst_done_work(struct work_struct *work) container_of(work, struct fcloop_fcpreq, tio_done_work); struct nvmefc_fcp_req *fcpreq; - spin_lock(&tfcp_req->reqlock); + spin_lock_irq(&tfcp_req->reqlock); fcpreq = tfcp_req->fcpreq; tfcp_req->inistate = INI_IO_COMPLETED; - spin_unlock(&tfcp_req->reqlock); + spin_unlock_irq(&tfcp_req->reqlock); fcloop_call_host_done(fcpreq, tfcp_req, tfcp_req->status); } @@ -535,7 +535,7 @@ fcloop_fcp_req(struct nvme_fc_local_port *localport, if (!rport->targetport) return -ECONNREFUSED; - tfcp_req = kzalloc(sizeof(*tfcp_req), GFP_KERNEL); + tfcp_req = kzalloc(sizeof(*tfcp_req), GFP_ATOMIC); if (!tfcp_req) return -ENOMEM; @@ -621,12 +621,12 @@ fcloop_fcp_op(struct nvmet_fc_target_port *tgtport, int fcp_err = 0, active, aborted; u8 op = tgt_fcpreq->op; - spin_lock(&tfcp_req->reqlock); + spin_lock_irq(&tfcp_req->reqlock); fcpreq = tfcp_req->fcpreq; active = tfcp_req->active; aborted = tfcp_req->aborted; tfcp_req->active = true; - spin_unlock(&tfcp_req->reqlock); + spin_unlock_irq(&tfcp_req->reqlock); if (unlikely(active)) /* illegal - call while i/o active */ @@ -634,9 +634,9 @@ fcloop_fcp_op(struct nvmet_fc_target_port *tgtport, if (unlikely(aborted)) { /* target transport has aborted i/o prior */ - spin_lock(&tfcp_req->reqlock); + spin_lock_irq(&tfcp_req->reqlock); tfcp_req->active = false; - spin_unlock(&tfcp_req->reqlock); + spin_unlock_irq(&tfcp_req->reqlock); tgt_fcpreq->transferred_length = 0; tgt_fcpreq->fcp_error = -ECANCELED; tgt_fcpreq->done(tgt_fcpreq); @@ -693,9 +693,9 @@ fcloop_fcp_op(struct nvmet_fc_target_port *tgtport, break; } - 
spin_lock(&tfcp_req->reqlock); + spin_lock_irq(&tfcp_req->reqlock); tfcp_req->active = false; - spin_unlock(&tfcp_req->reqlock); + spin_unlock_irq(&tfcp_req->reqlock); tgt_fcpreq->transferred_length = xfrlen; tgt_fcpreq->fcp_error = fcp_err; @@ -715,9 +715,9 @@ fcloop_tgt_fcp_abort(struct nvmet_fc_target_port *tgtport, * (one doing io, other doing abort) and only kills ops posted * after the abort request */ - spin_lock(&tfcp_req->reqlock); + spin_lock_irq(&tfcp_req->reqlock); tfcp_req->aborted = true; - spin_unlock(&tfcp_req->reqlock); + spin_unlock_irq(&tfcp_req->reqlock); tfcp_req->status = NVME_SC_INTERNAL; @@ -765,7 +765,7 @@ fcloop_fcp_abort(struct nvme_fc_local_port *localport, return; /* break initiator/target relationship for io */ - spin_lock(&tfcp_req->reqlock); + spin_lock_irq(&tfcp_req->reqlock); switch (tfcp_req->inistate) { case INI_IO_START: case INI_IO_ACTIVE: @@ -775,11 +775,11 @@ fcloop_fcp_abort(struct nvme_fc_local_port *localport, abortio = false; break; default: - spin_unlock(&tfcp_req->reqlock); + spin_unlock_irq(&tfcp_req->reqlock); WARN_ON(1); return; } - spin_unlock(&tfcp_req->reqlock); + spin_unlock_irq(&tfcp_req->reqlock); if (abortio) /* leave the reference while the work item is scheduled */ diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c index 7a1cf6437a6a..de0bff70ebb6 100644 --- a/drivers/nvme/target/io-cmd-bdev.c +++ b/drivers/nvme/target/io-cmd-bdev.c @@ -8,6 +8,45 @@ #include <linux/module.h> #include "nvmet.h" +void nvmet_bdev_set_limits(struct block_device *bdev, struct nvme_id_ns *id) +{ + const struct queue_limits *ql = &bdev_get_queue(bdev)->limits; + /* Number of physical blocks per logical block. */ + const u32 ppl = ql->physical_block_size / ql->logical_block_size; + /* Physical blocks per logical block, 0's based. 
*/ + const __le16 ppl0b = to0based(ppl); + + /* + * For NVMe 1.2 and later, bit 1 indicates that the fields NAWUN, + * NAWUPF, and NACWU are defined for this namespace and should be + * used by the host for this namespace instead of the AWUN, AWUPF, + * and ACWU fields in the Identify Controller data structure. If + * any of these fields are zero that means that the corresponding + * field from the identify controller data structure should be used. + */ + id->nsfeat |= 1 << 1; + id->nawun = ppl0b; + id->nawupf = ppl0b; + id->nacwu = ppl0b; + + /* + * Bit 4 indicates that the fields NPWG, NPWA, NPDG, NPDA, and + * NOWS are defined for this namespace and should be used by + * the host for I/O optimization. + */ + id->nsfeat |= 1 << 4; + /* NPWG = Namespace Preferred Write Granularity. 0's based */ + id->npwg = ppl0b; + /* NPWA = Namespace Preferred Write Alignment. 0's based */ + id->npwa = id->npwg; + /* NPDG = Namespace Preferred Deallocate Granularity. 0's based */ + id->npdg = to0based(ql->discard_granularity / ql->logical_block_size); + /* NPDA = Namespace Preferred Deallocate Alignment */ + id->npda = id->npdg; + /* NOWS = Namespace Optimal Write Size */ + id->nows = to0based(ql->io_opt / ql->logical_block_size); +} + int nvmet_bdev_ns_enable(struct nvmet_ns *ns) { int ret; diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index dc270944bb25..6ee66c610739 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -365,6 +365,7 @@ u16 nvmet_set_feat_async_event(struct nvmet_req *req, u32 mask); void nvmet_execute_async_event(struct nvmet_req *req); u16 nvmet_parse_connect_cmd(struct nvmet_req *req); +void nvmet_bdev_set_limits(struct block_device *bdev, struct nvme_id_ns *id); u16 nvmet_bdev_parse_io_cmd(struct nvmet_req *req); u16 nvmet_file_parse_io_cmd(struct nvmet_req *req); u16 nvmet_parse_admin_cmd(struct nvmet_req *req); @@ -492,4 +493,11 @@ static inline u32 nvmet_rw_len(struct nvmet_req *req) } u16 
errno_to_nvme_status(struct nvmet_req *req, int errno); + +/* Convert a 32-bit number to a 16-bit 0's based number */ +static inline __le16 to0based(u32 a) +{ + return cpu_to_le16(max(1U, min(1U << 16, a)) - 1); +} + #endif /* _NVMET_H */ diff --git a/drivers/nvme/target/trace.c b/drivers/nvme/target/trace.c index cdcdd14c6408..6af11d493271 100644 --- a/drivers/nvme/target/trace.c +++ b/drivers/nvme/target/trace.c @@ -146,7 +146,7 @@ static const char *nvmet_trace_fabrics_common(struct trace_seq *p, u8 *spc) { const char *ret = trace_seq_buffer_ptr(p); - trace_seq_printf(p, "spcecific=%*ph", 24, spc); + trace_seq_printf(p, "specific=%*ph", 24, spc); trace_seq_putc(p, 0); return ret; } diff --git a/drivers/scsi/sd.h b/drivers/scsi/sd.h index 5796ace76225..38c50946fc42 100644 --- a/drivers/scsi/sd.h +++ b/drivers/scsi/sd.h @@ -213,8 +213,7 @@ extern blk_status_t sd_zbc_setup_reset_cmnd(struct scsi_cmnd *cmd); extern void sd_zbc_complete(struct scsi_cmnd *cmd, unsigned int good_bytes, struct scsi_sense_hdr *sshdr); extern int sd_zbc_report_zones(struct gendisk *disk, sector_t sector, - struct blk_zone *zones, unsigned int *nr_zones, - gfp_t gfp_mask); + struct blk_zone *zones, unsigned int *nr_zones); #else /* CONFIG_BLK_DEV_ZONED */ diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c index 7334024b64f1..db16c19e05c4 100644 --- a/drivers/scsi/sd_zbc.c +++ b/drivers/scsi/sd_zbc.c @@ -9,6 +9,8 @@ */ #include <linux/blkdev.h> +#include <linux/vmalloc.h> +#include <linux/sched/mm.h> #include <asm/unaligned.h> @@ -50,7 +52,7 @@ static void sd_zbc_parse_report(struct scsi_disk *sdkp, u8 *buf, /** * sd_zbc_do_report_zones - Issue a REPORT ZONES scsi command. 
* @sdkp: The target disk - * @buf: Buffer to use for the reply + * @buf: vmalloc-ed buffer to use for the reply * @buflen: the buffer size * @lba: Start LBA of the report * @partial: Do partial report @@ -79,7 +81,6 @@ static int sd_zbc_do_report_zones(struct scsi_disk *sdkp, unsigned char *buf, put_unaligned_be32(buflen, &cmd[10]); if (partial) cmd[14] = ZBC_REPORT_ZONE_PARTIAL; - memset(buf, 0, buflen); result = scsi_execute_req(sdp, cmd, DMA_FROM_DEVICE, buf, buflen, &sshdr, @@ -103,45 +104,83 @@ static int sd_zbc_do_report_zones(struct scsi_disk *sdkp, unsigned char *buf, return 0; } +/* + * Maximum number of zones to get with one report zones command. + */ +#define SD_ZBC_REPORT_MAX_ZONES 8192U + +/** + * Allocate a buffer for report zones reply. + * @sdkp: The target disk + * @nr_zones: Maximum number of zones to report + * @buflen: Size of the buffer allocated + * + * Try to allocate a reply buffer for the number of requested zones. + * The size of the buffer allocated may be smaller than requested to + * satisfy the device constraint (max_hw_sectors, max_segments, etc). + * + * Return the address of the allocated buffer and update @buflen with + * the size of the allocated buffer. + */ +static void *sd_zbc_alloc_report_buffer(struct scsi_disk *sdkp, + unsigned int nr_zones, size_t *buflen) +{ + struct request_queue *q = sdkp->disk->queue; + size_t bufsize; + void *buf; + + /* + * Report zone buffer size should be at most 64B times the number of + * zones requested plus the 64B reply header, but should be at least + * SECTOR_SIZE for ATA devices. + * Make sure that this size does not exceed the hardware capabilities. + * Furthermore, since the report zone command cannot be split, make + * sure that the allocated buffer can always be mapped by limiting the + * number of pages allocated to the HBA max segments limit. 
+ */ + nr_zones = min(nr_zones, SD_ZBC_REPORT_MAX_ZONES); + bufsize = roundup((nr_zones + 1) * 64, 512); + bufsize = min_t(size_t, bufsize, + queue_max_hw_sectors(q) << SECTOR_SHIFT); + bufsize = min_t(size_t, bufsize, queue_max_segments(q) << PAGE_SHIFT); + + buf = vzalloc(bufsize); + if (buf) + *buflen = bufsize; + + return buf; +} + /** * sd_zbc_report_zones - Disk report zones operation. * @disk: The target disk * @sector: Start 512B sector of the report * @zones: Array of zone descriptors * @nr_zones: Number of descriptors in the array - * @gfp_mask: Memory allocation mask * * Execute a report zones command on the target disk. */ int sd_zbc_report_zones(struct gendisk *disk, sector_t sector, - struct blk_zone *zones, unsigned int *nr_zones, - gfp_t gfp_mask) + struct blk_zone *zones, unsigned int *nr_zones) { struct scsi_disk *sdkp = scsi_disk(disk); - unsigned int i, buflen, nrz = *nr_zones; + unsigned int i, nrz = *nr_zones; unsigned char *buf; - size_t offset = 0; + size_t buflen = 0, offset = 0; int ret = 0; if (!sd_is_zoned(sdkp)) /* Not a zoned device */ return -EOPNOTSUPP; - /* - * Get a reply buffer for the number of requested zones plus a header, - * without exceeding the device maximum command size. For ATA disks, - * buffers must be aligned to 512B. 
- */ - buflen = min(queue_max_hw_sectors(disk->queue) << 9, - roundup((nrz + 1) * 64, 512)); - buf = kmalloc(buflen, gfp_mask); + buf = sd_zbc_alloc_report_buffer(sdkp, nrz, &buflen); if (!buf) return -ENOMEM; ret = sd_zbc_do_report_zones(sdkp, buf, buflen, sectors_to_logical(sdkp->device, sector), true); if (ret) - goto out_free_buf; + goto out; nrz = min(nrz, get_unaligned_be32(&buf[0]) / 64); for (i = 0; i < nrz; i++) { @@ -152,8 +191,8 @@ int sd_zbc_report_zones(struct gendisk *disk, sector_t sector, *nr_zones = nrz; -out_free_buf: - kfree(buf); +out: + kvfree(buf); return ret; } @@ -287,8 +326,6 @@ static int sd_zbc_check_zoned_characteristics(struct scsi_disk *sdkp, return 0; } -#define SD_ZBC_BUF_SIZE 131072U - /** * sd_zbc_check_zones - Check the device capacity and zone sizes * @sdkp: Target disk @@ -304,22 +341,28 @@ static int sd_zbc_check_zoned_characteristics(struct scsi_disk *sdkp, */ static int sd_zbc_check_zones(struct scsi_disk *sdkp, u32 *zblocks) { + size_t bufsize, buflen; + unsigned int noio_flag; u64 zone_blocks = 0; sector_t max_lba, block = 0; unsigned char *buf; unsigned char *rec; - unsigned int buf_len; - unsigned int list_length; int ret; u8 same; + /* Do all memory allocations as if GFP_NOIO was specified */ + noio_flag = memalloc_noio_save(); + /* Get a buffer */ - buf = kmalloc(SD_ZBC_BUF_SIZE, GFP_KERNEL); - if (!buf) - return -ENOMEM; + buf = sd_zbc_alloc_report_buffer(sdkp, SD_ZBC_REPORT_MAX_ZONES, + &bufsize); + if (!buf) { + ret = -ENOMEM; + goto out; + } /* Do a report zone to get max_lba and the same field */ - ret = sd_zbc_do_report_zones(sdkp, buf, SD_ZBC_BUF_SIZE, 0, false); + ret = sd_zbc_do_report_zones(sdkp, buf, bufsize, 0, false); if (ret) goto out_free; @@ -355,12 +398,12 @@ static int sd_zbc_check_zones(struct scsi_disk *sdkp, u32 *zblocks) do { /* Parse REPORT ZONES header */ - list_length = get_unaligned_be32(&buf[0]) + 64; + buflen = min_t(size_t, get_unaligned_be32(&buf[0]) + 64, + bufsize); rec = buf + 64; - 
buf_len = min(list_length, SD_ZBC_BUF_SIZE); /* Parse zone descriptors */ - while (rec < buf + buf_len) { + while (rec < buf + buflen) { u64 this_zone_blocks = get_unaligned_be64(&rec[8]); if (zone_blocks == 0) { @@ -376,8 +419,8 @@ static int sd_zbc_check_zones(struct scsi_disk *sdkp, u32 *zblocks) } if (block < sdkp->capacity) { - ret = sd_zbc_do_report_zones(sdkp, buf, SD_ZBC_BUF_SIZE, - block, true); + ret = sd_zbc_do_report_zones(sdkp, buf, bufsize, block, + true); if (ret) goto out_free; } @@ -408,7 +451,8 @@ out: } out_free: - kfree(buf); + memalloc_noio_restore(noio_flag); + kvfree(buf); return ret; } diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index db337e53aab3..5106008f5e28 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2911,7 +2911,7 @@ static int submit_extent_page(unsigned int opf, struct extent_io_tree *tree, bio = NULL; } else { if (wbc) - wbc_account_io(wbc, page, page_size); + wbc_account_cgroup_owner(wbc, page, page_size); return 0; } } @@ -2924,7 +2924,7 @@ static int submit_extent_page(unsigned int opf, struct extent_io_tree *tree, bio->bi_opf = opf; if (wbc) { wbc_init_bio(wbc, bio); - wbc_account_io(wbc, page, page_size); + wbc_account_cgroup_owner(wbc, page, page_size); } *bio_ret = bio; diff --git a/fs/buffer.c b/fs/buffer.c index 49a871570092..86a38b979323 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -3089,7 +3089,7 @@ static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh, if (wbc) { wbc_init_bio(wbc, bio); - wbc_account_io(wbc, bh->b_page, bh->b_size); + wbc_account_cgroup_owner(wbc, bh->b_page, bh->b_size); } submit_bio(bio); diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index a18a47a2a1d1..12ceadef32c5 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -396,7 +396,7 @@ submit_and_retry: ret = bio_add_page(io->io_bio, page, bh->b_size, bh_offset(bh)); if (ret != bh->b_size) goto submit_and_retry; - wbc_account_io(io->io_wbc, page, bh->b_size); + 
wbc_account_cgroup_owner(io->io_wbc, page, bh->b_size); io->io_next_block++; return 0; } diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 0ca530afc684..4eb2f3920140 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -470,7 +470,7 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio) } if (fio->io_wbc && !is_read_io(fio->op)) - wbc_account_io(fio->io_wbc, page, PAGE_SIZE); + wbc_account_cgroup_owner(fio->io_wbc, page, PAGE_SIZE); bio_set_op_attrs(bio, fio->op, fio->op_flags); @@ -513,7 +513,7 @@ alloc_new: } if (fio->io_wbc) - wbc_account_io(fio->io_wbc, page, PAGE_SIZE); + wbc_account_cgroup_owner(fio->io_wbc, page, PAGE_SIZE); inc_page_count(fio->sbi, WB_DATA_TYPE(page)); @@ -592,7 +592,7 @@ alloc_new: } if (fio->io_wbc) - wbc_account_io(fio->io_wbc, bio_page, PAGE_SIZE); + wbc_account_cgroup_owner(fio->io_wbc, bio_page, PAGE_SIZE); io->last_block_in_bio = fio->new_blkaddr; f2fs_trace_ios(fio, 0); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index d95a681ef7c9..6de6cda44031 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2818,9 +2818,7 @@ static int init_blkz_info(struct f2fs_sb_info *sbi, int devi) while (zones && sector < nr_sectors) { nr_zones = F2FS_REPORT_NR_ZONES; - err = blkdev_report_zones(bdev, sector, - zones, &nr_zones, - GFP_KERNEL); + err = blkdev_report_zones(bdev, sector, zones, &nr_zones); if (err) break; if (!nr_zones) { diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 9ebfb1b28430..542b02d170f8 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -270,6 +270,7 @@ void __inode_attach_wb(struct inode *inode, struct page *page) if (unlikely(cmpxchg(&inode->i_wb, NULL, wb))) wb_put(wb); } +EXPORT_SYMBOL_GPL(__inode_attach_wb); /** * locked_inode_to_wb_and_lock_list - determine a locked inode's wb and lock it @@ -582,6 +583,7 @@ void wbc_attach_and_unlock_inode(struct writeback_control *wbc, if (unlikely(wb_dying(wbc->wb))) inode_switch_wbs(inode, wbc->wb_id); } +EXPORT_SYMBOL_GPL(wbc_attach_and_unlock_inode); /** * 
wbc_detach_inode - disassociate wbc from inode and perform foreign detection @@ -701,9 +703,10 @@ void wbc_detach_inode(struct writeback_control *wbc) wb_put(wbc->wb); wbc->wb = NULL; } +EXPORT_SYMBOL_GPL(wbc_detach_inode); /** - * wbc_account_io - account IO issued during writeback + * wbc_account_cgroup_owner - account writeback to update inode cgroup ownership * @wbc: writeback_control of the writeback in progress * @page: page being written out * @bytes: number of bytes being written out @@ -712,8 +715,8 @@ void wbc_detach_inode(struct writeback_control *wbc) * controlled by @wbc. Keep the book for foreign inode detection. See * wbc_detach_inode(). */ -void wbc_account_io(struct writeback_control *wbc, struct page *page, - size_t bytes) +void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page, + size_t bytes) { struct cgroup_subsys_state *css; int id; @@ -724,7 +727,7 @@ void wbc_account_io(struct writeback_control *wbc, struct page *page, * behind a slow cgroup. Ultimately, we want pageout() to kick off * regular writeback instead of writing things out itself. */ - if (!wbc->wb) + if (!wbc->wb || wbc->no_cgroup_owner) return; css = mem_cgroup_css_from_page(page); @@ -750,7 +753,7 @@ void wbc_account_io(struct writeback_control *wbc, struct page *page, else wbc->wb_tcand_bytes -= min(bytes, wbc->wb_tcand_bytes); } -EXPORT_SYMBOL_GPL(wbc_account_io); +EXPORT_SYMBOL_GPL(wbc_account_cgroup_owner); /** * inode_congested - test whether an inode is congested diff --git a/fs/mpage.c b/fs/mpage.c index 436a85260394..a63620cdb73a 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -647,7 +647,7 @@ alloc_new: * the confused fail path above (OOM) will be very confused when * it finds all bh marked clean (i.e. 
it will not write anything) */ - wbc_account_io(wbc, page, PAGE_SIZE); + wbc_account_cgroup_owner(wbc, page, PAGE_SIZE); length = first_unmapped << blkbits; if (bio_add_page(bio, page, length, 0) < length) { bio = mpage_bio_submit(REQ_OP_WRITE, op_flags, bio); diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 761248ee2778..f16d5f196c6b 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -796,7 +796,7 @@ xfs_add_to_ioend( } wpc->ioend->io_size += len; - wbc_account_io(wbc, page, len); + wbc_account_cgroup_owner(wbc, page, len); } STATIC void diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index f9b029180241..35b31d176f74 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -48,6 +48,7 @@ extern spinlock_t bdi_lock; extern struct list_head bdi_list; extern struct workqueue_struct *bdi_wq; +extern struct workqueue_struct *bdi_async_bio_wq; static inline bool wb_has_dirty_io(struct bdi_writeback *wb) { diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 33f23a858438..689a58231288 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -132,13 +132,17 @@ struct blkcg_gq { struct blkg_policy_data *pd[BLKCG_MAX_POLS]; - struct rcu_head rcu_head; + spinlock_t async_bio_lock; + struct bio_list async_bios; + struct work_struct async_bio_work; atomic_t use_delay; atomic64_t delay_nsec; atomic64_t delay_start; u64 last_delay; int last_use; + + struct rcu_head rcu_head; }; typedef struct blkcg_policy_data *(blkcg_pol_alloc_cpd_fn)(gfp_t gfp); @@ -701,6 +705,15 @@ static inline bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg struct bio *bio) { return false; } #endif +bool __blkcg_punt_bio_submit(struct bio *bio); + +static inline bool blkcg_punt_bio_submit(struct bio *bio) +{ + if (bio->bi_opf & REQ_CGROUP_PUNT) + return __blkcg_punt_bio_submit(bio); + else + return false; +} static inline void blkcg_bio_issue_init(struct bio *bio) { @@ -848,6 +861,7 @@ 
static inline char *blkg_path(struct blkcg_gq *blkg) { return NULL; } static inline void blkg_get(struct blkcg_gq *blkg) { } static inline void blkg_put(struct blkcg_gq *blkg) { } +static inline bool blkcg_punt_bio_submit(struct bio *bio) { return false; } static inline void blkcg_bio_issue_init(struct bio *bio) { } static inline bool blkcg_bio_issue_check(struct request_queue *q, struct bio *bio) { return true; } diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 6a53799c3fe2..feff3fe4467e 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -311,6 +311,14 @@ enum req_flag_bits { __REQ_RAHEAD, /* read ahead, can fail anytime */ __REQ_BACKGROUND, /* background IO */ __REQ_NOWAIT, /* Don't wait if request will block */ + /* + * When a shared kthread needs to issue a bio for a cgroup, doing + * so synchronously can lead to priority inversions as the kthread + * can be trapped waiting for that cgroup. CGROUP_PUNT flag makes + * submit_bio() punt the actual issuing to a dedicated per-blkcg + * work item to avoid such priority inversions. + */ + __REQ_CGROUP_PUNT, /* command specific flags for REQ_OP_WRITE_ZEROES: */ __REQ_NOUNMAP, /* do not free blocks when zeroing */ @@ -337,6 +345,8 @@ enum req_flag_bits { #define REQ_RAHEAD (1ULL << __REQ_RAHEAD) #define REQ_BACKGROUND (1ULL << __REQ_BACKGROUND) #define REQ_NOWAIT (1ULL << __REQ_NOWAIT) +#define REQ_CGROUP_PUNT (1ULL << __REQ_CGROUP_PUNT) + #define REQ_NOUNMAP (1ULL << __REQ_NOUNMAP) #define REQ_HIPRI (1ULL << __REQ_HIPRI) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 0c482371c8b3..1ef375dafb1c 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -344,10 +344,15 @@ struct queue_limits { #ifdef CONFIG_BLK_DEV_ZONED +/* + * Maximum number of zones to report with a single report zones command. 
+ */ +#define BLK_ZONED_REPORT_MAX_ZONES 8192U + extern unsigned int blkdev_nr_zones(struct block_device *bdev); extern int blkdev_report_zones(struct block_device *bdev, sector_t sector, struct blk_zone *zones, - unsigned int *nr_zones, gfp_t gfp_mask); + unsigned int *nr_zones); extern int blkdev_reset_zones(struct block_device *bdev, sector_t sectors, sector_t nr_sectors, gfp_t gfp_mask); extern int blk_revalidate_disk_zones(struct gendisk *disk); @@ -681,7 +686,7 @@ static inline bool blk_queue_is_zoned(struct request_queue *q) } } -static inline unsigned int blk_queue_zone_sectors(struct request_queue *q) +static inline sector_t blk_queue_zone_sectors(struct request_queue *q) { return blk_queue_is_zoned(q) ? q->limits.chunk_sectors : 0; } @@ -1418,7 +1423,7 @@ static inline bool bdev_is_zoned(struct block_device *bdev) return false; } -static inline unsigned int bdev_zone_sectors(struct block_device *bdev) +static inline sector_t bdev_zone_sectors(struct block_device *bdev) { struct request_queue *q = bdev_get_queue(bdev); @@ -1673,8 +1678,7 @@ struct block_device_operations { /* this callback is with swap_lock and sometimes page table lock held */ void (*swap_slot_free_notify) (struct block_device *, unsigned long); int (*report_zones)(struct gendisk *, sector_t sector, - struct blk_zone *zones, unsigned int *nr_zones, - gfp_t gfp_mask); + struct blk_zone *zones, unsigned int *nr_zones); struct module *owner; const struct pr_ops *pr_ops; }; diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 2af9b1b419f1..f6b048902d6c 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -699,6 +699,7 @@ void cgroup_path_from_kernfs_id(const union kernfs_node_id *id, struct cgroup_subsys_state; struct cgroup; +static inline void css_get(struct cgroup_subsys_state *css) {} static inline void css_put(struct cgroup_subsys_state *css) {} static inline int cgroup_attach_task_all(struct task_struct *from, struct task_struct *t) { return 0; } diff --git 
a/include/linux/device-mapper.h b/include/linux/device-mapper.h index e1f51d607cc5..3b470cb03b66 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -95,8 +95,7 @@ typedef int (*dm_prepare_ioctl_fn) (struct dm_target *ti, struct block_device ** typedef int (*dm_report_zones_fn) (struct dm_target *ti, sector_t sector, struct blk_zone *zones, - unsigned int *nr_zones, - gfp_t gfp_mask); + unsigned int *nr_zones); /* * These iteration functions are typically used to check (and combine) diff --git a/include/linux/elevator.h b/include/linux/elevator.h index 169bb2e02516..17cd0078377c 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -75,7 +75,7 @@ struct elevator_type size_t icq_size; /* see iocontext.h */ size_t icq_align; /* ditto */ struct elv_fs_entry *elevator_attrs; - char elevator_name[ELV_NAME_MAX]; + const char *elevator_name; const char *elevator_alias; struct module *elevator_owner; #ifdef CONFIG_BLK_DEBUG_FS @@ -160,15 +160,6 @@ extern struct request *elv_rb_find(struct rb_root *, sector_t); #define ELEVATOR_INSERT_FLUSH 5 #define ELEVATOR_INSERT_SORT_MERGE 6 -/* - * return values from elevator_may_queue_fn - */ -enum { - ELV_MQUEUE_MAY, - ELV_MQUEUE_NO, - ELV_MQUEUE_MUST, -}; - #define rq_end_sector(rq) (blk_rq_pos(rq) + blk_rq_sectors(rq)) #define rb_entry_rq(node) rb_entry((node), struct request, rb_node) diff --git a/include/linux/nvme.h b/include/linux/nvme.h index d98b2d8baf4e..01aa6a6c241d 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -315,7 +315,7 @@ struct nvme_id_ns { __u8 nmic; __u8 rescap; __u8 fpi; - __u8 rsvd33; + __u8 dlfeat; __le16 nawun; __le16 nawupf; __le16 nacwu; @@ -324,11 +324,17 @@ struct nvme_id_ns { __le16 nabspf; __le16 noiob; __u8 nvmcap[16]; - __u8 rsvd64[28]; + __le16 npwg; + __le16 npwa; + __le16 npdg; + __le16 npda; + __le16 nows; + __u8 rsvd74[18]; __le32 anagrpid; __u8 rsvd96[3]; __u8 nsattr; - __u8 rsvd100[4]; + __le16 nvmsetid; + __le16 endgid; __u8 
nguid[16]; __u8 eui64[8]; struct nvme_lbaf lbaf[16]; diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 738a0c24874f..8945aac31392 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -11,6 +11,7 @@ #include <linux/flex_proportions.h> #include <linux/backing-dev-defs.h> #include <linux/blk_types.h> +#include <linux/blk-cgroup.h> struct bio; @@ -68,6 +69,17 @@ struct writeback_control { unsigned for_reclaim:1; /* Invoked from the page allocator */ unsigned range_cyclic:1; /* range_start is cyclic */ unsigned for_sync:1; /* sync(2) WB_SYNC_ALL writeback */ + + /* + * When writeback IOs are bounced through async layers, only the + * initial synchronous phase should be accounted towards inode + * cgroup ownership arbitration to avoid confusion. Later stages + * can set the following flag to disable the accounting. + */ + unsigned no_cgroup_owner:1; + + unsigned punt_to_cgroup:1; /* cgrp punting, see __REQ_CGROUP_PUNT */ + #ifdef CONFIG_CGROUP_WRITEBACK struct bdi_writeback *wb; /* wb this writeback is issued under */ struct inode *inode; /* inode being written out */ @@ -84,12 +96,27 @@ struct writeback_control { static inline int wbc_to_write_flags(struct writeback_control *wbc) { + int flags = 0; + + if (wbc->punt_to_cgroup) + flags = REQ_CGROUP_PUNT; + if (wbc->sync_mode == WB_SYNC_ALL) - return REQ_SYNC; + flags |= REQ_SYNC; else if (wbc->for_kupdate || wbc->for_background) - return REQ_BACKGROUND; + flags |= REQ_BACKGROUND; - return 0; + return flags; +} + +static inline struct cgroup_subsys_state * +wbc_blkcg_css(struct writeback_control *wbc) +{ +#ifdef CONFIG_CGROUP_WRITEBACK + if (wbc->wb) + return wbc->wb->blkcg_css; +#endif + return blkcg_root_css; } /* @@ -188,8 +215,8 @@ void wbc_attach_and_unlock_inode(struct writeback_control *wbc, struct inode *inode) __releases(&inode->i_lock); void wbc_detach_inode(struct writeback_control *wbc); -void wbc_account_io(struct writeback_control *wbc, struct page *page, - size_t 
bytes); +void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page, + size_t bytes); void cgroup_writeback_umount(void); /** @@ -291,8 +318,8 @@ static inline void wbc_init_bio(struct writeback_control *wbc, struct bio *bio) { } -static inline void wbc_account_io(struct writeback_control *wbc, - struct page *page, size_t bytes) +static inline void wbc_account_cgroup_owner(struct writeback_control *wbc, + struct page *page, size_t bytes) { } |