diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2019-06-08 12:12:11 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2019-06-08 12:12:11 -0700 |
commit | 8d72e5bd86cb405d8d8b9e92905d8cfffd08dde8 (patch) | |
tree | 65d5e73b08d7a573784dca677e374fd6fc25a694 | |
parent | 1b02caa319cf73ae89aced8714066a3a5bbe648b (diff) | |
parent | 6c70f899b8089ae23cdb4aa63050e3df4e20c71e (diff) | |
download | linux-8d72e5bd86cb405d8d8b9e92905d8cfffd08dde8.tar.bz2 |
Merge tag 'for-linus-20190608' of git://git.kernel.dk/linux-block
Pull block fixes from Jens Axboe:
- Allow symlink from the bfq.weight cgroup parameter to the general
weight (Angelo)
- Damien is new skd maintainer (Bart)
- NVMe pull request from Sagi, with a few small fixes.
- Ensure we set DMA segment size properly, dma-debug is now tripping on
these (Christoph)
- Remove useless debugfs_create() return check (Greg)
- Remove redundant unlikely() check on IS_ERR() (Kefeng)
- Fixup request freeing on exit (Ming)
* tag 'for-linus-20190608' of git://git.kernel.dk/linux-block:
block, bfq: add weight symlink to the bfq.weight cgroup parameter
cgroup: let a symlink too be created with a cftype file
block: free sched's request pool in blk_cleanup_queue
nvme-rdma: use dynamic dma mapping per command
nvme: Fix u32 overflow in the number of namespace list calculation
mmc: also set max_segment_size in the device
mtip32xx: also set max_segment_size in the device
rsxx: don't call dma_set_max_seg_size
nvme-pci: don't limit DMA segement size
block: Drop unlikely before IS_ERR(_OR_NULL)
block: aoe: no need to check return value of debugfs_create functions
nvmet: fix data_len to 0 for bdev-backed write_zeroes
MAINTAINERS: Hand over skd maintainership
nvme-tcp: fix queue mapping when queue count is limited
nvme-rdma: fix queue mapping when queue count is limited
-rw-r--r-- | MAINTAINERS | 2 | ||||
-rw-r--r-- | block/bfq-cgroup.c | 6 | ||||
-rw-r--r-- | block/blk-cgroup.c | 2 | ||||
-rw-r--r-- | block/blk-core.c | 13 | ||||
-rw-r--r-- | block/blk-mq-sched.c | 30 | ||||
-rw-r--r-- | block/blk-mq-sched.h | 1 | ||||
-rw-r--r-- | block/blk-sysfs.c | 2 | ||||
-rw-r--r-- | block/blk.h | 10 | ||||
-rw-r--r-- | block/elevator.c | 2 | ||||
-rw-r--r-- | drivers/block/aoe/aoeblk.c | 16 | ||||
-rw-r--r-- | drivers/block/mtip32xx/mtip32xx.c | 1 | ||||
-rw-r--r-- | drivers/block/rsxx/core.c | 1 | ||||
-rw-r--r-- | drivers/mmc/core/queue.c | 2 | ||||
-rw-r--r-- | drivers/nvme/host/core.c | 3 | ||||
-rw-r--r-- | drivers/nvme/host/pci.c | 6 | ||||
-rw-r--r-- | drivers/nvme/host/rdma.c | 152 | ||||
-rw-r--r-- | drivers/nvme/host/tcp.c | 57 | ||||
-rw-r--r-- | drivers/nvme/target/io-cmd-bdev.c | 1 | ||||
-rw-r--r-- | include/linux/cgroup-defs.h | 3 | ||||
-rw-r--r-- | kernel/cgroup/cgroup.c | 33 |
20 files changed, 251 insertions, 92 deletions
diff --git a/MAINTAINERS b/MAINTAINERS index 36a84614d6c3..df83b6769c17 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -14995,7 +14995,7 @@ S: Odd Fixes F: drivers/net/ethernet/adaptec/starfire* STEC S1220 SKD DRIVER -M: Bart Van Assche <bart.vanassche@wdc.com> +M: Damien Le Moal <Damien.LeMoal@wdc.com> L: linux-block@vger.kernel.org S: Maintained F: drivers/block/skd*[ch] diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index b3796a40a61a..59f46904cb11 100644 --- a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c @@ -1046,7 +1046,8 @@ struct blkcg_policy blkcg_policy_bfq = { struct cftype bfq_blkcg_legacy_files[] = { { .name = "bfq.weight", - .flags = CFTYPE_NOT_ON_ROOT, + .link_name = "weight", + .flags = CFTYPE_NOT_ON_ROOT | CFTYPE_SYMLINKED, .seq_show = bfq_io_show_weight, .write_u64 = bfq_io_set_weight_legacy, }, @@ -1166,7 +1167,8 @@ struct cftype bfq_blkcg_legacy_files[] = { struct cftype bfq_blkg_files[] = { { .name = "bfq.weight", - .flags = CFTYPE_NOT_ON_ROOT, + .link_name = "weight", + .flags = CFTYPE_NOT_ON_ROOT | CFTYPE_SYMLINKED, .seq_show = bfq_io_show_weight, .write = bfq_io_set_weight, }, diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index b97b479e4f64..1f7127b03490 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -881,7 +881,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, blkg_free(new_blkg); } else { blkg = blkg_create(pos, q, new_blkg); - if (unlikely(IS_ERR(blkg))) { + if (IS_ERR(blkg)) { ret = PTR_ERR(blkg); goto fail_unlock; } diff --git a/block/blk-core.c b/block/blk-core.c index ee1b35fe8572..8340f69670d8 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -320,6 +320,19 @@ void blk_cleanup_queue(struct request_queue *q) if (queue_is_mq(q)) blk_mq_exit_queue(q); + /* + * In theory, request pool of sched_tags belongs to request queue. + * However, the current implementation requires tag_set for freeing + * requests, so free the pool now. + * + * Queue has become frozen, there can't be any in-queue requests, so + * it is safe to free requests now. + */ + mutex_lock(&q->sysfs_lock); + if (q->elevator) + blk_mq_sched_free_requests(q); + mutex_unlock(&q->sysfs_lock); + percpu_ref_exit(&q->q_usage_counter); /* @q is and will stay empty, shutdown and put */ diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 74c6bb871f7e..500cb04901cc 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -475,14 +475,18 @@ static int blk_mq_sched_alloc_tags(struct request_queue *q, return ret; } +/* called in queue's release handler, tagset has gone away */ static void blk_mq_sched_tags_teardown(struct request_queue *q) { - struct blk_mq_tag_set *set = q->tag_set; struct blk_mq_hw_ctx *hctx; int i; - queue_for_each_hw_ctx(q, hctx, i) - blk_mq_sched_free_tags(set, hctx, i); + queue_for_each_hw_ctx(q, hctx, i) { + if (hctx->sched_tags) { + blk_mq_free_rq_map(hctx->sched_tags); + hctx->sched_tags = NULL; + } + } } int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e) @@ -523,6 +527,7 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e) ret = e->ops.init_hctx(hctx, i); if (ret) { eq = q->elevator; + blk_mq_sched_free_requests(q); blk_mq_exit_sched(q, eq); kobject_put(&eq->kobj); return ret; @@ -534,11 +539,30 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e) return 0; err: + blk_mq_sched_free_requests(q); blk_mq_sched_tags_teardown(q); q->elevator = NULL; return ret; } +/* + * called in either blk_queue_cleanup or elevator_switch, tagset + * is required for freeing requests + */ +void blk_mq_sched_free_requests(struct request_queue *q) +{ + struct blk_mq_hw_ctx *hctx; + int i; + + lockdep_assert_held(&q->sysfs_lock); + WARN_ON(!q->elevator); + + queue_for_each_hw_ctx(q, hctx, i) { + if (hctx->sched_tags) + blk_mq_free_rqs(q->tag_set, hctx->sched_tags, i); + } +} + void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e) { struct blk_mq_hw_ctx *hctx; diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h index c7bdb52367ac..3cf92cbbd8ac 100644 --- a/block/blk-mq-sched.h +++ b/block/blk-mq-sched.h @@ -28,6 +28,7 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx); int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e); void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e); +void blk_mq_sched_free_requests(struct request_queue *q); static inline bool blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio) diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 75b5281cc577..977c659dcd18 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -850,7 +850,7 @@ static void blk_exit_queue(struct request_queue *q) */ if (q->elevator) { ioc_clear_queue(q); - elevator_exit(q, q->elevator); + __elevator_exit(q, q->elevator); q->elevator = NULL; } diff --git a/block/blk.h b/block/blk.h index 91b3581b7c7a..7814aa207153 100644 --- a/block/blk.h +++ b/block/blk.h @@ -6,6 +6,7 @@ #include <linux/blk-mq.h> #include <xen/xen.h> #include "blk-mq.h" +#include "blk-mq-sched.h" /* Max future timer expiry for timeouts */ #define BLK_MAX_TIMEOUT (5 * HZ) @@ -176,10 +177,17 @@ void blk_insert_flush(struct request *rq); int elevator_init_mq(struct request_queue *q); int elevator_switch_mq(struct request_queue *q, struct elevator_type *new_e); -void elevator_exit(struct request_queue *, struct elevator_queue *); +void __elevator_exit(struct request_queue *, struct elevator_queue *); int elv_register_queue(struct request_queue *q); void elv_unregister_queue(struct request_queue *q); +static inline void elevator_exit(struct request_queue *q, + struct elevator_queue *e) +{ + blk_mq_sched_free_requests(q); + __elevator_exit(q, e); +} + struct hd_struct *__disk_get_part(struct gendisk *disk, int partno); #ifdef CONFIG_FAIL_IO_TIMEOUT diff --git a/block/elevator.c b/block/elevator.c index ec55d5fc0b3e..2f17d66d0e61 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -178,7 +178,7 @@ static void elevator_release(struct kobject *kobj) kfree(e); } -void elevator_exit(struct request_queue *q, struct elevator_queue *e) +void __elevator_exit(struct request_queue *q, struct elevator_queue *e) { mutex_lock(&e->sysfs_lock); if (e->type->ops.exit_sched) diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c index e2c6aae2d636..bd19f8af950b 100644 --- a/drivers/block/aoe/aoeblk.c +++ b/drivers/block/aoe/aoeblk.c @@ -196,7 +196,6 @@ static const struct file_operations aoe_debugfs_fops = { static void aoedisk_add_debugfs(struct aoedev *d) { - struct dentry *entry; char *p; if (aoe_debugfs_dir == NULL) @@ -207,15 +206,8 @@ aoedisk_add_debugfs(struct aoedev *d) else p++; BUG_ON(*p == '\0'); - entry = debugfs_create_file(p, 0444, aoe_debugfs_dir, d, - &aoe_debugfs_fops); - if (IS_ERR_OR_NULL(entry)) { - pr_info("aoe: cannot create debugfs file for %s\n", - d->gd->disk_name); - return; - } - BUG_ON(d->debugfs); - d->debugfs = entry; + d->debugfs = debugfs_create_file(p, 0444, aoe_debugfs_dir, d, + &aoe_debugfs_fops); } void aoedisk_rm_debugfs(struct aoedev *d) @@ -472,10 +464,6 @@ aoeblk_init(void) if (buf_pool_cache == NULL) return -ENOMEM; aoe_debugfs_dir = debugfs_create_dir("aoe", NULL); - if (IS_ERR_OR_NULL(aoe_debugfs_dir)) { - pr_info("aoe: cannot create debugfs directory\n"); - aoe_debugfs_dir = NULL; - } return 0; } diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index bacfdac7161c..a14b09ab3a41 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -3676,6 +3676,7 @@ skip_create_disk: blk_queue_physical_block_size(dd->queue, 4096); blk_queue_max_hw_sectors(dd->queue, 0xffff); blk_queue_max_segment_size(dd->queue, 0x400000); + dma_set_max_seg_size(&dd->pdev->dev, 0x400000); blk_queue_io_min(dd->queue, 4096); /* Set the capacity of the device in 512 byte sectors. */ diff --git a/drivers/block/rsxx/core.c b/drivers/block/rsxx/core.c index de9b2d2f8654..76b73ddf8fd7 100644 --- a/drivers/block/rsxx/core.c +++ b/drivers/block/rsxx/core.c @@ -767,7 +767,6 @@ static int rsxx_pci_probe(struct pci_dev *dev, goto failed_enable; pci_set_master(dev); - dma_set_max_seg_size(&dev->dev, RSXX_HW_BLK_SIZE); st = dma_set_mask(&dev->dev, DMA_BIT_MASK(64)); if (st) { diff --git a/drivers/mmc/core/queue.c b/drivers/mmc/core/queue.c index b5b9c6142f08..92900a095796 100644 --- a/drivers/mmc/core/queue.c +++ b/drivers/mmc/core/queue.c @@ -377,6 +377,8 @@ static void mmc_setup_queue(struct mmc_queue *mq, struct mmc_card *card) blk_queue_max_segment_size(mq->queue, round_down(host->max_seg_size, block_size)); + dma_set_max_seg_size(mmc_dev(host), queue_max_segment_size(mq->queue)); + INIT_WORK(&mq->recovery_work, mmc_mq_recovery_handler); INIT_WORK(&mq->complete_work, mmc_blk_mq_complete_work); diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 1b7c2afd84cb..120fb593d1da 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3400,7 +3400,8 @@ static int nvme_scan_ns_list(struct nvme_ctrl *ctrl, unsigned nn) { struct nvme_ns *ns; __le32 *ns_list; - unsigned i, j, nsid, prev = 0, num_lists = DIV_ROUND_UP(nn, 1024); + unsigned i, j, nsid, prev = 0; + unsigned num_lists = DIV_ROUND_UP_ULL((u64)nn, 1024); int ret = 0; ns_list = kzalloc(NVME_IDENTIFY_DATA_SIZE, GFP_KERNEL); diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index f562154551ce..524d6bd6d095 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2513,6 +2513,12 @@ static void nvme_reset_work(struct work_struct *work) */ dev->ctrl.max_hw_sectors = NVME_MAX_KB_SZ << 1; dev->ctrl.max_segments = NVME_MAX_SEGS; + + /* + * Don't limit the IOMMU merged segment size. + */ + dma_set_max_seg_size(dev->dev, 0xffffffff); + mutex_unlock(&dev->shutdown_lock); /* diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index f383146e7d0f..97f668a39ae1 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -213,6 +213,11 @@ static struct nvme_rdma_qe *nvme_rdma_alloc_ring(struct ib_device *ibdev, if (!ring) return NULL; + /* + * Bind the CQEs (post recv buffers) DMA mapping to the RDMA queue + * lifetime. It's safe, since any chage in the underlying RDMA device + * will issue error recovery and queue re-creation. + */ for (i = 0; i < ib_queue_size; i++) { if (nvme_rdma_alloc_qe(ibdev, &ring[i], capsule_size, dir)) goto out_free_ring; @@ -274,14 +279,9 @@ static int nvme_rdma_create_qp(struct nvme_rdma_queue *queue, const int factor) static void nvme_rdma_exit_request(struct blk_mq_tag_set *set, struct request *rq, unsigned int hctx_idx) { - struct nvme_rdma_ctrl *ctrl = set->driver_data; struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); - int queue_idx = (set == &ctrl->tag_set) ? hctx_idx + 1 : 0; - struct nvme_rdma_queue *queue = &ctrl->queues[queue_idx]; - struct nvme_rdma_device *dev = queue->device; - nvme_rdma_free_qe(dev->dev, &req->sqe, sizeof(struct nvme_command), - DMA_TO_DEVICE); + kfree(req->sqe.data); } static int nvme_rdma_init_request(struct blk_mq_tag_set *set, @@ -292,15 +292,11 @@ static int nvme_rdma_init_request(struct blk_mq_tag_set *set, struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); int queue_idx = (set == &ctrl->tag_set) ? hctx_idx + 1 : 0; struct nvme_rdma_queue *queue = &ctrl->queues[queue_idx]; - struct nvme_rdma_device *dev = queue->device; - struct ib_device *ibdev = dev->dev; - int ret; nvme_req(rq)->ctrl = &ctrl->ctrl; - ret = nvme_rdma_alloc_qe(ibdev, &req->sqe, sizeof(struct nvme_command), - DMA_TO_DEVICE); - if (ret) - return ret; + req->sqe.data = kzalloc(sizeof(struct nvme_command), GFP_KERNEL); + if (!req->sqe.data) + return -ENOMEM; req->queue = queue; @@ -641,34 +637,16 @@ static int nvme_rdma_alloc_io_queues(struct nvme_rdma_ctrl *ctrl) { struct nvmf_ctrl_options *opts = ctrl->ctrl.opts; struct ib_device *ibdev = ctrl->device->dev; - unsigned int nr_io_queues; + unsigned int nr_io_queues, nr_default_queues; + unsigned int nr_read_queues, nr_poll_queues; int i, ret; - nr_io_queues = min(opts->nr_io_queues, num_online_cpus()); - - /* - * we map queues according to the device irq vectors for - * optimal locality so we don't need more queues than - * completion vectors. - */ - nr_io_queues = min_t(unsigned int, nr_io_queues, - ibdev->num_comp_vectors); - - if (opts->nr_write_queues) { - ctrl->io_queues[HCTX_TYPE_DEFAULT] = - min(opts->nr_write_queues, nr_io_queues); - nr_io_queues += ctrl->io_queues[HCTX_TYPE_DEFAULT]; - } else { - ctrl->io_queues[HCTX_TYPE_DEFAULT] = nr_io_queues; - } - - ctrl->io_queues[HCTX_TYPE_READ] = nr_io_queues; - - if (opts->nr_poll_queues) { - ctrl->io_queues[HCTX_TYPE_POLL] = - min(opts->nr_poll_queues, num_online_cpus()); - nr_io_queues += ctrl->io_queues[HCTX_TYPE_POLL]; - } + nr_read_queues = min_t(unsigned int, ibdev->num_comp_vectors, + min(opts->nr_io_queues, num_online_cpus())); + nr_default_queues = min_t(unsigned int, ibdev->num_comp_vectors, + min(opts->nr_write_queues, num_online_cpus())); + nr_poll_queues = min(opts->nr_poll_queues, num_online_cpus()); + nr_io_queues = nr_read_queues + nr_default_queues + nr_poll_queues; ret = nvme_set_queue_count(&ctrl->ctrl, &nr_io_queues); if (ret) @@ -681,6 +659,34 @@ static int nvme_rdma_alloc_io_queues(struct nvme_rdma_ctrl *ctrl) dev_info(ctrl->ctrl.device, "creating %d I/O queues.\n", nr_io_queues); + if (opts->nr_write_queues && nr_read_queues < nr_io_queues) { + /* + * separate read/write queues + * hand out dedicated default queues only after we have + * sufficient read queues. + */ + ctrl->io_queues[HCTX_TYPE_READ] = nr_read_queues; + nr_io_queues -= ctrl->io_queues[HCTX_TYPE_READ]; + ctrl->io_queues[HCTX_TYPE_DEFAULT] = + min(nr_default_queues, nr_io_queues); + nr_io_queues -= ctrl->io_queues[HCTX_TYPE_DEFAULT]; + } else { + /* + * shared read/write queues + * either no write queues were requested, or we don't have + * sufficient queue count to have dedicated default queues. + */ + ctrl->io_queues[HCTX_TYPE_DEFAULT] = + min(nr_read_queues, nr_io_queues); + nr_io_queues -= ctrl->io_queues[HCTX_TYPE_DEFAULT]; + } + + if (opts->nr_poll_queues && nr_io_queues) { + /* map dedicated poll queues only if we have queues left */ + ctrl->io_queues[HCTX_TYPE_POLL] = + min(nr_poll_queues, nr_io_queues); + } + for (i = 1; i < ctrl->ctrl.queue_count; i++) { ret = nvme_rdma_alloc_queue(ctrl, i, ctrl->ctrl.sqsize + 1); @@ -769,6 +775,11 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl, ctrl->max_fr_pages = nvme_rdma_get_max_fr_pages(ctrl->device->dev); + /* + * Bind the async event SQE DMA mapping to the admin queue lifetime. + * It's safe, since any chage in the underlying RDMA device will issue + * error recovery and queue re-creation. + */ error = nvme_rdma_alloc_qe(ctrl->device->dev, &ctrl->async_event_sqe, sizeof(struct nvme_command), DMA_TO_DEVICE); if (error) @@ -1709,12 +1720,20 @@ static blk_status_t nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx, return nvmf_fail_nonready_command(&queue->ctrl->ctrl, rq); dev = queue->device->dev; + + req->sqe.dma = ib_dma_map_single(dev, req->sqe.data, + sizeof(struct nvme_command), + DMA_TO_DEVICE); + err = ib_dma_mapping_error(dev, req->sqe.dma); + if (unlikely(err)) + return BLK_STS_RESOURCE; + ib_dma_sync_single_for_cpu(dev, sqe->dma, sizeof(struct nvme_command), DMA_TO_DEVICE); ret = nvme_setup_cmd(ns, rq, c); if (ret) - return ret; + goto unmap_qe; blk_mq_start_request(rq); @@ -1739,10 +1758,16 @@ static blk_status_t nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx, } return BLK_STS_OK; + err: if (err == -ENOMEM || err == -EAGAIN) - return BLK_STS_RESOURCE; - return BLK_STS_IOERR; + ret = BLK_STS_RESOURCE; + else + ret = BLK_STS_IOERR; +unmap_qe: + ib_dma_unmap_single(dev, req->sqe.dma, sizeof(struct nvme_command), + DMA_TO_DEVICE); + return ret; } static int nvme_rdma_poll(struct blk_mq_hw_ctx *hctx) @@ -1755,25 +1780,36 @@ static int nvme_rdma_poll(struct blk_mq_hw_ctx *hctx) static void nvme_rdma_complete_rq(struct request *rq) { struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); + struct nvme_rdma_queue *queue = req->queue; + struct ib_device *ibdev = queue->device->dev; - nvme_rdma_unmap_data(req->queue, rq); + nvme_rdma_unmap_data(queue, rq); + ib_dma_unmap_single(ibdev, req->sqe.dma, sizeof(struct nvme_command), + DMA_TO_DEVICE); nvme_complete_rq(rq); } static int nvme_rdma_map_queues(struct blk_mq_tag_set *set) { struct nvme_rdma_ctrl *ctrl = set->driver_data; + struct nvmf_ctrl_options *opts = ctrl->ctrl.opts; - set->map[HCTX_TYPE_DEFAULT].queue_offset = 0; - set->map[HCTX_TYPE_DEFAULT].nr_queues = - ctrl->io_queues[HCTX_TYPE_DEFAULT]; - set->map[HCTX_TYPE_READ].nr_queues = ctrl->io_queues[HCTX_TYPE_READ]; - if (ctrl->ctrl.opts->nr_write_queues) { + if (opts->nr_write_queues && ctrl->io_queues[HCTX_TYPE_READ]) { /* separate read/write queues */ + set->map[HCTX_TYPE_DEFAULT].nr_queues = + ctrl->io_queues[HCTX_TYPE_DEFAULT]; + set->map[HCTX_TYPE_DEFAULT].queue_offset = 0; + set->map[HCTX_TYPE_READ].nr_queues = + ctrl->io_queues[HCTX_TYPE_READ]; set->map[HCTX_TYPE_READ].queue_offset = - ctrl->io_queues[HCTX_TYPE_DEFAULT]; + ctrl->io_queues[HCTX_TYPE_DEFAULT]; } else { - /* mixed read/write queues */ + /* shared read/write queues */ + set->map[HCTX_TYPE_DEFAULT].nr_queues = + ctrl->io_queues[HCTX_TYPE_DEFAULT]; + set->map[HCTX_TYPE_DEFAULT].queue_offset = 0; + set->map[HCTX_TYPE_READ].nr_queues = + ctrl->io_queues[HCTX_TYPE_DEFAULT]; set->map[HCTX_TYPE_READ].queue_offset = 0; } blk_mq_rdma_map_queues(&set->map[HCTX_TYPE_DEFAULT], @@ -1781,16 +1817,22 @@ static int nvme_rdma_map_queues(struct blk_mq_tag_set *set) blk_mq_rdma_map_queues(&set->map[HCTX_TYPE_READ], ctrl->device->dev, 0); - if (ctrl->ctrl.opts->nr_poll_queues) { + if (opts->nr_poll_queues && ctrl->io_queues[HCTX_TYPE_POLL]) { + /* map dedicated poll queues only if we have queues left */ set->map[HCTX_TYPE_POLL].nr_queues = ctrl->io_queues[HCTX_TYPE_POLL]; set->map[HCTX_TYPE_POLL].queue_offset = - ctrl->io_queues[HCTX_TYPE_DEFAULT]; - if (ctrl->ctrl.opts->nr_write_queues) - set->map[HCTX_TYPE_POLL].queue_offset += - ctrl->io_queues[HCTX_TYPE_READ]; + ctrl->io_queues[HCTX_TYPE_DEFAULT] + + ctrl->io_queues[HCTX_TYPE_READ]; blk_mq_map_queues(&set->map[HCTX_TYPE_POLL]); } + + dev_info(ctrl->ctrl.device, + "mapped %d/%d/%d default/read/poll queues.\n", + ctrl->io_queues[HCTX_TYPE_DEFAULT], + ctrl->io_queues[HCTX_TYPE_READ], + ctrl->io_queues[HCTX_TYPE_POLL]); + return 0; } diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 2b107a1d152b..08a2501b9357 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -111,6 +111,7 @@ struct nvme_tcp_ctrl { struct work_struct err_work; struct delayed_work connect_work; struct nvme_tcp_request async_req; + u32 io_queues[HCTX_MAX_TYPES]; }; static LIST_HEAD(nvme_tcp_ctrl_list); @@ -1564,6 +1565,35 @@ static unsigned int nvme_tcp_nr_io_queues(struct nvme_ctrl *ctrl) return nr_io_queues; } +static void nvme_tcp_set_io_queues(struct nvme_ctrl *nctrl, + unsigned int nr_io_queues) +{ + struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl); + struct nvmf_ctrl_options *opts = nctrl->opts; + + if (opts->nr_write_queues && opts->nr_io_queues < nr_io_queues) { + /* + * separate read/write queues + * hand out dedicated default queues only after we have + * sufficient read queues. + */ + ctrl->io_queues[HCTX_TYPE_READ] = opts->nr_io_queues; + nr_io_queues -= ctrl->io_queues[HCTX_TYPE_READ]; + ctrl->io_queues[HCTX_TYPE_DEFAULT] = + min(opts->nr_write_queues, nr_io_queues); + nr_io_queues -= ctrl->io_queues[HCTX_TYPE_DEFAULT]; + } else { + /* + * shared read/write queues + * either no write queues were requested, or we don't have + * sufficient queue count to have dedicated default queues. + */ + ctrl->io_queues[HCTX_TYPE_DEFAULT] = + min(opts->nr_io_queues, nr_io_queues); + nr_io_queues -= ctrl->io_queues[HCTX_TYPE_DEFAULT]; + } +} + static int nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl) { unsigned int nr_io_queues; @@ -1581,6 +1611,8 @@ static int nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl) dev_info(ctrl->device, "creating %d I/O queues.\n", nr_io_queues); + nvme_tcp_set_io_queues(ctrl, nr_io_queues); + return __nvme_tcp_alloc_io_queues(ctrl); } @@ -2089,23 +2121,34 @@ static blk_status_t nvme_tcp_queue_rq(struct blk_mq_hw_ctx *hctx, static int nvme_tcp_map_queues(struct blk_mq_tag_set *set) { struct nvme_tcp_ctrl *ctrl = set->driver_data; + struct nvmf_ctrl_options *opts = ctrl->ctrl.opts; - set->map[HCTX_TYPE_DEFAULT].queue_offset = 0; - set->map[HCTX_TYPE_READ].nr_queues = ctrl->ctrl.opts->nr_io_queues; - if (ctrl->ctrl.opts->nr_write_queues) { + if (opts->nr_write_queues && ctrl->io_queues[HCTX_TYPE_READ]) { /* separate read/write queues */ set->map[HCTX_TYPE_DEFAULT].nr_queues = - ctrl->ctrl.opts->nr_write_queues; + ctrl->io_queues[HCTX_TYPE_DEFAULT]; + set->map[HCTX_TYPE_DEFAULT].queue_offset = 0; + set->map[HCTX_TYPE_READ].nr_queues = + ctrl->io_queues[HCTX_TYPE_READ]; set->map[HCTX_TYPE_READ].queue_offset = - ctrl->ctrl.opts->nr_write_queues; + ctrl->io_queues[HCTX_TYPE_DEFAULT]; } else { - /* mixed read/write queues */ + /* shared read/write queues */ set->map[HCTX_TYPE_DEFAULT].nr_queues = - ctrl->ctrl.opts->nr_io_queues; + ctrl->io_queues[HCTX_TYPE_DEFAULT]; + set->map[HCTX_TYPE_DEFAULT].queue_offset = 0; + set->map[HCTX_TYPE_READ].nr_queues = + ctrl->io_queues[HCTX_TYPE_DEFAULT]; set->map[HCTX_TYPE_READ].queue_offset = 0; } blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]); blk_mq_map_queues(&set->map[HCTX_TYPE_READ]); + + dev_info(ctrl->ctrl.device, + "mapped %d/%d default/read queues.\n", + ctrl->io_queues[HCTX_TYPE_DEFAULT], + ctrl->io_queues[HCTX_TYPE_READ]); + return 0; } diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c index 3efc52f9c309..7a1cf6437a6a 100644 --- a/drivers/nvme/target/io-cmd-bdev.c +++ b/drivers/nvme/target/io-cmd-bdev.c @@ -293,6 +293,7 @@ u16 nvmet_bdev_parse_io_cmd(struct nvmet_req *req) return 0; case nvme_cmd_write_zeroes: req->execute = nvmet_bdev_execute_write_zeroes; + req->data_len = 0; return 0; default: pr_err("unhandled cmd %d on qid %d\n", cmd->common.opcode, diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 11e215d7937e..d71b079bb021 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -106,6 +106,8 @@ enum { CFTYPE_WORLD_WRITABLE = (1 << 4), /* (DON'T USE FOR NEW FILES) S_IWUGO */ CFTYPE_DEBUG = (1 << 5), /* create when cgroup_debug */ + CFTYPE_SYMLINKED = (1 << 6), /* pointed to by symlink too */ + /* internal flags, do not use outside cgroup core proper */ __CFTYPE_ONLY_ON_DFL = (1 << 16), /* only on default hierarchy */ __CFTYPE_NOT_ON_DFL = (1 << 17), /* not on default hierarchy */ @@ -543,6 +545,7 @@ struct cftype { * end of cftype array. */ char name[MAX_CFTYPE_NAME]; + char link_name[MAX_CFTYPE_NAME]; unsigned long private; /* diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 426a0026225c..155048b0eca2 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1460,8 +1460,8 @@ struct cgroup *task_cgroup_from_root(struct task_struct *task, static struct kernfs_syscall_ops cgroup_kf_syscall_ops; -static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft, - char *buf) +static char *cgroup_fill_name(struct cgroup *cgrp, const struct cftype *cft, + char *buf, bool write_link_name) { struct cgroup_subsys *ss = cft->ss; @@ -1471,13 +1471,26 @@ static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft, snprintf(buf, CGROUP_FILE_NAME_MAX, "%s%s.%s", dbg, cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name, - cft->name); + write_link_name ? cft->link_name : cft->name); } else { - strscpy(buf, cft->name, CGROUP_FILE_NAME_MAX); + strscpy(buf, write_link_name ? cft->link_name : cft->name, + CGROUP_FILE_NAME_MAX); } return buf; } +static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft, + char *buf) +{ + return cgroup_fill_name(cgrp, cft, buf, false); +} + +static char *cgroup_link_name(struct cgroup *cgrp, const struct cftype *cft, + char *buf) +{ + return cgroup_fill_name(cgrp, cft, buf, true); +} + /** * cgroup_file_mode - deduce file mode of a control file * @cft: the control file in question @@ -1636,6 +1649,9 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) } kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name)); + if (cft->flags & CFTYPE_SYMLINKED) + kernfs_remove_by_name(cgrp->kn, + cgroup_link_name(cgrp, cft, name)); } /** @@ -3821,6 +3837,7 @@ static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp, { char name[CGROUP_FILE_NAME_MAX]; struct kernfs_node *kn; + struct kernfs_node *kn_link; struct lock_class_key *key = NULL; int ret; @@ -3851,6 +3868,14 @@ static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp, spin_unlock_irq(&cgroup_file_kn_lock); } + if (cft->flags & CFTYPE_SYMLINKED) { + kn_link = kernfs_create_link(cgrp->kn, + cgroup_link_name(cgrp, cft, name), + kn); + if (IS_ERR(kn_link)) + return PTR_ERR(kn_link); + } + return 0; } |