From 500f9fbadef86466a435726192f4ca4df7d94236 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Mon, 19 Aug 2019 12:15:59 -0600
Subject: io_uring: fix potential hang with polled IO

If a request issue ends up being punted to async context to avoid
blocking, we can get into a situation where the original application
enters the poll loop for that very request before it has been issued.
This should not be an issue, except that the polling will hold the
io_uring uring_ctx mutex for the duration of the poll. When the async
worker has actually issued the request, it needs to acquire this mutex
to add the request to the poll issued list. Since the application
polling is already holding this mutex, the workqueue sleeps on the
mutex forever, and the application thus never gets a chance to poll for
the very request it was interested in.

Fix this by ensuring that the polling drops the uring_ctx occasionally
if it's not making any progress.

Reported-by: Jeffrey M. Birnbaum
Signed-off-by: Jens Axboe
---
 fs/io_uring.c | 36 +++++++++++++++++++++++++-----------
 1 file changed, 25 insertions(+), 11 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 24bbe3cb7ad4..36f04d0b197b 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -805,11 +805,34 @@ static void io_iopoll_reap_events(struct io_ring_ctx *ctx)
 static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events,
 			   long min)
 {
-	int ret = 0;
+	int iters, ret = 0;
+
+	/*
+	 * We disallow the app entering submit/complete with polling, but we
+	 * still need to lock the ring to prevent racing with polled issue
+	 * that got punted to a workqueue.
+	 */
+	mutex_lock(&ctx->uring_lock);
 
+	iters = 0;
 	do {
 		int tmin = 0;
 
+		/*
+		 * If a submit got punted to a workqueue, we can have the
+		 * application entering polling for a command before it gets
+		 * issued. That app will hold the uring_lock for the duration
+		 * of the poll right here, so we need to take a breather every
+		 * now and then to ensure that the issue has a chance to add
+		 * the poll to the issued list. Otherwise we can spin here
+		 * forever, while the workqueue is stuck trying to acquire the
+		 * very same mutex.
+		 */
+		if (!(++iters & 7)) {
+			mutex_unlock(&ctx->uring_lock);
+			mutex_lock(&ctx->uring_lock);
+		}
+
 		if (*nr_events < min)
 			tmin = min - *nr_events;
 
@@ -819,6 +842,7 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events,
 		ret = 0;
 	} while (min && !*nr_events && !need_resched());
 
+	mutex_unlock(&ctx->uring_lock);
 	return ret;
 }
 
@@ -2280,15 +2304,7 @@ static int io_sq_thread(void *data)
 			unsigned nr_events = 0;
 
 			if (ctx->flags & IORING_SETUP_IOPOLL) {
-				/*
-				 * We disallow the app entering submit/complete
-				 * with polling, but we still need to lock the
-				 * ring to prevent racing with polled issue
-				 * that got punted to a workqueue.
-				 */
-				mutex_lock(&ctx->uring_lock);
 				io_iopoll_check(ctx, &nr_events, 0);
-				mutex_unlock(&ctx->uring_lock);
 			} else {
 				/*
 				 * Normal IO, just pretend everything completed.
@@ -3190,9 +3206,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
 		min_complete = min(min_complete, ctx->cq_entries);
 
 		if (ctx->flags & IORING_SETUP_IOPOLL) {
-			mutex_lock(&ctx->uring_lock);
 			ret = io_iopoll_check(ctx, &nr_events, min_complete);
-			mutex_unlock(&ctx->uring_lock);
 		} else {
 			ret = io_cqring_wait(ctx, min_complete, sig, sigsz);
 		}
-- 
cgit v1.2.3

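The core of the fix is the breather: the poller periodically drops and re-takes the ring mutex so that the punted worker can acquire it. Below is a rough userspace sketch of that pattern, not kernel code; the names (ring_lock, async_worker) are invented for illustration, pthreads stand in for kernel primitives, and the sched_yield() only exists to make the hand-off reliable in this toy program.

/*
 * Userspace analogue of the "breather" pattern (illustration only):
 * a poller holds a mutex while spinning, and a worker can only publish
 * its result under that same mutex, so the poller must drop the lock
 * every few iterations.
 */
#include <pthread.h>
#include <sched.h>
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t ring_lock = PTHREAD_MUTEX_INITIALIZER;
static bool request_issued;             /* written only under ring_lock */

static void *async_worker(void *arg)
{
        (void)arg;
        usleep(1000);                   /* the punted issue happens "later" */
        pthread_mutex_lock(&ring_lock); /* blocks until the poller breathes */
        request_issued = true;
        pthread_mutex_unlock(&ring_lock);
        return NULL;
}

int main(void)
{
        pthread_t worker;
        unsigned int iters = 0;

        pthread_create(&worker, NULL, async_worker, NULL);

        pthread_mutex_lock(&ring_lock);
        while (!request_issued) {
                /* same idea as the "!(++iters & 7)" breather in the patch */
                if (!(++iters & 7)) {
                        pthread_mutex_unlock(&ring_lock);
                        sched_yield();  /* let the worker grab the mutex */
                        pthread_mutex_lock(&ring_lock);
                }
        }
        pthread_mutex_unlock(&ring_lock);

        pthread_join(worker, NULL);
        printf("request observed after %u iterations\n", iters);
        return 0;
}

Without the unlock/lock pair inside the loop, the worker never gets the mutex and the poller spins forever, which is exactly the hang the patch describes.
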
From 504db087aaccdb32af61539916409f7dca31ceb5 Mon Sep 17 00:00:00 2001
From: Anton Eidelman
Date: Mon, 12 Aug 2019 23:00:36 +0300
Subject: nvme-multipath: fix possible I/O hang when paths are updated

nvme_state_set_live() making a path available triggers requeue_work
in order to resubmit requests that ended up on the requeue_list when
no paths were available.

This requeue_work may race with concurrent nvme_ns_head_make_request()
calls that do not observe the live path yet. Such concurrent requests
may be made by either:
- New IO submission.
- Requeue_work triggered by nvme_failover_req() or another ana_work.

A race may cause requeue_work to capture the state of the requeue_list
before more requests get onto the list. These requests will stay on the
list forever unless requeue_work is triggered again.

In order to prevent such a race, nvme_state_set_live() should
synchronize_srcu(&head->srcu) before triggering the requeue_work,
preventing nvme_ns_head_make_request() from referencing an old snapshot
of the path list.

Reviewed-by: Christoph Hellwig
Signed-off-by: Anton Eidelman
Signed-off-by: Sagi Grimberg
Signed-off-by: Jens Axboe
---
 drivers/nvme/host/multipath.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 888d4543894e..af831d3d15d0 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -428,6 +428,7 @@ static void nvme_mpath_set_live(struct nvme_ns *ns)
 		srcu_read_unlock(&head->srcu, srcu_idx);
 	}
 
+	synchronize_srcu(&ns->head->srcu);
 	kblockd_schedule_work(&ns->head->requeue_work);
 }
 
-- 
cgit v1.2.3

From a89fcca8185633993018dc081d6b021d005e6d0b Mon Sep 17 00:00:00 2001
From: "Guilherme G. Piccoli"
Date: Wed, 14 Aug 2019 11:26:10 -0300
Subject: nvme: Fix cntlid validation when not using NVMEoF

Commit 1b1031ca63b2 ("nvme: validate cntlid during controller
initialisation") introduced a validation for controllers with duplicate
cntlid that runs on nvme_init_subsystem(). The problem is that the
validation relies on ctrl->cntlid, and this value is assigned (from the
id_ctrl value) after the call to nvme_init_subsystem() in
nvme_init_identify() for the non-fabrics scenario. That leads to
ctrl->cntlid always being 0 in case we have a physical set of
controllers in the same subsystem.

This patch fixes that by loading the discovered cntlid id_ctrl value
into ctrl->cntlid before the subsystem initialization, only for the
non-fabrics case. The patch was tested with emulated nvme devices
(qemu) having two controllers in a single subsystem. Without the patch,
we couldn't make it work, failing in the duplicate check; when running
with the patch, we could see the subsystem holding both controllers.

For the fabrics case we see ctrl->cntlid has a more intricate relation
with the admin connect, so we didn't change that.

Fixes: 1b1031ca63b2 ("nvme: validate cntlid during controller initialisation")
Signed-off-by: Guilherme G. Piccoli
Reviewed-by: Sagi Grimberg
Signed-off-by: Sagi Grimberg
Signed-off-by: Jens Axboe
---
 drivers/nvme/host/core.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index c258a1ce4b28..fea83fd95252 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -2597,6 +2597,9 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
 		goto out_free;
 	}
 
+	if (!(ctrl->ops->flags & NVME_F_FABRICS))
+		ctrl->cntlid = le16_to_cpu(id->cntlid);
+
 	if (!ctrl->identified) {
 		int i;
 
@@ -2697,7 +2700,6 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
 			goto out_free;
 		}
 	} else {
-		ctrl->cntlid = le16_to_cpu(id->cntlid);
 		ctrl->hmpre = le32_to_cpu(id->hmpre);
 		ctrl->hmmin = le32_to_cpu(id->hmmin);
 		ctrl->hmminds = le32_to_cpu(id->hmminds);
-- 
cgit v1.2.3

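The cntlid bug is purely one of ordering: the duplicate check runs before ctrl->cntlid has been copied from the Identify data, so every non-fabrics controller still looks like cntlid 0 when it registers. The self-contained sketch below reproduces that failure mode; it is not driver code, and the names (struct ctrl, register_ctrl) are invented for illustration.

/*
 * Illustration of the ordering bug: if two controllers register before
 * their cntlid is copied from the identify data, both register as
 * cntlid 0 and the second one is falsely rejected as a duplicate.
 */
#include <stdio.h>

struct ctrl {
        unsigned short cntlid;
};

static unsigned short registered[16];
static int nr_registered;

/* stand-in for the subsystem duplicate-cntlid validation */
static int register_ctrl(struct ctrl *c)
{
        for (int i = 0; i < nr_registered; i++) {
                if (registered[i] == c->cntlid) {
                        printf("duplicate cntlid %u rejected\n",
                               (unsigned int)c->cntlid);
                        return -1;
                }
        }
        registered[nr_registered++] = c->cntlid;
        return 0;
}

int main(void)
{
        struct ctrl a = { 0 }, b = { 0 };
        unsigned short id_a = 1, id_b = 2;      /* what Identify reports */

        /* Broken order: validate first, assign cntlid afterwards. */
        register_ctrl(&a);
        a.cntlid = id_a;
        register_ctrl(&b);      /* rejected: both still look like cntlid 0 */
        b.cntlid = id_b;

        /* Fixed order: copy the identify value before registering. */
        nr_registered = 0;
        a.cntlid = id_a;
        register_ctrl(&a);
        b.cntlid = id_b;
        register_ctrl(&b);      /* accepted: IDs 1 and 2 are distinct */
        return 0;
}
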
From cb32de1b7e2591f844f18a5513fde8e2bd49bce0 Mon Sep 17 00:00:00 2001
From: Mario Limonciello
Date: Fri, 16 Aug 2019 15:16:19 -0500
Subject: nvme: Add quirk for LiteON CL1 devices running FW 22301111

One of the components in LiteON CL1 devices has limitations that can be
encountered based upon boundary race conditions using the nvme bus
specific suspend to idle flow. When this situation occurs the drive
doesn't resume properly from suspend-to-idle.

LiteON has confirmed this problem and fixed it in the next firmware
version. As this firmware is already in the field, avoid running the
nvme specific suspend to idle flow.

Fixes: d916b1be94b6 ("nvme-pci: use host managed power state for suspend")
Link: http://lists.infradead.org/pipermail/linux-nvme/2019-July/thread.html
Signed-off-by: Mario Limonciello
Signed-off-by: Charles Hyde
Reviewed-by: Keith Busch
Signed-off-by: Sagi Grimberg
Signed-off-by: Jens Axboe
---
 drivers/nvme/host/core.c | 10 ++++++++++
 drivers/nvme/host/nvme.h |  5 +++++
 drivers/nvme/host/pci.c  |  3 ++-
 3 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index fea83fd95252..d3d6b7bd6903 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -2257,6 +2257,16 @@ static const struct nvme_core_quirk_entry core_quirks[] = {
 		.vid = 0x1179,
 		.mn = "THNSF5256GPUK TOSHIBA",
 		.quirks = NVME_QUIRK_NO_APST,
+	},
+	{
+		/*
+		 * This LiteON CL1-3D*-Q11 firmware version has a race
+		 * condition associated with actions related to suspend to idle
+		 * LiteON has resolved the problem in future firmware
+		 */
+		.vid = 0x14a4,
+		.fr = "22301111",
+		.quirks = NVME_QUIRK_SIMPLE_SUSPEND,
 	}
 };
 
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 778b3a0b6adb..2d678fb968c7 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -92,6 +92,11 @@ enum nvme_quirks {
 	 * Broken Write Zeroes.
 	 */
 	NVME_QUIRK_DISABLE_WRITE_ZEROES		= (1 << 9),
+
+	/*
+	 * Force simple suspend/resume path.
+	 */
+	NVME_QUIRK_SIMPLE_SUSPEND		= (1 << 10),
 };
 
 /*
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 6bd9b1033965..732d5b63ec05 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -2876,7 +2876,8 @@ static int nvme_suspend(struct device *dev)
 	 * state (which may not be possible if the link is up).
 	 */
 	if (pm_suspend_via_firmware() || !ctrl->npss ||
-	    !pcie_aspm_enabled(pdev)) {
+	    !pcie_aspm_enabled(pdev) ||
+	    (ndev->ctrl.quirks & NVME_QUIRK_SIMPLE_SUSPEND)) {
 		nvme_dev_disable(ndev, true);
 		return 0;
 	}
-- 
cgit v1.2.3

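The quirk takes effect by matching the controller's vendor ID and firmware revision against the core quirk table and ORing in the resulting flag bits. The sketch below shows that general shape only; it is not the actual nvme core matching code, and quirk_table and match_quirks are invented names.

/*
 * Sketch of table-driven quirk matching: every entry whose vendor ID
 * and firmware revision match contributes its quirk bits.
 */
#include <stdio.h>
#include <string.h>

#define QUIRK_SIMPLE_SUSPEND	(1u << 10)

struct quirk_entry {
        unsigned short vid;
        const char *fr;         /* NULL means "any firmware revision" */
        unsigned int quirks;
};

static const struct quirk_entry quirk_table[] = {
        { .vid = 0x14a4, .fr = "22301111", .quirks = QUIRK_SIMPLE_SUSPEND },
};

static unsigned int match_quirks(unsigned short vid, const char *fr)
{
        unsigned int quirks = 0;

        for (size_t i = 0; i < sizeof(quirk_table) / sizeof(quirk_table[0]); i++) {
                const struct quirk_entry *q = &quirk_table[i];

                if (q->vid && q->vid != vid)
                        continue;
                if (q->fr && strcmp(q->fr, fr))
                        continue;
                quirks |= q->quirks;
        }
        return quirks;
}

int main(void)
{
        unsigned int q = match_quirks(0x14a4, "22301111");

        printf("simple suspend forced: %s\n",
               q & QUIRK_SIMPLE_SUSPEND ? "yes" : "no");
        return 0;
}

With the flag set, the suspend path above falls back to nvme_dev_disable() instead of the host-managed power-state flow, which is exactly what the affected firmware needs.
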
From a3a0e43fd77013819e4b6f55e37e0efe8e35d805 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Tue, 20 Aug 2019 11:03:11 -0600
Subject: io_uring: don't enter poll loop if we have CQEs pending

We need to check if we have CQEs pending before starting a poll loop,
as those could be the events we will be spinning for (and hence we'll
find none). This can happen if a CQE triggers an error, or if it is
found by eg an IRQ before we get a chance to find it through polling.

Signed-off-by: Jens Axboe
---
 fs/io_uring.c | 22 +++++++++++++++-------
 1 file changed, 15 insertions(+), 7 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 36f04d0b197b..e7a43a354d91 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -679,6 +679,13 @@ static void io_put_req(struct io_kiocb *req)
 		io_free_req(req);
 }
 
+static unsigned io_cqring_events(struct io_cq_ring *ring)
+{
+	/* See comment at the top of this file */
+	smp_rmb();
+	return READ_ONCE(ring->r.tail) - READ_ONCE(ring->r.head);
+}
+
 /*
  * Find and free completed poll iocbs
  */
@@ -818,6 +825,14 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events,
 	do {
 		int tmin = 0;
 
+		/*
+		 * Don't enter poll loop if we already have events pending.
+		 * If we do, we can potentially be spinning for commands that
+		 * already triggered a CQE (eg in error).
+		 */
+		if (io_cqring_events(ctx->cq_ring))
+			break;
+
 		/*
 		 * If a submit got punted to a workqueue, we can have the
 		 * application entering polling for a command before it gets
@@ -2449,13 +2464,6 @@ static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit)
 	return submit;
 }
 
-static unsigned io_cqring_events(struct io_cq_ring *ring)
-{
-	/* See comment at the top of this file */
-	smp_rmb();
-	return READ_ONCE(ring->r.tail) - READ_ONCE(ring->r.head);
-}
-
 /*
  * Wait until events become available, if we don't already have some. The
  * application must reap them itself, as they reside on the shared cq ring.
-- 
cgit v1.2.3

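The helper being moved up is just a free-running head/tail subtraction. The toy below (plain C, no kernel barriers; the real helper pairs the reads with READ_ONCE() and smp_rmb()) shows why "tail - head" counts pending completions even across 32-bit wraparound, so "any events pending?" reduces to a non-zero test.

/*
 * Illustration of counting pending completions from free-running
 * unsigned producer/consumer indices on a completion ring.
 */
#include <stdio.h>

struct cq_ring {
        unsigned int head;      /* consumer index, free running */
        unsigned int tail;      /* producer index, free running */
};

static unsigned int cq_events_pending(const struct cq_ring *ring)
{
        /* unsigned subtraction is well defined across wraparound */
        return ring->tail - ring->head;
}

int main(void)
{
        struct cq_ring ring = { .head = 0xfffffffe, .tail = 0x00000001 };

        /* prints 3: three completions pending despite the 32-bit wrap */
        printf("pending: %u\n", cq_events_pending(&ring));
        return 0;
}
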
From 7035eef4496d95b69b0bc18e0bced09304e0afdf Mon Sep 17 00:00:00 2001
From: Song Liu
Date: Wed, 21 Aug 2019 11:45:25 -0700
Subject: md: update MAINTAINERS info

I have been reviewing patches for md in the past few months. Mark me
as the MD maintainer, as I have effectively been filling that role.

Cc: NeilBrown
Signed-off-by: Song Liu
Signed-off-by: Jens Axboe
---
 MAINTAINERS | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index 43604d6ab96c..eae4e0d1117a 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -14882,9 +14882,9 @@ F:	include/linux/arm_sdei.h
 F:	include/uapi/linux/arm_sdei.h
 
 SOFTWARE RAID (Multiple Disks) SUPPORT
-M:	Shaohua Li
+M:	Song Liu
 L:	linux-raid@vger.kernel.org
-T:	git git://git.kernel.org/pub/scm/linux/kernel/git/shli/md.git
+T:	git git://git.kernel.org/pub/scm/linux/kernel/git/song/md.git
 S:	Supported
 F:	drivers/md/Makefile
 F:	drivers/md/Kconfig
-- 
cgit v1.2.3

From 08f5439f1df25a6cf6cf4c72cf6c13025599ce67 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Wed, 21 Aug 2019 22:19:11 -0600
Subject: io_uring: add need_resched() check in inner poll loop

The outer poll loop checks for whether we need to reschedule, and
returns to userspace if we do. However, it's possible to get stuck in
the inner loop as well, if the CPU we are running on needs to
reschedule to finish the IO work.

Add the need_resched() check in the inner loop as well. This fixes a
potential hang if the kernel is configured with
CONFIG_PREEMPT_VOLUNTARY=y.

Reported-by: Sagi Grimberg
Reviewed-by: Sagi Grimberg
Tested-by: Sagi Grimberg
Signed-off-by: Jens Axboe
---
 fs/io_uring.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index e7a43a354d91..cfb48bd088e1 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -778,7 +778,7 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
 static int io_iopoll_getevents(struct io_ring_ctx *ctx, unsigned int *nr_events,
 				long min)
 {
-	while (!list_empty(&ctx->poll_list)) {
+	while (!list_empty(&ctx->poll_list) && !need_resched()) {
 		int ret;
 
 		ret = io_do_iopoll(ctx, nr_events, min);
@@ -805,6 +805,12 @@ static void io_iopoll_reap_events(struct io_ring_ctx *ctx)
 		unsigned int nr_events = 0;
 
 		io_iopoll_getevents(ctx, &nr_events, 1);
+
+		/*
+		 * Ensure we allow local-to-the-cpu processing to take place,
+		 * in this case we need to ensure that we reap all events.
+		 */
+		cond_resched();
 	}
 	mutex_unlock(&ctx->uring_lock);
 }
-- 
cgit v1.2.3

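As a userspace analogue of the inner-loop change (illustration only; resched_wanted and poll_once are invented stand-ins for need_resched() and the reap step), the busy poll loop below gives up its slice as soon as something signals that the CPU is wanted elsewhere, instead of relying on an outer-loop check alone.

/*
 * Userspace analogue of bailing out of a busy poll loop when a
 * reschedule is requested; the kernel uses need_resched()/cond_resched().
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

static atomic_bool resched_wanted;      /* stand-in for need_resched() */

static void *waker(void *arg)
{
        (void)arg;
        usleep(1000);                   /* some other task wants the CPU */
        atomic_store(&resched_wanted, true);
        return NULL;
}

static bool poll_once(void)
{
        return false;                   /* pretend no completion was found */
}

int main(void)
{
        pthread_t t;
        unsigned long spins = 0;

        pthread_create(&t, NULL, waker, NULL);

        /* inner poll loop: also stop when a reschedule is requested */
        while (!poll_once() && !atomic_load(&resched_wanted))
                spins++;

        pthread_join(t, NULL);
        printf("bailed out of the poll loop after %lu spins\n", spins);
        return 0;
}
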