summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorRuozhu Li <liruozhu@huawei.com>2022-06-23 14:45:39 +0800
committerChristoph Hellwig <hch@lst.de>2022-06-29 16:13:45 +0200
commitf7f70f4aa09dc43d7455c060143e86a017c30548 (patch)
treea10f57cdb8adf918a3bf54b1b5a8af4ff24c8928
parent1629de0e0373e04d68e88e6d9d3071fbf70b7ea8 (diff)
downloadlinux-f7f70f4aa09dc43d7455c060143e86a017c30548.tar.bz2
nvme: fix regression when disconnect a recovering ctrl
We encountered a problem where the disconnect command hangs. After analyzing the log and stack, we found that the triggering process is as follows: CPU0 CPU1 nvme_rdma_error_recovery_work nvme_rdma_teardown_io_queues nvme_do_delete_ctrl nvme_stop_queues nvme_remove_namespaces --clear ctrl->namespaces nvme_start_queues --no ns in ctrl->namespaces nvme_ns_remove return(because ctrl is deleting) blk_freeze_queue blk_mq_freeze_queue_wait --wait for ns to unquiesce to clean inflight IO, hang forever. This problem was not found in older kernels because we would flush err work in nvme_stop_ctrl before nvme_remove_namespaces. It does not seem to have been modified for functional reasons, so the patch can be reverted to solve the problem. Revert commit 794a4cb3d2f7 ("nvme: remove the .stop_ctrl callout") Signed-off-by: Ruozhu Li <liruozhu@huawei.com> Reviewed-by: Sagi Grimberg <sagi@grimberg.me> Signed-off-by: Christoph Hellwig <hch@lst.de>
-rw-r--r--drivers/nvme/host/core.c2
-rw-r--r--drivers/nvme/host/nvme.h1
-rw-r--r--drivers/nvme/host/rdma.c12
-rw-r--r--drivers/nvme/host/tcp.c10
4 files changed, 19 insertions, 6 deletions
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index b3d9c29aba1e..ec6ac298d8de 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -4595,6 +4595,8 @@ void nvme_stop_ctrl(struct nvme_ctrl *ctrl)
nvme_stop_failfast_work(ctrl);
flush_work(&ctrl->async_event_work);
cancel_work_sync(&ctrl->fw_act_work);
+ if (ctrl->ops->stop_ctrl)
+ ctrl->ops->stop_ctrl(ctrl);
}
EXPORT_SYMBOL_GPL(nvme_stop_ctrl);
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 0da94b233fed..5558f8812157 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -502,6 +502,7 @@ struct nvme_ctrl_ops {
void (*free_ctrl)(struct nvme_ctrl *ctrl);
void (*submit_async_event)(struct nvme_ctrl *ctrl);
void (*delete_ctrl)(struct nvme_ctrl *ctrl);
+ void (*stop_ctrl)(struct nvme_ctrl *ctrl);
int (*get_address)(struct nvme_ctrl *ctrl, char *buf, int size);
void (*print_device_info)(struct nvme_ctrl *ctrl);
};
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index f2a5e1ea508a..46c2dcf72f7e 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -1048,6 +1048,14 @@ static void nvme_rdma_teardown_io_queues(struct nvme_rdma_ctrl *ctrl,
}
}
+static void nvme_rdma_stop_ctrl(struct nvme_ctrl *nctrl)
+{
+ struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl);
+
+ cancel_work_sync(&ctrl->err_work);
+ cancel_delayed_work_sync(&ctrl->reconnect_work);
+}
+
static void nvme_rdma_free_ctrl(struct nvme_ctrl *nctrl)
{
struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl);
@@ -2252,9 +2260,6 @@ static const struct blk_mq_ops nvme_rdma_admin_mq_ops = {
static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown)
{
- cancel_work_sync(&ctrl->err_work);
- cancel_delayed_work_sync(&ctrl->reconnect_work);
-
nvme_rdma_teardown_io_queues(ctrl, shutdown);
nvme_stop_admin_queue(&ctrl->ctrl);
if (shutdown)
@@ -2304,6 +2309,7 @@ static const struct nvme_ctrl_ops nvme_rdma_ctrl_ops = {
.submit_async_event = nvme_rdma_submit_async_event,
.delete_ctrl = nvme_rdma_delete_ctrl,
.get_address = nvmf_get_address,
+ .stop_ctrl = nvme_rdma_stop_ctrl,
};
/*
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 009c2cf3f106..7a9e6ffa2342 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -2193,9 +2193,6 @@ static void nvme_tcp_error_recovery_work(struct work_struct *work)
static void nvme_tcp_teardown_ctrl(struct nvme_ctrl *ctrl, bool shutdown)
{
- cancel_work_sync(&to_tcp_ctrl(ctrl)->err_work);
- cancel_delayed_work_sync(&to_tcp_ctrl(ctrl)->connect_work);
-
nvme_tcp_teardown_io_queues(ctrl, shutdown);
nvme_stop_admin_queue(ctrl);
if (shutdown)
@@ -2235,6 +2232,12 @@ out_fail:
nvme_tcp_reconnect_or_remove(ctrl);
}
+static void nvme_tcp_stop_ctrl(struct nvme_ctrl *ctrl)
+{
+ cancel_work_sync(&to_tcp_ctrl(ctrl)->err_work);
+ cancel_delayed_work_sync(&to_tcp_ctrl(ctrl)->connect_work);
+}
+
static void nvme_tcp_free_ctrl(struct nvme_ctrl *nctrl)
{
struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
@@ -2556,6 +2559,7 @@ static const struct nvme_ctrl_ops nvme_tcp_ctrl_ops = {
.submit_async_event = nvme_tcp_submit_async_event,
.delete_ctrl = nvme_tcp_delete_ctrl,
.get_address = nvmf_get_address,
+ .stop_ctrl = nvme_tcp_stop_ctrl,
};
static bool