From adce7e9856798d4883f42c3d8429123707fa34e8 Mon Sep 17 00:00:00 2001
From: Edmund Nadolski
Date: Wed, 27 Nov 2019 10:17:43 -0700
Subject: nvme: remove unused return code from nvme_alloc_ns

The return code of nvme_alloc_ns is never used, so change it to void.

Reviewed-by: Christoph Hellwig
Signed-off-by: Edmund Nadolski
Signed-off-by: Keith Busch
---
 drivers/nvme/host/core.c | 21 ++++++---------------
 1 file changed, 6 insertions(+), 15 deletions(-)

(limited to 'drivers/nvme')

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index a4d8c90ee7cc..414076aaf52b 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -3480,7 +3480,7 @@ static int nvme_setup_streams_ns(struct nvme_ctrl *ctrl, struct nvme_ns *ns)
 	return 0;
 }
 
-static int nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
+static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
 {
 	struct nvme_ns *ns;
 	struct gendisk *disk;
@@ -3490,13 +3490,11 @@ static int nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
 
 	ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node);
 	if (!ns)
-		return -ENOMEM;
+		return;
 
 	ns->queue = blk_mq_init_queue(ctrl->tagset);
-	if (IS_ERR(ns->queue)) {
-		ret = PTR_ERR(ns->queue);
+	if (IS_ERR(ns->queue))
 		goto out_free_ns;
-	}
 
 	if (ctrl->opts && ctrl->opts->data_digest)
 		ns->queue->backing_dev_info->capabilities
@@ -3519,10 +3517,8 @@ static int nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
 	if (ret)
 		goto out_free_queue;
 
-	if (id->ncap == 0) {
-		ret = -EINVAL;
+	if (id->ncap == 0)	/* no namespace (legacy quirk) */
 		goto out_free_id;
-	}
 
 	ret = nvme_init_ns_head(ns, nsid, id);
 	if (ret)
@@ -3531,10 +3527,8 @@ static int nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
 	nvme_set_disk_name(disk_name, ns, ctrl, &flags);
 
 	disk = alloc_disk_node(0, node);
-	if (!disk) {
-		ret = -ENOMEM;
+	if (!disk)
 		goto out_unlink_ns;
-	}
 
 	disk->fops = &nvme_fops;
 	disk->private_data = ns;
@@ -3565,7 +3559,7 @@ static int nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
 	nvme_fault_inject_init(&ns->fault_inject, ns->disk->disk_name);
 
 	kfree(id);
-	return 0;
+	return;
  out_put_disk:
 	put_disk(ns->disk);
  out_unlink_ns:
@@ -3579,9 +3573,6 @@ static int nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
 	blk_cleanup_queue(ns->queue);
  out_free_ns:
 	kfree(ns);
-	if (ret > 0)
-		ret = blk_status_to_errno(nvme_error_status(ret));
-	return ret;
 }
 
 static void nvme_ns_remove(struct nvme_ns *ns)
-- cgit v1.2.3


From 527123c7deafd5aa921773f739887d610d59b437 Mon Sep 17 00:00:00 2001
From: Chaitanya Kulkarni
Date: Sun, 26 Jan 2020 12:35:44 -0800
Subject: nvmet: configfs code cleanup

This is a pure code cleanup patch which does not change any
functionality. It removes extra blank lines and gets rid of an else
branch that merely duplicates the return.
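The else-after-return pattern being removed is generic; a minimal
stand-alone illustration (not taken from the driver itself) of why the
else branch is redundant:

	/* Before: the else only restates the fall-through path. */
	static int demo_before(int v)
	{
		if (v > 0)
			return v;
		else
			return -v;
	}

	/* After: identical behavior, one less level of branching to read. */
	static int demo_after(int v)
	{
		if (v > 0)
			return v;
		return -v;
	}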
Reviewed-by: Christoph Hellwig
Signed-off-by: Chaitanya Kulkarni
Signed-off-by: Keith Busch
---
 drivers/nvme/target/configfs.c | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

(limited to 'drivers/nvme')

diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c
index 98613a45bd3b..403508a52e17 100644
--- a/drivers/nvme/target/configfs.c
+++ b/drivers/nvme/target/configfs.c
@@ -395,14 +395,12 @@ static ssize_t nvmet_ns_device_uuid_store(struct config_item *item,
 	struct nvmet_subsys *subsys = ns->subsys;
 	int ret = 0;
 
-
 	mutex_lock(&subsys->lock);
 	if (ns->enabled) {
 		ret = -EBUSY;
 		goto out_unlock;
 	}
 
-
 	if (uuid_parse(page, &ns->uuid))
 		ret = -EINVAL;
 
@@ -815,10 +813,10 @@ static ssize_t nvmet_subsys_attr_version_show(struct config_item *item,
 				(int)NVME_MAJOR(subsys->ver),
 				(int)NVME_MINOR(subsys->ver),
 				(int)NVME_TERTIARY(subsys->ver));
-	else
-		return snprintf(page, PAGE_SIZE, "%d.%d\n",
-				(int)NVME_MAJOR(subsys->ver),
-				(int)NVME_MINOR(subsys->ver));
+
+	return snprintf(page, PAGE_SIZE, "%d.%d\n",
+			(int)NVME_MAJOR(subsys->ver),
+			(int)NVME_MINOR(subsys->ver));
 }
 
 static ssize_t nvmet_subsys_attr_version_store(struct config_item *item,
@@ -828,7 +826,6 @@ static ssize_t nvmet_subsys_attr_version_store(struct config_item *item,
 	int major, minor, tertiary = 0;
 	int ret;
 
-
 	ret = sscanf(page, "%d.%d.%d\n", &major, &minor, &tertiary);
 	if (ret != 2 && ret != 3)
 		return -EINVAL;
-- cgit v1.2.3


From 94a39d61f80fcd679debda11e1ca02b88d90e67e Mon Sep 17 00:00:00 2001
From: Chaitanya Kulkarni
Date: Thu, 30 Jan 2020 10:29:31 -0800
Subject: nvmet: make ctrl-id configurable

This patch adds new target subsys attributes which allow the user to
optionally specify the target controller IDs that are then used in
nvmet_execute_identify_ctrl() to fill out the nvme_id_ctrl structure.

For example, when using a cluster setup with two nodes, with a
dual-ported NVMe drive exported from both nodes, the connection to the
host fails due to the duplicate controller ID, resulting in the
following error message:

"nvme nvmeX: Duplicate cntlid XXX with nvmeX, rejecting"

With this patch the user can now partition the controller IDs for each
subsystem by setting cntlid_min and cntlid_max. These values are used
at controller ID creation time. Partitioning the controller IDs per
subsystem yields disjoint ID spaces, which avoids the collision. When
the new attributes are not specified, the target falls back to the
original cntlid calculation method.
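To see why disjoint ranges cannot collide, here is a stand-alone sketch
of the allocation scheme (a hypothetical helper, not the driver code;
the real target allocates from an IDA bounded by cntlid_min/cntlid_max):

	#include <stdio.h>

	struct subsys_ids { unsigned short min, max, next; };

	/* Hand out the next free ID from this subsystem's window. */
	static int alloc_cntlid(struct subsys_ids *s)
	{
		if (s->next > s->max)
			return -1;	/* window exhausted */
		return s->next++;
	}

	int main(void)
	{
		/* Node A gets 1..100, node B gets 101..200. */
		struct subsys_ids a = { 1, 100, 1 }, b = { 101, 200, 101 };

		/* IDs drawn from the two windows can never be equal. */
		printf("A=%d B=%d\n", alloc_cntlid(&a), alloc_cntlid(&b));
		return 0;
	}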
Reviewed-by: Christoph Hellwig Signed-off-by: Chaitanya Kulkarni Signed-off-by: Keith Busch --- drivers/nvme/target/configfs.c | 62 ++++++++++++++++++++++++++++++++++++++++++ drivers/nvme/target/core.c | 8 ++++-- drivers/nvme/target/nvmet.h | 2 ++ 3 files changed, 70 insertions(+), 2 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c index 403508a52e17..71c50751b5a6 100644 --- a/drivers/nvme/target/configfs.c +++ b/drivers/nvme/target/configfs.c @@ -859,10 +859,72 @@ static ssize_t nvmet_subsys_attr_serial_store(struct config_item *item, } CONFIGFS_ATTR(nvmet_subsys_, attr_serial); +static ssize_t nvmet_subsys_attr_cntlid_min_show(struct config_item *item, + char *page) +{ + return snprintf(page, PAGE_SIZE, "%u\n", to_subsys(item)->cntlid_min); +} + +static ssize_t nvmet_subsys_attr_cntlid_min_store(struct config_item *item, + const char *page, size_t cnt) +{ + u16 cntlid_min; + + if (sscanf(page, "%hu\n", &cntlid_min) != 1) + return -EINVAL; + + if (cntlid_min == 0) + return -EINVAL; + + down_write(&nvmet_config_sem); + if (cntlid_min >= to_subsys(item)->cntlid_max) + goto out_unlock; + to_subsys(item)->cntlid_min = cntlid_min; + up_write(&nvmet_config_sem); + return cnt; + +out_unlock: + up_write(&nvmet_config_sem); + return -EINVAL; +} +CONFIGFS_ATTR(nvmet_subsys_, attr_cntlid_min); + +static ssize_t nvmet_subsys_attr_cntlid_max_show(struct config_item *item, + char *page) +{ + return snprintf(page, PAGE_SIZE, "%u\n", to_subsys(item)->cntlid_max); +} + +static ssize_t nvmet_subsys_attr_cntlid_max_store(struct config_item *item, + const char *page, size_t cnt) +{ + u16 cntlid_max; + + if (sscanf(page, "%hu\n", &cntlid_max) != 1) + return -EINVAL; + + if (cntlid_max == 0) + return -EINVAL; + + down_write(&nvmet_config_sem); + if (cntlid_max <= to_subsys(item)->cntlid_min) + goto out_unlock; + to_subsys(item)->cntlid_max = cntlid_max; + up_write(&nvmet_config_sem); + return cnt; + +out_unlock: + up_write(&nvmet_config_sem); + return -EINVAL; +} +CONFIGFS_ATTR(nvmet_subsys_, attr_cntlid_max); + static struct configfs_attribute *nvmet_subsys_attrs[] = { &nvmet_subsys_attr_attr_allow_any_host, &nvmet_subsys_attr_attr_version, &nvmet_subsys_attr_attr_serial, + &nvmet_subsys_attr_attr_cntlid_min, + &nvmet_subsys_attr_attr_cntlid_max, NULL, }; diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index 576de773b4db..48080c948692 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -1289,8 +1289,11 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn, if (!ctrl->sqs) goto out_free_cqs; + if (subsys->cntlid_min > subsys->cntlid_max) + goto out_free_cqs; + ret = ida_simple_get(&cntlid_ida, - NVME_CNTLID_MIN, NVME_CNTLID_MAX, + subsys->cntlid_min, subsys->cntlid_max, GFP_KERNEL); if (ret < 0) { status = NVME_SC_CONNECT_CTRL_BUSY | NVME_SC_DNR; @@ -1438,7 +1441,8 @@ struct nvmet_subsys *nvmet_subsys_alloc(const char *subsysnqn, kfree(subsys); return ERR_PTR(-ENOMEM); } - + subsys->cntlid_min = NVME_CNTLID_MIN; + subsys->cntlid_max = NVME_CNTLID_MAX; kref_init(&subsys->ref); mutex_init(&subsys->lock); diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index eda28b22a2c8..c2d518fb1789 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -211,6 +211,8 @@ struct nvmet_subsys { struct list_head namespaces; unsigned int nr_namespaces; unsigned int max_nsid; + u16 cntlid_min; + u16 cntlid_max; struct list_head ctrls; -- cgit v1.2.3 From 
013b7ebe5a0d70e2a02fd225174595e79c591b3e Mon Sep 17 00:00:00 2001
From: Mark Ruijter
Date: Thu, 30 Jan 2020 10:29:32 -0800
Subject: nvmet: make ctrl model configurable

This patch adds a new target subsys attribute which allows the user to
optionally specify a model name, which is then used in
nvmet_execute_identify_ctrl() to fill out the nvme_id_ctrl structure.
The default value for the model is set to "Linux" for backward
compatibility.

Reviewed-by: Christoph Hellwig
Reviewed-by: Sagi Grimberg
Signed-off-by: Mark Ruijter
[chaitanya.kulkarni@wdc.com
 *Use macro for default model, coding style fixes.
 *Use RCU for accessing model in configfs and in
  nvmet_execute_identify_ctrl().
]
Signed-off-by: Chaitanya Kulkarni
Signed-off-by: Keith Busch
---
 drivers/nvme/target/admin-cmd.c | 17 +++++++++--
 drivers/nvme/target/configfs.c  | 66 +++++++++++++++++++++++++++++++++++++++++
 drivers/nvme/target/core.c      |  1 +
 drivers/nvme/target/nvmet.h     |  8 +++++
 4 files changed, 90 insertions(+), 2 deletions(-)

(limited to 'drivers/nvme')

diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c
index 72a7e41f3018..19f949570625 100644
--- a/drivers/nvme/target/admin-cmd.c
+++ b/drivers/nvme/target/admin-cmd.c
@@ -322,12 +322,25 @@ static void nvmet_execute_get_log_page(struct nvmet_req *req)
 	nvmet_req_complete(req, NVME_SC_INVALID_FIELD | NVME_SC_DNR);
 }
 
+static void nvmet_id_set_model_number(struct nvme_id_ctrl *id,
+				      struct nvmet_subsys *subsys)
+{
+	const char *model = NVMET_DEFAULT_CTRL_MODEL;
+	struct nvmet_subsys_model *subsys_model;
+
+	rcu_read_lock();
+	subsys_model = rcu_dereference(subsys->model);
+	if (subsys_model)
+		model = subsys_model->number;
+	memcpy_and_pad(id->mn, sizeof(id->mn), model, strlen(model), ' ');
+	rcu_read_unlock();
+}
+
 static void nvmet_execute_identify_ctrl(struct nvmet_req *req)
 {
 	struct nvmet_ctrl *ctrl = req->sq->ctrl;
 	struct nvme_id_ctrl *id;
 	u16 status = 0;
-	const char model[] = "Linux";
 
 	id = kzalloc(sizeof(*id), GFP_KERNEL);
 	if (!id) {
@@ -342,7 +355,7 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req)
 	memset(id->sn, ' ', sizeof(id->sn));
 	bin2hex(id->sn, &ctrl->subsys->serial,
 		min(sizeof(ctrl->subsys->serial), sizeof(id->sn) / 2));
-	memcpy_and_pad(id->mn, sizeof(id->mn), model, sizeof(model) - 1, ' ');
+	nvmet_id_set_model_number(id, ctrl->subsys);
 	memcpy_and_pad(id->fr, sizeof(id->fr),
 		       UTS_RELEASE, strlen(UTS_RELEASE), ' ');
 
diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c
index 71c50751b5a6..1654064deea5 100644
--- a/drivers/nvme/target/configfs.c
+++ b/drivers/nvme/target/configfs.c
@@ -919,12 +919,78 @@ out_unlock:
 }
 CONFIGFS_ATTR(nvmet_subsys_, attr_cntlid_max);
 
+static ssize_t nvmet_subsys_attr_model_show(struct config_item *item,
+					    char *page)
+{
+	struct nvmet_subsys *subsys = to_subsys(item);
+	struct nvmet_subsys_model *subsys_model;
+	char *model = NVMET_DEFAULT_CTRL_MODEL;
+	int ret;
+
+	rcu_read_lock();
+	subsys_model = rcu_dereference(subsys->model);
+	if (subsys_model)
+		model = subsys_model->number;
+	ret = snprintf(page, PAGE_SIZE, "%s\n", model);
+	rcu_read_unlock();
+
+	return ret;
+}
+
+/* See Section 1.5 of NVMe 1.4 */
+static bool nvmet_is_ascii(const char c)
+{
+	return c >= 0x20 && c <= 0x7e;
+}
+
+static ssize_t nvmet_subsys_attr_model_store(struct config_item *item,
+					     const char *page, size_t count)
+{
+	struct nvmet_subsys *subsys = to_subsys(item);
+	struct nvmet_subsys_model *new_model;
+	char *new_model_number;
+	int pos = 0, len;
+
+	len = strcspn(page, "\n");
+	if (!len)
+		return -EINVAL;
+
+	for (pos = 0; pos < len; pos++) {
+		if (!nvmet_is_ascii(page[pos]))
+			return -EINVAL;
+	}
+
+	new_model_number = kstrndup(page, len, GFP_KERNEL);
+	if (!new_model_number)
+		return -ENOMEM;
+
+	new_model = kzalloc(sizeof(*new_model) + len + 1, GFP_KERNEL);
+	if (!new_model) {
+		kfree(new_model_number);
+		return -ENOMEM;
+	}
+	memcpy(new_model->number, new_model_number, len);
+
+	down_write(&nvmet_config_sem);
+	mutex_lock(&subsys->lock);
+	new_model = rcu_replace_pointer(subsys->model, new_model,
+					mutex_is_locked(&subsys->lock));
+	mutex_unlock(&subsys->lock);
+	up_write(&nvmet_config_sem);
+
+	kfree_rcu(new_model, rcuhead);
+
+	return count;
+}
+CONFIGFS_ATTR(nvmet_subsys_, attr_model);
+
 static struct configfs_attribute *nvmet_subsys_attrs[] = {
 	&nvmet_subsys_attr_attr_allow_any_host,
 	&nvmet_subsys_attr_attr_version,
 	&nvmet_subsys_attr_attr_serial,
 	&nvmet_subsys_attr_attr_cntlid_min,
 	&nvmet_subsys_attr_attr_cntlid_max,
+	&nvmet_subsys_attr_attr_model,
 	NULL,
 };
 
diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c
index 48080c948692..b685f99d56a1 100644
--- a/drivers/nvme/target/core.c
+++ b/drivers/nvme/target/core.c
@@ -1461,6 +1461,7 @@ static void nvmet_subsys_free(struct kref *ref)
 	WARN_ON_ONCE(!list_empty(&subsys->namespaces));
 
 	kfree(subsys->subsysnqn);
+	kfree_rcu(subsys->model, rcuhead);
 	kfree(subsys);
 }
 
diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h
index c2d518fb1789..42ba2ddd9e96 100644
--- a/drivers/nvme/target/nvmet.h
+++ b/drivers/nvme/target/nvmet.h
@@ -23,6 +23,7 @@
 #define NVMET_ASYNC_EVENTS		4
 #define NVMET_ERROR_LOG_SLOTS		128
 #define NVMET_NO_ERROR_LOC		((u16)-1)
+#define NVMET_DEFAULT_CTRL_MODEL	"Linux"
 
 /*
  * Supported optional AENs:
@@ -202,6 +203,11 @@ struct nvmet_ctrl {
 	struct nvme_error_slot slots[NVMET_ERROR_LOG_SLOTS];
 };
 
+struct nvmet_subsys_model {
+	struct rcu_head		rcuhead;
+	char			number[];
+};
+
 struct nvmet_subsys {
 	enum nvme_subsys_type	type;
 
@@ -229,6 +235,8 @@ struct nvmet_subsys {
 
 	struct config_group	namespaces_group;
 	struct config_group	allowed_hosts_group;
+
+	struct nvmet_subsys_model	__rcu *model;
 };
 
 static inline struct nvmet_subsys *to_subsys(struct config_item *item)
-- cgit v1.2.3


From d3a9b0cadf8cea1746a6bf525d049198e705836a Mon Sep 17 00:00:00 2001
From: Chaitanya Kulkarni
Date: Thu, 30 Jan 2020 10:29:33 -0800
Subject: nvmet: check sscanf value for subsys serial attr

Elsewhere in nvmet's configfs.c we check the return value of every
sscanf() call. Add the same check to nvmet_subsys_attr_serial_store().
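The underlying pattern is worth spelling out; a stand-alone sketch
(hypothetical helper, not the driver code) of parsing into a local
variable and validating before committing the result:

	#include <stdio.h>

	/* Parse a hex serial; only commit it if sscanf really matched. */
	static int store_serial(const char *page, unsigned long long *out)
	{
		unsigned long long serial;

		if (sscanf(page, "%llx\n", &serial) != 1)
			return -1;	/* reject, leave *out untouched */
		*out = serial;
		return 0;
	}

	int main(void)
	{
		unsigned long long s = 0;

		printf("%d %llx\n", store_serial("beef\n", &s), s);
		printf("%d %llx\n", store_serial("xyz\n", &s), s);
		return 0;
	}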
Reviewed-by: Christoph Hellwig
Reviewed-by: Sagi Grimberg
Signed-off-by: Chaitanya Kulkarni
Signed-off-by: Keith Busch
---
 drivers/nvme/target/configfs.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'drivers/nvme')

diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c
index 1654064deea5..7aa10788b7c8 100644
--- a/drivers/nvme/target/configfs.c
+++ b/drivers/nvme/target/configfs.c
@@ -849,10 +849,13 @@ static ssize_t nvmet_subsys_attr_serial_show(struct config_item *item,
 static ssize_t nvmet_subsys_attr_serial_store(struct config_item *item,
 					      const char *page, size_t count)
 {
-	struct nvmet_subsys *subsys = to_subsys(item);
+	u64 serial;
+
+	if (sscanf(page, "%llx\n", &serial) != 1)
+		return -EINVAL;
 
 	down_write(&nvmet_config_sem);
-	sscanf(page, "%llx\n", &subsys->serial);
+	to_subsys(item)->serial = serial;
 	up_write(&nvmet_config_sem);
 
 	return count;
-- cgit v1.2.3


From 9912ade355902adb9dacbec640fac23c4e73019d Mon Sep 17 00:00:00 2001
From: "Wunderlich, Mark"
Date: Thu, 16 Jan 2020 00:46:12 +0000
Subject: nvme-tcp: Set SO_PRIORITY for all host sockets

Enable the ability to associate all sockets related to NVMf TCP traffic
with a priority group that will perform optimized network processing
for this traffic class. Maintain the initial default behavior of using
priority zero.

Signed-off-by: Kiran Patil
Signed-off-by: Mark Wunderlich
Reviewed-by: Sagi Grimberg
Signed-off-by: Keith Busch
---
 drivers/nvme/host/tcp.c | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

(limited to 'drivers/nvme')

diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 49d4373b84eb..e384239af880 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -20,6 +20,16 @@
 
 struct nvme_tcp_queue;
 
+/* Define the socket priority to use for connections where it is desirable
+ * that the NIC consider performing optimized packet processing or filtering.
+ * A non-zero value is sufficient to indicate general consideration of any
+ * possible optimization. Making it a module param allows for alternative
+ * values that may be unique to some NIC implementations.
+ */
+static int so_priority;
+module_param(so_priority, int, 0644);
+MODULE_PARM_DESC(so_priority, "nvme tcp socket optimize priority");
+
 enum nvme_tcp_send_state {
 	NVME_TCP_SEND_CMD_PDU = 0,
 	NVME_TCP_SEND_H2C_PDU,
@@ -1309,6 +1319,17 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
 		goto err_sock;
 	}
 
+	if (so_priority > 0) {
+		ret = kernel_setsockopt(queue->sock, SOL_SOCKET, SO_PRIORITY,
+				(char *)&so_priority, sizeof(so_priority));
+		if (ret) {
+			dev_err(ctrl->ctrl.device,
+				"failed to set SO_PRIORITY sock opt, ret %d\n",
+				ret);
+			goto err_sock;
+		}
+	}
+
 	/* Set socket type of service */
 	if (nctrl->opts->tos >= 0) {
 		opt = nctrl->opts->tos;
-- cgit v1.2.3


From 43cc66892e81bb05283159e489a19cec177e6f9d Mon Sep 17 00:00:00 2001
From: "Wunderlich, Mark"
Date: Thu, 16 Jan 2020 00:46:16 +0000
Subject: nvmet-tcp: set SO_PRIORITY for accepted sockets

Enable the ability to associate all sockets related to NVMf TCP traffic
with a priority group that will perform optimized network processing
for this traffic class. Maintain the initial default behavior of using
priority zero.
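For reference, SO_PRIORITY carries the same semantics as the userspace
socket option (it sets sk_priority); a minimal userspace sketch of the
equivalent call the driver makes via kernel_setsockopt():

	#include <stdio.h>
	#include <sys/socket.h>

	int main(void)
	{
		int prio = 6;	/* any non-zero priority class */
		int fd = socket(AF_INET, SOCK_STREAM, 0);

		if (fd < 0 || setsockopt(fd, SOL_SOCKET, SO_PRIORITY,
					 &prio, sizeof(prio)) < 0) {
			perror("SO_PRIORITY");
			return 1;
		}
		printf("sk_priority set to %d\n", prio);
		return 0;
	}

In the driver the value comes from the so_priority module parameter
instead of a local variable.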
Signed-off-by: Kiran Patil
Signed-off-by: Mark Wunderlich
Reviewed-by: Sagi Grimberg
Signed-off-by: Keith Busch
---
 drivers/nvme/target/tcp.c | 26 ++++++++++++++++++++++
 1 file changed, 26 insertions(+)

(limited to 'drivers/nvme')

diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c
index af674fc0bb1e..cbff1038bdb3 100644
--- a/drivers/nvme/target/tcp.c
+++ b/drivers/nvme/target/tcp.c
@@ -19,6 +19,16 @@
 
 #define NVMET_TCP_DEF_INLINE_DATA_SIZE	(4 * PAGE_SIZE)
 
+/* Define the socket priority to use for connections where it is desirable
+ * that the NIC consider performing optimized packet processing or filtering.
+ * A non-zero value is sufficient to indicate general consideration of any
+ * possible optimization. Making it a module param allows for alternative
+ * values that may be unique to some NIC implementations.
+ */
+static int so_priority;
+module_param(so_priority, int, 0644);
+MODULE_PARM_DESC(so_priority, "nvmet tcp socket optimize priority");
+
 #define NVMET_TCP_RECV_BUDGET		8
 #define NVMET_TCP_SEND_BUDGET		8
 #define NVMET_TCP_IO_WORK_BUDGET	64
@@ -1433,6 +1443,13 @@ static int nvmet_tcp_set_queue_sock(struct nvmet_tcp_queue *queue)
 	if (ret)
 		return ret;
 
+	if (so_priority > 0) {
+		ret = kernel_setsockopt(sock, SOL_SOCKET, SO_PRIORITY,
+				(char *)&so_priority, sizeof(so_priority));
+		if (ret)
+			return ret;
+	}
+
 	/* Set socket type of service */
 	if (inet->rcv_tos > 0) {
 		int tos = inet->rcv_tos;
@@ -1622,6 +1639,15 @@ static int nvmet_tcp_add_port(struct nvmet_port *nport)
 		goto err_sock;
 	}
 
+	if (so_priority > 0) {
+		ret = kernel_setsockopt(port->sock, SOL_SOCKET, SO_PRIORITY,
+				(char *)&so_priority, sizeof(so_priority));
+		if (ret) {
+			pr_err("failed to set SO_PRIORITY sock opt %d\n", ret);
+			goto err_sock;
+		}
+	}
+
 	ret = kernel_bind(port->sock, (struct sockaddr *)&port->addr,
 			sizeof(port->addr));
 	if (ret) {
-- cgit v1.2.3


From 76171c6cdf832bc18b6d8207c9be94d78e54ed09 Mon Sep 17 00:00:00 2001
From: Sagi Grimberg
Date: Fri, 7 Feb 2020 17:13:53 -0800
Subject: nvme: expose hostnqn via sysfs for fabrics controllers

We allow userspace to connect with a custom hostnqn, which is useful
for certain use cases. However, there is no way to tell which hostnqn
was used to connect to a given controller. Expose this so userspace can
correlate controllers based on hostnqn.
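A trivial userspace sketch of the correlation use case (the sysfs path
follows the usual /sys/class/nvme/<ctrl>/ layout; "nvme0" is just an
example controller name):

	#include <stdio.h>

	int main(void)
	{
		char nqn[256] = "";
		FILE *f = fopen("/sys/class/nvme/nvme0/hostnqn", "r");

		if (!f)
			return perror("open"), 1;
		if (fgets(nqn, sizeof(nqn), f))
			printf("nvme0 hostnqn: %s", nqn);
		fclose(f);
		return 0;
	}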
Signed-off-by: Sagi Grimberg
Reviewed-by: Chaitanya Kulkarni
Signed-off-by: Keith Busch
---
 drivers/nvme/host/core.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'drivers/nvme')

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 414076aaf52b..4633acc0e68f 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -3242,6 +3242,16 @@ static ssize_t nvme_sysfs_show_subsysnqn(struct device *dev,
 }
 static DEVICE_ATTR(subsysnqn, S_IRUGO, nvme_sysfs_show_subsysnqn, NULL);
 
+static ssize_t nvme_sysfs_show_hostnqn(struct device *dev,
+					struct device_attribute *attr,
+					char *buf)
+{
+	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
+
+	return snprintf(buf, PAGE_SIZE, "%s\n", ctrl->opts->host->nqn);
+}
+static DEVICE_ATTR(hostnqn, S_IRUGO, nvme_sysfs_show_hostnqn, NULL);
+
 static ssize_t nvme_sysfs_show_address(struct device *dev,
 					 struct device_attribute *attr,
 					 char *buf)
@@ -3267,6 +3277,7 @@ static struct attribute *nvme_dev_attrs[] = {
 	&dev_attr_numa_node.attr,
 	&dev_attr_queue_count.attr,
 	&dev_attr_sqsize.attr,
+	&dev_attr_hostnqn.attr,
 	NULL
 };
 
@@ -3280,6 +3291,8 @@ static umode_t nvme_dev_attrs_are_visible(struct kobject *kobj,
 		return 0;
 	if (a == &dev_attr_address.attr && !ctrl->ops->get_address)
 		return 0;
+	if (a == &dev_attr_hostnqn.attr && !ctrl->opts)
+		return 0;
 
 	return a->mode;
 }
-- cgit v1.2.3


From 45fb19f766d94a642cd820fe523ac29f502eece2 Mon Sep 17 00:00:00 2001
From: Sagi Grimberg
Date: Fri, 7 Feb 2020 17:13:54 -0800
Subject: nvme: expose hostid via sysfs for fabrics controllers

We allow userspace to connect with a custom hostid, which is useful for
certain use cases. However, there is no way to tell which hostid was
used to connect to a given controller. Expose this so userspace can
correlate controllers based on hostid.

Signed-off-by: Sagi Grimberg
Reviewed-by: Chaitanya Kulkarni
Signed-off-by: Keith Busch
---
 drivers/nvme/host/core.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'drivers/nvme')

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 4633acc0e68f..720840ca875c 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -3252,6 +3252,16 @@ static ssize_t nvme_sysfs_show_hostnqn(struct device *dev,
 }
 static DEVICE_ATTR(hostnqn, S_IRUGO, nvme_sysfs_show_hostnqn, NULL);
 
+static ssize_t nvme_sysfs_show_hostid(struct device *dev,
+					struct device_attribute *attr,
+					char *buf)
+{
+	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
+
+	return snprintf(buf, PAGE_SIZE, "%pU\n", &ctrl->opts->host->id);
+}
+static DEVICE_ATTR(hostid, S_IRUGO, nvme_sysfs_show_hostid, NULL);
+
 static ssize_t nvme_sysfs_show_address(struct device *dev,
 					 struct device_attribute *attr,
 					 char *buf)
@@ -3278,6 +3288,7 @@ static struct attribute *nvme_dev_attrs[] = {
 	&dev_attr_queue_count.attr,
 	&dev_attr_sqsize.attr,
 	&dev_attr_hostnqn.attr,
+	&dev_attr_hostid.attr,
 	NULL
 };
 
@@ -3293,6 +3304,8 @@ static umode_t nvme_dev_attrs_are_visible(struct kobject *kobj,
 		return 0;
 	if (a == &dev_attr_hostnqn.attr && !ctrl->opts)
 		return 0;
+	if (a == &dev_attr_hostid.attr && !ctrl->opts)
+		return 0;
 
 	return a->mode;
 }
-- cgit v1.2.3


From 228914504cecd1c7d1279b5b884ab55d474cc87e Mon Sep 17 00:00:00 2001
From: Jean Delvare
Date: Tue, 11 Feb 2020 13:41:36 +0100
Subject: nvme: Don't deter users from enabling hwmon support

I see no good reason for the "If unsure, say N" advice in the
description of the NVME_HWMON configuration option. It is not
dangerous, it does not select any other option, and has a fairly low
overhead.
As the option is already not enabled by default, further suggesting
that hesitant users not enable it is not useful anyway. Unlike some
other options where the description alone may not be sufficient for
users to make a decision, NVME_HWMON is pretty simple to grasp in my
opinion, so just let the user do what they want.

Signed-off-by: Jean Delvare
Reviewed-by: Chaitanya Kulkarni
Reviewed-by: Guenter Roeck
Cc: Christoph Hellwig
Signed-off-by: Keith Busch
---
 drivers/nvme/host/Kconfig | 2 --
 1 file changed, 2 deletions(-)

(limited to 'drivers/nvme')

diff --git a/drivers/nvme/host/Kconfig b/drivers/nvme/host/Kconfig
index b9358db83e96..9c17ed32be64 100644
--- a/drivers/nvme/host/Kconfig
+++ b/drivers/nvme/host/Kconfig
@@ -32,8 +32,6 @@ config NVME_HWMON
 	  a hardware monitoring device will be created for each NVMe drive
 	  in the system.
 
-	  If unsure, say N.
-
 config NVME_FABRICS
 	tristate
 
-- cgit v1.2.3


From ad95a613ea447e2404e343ab3636c4d960fa9580 Mon Sep 17 00:00:00 2001
From: Chaitanya Kulkarni
Date: Wed, 19 Feb 2020 08:14:31 -0800
Subject: nvme: code cleanup nvme_identify_ns_desc()

The function nvme_identify_ns_desc() has 3 levels of nesting, which
makes its error messages exceed 80 characters per line, out of line
with the kernel coding standards and the rest of the NVMe subsystem
code. Add a helper function that processes each descriptor when the
command is successful, reducing the nesting and keeping the code under
80 characters per line.

Reviewed-by: Christoph Hellwig
Signed-off-by: Chaitanya Kulkarni
Signed-off-by: Keith Busch
---
 drivers/nvme/host/core.c | 76 +++++++++++++++++++++++++-----------------------
 1 file changed, 40 insertions(+), 36 deletions(-)

(limited to 'drivers/nvme')

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 720840ca875c..c4dbc852b5e9 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -1055,6 +1055,43 @@ static int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id)
 	return error;
 }
 
+static int nvme_process_ns_desc(struct nvme_ctrl *ctrl, struct nvme_ns_ids *ids,
+		struct nvme_ns_id_desc *cur)
+{
+	const char *warn_str = "ctrl returned bogus length:";
+	void *data = cur;
+
+	switch (cur->nidt) {
+	case NVME_NIDT_EUI64:
+		if (cur->nidl != NVME_NIDT_EUI64_LEN) {
+			dev_warn(ctrl->device, "%s %d for NVME_NIDT_EUI64\n",
+				 warn_str, cur->nidl);
+			return -1;
+		}
+		memcpy(ids->eui64, data + sizeof(*cur), NVME_NIDT_EUI64_LEN);
+		return NVME_NIDT_EUI64_LEN;
+	case NVME_NIDT_NGUID:
+		if (cur->nidl != NVME_NIDT_NGUID_LEN) {
+			dev_warn(ctrl->device, "%s %d for NVME_NIDT_NGUID\n",
+				 warn_str, cur->nidl);
+			return -1;
+		}
+		memcpy(ids->nguid, data + sizeof(*cur), NVME_NIDT_NGUID_LEN);
+		return NVME_NIDT_NGUID_LEN;
+	case NVME_NIDT_UUID:
+		if (cur->nidl != NVME_NIDT_UUID_LEN) {
+			dev_warn(ctrl->device, "%s %d for NVME_NIDT_UUID\n",
+				 warn_str, cur->nidl);
+			return -1;
+		}
+		uuid_copy(&ids->uuid, data + sizeof(*cur));
+		return NVME_NIDT_UUID_LEN;
+	default:
+		/* Skip unknown types */
+		return cur->nidl;
+	}
+}
+
 static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid,
 		struct nvme_ns_ids *ids)
 {
@@ -1083,42 +1120,9 @@ static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid,
 		if (cur->nidl == 0)
 			break;
 
-		switch (cur->nidt) {
-		case NVME_NIDT_EUI64:
-			if (cur->nidl != NVME_NIDT_EUI64_LEN) {
-				dev_warn(ctrl->device,
-					 "ctrl returned bogus length: %d for NVME_NIDT_EUI64\n",
-					 cur->nidl);
-				goto free_data;
-			}
-			len = NVME_NIDT_EUI64_LEN;
-			memcpy(ids->eui64, data + pos + sizeof(*cur), len);
-			break;
-		case NVME_NIDT_NGUID:
-			if (cur->nidl != NVME_NIDT_NGUID_LEN) {
-				dev_warn(ctrl->device,
-					 "ctrl returned bogus length: %d for NVME_NIDT_NGUID\n",
-					 cur->nidl);
-				goto free_data;
-			}
-			len = NVME_NIDT_NGUID_LEN;
-			memcpy(ids->nguid, data + pos + sizeof(*cur), len);
-			break;
-		case NVME_NIDT_UUID:
-			if (cur->nidl != NVME_NIDT_UUID_LEN) {
-				dev_warn(ctrl->device,
-					 "ctrl returned bogus length: %d for NVME_NIDT_UUID\n",
-					 cur->nidl);
-				goto free_data;
-			}
-			len = NVME_NIDT_UUID_LEN;
-			uuid_copy(&ids->uuid, data + pos + sizeof(*cur));
-			break;
-		default:
-			/* Skip unknown types */
-			len = cur->nidl;
-			break;
-		}
+		len = nvme_process_ns_desc(ctrl, ids, cur);
+		if (len < 0)
+			goto free_data;
 
 		len += sizeof(*cur);
 	}
-- cgit v1.2.3


From 94d2e705b6a6fe9c56a990c0cd31a7298cfcee9a Mon Sep 17 00:00:00 2001
From: Rupesh Girase
Date: Thu, 27 Feb 2020 22:15:26 +0530
Subject: nvme: log additional message for controller status

Log the controller status so that when a device is not ready we know
more about the issue, i.e. whether it lies within the kernel NVMe
subsystem or the controller is unhealthy.

Signed-off-by: Rupesh Girase
Reviewed-by: Sagi Grimberg
Reviewed-by: Chaitanya Kulkarni
Signed-off-by: Keith Busch
---
 drivers/nvme/host/core.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'drivers/nvme')

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index c4dbc852b5e9..c9988942d0aa 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -2083,8 +2083,8 @@ static int nvme_wait_ready(struct nvme_ctrl *ctrl, u64 cap, bool enabled)
 			return -EINTR;
 		if (time_after(jiffies, timeout)) {
 			dev_err(ctrl->device,
-				"Device not ready; aborting %s\n", enabled ?
-						"initialisation" : "reset");
+				"Device not ready; aborting %s, CSTS=0x%x\n",
+				enabled ? "initialisation" : "reset", csts);
 			return -ENODEV;
 		}
 	}
-- cgit v1.2.3


From 3e98c2443f5c7f127b5b7492a3089e92a1c85112 Mon Sep 17 00:00:00 2001
From: Josh Triplett
Date: Fri, 28 Feb 2020 18:52:28 -0800
Subject: nvme: Check for readiness more quickly, to speed up boot time

After initialization, nvme_wait_ready checks for readiness every 100ms,
even though the drive may be ready far sooner than that. This delays
system boot by hundreds of milliseconds. Reduce the delay, checking for
readiness every millisecond instead.

Boot-time tests on an AWS c5.12xlarge:

Before:
[    0.546936] initcall nvme_init+0x0/0x5b returned 0 after 37 usecs
...
[    0.764178] nvme nvme0: 2/0/0 default/read/poll queues
[    0.768424]  nvme0n1: p1
[    0.774132] EXT4-fs (nvme0n1p1): mounted filesystem with ordered data mode. Opts: (null)
[    0.774146] VFS: Mounted root (ext4 filesystem) on device 259:1.
...
[    0.788141] Run /sbin/init as init process

After:
[    0.537088] initcall nvme_init+0x0/0x5b returned 0 after 37 usecs
...
[    0.543457] nvme nvme0: 2/0/0 default/read/poll queues
[    0.548473]  nvme0n1: p1
[    0.554339] EXT4-fs (nvme0n1p1): mounted filesystem with ordered data mode. Opts: (null)
[    0.554344] VFS: Mounted root (ext4 filesystem) on device 259:1.
...
[ 0.567931] Run /sbin/init as init process Signed-off-by: Josh Triplett Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index c9988942d0aa..0e38e07a302f 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2078,7 +2078,7 @@ static int nvme_wait_ready(struct nvme_ctrl *ctrl, u64 cap, bool enabled) if ((csts & NVME_CSTS_RDY) == bit) break; - msleep(100); + usleep_range(1000, 2000); if (fatal_signal_pending(current)) return -EINTR; if (time_after(jiffies, timeout)) { -- cgit v1.2.3 From 6d525f9755c2ce444de2f3d604d41fbe4df91a8c Mon Sep 17 00:00:00 2001 From: Amit Engel Date: Sat, 29 Feb 2020 16:28:41 -0800 Subject: nvmet: check ncqr & nsqr for set-features cmd For set feature command when setting up NVME_FEAT_NUM_QUEUES, check Number of I/O Completion Queues Requested (NCQR) and Number of I/O Submission Queues Requested (NSQR) before we proceed, for invalid values (i.e. 65535) return an appropriate NVMe invalid field status. Signed-off-by: Amit Engel Signed-off-by: Chaitanya Kulkarni Reviewed-by: Sagi Grimberg Signed-off-by: Keith Busch --- drivers/nvme/target/admin-cmd.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'drivers/nvme') diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index 19f949570625..c0aa9c34c699 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -733,13 +733,22 @@ static void nvmet_execute_set_features(struct nvmet_req *req) { struct nvmet_subsys *subsys = req->sq->ctrl->subsys; u32 cdw10 = le32_to_cpu(req->cmd->common.cdw10); + u32 cdw11 = le32_to_cpu(req->cmd->common.cdw11); u16 status = 0; + u16 nsqr; + u16 ncqr; if (!nvmet_check_data_len(req, 0)) return; switch (cdw10 & 0xff) { case NVME_FEAT_NUM_QUEUES: + ncqr = (cdw11 >> 16) & 0xffff; + nsqr = cdw11 & 0xffff; + if (ncqr == 0xffff || nsqr == 0xffff) { + status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + break; + } nvmet_set_result(req, (subsys->max_qid - 1) | ((subsys->max_qid - 1) << 16)); break; -- cgit v1.2.3 From e2a366a4b0feaeba8f0bf6091ddd2ac27507a9d3 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Fri, 28 Feb 2020 21:45:19 +0300 Subject: nvme-pci: slimmer CQ head update Update CQ head with pre-increment operator. This saves subtraction of 1 and a few registers. Also update phase with "^= 1". This generates only one RMW instruction. 
ffffffff815ba150 : ffffffff815ba150: 0f b7 47 70 movzx eax,WORD PTR [rdi+0x70] ffffffff815ba154: 83 c0 01 add eax,0x1 ffffffff815ba157: 66 89 47 70 mov WORD PTR [rdi+0x70],ax ffffffff815ba15b: 66 3b 47 68 cmp ax,WORD PTR [rdi+0x68] ffffffff815ba15f: 74 01 je ffffffff815ba162 ffffffff815ba161: c3 ret ffffffff815ba162: 31 c0 xor eax,eax ffffffff815ba164: 80 77 74 01 ===> xor BYTE PTR [rdi+0x74],0x1 ffffffff815ba168: 66 89 47 70 mov WORD PTR [rdi+0x70],ax ffffffff815ba16c: c3 ret add/remove: 0/0 grow/shrink: 0/3 up/down: 0/-119 (-119) Function old new delta nvme_poll 690 678 -12 nvme_dev_disable 1230 1177 -53 nvme_irq 613 559 -54 Signed-off-by: Alexey Dobriyan --- drivers/nvme/host/pci.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index d3f23d6254e4..cdc9b6149d38 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -982,11 +982,9 @@ static void nvme_complete_cqes(struct nvme_queue *nvmeq, u16 start, u16 end) static inline void nvme_update_cq_head(struct nvme_queue *nvmeq) { - if (nvmeq->cq_head == nvmeq->q_depth - 1) { + if (++nvmeq->cq_head == nvmeq->q_depth) { nvmeq->cq_head = 0; - nvmeq->cq_phase = !nvmeq->cq_phase; - } else { - nvmeq->cq_head++; + nvmeq->cq_phase ^= 1; } } -- cgit v1.2.3 From bf392a5dc02a9b796f3da89fc5bb42856aca64cb Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 2 Mar 2020 08:45:04 -0800 Subject: nvme-pci: Remove tag from process cq The only user for tagged completion was for timeout handling. That user, though, really only cares if the timed out command is completed, which we can safely check within the timeout handler. Remove the tag check to simplify completion handling. Reviewed-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/pci.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index cdc9b6149d38..98d8ddd7aa0f 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -989,14 +989,13 @@ static inline void nvme_update_cq_head(struct nvme_queue *nvmeq) } static inline int nvme_process_cq(struct nvme_queue *nvmeq, u16 *start, - u16 *end, unsigned int tag) + u16 *end) { int found = 0; *start = nvmeq->cq_head; while (nvme_cqe_pending(nvmeq)) { - if (tag == -1U || nvmeq->cqes[nvmeq->cq_head].command_id == tag) - found++; + found++; nvme_update_cq_head(nvmeq); } *end = nvmeq->cq_head; @@ -1017,7 +1016,7 @@ static irqreturn_t nvme_irq(int irq, void *data) * the irq handler, even if that was on another CPU. */ rmb(); - nvme_process_cq(nvmeq, &start, &end, -1); + nvme_process_cq(nvmeq, &start, &end); wmb(); if (start != end) { @@ -1040,7 +1039,7 @@ static irqreturn_t nvme_irq_check(int irq, void *data) * Poll for completions any queue, including those not dedicated to polling. * Can be called from any context. 
*/ -static int nvme_poll_irqdisable(struct nvme_queue *nvmeq, unsigned int tag) +static int nvme_poll_irqdisable(struct nvme_queue *nvmeq) { struct pci_dev *pdev = to_pci_dev(nvmeq->dev->dev); u16 start, end; @@ -1053,11 +1052,11 @@ static int nvme_poll_irqdisable(struct nvme_queue *nvmeq, unsigned int tag) */ if (test_bit(NVMEQ_POLLED, &nvmeq->flags)) { spin_lock(&nvmeq->cq_poll_lock); - found = nvme_process_cq(nvmeq, &start, &end, tag); + found = nvme_process_cq(nvmeq, &start, &end); spin_unlock(&nvmeq->cq_poll_lock); } else { disable_irq(pci_irq_vector(pdev, nvmeq->cq_vector)); - found = nvme_process_cq(nvmeq, &start, &end, tag); + found = nvme_process_cq(nvmeq, &start, &end); enable_irq(pci_irq_vector(pdev, nvmeq->cq_vector)); } @@ -1075,8 +1074,7 @@ static int nvme_poll(struct blk_mq_hw_ctx *hctx) return 0; spin_lock(&nvmeq->cq_poll_lock); - found = nvme_process_cq(nvmeq, &start, &end, -1); - nvme_complete_cqes(nvmeq, start, end); + found = nvme_process_cq(nvmeq, &start, &end); spin_unlock(&nvmeq->cq_poll_lock); return found; @@ -1253,7 +1251,8 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) /* * Did we miss an interrupt? */ - if (nvme_poll_irqdisable(nvmeq, req->tag)) { + nvme_poll_irqdisable(nvmeq); + if (blk_mq_request_completed(req)) { dev_warn(dev->ctrl.device, "I/O %d QID %d timeout, completion polled\n", req->tag, nvmeq->qid); @@ -1396,7 +1395,7 @@ static void nvme_disable_admin_queue(struct nvme_dev *dev, bool shutdown) else nvme_disable_ctrl(&dev->ctrl); - nvme_poll_irqdisable(nvmeq, -1); + nvme_poll_irqdisable(nvmeq); } /* @@ -1411,7 +1410,7 @@ static void nvme_reap_pending_cqes(struct nvme_dev *dev) int i; for (i = dev->ctrl.queue_count - 1; i > 0; i--) { - nvme_process_cq(&dev->queues[i], &start, &end, -1); + nvme_process_cq(&dev->queues[i], &start, &end); nvme_complete_cqes(&dev->queues[i], start, end); } } -- cgit v1.2.3 From 324b494c286298d51bc5ed5107644ebe23f9dad6 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 2 Mar 2020 08:56:53 -0800 Subject: nvme-pci: Remove two-pass completions Completion handling had been done in two steps: find all new completions under a lock, then handle those completions outside the lock. This was done to make the locked section as short as possible so that other threads using the same lock wait less time. The driver no longer shares locks during completion, and is in fact lockless for interrupt driven queues, so the optimization no longer serves its original purpose. Replace the two-pass completion queue handler with a single pass that completes entries immediately. 
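For background, here is a stand-alone toy model of single-pass CQ
processing with a phase bit (illustrative only; the names are not taken
from the driver):

	#include <stdio.h>

	#define QD 4

	struct cqe { unsigned short status; };	/* bit 0 holds the phase */

	static struct cqe cq[QD];
	static unsigned int head;
	static unsigned int phase = 1;

	static int cqe_pending(void)
	{
		return (cq[head].status & 1) == phase;
	}

	/* Single pass: handle each entry as soon as it is observed. */
	static int process_cq(void)
	{
		int found = 0;

		while (cqe_pending()) {
			found++;
			printf("complete entry %u\n", head);	/* handle_cqe() */
			if (++head == QD) {
				head = 0;
				phase ^= 1;	/* a wrap flips the expected phase */
			}
		}
		return found;	/* caller rings the doorbell if found != 0 */
	}

	int main(void)
	{
		cq[0].status = 1;	/* device posted one entry in phase 1 */
		printf("processed %d\n", process_cq());
		return 0;
	}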
Reviewed-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/pci.c | 42 ++++++++++-------------------------------- 1 file changed, 10 insertions(+), 32 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 98d8ddd7aa0f..02f22c63adcf 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -971,15 +971,6 @@ static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx) nvme_end_request(req, cqe->status, cqe->result); } -static void nvme_complete_cqes(struct nvme_queue *nvmeq, u16 start, u16 end) -{ - while (start != end) { - nvme_handle_cqe(nvmeq, start); - if (++start == nvmeq->q_depth) - start = 0; - } -} - static inline void nvme_update_cq_head(struct nvme_queue *nvmeq) { if (++nvmeq->cq_head == nvmeq->q_depth) { @@ -988,19 +979,17 @@ static inline void nvme_update_cq_head(struct nvme_queue *nvmeq) } } -static inline int nvme_process_cq(struct nvme_queue *nvmeq, u16 *start, - u16 *end) +static inline int nvme_process_cq(struct nvme_queue *nvmeq) { int found = 0; - *start = nvmeq->cq_head; while (nvme_cqe_pending(nvmeq)) { found++; + nvme_handle_cqe(nvmeq, nvmeq->cq_head); nvme_update_cq_head(nvmeq); } - *end = nvmeq->cq_head; - if (*start != *end) + if (found) nvme_ring_cq_doorbell(nvmeq); return found; } @@ -1009,21 +998,16 @@ static irqreturn_t nvme_irq(int irq, void *data) { struct nvme_queue *nvmeq = data; irqreturn_t ret = IRQ_NONE; - u16 start, end; /* * The rmb/wmb pair ensures we see all updates from a previous run of * the irq handler, even if that was on another CPU. */ rmb(); - nvme_process_cq(nvmeq, &start, &end); + if (nvme_process_cq(nvmeq)) + ret = IRQ_HANDLED; wmb(); - if (start != end) { - nvme_complete_cqes(nvmeq, start, end); - return IRQ_HANDLED; - } - return ret; } @@ -1042,7 +1026,6 @@ static irqreturn_t nvme_irq_check(int irq, void *data) static int nvme_poll_irqdisable(struct nvme_queue *nvmeq) { struct pci_dev *pdev = to_pci_dev(nvmeq->dev->dev); - u16 start, end; int found; /* @@ -1052,29 +1035,27 @@ static int nvme_poll_irqdisable(struct nvme_queue *nvmeq) */ if (test_bit(NVMEQ_POLLED, &nvmeq->flags)) { spin_lock(&nvmeq->cq_poll_lock); - found = nvme_process_cq(nvmeq, &start, &end); + found = nvme_process_cq(nvmeq); spin_unlock(&nvmeq->cq_poll_lock); } else { disable_irq(pci_irq_vector(pdev, nvmeq->cq_vector)); - found = nvme_process_cq(nvmeq, &start, &end); + found = nvme_process_cq(nvmeq); enable_irq(pci_irq_vector(pdev, nvmeq->cq_vector)); } - nvme_complete_cqes(nvmeq, start, end); return found; } static int nvme_poll(struct blk_mq_hw_ctx *hctx) { struct nvme_queue *nvmeq = hctx->driver_data; - u16 start, end; bool found; if (!nvme_cqe_pending(nvmeq)) return 0; spin_lock(&nvmeq->cq_poll_lock); - found = nvme_process_cq(nvmeq, &start, &end); + found = nvme_process_cq(nvmeq); spin_unlock(&nvmeq->cq_poll_lock); return found; @@ -1406,13 +1387,10 @@ static void nvme_disable_admin_queue(struct nvme_dev *dev, bool shutdown) */ static void nvme_reap_pending_cqes(struct nvme_dev *dev) { - u16 start, end; int i; - for (i = dev->ctrl.queue_count - 1; i > 0; i--) { - nvme_process_cq(&dev->queues[i], &start, &end); - nvme_complete_cqes(&dev->queues[i], start, end); - } + for (i = dev->ctrl.queue_count - 1; i > 0; i--) + nvme_process_cq(&dev->queues[i]); } static int nvme_cmb_qdepth(struct nvme_dev *dev, int nr_io_queues, -- cgit v1.2.3 From fa059b856a593a7bddd4d3779ae8ab1380e05d91 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 4 Mar 2020 
09:17:01 -0800
Subject: nvme-pci: Simplify nvme_poll_irqdisable

The timeout handler can use the existing nvme_poll() if it needs to
check a polled queue, allowing nvme_poll_irqdisable() to handle only
irq driven queues for the remaining callers.

Signed-off-by: Keith Busch
---
 drivers/nvme/host/pci.c | 30 +++++++++++-------------------
 1 file changed, 11 insertions(+), 19 deletions(-)

(limited to 'drivers/nvme')

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 02f22c63adcf..f45e26e6af7e 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -1020,30 +1020,18 @@ static irqreturn_t nvme_irq_check(int irq, void *data)
 }
 
 /*
- * Poll for completions any queue, including those not dedicated to polling.
+ * Poll for completions for any interrupt driven queue
  * Can be called from any context.
 */
-static int nvme_poll_irqdisable(struct nvme_queue *nvmeq)
+static void nvme_poll_irqdisable(struct nvme_queue *nvmeq)
 {
 	struct pci_dev *pdev = to_pci_dev(nvmeq->dev->dev);
-	int found;
 
-	/*
-	 * For a poll queue we need to protect against the polling thread
-	 * using the CQ lock.  For normal interrupt driven threads we have
-	 * to disable the interrupt to avoid racing with it.
-	 */
-	if (test_bit(NVMEQ_POLLED, &nvmeq->flags)) {
-		spin_lock(&nvmeq->cq_poll_lock);
-		found = nvme_process_cq(nvmeq);
-		spin_unlock(&nvmeq->cq_poll_lock);
-	} else {
-		disable_irq(pci_irq_vector(pdev, nvmeq->cq_vector));
-		found = nvme_process_cq(nvmeq);
-		enable_irq(pci_irq_vector(pdev, nvmeq->cq_vector));
-	}
+	WARN_ON_ONCE(test_bit(NVMEQ_POLLED, &nvmeq->flags));
 
-	return found;
+	disable_irq(pci_irq_vector(pdev, nvmeq->cq_vector));
+	nvme_process_cq(nvmeq);
+	enable_irq(pci_irq_vector(pdev, nvmeq->cq_vector));
 }
 
 static int nvme_poll(struct blk_mq_hw_ctx *hctx)
@@ -1232,7 +1220,11 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
 	/*
 	 * Did we miss an interrupt?
 	 */
-	nvme_poll_irqdisable(nvmeq);
+	if (test_bit(NVMEQ_POLLED, &nvmeq->flags))
+		nvme_poll(req->mq_hctx);
+	else
+		nvme_poll_irqdisable(nvmeq);
+
 	if (blk_mq_request_completed(req)) {
 		dev_warn(dev->ctrl.device,
 			 "I/O %d QID %d timeout, completion polled\n",
-- cgit v1.2.3


From 40510a639ec08db81d5ff9c79856baf9dda94748 Mon Sep 17 00:00:00 2001
From: Sagi Grimberg
Date: Tue, 25 Feb 2020 15:53:09 -0800
Subject: nvme-tcp: optimize queue io_cpu assignment for multiple queue maps

Currently, queue io_cpu assignment is done sequentially for default,
read and poll queues based on queue id. This causes misalignment between
the context of the CPU initiating I/O and the I/O worker thread
processing queued requests or completions.

Instead, modify the queue io_cpu assignment to take the queue map
offsets into account. The io_cpu assignment starts at zero for each
queue map, which essentially aligns the read/poll queues to start over
the same CPU range as the default queues.

Testing performed by Mark with:
- ram device (nvmet)
- single CPU core (pinned)
- 100% 4k reads
- engine io_uring (not using sq_thread option)
- hipri flag set

Micro-benchmark results show a net gain of:
- increase of 18%-29% in IOPs
- reduction of 16%-22% in average latency
- reduction of 7%-23% in 99.99% latency

Baseline:
========
QDepth/Batch | IOPs [k] | Avg. Lat [us] | 99.99% Lat [us]
-----------------------------------------------------------------
1/1          | 32.4     | 30.11         | 50.94
32/8         | 179      | 168.20        | 371

CPU alignment:
=============
QDepth/Batch | IOPs [k] | Avg. 
Lat [us] | 99.99% Lat [us] ----------------------------------------------------------------- 1/1 | 38.5 | 25.18 | 39.16 32/8 | 231 | 130.75 | 343 Reported-by: Mark Wunderlich Signed-off-by: Sagi Grimberg Signed-off-by: Keith Busch --- drivers/nvme/host/tcp.c | 62 ++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 56 insertions(+), 6 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index e384239af880..11a7c26f8573 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -1258,13 +1258,67 @@ free_icreq: return ret; } +static bool nvme_tcp_admin_queue(struct nvme_tcp_queue *queue) +{ + return nvme_tcp_queue_id(queue) == 0; +} + +static bool nvme_tcp_default_queue(struct nvme_tcp_queue *queue) +{ + struct nvme_tcp_ctrl *ctrl = queue->ctrl; + int qid = nvme_tcp_queue_id(queue); + + return !nvme_tcp_admin_queue(queue) && + qid < 1 + ctrl->io_queues[HCTX_TYPE_DEFAULT]; +} + +static bool nvme_tcp_read_queue(struct nvme_tcp_queue *queue) +{ + struct nvme_tcp_ctrl *ctrl = queue->ctrl; + int qid = nvme_tcp_queue_id(queue); + + return !nvme_tcp_admin_queue(queue) && + !nvme_tcp_default_queue(queue) && + qid < 1 + ctrl->io_queues[HCTX_TYPE_DEFAULT] + + ctrl->io_queues[HCTX_TYPE_READ]; +} + +static bool nvme_tcp_poll_queue(struct nvme_tcp_queue *queue) +{ + struct nvme_tcp_ctrl *ctrl = queue->ctrl; + int qid = nvme_tcp_queue_id(queue); + + return !nvme_tcp_admin_queue(queue) && + !nvme_tcp_default_queue(queue) && + !nvme_tcp_read_queue(queue) && + qid < 1 + ctrl->io_queues[HCTX_TYPE_DEFAULT] + + ctrl->io_queues[HCTX_TYPE_READ] + + ctrl->io_queues[HCTX_TYPE_POLL]; +} + +static void nvme_tcp_set_queue_io_cpu(struct nvme_tcp_queue *queue) +{ + struct nvme_tcp_ctrl *ctrl = queue->ctrl; + int qid = nvme_tcp_queue_id(queue); + int n = 0; + + if (nvme_tcp_default_queue(queue)) + n = qid - 1; + else if (nvme_tcp_read_queue(queue)) + n = qid - ctrl->io_queues[HCTX_TYPE_DEFAULT] - 1; + else if (nvme_tcp_poll_queue(queue)) + n = qid - ctrl->io_queues[HCTX_TYPE_DEFAULT] - + ctrl->io_queues[HCTX_TYPE_READ] - 1; + queue->io_cpu = cpumask_next_wrap(n - 1, cpu_online_mask, -1, false); +} + static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid, size_t queue_size) { struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl); struct nvme_tcp_queue *queue = &ctrl->queues[qid]; struct linger sol = { .l_onoff = 1, .l_linger = 0 }; - int ret, opt, rcv_pdu_size, n; + int ret, opt, rcv_pdu_size; queue->ctrl = ctrl; INIT_LIST_HEAD(&queue->send_list); @@ -1343,11 +1397,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, } queue->sock->sk->sk_allocation = GFP_ATOMIC; - if (!qid) - n = 0; - else - n = (qid - 1) % num_online_cpus(); - queue->io_cpu = cpumask_next_wrap(n - 1, cpu_online_mask, -1, false); + nvme_tcp_set_queue_io_cpu(queue); queue->request = NULL; queue->data_remaining = 0; queue->ddgst_remaining = 0; -- cgit v1.2.3 From 9cda34e37489244a8c8628617e24b2dbc8a8edad Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Tue, 25 Feb 2020 16:42:27 -0800 Subject: nvmet-tcp: fix maxh2cdata icresp parameter MAXH2CDATA is not zero based. Also no reason to limit ourselves to 1M transfers as we can do more easily. Make this an arbitrary limit of 16M. 
Reported-by: Wenhua Liu
Cc: stable@vger.kernel.org # v5.0+
Signed-off-by: Sagi Grimberg
Signed-off-by: Keith Busch
---
 drivers/nvme/target/tcp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers/nvme')

diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c
index cbff1038bdb3..1942c941c31c 100644
--- a/drivers/nvme/target/tcp.c
+++ b/drivers/nvme/target/tcp.c
@@ -798,7 +798,7 @@ static int nvmet_tcp_handle_icreq(struct nvmet_tcp_queue *queue)
 	icresp->hdr.pdo = 0;
 	icresp->hdr.plen = cpu_to_le32(icresp->hdr.hlen);
 	icresp->pfv = cpu_to_le16(NVME_TCP_PFV_1_0);
-	icresp->maxdata = cpu_to_le32(0xffff); /* FIXME: support r2t */
+	icresp->maxdata = cpu_to_le32(0x400000); /* 16M arbitrary limit */
 	icresp->cpda = 0;
 	if (queue->hdr_digest)
 		icresp->digest |= NVME_TCP_HDR_DIGEST_ENABLE;
-- cgit v1.2.3


From 5ff4e11264780ce49a9acb66e0b4c5a30a569be4 Mon Sep 17 00:00:00 2001
From: Sagi Grimberg
Date: Tue, 25 Feb 2020 16:43:23 -0800
Subject: nvme-tcp: move send failure to nvme_tcp_try_send

Consolidate the request failure handling code to where it is being
fetched (nvme_tcp_try_send).

Signed-off-by: Sagi Grimberg
Signed-off-by: Keith Busch
---
 drivers/nvme/host/tcp.c | 26 +++++++++++---------------
 1 file changed, 11 insertions(+), 15 deletions(-)

(limited to 'drivers/nvme')

diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 11a7c26f8573..221a5a59aa06 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -1027,8 +1027,15 @@ static int nvme_tcp_try_send(struct nvme_tcp_queue *queue)
 	if (req->state == NVME_TCP_SEND_DDGST)
 		ret = nvme_tcp_try_send_ddgst(req);
 done:
-	if (ret == -EAGAIN)
+	if (ret == -EAGAIN) {
 		ret = 0;
+	} else if (ret < 0) {
+		dev_err(queue->ctrl->ctrl.device,
+			"failed to send request %d\n", ret);
+		if (ret != -EPIPE && ret != -ECONNRESET)
+			nvme_tcp_fail_request(queue->request);
+		nvme_tcp_done_send_req(queue);
+	}
 	return ret;
 }
 
@@ -1059,21 +1066,10 @@ static void nvme_tcp_io_work(struct work_struct *w)
 		int result;
 
 		result = nvme_tcp_try_send(queue);
-		if (result > 0) {
+		if (result > 0)
 			pending = true;
-		} else if (unlikely(result < 0)) {
-			dev_err(queue->ctrl->ctrl.device,
-				"failed to send request %d\n", result);
-
-			/*
-			 * Fail the request unless peer closed the connection,
-			 * in which case error recovery flow will complete all.
-			 */
-			if ((result != -EPIPE) && (result != -ECONNRESET))
-				nvme_tcp_fail_request(queue->request);
-			nvme_tcp_done_send_req(queue);
-			return;
-		}
+		else if (unlikely(result < 0))
+			break;
 
 		result = nvme_tcp_try_recv(queue);
 		if (result > 0)
-- cgit v1.2.3


From 761ad26c45b0260a8516bc1fc9d25bb66ca4e25c Mon Sep 17 00:00:00 2001
From: Sagi Grimberg
Date: Tue, 25 Feb 2020 16:43:24 -0800
Subject: nvme-tcp: break from io_work loop if recv failed

If we failed to receive data from the socket, don't try to further
process it; we will for sure be handling a queue error at this point.
While no issue was seen with the current behavior thus far, it's safer
to cease socket processing if we detected an error.
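Taken together with the previous patch, the io_work loop ends up with
the following shape; a stand-alone sketch (abstracted from the driver,
with stubs standing in for nvme_tcp_try_send()/nvme_tcp_try_recv(),
which return >0 on progress, 0 when idle, and <0 on a hard error):

	#include <stdio.h>

	static int try_send(void) { return 0; }
	static int try_recv(void) { return 0; }

	static void io_work_pass(void)
	{
		int budget;

		for (budget = 0; budget < 64; budget++) {
			int pending = 0, result;

			result = try_send();
			if (result > 0)
				pending = 1;
			else if (result < 0)
				break;	/* leave cleanup to error recovery */

			result = try_recv();
			if (result > 0)
				pending = 1;
			else if (result < 0)
				break;	/* same for receive failures */

			if (!pending)
				return;	/* queue is idle, stop polling */
		}
		/* budget exhausted (or error): the driver requeues the work */
	}

	int main(void)
	{
		io_work_pass();
		puts("done");
		return 0;
	}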
Signed-off-by: Sagi Grimberg
Signed-off-by: Keith Busch
---
 drivers/nvme/host/tcp.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'drivers/nvme')

diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 221a5a59aa06..4b20301e517c 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -1074,6 +1074,8 @@ static void nvme_tcp_io_work(struct work_struct *w)
 		result = nvme_tcp_try_recv(queue);
 		if (result > 0)
 			pending = true;
+		else if (unlikely(result < 0))
+			break;
 
 		if (!pending)
 			return;
-- cgit v1.2.3


From 2db24e4a22bc97c713261a81fc75e2a36db65715 Mon Sep 17 00:00:00 2001
From: Max Gurtovoy
Date: Mon, 9 Mar 2020 17:04:12 +0200
Subject: nvme-pci: properly print controller address

Align the PCI address print with the fabrics address, which is printed
with a trailing newline character.

Before:
[root@server40 linux]# cat /sys/class/nvme/nvme2/address
0000:0b:00.0[root@server40 linux]#

After:
[root@server40 linux]# cat /sys/class/nvme/nvme2/address
0000:0b:00.0
[root@server40 linux]#

Reviewed-by: Christoph Hellwig
Signed-off-by: Max Gurtovoy
---
 drivers/nvme/host/pci.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers/nvme')

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index f45e26e6af7e..e6fa0c7bb96c 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -2656,7 +2656,7 @@ static int nvme_pci_get_address(struct nvme_ctrl *ctrl, char *buf, int size)
 {
 	struct pci_dev *pdev = to_pci_dev(to_nvme_dev(ctrl)->dev);
 
-	return snprintf(buf, size, "%s", dev_name(&pdev->dev));
+	return snprintf(buf, size, "%s\n", dev_name(&pdev->dev));
 }
 
 static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = {
-- cgit v1.2.3


From 02cb00e233ade7c050e0f476902e63847e78114e Mon Sep 17 00:00:00 2001
From: Max Gurtovoy
Date: Sun, 8 Mar 2020 12:55:03 +0200
Subject: nvmet: Add get_mdts op for controllers

Some transports, such as RDMA, would like to set the Maximum Data
Transfer Size (MDTS) according to device/port/ctrl characteristics.
This will enable the transport to set the optimal MDTS according to
controller needs and device capabilities. Add a new nvmet transport op
that is called during ctrl identification. This will not affect
transports that don't implement this option. The return value of the
new op follows the NVMe spec definition for MDTS.
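Per the NVMe spec, MDTS is expressed as a power of two in units of the
minimum memory page size (CAP.MPSMIN), with 0 meaning no limit. A small
stand-alone illustration of the math (not driver code):

	#include <stdio.h>

	/* Max transfer in bytes for a given mdts and mpsmin page size. */
	static unsigned long long mdts_bytes(unsigned char mdts,
					     unsigned long mps)
	{
		return mdts ? (unsigned long long)mps << mdts : 0; /* 0 = no limit */
	}

	int main(void)
	{
		/* mdts=8 with 4K pages: 2^8 * 4 KiB = 1 MiB, the value the
		 * RDMA target below advertises. */
		printf("%llu\n", mdts_bytes(8, 4096));
		return 0;
	}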
Reviewed-by: Christoph Hellwig
Reviewed-by: Sagi Grimberg
Signed-off-by: Max Gurtovoy
Signed-off-by: Israel Rukshin
---
 drivers/nvme/target/admin-cmd.c | 8 ++++++--
 drivers/nvme/target/nvmet.h     | 1 +
 2 files changed, 7 insertions(+), 2 deletions(-)

(limited to 'drivers/nvme')

diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c
index c0aa9c34c699..b9ec489dc748 100644
--- a/drivers/nvme/target/admin-cmd.c
+++ b/drivers/nvme/target/admin-cmd.c
@@ -369,8 +369,12 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req)
 	/* we support multiple ports, multiples hosts and ANA: */
 	id->cmic = (1 << 0) | (1 << 1) | (1 << 3);
 
-	/* no limit on data transfer sizes for now */
-	id->mdts = 0;
+	/* Limit MDTS according to transport capability */
+	if (ctrl->ops->get_mdts)
+		id->mdts = ctrl->ops->get_mdts(ctrl);
+	else
+		id->mdts = 0;
+
 	id->cntlid = cpu_to_le16(ctrl->cntlid);
 	id->ver = cpu_to_le32(ctrl->subsys->ver);
 
diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h
index 42ba2ddd9e96..421dff3ea143 100644
--- a/drivers/nvme/target/nvmet.h
+++ b/drivers/nvme/target/nvmet.h
@@ -289,6 +289,7 @@ struct nvmet_fabrics_ops {
 			struct nvmet_port *port, char *traddr);
 	u16 (*install_queue)(struct nvmet_sq *nvme_sq);
 	void (*discovery_chg)(struct nvmet_port *port);
+	u8 (*get_mdts)(const struct nvmet_ctrl *ctrl);
 };
 
 #define NVMET_MAX_INLINE_BIOVEC	8
-- cgit v1.2.3


From ec6d20e16c2d2bef8df2d82d63dcee51caa4ac27 Mon Sep 17 00:00:00 2001
From: Max Gurtovoy
Date: Sun, 8 Mar 2020 12:55:04 +0200
Subject: nvmet-rdma: Implement get_mdts controller op

Set the maximal data transfer size to be 1MB (currently mdts is
unlimited). This will allow calculating the number of MRs that one
ctrl should allocate to fulfill its capabilities.

Reviewed-by: Christoph Hellwig
Signed-off-by: Max Gurtovoy
---
 drivers/nvme/target/rdma.c | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'drivers/nvme')

diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c
index 37d262a65877..f47a79b9fc6c 100644
--- a/drivers/nvme/target/rdma.c
+++ b/drivers/nvme/target/rdma.c
@@ -31,6 +31,9 @@
 #define NVMET_RDMA_MAX_INLINE_SGE		4
 #define NVMET_RDMA_MAX_INLINE_DATA_SIZE	max_t(int, SZ_16K, PAGE_SIZE)
 
+/* Assume mpsmin == device_page_size == 4KB */
+#define NVMET_RDMA_MAX_MDTS			8
+
 struct nvmet_rdma_cmd {
 	struct ib_sge sge[NVMET_RDMA_MAX_INLINE_SGE + 1];
 	struct ib_cqe cqe;
@@ -1602,6 +1605,11 @@ static void nvmet_rdma_disc_port_addr(struct nvmet_req *req,
 	}
 }
 
+static u8 nvmet_rdma_get_mdts(const struct nvmet_ctrl *ctrl)
+{
+	return NVMET_RDMA_MAX_MDTS;
+}
+
 static const struct nvmet_fabrics_ops nvmet_rdma_ops = {
 	.owner			= THIS_MODULE,
 	.type			= NVMF_TRTYPE_RDMA,
@@ -1612,6 +1620,7 @@ static const struct nvmet_fabrics_ops nvmet_rdma_ops = {
 	.queue_response		= nvmet_rdma_queue_response,
 	.delete_ctrl		= nvmet_rdma_delete_ctrl,
 	.disc_traddr		= nvmet_rdma_disc_port_addr,
+	.get_mdts		= nvmet_rdma_get_mdts,
 };
 
 static void nvmet_rdma_remove_one(struct ib_device *ib_device, void *client_data)
-- cgit v1.2.3


From c363f249e7e6576587d8982d9087406fe98beb99 Mon Sep 17 00:00:00 2001
From: Max Gurtovoy
Date: Sun, 8 Mar 2020 12:55:05 +0200
Subject: nvmet-rdma: allocate RW ctxs according to mdts

Current nvmet-rdma code allocates the MR pool budget based on the queue
size, assuming both host and target use the same "max_pages_per_mr"
count. After limiting the mdts value for RDMA controllers, we know the
factor of maximum MRs per IO operation. Thus, make sure the MR pool
will be sufficient for the required IO depth and IO size.
That is, say the host's SQ size is 100; then the MR pool budget currently allocated at the target will also be 100 MRs. But 100 IO WRITE requests with an sg_count of 256 (an IO size above 1MB) require 200 MRs when the target's "max_pages_per_mr" is 128. Reported-by: Krishnamraju Eraparaju Reviewed-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Signed-off-by: Max Gurtovoy --- drivers/nvme/target/rdma.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c index f47a79b9fc6c..9e1b8c61f54e 100644 --- a/drivers/nvme/target/rdma.c +++ b/drivers/nvme/target/rdma.c @@ -978,7 +978,7 @@ static int nvmet_rdma_create_queue_ib(struct nvmet_rdma_queue *queue) { struct ib_qp_init_attr qp_attr; struct nvmet_rdma_device *ndev = queue->dev; - int comp_vector, nr_cqe, ret, i; + int comp_vector, nr_cqe, ret, i, factor; /* * Spread the io queues across completion vectors, @@ -1011,7 +1011,9 @@ static int nvmet_rdma_create_queue_ib(struct nvmet_rdma_queue *queue) qp_attr.qp_type = IB_QPT_RC; /* +1 for drain */ qp_attr.cap.max_send_wr = queue->send_queue_size + 1; - qp_attr.cap.max_rdma_ctxs = queue->send_queue_size; + factor = rdma_rw_mr_factor(ndev->device, queue->cm_id->port_num, + 1 << NVMET_RDMA_MAX_MDTS); + qp_attr.cap.max_rdma_ctxs = queue->send_queue_size * factor; qp_attr.cap.max_send_sge = max(ndev->device->attrs.max_sge_rd, ndev->device->attrs.max_send_sge); -- cgit v1.2.3 From 764e9332098c0e60251386a507fe46ac91276120 Mon Sep 17 00:00:00 2001 From: John Meneghini Date: Thu, 20 Feb 2020 10:05:38 +0900 Subject: nvme-multipath: do not reset on unknown status The nvme multipath error handling defaults to controller reset if the error is unknown. There are, however, no existing nvme status codes that indicate a reset should be used, and resetting causes unnecessary disruption to the rest of IO. Change nvme's error handling to first check if failover should happen. If not, let the normal error handling take over rather than reset the controller.
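In outline, the resulting completion path offers the request to the multipath code first and falls back to the regular retry logic only when failover is refused; a simplified sketch of the flow implemented by the diff below:

	/*
	 * Simplified sketch of the new nvme_complete_rq() flow; see the
	 * actual diff below. nvme_failover_req() now returns false for any
	 * non-ANA error, so an unknown status no longer resets the controller.
	 */
	if (unlikely(status != BLK_STS_OK && nvme_req_needs_retry(req))) {
		if ((req->cmd_flags & REQ_NVME_MPATH) && nvme_failover_req(req))
			return;	/* bios stolen and requeued on another path */
		if (!blk_queue_dying(req->q)) {
			nvme_retry_req(req);
			return;
		}
	}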
Based-on-a-patch-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Signed-off-by: John Meneghini Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 5 +---- drivers/nvme/host/multipath.c | 21 +++++++++------------ drivers/nvme/host/nvme.h | 5 +++-- 3 files changed, 13 insertions(+), 18 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 0e38e07a302f..fde4b3a526ad 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -291,11 +291,8 @@ void nvme_complete_rq(struct request *req) nvme_req(req)->ctrl->comp_seen = true; if (unlikely(status != BLK_STS_OK && nvme_req_needs_retry(req))) { - if ((req->cmd_flags & REQ_NVME_MPATH) && - blk_path_error(status)) { - nvme_failover_req(req); + if ((req->cmd_flags & REQ_NVME_MPATH) && nvme_failover_req(req)) return; - } if (!blk_queue_dying(req->q)) { nvme_retry_req(req); diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index a11900cf3a36..90dd1d641b7b 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -64,17 +64,12 @@ void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns, } } -void nvme_failover_req(struct request *req) +bool nvme_failover_req(struct request *req) { struct nvme_ns *ns = req->q->queuedata; u16 status = nvme_req(req)->status; unsigned long flags; - spin_lock_irqsave(&ns->head->requeue_lock, flags); - blk_steal_bios(&ns->head->requeue_list, req); - spin_unlock_irqrestore(&ns->head->requeue_lock, flags); - blk_mq_end_request(req, 0); - switch (status & 0x7ff) { case NVME_SC_ANA_TRANSITION: case NVME_SC_ANA_INACCESSIBLE: @@ -103,15 +98,17 @@ void nvme_failover_req(struct request *req) nvme_mpath_clear_current_path(ns); break; default: - /* - * Reset the controller for any non-ANA error as we don't know - * what caused the error. - */ - nvme_reset_ctrl(ns->ctrl); - break; + /* This was a non-ANA error so follow the normal error path. 
*/ + return false; } + spin_lock_irqsave(&ns->head->requeue_lock, flags); + blk_steal_bios(&ns->head->requeue_list, req); + spin_unlock_irqrestore(&ns->head->requeue_lock, flags); + blk_mq_end_request(req, 0); + kblockd_schedule_work(&ns->head->requeue_work); + return true; } void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl) diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 1024fec7914c..d800b9a51c2c 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -550,7 +550,7 @@ void nvme_mpath_wait_freeze(struct nvme_subsystem *subsys); void nvme_mpath_start_freeze(struct nvme_subsystem *subsys); void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns, struct nvme_ctrl *ctrl, int *flags); -void nvme_failover_req(struct request *req); +bool nvme_failover_req(struct request *req); void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl); int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,struct nvme_ns_head *head); void nvme_mpath_add_disk(struct nvme_ns *ns, struct nvme_id_ns *id); @@ -599,8 +599,9 @@ static inline void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns, sprintf(disk_name, "nvme%dn%d", ctrl->instance, ns->head->instance); } -static inline void nvme_failover_req(struct request *req) +static inline bool nvme_failover_req(struct request *req) { + return false; } static inline void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl) { -- cgit v1.2.3 From 8d8a50e20dc2dc41cb788085968b9024dc36f7a5 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Wed, 11 Mar 2020 09:50:37 +0100 Subject: nvme-fabrics: Use scnprintf() for avoiding potential buffer overflow Since snprintf() returns the would-be-output size instead of the actual output size, the succeeding calls may go beyond the given buffer limit. Fix it by replacing with scnprintf(). Reviewed-by: Christoph Hellwig Signed-off-by: Takashi Iwai Signed-off-by: Keith Busch --- drivers/nvme/host/fabrics.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index 74b8818ac9a1..2a6c8190eeb7 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -105,14 +105,14 @@ int nvmf_get_address(struct nvme_ctrl *ctrl, char *buf, int size) int len = 0; if (ctrl->opts->mask & NVMF_OPT_TRADDR) - len += snprintf(buf, size, "traddr=%s", ctrl->opts->traddr); + len += scnprintf(buf, size, "traddr=%s", ctrl->opts->traddr); if (ctrl->opts->mask & NVMF_OPT_TRSVCID) - len += snprintf(buf + len, size - len, "%strsvcid=%s", + len += scnprintf(buf + len, size - len, "%strsvcid=%s", (len) ? "," : "", ctrl->opts->trsvcid); if (ctrl->opts->mask & NVMF_OPT_HOST_TRADDR) - len += snprintf(buf + len, size - len, "%shost_traddr=%s", + len += scnprintf(buf + len, size - len, "%shost_traddr=%s", (len) ? "," : "", ctrl->opts->host_traddr); - len += snprintf(buf + len, size - len, "\n"); + len += scnprintf(buf + len, size - len, "\n"); return len; } -- cgit v1.2.3 From e90d172b11b845b0f2caa9422c2f9d3ef59af575 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Thu, 12 Mar 2020 16:06:39 -0700 Subject: nvmet-tcp: optimize tcp stack TX when data digest is used If we have a 4-byte data digest to send to the wire, but we have more data to send, set MSG_MORE to tell the stack that more is coming. 
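The change relies on the standard MSG_MORE contract of the kernel socket API; a rough sketch of the pattern (simplified from the diff below, with the iov setup abbreviated):

	/*
	 * Sketch: hold back a small trailing send when more data is
	 * already queued, so the TCP stack coalesces it with the next
	 * segment instead of pushing a tiny packet on its own.
	 */
	struct msghdr msg = { .msg_flags = MSG_DONTWAIT };

	if (!last_in_batch && queue->send_list_len)
		msg.msg_flags |= MSG_MORE;	/* another send follows immediately */
	ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len);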
Reviewed-by: Mark Wunderlich Signed-off-by: Sagi Grimberg Signed-off-by: Keith Busch --- drivers/nvme/target/tcp.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index 1942c941c31c..dcee4995e22d 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -626,7 +626,7 @@ static int nvmet_try_send_r2t(struct nvmet_tcp_cmd *cmd, bool last_in_batch) return 1; } -static int nvmet_try_send_ddgst(struct nvmet_tcp_cmd *cmd) +static int nvmet_try_send_ddgst(struct nvmet_tcp_cmd *cmd, bool last_in_batch) { struct nvmet_tcp_queue *queue = cmd->queue; struct msghdr msg = { .msg_flags = MSG_DONTWAIT }; @@ -636,6 +636,9 @@ static int nvmet_try_send_ddgst(struct nvmet_tcp_cmd *cmd) }; int ret; + if (!last_in_batch && cmd->queue->send_list_len) + msg.msg_flags |= MSG_MORE; + ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len); if (unlikely(ret <= 0)) return ret; @@ -676,7 +679,7 @@ static int nvmet_tcp_try_send_one(struct nvmet_tcp_queue *queue, } if (cmd->state == NVMET_TCP_SEND_DDGST) { - ret = nvmet_try_send_ddgst(cmd); + ret = nvmet_try_send_ddgst(cmd, last_in_batch); if (ret <= 0) goto done_send; } -- cgit v1.2.3 From c225b610311bc5695d952cd3590136f26199a227 Mon Sep 17 00:00:00 2001 From: "masahiro31.yamada@kioxia.com" Date: Thu, 5 Mar 2020 11:13:29 +0000 Subject: nvme: Add compat_ioctl handler for NVME_IOCTL_SUBMIT_IO Currently a 32-bit application gets ENOTTY when it calls compat_ioctl with NVME_IOCTL_SUBMIT_IO on a 64-bit kernel. The cause is that the result of sizeof(struct nvme_user_io), which is used to define NVME_IOCTL_SUBMIT_IO, differs between 32-bit and 64-bit compilers: * 32 bit: the result of sizeof nvme_user_io is 44. * 64 bit: the result of sizeof nvme_user_io is 48. The 64-bit compiler adds 32 bits of tail padding to round the structure size up to a multiple of 8 bytes. This patch adds a compat_ioctl handler. The handler replaces NVME_IOCTL_SUBMIT_IO32 with NVME_IOCTL_SUBMIT_IO when a 32-bit application calls compat_ioctl for a submit on a 64-bit kernel. Then, it calls nvme_ioctl as usual. Reviewed-by: Christoph Hellwig Signed-off-by: Masahiro Yamada (KIOXIA) Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 45 +++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 43 insertions(+), 2 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index fde4b3a526ad..3c1c826ea491 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1585,6 +1585,47 @@ static int nvme_ioctl(struct block_device *bdev, fmode_t mode, return ret; } +#ifdef CONFIG_COMPAT +struct nvme_user_io32 { + __u8 opcode; + __u8 flags; + __u16 control; + __u16 nblocks; + __u16 rsvd; + __u64 metadata; + __u64 addr; + __u64 slba; + __u32 dsmgmt; + __u32 reftag; + __u16 apptag; + __u16 appmask; +} __attribute__((__packed__)); + +#define NVME_IOCTL_SUBMIT_IO32 _IOW('N', 0x42, struct nvme_user_io32) + +static int nvme_compat_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long arg) +{ + /* + * Corresponds to the difference of NVME_IOCTL_SUBMIT_IO + * between 32 bit programs and 64 bit kernel. + * The cause is that the results of sizeof(struct nvme_user_io), + * which is used to define NVME_IOCTL_SUBMIT_IO, + * are not same between 32 bit compiler and 64 bit compiler. + * NVME_IOCTL_SUBMIT_IO32 is for 64 bit kernel handling + * NVME_IOCTL_SUBMIT_IO issued from 32 bit programs.
+ * Other IOCTL numbers are same between 32 bit and 64 bit. + * So there is nothing to do regarding to other IOCTL numbers. + */ + if (cmd == NVME_IOCTL_SUBMIT_IO32) + return nvme_ioctl(bdev, mode, NVME_IOCTL_SUBMIT_IO, arg); + + return nvme_ioctl(bdev, mode, cmd, arg); +} +#else +#define nvme_compat_ioctl NULL +#endif /* CONFIG_COMPAT */ + static int nvme_open(struct block_device *bdev, fmode_t mode) { struct nvme_ns *ns = bdev->bd_disk->private_data; @@ -2028,7 +2069,7 @@ EXPORT_SYMBOL_GPL(nvme_sec_submit); static const struct block_device_operations nvme_fops = { .owner = THIS_MODULE, .ioctl = nvme_ioctl, - .compat_ioctl = nvme_ioctl, + .compat_ioctl = nvme_compat_ioctl, .open = nvme_open, .release = nvme_release, .getgeo = nvme_getgeo, @@ -2056,7 +2097,7 @@ const struct block_device_operations nvme_ns_head_ops = { .open = nvme_ns_head_open, .release = nvme_ns_head_release, .ioctl = nvme_ioctl, - .compat_ioctl = nvme_ioctl, + .compat_ioctl = nvme_compat_ioctl, .getgeo = nvme_getgeo, .pr_ops = &nvme_pr_ops, }; -- cgit v1.2.3 From f41cfd5d0a04b12a5dae753cd01163661432ebbb Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Wed, 18 Mar 2020 17:27:59 +0200 Subject: nvme: release ida resources ida instances allocate some internal memory in addition to the base 'struct ida'. Use ida_destroy() to release that memory at module_exit(). Reviewed-by: Christoph Hellwig Signed-off-by: Max Gurtovoy Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 3c1c826ea491..ad0847b6c769 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -4358,6 +4358,7 @@ static void __exit nvme_core_exit(void) destroy_workqueue(nvme_delete_wq); destroy_workqueue(nvme_reset_wq); destroy_workqueue(nvme_wq); + ida_destroy(&nvme_instance_ida); } MODULE_LICENSE("GPL"); -- cgit v1.2.3 From e7c43feae2ab8744c3112b5a714959c8ea71ca19 Mon Sep 17 00:00:00 2001 From: Israel Rukshin Date: Tue, 10 Mar 2020 16:39:10 +0200 Subject: nvme: Use nvme_state_terminal helper Improve code readability. Reviewed-by: Max Gurtovoy Reviewed-by: Christoph Hellwig Signed-off-by: Israel Rukshin Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index ad0847b6c769..392af3cf0bf9 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2633,8 +2633,7 @@ static bool nvme_validate_cntlid(struct nvme_subsystem *subsys, lockdep_assert_held(&nvme_subsystems_lock); list_for_each_entry(tmp, &subsys->ctrls, subsys_entry) { - if (tmp->state == NVME_CTRL_DELETING || - tmp->state == NVME_CTRL_DEAD) + if (nvme_state_terminal(tmp)) continue; if (tmp->cntlid == ctrl->cntlid) { -- cgit v1.2.3 From 6721c18a0610db39cf0110b9be07946bbc208ed7 Mon Sep 17 00:00:00 2001 From: Israel Rukshin Date: Tue, 24 Mar 2020 17:29:39 +0200 Subject: nvme: Remove unused return code from nvme_delete_ctrl_sync The return code of nvme_delete_ctrl_sync is never used, so change it to void. 
Signed-off-by: Israel Rukshin Reviewed-by: Max Gurtovoy Reviewed-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 392af3cf0bf9..8a7761c3086e 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -192,21 +192,16 @@ int nvme_delete_ctrl(struct nvme_ctrl *ctrl) } EXPORT_SYMBOL_GPL(nvme_delete_ctrl); -static int nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl) +static void nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl) { - int ret = 0; - /* * Keep a reference until nvme_do_delete_ctrl() complete, * since ->delete_ctrl can free the controller. */ nvme_get_ctrl(ctrl); - if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING)) - ret = -EBUSY; - if (!ret) + if (nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING)) nvme_do_delete_ctrl(ctrl); nvme_put_ctrl(ctrl); - return ret; } static inline bool nvme_ns_has_pi(struct nvme_ns *ns) -- cgit v1.2.3 From 253fd4ac806896293c9b9d12c794195447bad164 Mon Sep 17 00:00:00 2001 From: Israel Rukshin Date: Tue, 24 Mar 2020 17:29:40 +0200 Subject: nvme-pci: Re-order nvme_pci_free_ctrl Destroy the resources in the same order as in the nvme_probe error flow to improve code readability. Signed-off-by: Israel Rukshin Reviewed-by: Max Gurtovoy Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/pci.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index e6fa0c7bb96c..ff0bd2d84f3e 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2470,13 +2470,13 @@ static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl) struct nvme_dev *dev = to_nvme_dev(ctrl); nvme_dbbuf_dma_free(dev); - put_device(dev->dev); nvme_free_tagset(dev); if (dev->ctrl.admin_q) blk_put_queue(dev->ctrl.admin_q); - kfree(dev->queues); free_opal_dev(dev->ctrl.opal_dev); mempool_destroy(dev->iod_mempool); + put_device(dev->dev); + kfree(dev->queues); kfree(dev); } -- cgit v1.2.3 From b780d7415aacec855e2f2370cbf98f918b224903 Mon Sep 17 00:00:00 2001 From: Israel Rukshin Date: Tue, 24 Mar 2020 17:29:41 +0200 Subject: nvme: Fix ctrl use-after-free during sysfs deletion If nvme_sysfs_delete() is called by the user before the ctrl reference count is taken, the ctrl may be freed during creation, causing a use-after-free. Take the reference as soon as the controller is externally visible, which is done by cdev_device_add() in nvme_init_ctrl(). Also take the reference count at the core layer instead of taking it on each transport separately.
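A rough timeline of the race being closed, for illustration only:

	/*
	 * Illustrative timeline (not from the patch):
	 *
	 *   creation path                        user
	 *   -------------                       ---------------------------
	 *   nvme_init_ctrl()
	 *     cdev_device_add()  <-- controller becomes visible in sysfs
	 *                                        nvme_sysfs_delete()
	 *                                          nvme_delete_ctrl_sync()
	 *                                            ...final put frees ctrl
	 *   keeps using ctrl     <-- use-after-free
	 *
	 * Taking the reference in the core, just before cdev_device_add(),
	 * guarantees the creation path still owns a reference of its own
	 * once the controller is externally reachable.
	 */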
Signed-off-by: Israel Rukshin Reviewed-by: Max Gurtovoy Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 2 ++ drivers/nvme/host/fc.c | 4 +--- drivers/nvme/host/pci.c | 1 - drivers/nvme/host/rdma.c | 3 +-- drivers/nvme/host/tcp.c | 3 +-- drivers/nvme/target/loop.c | 3 +-- 6 files changed, 6 insertions(+), 10 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 8a7761c3086e..51f80be0fe90 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -4130,6 +4130,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, if (ret) goto out_release_instance; + nvme_get_ctrl(ctrl); cdev_init(&ctrl->cdev, &nvme_dev_fops); ctrl->cdev.owner = ops->module; ret = cdev_device_add(&ctrl->cdev, ctrl->device); @@ -4148,6 +4149,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, return 0; out_free_name: + nvme_put_ctrl(ctrl); kfree_const(ctrl->device->kobj.name); out_release_instance: ida_simple_remove(&nvme_instance_ida, ctrl->instance); diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 5a70ac395d53..59d2e2bec179 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -3181,10 +3181,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, goto fail_ctrl; } - nvme_get_ctrl(&ctrl->ctrl); - if (!queue_delayed_work(nvme_wq, &ctrl->connect_work, 0)) { - nvme_put_ctrl(&ctrl->ctrl); dev_err(ctrl->ctrl.device, "NVME-FC{%d}: failed to schedule initial connect\n", ctrl->cnum); @@ -3209,6 +3206,7 @@ fail_ctrl: /* initiate nvme ctrl ref counting teardown */ nvme_uninit_ctrl(&ctrl->ctrl); + nvme_put_ctrl(&ctrl->ctrl); /* Remove core ctrl ref. */ nvme_put_ctrl(&ctrl->ctrl); diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index ff0bd2d84f3e..4e062c3a84bc 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2802,7 +2802,6 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev)); nvme_reset_ctrl(&dev->ctrl); - nvme_get_ctrl(&dev->ctrl); async_schedule(nvme_async_probe, dev); return 0; diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 3e85c5cacefd..ca782deea72d 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -2043,8 +2043,6 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev, dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISpcs\n", ctrl->ctrl.opts->subsysnqn, &ctrl->addr); - nvme_get_ctrl(&ctrl->ctrl); - mutex_lock(&nvme_rdma_ctrl_mutex); list_add_tail(&ctrl->list, &nvme_rdma_ctrl_list); mutex_unlock(&nvme_rdma_ctrl_mutex); @@ -2054,6 +2052,7 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev, out_uninit_ctrl: nvme_uninit_ctrl(&ctrl->ctrl); nvme_put_ctrl(&ctrl->ctrl); + nvme_put_ctrl(&ctrl->ctrl); if (ret > 0) ret = -EIO; return ERR_PTR(ret); diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 4b20301e517c..dd569b122a0d 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -2428,8 +2428,6 @@ static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev, dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISp\n", ctrl->ctrl.opts->subsysnqn, &ctrl->addr); - nvme_get_ctrl(&ctrl->ctrl); - mutex_lock(&nvme_tcp_ctrl_mutex); list_add_tail(&ctrl->list, &nvme_tcp_ctrl_list); mutex_unlock(&nvme_tcp_ctrl_mutex); @@ -2439,6 +2437,7 @@ static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev, 
out_uninit_ctrl: nvme_uninit_ctrl(&ctrl->ctrl); nvme_put_ctrl(&ctrl->ctrl); + nvme_put_ctrl(&ctrl->ctrl); if (ret > 0) ret = -EIO; return ERR_PTR(ret); diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index 4df4ebde208a..a425e2858829 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -618,8 +618,6 @@ static struct nvme_ctrl *nvme_loop_create_ctrl(struct device *dev, dev_info(ctrl->ctrl.device, "new ctrl: \"%s\"\n", ctrl->ctrl.opts->subsysnqn); - nvme_get_ctrl(&ctrl->ctrl); - changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); WARN_ON_ONCE(!changed); @@ -637,6 +635,7 @@ out_free_queues: kfree(ctrl->queues); out_uninit_ctrl: nvme_uninit_ctrl(&ctrl->ctrl); + nvme_put_ctrl(&ctrl->ctrl); out_put_ctrl: nvme_put_ctrl(&ctrl->ctrl); if (ret > 0) -- cgit v1.2.3 From 726612b6b8259afa41d265a2722991c87f059223 Mon Sep 17 00:00:00 2001 From: Israel Rukshin Date: Tue, 24 Mar 2020 17:29:42 +0200 Subject: nvme: Make nvme_uninit_ctrl symmetric to nvme_init_ctrl Put the ctrl reference count in nvme_uninit_ctrl as opposed to nvme_init_ctrl, which takes it. This decreases the reference count at the core layer instead of decreasing it in each transport separately. Also move the call to nvme_uninit_ctrl in the PCI driver to after nvme_release_prp_pools and nvme_dev_unmap, so that the reference is put only after the dev is no longer used. This is safe because those functions use nvme_dev, which is freed only later in nvme_pci_free_ctrl. Signed-off-by: Israel Rukshin Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 2 +- drivers/nvme/host/fc.c | 1 - drivers/nvme/host/pci.c | 3 +-- drivers/nvme/host/rdma.c | 1 - drivers/nvme/host/tcp.c | 1 - drivers/nvme/target/loop.c | 2 -- 6 files changed, 2 insertions(+), 8 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 51f80be0fe90..8e6a3ada9d44 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -171,7 +171,6 @@ static void nvme_do_delete_ctrl(struct nvme_ctrl *ctrl) nvme_remove_namespaces(ctrl); ctrl->ops->delete_ctrl(ctrl); nvme_uninit_ctrl(ctrl); - nvme_put_ctrl(ctrl); } static void nvme_delete_ctrl_work(struct work_struct *work) @@ -4048,6 +4047,7 @@ void nvme_uninit_ctrl(struct nvme_ctrl *ctrl) nvme_fault_inject_fini(&ctrl->fault_inject); dev_pm_qos_hide_latency_tolerance(ctrl->device); cdev_device_del(&ctrl->cdev, ctrl->device); + nvme_put_ctrl(ctrl); } EXPORT_SYMBOL_GPL(nvme_uninit_ctrl); diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 59d2e2bec179..a8bf2fb1287b 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -3206,7 +3206,6 @@ fail_ctrl: /* initiate nvme ctrl ref counting teardown */ nvme_uninit_ctrl(&ctrl->ctrl); - nvme_put_ctrl(&ctrl->ctrl); /* Remove core ctrl ref.
*/ nvme_put_ctrl(&ctrl->ctrl); diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 4e062c3a84bc..4e79e412b276 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2873,10 +2873,9 @@ static void nvme_remove(struct pci_dev *pdev) nvme_free_host_mem(dev); nvme_dev_remove_admin(dev); nvme_free_queues(dev, 0); - nvme_uninit_ctrl(&dev->ctrl); nvme_release_prp_pools(dev); nvme_dev_unmap(dev); - nvme_put_ctrl(&dev->ctrl); + nvme_uninit_ctrl(&dev->ctrl); } #ifdef CONFIG_PM_SLEEP diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index ca782deea72d..c99a88247660 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -2052,7 +2052,6 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev, out_uninit_ctrl: nvme_uninit_ctrl(&ctrl->ctrl); nvme_put_ctrl(&ctrl->ctrl); - nvme_put_ctrl(&ctrl->ctrl); if (ret > 0) ret = -EIO; return ERR_PTR(ret); diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index dd569b122a0d..f111430bb617 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -2437,7 +2437,6 @@ static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev, out_uninit_ctrl: nvme_uninit_ctrl(&ctrl->ctrl); nvme_put_ctrl(&ctrl->ctrl); - nvme_put_ctrl(&ctrl->ctrl); if (ret > 0) ret = -EIO; return ERR_PTR(ret); diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index a425e2858829..0d54e730cbf2 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -485,7 +485,6 @@ out_destroy_admin: out_disable: dev_warn(ctrl->ctrl.device, "Removing after reset failure\n"); nvme_uninit_ctrl(&ctrl->ctrl); - nvme_put_ctrl(&ctrl->ctrl); } static const struct nvme_ctrl_ops nvme_loop_ctrl_ops = { @@ -635,7 +634,6 @@ out_free_queues: kfree(ctrl->queues); out_uninit_ctrl: nvme_uninit_ctrl(&ctrl->ctrl); - nvme_put_ctrl(&ctrl->ctrl); out_put_ctrl: nvme_put_ctrl(&ctrl->ctrl); if (ret > 0) -- cgit v1.2.3 From ce1518139e6976cf19c133b555083354fdb629b8 Mon Sep 17 00:00:00 2001 From: Israel Rukshin Date: Tue, 24 Mar 2020 17:29:43 +0200 Subject: nvme: Fix controller creation races with teardown flow Calling nvme_sysfs_delete() while the controller is in the middle of creation may cause several bugs. If the controller is in the NEW state, we remove the delete_controller file but don't delete the controller. The user will then be unable to use the nvme disconnect command on that controller again, although the controller may still be active. Other bugs may happen if the controller is in the middle of the create_ctrl callback when nvme_do_delete_ctrl() starts, for example freeing the I/O tagset in nvme_do_delete_ctrl() before it has been allocated by the create_ctrl callback. To fix all these races, don't allow the user to delete the controller before it is fully created.
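From user space the fix is observable as deletion failing with EBUSY while the controller is still being set up; a hypothetical session (device name invented), in the Before/After style used earlier in this series:

	/*
	 * Hypothetical user-visible effect:
	 *
	 *   # echo 1 > /sys/class/nvme/nvme0/delete_controller
	 *   -bash: echo: write error: Device or resource busy
	 *
	 * The write is refused with -EBUSY until nvme_start_ctrl() has
	 * marked the controller as fully created (see the diff below).
	 */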
Signed-off-by: Israel Rukshin Reviewed-by: Max Gurtovoy Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 5 +++++ drivers/nvme/host/nvme.h | 1 + 2 files changed, 6 insertions(+) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 8e6a3ada9d44..66fe301d9abb 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3228,6 +3228,10 @@ static ssize_t nvme_sysfs_delete(struct device *dev, { struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + /* Can't delete non-created controllers */ + if (!ctrl->created) + return -EBUSY; + if (device_remove_file_self(dev, attr)) nvme_delete_ctrl_sync(ctrl); return count; @@ -4039,6 +4043,7 @@ void nvme_start_ctrl(struct nvme_ctrl *ctrl) nvme_queue_scan(ctrl); nvme_start_queues(ctrl); } + ctrl->created = true; } EXPORT_SYMBOL_GPL(nvme_start_ctrl); diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index d800b9a51c2c..2e04a36296d9 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -259,6 +259,7 @@ struct nvme_ctrl { struct nvme_command ka_cmd; struct work_struct fw_act_work; unsigned long events; + bool created; #ifdef CONFIG_NVME_MULTIPATH /* asymmetric namespace access: */ -- cgit v1.2.3 From 96135862dfcce38b98beff7d1009188263b7e6f7 Mon Sep 17 00:00:00 2001 From: Israel Rukshin Date: Tue, 24 Mar 2020 17:29:44 +0200 Subject: nvme-rdma: Add warning on state change failure at nvme_rdma_setup_ctrl The transition to the LIVE state should not fail for a new controller. Moving to the DELETING state before nvme_rdma_create_ctrl() has allocated all the resources may lead to a NULL dereference in the teardown flow (e.g., IO tagset, admin_q, connect_q). Signed-off-by: Israel Rukshin Reviewed-by: Max Gurtovoy Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/rdma.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index c99a88247660..3ae3011a95ea 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1022,8 +1022,13 @@ static int nvme_rdma_setup_ctrl(struct nvme_rdma_ctrl *ctrl, bool new) changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); if (!changed) { - /* state change failure is ok if we're in DELETING state */ + /* + * state change failure is ok if we're in DELETING state, + * unless we're during creation of a new controller to + * avoid races with teardown flow. + */ WARN_ON_ONCE(ctrl->ctrl.state != NVME_CTRL_DELETING); + WARN_ON_ONCE(new); ret = -EINVAL; goto destroy_io; } -- cgit v1.2.3 From bea54ef53fce57c8b2f11315c9384e43b2ecb321 Mon Sep 17 00:00:00 2001 From: Israel Rukshin Date: Tue, 24 Mar 2020 17:29:45 +0200 Subject: nvme-tcp: Add warning on state change failure at nvme_tcp_setup_ctrl The transition to the LIVE state should not fail for a new controller. Moving to the DELETING state before nvme_tcp_create_ctrl() has allocated all the resources may lead to a NULL dereference in the teardown flow (e.g., IO tagset, admin_q, connect_q).
Signed-off-by: Israel Rukshin Reviewed-by: Max Gurtovoy Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/tcp.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index f111430bb617..0ef14f0fad86 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -1930,8 +1930,13 @@ static int nvme_tcp_setup_ctrl(struct nvme_ctrl *ctrl, bool new) } if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_LIVE)) { - /* state change failure is ok if we're in DELETING state */ + /* + * state change failure is ok if we're in DELETING state, + * unless we're during creation of a new controller to + * avoid races with teardown flow. + */ WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING); + WARN_ON_ONCE(new); ret = -EINVAL; goto destroy_io; } -- cgit v1.2.3 From fb314eb0cbb2e11540d1ae1a7b28346397f621ef Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 25 Mar 2020 14:19:35 +0100 Subject: nvme: refactor nvme_identify_ns_descs error handling Move the error handling from the caller into the function, and only do it for an actual error on the admin command itself, not for command parsing, as that should be enough to deal with devices claiming bogus version compliance. Signed-off-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 66fe301d9abb..6bd1c6dfac6b 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1102,8 +1102,17 @@ static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid, status = nvme_submit_sync_cmd(ctrl->admin_q, &c, data, NVME_IDENTIFY_DATA_SIZE); - if (status) + if (status) { + dev_warn(ctrl->device, + "Identify Descriptors failed (%d)\n", status); + /* + * Don't treat an error as fatal, as we potentially already + * have a NGUID or EUI-64. + */ + if (status > 0) + status = 0; goto free_data; + } for (pos = 0; pos < NVME_IDENTIFY_DATA_SIZE; pos += len) { struct nvme_ns_id_desc *cur = data + pos; @@ -1757,26 +1766,15 @@ static void nvme_config_write_zeroes(struct gendisk *disk, struct nvme_ns *ns) static int nvme_report_ns_ids(struct nvme_ctrl *ctrl, unsigned int nsid, struct nvme_id_ns *id, struct nvme_ns_ids *ids) { - int ret = 0; - memset(ids, 0, sizeof(*ids)); if (ctrl->vs >= NVME_VS(1, 1, 0)) memcpy(ids->eui64, id->eui64, sizeof(id->eui64)); if (ctrl->vs >= NVME_VS(1, 2, 0)) memcpy(ids->nguid, id->nguid, sizeof(id->nguid)); - if (ctrl->vs >= NVME_VS(1, 3, 0)) { - /* Don't treat error as fatal we potentially - * already have a NGUID or EUI-64 - */ - ret = nvme_identify_ns_descs(ctrl, nsid, ids); - if (ret) - dev_warn(ctrl->device, - "Identify Descriptors failed (%d)\n", ret); - if (ret > 0) - ret = 0; - } - return ret; + if (ctrl->vs >= NVME_VS(1, 3, 0)) + return nvme_identify_ns_descs(ctrl, nsid, ids); + return 0; } static bool nvme_ns_ids_valid(struct nvme_ns_ids *ids) -- cgit v1.2.3 From 026d2ef752f47f33efd92244b9cf6be65d2a1621 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 25 Mar 2020 14:19:36 +0100 Subject: nvme: rename __nvme_find_ns_head to nvme_find_ns_head There is no non __-prefixed version, so make the name a little more readable.
Signed-off-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 6bd1c6dfac6b..56a0dc18ed2d 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3357,7 +3357,7 @@ static const struct attribute_group *nvme_dev_attr_groups[] = { NULL, }; -static struct nvme_ns_head *__nvme_find_ns_head(struct nvme_subsystem *subsys, +static struct nvme_ns_head *nvme_find_ns_head(struct nvme_subsystem *subsys, unsigned nsid) { struct nvme_ns_head *h; @@ -3457,7 +3457,7 @@ static int nvme_init_ns_head(struct nvme_ns *ns, unsigned nsid, mutex_lock(&ctrl->subsys->lock); if (is_shared) - head = __nvme_find_ns_head(ctrl->subsys, nsid); + head = nvme_find_ns_head(ctrl->subsys, nsid); if (!head) { head = nvme_alloc_ns_head(ctrl, nsid, id); if (IS_ERR(head)) { -- cgit v1.2.3 From 43fcd9e1eae87c3235b8077f97bc6a286c3ae59b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 25 Mar 2020 14:19:37 +0100 Subject: nvme: cleanup namespace identifier reporting in nvme_init_ns_head Lift the common namespace identifier reporting between the shared namespace and new nshead cases into common code. This also means one less lock is held while doing I/O. Signed-off-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) (limited to 'drivers/nvme') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 56a0dc18ed2d..2b0f693437a8 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3390,7 +3390,8 @@ static int __nvme_check_ids(struct nvme_subsystem *subsys, } static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl, - unsigned nsid, struct nvme_id_ns *id) + unsigned nsid, struct nvme_id_ns *id, + struct nvme_ns_ids *ids) { struct nvme_ns_head *head; size_t size = sizeof(*head); @@ -3413,12 +3414,9 @@ static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl, goto out_ida_remove; head->subsys = ctrl->subsys; head->ns_id = nsid; + head->ids = *ids; kref_init(&head->ref); - ret = nvme_report_ns_ids(ctrl, nsid, id, &head->ids); - if (ret) - goto out_cleanup_srcu; - ret = __nvme_check_ids(ctrl->subsys, head); if (ret) { dev_err(ctrl->device, @@ -3453,24 +3451,23 @@ static int nvme_init_ns_head(struct nvme_ns *ns, unsigned nsid, struct nvme_ctrl *ctrl = ns->ctrl; bool is_shared = id->nmic & (1 << 0); struct nvme_ns_head *head = NULL; + struct nvme_ns_ids ids; int ret = 0; + ret = nvme_report_ns_ids(ctrl, nsid, id, &ids); + if (ret) + goto out; + mutex_lock(&ctrl->subsys->lock); if (is_shared) head = nvme_find_ns_head(ctrl->subsys, nsid); if (!head) { - head = nvme_alloc_ns_head(ctrl, nsid, id); + head = nvme_alloc_ns_head(ctrl, nsid, id, &ids); if (IS_ERR(head)) { ret = PTR_ERR(head); goto out_unlock; } } else { - struct nvme_ns_ids ids; - - ret = nvme_report_ns_ids(ctrl, nsid, id, &ids); - if (ret) - goto out_unlock; - if (!nvme_ns_ids_equal(&head->ids, &ids)) { dev_err(ctrl->device, "IDs don't match for shared namespace %d\n", @@ -3485,6 +3482,7 @@ static int nvme_init_ns_head(struct nvme_ns *ns, unsigned nsid, out_unlock: mutex_unlock(&ctrl->subsys->lock); +out: if (ret > 0) ret = blk_status_to_errno(nvme_error_status(ret)); return ret; -- cgit v1.2.3