From b5f96cb719d8ba220b565ddd3ba4ac0d8bcfb130 Mon Sep 17 00:00:00 2001 From: Klaus Jensen Date: Tue, 13 Dec 2022 09:58:07 +0100 Subject: nvme-pci: fix doorbell buffer value endianness When using shadow doorbells, the event index and the doorbell values are written to host memory. Prior to this patch, the values written would erroneously be written in host endianness. This causes trouble on big-endian platforms. Fix this by adding missing endian conversions. This issue was noticed by Guenter while testing various big-endian platforms under QEMU[1]. A similar fix required for hw/nvme in QEMU is up for review as well[2]. [1]: https://lore.kernel.org/qemu-devel/20221209110022.GA3396194@roeck-us.net/ [2]: https://lore.kernel.org/qemu-devel/20221212114409.34972-4-its@irrelevant.dk/ Fixes: f9f38e33389c ("nvme: improve performance for virtual NVMe devices") Reported-by: Guenter Roeck Signed-off-by: Klaus Jensen Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index f0f8027644bb..017442858054 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -144,9 +144,9 @@ struct nvme_dev { mempool_t *iod_mempool; /* shadow doorbell buffer support: */ - u32 *dbbuf_dbs; + __le32 *dbbuf_dbs; dma_addr_t dbbuf_dbs_dma_addr; - u32 *dbbuf_eis; + __le32 *dbbuf_eis; dma_addr_t dbbuf_eis_dma_addr; /* host memory buffer support: */ @@ -208,10 +208,10 @@ struct nvme_queue { #define NVMEQ_SQ_CMB 1 #define NVMEQ_DELETE_ERROR 2 #define NVMEQ_POLLED 3 - u32 *dbbuf_sq_db; - u32 *dbbuf_cq_db; - u32 *dbbuf_sq_ei; - u32 *dbbuf_cq_ei; + __le32 *dbbuf_sq_db; + __le32 *dbbuf_cq_db; + __le32 *dbbuf_sq_ei; + __le32 *dbbuf_cq_ei; struct completion delete_done; }; @@ -343,11 +343,11 @@ static inline int nvme_dbbuf_need_event(u16 event_idx, u16 new_idx, u16 old) } /* Update dbbuf and return true if an MMIO is required */ -static bool 
nvme_dbbuf_update_and_check_event(u16 value, u32 *dbbuf_db, - volatile u32 *dbbuf_ei) +static bool nvme_dbbuf_update_and_check_event(u16 value, __le32 *dbbuf_db, + volatile __le32 *dbbuf_ei) { if (dbbuf_db) { - u16 old_value; + u16 old_value, event_idx; /* * Ensure that the queue is written before updating @@ -355,8 +355,8 @@ static bool nvme_dbbuf_update_and_check_event(u16 value, u32 *dbbuf_db, */ wmb(); - old_value = *dbbuf_db; - *dbbuf_db = value; + old_value = le32_to_cpu(*dbbuf_db); + *dbbuf_db = cpu_to_le32(value); /* * Ensure that the doorbell is updated before reading the event @@ -366,7 +366,8 @@ static bool nvme_dbbuf_update_and_check_event(u16 value, u32 *dbbuf_db, */ mb(); - if (!nvme_dbbuf_need_event(*dbbuf_ei, value, old_value)) + event_idx = le32_to_cpu(*dbbuf_ei); + if (!nvme_dbbuf_need_event(event_idx, value, old_value)) return false; } -- cgit v1.2.3 From c89a529e823d51dd23c7ec0c047c7a454a428541 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 19 Dec 2022 10:59:06 -0800 Subject: nvme-pci: fix mempool alloc size Convert the max size to bytes to match the units of the divisor that calculates the worst-case number of PRP entries. The result is used to determine how many PRP Lists are required. The code was previously rounding this to 1 list, but we can require 2 in the worst case. In that scenario, the driver would corrupt memory beyond the size provided by the mempool. While unlikely to occur (you'd need a 4MB I/O in exactly 127 phys segments on a queue that doesn't support SGLs), this memory corruption has been observed by kfence. 
Cc: Jens Axboe Fixes: 943e942e6266f ("nvme-pci: limit max IO size and segments to avoid high order allocations") Signed-off-by: Keith Busch Reviewed-by: Jens Axboe Reviewed-by: Kanchan Joshi Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 017442858054..6e9d1c7409a9 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -381,8 +381,8 @@ static bool nvme_dbbuf_update_and_check_event(u16 value, __le32 *dbbuf_db, */ static int nvme_pci_npages_prp(void) { - unsigned nprps = DIV_ROUND_UP(NVME_MAX_KB_SZ + NVME_CTRL_PAGE_SIZE, - NVME_CTRL_PAGE_SIZE); + unsigned max_bytes = (NVME_MAX_KB_SZ * 1024) + NVME_CTRL_PAGE_SIZE; + unsigned nprps = DIV_ROUND_UP(max_bytes, NVME_CTRL_PAGE_SIZE); return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8); } -- cgit v1.2.3 From 841734234a28fd5cd0889b84bd4d93a0988fa11e Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 19 Dec 2022 13:54:55 -0800 Subject: nvme-pci: fix page size checks The size allocated out of the dma pool is at most NVME_CTRL_PAGE_SIZE, which may be smaller than the PAGE_SIZE. 
Fixes: c61b82c7b7134 ("nvme-pci: fix PRP pool size") Signed-off-by: Keith Busch Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 6e9d1c7409a9..804b6a6cb43a 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -36,7 +36,7 @@ #define SQ_SIZE(q) ((q)->q_depth << (q)->sqes) #define CQ_SIZE(q) ((q)->q_depth * sizeof(struct nvme_completion)) -#define SGES_PER_PAGE (PAGE_SIZE / sizeof(struct nvme_sgl_desc)) +#define SGES_PER_PAGE (NVME_CTRL_PAGE_SIZE / sizeof(struct nvme_sgl_desc)) /* * These can be higher, but we need to ensure that any command doesn't @@ -383,7 +383,7 @@ static int nvme_pci_npages_prp(void) { unsigned max_bytes = (NVME_MAX_KB_SZ * 1024) + NVME_CTRL_PAGE_SIZE; unsigned nprps = DIV_ROUND_UP(max_bytes, NVME_CTRL_PAGE_SIZE); - return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8); + return DIV_ROUND_UP(8 * nprps, NVME_CTRL_PAGE_SIZE - 8); } /* @@ -393,7 +393,7 @@ static int nvme_pci_npages_prp(void) static int nvme_pci_npages_sgl(void) { return DIV_ROUND_UP(NVME_MAX_SEGS * sizeof(struct nvme_sgl_desc), - PAGE_SIZE); + NVME_CTRL_PAGE_SIZE); } static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, @@ -709,7 +709,7 @@ static void nvme_pci_sgl_set_seg(struct nvme_sgl_desc *sge, sge->length = cpu_to_le32(entries * sizeof(*sge)); sge->type = NVME_SGL_FMT_LAST_SEG_DESC << 4; } else { - sge->length = cpu_to_le32(PAGE_SIZE); + sge->length = cpu_to_le32(NVME_CTRL_PAGE_SIZE); sge->type = NVME_SGL_FMT_SEG_DESC << 4; } } -- cgit v1.2.3 From 3659fb5ac29a5e6102bebe494ac789fd47fb78f4 Mon Sep 17 00:00:00 2001 From: Yanjun Zhang Date: Thu, 22 Dec 2022 09:57:21 +0800 Subject: nvme: fix multipath crash caused by flush request when blktrace is enabled The flush request initialized by blk_kick_flush has NULL bio, and it may be dealt with nvme_end_req during io completion. 
When blktrace is enabled, nvme_trace_bio_complete with multipath activated trying to access NULL pointer bio from flush request results in the following crash: [ 2517.831677] BUG: kernel NULL pointer dereference, address: 000000000000001a [ 2517.835213] #PF: supervisor read access in kernel mode [ 2517.838724] #PF: error_code(0x0000) - not-present page [ 2517.842222] PGD 7b2d51067 P4D 0 [ 2517.845684] Oops: 0000 [#1] SMP NOPTI [ 2517.849125] CPU: 2 PID: 732 Comm: kworker/2:1H Kdump: loaded Tainted: G S 5.15.67-0.cl9.x86_64 #1 [ 2517.852723] Hardware name: XFUSION 2288H V6/BC13MBSBC, BIOS 1.13 07/27/2022 [ 2517.856358] Workqueue: nvme_tcp_wq nvme_tcp_io_work [nvme_tcp] [ 2517.859993] RIP: 0010:blk_add_trace_bio_complete+0x6/0x30 [ 2517.863628] Code: 1f 44 00 00 48 8b 46 08 31 c9 ba 04 00 10 00 48 8b 80 50 03 00 00 48 8b 78 50 e9 e5 fe ff ff 0f 1f 44 00 00 41 54 49 89 f4 55 <0f> b6 7a 1a 48 89 d5 e8 3e 1c 2b 00 48 89 ee 4c 89 e7 5d 89 c1 ba [ 2517.871269] RSP: 0018:ff7f6a008d9dbcd0 EFLAGS: 00010286 [ 2517.875081] RAX: ff3d5b4be00b1d50 RBX: 0000000002040002 RCX: ff3d5b0a270f2000 [ 2517.878966] RDX: 0000000000000000 RSI: ff3d5b0b021fb9f8 RDI: 0000000000000000 [ 2517.882849] RBP: ff3d5b0b96a6fa00 R08: 0000000000000001 R09: 0000000000000000 [ 2517.886718] R10: 000000000000000c R11: 000000000000000c R12: ff3d5b0b021fb9f8 [ 2517.890575] R13: 0000000002000000 R14: ff3d5b0b021fb1b0 R15: 0000000000000018 [ 2517.894434] FS: 0000000000000000(0000) GS:ff3d5b42bfc80000(0000) knlGS:0000000000000000 [ 2517.898299] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 2517.902157] CR2: 000000000000001a CR3: 00000004f023e005 CR4: 0000000000771ee0 [ 2517.906053] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 2517.909930] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 2517.913761] PKRU: 55555554 [ 2517.917558] Call Trace: [ 2517.921294] [ 2517.924982] nvme_complete_rq+0x1c3/0x1e0 [nvme_core] [ 2517.928715] nvme_tcp_recv_pdu+0x4d7/0x540 [nvme_tcp] 
[ 2517.932442] nvme_tcp_recv_skb+0x4f/0x240 [nvme_tcp] [ 2517.936137] ? nvme_tcp_recv_pdu+0x540/0x540 [nvme_tcp] [ 2517.939830] tcp_read_sock+0x9c/0x260 [ 2517.943486] nvme_tcp_try_recv+0x65/0xa0 [nvme_tcp] [ 2517.947173] nvme_tcp_io_work+0x64/0x90 [nvme_tcp] [ 2517.950834] process_one_work+0x1e8/0x390 [ 2517.954473] worker_thread+0x53/0x3c0 [ 2517.958069] ? process_one_work+0x390/0x390 [ 2517.961655] kthread+0x10c/0x130 [ 2517.965211] ? set_kthread_struct+0x40/0x40 [ 2517.968760] ret_from_fork+0x1f/0x30 [ 2517.972285] To avoid this situation, add a NULL check for req->bio before calling trace_block_bio_complete. Signed-off-by: Yanjun Zhang Signed-off-by: Christoph Hellwig --- drivers/nvme/host/nvme.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 6bbb73ef8b25..424c8a467a0c 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -893,7 +893,7 @@ static inline void nvme_trace_bio_complete(struct request *req) { struct nvme_ns *ns = req->q->queuedata; - if (req->cmd_flags & REQ_NVME_MPATH) + if ((req->cmd_flags & REQ_NVME_MPATH) && req->bio) trace_block_bio_complete(ns->head->disk->queue, req->bio); } -- cgit v1.2.3 From 246cf66e300b76099b5dbd3fdd39e9a5dbc53f02 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 26 Dec 2022 11:06:05 +0800 Subject: block, bfq: fix uaf for bfqq in bfq_exit_icq_bfqq Commit 64dc8c732f5c ("block, bfq: fix possible uaf for 'bfqq->bic'") will access 'bic->bfqq' in bic_set_bfqq(), however, bfq_exit_icq_bfqq() can free bfqq first, and then call bic_set_bfqq(), which will cause uaf. Fix the problem by moving bfq_exit_bfqq() behind bic_set_bfqq(). 
Fixes: 64dc8c732f5c ("block, bfq: fix possible uaf for 'bfqq->bic'") Reported-by: Yi Zhang Signed-off-by: Yu Kuai Link: https://lore.kernel.org/r/20221226030605.1437081-1-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 16f43bbc575a..ccf2204477a5 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -5317,8 +5317,8 @@ static void bfq_exit_icq_bfqq(struct bfq_io_cq *bic, bool is_sync) unsigned long flags; spin_lock_irqsave(&bfqd->lock, flags); - bfq_exit_bfqq(bfqd, bfqq); bic_set_bfqq(bic, NULL, is_sync); + bfq_exit_bfqq(bfqd, bfqq); spin_unlock_irqrestore(&bfqd->lock, flags); } } -- cgit v1.2.3 From 33b93727ce90c8db916fb071ed13e90106339754 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 25 Dec 2022 11:32:31 +0100 Subject: nvme: fix setting the queue depth in nvme_alloc_io_tag_set While the CAP.MQES field in NVMe is a 0's based field with a natural one off, we also need to account for the queue wrap condition and undo the one off again in nvme_alloc_io_tag_set. This was never properly done by the fabrics drivers, but they don't seem to care because there is no actual physical queue that can wrap around, but it became a problem when converting over the PCIe driver. Also add back the BLK_MQ_MAX_DEPTH check that was lost in the same commit. 
Fixes: 0da7feaa5913 ("nvme-pci: use the tagset alloc/free helpers") Reported-by: Hugh Dickins Signed-off-by: Christoph Hellwig Tested-by: Hugh Dickins Link: https://lore.kernel.org/r/20221225103234.226794-2-hch@lst.de Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index e26b085a007a..cda1361e6d4f 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -4897,7 +4897,7 @@ int nvme_alloc_io_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set, memset(set, 0, sizeof(*set)); set->ops = ops; - set->queue_depth = ctrl->sqsize + 1; + set->queue_depth = min_t(unsigned, ctrl->sqsize, BLK_MQ_MAX_DEPTH - 1); /* * Some Apple controllers requires tags to be unique across admin and * the (only) I/O queue, so reserve the first 32 tags of the I/O queue. -- cgit v1.2.3 From 88d356ca41ba1c3effc2d4208dfbd4392f58cd6d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 25 Dec 2022 11:32:32 +0100 Subject: nvme-pci: update sqsize when adjusting the queue depth Update the core sqsize field in addition to the PCIe-specific q_depth field as the core tagset allocation helpers rely on it. 
Fixes: 0da7feaa5913 ("nvme-pci: use the tagset alloc/free helpers") Signed-off-by: Christoph Hellwig Acked-by: Hugh Dickins Link: https://lore.kernel.org/r/20221225103234.226794-3-hch@lst.de Signed-off-by: Jens Axboe --- drivers/nvme/host/pci.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 804b6a6cb43a..b13baccedb4a 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2333,10 +2333,12 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) if (dev->cmb_use_sqes) { result = nvme_cmb_qdepth(dev, nr_io_queues, sizeof(struct nvme_command)); - if (result > 0) + if (result > 0) { dev->q_depth = result; - else + dev->ctrl.sqsize = result - 1; + } else { dev->cmb_use_sqes = false; + } } do { @@ -2537,7 +2539,6 @@ static int nvme_pci_enable(struct nvme_dev *dev) dev->q_depth = min_t(u32, NVME_CAP_MQES(dev->ctrl.cap) + 1, io_queue_depth); - dev->ctrl.sqsize = dev->q_depth - 1; /* 0's based queue depth */ dev->db_stride = 1 << NVME_CAP_STRIDE(dev->ctrl.cap); dev->dbs = dev->bar + 4096; @@ -2578,7 +2579,7 @@ static int nvme_pci_enable(struct nvme_dev *dev) dev_warn(dev->ctrl.device, "IO queue depth clamped to %d\n", dev->q_depth); } - + dev->ctrl.sqsize = dev->q_depth - 1; /* 0's based queue depth */ nvme_map_cmb(dev); -- cgit v1.2.3 From 8ca4fc323d2e4ab9dabbdd57633af40b0c7e6af9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 12 Dec 2022 11:09:55 +0100 Subject: docs, nvme: add a feature and quirk policy document This adds a document about what specification features are supported by the Linux NVMe driver, and what qualifies for a quirk if an implementation has problems following the specification. 
Signed-off-by: Jens Axboe Signed-off-by: Keith Busch Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig Reviewed-by: Randy Dunlap Acked-by: Jonathan Corbet --- .../maintainer/maintainer-entry-profile.rst | 1 + Documentation/nvme/feature-and-quirk-policy.rst | 77 ++++++++++++++++++++++ MAINTAINERS | 1 + 3 files changed, 79 insertions(+) create mode 100644 Documentation/nvme/feature-and-quirk-policy.rst diff --git a/Documentation/maintainer/maintainer-entry-profile.rst b/Documentation/maintainer/maintainer-entry-profile.rst index 93b2ae6c34a9..cfd37f31077f 100644 --- a/Documentation/maintainer/maintainer-entry-profile.rst +++ b/Documentation/maintainer/maintainer-entry-profile.rst @@ -104,3 +104,4 @@ to do something different in the near future. ../riscv/patch-acceptance ../driver-api/media/maintainer-entry-profile ../driver-api/vfio-pci-device-specific-driver-acceptance + ../nvme/feature-and-quirk-policy diff --git a/Documentation/nvme/feature-and-quirk-policy.rst b/Documentation/nvme/feature-and-quirk-policy.rst new file mode 100644 index 000000000000..c01d836d8e41 --- /dev/null +++ b/Documentation/nvme/feature-and-quirk-policy.rst @@ -0,0 +1,77 @@ +.. SPDX-License-Identifier: GPL-2.0 + +======================================= +Linux NVMe feature and quirk policy +======================================= + +This file explains the policy used to decide what is supported by the +Linux NVMe driver and what is not. + + +Introduction +============ + +NVM Express is an open collection of standards and information. + +The Linux NVMe host driver in drivers/nvme/host/ supports devices +implementing the NVM Express (NVMe) family of specifications, which +currently consists of a number of documents: + + - the NVMe Base specification + - various Command Set specifications (e.g. NVM Command Set) + - various Transport specifications (e.g. 
PCIe, Fibre Channel, RDMA, TCP) + - the NVMe Management Interface specification + +See https://nvmexpress.org/developers/ for the NVMe specifications. + + +Supported features +================== + +NVMe is a large suite of specifications, and contains features that are only +useful or suitable for specific use-cases. It is important to note that Linux +does not aim to implement every feature in the specification. Every additional +feature implemented introduces more code, more maintenance and potentially more +bugs. Hence there is an inherent tradeoff between functionality and +maintainability of the NVMe host driver. + +Any feature implemented in the Linux NVMe host driver must support the +following requirements: + + 1. The feature is specified in a release version of an official NVMe + specification, or in a ratified Technical Proposal (TP) that is + available on NVMe website. Or if it is not directly related to the + on-wire protocol, does not contradict any of the NVMe specifications. + 2. Does not conflict with the Linux architecture, nor the design of the + NVMe host driver. + 3. Has a clear, indisputable value-proposition and a wide consensus across + the community. + +Vendor specific extensions are generally not supported in the NVMe host +driver. + +It is strongly recommended to work with the Linux NVMe and block layer +maintainers and get feedback on specification changes that are intended +to be used by the Linux NVMe host driver in order to avoid conflict at a +later stage. + + +Quirks +====== + +Sometimes implementations of open standards fail to correctly implement parts +of the standards. Linux uses identifier-based quirks to work around such +implementation bugs. The intent of quirks is to deal with widely available +hardware, usually consumer, which Linux users can't use without these quirks. +Typically these implementations are not or only superficially tested with Linux +by the hardware manufacturer. 
+ +The Linux NVMe maintainers decide ad hoc whether to quirk implementations +based on the impact of the problem to Linux users and how it impacts +maintainability of the driver. In general quirks are a last resort, if no +firmware updates or other workarounds are available from the vendor. + +Quirks will not be added to the Linux kernel for hardware that isn't available +on the mass market. Hardware that fails qualification for enterprise Linux +distributions, ChromeOS, Android or other consumers of the Linux kernel +should be fixed before it is shipped instead of relying on Linux quirks. diff --git a/MAINTAINERS b/MAINTAINERS index bb77a3ed9d54..d53b3a6cdc67 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -14827,6 +14827,7 @@ L: linux-nvme@lists.infradead.org S: Supported W: http://git.infradead.org/nvme.git T: git://git.infradead.org/nvme.git +F: Documentation/nvme/ F: drivers/nvme/host/ F: drivers/nvme/common/ F: include/linux/nvme* -- cgit v1.2.3 From 685e6311637e46f3212439ce2789f8a300e5050f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 21 Dec 2022 10:30:45 +0100 Subject: nvme: fix the NVME_CMD_EFFECTS_CSE_MASK definition 3 << 16 does not generate the correct mask for bits 16, 17 and 18. Use the GENMASK macro to generate the correct mask instead. 
Fixes: 84fef62d135b ("nvme: check admin passthru command effects") Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Reviewed-by: Kanchan Joshi --- include/linux/nvme.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/linux/nvme.h b/include/linux/nvme.h index d6be2a686100..d1cd53f2b6ab 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -7,6 +7,7 @@ #ifndef _LINUX_NVME_H #define _LINUX_NVME_H +#include #include #include @@ -639,7 +640,7 @@ enum { NVME_CMD_EFFECTS_NCC = 1 << 2, NVME_CMD_EFFECTS_NIC = 1 << 3, NVME_CMD_EFFECTS_CCC = 1 << 4, - NVME_CMD_EFFECTS_CSE_MASK = 3 << 16, + NVME_CMD_EFFECTS_CSE_MASK = GENMASK(18, 16), NVME_CMD_EFFECTS_UUID_SEL = 1 << 19, }; -- cgit v1.2.3 From 61f37154c599cf9f2f84dcbd9be842f8645a7099 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 12 Dec 2022 15:20:04 +0100 Subject: nvmet: use NVME_CMD_EFFECTS_CSUPP instead of open coding it Use NVME_CMD_EFFECTS_CSUPP instead of open coding it and assign a single value to multiple array entries instead of repeated assignments. 
Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Reviewed-by: Kanchan Joshi Reviewed-by: Chaitanya Kulkarni --- drivers/nvme/target/admin-cmd.c | 35 +++++++++++++++++++---------------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index 53a004ea320c..111a5cb6403f 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -164,26 +164,29 @@ out: static void nvmet_get_cmd_effects_nvm(struct nvme_effects_log *log) { - log->acs[nvme_admin_get_log_page] = cpu_to_le32(1 << 0); - log->acs[nvme_admin_identify] = cpu_to_le32(1 << 0); - log->acs[nvme_admin_abort_cmd] = cpu_to_le32(1 << 0); - log->acs[nvme_admin_set_features] = cpu_to_le32(1 << 0); - log->acs[nvme_admin_get_features] = cpu_to_le32(1 << 0); - log->acs[nvme_admin_async_event] = cpu_to_le32(1 << 0); - log->acs[nvme_admin_keep_alive] = cpu_to_le32(1 << 0); - - log->iocs[nvme_cmd_read] = cpu_to_le32(1 << 0); - log->iocs[nvme_cmd_write] = cpu_to_le32(1 << 0); - log->iocs[nvme_cmd_flush] = cpu_to_le32(1 << 0); - log->iocs[nvme_cmd_dsm] = cpu_to_le32(1 << 0); - log->iocs[nvme_cmd_write_zeroes] = cpu_to_le32(1 << 0); + log->acs[nvme_admin_get_log_page] = + log->acs[nvme_admin_identify] = + log->acs[nvme_admin_abort_cmd] = + log->acs[nvme_admin_set_features] = + log->acs[nvme_admin_get_features] = + log->acs[nvme_admin_async_event] = + log->acs[nvme_admin_keep_alive] = + cpu_to_le32(NVME_CMD_EFFECTS_CSUPP); + + log->iocs[nvme_cmd_read] = + log->iocs[nvme_cmd_write] = + log->iocs[nvme_cmd_flush] = + log->iocs[nvme_cmd_dsm] = + log->iocs[nvme_cmd_write_zeroes] = + cpu_to_le32(NVME_CMD_EFFECTS_CSUPP); } static void nvmet_get_cmd_effects_zns(struct nvme_effects_log *log) { - log->iocs[nvme_cmd_zone_append] = cpu_to_le32(1 << 0); - log->iocs[nvme_cmd_zone_mgmt_send] = cpu_to_le32(1 << 0); - log->iocs[nvme_cmd_zone_mgmt_recv] = cpu_to_le32(1 << 0); + log->iocs[nvme_cmd_zone_append] 
= + log->iocs[nvme_cmd_zone_mgmt_send] = + log->iocs[nvme_cmd_zone_mgmt_recv] = + cpu_to_le32(NVME_CMD_EFFECTS_CSUPP); } static void nvmet_execute_get_log_cmd_effects_ns(struct nvmet_req *req) -- cgit v1.2.3 From f2d1421391bba0b15684d2379a47a089f0e561d0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 12 Dec 2022 15:20:56 +0100 Subject: nvmet: set the LBCC bit for commands that modify data Write, Write Zeroes, Zone append and a Zone Reset through Zone Management Send modify the logical block content of a namespace, so make sure the LBCC bit is reported for them. Fixes: b5d0b38c0475 ("nvmet: add Command Set Identifier support") Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Reviewed-by: Kanchan Joshi Reviewed-by: Chaitanya Kulkarni --- drivers/nvme/target/admin-cmd.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index 111a5cb6403f..6a54ed6fb121 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -174,17 +174,19 @@ static void nvmet_get_cmd_effects_nvm(struct nvme_effects_log *log) cpu_to_le32(NVME_CMD_EFFECTS_CSUPP); log->iocs[nvme_cmd_read] = - log->iocs[nvme_cmd_write] = log->iocs[nvme_cmd_flush] = log->iocs[nvme_cmd_dsm] = - log->iocs[nvme_cmd_write_zeroes] = cpu_to_le32(NVME_CMD_EFFECTS_CSUPP); + log->iocs[nvme_cmd_write] = + log->iocs[nvme_cmd_write_zeroes] = + cpu_to_le32(NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC); } static void nvmet_get_cmd_effects_zns(struct nvme_effects_log *log) { log->iocs[nvme_cmd_zone_append] = log->iocs[nvme_cmd_zone_mgmt_send] = + cpu_to_le32(NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC); log->iocs[nvme_cmd_zone_mgmt_recv] = cpu_to_le32(NVME_CMD_EFFECTS_CSUPP); } -- cgit v1.2.3 From 2a459f6933e1c459bffb7cc73fd6c900edc714bd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 21 Dec 2022 09:51:19 +0100 Subject: nvmet: don't defer passthrough 
commands with trivial effects to the workqueue Mask out the "Command Supported" and "Logical Block Content Change" bits and only defer execution of commands that have non-trivial effects to the workqueue for synchronous execution. This allows to execute admin commands asynchronously on controllers that provide a Command Supported and Effects log page, and will keep allowing to execute Write commands asynchronously once command effects on I/O commands are taken into account. Fixes: c1fef73f793b ("nvmet: add passthru code to process commands") Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Reviewed-by: Kanchan Joshi --- drivers/nvme/target/passthru.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/drivers/nvme/target/passthru.c b/drivers/nvme/target/passthru.c index 79af5140af8b..adc0958755d6 100644 --- a/drivers/nvme/target/passthru.c +++ b/drivers/nvme/target/passthru.c @@ -334,14 +334,13 @@ static void nvmet_passthru_execute_cmd(struct nvmet_req *req) } /* - * If there are effects for the command we are about to execute, or - * an end_req function we need to use nvme_execute_passthru_rq() - * synchronously in a work item seeing the end_req function and - * nvme_passthru_end() can't be called in the request done callback - * which is typically in interrupt context. + * If a command needs post-execution fixups, or there are any + * non-trivial effects, make sure to execute the command synchronously + * in a workqueue so that nvme_passthru_end gets called. 
*/ effects = nvme_command_effects(ctrl, ns, req->cmd->common.opcode); - if (req->p.use_workqueue || effects) { + if (req->p.use_workqueue || + (effects & ~(NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC))) { INIT_WORK(&req->p.work, nvmet_passthru_execute_cmd_work); req->p.rq = rq; queue_work(nvmet_wq, &req->p.work); -- cgit v1.2.3 From 831ed60c2aca2d7c517b2da22897a90224a97d27 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 21 Dec 2022 10:12:17 +0100 Subject: nvme: also return I/O command effects from nvme_command_effects To be able to use the Commands Supported and Effects Log for allowing unprivileged passthrough, it needs to be correctly reported for I/O commands as well. Return the I/O command effects from nvme_command_effects, and also add a default list of effects for the NVM command set. For other command sets, the Commands Supported and Effects log is required to be present already. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Kanchan Joshi --- drivers/nvme/host/core.c | 32 ++++++++++++++++++++++++++------ 1 file changed, 26 insertions(+), 6 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index cda1361e6d4f..d307ae4d8a57 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1074,6 +1074,18 @@ static u32 nvme_known_admin_effects(u8 opcode) return 0; } +static u32 nvme_known_nvm_effects(u8 opcode) +{ + switch (opcode) { + case nvme_cmd_write: + case nvme_cmd_write_zeroes: + case nvme_cmd_write_uncor: + return NVME_CMD_EFFECTS_LBCC; + default: + return 0; + } +} + u32 nvme_command_effects(struct nvme_ctrl *ctrl, struct nvme_ns *ns, u8 opcode) { u32 effects = 0; @@ -1081,16 +1093,24 @@ u32 nvme_command_effects(struct nvme_ctrl *ctrl, struct nvme_ns *ns, u8 opcode) if (ns) { if (ns->head->effects) effects = le32_to_cpu(ns->head->effects->iocs[opcode]); + if (ns->head->ids.csi == NVME_CAP_CSS_NVM) + effects |= nvme_known_nvm_effects(opcode); if (effects & ~(NVME_CMD_EFFECTS_CSUPP 
| NVME_CMD_EFFECTS_LBCC)) dev_warn_once(ctrl->device, - "IO command:%02x has unhandled effects:%08x\n", + "IO command:%02x has unusual effects:%08x\n", opcode, effects); - return 0; - } - if (ctrl->effects) - effects = le32_to_cpu(ctrl->effects->acs[opcode]); - effects |= nvme_known_admin_effects(opcode); + /* + * NVME_CMD_EFFECTS_CSE_MASK causes a freeze of all I/O queues, + * which would deadlock when done on an I/O command. Note that + * we already warn about an unusual effect above. + */ + effects &= ~NVME_CMD_EFFECTS_CSE_MASK; + } else { + if (ctrl->effects) + effects = le32_to_cpu(ctrl->effects->acs[opcode]); + effects |= nvme_known_admin_effects(opcode); + } return effects; } -- cgit v1.2.3 From 6f99ac04c469b5d0a180a4ccea99d25d5dc9d21c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 13 Dec 2022 16:13:38 +0100 Subject: nvme: consult the CSE log page for unprivileged passthrough Commands like Write Zeroes can change the contents of a namespace without actually transferring data. To protect against this, check the Commands Supported and Effects log is supported by the controller for any unprivileged command passthrough and refuse unprivileged passthrough if the command has any effects that can change data or metadata. Note: While the Commands Supported and Effects log page has only been mandatory since NVMe 2.0, it is widely supported because Windows requires it for any command passthrough from userspace. 
Fixes: e4fbcf32c860 ("nvme: identify-namespace without CAP_SYS_ADMIN") Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Reviewed-by: Kanchan Joshi --- drivers/nvme/host/ioctl.c | 28 ++++++++++++++++++++++++---- include/linux/nvme.h | 1 + 2 files changed, 25 insertions(+), 4 deletions(-) diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c index 9ddda571f046..a8639919237e 100644 --- a/drivers/nvme/host/ioctl.c +++ b/drivers/nvme/host/ioctl.c @@ -11,6 +11,8 @@ static bool nvme_cmd_allowed(struct nvme_ns *ns, struct nvme_command *c, fmode_t mode) { + u32 effects; + if (capable(CAP_SYS_ADMIN)) return true; @@ -43,11 +45,29 @@ static bool nvme_cmd_allowed(struct nvme_ns *ns, struct nvme_command *c, } /* - * Only allow I/O commands that transfer data to the controller if the - * special file is open for writing, but always allow I/O commands that - * transfer data from the controller. + * Check if the controller provides a Commands Supported and Effects log + * and marks this command as supported. If not reject unprivileged + * passthrough. + */ + effects = nvme_command_effects(ns->ctrl, ns, c->common.opcode); + if (!(effects & NVME_CMD_EFFECTS_CSUPP)) + return false; + + /* + * Don't allow passthrough for commands that have intrusive (or unknown) + * effects. + */ + if (effects & ~(NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC | + NVME_CMD_EFFECTS_UUID_SEL | + NVME_CMD_EFFECTS_SCOPE_MASK)) + return false; + + /* + * Only allow I/O commands that transfer data to the controller or that + * change the logical block contents if the file descriptor is open for + * writing. 
*/ - if (nvme_is_write(c)) + if (nvme_is_write(c) || (effects & NVME_CMD_EFFECTS_LBCC)) return mode & FMODE_WRITE; return true; } diff --git a/include/linux/nvme.h b/include/linux/nvme.h index d1cd53f2b6ab..4fad4aa245fb 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -642,6 +642,7 @@ enum { NVME_CMD_EFFECTS_CCC = 1 << 4, NVME_CMD_EFFECTS_CSE_MASK = GENMASK(18, 16), NVME_CMD_EFFECTS_UUID_SEL = 1 << 19, + NVME_CMD_EFFECTS_SCOPE_MASK = GENMASK(31, 20), }; struct nvme_effects_log { -- cgit v1.2.3 From 76807fcd73b818eb9f245ef1035aed34ecdd9813 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Sun, 25 Dec 2022 13:28:51 +0200 Subject: nvme-auth: fix smatch warning complaints When initializing auth context, there may be no secrets passed by the user. Make return code explicit when returning successfully. smatch warnings: drivers/nvme/host/auth.c:950 nvme_auth_init_ctrl() warn: missing error code? 'ret' Reported-by: kernel test robot Reported-by: Dan Carpenter Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/auth.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/auth.c b/drivers/nvme/host/auth.c index bb0abbe4491c..4424f53a8a0a 100644 --- a/drivers/nvme/host/auth.c +++ b/drivers/nvme/host/auth.c @@ -953,7 +953,7 @@ int nvme_auth_init_ctrl(struct nvme_ctrl *ctrl) goto err_free_dhchap_secret; if (!ctrl->opts->dhchap_secret && !ctrl->opts->dhchap_ctrl_secret) - return ret; + return 0; ctrl->dhchap_ctxs = kvcalloc(ctrl_max_dhchaps(ctrl), sizeof(*chap), GFP_KERNEL); -- cgit v1.2.3