From 82394db7383d33641f3f565bd79792fb41b1741f Mon Sep 17 00:00:00 2001 From: Matias Bjørling Date: Mon, 29 Jun 2020 12:06:37 -0700 Subject: block: add capacity field to zone descriptors MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In the zoned storage model, the sectors within a zone are typically all writeable. With the introduction of the Zoned Namespace (ZNS) Command Set in the NVM Express organization, the model was extended to have a specific writeable capacity. Extend the zone descriptor data structure with a zone capacity field to indicate to the user how many sectors in a zone are writeable. Introduce backward compatibility in the zone report ioctl by extending the zone report header data structure with a flags field to indicate if the capacity field is available. Reviewed-by: Jens Axboe Reviewed-by: Javier González Reviewed-by: Chaitanya Kulkarni Reviewed-by: Himanshu Madhani Reviewed-by: Martin K. Petersen Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Reviewed-by: Daniel Wagner Signed-off-by: Matias Bjørling Signed-off-by: Christoph Hellwig --- include/uapi/linux/blkzoned.h | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/blkzoned.h b/include/uapi/linux/blkzoned.h index 0cdef67135f0..42c3366cc25f 100644 --- a/include/uapi/linux/blkzoned.h +++ b/include/uapi/linux/blkzoned.h @@ -73,6 +73,15 @@ enum blk_zone_cond { BLK_ZONE_COND_OFFLINE = 0xF, }; +/** + * enum blk_zone_report_flags - Feature flags of reported zone descriptors. + * + * @BLK_ZONE_REP_CAPACITY: Zone descriptor has capacity field. + */ +enum blk_zone_report_flags { + BLK_ZONE_REP_CAPACITY = (1 << 0), +}; + /** * struct blk_zone - Zone descriptor for BLKREPORTZONE ioctl. * @@ -99,7 +108,9 @@ struct blk_zone { __u8 cond; /* Zone condition */ __u8 non_seq; /* Non-sequential write resources active */ __u8 reset; /* Reset write pointer recommended */ - __u8 reserved[36]; + __u8 resv[4]; + __u64 capacity; /* Zone capacity in number of sectors */ + __u8 reserved[24]; }; /** @@ -115,7 +126,7 @@ struct blk_zone { struct blk_zone_report { __u64 sector; __u32 nr_zones; - __u8 reserved[4]; + __u32 flags; struct blk_zone zones[0]; }; -- cgit v1.2.3 From 71010c30945425203da8d069a10fa45a05a00f96 Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Mon, 29 Jun 2020 12:06:39 -0700 Subject: nvme: implement multiple I/O Command Set support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implements support for multiple I/O Command Sets. NVMe TP 4056 introduces a method to enumerate multiple command sets per namespace. If the command set is exposed, this method for enumeration will be used instead of the traditional method that uses the CC.CSS register command set register for command set identification. For namespaces where the Command Set Identifier is not supported or recognized, the specific namespace will not be created. Reviewed-by: Javier González Reviewed-by: Martin K. 
Petersen Reviewed-by: Johannes Thumshirn Reviewed-by: Matias Bjørling Reviewed-by: Daniel Wagner Reviewed-by: Himanshu Madhani Reviewed-by: Hannes Reinecke Signed-off-by: Niklas Cassel Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 53 +++++++++++++++++++++++++++++++++++++++--------- drivers/nvme/host/nvme.h | 1 + include/linux/nvme.h | 19 +++++++++++++++-- 3 files changed, 61 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 96898040a6d5..892291dbee64 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1056,8 +1056,13 @@ static int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id) return error; } +static bool nvme_multi_css(struct nvme_ctrl *ctrl) +{ + return (ctrl->ctrl_config & NVME_CC_CSS_MASK) == NVME_CC_CSS_CSI; +} + static int nvme_process_ns_desc(struct nvme_ctrl *ctrl, struct nvme_ns_ids *ids, - struct nvme_ns_id_desc *cur) + struct nvme_ns_id_desc *cur, bool *csi_seen) { const char *warn_str = "ctrl returned bogus length:"; void *data = cur; @@ -1087,6 +1092,15 @@ static int nvme_process_ns_desc(struct nvme_ctrl *ctrl, struct nvme_ns_ids *ids, } uuid_copy(&ids->uuid, data + sizeof(*cur)); return NVME_NIDT_UUID_LEN; + case NVME_NIDT_CSI: + if (cur->nidl != NVME_NIDT_CSI_LEN) { + dev_warn(ctrl->device, "%s %d for NVME_NIDT_CSI\n", + warn_str, cur->nidl); + return -1; + } + memcpy(&ids->csi, data + sizeof(*cur), NVME_NIDT_CSI_LEN); + *csi_seen = true; + return NVME_NIDT_CSI_LEN; default: /* Skip unknown types */ return cur->nidl; @@ -1097,10 +1111,9 @@ static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid, struct nvme_ns_ids *ids) { struct nvme_command c = { }; - int status; + bool csi_seen = false; + int status, pos, len; void *data; - int pos; - int len; c.identify.opcode = nvme_admin_identify; c.identify.nsid = cpu_to_le32(nsid); @@ -1125,7 +1138,7 @@ static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid, * device just because of a temporal retry-able error (such * as path of transport errors). 
*/ - if (status > 0 && (status & NVME_SC_DNR)) + if (status > 0 && (status & NVME_SC_DNR) && !nvme_multi_css(ctrl)) status = 0; goto free_data; } @@ -1136,12 +1149,19 @@ static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid, if (cur->nidl == 0) break; - len = nvme_process_ns_desc(ctrl, ids, cur); + len = nvme_process_ns_desc(ctrl, ids, cur, &csi_seen); if (len < 0) - goto free_data; + break; len += sizeof(*cur); } + + if (nvme_multi_css(ctrl) && !csi_seen) { + dev_warn(ctrl->device, "Command set not reported for nsid:%d\n", + nsid); + status = -EINVAL; + } + free_data: kfree(data); return status; @@ -1798,7 +1818,7 @@ static int nvme_report_ns_ids(struct nvme_ctrl *ctrl, unsigned int nsid, memcpy(ids->eui64, id->eui64, sizeof(id->eui64)); if (ctrl->vs >= NVME_VS(1, 2, 0)) memcpy(ids->nguid, id->nguid, sizeof(id->nguid)); - if (ctrl->vs >= NVME_VS(1, 3, 0)) + if (ctrl->vs >= NVME_VS(1, 3, 0) || nvme_multi_css(ctrl)) return nvme_identify_ns_descs(ctrl, nsid, ids); return 0; } @@ -1814,7 +1834,8 @@ static bool nvme_ns_ids_equal(struct nvme_ns_ids *a, struct nvme_ns_ids *b) { return uuid_equal(&a->uuid, &b->uuid) && memcmp(&a->nguid, &b->nguid, sizeof(a->nguid)) == 0 && - memcmp(&a->eui64, &b->eui64, sizeof(a->eui64)) == 0; + memcmp(&a->eui64, &b->eui64, sizeof(a->eui64)) == 0 && + a->csi == b->csi; } static int nvme_setup_streams_ns(struct nvme_ctrl *ctrl, struct nvme_ns *ns, @@ -1936,6 +1957,15 @@ static int __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id) if (ns->lba_shift == 0) ns->lba_shift = 9; + switch (ns->head->ids.csi) { + case NVME_CSI_NVM: + break; + default: + dev_warn(ctrl->device, "unknown csi:%d ns:%d\n", + ns->head->ids.csi, ns->head->ns_id); + return -ENODEV; + } + if ((ctrl->quirks & NVME_QUIRK_STRIPE_SIZE) && is_power_of_2(ctrl->max_hw_sectors)) iob = ctrl->max_hw_sectors; @@ -2270,7 +2300,10 @@ int nvme_enable_ctrl(struct nvme_ctrl *ctrl) ctrl->page_size = 1 << page_shift; - ctrl->ctrl_config = NVME_CC_CSS_NVM; + if (NVME_CAP_CSS(ctrl->cap) & NVME_CAP_CSS_CSI) + ctrl->ctrl_config = NVME_CC_CSS_CSI; + else + ctrl->ctrl_config = NVME_CC_CSS_NVM; ctrl->ctrl_config |= (page_shift - 12) << NVME_CC_MPS_SHIFT; ctrl->ctrl_config |= NVME_CC_AMS_RR | NVME_CC_SHN_NONE; ctrl->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES; diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index b5a2e8b7e0be..5573159f714d 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -339,6 +339,7 @@ struct nvme_ns_ids { u8 eui64[8]; u8 nguid[16]; uuid_t uuid; + u8 csi; }; /* diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 5ce51ab4c50e..81ffe5247505 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -132,6 +132,7 @@ enum { #define NVME_CAP_TIMEOUT(cap) (((cap) >> 24) & 0xff) #define NVME_CAP_STRIDE(cap) (((cap) >> 32) & 0xf) #define NVME_CAP_NSSRC(cap) (((cap) >> 36) & 0x1) +#define NVME_CAP_CSS(cap) (((cap) >> 37) & 0xff) #define NVME_CAP_MPSMIN(cap) (((cap) >> 48) & 0xf) #define NVME_CAP_MPSMAX(cap) (((cap) >> 52) & 0xf) @@ -162,7 +163,6 @@ enum { enum { NVME_CC_ENABLE = 1 << 0, - NVME_CC_CSS_NVM = 0 << 4, NVME_CC_EN_SHIFT = 0, NVME_CC_CSS_SHIFT = 4, NVME_CC_MPS_SHIFT = 7, @@ -170,6 +170,9 @@ enum { NVME_CC_SHN_SHIFT = 14, NVME_CC_IOSQES_SHIFT = 16, NVME_CC_IOCQES_SHIFT = 20, + NVME_CC_CSS_NVM = 0 << NVME_CC_CSS_SHIFT, + NVME_CC_CSS_CSI = 6 << NVME_CC_CSS_SHIFT, + NVME_CC_CSS_MASK = 7 << NVME_CC_CSS_SHIFT, NVME_CC_AMS_RR = 0 << NVME_CC_AMS_SHIFT, NVME_CC_AMS_WRRU = 1 << NVME_CC_AMS_SHIFT, NVME_CC_AMS_VS = 7 << 
NVME_CC_AMS_SHIFT, @@ -179,6 +182,8 @@ enum { NVME_CC_SHN_MASK = 3 << NVME_CC_SHN_SHIFT, NVME_CC_IOSQES = NVME_NVM_IOSQES << NVME_CC_IOSQES_SHIFT, NVME_CC_IOCQES = NVME_NVM_IOCQES << NVME_CC_IOCQES_SHIFT, + NVME_CAP_CSS_NVM = 1 << 0, + NVME_CAP_CSS_CSI = 1 << 6, NVME_CSTS_RDY = 1 << 0, NVME_CSTS_CFS = 1 << 1, NVME_CSTS_NSSRO = 1 << 4, @@ -374,6 +379,8 @@ enum { NVME_ID_CNS_CTRL = 0x01, NVME_ID_CNS_NS_ACTIVE_LIST = 0x02, NVME_ID_CNS_NS_DESC_LIST = 0x03, + NVME_ID_CNS_CS_NS = 0x05, + NVME_ID_CNS_CS_CTRL = 0x06, NVME_ID_CNS_NS_PRESENT_LIST = 0x10, NVME_ID_CNS_NS_PRESENT = 0x11, NVME_ID_CNS_CTRL_NS_LIST = 0x12, @@ -383,6 +390,10 @@ enum { NVME_ID_CNS_UUID_LIST = 0x17, }; +enum { + NVME_CSI_NVM = 0, +}; + enum { NVME_DIR_IDENTIFY = 0x00, NVME_DIR_STREAMS = 0x01, @@ -435,11 +446,13 @@ struct nvme_ns_id_desc { #define NVME_NIDT_EUI64_LEN 8 #define NVME_NIDT_NGUID_LEN 16 #define NVME_NIDT_UUID_LEN 16 +#define NVME_NIDT_CSI_LEN 1 enum { NVME_NIDT_EUI64 = 0x01, NVME_NIDT_NGUID = 0x02, NVME_NIDT_UUID = 0x03, + NVME_NIDT_CSI = 0x04, }; struct nvme_smart_log { @@ -972,7 +985,9 @@ struct nvme_identify { __u8 cns; __u8 rsvd3; __le16 ctrlid; - __u32 rsvd11[5]; + __u8 rsvd11[3]; + __u8 csi; + __u32 rsvd12[4]; }; #define NVME_IDENTIFY_DATA_SIZE 4096 -- cgit v1.2.3 From be93e87e780253780df9bb6ecc9bc1199b0d94c3 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 29 Jun 2020 12:06:40 -0700 Subject: nvme: support for multiple Command Sets Supported and Effects log pages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Commands Supported and Effects log page was extended with a CSI field that enables the host to query the log page for each command set supported. Retrieve this log page for each command set that an attached namespace supports, and save a pointer to that log in the namespace head. Reviewed-by: Matias Bjørling Reviewed-by: Javier González Reviewed-by: Himanshu Madhani Reviewed-by: Martin K. 
Petersen Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Reviewed-by: Daniel Wagner Signed-off-by: Keith Busch Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 79 ++++++++++++++++++++++++++++++++----------- drivers/nvme/host/hwmon.c | 2 +- drivers/nvme/host/lightnvm.c | 4 +-- drivers/nvme/host/multipath.c | 2 +- drivers/nvme/host/nvme.h | 10 +++++- include/linux/nvme.h | 4 ++- 6 files changed, 76 insertions(+), 25 deletions(-) (limited to 'include') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 892291dbee64..62b2cdc764da 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1370,8 +1370,8 @@ static u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns, u32 effects = 0; if (ns) { - if (ctrl->effects) - effects = le32_to_cpu(ctrl->effects->iocs[opcode]); + if (ns->head->effects) + effects = le32_to_cpu(ns->head->effects->iocs[opcode]); if (effects & ~(NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC)) dev_warn(ctrl->device, "IO command:%02x has unhandled effects:%08x\n", @@ -2851,7 +2851,7 @@ out_unlock: return ret; } -int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, +int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, u8 csi, void *log, size_t size, u64 offset) { struct nvme_command c = { }; @@ -2865,27 +2865,55 @@ int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, c.get_log_page.numdu = cpu_to_le16(dwlen >> 16); c.get_log_page.lpol = cpu_to_le32(lower_32_bits(offset)); c.get_log_page.lpou = cpu_to_le32(upper_32_bits(offset)); + c.get_log_page.csi = csi; return nvme_submit_sync_cmd(ctrl->admin_q, &c, log, size); } -static int nvme_get_effects_log(struct nvme_ctrl *ctrl) +static struct nvme_cel *nvme_find_cel(struct nvme_ctrl *ctrl, u8 csi) { + struct nvme_cel *cel, *ret = NULL; + + spin_lock(&ctrl->lock); + list_for_each_entry(cel, &ctrl->cels, entry) { + if (cel->csi == csi) { + ret = cel; + break; + } + } + spin_unlock(&ctrl->lock); + + return ret; +} + +static int nvme_get_effects_log(struct nvme_ctrl *ctrl, u8 csi, + struct nvme_effects_log **log) +{ + struct nvme_cel *cel = nvme_find_cel(ctrl, csi); int ret; - if (!ctrl->effects) - ctrl->effects = kzalloc(sizeof(*ctrl->effects), GFP_KERNEL); + if (cel) + goto out; - if (!ctrl->effects) - return 0; + cel = kzalloc(sizeof(*cel), GFP_KERNEL); + if (!cel) + return -ENOMEM; - ret = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_CMD_EFFECTS, 0, - ctrl->effects, sizeof(*ctrl->effects), 0); + ret = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_CMD_EFFECTS, 0, csi, + &cel->log, sizeof(cel->log), 0); if (ret) { - kfree(ctrl->effects); - ctrl->effects = NULL; + kfree(cel); + return ret; } - return ret; + + cel->csi = csi; + + spin_lock(&ctrl->lock); + list_add_tail(&cel->entry, &ctrl->cels); + spin_unlock(&ctrl->lock); +out: + *log = &cel->log; + return 0; } /* @@ -2918,7 +2946,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) } if (id->lpa & NVME_CTRL_LPA_CMD_EFFECTS_LOG) { - ret = nvme_get_effects_log(ctrl); + ret = nvme_get_effects_log(ctrl, NVME_CSI_NVM, &ctrl->effects); if (ret < 0) goto out_free; } @@ -3551,6 +3579,13 @@ static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl, goto out_cleanup_srcu; } + if (head->ids.csi) { + ret = nvme_get_effects_log(ctrl, head->ids.csi, &head->effects); + if (ret) + goto out_cleanup_srcu; + } else + head->effects = ctrl->effects; + ret = nvme_mpath_alloc_disk(ctrl, head); if (ret) goto out_cleanup_srcu; @@ -3891,8 +3926,8 @@ static void 
nvme_clear_changed_ns_log(struct nvme_ctrl *ctrl) * raced with us in reading the log page, which could cause us to miss * updates. */ - error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_CHANGED_NS, 0, log, - log_size, 0); + error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_CHANGED_NS, 0, + NVME_CSI_NVM, log, log_size, 0); if (error) dev_warn(ctrl->device, "reading changed ns log failed: %d\n", error); @@ -4036,8 +4071,8 @@ static void nvme_get_fw_slot_info(struct nvme_ctrl *ctrl) if (!log) return; - if (nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_FW_SLOT, 0, log, - sizeof(*log), 0)) + if (nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_FW_SLOT, 0, NVME_CSI_NVM, + log, sizeof(*log), 0)) dev_warn(ctrl->device, "Get FW SLOT INFO log error\n"); kfree(log); } @@ -4174,11 +4209,16 @@ static void nvme_free_ctrl(struct device *dev) struct nvme_ctrl *ctrl = container_of(dev, struct nvme_ctrl, ctrl_device); struct nvme_subsystem *subsys = ctrl->subsys; + struct nvme_cel *cel, *next; if (subsys && ctrl->instance != subsys->instance) ida_simple_remove(&nvme_instance_ida, ctrl->instance); - kfree(ctrl->effects); + list_for_each_entry_safe(cel, next, &ctrl->cels, entry) { + list_del(&cel->entry); + kfree(cel); + } + nvme_mpath_uninit(ctrl); __free_page(ctrl->discard_page); @@ -4209,6 +4249,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, spin_lock_init(&ctrl->lock); mutex_init(&ctrl->scan_lock); INIT_LIST_HEAD(&ctrl->namespaces); + INIT_LIST_HEAD(&ctrl->cels); init_rwsem(&ctrl->namespaces_rwsem); ctrl->dev = dev; ctrl->ops = ops; diff --git a/drivers/nvme/host/hwmon.c b/drivers/nvme/host/hwmon.c index 2e6477ed420f..23ba8bf678ae 100644 --- a/drivers/nvme/host/hwmon.c +++ b/drivers/nvme/host/hwmon.c @@ -62,7 +62,7 @@ static int nvme_hwmon_get_smart_log(struct nvme_hwmon_data *data) int ret; ret = nvme_get_log(data->ctrl, NVME_NSID_ALL, NVME_LOG_SMART, 0, - &data->log, sizeof(data->log), 0); + NVME_CSI_NVM, &data->log, sizeof(data->log), 0); return ret <= 0 ? 
ret : -EIO; } diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index 69608755d415..8e562d0f2c30 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -593,8 +593,8 @@ static int nvme_nvm_get_chk_meta(struct nvm_dev *ndev, dev_meta_off = dev_meta; ret = nvme_get_log(ctrl, ns->head->ns_id, - NVME_NVM_LOG_REPORT_CHUNK, 0, dev_meta, len, - offset); + NVME_NVM_LOG_REPORT_CHUNK, 0, NVME_CSI_NVM, + dev_meta, len, offset); if (ret) { dev_err(ctrl->device, "Get REPORT CHUNK log error\n"); break; diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 5a37a595411e..74bad4e3d377 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -527,7 +527,7 @@ static int nvme_read_ana_log(struct nvme_ctrl *ctrl) int error; mutex_lock(&ctrl->ana_lock); - error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_ANA, 0, + error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_ANA, 0, NVME_CSI_NVM, ctrl->ana_log_buf, ctrl->ana_log_size, 0); if (error) { dev_warn(ctrl->device, "Failed to get ANA log: %d\n", error); diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 5573159f714d..fe9424c7097f 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -191,6 +191,12 @@ struct nvme_fault_inject { #endif }; +struct nvme_cel { + struct list_head entry; + struct nvme_effects_log log; + u8 csi; +}; + struct nvme_ctrl { bool comp_seen; enum nvme_ctrl_state state; @@ -257,6 +263,7 @@ struct nvme_ctrl { unsigned long quirks; struct nvme_id_power_state psd[32]; struct nvme_effects_log *effects; + struct list_head cels; struct work_struct scan_work; struct work_struct async_event_work; struct delayed_work ka_work; @@ -359,6 +366,7 @@ struct nvme_ns_head { struct kref ref; bool shared; int instance; + struct nvme_effects_log *effects; #ifdef CONFIG_NVME_MULTIPATH struct gendisk *disk; struct bio_list requeue_list; @@ -561,7 +569,7 @@ int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl); int nvme_try_sched_reset(struct nvme_ctrl *ctrl); int nvme_delete_ctrl(struct nvme_ctrl *ctrl); -int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, +int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, u8 csi, void *log, size_t size, u64 offset); extern const struct attribute_group *nvme_ns_id_attr_groups[]; diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 81ffe5247505..95cd03e240a1 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -1101,7 +1101,9 @@ struct nvme_get_log_page_command { }; __le64 lpo; }; - __u32 rsvd14[2]; + __u8 rsvd14[3]; + __u8 csi; + __u32 rsvd15; }; struct nvme_directive_cmd { -- cgit v1.2.3 From 240e6ee272c07a2636dfc7d65f5bbb18377c49e5 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 29 Jun 2020 12:06:41 -0700 Subject: nvme: support for zoned namespaces MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add support for NVM Express Zoned Namespaces (ZNS) Command Set defined in NVM Express TP4053. Zoned namespaces are discovered based on their Command Set Identifier reported in the namespaces Namespace Identification Descriptor list. A successfully discovered Zoned Namespace will be registered with the block layer as a host managed zoned block device with Zone Append command support. A namespace that does not support append is not supported by the driver. Reviewed-by: Martin K. 
Petersen Reviewed-by: Johannes Thumshirn Reviewed-by: Hannes Reinecke Reviewed-by: Sagi Grimberg Reviewed-by: Javier González Reviewed-by: Himanshu Madhani Signed-off-by: Hans Holmberg Signed-off-by: Dmitry Fomichev Signed-off-by: Ajay Joshi Signed-off-by: Aravind Ramesh Signed-off-by: Niklas Cassel Signed-off-by: Matias Bjørling Signed-off-by: Damien Le Moal Signed-off-by: Keith Busch Signed-off-by: Christoph Hellwig --- block/Kconfig | 5 +- drivers/nvme/host/Makefile | 1 + drivers/nvme/host/core.c | 97 ++++++++++++++--- drivers/nvme/host/nvme.h | 39 +++++++ drivers/nvme/host/zns.c | 254 +++++++++++++++++++++++++++++++++++++++++++++ include/linux/nvme.h | 111 ++++++++++++++++++++ 6 files changed, 492 insertions(+), 15 deletions(-) create mode 100644 drivers/nvme/host/zns.c (limited to 'include') diff --git a/block/Kconfig b/block/Kconfig index 9357d7302398..bbad5e8bbffe 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -86,9 +86,10 @@ config BLK_DEV_ZONED select MQ_IOSCHED_DEADLINE help Block layer zoned block device support. This option enables - support for ZAC/ZBC host-managed and host-aware zoned block devices. + support for ZAC/ZBC/ZNS host-managed and host-aware zoned block + devices. - Say yes here if you have a ZAC or ZBC storage device. + Say yes here if you have a ZAC, ZBC, or ZNS storage device. config BLK_DEV_THROTTLING bool "Block layer bio throttling support" diff --git a/drivers/nvme/host/Makefile b/drivers/nvme/host/Makefile index fc7b26be692d..d7f6a87687b8 100644 --- a/drivers/nvme/host/Makefile +++ b/drivers/nvme/host/Makefile @@ -13,6 +13,7 @@ nvme-core-y := core.o nvme-core-$(CONFIG_TRACING) += trace.o nvme-core-$(CONFIG_NVME_MULTIPATH) += multipath.o nvme-core-$(CONFIG_NVM) += lightnvm.o +nvme-core-$(CONFIG_BLK_DEV_ZONED) += zns.o nvme-core-$(CONFIG_FAULT_INJECTION_DEBUG_FS) += fault_inject.o nvme-core-$(CONFIG_NVME_HWMON) += hwmon.o diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 62b2cdc764da..a8ee10a0cd32 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -89,7 +89,7 @@ static dev_t nvme_chr_devt; static struct class *nvme_class; static struct class *nvme_subsys_class; -static int nvme_revalidate_disk(struct gendisk *disk); +static int _nvme_revalidate_disk(struct gendisk *disk); static void nvme_put_subsystem(struct nvme_subsystem *subsys); static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl, unsigned nsid); @@ -287,6 +287,10 @@ void nvme_complete_rq(struct request *req) nvme_retry_req(req); return; } + } else if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && + req_op(req) == REQ_OP_ZONE_APPEND) { + req->__sector = nvme_lba_to_sect(req->q->queuedata, + le64_to_cpu(nvme_req(req)->result.u64)); } nvme_trace_bio_complete(req, status); @@ -673,7 +677,8 @@ static inline blk_status_t nvme_setup_write_zeroes(struct nvme_ns *ns, } static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns, - struct request *req, struct nvme_command *cmnd) + struct request *req, struct nvme_command *cmnd, + enum nvme_opcode op) { struct nvme_ctrl *ctrl = ns->ctrl; u16 control = 0; @@ -687,7 +692,7 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns, if (req->cmd_flags & REQ_RAHEAD) dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH; - cmnd->rw.opcode = (rq_data_dir(req) ? 
nvme_cmd_write : nvme_cmd_read); + cmnd->rw.opcode = op; cmnd->rw.nsid = cpu_to_le32(ns->head->ns_id); cmnd->rw.slba = cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req))); cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1); @@ -716,6 +721,8 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns, case NVME_NS_DPS_PI_TYPE2: control |= NVME_RW_PRINFO_PRCHK_GUARD | NVME_RW_PRINFO_PRCHK_REF; + if (op == nvme_cmd_zone_append) + control |= NVME_RW_APPEND_PIREMAP; cmnd->rw.reftag = cpu_to_le32(t10_pi_ref_tag(req)); break; } @@ -756,6 +763,19 @@ blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req, case REQ_OP_FLUSH: nvme_setup_flush(ns, cmd); break; + case REQ_OP_ZONE_RESET_ALL: + case REQ_OP_ZONE_RESET: + ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_RESET); + break; + case REQ_OP_ZONE_OPEN: + ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_OPEN); + break; + case REQ_OP_ZONE_CLOSE: + ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_CLOSE); + break; + case REQ_OP_ZONE_FINISH: + ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_FINISH); + break; case REQ_OP_WRITE_ZEROES: ret = nvme_setup_write_zeroes(ns, req, cmd); break; @@ -763,8 +783,13 @@ blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req, ret = nvme_setup_discard(ns, req, cmd); break; case REQ_OP_READ: + ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_read); + break; case REQ_OP_WRITE: - ret = nvme_setup_rw(ns, req, cmd); + ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_write); + break; + case REQ_OP_ZONE_APPEND: + ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_zone_append); break; default: WARN_ON_ONCE(1); @@ -1398,14 +1423,23 @@ static u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns, return effects; } -static void nvme_update_formats(struct nvme_ctrl *ctrl) +static void nvme_update_formats(struct nvme_ctrl *ctrl, u32 *effects) { struct nvme_ns *ns; down_read(&ctrl->namespaces_rwsem); list_for_each_entry(ns, &ctrl->namespaces, list) - if (ns->disk && nvme_revalidate_disk(ns->disk)) + if (ns->disk && _nvme_revalidate_disk(ns->disk)) nvme_set_queue_dying(ns); + else if (blk_queue_is_zoned(ns->disk->queue)) { + /* + * IO commands are required to fully revalidate a zoned + * device. Force the command effects to trigger rescan + * work so report zones can run in a context with + * unfrozen IO queues. + */ + *effects |= NVME_CMD_EFFECTS_NCC; + } up_read(&ctrl->namespaces_rwsem); } @@ -1417,7 +1451,7 @@ static void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects) * this command. */ if (effects & NVME_CMD_EFFECTS_LBCC) - nvme_update_formats(ctrl); + nvme_update_formats(ctrl, &effects); if (effects & (NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK)) { nvme_unfreeze(ctrl); nvme_mpath_unfreeze(ctrl->subsys); @@ -1532,7 +1566,7 @@ static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns, * Issue ioctl requests on the first available path. Note that unlike normal * block layer requests we will not retry failed request on another controller. 
*/ -static struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk, +struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk, struct nvme_ns_head **head, int *srcu_idx) { #ifdef CONFIG_NVME_MULTIPATH @@ -1552,7 +1586,7 @@ static struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk, return disk->private_data; } -static void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx) +void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx) { if (head) srcu_read_unlock(&head->srcu, idx); @@ -1945,23 +1979,34 @@ static void nvme_update_disk_info(struct gendisk *disk, static int __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id) { + unsigned lbaf = id->flbas & NVME_NS_FLBAS_LBA_MASK; struct nvme_ns *ns = disk->private_data; struct nvme_ctrl *ctrl = ns->ctrl; + int ret; u32 iob; /* * If identify namespace failed, use default 512 byte block size so * block layer can use before failing read/write for 0 capacity. */ - ns->lba_shift = id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ds; + ns->lba_shift = id->lbaf[lbaf].ds; if (ns->lba_shift == 0) ns->lba_shift = 9; switch (ns->head->ids.csi) { case NVME_CSI_NVM: break; + case NVME_CSI_ZNS: + ret = nvme_update_zone_info(disk, ns, lbaf); + if (ret) { + dev_warn(ctrl->device, + "failed to add zoned namespace:%u ret:%d\n", + ns->head->ns_id, ret); + return ret; + } + break; default: - dev_warn(ctrl->device, "unknown csi:%d ns:%d\n", + dev_warn(ctrl->device, "unknown csi:%u ns:%u\n", ns->head->ids.csi, ns->head->ns_id); return -ENODEV; } @@ -1973,7 +2018,7 @@ static int __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id) iob = nvme_lba_to_sect(ns, le16_to_cpu(id->noiob)); ns->features = 0; - ns->ms = le16_to_cpu(id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ms); + ns->ms = le16_to_cpu(id->lbaf[lbaf].ms); /* the PI implementation requires metadata equal t10 pi tuple size */ if (ns->ms == sizeof(struct t10_pi_tuple)) ns->pi_type = id->dps & NVME_NS_DPS_PI_MASK; @@ -2015,7 +2060,7 @@ static int __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id) return 0; } -static int nvme_revalidate_disk(struct gendisk *disk) +static int _nvme_revalidate_disk(struct gendisk *disk) { struct nvme_ns *ns = disk->private_data; struct nvme_ctrl *ctrl = ns->ctrl; @@ -2063,6 +2108,28 @@ out: return ret; } +static int nvme_revalidate_disk(struct gendisk *disk) +{ + int ret; + + ret = _nvme_revalidate_disk(disk); + if (ret) + return ret; + +#ifdef CONFIG_BLK_DEV_ZONED + if (blk_queue_is_zoned(disk->queue)) { + struct nvme_ns *ns = disk->private_data; + struct nvme_ctrl *ctrl = ns->ctrl; + + ret = blk_revalidate_disk_zones(disk, NULL); + if (!ret) + blk_queue_max_zone_append_sectors(disk->queue, + ctrl->max_zone_append); + } +#endif + return ret; +} + static char nvme_pr_type(enum pr_type type) { switch (type) { @@ -2193,6 +2260,7 @@ static const struct block_device_operations nvme_fops = { .release = nvme_release, .getgeo = nvme_getgeo, .revalidate_disk= nvme_revalidate_disk, + .report_zones = nvme_report_zones, .pr_ops = &nvme_pr_ops, }; @@ -2219,6 +2287,7 @@ const struct block_device_operations nvme_ns_head_ops = { .ioctl = nvme_ioctl, .compat_ioctl = nvme_compat_ioctl, .getgeo = nvme_getgeo, + .report_zones = nvme_report_zones, .pr_ops = &nvme_pr_ops, }; #endif /* CONFIG_NVME_MULTIPATH */ @@ -4446,6 +4515,8 @@ static inline void _nvme_check_size(void) BUILD_BUG_ON(sizeof(struct nvme_command) != 64); BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != NVME_IDENTIFY_DATA_SIZE); BUILD_BUG_ON(sizeof(struct nvme_id_ns) != 
NVME_IDENTIFY_DATA_SIZE); + BUILD_BUG_ON(sizeof(struct nvme_id_ns_zns) != NVME_IDENTIFY_DATA_SIZE); + BUILD_BUG_ON(sizeof(struct nvme_id_ctrl_zns) != NVME_IDENTIFY_DATA_SIZE); BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64); BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512); BUILD_BUG_ON(sizeof(struct nvme_dbbuf) != 64); diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index fe9424c7097f..13ca90bcd352 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -238,6 +238,9 @@ struct nvme_ctrl { u32 max_hw_sectors; u32 max_segments; u32 max_integrity_segments; +#ifdef CONFIG_BLK_DEV_ZONED + u32 max_zone_append; +#endif u16 crdt[3]; u16 oncs; u16 oacs; @@ -404,6 +407,9 @@ struct nvme_ns { u16 sgs; u32 sws; u8 pi_type; +#ifdef CONFIG_BLK_DEV_ZONED + u64 zsze; +#endif unsigned long features; unsigned long flags; #define NVME_NS_REMOVING 0 @@ -571,6 +577,9 @@ int nvme_delete_ctrl(struct nvme_ctrl *ctrl); int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, u8 csi, void *log, size_t size, u64 offset); +struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk, + struct nvme_ns_head **head, int *srcu_idx); +void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx); extern const struct attribute_group *nvme_ns_id_attr_groups[]; extern const struct block_device_operations nvme_ns_head_ops; @@ -693,6 +702,36 @@ static inline void nvme_mpath_start_freeze(struct nvme_subsystem *subsys) } #endif /* CONFIG_NVME_MULTIPATH */ +#ifdef CONFIG_BLK_DEV_ZONED +int nvme_update_zone_info(struct gendisk *disk, struct nvme_ns *ns, + unsigned lbaf); + +int nvme_report_zones(struct gendisk *disk, sector_t sector, + unsigned int nr_zones, report_zones_cb cb, void *data); + +blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns, struct request *req, + struct nvme_command *cmnd, + enum nvme_zone_mgmt_action action); +#else +#define nvme_report_zones NULL + +static inline blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns, + struct request *req, struct nvme_command *cmnd, + enum nvme_zone_mgmt_action action) +{ + return BLK_STS_NOTSUPP; +} + +static inline int nvme_update_zone_info(struct gendisk *disk, + struct nvme_ns *ns, + unsigned lbaf) +{ + dev_warn(ns->ctrl->device, + "Please enable CONFIG_BLK_DEV_ZONED to support ZNS devices\n"); + return -EPROTONOSUPPORT; +} +#endif + #ifdef CONFIG_NVM int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node); void nvme_nvm_unregister(struct nvme_ns *ns); diff --git a/drivers/nvme/host/zns.c b/drivers/nvme/host/zns.c new file mode 100644 index 000000000000..04e5b991c00c --- /dev/null +++ b/drivers/nvme/host/zns.c @@ -0,0 +1,254 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2020 Western Digital Corporation or its affiliates. 
+ */ + +#include +#include +#include "nvme.h" + +static int nvme_set_max_append(struct nvme_ctrl *ctrl) +{ + struct nvme_command c = { }; + struct nvme_id_ctrl_zns *id; + int status; + + id = kzalloc(sizeof(*id), GFP_KERNEL); + if (!id) + return -ENOMEM; + + c.identify.opcode = nvme_admin_identify; + c.identify.cns = NVME_ID_CNS_CS_CTRL; + c.identify.csi = NVME_CSI_ZNS; + + status = nvme_submit_sync_cmd(ctrl->admin_q, &c, id, sizeof(*id)); + if (status) { + kfree(id); + return status; + } + + if (id->zasl) + ctrl->max_zone_append = 1 << (id->zasl + 3); + else + ctrl->max_zone_append = ctrl->max_hw_sectors; + kfree(id); + return 0; +} + +int nvme_update_zone_info(struct gendisk *disk, struct nvme_ns *ns, + unsigned lbaf) +{ + struct nvme_effects_log *log = ns->head->effects; + struct request_queue *q = disk->queue; + struct nvme_command c = { }; + struct nvme_id_ns_zns *id; + int status; + + /* Driver requires zone append support */ + if (!(le32_to_cpu(log->iocs[nvme_cmd_zone_append]) & + NVME_CMD_EFFECTS_CSUPP)) { + dev_warn(ns->ctrl->device, + "append not supported for zoned namespace:%d\n", + ns->head->ns_id); + return -EINVAL; + } + + /* Lazily query controller append limit for the first zoned namespace */ + if (!ns->ctrl->max_zone_append) { + status = nvme_set_max_append(ns->ctrl); + if (status) + return status; + } + + id = kzalloc(sizeof(*id), GFP_KERNEL); + if (!id) + return -ENOMEM; + + c.identify.opcode = nvme_admin_identify; + c.identify.nsid = cpu_to_le32(ns->head->ns_id); + c.identify.cns = NVME_ID_CNS_CS_NS; + c.identify.csi = NVME_CSI_ZNS; + + status = nvme_submit_sync_cmd(ns->ctrl->admin_q, &c, id, sizeof(*id)); + if (status) + goto free_data; + + /* + * We currently do not handle devices requiring any of the zoned + * operation characteristics. 
+ */ + if (id->zoc) { + dev_warn(ns->ctrl->device, + "zone operations:%x not supported for namespace:%u\n", + le16_to_cpu(id->zoc), ns->head->ns_id); + status = -EINVAL; + goto free_data; + } + + ns->zsze = nvme_lba_to_sect(ns, le64_to_cpu(id->lbafe[lbaf].zsze)); + if (!is_power_of_2(ns->zsze)) { + dev_warn(ns->ctrl->device, + "invalid zone size:%llu for namespace:%u\n", + ns->zsze, ns->head->ns_id); + status = -EINVAL; + goto free_data; + } + + q->limits.zoned = BLK_ZONED_HM; + blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q); +free_data: + kfree(id); + return status; +} + +static void *nvme_zns_alloc_report_buffer(struct nvme_ns *ns, + unsigned int nr_zones, size_t *buflen) +{ + struct request_queue *q = ns->disk->queue; + size_t bufsize; + void *buf; + + const size_t min_bufsize = sizeof(struct nvme_zone_report) + + sizeof(struct nvme_zone_descriptor); + + nr_zones = min_t(unsigned int, nr_zones, + get_capacity(ns->disk) >> ilog2(ns->zsze)); + + bufsize = sizeof(struct nvme_zone_report) + + nr_zones * sizeof(struct nvme_zone_descriptor); + bufsize = min_t(size_t, bufsize, + queue_max_hw_sectors(q) << SECTOR_SHIFT); + bufsize = min_t(size_t, bufsize, queue_max_segments(q) << PAGE_SHIFT); + + while (bufsize >= min_bufsize) { + buf = __vmalloc(bufsize, GFP_KERNEL | __GFP_NORETRY); + if (buf) { + *buflen = bufsize; + return buf; + } + bufsize >>= 1; + } + return NULL; +} + +static int __nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector, + struct nvme_zone_report *report, + size_t buflen) +{ + struct nvme_command c = { }; + int ret; + + c.zmr.opcode = nvme_cmd_zone_mgmt_recv; + c.zmr.nsid = cpu_to_le32(ns->head->ns_id); + c.zmr.slba = cpu_to_le64(nvme_sect_to_lba(ns, sector)); + c.zmr.numd = cpu_to_le32(nvme_bytes_to_numd(buflen)); + c.zmr.zra = NVME_ZRA_ZONE_REPORT; + c.zmr.zrasf = NVME_ZRASF_ZONE_REPORT_ALL; + c.zmr.pr = NVME_REPORT_ZONE_PARTIAL; + + ret = nvme_submit_sync_cmd(ns->queue, &c, report, buflen); + if (ret) + return ret; + + return le64_to_cpu(report->nr_zones); +} + +static int nvme_zone_parse_entry(struct nvme_ns *ns, + struct nvme_zone_descriptor *entry, + unsigned int idx, report_zones_cb cb, + void *data) +{ + struct blk_zone zone = { }; + + if ((entry->zt & 0xf) != NVME_ZONE_TYPE_SEQWRITE_REQ) { + dev_err(ns->ctrl->device, "invalid zone type %#x\n", + entry->zt); + return -EINVAL; + } + + zone.type = BLK_ZONE_TYPE_SEQWRITE_REQ; + zone.cond = entry->zs >> 4; + zone.len = ns->zsze; + zone.capacity = nvme_lba_to_sect(ns, le64_to_cpu(entry->zcap)); + zone.start = nvme_lba_to_sect(ns, le64_to_cpu(entry->zslba)); + zone.wp = nvme_lba_to_sect(ns, le64_to_cpu(entry->wp)); + + return cb(&zone, idx, data); +} + +static int nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector, + unsigned int nr_zones, report_zones_cb cb, void *data) +{ + struct nvme_zone_report *report; + int ret, zone_idx = 0; + unsigned int nz, i; + size_t buflen; + + report = nvme_zns_alloc_report_buffer(ns, nr_zones, &buflen); + if (!report) + return -ENOMEM; + + sector &= ~(ns->zsze - 1); + while (zone_idx < nr_zones && sector < get_capacity(ns->disk)) { + memset(report, 0, buflen); + ret = __nvme_ns_report_zones(ns, sector, report, buflen); + if (ret < 0) + goto out_free; + + nz = min_t(unsigned int, ret, nr_zones); + if (!nz) + break; + + for (i = 0; i < nz && zone_idx < nr_zones; i++) { + ret = nvme_zone_parse_entry(ns, &report->entries[i], + zone_idx, cb, data); + if (ret) + goto out_free; + zone_idx++; + } + + sector += ns->zsze * nz; + } + + if (zone_idx > 0) + ret = zone_idx; + else + ret = 
-EINVAL; +out_free: + kvfree(report); + return ret; +} + +int nvme_report_zones(struct gendisk *disk, sector_t sector, + unsigned int nr_zones, report_zones_cb cb, void *data) +{ + struct nvme_ns_head *head = NULL; + struct nvme_ns *ns; + int srcu_idx, ret; + + ns = nvme_get_ns_from_disk(disk, &head, &srcu_idx); + if (unlikely(!ns)) + return -EWOULDBLOCK; + + if (ns->head->ids.csi == NVME_CSI_ZNS) + ret = nvme_ns_report_zones(ns, sector, nr_zones, cb, data); + else + ret = -EINVAL; + nvme_put_ns_from_disk(head, srcu_idx); + + return ret; +} + +blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns, struct request *req, + struct nvme_command *c, enum nvme_zone_mgmt_action action) +{ + c->zms.opcode = nvme_cmd_zone_mgmt_send; + c->zms.nsid = cpu_to_le32(ns->head->ns_id); + c->zms.slba = cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req))); + c->zms.zsa = action; + + if (req_op(req) == REQ_OP_ZONE_RESET_ALL) + c->zms.select_all = 1; + + return BLK_STS_OK; +} diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 95cd03e240a1..1643005d21e3 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -374,6 +374,30 @@ struct nvme_id_ns { __u8 vs[3712]; }; +struct nvme_zns_lbafe { + __le64 zsze; + __u8 zdes; + __u8 rsvd9[7]; +}; + +struct nvme_id_ns_zns { + __le16 zoc; + __le16 ozcs; + __le32 mar; + __le32 mor; + __le32 rrl; + __le32 frl; + __u8 rsvd20[2796]; + struct nvme_zns_lbafe lbafe[16]; + __u8 rsvd3072[768]; + __u8 vs[256]; +}; + +struct nvme_id_ctrl_zns { + __u8 zasl; + __u8 rsvd1[4095]; +}; + enum { NVME_ID_CNS_NS = 0x00, NVME_ID_CNS_CTRL = 0x01, @@ -392,6 +416,7 @@ enum { enum { NVME_CSI_NVM = 0, + NVME_CSI_ZNS = 2, }; enum { @@ -532,6 +557,27 @@ struct nvme_ana_rsp_hdr { __le16 rsvd10[3]; }; +struct nvme_zone_descriptor { + __u8 zt; + __u8 zs; + __u8 za; + __u8 rsvd3[5]; + __le64 zcap; + __le64 zslba; + __le64 wp; + __u8 rsvd32[32]; +}; + +enum { + NVME_ZONE_TYPE_SEQWRITE_REQ = 0x2, +}; + +struct nvme_zone_report { + __le64 nr_zones; + __u8 resv8[56]; + struct nvme_zone_descriptor entries[]; +}; + enum { NVME_SMART_CRIT_SPARE = 1 << 0, NVME_SMART_CRIT_TEMPERATURE = 1 << 1, @@ -626,6 +672,9 @@ enum nvme_opcode { nvme_cmd_resv_report = 0x0e, nvme_cmd_resv_acquire = 0x11, nvme_cmd_resv_release = 0x15, + nvme_cmd_zone_mgmt_send = 0x79, + nvme_cmd_zone_mgmt_recv = 0x7a, + nvme_cmd_zone_append = 0x7d, }; #define nvme_opcode_name(opcode) { opcode, #opcode } @@ -764,6 +813,7 @@ struct nvme_rw_command { enum { NVME_RW_LR = 1 << 15, NVME_RW_FUA = 1 << 14, + NVME_RW_APPEND_PIREMAP = 1 << 9, NVME_RW_DSM_FREQ_UNSPEC = 0, NVME_RW_DSM_FREQ_TYPICAL = 1, NVME_RW_DSM_FREQ_RARE = 2, @@ -829,6 +879,53 @@ struct nvme_write_zeroes_cmd { __le16 appmask; }; +enum nvme_zone_mgmt_action { + NVME_ZONE_CLOSE = 0x1, + NVME_ZONE_FINISH = 0x2, + NVME_ZONE_OPEN = 0x3, + NVME_ZONE_RESET = 0x4, + NVME_ZONE_OFFLINE = 0x5, + NVME_ZONE_SET_DESC_EXT = 0x10, +}; + +struct nvme_zone_mgmt_send_cmd { + __u8 opcode; + __u8 flags; + __u16 command_id; + __le32 nsid; + __le32 cdw2[2]; + __le64 metadata; + union nvme_data_ptr dptr; + __le64 slba; + __le32 cdw12; + __u8 zsa; + __u8 select_all; + __u8 rsvd13[2]; + __le32 cdw14[2]; +}; + +struct nvme_zone_mgmt_recv_cmd { + __u8 opcode; + __u8 flags; + __u16 command_id; + __le32 nsid; + __le64 rsvd2[2]; + union nvme_data_ptr dptr; + __le64 slba; + __le32 numd; + __u8 zra; + __u8 zrasf; + __u8 pr; + __u8 rsvd13; + __le32 cdw14[2]; +}; + +enum { + NVME_ZRA_ZONE_REPORT = 0, + NVME_ZRASF_ZONE_REPORT_ALL = 0, + NVME_REPORT_ZONE_PARTIAL = 1, +}; + /* Features */ enum { @@ 
-1300,6 +1397,8 @@ struct nvme_command { struct nvme_format_cmd format; struct nvme_dsm_cmd dsm; struct nvme_write_zeroes_cmd write_zeroes; + struct nvme_zone_mgmt_send_cmd zms; + struct nvme_zone_mgmt_recv_cmd zmr; struct nvme_abort_cmd abort; struct nvme_get_log_page_command get_log_page; struct nvmf_common_command fabrics; @@ -1433,6 +1532,18 @@ enum { NVME_SC_DISCOVERY_RESTART = 0x190, NVME_SC_AUTH_REQUIRED = 0x191, + /* + * I/O Command Set Specific - Zoned commands: + */ + NVME_SC_ZONE_BOUNDARY_ERROR = 0x1b8, + NVME_SC_ZONE_FULL = 0x1b9, + NVME_SC_ZONE_READ_ONLY = 0x1ba, + NVME_SC_ZONE_OFFLINE = 0x1bb, + NVME_SC_ZONE_INVALID_WRITE = 0x1bc, + NVME_SC_ZONE_TOO_MANY_ACTIVE = 0x1bd, + NVME_SC_ZONE_TOO_MANY_OPEN = 0x1be, + NVME_SC_ZONE_INVALID_TRANSITION = 0x1bf, + /* * Media and Data Integrity Errors: */ -- cgit v1.2.3 From e15864f8ea05b24071b07300459ae7e511d0b938 Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Tue, 14 Jul 2020 23:18:23 +0200 Subject: block: add max_open_zones to blk-sysfs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a new max_open_zones definition in the sysfs documentation. This definition will be common for all devices utilizing the zoned block device support in the kernel. Export max open zones according to this new definition for NVMe Zoned Namespace devices, ZAC ATA devices (which are treated as SCSI devices by the kernel), and ZBC SCSI devices. Add the new max_open_zones member to struct request_queue, rather than as a queue limit, since this property cannot be split across stacking drivers. Signed-off-by: Niklas Cassel Reviewed-by: Javier González Reviewed-by: Damien Le Moal Reviewed-by: Johannes Thumshirn Reviewed-by: Martin K. Petersen Signed-off-by: Jens Axboe --- Documentation/ABI/testing/sysfs-block | 9 +++++++++ Documentation/block/queue-sysfs.rst | 7 +++++++ block/blk-sysfs.c | 15 +++++++++++++++ drivers/nvme/host/zns.c | 1 + drivers/scsi/sd_zbc.c | 4 ++++ include/linux/blkdev.h | 25 +++++++++++++++++++++++++ 6 files changed, 61 insertions(+) (limited to 'include') diff --git a/Documentation/ABI/testing/sysfs-block b/Documentation/ABI/testing/sysfs-block index ed8c14f161ee..f151d9cf90de 100644 --- a/Documentation/ABI/testing/sysfs-block +++ b/Documentation/ABI/testing/sysfs-block @@ -273,6 +273,15 @@ Description: device ("host-aware" or "host-managed" zone model). For regular block devices, the value is always 0. +What: /sys/block//queue/max_open_zones +Date: July 2020 +Contact: Niklas Cassel +Description: + For zoned block devices (zoned attribute indicating + "host-managed" or "host-aware"), the sum of zones belonging to + any of the zone states: EXPLICIT OPEN or IMPLICIT OPEN, + is limited by this value. If this value is 0, there is no limit. + What: /sys/block//queue/chunk_sectors Date: September 2016 Contact: Hannes Reinecke diff --git a/Documentation/block/queue-sysfs.rst b/Documentation/block/queue-sysfs.rst index 6a8513af9201..f01cf8530ae4 100644 --- a/Documentation/block/queue-sysfs.rst +++ b/Documentation/block/queue-sysfs.rst @@ -117,6 +117,13 @@ Maximum number of elements in a DMA scatter/gather list with integrity data that will be submitted by the block layer core to the associated block driver. +max_open_zones (RO) +------------------- +For zoned block devices (zoned attribute indicating "host-managed" or +"host-aware"), the sum of zones belonging to any of the zone states: +EXPLICIT OPEN or IMPLICIT OPEN, is limited by this value. +If this value is 0, there is no limit. 
+ max_sectors_kb (RW) ------------------- This is the maximum number of kilobytes that the block layer will allow diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index be67952e7be2..414f04579d77 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -306,6 +306,11 @@ static ssize_t queue_nr_zones_show(struct request_queue *q, char *page) return queue_var_show(blk_queue_nr_zones(q), page); } +static ssize_t queue_max_open_zones_show(struct request_queue *q, char *page) +{ + return queue_var_show(queue_max_open_zones(q), page); +} + static ssize_t queue_nomerges_show(struct request_queue *q, char *page) { return queue_var_show((blk_queue_nomerges(q) << 1) | @@ -668,6 +673,11 @@ static struct queue_sysfs_entry queue_nr_zones_entry = { .show = queue_nr_zones_show, }; +static struct queue_sysfs_entry queue_max_open_zones_entry = { + .attr = {.name = "max_open_zones", .mode = 0444 }, + .show = queue_max_open_zones_show, +}; + static struct queue_sysfs_entry queue_nomerges_entry = { .attr = {.name = "nomerges", .mode = 0644 }, .show = queue_nomerges_show, @@ -766,6 +776,7 @@ static struct attribute *queue_attrs[] = { &queue_nonrot_entry.attr, &queue_zoned_entry.attr, &queue_nr_zones_entry.attr, + &queue_max_open_zones_entry.attr, &queue_nomerges_entry.attr, &queue_rq_affinity_entry.attr, &queue_iostats_entry.attr, @@ -793,6 +804,10 @@ static umode_t queue_attr_visible(struct kobject *kobj, struct attribute *attr, (!q->mq_ops || !q->mq_ops->timeout)) return 0; + if (attr == &queue_max_open_zones_entry.attr && + !blk_queue_is_zoned(q)) + return 0; + return attr->mode; } diff --git a/drivers/nvme/host/zns.c b/drivers/nvme/host/zns.c index 04e5b991c00c..3d80b9cf6bfc 100644 --- a/drivers/nvme/host/zns.c +++ b/drivers/nvme/host/zns.c @@ -96,6 +96,7 @@ int nvme_update_zone_info(struct gendisk *disk, struct nvme_ns *ns, q->limits.zoned = BLK_ZONED_HM; blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q); + blk_queue_max_open_zones(q, le32_to_cpu(id->mor) + 1); free_data: kfree(id); return status; diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c index 183a20720da9..aa3564139b40 100644 --- a/drivers/scsi/sd_zbc.c +++ b/drivers/scsi/sd_zbc.c @@ -717,6 +717,10 @@ int sd_zbc_read_zones(struct scsi_disk *sdkp, unsigned char *buf) /* The drive satisfies the kernel restrictions: set it up */ blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q); blk_queue_required_elevator_features(q, ELEVATOR_F_ZBD_SEQ_WRITE); + if (sdkp->zones_max_open == U32_MAX) + blk_queue_max_open_zones(q, 0); + else + blk_queue_max_open_zones(q, sdkp->zones_max_open); nr_zones = round_up(sdkp->capacity, zone_blocks) >> ilog2(zone_blocks); /* READ16/WRITE16 is mandatory for ZBC disks */ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index de7adc59b993..c8beb8bbdb08 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -513,6 +513,7 @@ struct request_queue { unsigned int nr_zones; unsigned long *conv_zones_bitmap; unsigned long *seq_zones_wlock; + unsigned int max_open_zones; #endif /* CONFIG_BLK_DEV_ZONED */ /* @@ -722,6 +723,17 @@ static inline bool blk_queue_zone_is_seq(struct request_queue *q, return true; return !test_bit(blk_queue_zone_no(q, sector), q->conv_zones_bitmap); } + +static inline void blk_queue_max_open_zones(struct request_queue *q, + unsigned int max_open_zones) +{ + q->max_open_zones = max_open_zones; +} + +static inline unsigned int queue_max_open_zones(const struct request_queue *q) +{ + return q->max_open_zones; +} #else /* CONFIG_BLK_DEV_ZONED */ static inline unsigned int 
blk_queue_nr_zones(struct request_queue *q) { @@ -737,6 +749,10 @@ static inline unsigned int blk_queue_zone_no(struct request_queue *q, { return 0; } +static inline unsigned int queue_max_open_zones(const struct request_queue *q) +{ + return 0; +} #endif /* CONFIG_BLK_DEV_ZONED */ static inline bool rq_is_sync(struct request *rq) @@ -1519,6 +1535,15 @@ static inline sector_t bdev_zone_sectors(struct block_device *bdev) return 0; } +static inline unsigned int bdev_max_open_zones(struct block_device *bdev) +{ + struct request_queue *q = bdev_get_queue(bdev); + + if (q) + return queue_max_open_zones(q); + return 0; +} + static inline int queue_dma_alignment(const struct request_queue *q) { return q ? q->dma_alignment : 511; -- cgit v1.2.3 From 659bf827ba8f1183b714341d8a1d4b1e446178d9 Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Tue, 14 Jul 2020 23:18:24 +0200 Subject: block: add max_active_zones to blk-sysfs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a new max_active zones definition in the sysfs documentation. This definition will be common for all devices utilizing the zoned block device support in the kernel. Export max_active_zones according to this new definition for NVMe Zoned Namespace devices, ZAC ATA devices (which are treated as SCSI devices by the kernel), and ZBC SCSI devices. Add the new max_active_zones member to struct request_queue, rather than as a queue limit, since this property cannot be split across stacking drivers. For SCSI devices, even though max active zones is not part of the ZBC/ZAC spec, export max_active_zones as 0, signifying "no limit". Signed-off-by: Niklas Cassel Reviewed-by: Javier González Reviewed-by: Damien Le Moal Reviewed-by: Johannes Thumshirn Reviewed-by: Martin K. Petersen Signed-off-by: Jens Axboe --- Documentation/ABI/testing/sysfs-block | 9 +++++++++ Documentation/block/queue-sysfs.rst | 7 +++++++ block/blk-sysfs.c | 14 +++++++++++++- drivers/nvme/host/zns.c | 1 + drivers/scsi/sd_zbc.c | 1 + include/linux/blkdev.h | 25 +++++++++++++++++++++++++ 6 files changed, 56 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/Documentation/ABI/testing/sysfs-block b/Documentation/ABI/testing/sysfs-block index f151d9cf90de..2322eb748b38 100644 --- a/Documentation/ABI/testing/sysfs-block +++ b/Documentation/ABI/testing/sysfs-block @@ -273,6 +273,15 @@ Description: device ("host-aware" or "host-managed" zone model). For regular block devices, the value is always 0. +What: /sys/block//queue/max_active_zones +Date: July 2020 +Contact: Niklas Cassel +Description: + For zoned block devices (zoned attribute indicating + "host-managed" or "host-aware"), the sum of zones belonging to + any of the zone states: EXPLICIT OPEN, IMPLICIT OPEN or CLOSED, + is limited by this value. If this value is 0, there is no limit. + What: /sys/block//queue/max_open_zones Date: July 2020 Contact: Niklas Cassel diff --git a/Documentation/block/queue-sysfs.rst b/Documentation/block/queue-sysfs.rst index f01cf8530ae4..f261a5c84170 100644 --- a/Documentation/block/queue-sysfs.rst +++ b/Documentation/block/queue-sysfs.rst @@ -117,6 +117,13 @@ Maximum number of elements in a DMA scatter/gather list with integrity data that will be submitted by the block layer core to the associated block driver. 
+max_active_zones (RO) +--------------------- +For zoned block devices (zoned attribute indicating "host-managed" or +"host-aware"), the sum of zones belonging to any of the zone states: +EXPLICIT OPEN, IMPLICIT OPEN or CLOSED, is limited by this value. +If this value is 0, there is no limit. + max_open_zones (RO) ------------------- For zoned block devices (zoned attribute indicating "host-managed" or diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 414f04579d77..7dda709f3ccb 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -311,6 +311,11 @@ static ssize_t queue_max_open_zones_show(struct request_queue *q, char *page) return queue_var_show(queue_max_open_zones(q), page); } +static ssize_t queue_max_active_zones_show(struct request_queue *q, char *page) +{ + return queue_var_show(queue_max_active_zones(q), page); +} + static ssize_t queue_nomerges_show(struct request_queue *q, char *page) { return queue_var_show((blk_queue_nomerges(q) << 1) | @@ -678,6 +683,11 @@ static struct queue_sysfs_entry queue_max_open_zones_entry = { .show = queue_max_open_zones_show, }; +static struct queue_sysfs_entry queue_max_active_zones_entry = { + .attr = {.name = "max_active_zones", .mode = 0444 }, + .show = queue_max_active_zones_show, +}; + static struct queue_sysfs_entry queue_nomerges_entry = { .attr = {.name = "nomerges", .mode = 0644 }, .show = queue_nomerges_show, @@ -777,6 +787,7 @@ static struct attribute *queue_attrs[] = { &queue_zoned_entry.attr, &queue_nr_zones_entry.attr, &queue_max_open_zones_entry.attr, + &queue_max_active_zones_entry.attr, &queue_nomerges_entry.attr, &queue_rq_affinity_entry.attr, &queue_iostats_entry.attr, @@ -804,7 +815,8 @@ static umode_t queue_attr_visible(struct kobject *kobj, struct attribute *attr, (!q->mq_ops || !q->mq_ops->timeout)) return 0; - if (attr == &queue_max_open_zones_entry.attr && + if ((attr == &queue_max_open_zones_entry.attr || + attr == &queue_max_active_zones_entry.attr) && !blk_queue_is_zoned(q)) return 0; diff --git a/drivers/nvme/host/zns.c b/drivers/nvme/host/zns.c index 3d80b9cf6bfc..57cfd78731fb 100644 --- a/drivers/nvme/host/zns.c +++ b/drivers/nvme/host/zns.c @@ -97,6 +97,7 @@ int nvme_update_zone_info(struct gendisk *disk, struct nvme_ns *ns, q->limits.zoned = BLK_ZONED_HM; blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q); blk_queue_max_open_zones(q, le32_to_cpu(id->mor) + 1); + blk_queue_max_active_zones(q, le32_to_cpu(id->mar) + 1); free_data: kfree(id); return status; diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c index aa3564139b40..d8b2c49d645b 100644 --- a/drivers/scsi/sd_zbc.c +++ b/drivers/scsi/sd_zbc.c @@ -721,6 +721,7 @@ int sd_zbc_read_zones(struct scsi_disk *sdkp, unsigned char *buf) blk_queue_max_open_zones(q, 0); else blk_queue_max_open_zones(q, sdkp->zones_max_open); + blk_queue_max_active_zones(q, 0); nr_zones = round_up(sdkp->capacity, zone_blocks) >> ilog2(zone_blocks); /* READ16/WRITE16 is mandatory for ZBC disks */ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index c8beb8bbdb08..285b59cfc064 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -514,6 +514,7 @@ struct request_queue { unsigned long *conv_zones_bitmap; unsigned long *seq_zones_wlock; unsigned int max_open_zones; + unsigned int max_active_zones; #endif /* CONFIG_BLK_DEV_ZONED */ /* @@ -734,6 +735,17 @@ static inline unsigned int queue_max_open_zones(const struct request_queue *q) { return q->max_open_zones; } + +static inline void blk_queue_max_active_zones(struct request_queue *q, + unsigned int 
max_active_zones) +{ + q->max_active_zones = max_active_zones; +} + +static inline unsigned int queue_max_active_zones(const struct request_queue *q) +{ + return q->max_active_zones; +} #else /* CONFIG_BLK_DEV_ZONED */ static inline unsigned int blk_queue_nr_zones(struct request_queue *q) { @@ -753,6 +765,10 @@ static inline unsigned int queue_max_open_zones(const struct request_queue *q) { return 0; } +static inline unsigned int queue_max_active_zones(const struct request_queue *q) +{ + return 0; +} #endif /* CONFIG_BLK_DEV_ZONED */ static inline bool rq_is_sync(struct request *rq) @@ -1544,6 +1560,15 @@ static inline unsigned int bdev_max_open_zones(struct block_device *bdev) return 0; } +static inline unsigned int bdev_max_active_zones(struct block_device *bdev) +{ + struct request_queue *q = bdev_get_queue(bdev); + + if (q) + return queue_max_active_zones(q); + return 0; +} + static inline int queue_dma_alignment(const struct request_queue *q) { return q ? q->dma_alignment : 511; -- cgit v1.2.3 From c333f9495c451d958c6f4a41e5de2d8f80f79496 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 17 Jul 2020 16:37:13 -0700 Subject: raid: md_p.h: drop duplicated word in a comment Drop the doubled word "the" in a comment. Signed-off-by: Randy Dunlap Cc: Song Liu Cc: linux-raid@vger.kernel.org Signed-off-by: Song Liu --- include/uapi/linux/raid/md_p.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/raid/md_p.h b/include/uapi/linux/raid/md_p.h index 1f2d8c81f0e0..e5a98a16f9b0 100644 --- a/include/uapi/linux/raid/md_p.h +++ b/include/uapi/linux/raid/md_p.h @@ -123,7 +123,7 @@ typedef struct mdp_device_descriptor_s { /* * Notes: - * - if an array is being reshaped (restriped) in order to change the + * - if an array is being reshaped (restriped) in order to change * the number of active devices in the array, 'raid_disks' will be * the larger of the old and new numbers. 'delta_disks' will * be the "new - old". So if +ve, raid_disks is the new value, and -- cgit v1.2.3 From d721a43ff69cd473019c3b77aacb76b09102aca3 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Sat, 25 Jul 2020 20:00:27 +0800 Subject: bcache: increase super block version for cache device and backing device The new added super block version BCACHE_SB_VERSION_BDEV_WITH_FEATURES (5) BCACHE_SB_VERSION_CDEV_WITH_FEATURES value (6), is for the feature set bits. Devices have super block version equal to the new version will have three new members for feature set bits in the on-disk super block, __le64 feature_compat; __le64 feature_incompat; __le64 feature_ro_compat; They are used for further new features which may introduce on-disk format change, and avoid unncessary super block version increase. The very basic features handling code skeleton is also initialized in this patch. 
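As an illustrative aside (not part of this patch): once the feature_compat/feature_ro_compat/feature_incompat words and the BCH_FEATURE_*_FUNCS() macros below exist, a later on-disk feature only needs a new bit plus one macro invocation, and mount-time code can reject unknown incompatible bits instead of bumping the superblock version again. The flag name BCH_FEATURE_INCOMPAT_EXAMPLE and the check function here are hypothetical, shown only to sketch how the skeleton is meant to be used.

/* Hypothetical feature bit -- an assumption for illustration only. */
#define BCH_FEATURE_INCOMPAT_EXAMPLE	0x0001

/*
 * Expands to bch_has_feature_example(), bch_set_feature_example() and
 * bch_clear_feature_example() via the macro skeleton added in this patch.
 */
BCH_FEATURE_INCOMPAT_FUNCS(example, EXAMPLE);

/*
 * Sketch of a mount-time sanity check: any incompat bit outside the
 * supported mask (BCH_FEATURE_INCOMPAT_SUUP, which would grow as features
 * become supported) makes the device unusable by this kernel.
 */
static const char *bch_check_features(struct cache_sb *sb)
{
	if (sb->feature_incompat & ~((__u64)BCH_FEATURE_INCOMPAT_SUUP))
		return "Unsupported incompatible feature found";
	return NULL;
}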
Signed-off-by: Coly Li Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- drivers/md/bcache/features.h | 78 ++++++++++++++++++++++++++++++++++++++++++++ drivers/md/bcache/super.c | 32 ++++++++++++++++-- include/uapi/linux/bcache.h | 29 +++++++++++----- 3 files changed, 128 insertions(+), 11 deletions(-) create mode 100644 drivers/md/bcache/features.h (limited to 'include') diff --git a/drivers/md/bcache/features.h b/drivers/md/bcache/features.h new file mode 100644 index 000000000000..ae7df37b9862 --- /dev/null +++ b/drivers/md/bcache/features.h @@ -0,0 +1,78 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _BCACHE_FEATURES_H +#define _BCACHE_FEATURES_H + +#include +#include +#include + +#define BCH_FEATURE_COMPAT 0 +#define BCH_FEATURE_RO_COMPAT 1 +#define BCH_FEATURE_INCOMPAT 2 +#define BCH_FEATURE_TYPE_MASK 0x03 + +#define BCH_FEATURE_COMPAT_SUUP 0 +#define BCH_FEATURE_RO_COMPAT_SUUP 0 +#define BCH_FEATURE_INCOMPAT_SUUP 0 + +#define BCH_HAS_COMPAT_FEATURE(sb, mask) \ + ((sb)->feature_compat & (mask)) +#define BCH_HAS_RO_COMPAT_FEATURE(sb, mask) \ + ((sb)->feature_ro_compat & (mask)) +#define BCH_HAS_INCOMPAT_FEATURE(sb, mask) \ + ((sb)->feature_incompat & (mask)) + +/* Feature set definition */ + +#define BCH_FEATURE_COMPAT_FUNCS(name, flagname) \ +static inline int bch_has_feature_##name(struct cache_sb *sb) \ +{ \ + return (((sb)->feature_compat & \ + BCH##_FEATURE_COMPAT_##flagname) != 0); \ +} \ +static inline void bch_set_feature_##name(struct cache_sb *sb) \ +{ \ + (sb)->feature_compat |= \ + BCH##_FEATURE_COMPAT_##flagname; \ +} \ +static inline void bch_clear_feature_##name(struct cache_sb *sb) \ +{ \ + (sb)->feature_compat &= \ + ~BCH##_FEATURE_COMPAT_##flagname; \ +} + +#define BCH_FEATURE_RO_COMPAT_FUNCS(name, flagname) \ +static inline int bch_has_feature_##name(struct cache_sb *sb) \ +{ \ + return (((sb)->feature_ro_compat & \ + BCH##_FEATURE_RO_COMPAT_##flagname) != 0); \ +} \ +static inline void bch_set_feature_##name(struct cache_sb *sb) \ +{ \ + (sb)->feature_ro_compat |= \ + BCH##_FEATURE_RO_COMPAT_##flagname; \ +} \ +static inline void bch_clear_feature_##name(struct cache_sb *sb) \ +{ \ + (sb)->feature_ro_compat &= \ + ~BCH##_FEATURE_RO_COMPAT_##flagname; \ +} + +#define BCH_FEATURE_INCOMPAT_FUNCS(name, flagname) \ +static inline int bch_has_feature_##name(struct cache_sb *sb) \ +{ \ + return (((sb)->feature_incompat & \ + BCH##_FEATURE_INCOMPAT_##flagname) != 0); \ +} \ +static inline void bch_set_feature_##name(struct cache_sb *sb) \ +{ \ + (sb)->feature_incompat |= \ + BCH##_FEATURE_INCOMPAT_##flagname; \ +} \ +static inline void bch_clear_feature_##name(struct cache_sb *sb) \ +{ \ + (sb)->feature_incompat &= \ + ~BCH##_FEATURE_INCOMPAT_##flagname; \ +} + +#endif diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 40fb18028c01..c6ef410a21a8 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -13,6 +13,7 @@ #include "extents.h" #include "request.h" #include "writeback.h" +#include "features.h" #include #include @@ -194,6 +195,7 @@ static const char *read_super(struct cache_sb *sb, struct block_device *bdev, sb->data_offset = BDEV_DATA_START_DEFAULT; break; case BCACHE_SB_VERSION_BDEV_WITH_OFFSET: + case BCACHE_SB_VERSION_BDEV_WITH_FEATURES: sb->data_offset = le64_to_cpu(s->data_offset); err = "Bad data offset"; @@ -207,6 +209,14 @@ static const char *read_super(struct cache_sb *sb, struct block_device *bdev, if (err) goto err; break; + case BCACHE_SB_VERSION_CDEV_WITH_FEATURES: + err = 
read_super_common(sb, bdev, s); + if (err) + goto err; + sb->feature_compat = le64_to_cpu(s->feature_compat); + sb->feature_incompat = le64_to_cpu(s->feature_incompat); + sb->feature_ro_compat = le64_to_cpu(s->feature_ro_compat); + break; default: err = "Unsupported superblock version"; goto err; @@ -241,7 +251,6 @@ static void __write_super(struct cache_sb *sb, struct cache_sb_disk *out, offset_in_page(out)); out->offset = cpu_to_le64(sb->offset); - out->version = cpu_to_le64(sb->version); memcpy(out->uuid, sb->uuid, 16); memcpy(out->set_uuid, sb->set_uuid, 16); @@ -257,6 +266,13 @@ static void __write_super(struct cache_sb *sb, struct cache_sb_disk *out, for (i = 0; i < sb->keys; i++) out->d[i] = cpu_to_le64(sb->d[i]); + if (sb->version >= BCACHE_SB_VERSION_CDEV_WITH_FEATURES) { + out->feature_compat = cpu_to_le64(sb->feature_compat); + out->feature_incompat = cpu_to_le64(sb->feature_incompat); + out->feature_ro_compat = cpu_to_le64(sb->feature_ro_compat); + } + + out->version = cpu_to_le64(sb->version); out->csum = csum_set(out); pr_debug("ver %llu, flags %llu, seq %llu\n", @@ -313,17 +329,20 @@ void bcache_write_super(struct cache_set *c) { struct closure *cl = &c->sb_write; struct cache *ca; - unsigned int i; + unsigned int i, version = BCACHE_SB_VERSION_CDEV_WITH_UUID; down(&c->sb_write_mutex); closure_init(cl, &c->cl); c->sb.seq++; + if (c->sb.version > version) + version = c->sb.version; + for_each_cache(ca, c, i) { struct bio *bio = &ca->sb_bio; - ca->sb.version = BCACHE_SB_VERSION_CDEV_WITH_UUID; + ca->sb.version = version; ca->sb.seq = c->sb.seq; ca->sb.last_mount = c->sb.last_mount; @@ -1839,6 +1858,13 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) c->sb.bucket_size = sb->bucket_size; c->sb.nr_in_set = sb->nr_in_set; c->sb.last_mount = sb->last_mount; + c->sb.version = sb->version; + if (c->sb.version >= BCACHE_SB_VERSION_CDEV_WITH_FEATURES) { + c->sb.feature_compat = sb->feature_compat; + c->sb.feature_ro_compat = sb->feature_ro_compat; + c->sb.feature_incompat = sb->feature_incompat; + } + c->bucket_bits = ilog2(sb->bucket_size); c->block_bits = ilog2(sb->block_size); c->nr_uuids = bucket_bytes(c) / sizeof(struct uuid_entry); diff --git a/include/uapi/linux/bcache.h b/include/uapi/linux/bcache.h index 9a1965c6c3d0..47df2db2e727 100644 --- a/include/uapi/linux/bcache.h +++ b/include/uapi/linux/bcache.h @@ -141,11 +141,13 @@ static inline struct bkey *bkey_idx(const struct bkey *k, unsigned int nr_keys) * Version 3: Cache device with new UUID format * Version 4: Backing device with data offset */ -#define BCACHE_SB_VERSION_CDEV 0 -#define BCACHE_SB_VERSION_BDEV 1 -#define BCACHE_SB_VERSION_CDEV_WITH_UUID 3 -#define BCACHE_SB_VERSION_BDEV_WITH_OFFSET 4 -#define BCACHE_SB_MAX_VERSION 4 +#define BCACHE_SB_VERSION_CDEV 0 +#define BCACHE_SB_VERSION_BDEV 1 +#define BCACHE_SB_VERSION_CDEV_WITH_UUID 3 +#define BCACHE_SB_VERSION_BDEV_WITH_OFFSET 4 +#define BCACHE_SB_VERSION_CDEV_WITH_FEATURES 5 +#define BCACHE_SB_VERSION_BDEV_WITH_FEATURES 6 +#define BCACHE_SB_MAX_VERSION 6 #define SB_SECTOR 8 #define SB_OFFSET (SB_SECTOR << SECTOR_SHIFT) @@ -173,7 +175,12 @@ struct cache_sb_disk { __le64 flags; __le64 seq; - __le64 pad[8]; + + __le64 feature_compat; + __le64 feature_incompat; + __le64 feature_ro_compat; + + __le64 pad[5]; union { struct { @@ -224,7 +231,12 @@ struct cache_sb { __u64 flags; __u64 seq; - __u64 pad[8]; + + __u64 feature_compat; + __u64 feature_incompat; + __u64 feature_ro_compat; + + __u64 pad[5]; union { struct { @@ -262,7 +274,8 @@ struct cache_sb { 
static inline _Bool SB_IS_BDEV(const struct cache_sb *sb) { return sb->version == BCACHE_SB_VERSION_BDEV - || sb->version == BCACHE_SB_VERSION_BDEV_WITH_OFFSET; + || sb->version == BCACHE_SB_VERSION_BDEV_WITH_OFFSET + || sb->version == BCACHE_SB_VERSION_BDEV_WITH_FEATURES; } BITMASK(CACHE_SYNC, struct cache_sb, flags, 0, 1); -- cgit v1.2.3 From 4c1ccd0896d6a45f2159280d957afd441a7aeaba Mon Sep 17 00:00:00 2001 From: Coly Li Date: Sat, 25 Jul 2020 20:00:29 +0800 Subject: bcache: struct cache_sb is only for in-memory super block now We already have struct cache_sb_disk for the on-disk super block, so it is unnecessary to keep the in-memory super block format exactly mapped to the on-disk struct layout. This patch adds code comments to note that struct cache_sb does not exactly map to cache_sb_disk, and removes the unused members csum and pad[5]. Although struct cache_sb does not belong to the uapi, some on-disk format related macros still reference it, and it is unnecessary to get rid of that dependency now. So struct cache_sb will continue to stay in include/uapi/linux/bcache.h for now. Signed-off-by: Coly Li Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- include/uapi/linux/bcache.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/bcache.h b/include/uapi/linux/bcache.h index 47df2db2e727..0ef984ea515a 100644 --- a/include/uapi/linux/bcache.h +++ b/include/uapi/linux/bcache.h @@ -215,8 +215,13 @@ struct cache_sb_disk { __le64 d[SB_JOURNAL_BUCKETS]; /* journal buckets */ }; +/* + * This is for in-memory bcache super block. + * NOTE: cache_sb is NOT exactly mapping to cache_sb_disk, the member + * size, ordering and even whole struct size may be different + * from cache_sb_disk. + */ struct cache_sb { - __u64 csum; __u64 offset; /* sector where this sb was written */ __u64 version; @@ -236,8 +241,6 @@ struct cache_sb { __u64 feature_incompat; __u64 feature_ro_compat; - __u64 pad[5]; - union { struct { /* Cache devices */ @@ -245,7 +248,6 @@ struct cache_sb { __u16 block_size; /* sectors */ __u16 bucket_size; /* sectors */ - __u16 nr_in_set; __u16 nr_this_dev; }; -- cgit v1.2.3 From ffa470327572b8f85dceda48fd0676d9658cb8c5 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Sat, 25 Jul 2020 20:00:35 +0800 Subject: bcache: add bucket_size_hi into struct cache_sb_disk for large bucket The large bucket feature extends bucket_size from 16 bits to 32 bits. When creating a cache device on a zoned device (e.g. a zoned NVMe SSD), making a single bucket cover one or more zones of the zoned device is the simplest way for bcache to support a zoned device as cache. But the current maximum bucket size is 16MB while a typical zone size of a zoned device is 256MB; this is the major motivation to extend the bucket size to a larger bit width. This patch is the basic and first change to support large bucket sizes; the major changes it makes are: - Add BCH_FEATURE_INCOMPAT_LARGE_BUCKET for the large bucket feature; INCOMPAT means it introduces an incompatible on-disk format change. - Add BCH_FEATURE_INCOMPAT_FUNCS(large_bucket, LARGE_BUCKET) routines. - Add __le16 bucket_size_hi into struct cache_sb_disk at offset 0x8d0 for the on-disk super block format. - For the in-memory super block struct cache_sb, member bucket_size is extended from __u16 to __u32. - Add get_bucket_size() to combine the bucket_size and bucket_size_hi from struct cache_sb_disk into an unsigned int value (a worked example of this encoding follows the list).
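To make the new encoding concrete: a 256MB bucket is 524288 512-byte sectors (0x80000), which does not fit in the old 16-bit field. With this patch it is stored on disk as bucket_size = 0x0000 and bucket_size_hi = 0x0008, and the high half is only honoured when the LARGE_BUCKET incompat feature bit is set. The short stand-alone helper below restates that reassembly outside the kernel; it is a sketch for illustration, not the patch's get_bucket_size().

#include <assert.h>
#include <stdint.h>

/* Combine the two on-disk 16-bit halves into the 32-bit bucket size. */
static inline uint32_t combined_bucket_size(uint16_t bucket_size, uint16_t bucket_size_hi)
{
        return (uint32_t)bucket_size | ((uint32_t)bucket_size_hi << 16);
}

int main(void)
{
        /* 256MB bucket = 524288 sectors = 0x80000 */
        assert(combined_bucket_size(0x0000, 0x0008) == 524288);
        return 0;
}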
Since we already have large bucket size helpers meta_bucket_pages(), meta_bucket_bytes() and alloc_meta_bucket_pages(), they make sure when bucket size > 8MB, the memory allocation for bcache meta data bucket won't fail no matter how large the bucket size extended. So these meta data buckets are handled properly when the bucket size width increase from 16bit to 32bit, we don't need to worry about them. Signed-off-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/alloc.c | 2 +- drivers/md/bcache/features.c | 22 ++++++++++++++++++++++ drivers/md/bcache/features.h | 9 ++++++--- drivers/md/bcache/movinggc.c | 4 ++-- drivers/md/bcache/super.c | 23 +++++++++++++++++++---- include/uapi/linux/bcache.h | 3 ++- 6 files changed, 52 insertions(+), 11 deletions(-) create mode 100644 drivers/md/bcache/features.c (limited to 'include') diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index a1df0d95151c..52035a78d836 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -87,7 +87,7 @@ void bch_rescale_priorities(struct cache_set *c, int sectors) { struct cache *ca; struct bucket *b; - unsigned int next = c->nbuckets * c->sb.bucket_size / 1024; + unsigned long next = c->nbuckets * c->sb.bucket_size / 1024; unsigned int i; int r; diff --git a/drivers/md/bcache/features.c b/drivers/md/bcache/features.c new file mode 100644 index 000000000000..ba53944bb390 --- /dev/null +++ b/drivers/md/bcache/features.c @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Feature set bits and string conversion. + * Inspired by ext4's features compat/incompat/ro_compat related code. + * + * Copyright 2020 Coly Li + * + */ +#include +#include "bcache.h" + +struct feature { + int compat; + unsigned int mask; + const char *string; +}; + +static struct feature feature_list[] = { + {BCH_FEATURE_INCOMPAT, BCH_FEATURE_INCOMPAT_LARGE_BUCKET, + "large_bucket"}, + {0, 0, 0 }, +}; diff --git a/drivers/md/bcache/features.h b/drivers/md/bcache/features.h index ae7df37b9862..dca052cf5203 100644 --- a/drivers/md/bcache/features.h +++ b/drivers/md/bcache/features.h @@ -11,9 +11,13 @@ #define BCH_FEATURE_INCOMPAT 2 #define BCH_FEATURE_TYPE_MASK 0x03 +/* Feature set definition */ +/* Incompat feature set */ +#define BCH_FEATURE_INCOMPAT_LARGE_BUCKET 0x0001 /* 32bit bucket size */ + #define BCH_FEATURE_COMPAT_SUUP 0 #define BCH_FEATURE_RO_COMPAT_SUUP 0 -#define BCH_FEATURE_INCOMPAT_SUUP 0 +#define BCH_FEATURE_INCOMPAT_SUUP BCH_FEATURE_INCOMPAT_LARGE_BUCKET #define BCH_HAS_COMPAT_FEATURE(sb, mask) \ ((sb)->feature_compat & (mask)) @@ -22,8 +26,6 @@ #define BCH_HAS_INCOMPAT_FEATURE(sb, mask) \ ((sb)->feature_incompat & (mask)) -/* Feature set definition */ - #define BCH_FEATURE_COMPAT_FUNCS(name, flagname) \ static inline int bch_has_feature_##name(struct cache_sb *sb) \ { \ @@ -75,4 +77,5 @@ static inline void bch_clear_feature_##name(struct cache_sb *sb) \ ~BCH##_FEATURE_INCOMPAT_##flagname; \ } +BCH_FEATURE_INCOMPAT_FUNCS(large_bucket, LARGE_BUCKET); #endif diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c index b7dd2d75f58c..5872d6470470 100644 --- a/drivers/md/bcache/movinggc.c +++ b/drivers/md/bcache/movinggc.c @@ -206,8 +206,8 @@ void bch_moving_gc(struct cache_set *c) mutex_lock(&c->bucket_lock); for_each_cache(ca, c, i) { - unsigned int sectors_to_move = 0; - unsigned int reserve_sectors = ca->sb.bucket_size * + unsigned long sectors_to_move = 0; + unsigned long reserve_sectors = ca->sb.bucket_size * fifo_used(&ca->free[RESERVE_MOVINGGC]); ca->heap.used = 0; 
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index e4f05c4ddcdd..62c9681fe92f 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -60,6 +60,17 @@ struct workqueue_struct *bch_journal_wq; /* Superblock */ +static unsigned int get_bucket_size(struct cache_sb *sb, struct cache_sb_disk *s) +{ + unsigned int bucket_size = le16_to_cpu(s->bucket_size); + + if (sb->version >= BCACHE_SB_VERSION_CDEV_WITH_FEATURES && + bch_has_feature_large_bucket(sb)) + bucket_size |= le16_to_cpu(s->bucket_size_hi) << 16; + + return bucket_size; +} + static const char *read_super_common(struct cache_sb *sb, struct block_device *bdev, struct cache_sb_disk *s) { @@ -68,7 +79,7 @@ static const char *read_super_common(struct cache_sb *sb, struct block_device * sb->first_bucket= le16_to_cpu(s->first_bucket); sb->nbuckets = le64_to_cpu(s->nbuckets); - sb->bucket_size = le16_to_cpu(s->bucket_size); + sb->bucket_size = get_bucket_size(sb, s); sb->nr_in_set = le16_to_cpu(s->nr_in_set); sb->nr_this_dev = le16_to_cpu(s->nr_this_dev); @@ -210,12 +221,16 @@ static const char *read_super(struct cache_sb *sb, struct block_device *bdev, goto err; break; case BCACHE_SB_VERSION_CDEV_WITH_FEATURES: - err = read_super_common(sb, bdev, s); - if (err) - goto err; + /* + * Feature bits are needed in read_super_common(), + * convert them firstly. + */ sb->feature_compat = le64_to_cpu(s->feature_compat); sb->feature_incompat = le64_to_cpu(s->feature_incompat); sb->feature_ro_compat = le64_to_cpu(s->feature_ro_compat); + err = read_super_common(sb, bdev, s); + if (err) + goto err; break; default: err = "Unsupported superblock version"; diff --git a/include/uapi/linux/bcache.h b/include/uapi/linux/bcache.h index 0ef984ea515a..52e8bcb33981 100644 --- a/include/uapi/linux/bcache.h +++ b/include/uapi/linux/bcache.h @@ -213,6 +213,7 @@ struct cache_sb_disk { __le16 keys; }; __le64 d[SB_JOURNAL_BUCKETS]; /* journal buckets */ + __le16 bucket_size_hi; }; /* @@ -247,9 +248,9 @@ struct cache_sb { __u64 nbuckets; /* device size */ __u16 block_size; /* sectors */ - __u16 bucket_size; /* sectors */ __u16 nr_in_set; __u16 nr_this_dev; + __u32 bucket_size; /* sectors */ }; struct { /* Backing devices */ -- cgit v1.2.3 From fe5e26a70cc544004b9aa9c3b96cb1d1cc132e09 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 18 Jul 2020 17:30:15 -0700 Subject: nvme-fc: drop a duplicated word in a comment Drop the repeated word "a" in a comment. Signed-off-by: Randy Dunlap Signed-off-by: Christoph Hellwig --- include/linux/nvme-fc-driver.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/nvme-fc-driver.h b/include/linux/nvme-fc-driver.h index 41e7795a3ee4..2a38f2b477a5 100644 --- a/include/linux/nvme-fc-driver.h +++ b/include/linux/nvme-fc-driver.h @@ -672,7 +672,7 @@ enum { * Values set by the LLDD indicating completion status of the FCP operation. * Must be set prior to calling the done() callback. * @transferred_length: amount of DATA_OUT payload data received by a - * a WRITEDATA operation. If not a WRITEDATA operation, value must + * WRITEDATA operation. If not a WRITEDATA operation, value must * be set to 0. Should equal transfer_length on success. * @fcp_error: status of the FCP operation. Must be 0 on success; on failure * must be a NVME_SC_FC_xxxx value. 
-- cgit v1.2.3 From c1fef73f793b7fd9d2ffcb5ef85807ea55bf7adb Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Fri, 24 Jul 2020 11:25:17 -0600 Subject: nvmet: add passthru code to process commands Add passthru command handling capability for the NVMeOF target and export passthru APIs which are used to integrate passthru code with nvmet-core. The new file passthru.c handles passthru cmd parsing and execution. In the passthru mode, we create a block layer request from the nvmet request and map the data on to the block layer request. Admin commands and features are on an allow list as there are a number of each that don't make too much sense with passthrough. We use an allow list such that new commands can be considered before being blindly passed through. In both cases, vendor specific commands are always allowed. We also reject reservation IO commands as the underlying device cannot differentiate between multiple hosts behind a fabric. Based-on-a-patch-by: Chaitanya Kulkarni Signed-off-by: Logan Gunthorpe Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/target/Makefile | 1 + drivers/nvme/target/admin-cmd.c | 7 +- drivers/nvme/target/core.c | 3 + drivers/nvme/target/nvmet.h | 39 ++++ drivers/nvme/target/passthru.c | 458 ++++++++++++++++++++++++++++++++++++++++ include/linux/nvme.h | 4 + 6 files changed, 510 insertions(+), 2 deletions(-) create mode 100644 drivers/nvme/target/passthru.c (limited to 'include') diff --git a/drivers/nvme/target/Makefile b/drivers/nvme/target/Makefile index 2b33836f3d3e..ebf91fc4c72e 100644 --- a/drivers/nvme/target/Makefile +++ b/drivers/nvme/target/Makefile @@ -11,6 +11,7 @@ obj-$(CONFIG_NVME_TARGET_TCP) += nvmet-tcp.o nvmet-y += core.o configfs.o admin-cmd.o fabrics-cmd.o \ discovery.o io-cmd-file.o io-cmd-bdev.o +nvmet-$(CONFIG_NVME_TARGET_PASSTHRU) += passthru.o nvme-loop-y += loop.o nvmet-rdma-y += rdma.o nvmet-fc-y += fc.o diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index 55918fcef80b..e9fe91786bbb 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -749,7 +749,7 @@ u16 nvmet_set_feat_async_event(struct nvmet_req *req, u32 mask) return 0; } -static void nvmet_execute_set_features(struct nvmet_req *req) +void nvmet_execute_set_features(struct nvmet_req *req) { struct nvmet_subsys *subsys = req->sq->ctrl->subsys; u32 cdw10 = le32_to_cpu(req->cmd->common.cdw10); @@ -824,7 +824,7 @@ void nvmet_get_feat_async_event(struct nvmet_req *req) nvmet_set_result(req, READ_ONCE(req->sq->ctrl->aen_enabled)); } -static void nvmet_execute_get_features(struct nvmet_req *req) +void nvmet_execute_get_features(struct nvmet_req *req) { struct nvmet_subsys *subsys = req->sq->ctrl->subsys; u32 cdw10 = le32_to_cpu(req->cmd->common.cdw10); @@ -940,6 +940,9 @@ u16 nvmet_parse_admin_cmd(struct nvmet_req *req) if (unlikely(ret)) return ret; + if (nvmet_req_passthru_ctrl(req)) + return nvmet_parse_passthru_admin_cmd(req); + switch (cmd->common.opcode) { case nvme_admin_get_log_page: req->execute = nvmet_execute_get_log_page; diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index 59621b816f6e..c5a1c82e699b 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -849,6 +849,9 @@ static u16 nvmet_parse_io_cmd(struct nvmet_req *req) if (unlikely(ret)) return ret; + if (nvmet_req_passthru_ctrl(req)) + return nvmet_parse_passthru_io_cmd(req); + req->ns = nvmet_find_namespace(req->sq->ctrl, cmd->rw.nsid); if (unlikely(!req->ns)) { 
req->error_loc = offsetof(struct nvme_common_command, nsid); diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index 49e14111446c..51719dc62083 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -242,6 +242,10 @@ struct nvmet_subsys { struct config_group allowed_hosts_group; struct nvmet_subsys_model __rcu *model; + +#ifdef CONFIG_NVME_TARGET_PASSTHRU + struct nvme_ctrl *passthru_ctrl; +#endif /* CONFIG_NVME_TARGET_PASSTHRU */ }; static inline struct nvmet_subsys *to_subsys(struct config_item *item) @@ -321,6 +325,11 @@ struct nvmet_req { struct bio_vec *bvec; struct work_struct work; } f; + struct { + struct request *rq; + struct work_struct work; + bool use_workqueue; + } p; }; int sg_cnt; int metadata_sg_cnt; @@ -400,6 +409,8 @@ void nvmet_req_complete(struct nvmet_req *req, u16 status); int nvmet_req_alloc_sgls(struct nvmet_req *req); void nvmet_req_free_sgls(struct nvmet_req *req); +void nvmet_execute_set_features(struct nvmet_req *req); +void nvmet_execute_get_features(struct nvmet_req *req); void nvmet_execute_keep_alive(struct nvmet_req *req); void nvmet_cq_setup(struct nvmet_ctrl *ctrl, struct nvmet_cq *cq, u16 qid, @@ -532,6 +543,34 @@ static inline u32 nvmet_dsm_len(struct nvmet_req *req) sizeof(struct nvme_dsm_range); } +#ifdef CONFIG_NVME_TARGET_PASSTHRU +u16 nvmet_parse_passthru_admin_cmd(struct nvmet_req *req); +u16 nvmet_parse_passthru_io_cmd(struct nvmet_req *req); +static inline struct nvme_ctrl *nvmet_passthru_ctrl(struct nvmet_subsys *subsys) +{ + return subsys->passthru_ctrl; +} +#else /* CONFIG_NVME_TARGET_PASSTHRU */ +static inline u16 nvmet_parse_passthru_admin_cmd(struct nvmet_req *req) +{ + return 0; +} +static inline u16 nvmet_parse_passthru_io_cmd(struct nvmet_req *req) +{ + return 0; +} +static inline struct nvme_ctrl *nvmet_passthru_ctrl(struct nvmet_subsys *subsys) +{ + return NULL; +} +#endif /* CONFIG_NVME_TARGET_PASSTHRU */ + +static inline struct nvme_ctrl * +nvmet_req_passthru_ctrl(struct nvmet_req *req) +{ + return nvmet_passthru_ctrl(req->sq->ctrl->subsys); +} + u16 errno_to_nvme_status(struct nvmet_req *req, int errno); /* Convert a 32-bit number to a 16-bit 0's based number */ diff --git a/drivers/nvme/target/passthru.c b/drivers/nvme/target/passthru.c new file mode 100644 index 000000000000..b9727553ab30 --- /dev/null +++ b/drivers/nvme/target/passthru.c @@ -0,0 +1,458 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * NVMe Over Fabrics Target Passthrough command implementation. + * + * Copyright (c) 2017-2018 Western Digital Corporation or its + * affiliates. + * Copyright (c) 2019-2020, Eideticom Inc. + * + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include + +#include "../host/nvme.h" +#include "nvmet.h" + +MODULE_IMPORT_NS(NVME_TARGET_PASSTHRU); + +static u16 nvmet_passthru_override_id_ctrl(struct nvmet_req *req) +{ + struct nvmet_ctrl *ctrl = req->sq->ctrl; + struct nvme_ctrl *pctrl = ctrl->subsys->passthru_ctrl; + u16 status = NVME_SC_SUCCESS; + struct nvme_id_ctrl *id; + u32 max_hw_sectors; + int page_shift; + + id = kzalloc(sizeof(*id), GFP_KERNEL); + if (!id) + return NVME_SC_INTERNAL; + + status = nvmet_copy_from_sgl(req, 0, id, sizeof(*id)); + if (status) + goto out_free; + + id->cntlid = cpu_to_le16(ctrl->cntlid); + id->ver = cpu_to_le32(ctrl->subsys->ver); + + /* + * The passthru NVMe driver may have a limit on the number of segments + * which depends on the host's memory fragementation. To solve this, + * ensure mdts is limited to the pages equal to the number of segments. 
+ */ + max_hw_sectors = min_not_zero(pctrl->max_segments << (PAGE_SHIFT - 9), + pctrl->max_hw_sectors); + + page_shift = NVME_CAP_MPSMIN(ctrl->cap) + 12; + + id->mdts = ilog2(max_hw_sectors) + 9 - page_shift; + + id->acl = 3; + /* + * We export aerl limit for the fabrics controller, update this when + * passthru based aerl support is added. + */ + id->aerl = NVMET_ASYNC_EVENTS - 1; + + /* emulate kas as most of the PCIe ctrl don't have a support for kas */ + id->kas = cpu_to_le16(NVMET_KAS); + + /* don't support host memory buffer */ + id->hmpre = 0; + id->hmmin = 0; + + id->sqes = min_t(__u8, ((0x6 << 4) | 0x6), id->sqes); + id->cqes = min_t(__u8, ((0x4 << 4) | 0x4), id->cqes); + id->maxcmd = cpu_to_le16(NVMET_MAX_CMD); + + /* don't support fuse commands */ + id->fuses = 0; + + id->sgls = cpu_to_le32(1 << 0); /* we always support SGLs */ + if (ctrl->ops->flags & NVMF_KEYED_SGLS) + id->sgls |= cpu_to_le32(1 << 2); + if (req->port->inline_data_size) + id->sgls |= cpu_to_le32(1 << 20); + + /* + * When passsthru controller is setup using nvme-loop transport it will + * export the passthru ctrl subsysnqn (PCIe NVMe ctrl) and will fail in + * the nvme/host/core.c in the nvme_init_subsystem()->nvme_active_ctrl() + * code path with duplicate ctr subsynqn. In order to prevent that we + * mask the passthru-ctrl subsysnqn with the target ctrl subsysnqn. + */ + memcpy(id->subnqn, ctrl->subsysnqn, sizeof(id->subnqn)); + + /* use fabric id-ctrl values */ + id->ioccsz = cpu_to_le32((sizeof(struct nvme_command) + + req->port->inline_data_size) / 16); + id->iorcsz = cpu_to_le32(sizeof(struct nvme_completion) / 16); + + id->msdbd = ctrl->ops->msdbd; + + /* Support multipath connections with fabrics */ + id->cmic |= 1 << 1; + + /* Disable reservations, see nvmet_parse_passthru_io_cmd() */ + id->oncs &= cpu_to_le16(~NVME_CTRL_ONCS_RESERVATIONS); + + status = nvmet_copy_to_sgl(req, 0, id, sizeof(struct nvme_id_ctrl)); + +out_free: + kfree(id); + return status; +} + +static u16 nvmet_passthru_override_id_ns(struct nvmet_req *req) +{ + u16 status = NVME_SC_SUCCESS; + struct nvme_id_ns *id; + int i; + + id = kzalloc(sizeof(*id), GFP_KERNEL); + if (!id) + return NVME_SC_INTERNAL; + + status = nvmet_copy_from_sgl(req, 0, id, sizeof(struct nvme_id_ns)); + if (status) + goto out_free; + + for (i = 0; i < (id->nlbaf + 1); i++) + if (id->lbaf[i].ms) + memset(&id->lbaf[i], 0, sizeof(id->lbaf[i])); + + id->flbas = id->flbas & ~(1 << 4); + + /* + * Presently the NVMEof target code does not support sending + * metadata, so we must disable it here. This should be updated + * once target starts supporting metadata. 
+ */ + id->mc = 0; + + status = nvmet_copy_to_sgl(req, 0, id, sizeof(*id)); + +out_free: + kfree(id); + return status; +} + +static void nvmet_passthru_execute_cmd_work(struct work_struct *w) +{ + struct nvmet_req *req = container_of(w, struct nvmet_req, p.work); + struct request *rq = req->p.rq; + u16 status; + + nvme_execute_passthru_rq(rq); + + status = nvme_req(rq)->status; + if (status == NVME_SC_SUCCESS && + req->cmd->common.opcode == nvme_admin_identify) { + switch (req->cmd->identify.cns) { + case NVME_ID_CNS_CTRL: + nvmet_passthru_override_id_ctrl(req); + break; + case NVME_ID_CNS_NS: + nvmet_passthru_override_id_ns(req); + break; + } + } + + req->cqe->result = nvme_req(rq)->result; + nvmet_req_complete(req, status); + blk_put_request(rq); +} + +static void nvmet_passthru_req_done(struct request *rq, + blk_status_t blk_status) +{ + struct nvmet_req *req = rq->end_io_data; + + req->cqe->result = nvme_req(rq)->result; + nvmet_req_complete(req, nvme_req(rq)->status); + blk_put_request(rq); +} + +static int nvmet_passthru_map_sg(struct nvmet_req *req, struct request *rq) +{ + int sg_cnt = req->sg_cnt; + struct scatterlist *sg; + int op_flags = 0; + struct bio *bio; + int i, ret; + + if (req->cmd->common.opcode == nvme_cmd_flush) + op_flags = REQ_FUA; + else if (nvme_is_write(req->cmd)) + op_flags = REQ_SYNC | REQ_IDLE; + + bio = bio_alloc(GFP_KERNEL, min(sg_cnt, BIO_MAX_PAGES)); + bio->bi_end_io = bio_put; + bio->bi_opf = req_op(rq) | op_flags; + + for_each_sg(req->sg, sg, req->sg_cnt, i) { + if (bio_add_pc_page(rq->q, bio, sg_page(sg), sg->length, + sg->offset) < sg->length) { + bio_put(bio); + return -EINVAL; + } + sg_cnt--; + } + + ret = blk_rq_append_bio(rq, &bio); + if (unlikely(ret)) { + bio_put(bio); + return ret; + } + + return 0; +} + +static void nvmet_passthru_execute_cmd(struct nvmet_req *req) +{ + struct nvme_ctrl *ctrl = nvmet_req_passthru_ctrl(req); + struct request_queue *q = ctrl->admin_q; + struct nvme_ns *ns = NULL; + struct request *rq = NULL; + u32 effects; + u16 status; + int ret; + + if (likely(req->sq->qid != 0)) { + u32 nsid = le32_to_cpu(req->cmd->common.nsid); + + ns = nvme_find_get_ns(ctrl, nsid); + if (unlikely(!ns)) { + pr_err("failed to get passthru ns nsid:%u\n", nsid); + status = NVME_SC_INVALID_NS | NVME_SC_DNR; + goto fail_out; + } + + q = ns->queue; + } + + rq = nvme_alloc_request(q, req->cmd, BLK_MQ_REQ_NOWAIT, NVME_QID_ANY); + if (IS_ERR(rq)) { + rq = NULL; + status = NVME_SC_INTERNAL; + goto fail_out; + } + + if (req->sg_cnt) { + ret = nvmet_passthru_map_sg(req, rq); + if (unlikely(ret)) { + status = NVME_SC_INTERNAL; + goto fail_out; + } + } + + /* + * If there are effects for the command we are about to execute, or + * an end_req function we need to use nvme_execute_passthru_rq() + * synchronously in a work item seeing the end_req function and + * nvme_passthru_end() can't be called in the request done callback + * which is typically in interrupt context. + */ + effects = nvme_command_effects(ctrl, ns, req->cmd->common.opcode); + if (req->p.use_workqueue || effects) { + INIT_WORK(&req->p.work, nvmet_passthru_execute_cmd_work); + req->p.rq = rq; + schedule_work(&req->p.work); + } else { + rq->end_io_data = req; + blk_execute_rq_nowait(rq->q, ns ? 
ns->disk : NULL, rq, 0, + nvmet_passthru_req_done); + } + + if (ns) + nvme_put_ns(ns); + + return; + +fail_out: + if (ns) + nvme_put_ns(ns); + nvmet_req_complete(req, status); + blk_put_request(rq); +} + +/* + * We need to emulate set host behaviour to ensure that any requested + * behaviour of the target's host matches the requested behaviour + * of the device's host and fail otherwise. + */ +static void nvmet_passthru_set_host_behaviour(struct nvmet_req *req) +{ + struct nvme_ctrl *ctrl = nvmet_req_passthru_ctrl(req); + struct nvme_feat_host_behavior *host; + u16 status = NVME_SC_INTERNAL; + int ret; + + host = kzalloc(sizeof(*host) * 2, GFP_KERNEL); + if (!host) + goto out_complete_req; + + ret = nvme_get_features(ctrl, NVME_FEAT_HOST_BEHAVIOR, 0, + host, sizeof(*host), NULL); + if (ret) + goto out_free_host; + + status = nvmet_copy_from_sgl(req, 0, &host[1], sizeof(*host)); + if (status) + goto out_free_host; + + if (memcmp(&host[0], &host[1], sizeof(host[0]))) { + pr_warn("target host has requested different behaviour from the local host\n"); + status = NVME_SC_INTERNAL; + } + +out_free_host: + kfree(host); +out_complete_req: + nvmet_req_complete(req, status); +} + +static u16 nvmet_setup_passthru_command(struct nvmet_req *req) +{ + req->p.use_workqueue = false; + req->execute = nvmet_passthru_execute_cmd; + return NVME_SC_SUCCESS; +} + +u16 nvmet_parse_passthru_io_cmd(struct nvmet_req *req) +{ + switch (req->cmd->common.opcode) { + case nvme_cmd_resv_register: + case nvme_cmd_resv_report: + case nvme_cmd_resv_acquire: + case nvme_cmd_resv_release: + /* + * Reservations cannot be supported properly because the + * underlying device has no way of differentiating different + * hosts that connect via fabrics. This could potentially be + * emulated in the future if regular targets grow support for + * this feature. + */ + return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; + } + + return nvmet_setup_passthru_command(req); +} + +/* + * Only features that are emulated or specifically allowed in the list are + * passed down to the controller. This function implements the allow list for + * both get and set features. + */ +static u16 nvmet_passthru_get_set_features(struct nvmet_req *req) +{ + switch (le32_to_cpu(req->cmd->features.fid)) { + case NVME_FEAT_ARBITRATION: + case NVME_FEAT_POWER_MGMT: + case NVME_FEAT_LBA_RANGE: + case NVME_FEAT_TEMP_THRESH: + case NVME_FEAT_ERR_RECOVERY: + case NVME_FEAT_VOLATILE_WC: + case NVME_FEAT_WRITE_ATOMIC: + case NVME_FEAT_AUTO_PST: + case NVME_FEAT_TIMESTAMP: + case NVME_FEAT_HCTM: + case NVME_FEAT_NOPSC: + case NVME_FEAT_RRL: + case NVME_FEAT_PLM_CONFIG: + case NVME_FEAT_PLM_WINDOW: + case NVME_FEAT_HOST_BEHAVIOR: + case NVME_FEAT_SANITIZE: + case NVME_FEAT_VENDOR_START ... 
NVME_FEAT_VENDOR_END: + return nvmet_setup_passthru_command(req); + + case NVME_FEAT_ASYNC_EVENT: + /* There is no support for forwarding ASYNC events */ + case NVME_FEAT_IRQ_COALESCE: + case NVME_FEAT_IRQ_CONFIG: + /* The IRQ settings will not apply to the target controller */ + case NVME_FEAT_HOST_MEM_BUF: + /* + * Any HMB that's set will not be passed through and will + * not work as expected + */ + case NVME_FEAT_SW_PROGRESS: + /* + * The Pre-Boot Software Load Count doesn't make much + * sense for a target to export + */ + case NVME_FEAT_RESV_MASK: + case NVME_FEAT_RESV_PERSIST: + /* No reservations, see nvmet_parse_passthru_io_cmd() */ + default: + return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; + } +} + +u16 nvmet_parse_passthru_admin_cmd(struct nvmet_req *req) +{ + /* + * Passthru all vendor specific commands + */ + if (req->cmd->common.opcode >= nvme_admin_vendor_start) + return nvmet_setup_passthru_command(req); + + switch (req->cmd->common.opcode) { + case nvme_admin_async_event: + req->execute = nvmet_execute_async_event; + return NVME_SC_SUCCESS; + case nvme_admin_keep_alive: + /* + * Most PCIe ctrls don't support keep alive cmd, we route keep + * alive to the non-passthru mode. In future please change this + * code when PCIe ctrls with keep alive support available. + */ + req->execute = nvmet_execute_keep_alive; + return NVME_SC_SUCCESS; + case nvme_admin_set_features: + switch (le32_to_cpu(req->cmd->features.fid)) { + case NVME_FEAT_ASYNC_EVENT: + case NVME_FEAT_KATO: + case NVME_FEAT_NUM_QUEUES: + case NVME_FEAT_HOST_ID: + req->execute = nvmet_execute_set_features; + return NVME_SC_SUCCESS; + case NVME_FEAT_HOST_BEHAVIOR: + req->execute = nvmet_passthru_set_host_behaviour; + return NVME_SC_SUCCESS; + default: + return nvmet_passthru_get_set_features(req); + } + break; + case nvme_admin_get_features: + switch (le32_to_cpu(req->cmd->features.fid)) { + case NVME_FEAT_ASYNC_EVENT: + case NVME_FEAT_KATO: + case NVME_FEAT_NUM_QUEUES: + case NVME_FEAT_HOST_ID: + req->execute = nvmet_execute_get_features; + return NVME_SC_SUCCESS; + default: + return nvmet_passthru_get_set_features(req); + } + break; + case nvme_admin_identify: + switch (req->cmd->identify.cns) { + case NVME_ID_CNS_CTRL: + req->execute = nvmet_passthru_execute_cmd; + req->p.use_workqueue = true; + return NVME_SC_SUCCESS; + case NVME_ID_CNS_NS: + req->execute = nvmet_passthru_execute_cmd; + req->p.use_workqueue = true; + return NVME_SC_SUCCESS; + default: + return nvmet_setup_passthru_command(req); + } + case nvme_admin_get_log_page: + return nvmet_setup_passthru_command(req); + default: + /* Reject commands not in the allowlist above */ + return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; + } +} diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 1643005d21e3..d92535997687 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -312,6 +312,7 @@ enum { NVME_CTRL_ONCS_WRITE_UNCORRECTABLE = 1 << 1, NVME_CTRL_ONCS_DSM = 1 << 2, NVME_CTRL_ONCS_WRITE_ZEROES = 1 << 3, + NVME_CTRL_ONCS_RESERVATIONS = 1 << 5, NVME_CTRL_ONCS_TIMESTAMP = 1 << 6, NVME_CTRL_VWC_PRESENT = 1 << 0, NVME_CTRL_OACS_SEC_SUPP = 1 << 0, @@ -982,6 +983,7 @@ enum nvme_admin_opcode { nvme_admin_security_recv = 0x82, nvme_admin_sanitize_nvm = 0x84, nvme_admin_get_lba_status = 0x86, + nvme_admin_vendor_start = 0xC0, }; #define nvme_admin_opcode_name(opcode) { opcode, #opcode } @@ -1045,6 +1047,8 @@ enum { NVME_FEAT_RESV_MASK = 0x82, NVME_FEAT_RESV_PERSIST = 0x83, NVME_FEAT_WRITE_PROTECT = 0x84, + NVME_FEAT_VENDOR_START = 0xC0, + 
NVME_FEAT_VENDOR_END = 0xFF, NVME_LOG_ERROR = 0x01, NVME_LOG_SMART = 0x02, NVME_LOG_FW_SLOT = 0x03, -- cgit v1.2.3
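One closing note on the passthru patch above: the mdts value written by nvmet_passthru_override_id_ctrl() is a power-of-two exponent in units of the controller's minimum memory page size (CAP.MPSMIN), so the advertised limit is 2^(mdts + page_shift) bytes. The stand-alone sketch below walks through that arithmetic with assumed example limits; the numbers and the helper are illustrative and do not come from the patch.

#include <stdint.h>
#include <stdio.h>

static unsigned int ilog2_u32(uint32_t v)
{
        unsigned int r = 0;

        while (v >>= 1)
                r++;
        return r;
}

int main(void)
{
        uint32_t max_segments = 128;      /* assumed passthru ctrl segment limit */
        uint32_t max_hw_sectors = 2048;   /* assumed hardware limit: 1MB in 512-byte sectors */
        unsigned int page_shift = 12;     /* CAP.MPSMIN == 0 -> 4KB minimum page size */

        /* one minimum-sized page per segment, expressed in 512-byte sectors */
        uint32_t seg_sectors = max_segments << (page_shift - 9);
        uint32_t limit = seg_sectors < max_hw_sectors ? seg_sectors : max_hw_sectors;

        /* mdts = ilog2(limit in sectors) + 9 - page_shift */
        unsigned int mdts = ilog2_u32(limit) + 9 - page_shift;

        /* prints "mdts=7 -> 512 KB max transfer" for the values above */
        printf("mdts=%u -> %u KB max transfer\n", mdts, 1u << (mdts + page_shift - 10));
        return 0;
}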