From 86677a4e71070ffa4ee476ea119cd61d254a7af4 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 21 Jun 2022 17:23:16 -0700 Subject: cxl/Documentation: List attribute permissions Clarify the access permission of CXL sysfs attributes in the documentation to help development of userspace tooling. Reported-by: Alison Schofield Reviewed-by: Alison Schofield Reviewed-by: Jonathan Cameron Link: https://lore.kernel.org/r/165603881198.551046.12893348287451903699.stgit@dwillia2-xfh Signed-off-by: Dan Williams --- Documentation/ABI/testing/sysfs-bus-cxl | 81 +++++++++++++++++---------------- 1 file changed, 41 insertions(+), 40 deletions(-) (limited to 'Documentation/ABI') diff --git a/Documentation/ABI/testing/sysfs-bus-cxl b/Documentation/ABI/testing/sysfs-bus-cxl index 7c2b846521f3..1fd5984b6158 100644 --- a/Documentation/ABI/testing/sysfs-bus-cxl +++ b/Documentation/ABI/testing/sysfs-bus-cxl @@ -57,28 +57,28 @@ Date: June, 2021 KernelVersion: v5.14 Contact: linux-cxl@vger.kernel.org Description: - CXL device objects export the devtype attribute which mirrors - the same value communicated in the DEVTYPE environment variable - for uevents for devices on the "cxl" bus. + (RO) CXL device objects export the devtype attribute which + mirrors the same value communicated in the DEVTYPE environment + variable for uevents for devices on the "cxl" bus. What: /sys/bus/cxl/devices/*/modalias Date: December, 2021 KernelVersion: v5.18 Contact: linux-cxl@vger.kernel.org Description: - CXL device objects export the modalias attribute which mirrors - the same value communicated in the MODALIAS environment variable - for uevents for devices on the "cxl" bus. + (RO) CXL device objects export the modalias attribute which + mirrors the same value communicated in the MODALIAS environment + variable for uevents for devices on the "cxl" bus. 
What: /sys/bus/cxl/devices/portX/uport Date: June, 2021 KernelVersion: v5.14 Contact: linux-cxl@vger.kernel.org Description: - CXL port objects are enumerated from either a platform firmware - device (ACPI0017 and ACPI0016) or PCIe switch upstream port with - CXL component registers. The 'uport' symlink connects the CXL - portX object to the device that published the CXL port + (RO) CXL port objects are enumerated from either a platform + firmware device (ACPI0017 and ACPI0016) or PCIe switch upstream + port with CXL component registers. The 'uport' symlink connects + the CXL portX object to the device that published the CXL port capability. What: /sys/bus/cxl/devices/portX/dportY @@ -86,20 +86,20 @@ Date: June, 2021 KernelVersion: v5.14 Contact: linux-cxl@vger.kernel.org Description: - CXL port objects are enumerated from either a platform firmware - device (ACPI0017 and ACPI0016) or PCIe switch upstream port with - CXL component registers. The 'dportY' symlink identifies one or - more downstream ports that the upstream port may target in its - decode of CXL memory resources. The 'Y' integer reflects the - hardware port unique-id used in the hardware decoder target - list. + (RO) CXL port objects are enumerated from either a platform + firmware device (ACPI0017 and ACPI0016) or PCIe switch upstream + port with CXL component registers. The 'dportY' symlink + identifies one or more downstream ports that the upstream port + may target in its decode of CXL memory resources. The 'Y' + integer reflects the hardware port unique-id used in the + hardware decoder target list. What: /sys/bus/cxl/devices/decoderX.Y Date: June, 2021 KernelVersion: v5.14 Contact: linux-cxl@vger.kernel.org Description: - CXL decoder objects are enumerated from either a platform + (RO) CXL decoder objects are enumerated from either a platform firmware description, or a CXL HDM decoder register set in a PCIe device (see CXL 2.0 section 8.2.5.12 CXL HDM Decoder Capability Structure). 
The 'X' in decoderX.Y represents the @@ -111,42 +111,43 @@ Date: June, 2021 KernelVersion: v5.14 Contact: linux-cxl@vger.kernel.org Description: - The 'start' and 'size' attributes together convey the physical - address base and number of bytes mapped in the decoder's decode - window. For decoders of devtype "cxl_decoder_root" the address - range is fixed. For decoders of devtype "cxl_decoder_switch" the - address is bounded by the decode range of the cxl_port ancestor - of the decoder's cxl_port, and dynamically updates based on the - active memory regions in that address space. + (RO) The 'start' and 'size' attributes together convey the + physical address base and number of bytes mapped in the + decoder's decode window. For decoders of devtype + "cxl_decoder_root" the address range is fixed. For decoders of + devtype "cxl_decoder_switch" the address is bounded by the + decode range of the cxl_port ancestor of the decoder's cxl_port, + and dynamically updates based on the active memory regions in + that address space. What: /sys/bus/cxl/devices/decoderX.Y/locked Date: June, 2021 KernelVersion: v5.14 Contact: linux-cxl@vger.kernel.org Description: - CXL HDM decoders have the capability to lock the configuration - until the next device reset. For decoders of devtype - "cxl_decoder_root" there is no standard facility to unlock them. - For decoders of devtype "cxl_decoder_switch" a secondary bus - reset, of the PCIe bridge that provides the bus for this - decoders uport, unlocks / resets the decoder. + (RO) CXL HDM decoders have the capability to lock the + configuration until the next device reset. For decoders of + devtype "cxl_decoder_root" there is no standard facility to + unlock them. For decoders of devtype "cxl_decoder_switch" a + secondary bus reset, of the PCIe bridge that provides the bus + for this decoders uport, unlocks / resets the decoder. 
What: /sys/bus/cxl/devices/decoderX.Y/target_list Date: June, 2021 KernelVersion: v5.14 Contact: linux-cxl@vger.kernel.org Description: - Display a comma separated list of the current decoder target - configuration. The list is ordered by the current configured - interleave order of the decoder's dport instances. Each entry in - the list is a dport id. + (RO) Display a comma separated list of the current decoder + target configuration. The list is ordered by the current + configured interleave order of the decoder's dport instances. + Each entry in the list is a dport id. What: /sys/bus/cxl/devices/decoderX.Y/cap_{pmem,ram,type2,type3} Date: June, 2021 KernelVersion: v5.14 Contact: linux-cxl@vger.kernel.org Description: - When a CXL decoder is of devtype "cxl_decoder_root", it + (RO) When a CXL decoder is of devtype "cxl_decoder_root", it represents a fixed memory window identified by platform firmware. A fixed window may only support a subset of memory types. The 'cap_*' attributes indicate whether persistent @@ -158,8 +159,8 @@ Date: June, 2021 KernelVersion: v5.14 Contact: linux-cxl@vger.kernel.org Description: - When a CXL decoder is of devtype "cxl_decoder_switch", it can - optionally decode either accelerator memory (type-2) or expander - memory (type-3). The 'target_type' attribute indicates the - current setting which may dynamically change based on what + (RO) When a CXL decoder is of devtype "cxl_decoder_switch", it + can optionally decode either accelerator memory (type-2) or + expander memory (type-3). The 'target_type' attribute indicates + the current setting which may dynamically change based on what memory regions are activated in this decode hierarchy. 
-- cgit v1.2.3 From c97006046c791f82cb5ba3219ef4a511ec5f3932 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Tue, 19 Jul 2022 13:52:49 -0700 Subject: cxl/port: Read CDAT table The per-device CDAT data provides performance data that is relevant for mapping which CXL devices can participate in which CXL ranges by QTG (QoS Throttling Group) (per ECN: CXL 2.0 CEDT CFMWS & QTG_DSM) [1]. The QTG association specified in the ECN is advisory. Until the cxl_acpi driver grows support for invoking the QTG _DSM method, the CDAT data is only of interest to userspace that may need it for debug purposes. Search the DOE mailboxes available, query CDAT data, cache the data, and make it available via a sysfs binary attribute per endpoint at: /sys/bus/cxl/devices/endpointX/CDAT ...similar to other ACPI-structured table data in /sys/firmware/acpi/tables. The CDAT is relative to 'struct cxl_port' objects since switches, in addition to endpoints, can host a CDAT instance. Switch CDAT support is not implemented. This does not support table updates at runtime. It will always provide whatever was there when first cached. It is also the case that table updates are not expected outside of explicit DPA address map affecting commands like Set Partition with the immediate flag set. Given that the driver does not support Set Partition with the immediate flag set, there is no current need for update support. 
Link: https://www.computeexpresslink.org/spec-landing [1] Signed-off-by: Jonathan Cameron Co-developed-by: Jonathan Cameron Signed-off-by: Ira Weiny [djbw: drop in-kernel parsing infra for now, and other minor fixups] Reviewed-by: Jonathan Cameron Link: https://lore.kernel.org/r/20220719205249.566684-7-ira.weiny@intel.com Signed-off-by: Dan Williams --- Documentation/ABI/testing/sysfs-bus-cxl | 10 ++ drivers/cxl/core/pci.c | 173 ++++++++++++++++++++++++++++++++ drivers/cxl/cxl.h | 7 ++ drivers/cxl/cxlpci.h | 1 + drivers/cxl/port.c | 53 ++++++++++ 5 files changed, 244 insertions(+) (limited to 'Documentation/ABI') diff --git a/Documentation/ABI/testing/sysfs-bus-cxl b/Documentation/ABI/testing/sysfs-bus-cxl index 1fd5984b6158..e94c5aebc368 100644 --- a/Documentation/ABI/testing/sysfs-bus-cxl +++ b/Documentation/ABI/testing/sysfs-bus-cxl @@ -164,3 +164,13 @@ Description: expander memory (type-3). The 'target_type' attribute indicates the current setting which may dynamically change based on what memory regions are activated in this decode hierarchy. + +What: /sys/bus/cxl/devices/endpointX/CDAT +Date: July, 2022 +KernelVersion: v5.20 +Contact: linux-cxl@vger.kernel.org +Description: + (RO) If this sysfs entry is not present no DOE mailbox was + found to support CDAT data. If it is present and the length of + the data is 0 reading the CDAT data failed. Otherwise the CDAT + data is reported. 
diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c index 7672789c3225..9240df53ed87 100644 --- a/drivers/cxl/core/pci.c +++ b/drivers/cxl/core/pci.c @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -452,3 +453,175 @@ hdm_init: return 0; } EXPORT_SYMBOL_NS_GPL(cxl_hdm_decode_init, CXL); + +#define CXL_DOE_TABLE_ACCESS_REQ_CODE 0x000000ff +#define CXL_DOE_TABLE_ACCESS_REQ_CODE_READ 0 +#define CXL_DOE_TABLE_ACCESS_TABLE_TYPE 0x0000ff00 +#define CXL_DOE_TABLE_ACCESS_TABLE_TYPE_CDATA 0 +#define CXL_DOE_TABLE_ACCESS_ENTRY_HANDLE 0xffff0000 +#define CXL_DOE_TABLE_ACCESS_LAST_ENTRY 0xffff +#define CXL_DOE_PROTOCOL_TABLE_ACCESS 2 + +static struct pci_doe_mb *find_cdat_doe(struct device *uport) +{ + struct cxl_memdev *cxlmd; + struct cxl_dev_state *cxlds; + unsigned long index; + void *entry; + + cxlmd = to_cxl_memdev(uport); + cxlds = cxlmd->cxlds; + + xa_for_each(&cxlds->doe_mbs, index, entry) { + struct pci_doe_mb *cur = entry; + + if (pci_doe_supports_prot(cur, PCI_DVSEC_VENDOR_ID_CXL, + CXL_DOE_PROTOCOL_TABLE_ACCESS)) + return cur; + } + + return NULL; +} + +#define CDAT_DOE_REQ(entry_handle) \ + (FIELD_PREP(CXL_DOE_TABLE_ACCESS_REQ_CODE, \ + CXL_DOE_TABLE_ACCESS_REQ_CODE_READ) | \ + FIELD_PREP(CXL_DOE_TABLE_ACCESS_TABLE_TYPE, \ + CXL_DOE_TABLE_ACCESS_TABLE_TYPE_CDATA) | \ + FIELD_PREP(CXL_DOE_TABLE_ACCESS_ENTRY_HANDLE, (entry_handle))) + +static void cxl_doe_task_complete(struct pci_doe_task *task) +{ + complete(task->private); +} + +struct cdat_doe_task { + u32 request_pl; + u32 response_pl[32]; + struct completion c; + struct pci_doe_task task; +}; + +#define DECLARE_CDAT_DOE_TASK(req, cdt) \ +struct cdat_doe_task cdt = { \ + .c = COMPLETION_INITIALIZER_ONSTACK(cdt.c), \ + .request_pl = req, \ + .task = { \ + .prot.vid = PCI_DVSEC_VENDOR_ID_CXL, \ + .prot.type = CXL_DOE_PROTOCOL_TABLE_ACCESS, \ + .request_pl = &cdt.request_pl, \ + .request_pl_sz = sizeof(cdt.request_pl), \ + .response_pl = cdt.response_pl, \ + 
.response_pl_sz = sizeof(cdt.response_pl), \ + .complete = cxl_doe_task_complete, \ + .private = &cdt.c, \ + } \ +} + +static int cxl_cdat_get_length(struct device *dev, + struct pci_doe_mb *cdat_doe, + size_t *length) +{ + DECLARE_CDAT_DOE_TASK(CDAT_DOE_REQ(0), t); + int rc; + + rc = pci_doe_submit_task(cdat_doe, &t.task); + if (rc < 0) { + dev_err(dev, "DOE submit failed: %d", rc); + return rc; + } + wait_for_completion(&t.c); + if (t.task.rv < sizeof(u32)) + return -EIO; + + *length = t.response_pl[1]; + dev_dbg(dev, "CDAT length %zu\n", *length); + + return 0; +} + +static int cxl_cdat_read_table(struct device *dev, + struct pci_doe_mb *cdat_doe, + struct cxl_cdat *cdat) +{ + size_t length = cdat->length; + u32 *data = cdat->table; + int entry_handle = 0; + + do { + DECLARE_CDAT_DOE_TASK(CDAT_DOE_REQ(entry_handle), t); + size_t entry_dw; + u32 *entry; + int rc; + + rc = pci_doe_submit_task(cdat_doe, &t.task); + if (rc < 0) { + dev_err(dev, "DOE submit failed: %d", rc); + return rc; + } + wait_for_completion(&t.c); + /* 1 DW header + 1 DW data min */ + if (t.task.rv < (2 * sizeof(u32))) + return -EIO; + + /* Get the CXL table access header entry handle */ + entry_handle = FIELD_GET(CXL_DOE_TABLE_ACCESS_ENTRY_HANDLE, + t.response_pl[0]); + entry = t.response_pl + 1; + entry_dw = t.task.rv / sizeof(u32); + /* Skip Header */ + entry_dw -= 1; + entry_dw = min(length / sizeof(u32), entry_dw); + /* Prevent length < 1 DW from causing a buffer overflow */ + if (entry_dw) { + memcpy(data, entry, entry_dw * sizeof(u32)); + length -= entry_dw * sizeof(u32); + data += entry_dw; + } + } while (entry_handle != CXL_DOE_TABLE_ACCESS_LAST_ENTRY); + + return 0; +} + +/** + * read_cdat_data - Read the CDAT data on this port + * @port: Port to read data from + * + * This call will sleep waiting for responses from the DOE mailbox. 
+ */ +void read_cdat_data(struct cxl_port *port) +{ + struct pci_doe_mb *cdat_doe; + struct device *dev = &port->dev; + struct device *uport = port->uport; + size_t cdat_length; + int rc; + + cdat_doe = find_cdat_doe(uport); + if (!cdat_doe) { + dev_dbg(dev, "No CDAT mailbox\n"); + return; + } + + port->cdat_available = true; + + if (cxl_cdat_get_length(dev, cdat_doe, &cdat_length)) { + dev_dbg(dev, "No CDAT length\n"); + return; + } + + port->cdat.table = devm_kzalloc(dev, cdat_length, GFP_KERNEL); + if (!port->cdat.table) + return; + + port->cdat.length = cdat_length; + rc = cxl_cdat_read_table(dev, cdat_doe, &port->cdat); + if (rc) { + /* Don't leave table data allocated on error */ + devm_kfree(dev, port->cdat.table); + port->cdat.table = NULL; + port->cdat.length = 0; + dev_err(dev, "CDAT data read error\n"); + } +} +EXPORT_SYMBOL_NS_GPL(read_cdat_data, CXL); diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index 570bd9f8141b..21a9d6fcc61e 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -289,6 +289,8 @@ struct cxl_nvdimm { * @component_reg_phys: component register capability base address (optional) * @dead: last ep has been removed, force port re-creation * @depth: How deep this port is relative to the root. depth 0 is the root. 
+ * @cdat: Cached CDAT data + * @cdat_available: Should a CDAT attribute be available in sysfs */ struct cxl_port { struct device dev; @@ -301,6 +303,11 @@ struct cxl_port { resource_size_t component_reg_phys; bool dead; unsigned int depth; + struct cxl_cdat { + void *table; + size_t length; + } cdat; + bool cdat_available; }; /** diff --git a/drivers/cxl/cxlpci.h b/drivers/cxl/cxlpci.h index fce1c11729c2..eec597dbe763 100644 --- a/drivers/cxl/cxlpci.h +++ b/drivers/cxl/cxlpci.h @@ -74,4 +74,5 @@ static inline resource_size_t cxl_regmap_to_base(struct pci_dev *pdev, int devm_cxl_port_enumerate_dports(struct cxl_port *port); struct cxl_dev_state; int cxl_hdm_decode_init(struct cxl_dev_state *cxlds, struct cxl_hdm *cxlhdm); +void read_cdat_data(struct cxl_port *port); #endif /* __CXL_PCI_H__ */ diff --git a/drivers/cxl/port.c b/drivers/cxl/port.c index 3cf308f114c4..5453771bf330 100644 --- a/drivers/cxl/port.c +++ b/drivers/cxl/port.c @@ -53,6 +53,9 @@ static int cxl_port_probe(struct device *dev) struct cxl_memdev *cxlmd = to_cxl_memdev(port->uport); struct cxl_dev_state *cxlds = cxlmd->cxlds; + /* Cache the data early to ensure is_visible() works */ + read_cdat_data(port); + get_device(&cxlmd->dev); rc = devm_add_action_or_reset(dev, schedule_detach, cxlmd); if (rc) @@ -78,10 +81,60 @@ static int cxl_port_probe(struct device *dev) return 0; } +static ssize_t CDAT_read(struct file *filp, struct kobject *kobj, + struct bin_attribute *bin_attr, char *buf, + loff_t offset, size_t count) +{ + struct device *dev = kobj_to_dev(kobj); + struct cxl_port *port = to_cxl_port(dev); + + if (!port->cdat_available) + return -ENXIO; + + if (!port->cdat.table) + return 0; + + return memory_read_from_buffer(buf, count, &offset, + port->cdat.table, + port->cdat.length); +} + +static BIN_ATTR_ADMIN_RO(CDAT, 0); + +static umode_t cxl_port_bin_attr_is_visible(struct kobject *kobj, + struct bin_attribute *attr, int i) +{ + struct device *dev = kobj_to_dev(kobj); + struct cxl_port *port = 
to_cxl_port(dev); + + if ((attr == &bin_attr_CDAT) && port->cdat_available) + return attr->attr.mode; + + return 0; +} + +static struct bin_attribute *cxl_cdat_bin_attributes[] = { + &bin_attr_CDAT, + NULL, +}; + +static struct attribute_group cxl_cdat_attribute_group = { + .bin_attrs = cxl_cdat_bin_attributes, + .is_bin_visible = cxl_port_bin_attr_is_visible, +}; + +static const struct attribute_group *cxl_port_attribute_groups[] = { + &cxl_cdat_attribute_group, + NULL, +}; + static struct cxl_driver cxl_port_driver = { .name = "cxl_port", .probe = cxl_port_probe, .id = CXL_DEVICE_PORT, + .drv = { + .dev_groups = cxl_port_attribute_groups, + }, }; module_cxl_driver(cxl_port_driver); -- cgit v1.2.3 From 6b625b2bb8ffc6e903a7891008bf423858bbffe6 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sun, 10 Jul 2022 09:56:05 -0700 Subject: Documentation/cxl: Use a double line break between entries Make it easier to read delineations between the "Description" line break, new paragraph line breaks, and new entries. Reviewed-by: Jonathan Cameron Link: https://lore.kernel.org/r/165784324750.1758207.10379257962719807754.stgit@dwillia2-xfh.jf.intel.com Signed-off-by: Dan Williams --- Documentation/ABI/testing/sysfs-bus-cxl | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'Documentation/ABI') diff --git a/Documentation/ABI/testing/sysfs-bus-cxl b/Documentation/ABI/testing/sysfs-bus-cxl index e94c5aebc368..5ada6d9543d3 100644 --- a/Documentation/ABI/testing/sysfs-bus-cxl +++ b/Documentation/ABI/testing/sysfs-bus-cxl @@ -7,6 +7,7 @@ Description: all descendant memdevs for unbind. Writing '1' to this attribute flushes that work. + What: /sys/bus/cxl/devices/memX/firmware_version Date: December, 2020 KernelVersion: v5.12 @@ -16,6 +17,7 @@ Description: Memory Device Output Payload in the CXL-2.0 specification. 
+ What: /sys/bus/cxl/devices/memX/ram/size Date: December, 2020 KernelVersion: v5.12 @@ -25,6 +27,7 @@ Description: identically named field in the Identify Memory Device Output Payload in the CXL-2.0 specification. + What: /sys/bus/cxl/devices/memX/pmem/size Date: December, 2020 KernelVersion: v5.12 @@ -34,6 +37,7 @@ Description: identically named field in the Identify Memory Device Output Payload in the CXL-2.0 specification. + What: /sys/bus/cxl/devices/memX/serial Date: January, 2022 KernelVersion: v5.18 @@ -43,6 +47,7 @@ Description: capability. Mandatory for CXL devices, see CXL 2.0 8.1.12.2 Memory Device PCIe Capabilities and Extended Capabilities. + What: /sys/bus/cxl/devices/memX/numa_node Date: January, 2022 KernelVersion: v5.18 @@ -52,6 +57,7 @@ Description: host PCI device for this memory device, emit the CPU node affinity for this device. + What: /sys/bus/cxl/devices/*/devtype Date: June, 2021 KernelVersion: v5.14 @@ -61,6 +67,7 @@ Description: mirrors the same value communicated in the DEVTYPE environment variable for uevents for devices on the "cxl" bus. + What: /sys/bus/cxl/devices/*/modalias Date: December, 2021 KernelVersion: v5.18 @@ -70,6 +77,7 @@ Description: mirrors the same value communicated in the MODALIAS environment variable for uevents for devices on the "cxl" bus. + What: /sys/bus/cxl/devices/portX/uport Date: June, 2021 KernelVersion: v5.14 @@ -81,6 +89,7 @@ Description: the CXL portX object to the device that published the CXL port capability. + What: /sys/bus/cxl/devices/portX/dportY Date: June, 2021 KernelVersion: v5.14 @@ -94,6 +103,7 @@ Description: integer reflects the hardware port unique-id used in the hardware decoder target list. + What: /sys/bus/cxl/devices/decoderX.Y Date: June, 2021 KernelVersion: v5.14 @@ -106,6 +116,7 @@ Description: cxl_port container of this decoder, and 'Y' represents the instance id of a given decoder resource. 
+ What: /sys/bus/cxl/devices/decoderX.Y/{start,size} Date: June, 2021 KernelVersion: v5.14 @@ -120,6 +131,7 @@ Description: and dynamically updates based on the active memory regions in that address space. + What: /sys/bus/cxl/devices/decoderX.Y/locked Date: June, 2021 KernelVersion: v5.14 @@ -132,6 +144,7 @@ Description: secondary bus reset, of the PCIe bridge that provides the bus for this decoders uport, unlocks / resets the decoder. + What: /sys/bus/cxl/devices/decoderX.Y/target_list Date: June, 2021 KernelVersion: v5.14 @@ -142,6 +155,7 @@ Description: configured interleave order of the decoder's dport instances. Each entry in the list is a dport id. + What: /sys/bus/cxl/devices/decoderX.Y/cap_{pmem,ram,type2,type3} Date: June, 2021 KernelVersion: v5.14 @@ -154,6 +168,7 @@ Description: memory, volatile memory, accelerator memory, and / or expander memory may be mapped behind this decoder's memory window. + What: /sys/bus/cxl/devices/decoderX.Y/target_type Date: June, 2021 KernelVersion: v5.14 @@ -165,6 +180,7 @@ Description: the current setting which may dynamically change based on what memory regions are activated in this decode hierarchy. + What: /sys/bus/cxl/devices/endpointX/CDAT Date: July, 2022 KernelVersion: v5.20 -- cgit v1.2.3 From 2c8669033f16f5d791e10a5bdd42e39c7380da57 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 23 May 2022 12:15:35 -0700 Subject: cxl/hdm: Add 'mode' attribute to decoder objects Recall that the Device Physical Address (DPA) space of a CXL Memory Expander is potentially partitioned into a volatile and persistent portion. A decoder maps a Host Physical Address (HPA) range to a DPA range and that translation depends on the value of all previous (lower instance number) decoders before the current one. In preparation for allowing dynamic provisioning of regions, decoders need an ABI to indicate which DPA partition a decoder targets. 
This ABI needs to be prepared for the possibility that some other agent committed and locked a decoder that spans the partition boundary. Add 'decoderX.Y/mode' to endpoint decoders that indicates which partition 'ram' / 'pmem' the decoder targets, or 'mixed' if the decoder currently spans the partition boundary. Reviewed-by: Jonathan Cameron Link: https://lore.kernel.org/r/165603881967.551046.6007594190951596439.stgit@dwillia2-xfh Signed-off-by: Dan Williams --- Documentation/ABI/testing/sysfs-bus-cxl | 16 ++++++++++++++++ drivers/cxl/core/hdm.c | 10 ++++++++++ drivers/cxl/core/port.c | 20 ++++++++++++++++++++ drivers/cxl/cxl.h | 9 +++++++++ 4 files changed, 55 insertions(+) (limited to 'Documentation/ABI') diff --git a/Documentation/ABI/testing/sysfs-bus-cxl b/Documentation/ABI/testing/sysfs-bus-cxl index 5ada6d9543d3..0b672248d126 100644 --- a/Documentation/ABI/testing/sysfs-bus-cxl +++ b/Documentation/ABI/testing/sysfs-bus-cxl @@ -190,3 +190,19 @@ Description: found to support CDAT data. If it is present and the length of the data is 0 reading the CDAT data failed. Otherwise the CDAT data is reported. + + +What: /sys/bus/cxl/devices/decoderX.Y/mode +Date: May, 2022 +KernelVersion: v5.20 +Contact: linux-cxl@vger.kernel.org +Description: + (RO) When a CXL decoder is of devtype "cxl_decoder_endpoint" it + translates from a host physical address range, to a device local + address range. Device-local address ranges are further split + into a 'ram' (volatile memory) range and 'pmem' (persistent + memory) range. The 'mode' attribute emits one of 'ram', 'pmem', + 'mixed', or 'none'. The 'mixed' indication is for error cases + when a decoder straddles the volatile/persistent partition + boundary, and 'none' indicates the decoder is not actively + decoding, or no DPA allocation policy has been set. 
diff --git a/drivers/cxl/core/hdm.c b/drivers/cxl/core/hdm.c index 47e3af440b3d..c2cff5783fda 100644 --- a/drivers/cxl/core/hdm.c +++ b/drivers/cxl/core/hdm.c @@ -226,6 +226,16 @@ static int __cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled, cxled->dpa_res = res; cxled->skip = skipped; + if (resource_contains(&cxlds->pmem_res, res)) + cxled->mode = CXL_DECODER_PMEM; + else if (resource_contains(&cxlds->ram_res, res)) + cxled->mode = CXL_DECODER_RAM; + else { + dev_dbg(dev, "decoder%d.%d: %pr mixed\n", port->id, + cxled->cxld.id, cxled->dpa_res); + cxled->mode = CXL_DECODER_MIXED; + } + return 0; } diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c index 2cfa870428c2..9a312f2e2d35 100644 --- a/drivers/cxl/core/port.c +++ b/drivers/cxl/core/port.c @@ -172,6 +172,25 @@ static ssize_t target_list_show(struct device *dev, } static DEVICE_ATTR_RO(target_list); +static ssize_t mode_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct cxl_endpoint_decoder *cxled = to_cxl_endpoint_decoder(dev); + + switch (cxled->mode) { + case CXL_DECODER_RAM: + return sysfs_emit(buf, "ram\n"); + case CXL_DECODER_PMEM: + return sysfs_emit(buf, "pmem\n"); + case CXL_DECODER_NONE: + return sysfs_emit(buf, "none\n"); + case CXL_DECODER_MIXED: + default: + return sysfs_emit(buf, "mixed\n"); + } +} +static DEVICE_ATTR_RO(mode); + static struct attribute *cxl_decoder_base_attrs[] = { &dev_attr_start.attr, &dev_attr_size.attr, @@ -222,6 +241,7 @@ static const struct attribute_group *cxl_decoder_switch_attribute_groups[] = { static struct attribute *cxl_decoder_endpoint_attrs[] = { &dev_attr_target_type.attr, + &dev_attr_mode.attr, NULL, }; diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index 5fe8b59a0b18..afaa76f065f9 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -241,16 +241,25 @@ struct cxl_decoder { unsigned long flags; }; +enum cxl_decoder_mode { + CXL_DECODER_NONE, + CXL_DECODER_RAM, + CXL_DECODER_PMEM, + CXL_DECODER_MIXED, +}; + /** 
* struct cxl_endpoint_decoder - Endpoint / SPA to DPA decoder * @cxld: base cxl_decoder_object * @dpa_res: actively claimed DPA span of this decoder * @skip: offset into @dpa_res where @cxld.hpa_range maps + * @mode: which memory type / access-mode-partition this decoder targets */ struct cxl_endpoint_decoder { struct cxl_decoder cxld; struct resource *dpa_res; resource_size_t skip; + enum cxl_decoder_mode mode; }; /** -- cgit v1.2.3 From cf880423b6a0599499c1f83542cab0b75daa29ba Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 23 May 2022 18:02:30 -0700 Subject: cxl/hdm: Add support for allocating DPA to an endpoint decoder The region provisioning flow will roughly follow a sequence of: 1/ Allocate DPA to a set of decoders 2/ Allocate HPA to a region 3/ Associate decoders with a region and validate that the DPA allocations and topologies match the parameters of the region. For now, this change (step 1) arranges for DPA capacity to be allocated and deleted from non-committed decoders based on the decoder's mode / partition selection. Capacity is allocated from the lowest DPA in the partition and any 'pmem' allocation blocks out all remaining ram capacity in its 'skip' setting. DPA allocations are enforced in decoder instance order. I.e. decoder N + 1 always starts at a higher DPA than instance N, and deleting allocations must proceed from the highest-instance allocated decoder to the lowest. 
Reviewed-by: Jonathan Cameron Link: https://lore.kernel.org/r/165784329399.1758207.16732038126938632700.stgit@dwillia2-xfh.jf.intel.com Signed-off-by: Dan Williams --- Documentation/ABI/testing/sysfs-bus-cxl | 37 ++++++- drivers/cxl/core/core.h | 7 ++ drivers/cxl/core/hdm.c | 180 ++++++++++++++++++++++++++++++++ drivers/cxl/core/port.c | 73 ++++++++++++- 4 files changed, 295 insertions(+), 2 deletions(-) (limited to 'Documentation/ABI') diff --git a/Documentation/ABI/testing/sysfs-bus-cxl b/Documentation/ABI/testing/sysfs-bus-cxl index 0b672248d126..1ab69aca6152 100644 --- a/Documentation/ABI/testing/sysfs-bus-cxl +++ b/Documentation/ABI/testing/sysfs-bus-cxl @@ -197,7 +197,7 @@ Date: May, 2022 KernelVersion: v5.20 Contact: linux-cxl@vger.kernel.org Description: - (RO) When a CXL decoder is of devtype "cxl_decoder_endpoint" it + (RW) When a CXL decoder is of devtype "cxl_decoder_endpoint" it translates from a host physical address range, to a device local address range. Device-local address ranges are further split into a 'ram' (volatile memory) range and 'pmem' (persistent @@ -206,3 +206,38 @@ Description: when a decoder straddles the volatile/persistent partition boundary, and 'none' indicates the decoder is not actively decoding, or no DPA allocation policy has been set. + + 'mode' can be written, when the decoder is in the 'disabled' + state, with either 'ram' or 'pmem' to set the boundaries for the + next allocation. + + +What: /sys/bus/cxl/devices/decoderX.Y/dpa_resource +Date: May, 2022 +KernelVersion: v5.20 +Contact: linux-cxl@vger.kernel.org +Description: + (RO) When a CXL decoder is of devtype "cxl_decoder_endpoint", + and its 'dpa_size' attribute is non-zero, this attribute + indicates the device physical address (DPA) base address of the + allocation. 
+ + +What: /sys/bus/cxl/devices/decoderX.Y/dpa_size +Date: May, 2022 +KernelVersion: v5.20 +Contact: linux-cxl@vger.kernel.org +Description: + (RW) When a CXL decoder is of devtype "cxl_decoder_endpoint" it + translates from a host physical address range, to a device local + address range. The range, base address plus length in bytes, of + DPA allocated to this decoder is conveyed in these 2 attributes. + Allocations can be mutated as long as the decoder is in the + disabled state. A write to 'dpa_size' releases the previous DPA + allocation and then attempts to allocate from the free capacity + in the device partition referred to by 'decoderX.Y/mode'. + Allocate and free requests can only be performed on the highest + instance number disabled decoder with non-zero size. I.e. + allocations are enforced to occur in increasing 'decoderX.Y/id' + order and frees are enforced to occur in decreasing + 'decoderX.Y/id' order. diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h index a0808cdaffba..5551b82b2da0 100644 --- a/drivers/cxl/core/core.h +++ b/drivers/cxl/core/core.h @@ -18,6 +18,13 @@ void __iomem *devm_cxl_iomap_block(struct device *dev, resource_size_t addr, resource_size_t length); struct dentry *cxl_debugfs_create_dir(const char *dir); +int cxl_dpa_set_mode(struct cxl_endpoint_decoder *cxled, + enum cxl_decoder_mode mode); +int cxl_dpa_alloc(struct cxl_endpoint_decoder *cxled, unsigned long long size); +int cxl_dpa_free(struct cxl_endpoint_decoder *cxled); +resource_size_t cxl_dpa_size(struct cxl_endpoint_decoder *cxled); +resource_size_t cxl_dpa_resource_start(struct cxl_endpoint_decoder *cxled); + int cxl_memdev_init(void); void cxl_memdev_exit(void); void cxl_mbox_init(void); diff --git a/drivers/cxl/core/hdm.c b/drivers/cxl/core/hdm.c index 14354f4cd92e..960a77864f33 100644 --- a/drivers/cxl/core/hdm.c +++ b/drivers/cxl/core/hdm.c @@ -184,6 +184,19 @@ static void cxl_dpa_release(void *cxled) up_write(&cxl_dpa_rwsem); } +/* + * Must be called 
from context that will not race port device + * unregistration, like decoder sysfs attribute methods + */ +static void devm_cxl_dpa_release(struct cxl_endpoint_decoder *cxled) +{ + struct cxl_port *port = cxled_to_port(cxled); + + lockdep_assert_held_write(&cxl_dpa_rwsem); + devm_remove_action(&port->dev, cxl_dpa_release, cxled); + __cxl_dpa_release(cxled); +} + static int __cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled, resource_size_t base, resource_size_t len, resource_size_t skipped) @@ -271,6 +284,173 @@ static int devm_cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled, return devm_add_action_or_reset(&port->dev, cxl_dpa_release, cxled); } +resource_size_t cxl_dpa_size(struct cxl_endpoint_decoder *cxled) +{ + resource_size_t size = 0; + + down_read(&cxl_dpa_rwsem); + if (cxled->dpa_res) + size = resource_size(cxled->dpa_res); + up_read(&cxl_dpa_rwsem); + + return size; +} + +resource_size_t cxl_dpa_resource_start(struct cxl_endpoint_decoder *cxled) +{ + resource_size_t base = -1; + + down_read(&cxl_dpa_rwsem); + if (cxled->dpa_res) + base = cxled->dpa_res->start; + up_read(&cxl_dpa_rwsem); + + return base; +} + +int cxl_dpa_free(struct cxl_endpoint_decoder *cxled) +{ + struct cxl_port *port = cxled_to_port(cxled); + struct device *dev = &cxled->cxld.dev; + int rc; + + down_write(&cxl_dpa_rwsem); + if (!cxled->dpa_res) { + rc = 0; + goto out; + } + if (cxled->cxld.flags & CXL_DECODER_F_ENABLE) { + dev_dbg(dev, "decoder enabled\n"); + rc = -EBUSY; + goto out; + } + if (cxled->cxld.id != port->hdm_end) { + dev_dbg(dev, "expected decoder%d.%d\n", port->id, + port->hdm_end); + rc = -EBUSY; + goto out; + } + devm_cxl_dpa_release(cxled); + rc = 0; +out: + up_write(&cxl_dpa_rwsem); + return rc; +} + +int cxl_dpa_set_mode(struct cxl_endpoint_decoder *cxled, + enum cxl_decoder_mode mode) +{ + struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); + struct cxl_dev_state *cxlds = cxlmd->cxlds; + struct device *dev = &cxled->cxld.dev; + int rc; + + switch (mode) { + case 
CXL_DECODER_RAM: + case CXL_DECODER_PMEM: + break; + default: + dev_dbg(dev, "unsupported mode: %d\n", mode); + return -EINVAL; + } + + down_write(&cxl_dpa_rwsem); + if (cxled->cxld.flags & CXL_DECODER_F_ENABLE) { + rc = -EBUSY; + goto out; + } + + /* + * Only allow modes that are supported by the current partition + * configuration + */ + if (mode == CXL_DECODER_PMEM && !resource_size(&cxlds->pmem_res)) { + dev_dbg(dev, "no available pmem capacity\n"); + rc = -ENXIO; + goto out; + } + if (mode == CXL_DECODER_RAM && !resource_size(&cxlds->ram_res)) { + dev_dbg(dev, "no available ram capacity\n"); + rc = -ENXIO; + goto out; + } + + cxled->mode = mode; + rc = 0; +out: + up_write(&cxl_dpa_rwsem); + + return rc; +} + +int cxl_dpa_alloc(struct cxl_endpoint_decoder *cxled, unsigned long long size) +{ + struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); + resource_size_t free_ram_start, free_pmem_start; + struct cxl_port *port = cxled_to_port(cxled); + struct cxl_dev_state *cxlds = cxlmd->cxlds; + struct device *dev = &cxled->cxld.dev; + resource_size_t start, avail, skip; + struct resource *p, *last; + int rc; + + down_write(&cxl_dpa_rwsem); + if (cxled->cxld.flags & CXL_DECODER_F_ENABLE) { + dev_dbg(dev, "decoder enabled\n"); + rc = -EBUSY; + goto out; + } + + for (p = cxlds->ram_res.child, last = NULL; p; p = p->sibling) + last = p; + if (last) + free_ram_start = last->end + 1; + else + free_ram_start = cxlds->ram_res.start; + + for (p = cxlds->pmem_res.child, last = NULL; p; p = p->sibling) + last = p; + if (last) + free_pmem_start = last->end + 1; + else + free_pmem_start = cxlds->pmem_res.start; + + if (cxled->mode == CXL_DECODER_RAM) { + start = free_ram_start; + avail = cxlds->ram_res.end - start + 1; + skip = 0; + } else if (cxled->mode == CXL_DECODER_PMEM) { + resource_size_t skip_start, skip_end; + + start = free_pmem_start; + avail = cxlds->pmem_res.end - start + 1; + skip_start = free_ram_start; + skip_end = start - 1; + skip = skip_end - skip_start + 1; + } 
else { + dev_dbg(dev, "mode not set\n"); + rc = -EINVAL; + goto out; + } + + if (size > avail) { + dev_dbg(dev, "%pa exceeds available %s capacity: %pa\n", &size, + cxled->mode == CXL_DECODER_RAM ? "ram" : "pmem", + &avail); + rc = -ENOSPC; + goto out; + } + + rc = __cxl_dpa_reserve(cxled, start, size, skip); +out: + up_write(&cxl_dpa_rwsem); + + if (rc) + return rc; + + return devm_add_action_or_reset(&port->dev, cxl_dpa_release, cxled); +} + static int init_hdm_decoder(struct cxl_port *port, struct cxl_decoder *cxld, int *target_map, void __iomem *hdm, int which, u64 *dpa_base) diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c index 635d63e2f8a8..e301cf7e5141 100644 --- a/drivers/cxl/core/port.c +++ b/drivers/cxl/core/port.c @@ -189,7 +189,76 @@ static ssize_t mode_show(struct device *dev, struct device_attribute *attr, return sysfs_emit(buf, "mixed\n"); } } -static DEVICE_ATTR_RO(mode); + +static ssize_t mode_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t len) +{ + struct cxl_endpoint_decoder *cxled = to_cxl_endpoint_decoder(dev); + enum cxl_decoder_mode mode; + ssize_t rc; + + if (sysfs_streq(buf, "pmem")) + mode = CXL_DECODER_PMEM; + else if (sysfs_streq(buf, "ram")) + mode = CXL_DECODER_RAM; + else + return -EINVAL; + + rc = cxl_dpa_set_mode(cxled, mode); + if (rc) + return rc; + + return len; +} +static DEVICE_ATTR_RW(mode); + +static ssize_t dpa_resource_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct cxl_endpoint_decoder *cxled = to_cxl_endpoint_decoder(dev); + u64 base = cxl_dpa_resource_start(cxled); + + return sysfs_emit(buf, "%#llx\n", base); +} +static DEVICE_ATTR_RO(dpa_resource); + +static ssize_t dpa_size_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct cxl_endpoint_decoder *cxled = to_cxl_endpoint_decoder(dev); + resource_size_t size = cxl_dpa_size(cxled); + + return sysfs_emit(buf, "%pa\n", &size); +} + +static ssize_t 
dpa_size_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t len) +{ + struct cxl_endpoint_decoder *cxled = to_cxl_endpoint_decoder(dev); + unsigned long long size; + ssize_t rc; + + rc = kstrtoull(buf, 0, &size); + if (rc) + return rc; + + if (!IS_ALIGNED(size, SZ_256M)) + return -EINVAL; + + rc = cxl_dpa_free(cxled); + if (rc) + return rc; + + if (size == 0) + return len; + + rc = cxl_dpa_alloc(cxled, size); + if (rc) + return rc; + + return len; +} +static DEVICE_ATTR_RW(dpa_size); static struct attribute *cxl_decoder_base_attrs[] = { &dev_attr_start.attr, @@ -242,6 +311,8 @@ static const struct attribute_group *cxl_decoder_switch_attribute_groups[] = { static struct attribute *cxl_decoder_endpoint_attrs[] = { &dev_attr_target_type.attr, &dev_attr_mode.attr, + &dev_attr_dpa_size.attr, + &dev_attr_dpa_resource.attr, NULL, }; -- cgit v1.2.3 From 538831f1beb818c93e5879bf19de37d89ec88ed6 Mon Sep 17 00:00:00 2001 From: Ben Widawsky Date: Sun, 10 Apr 2022 15:26:13 -0700 Subject: cxl/hdm: Add sysfs attributes for interleave ways + granularity The region provisioning flow involves selecting interleave ways + granularity settings for a region, and then programming the decoder topology to meet those constraints, if possible. For example, root decoders set the minimum interleave ways + granularity for any hosted regions. Given decoder programming is not atomic and collisions can occur between multiple requesting regions userspace will be responsible for conflict resolution and it needs these attributes to make those decisions. 
Signed-off-by: Ben Widawsky Reviewed-by: Jonathan Cameron Link: https://lore.kernel.org/r/165784332235.1758207.7185062713652694607.stgit@dwillia2-xfh.jf.intel.com [djbw: reword changelog, make read-only, add sysfs ABI documentation] Signed-off-by: Dan Williams --- Documentation/ABI/testing/sysfs-bus-cxl | 27 +++++++++++++++++++++++++++ drivers/cxl/core/port.c | 23 +++++++++++++++++++++++ 2 files changed, 50 insertions(+) (limited to 'Documentation/ABI') diff --git a/Documentation/ABI/testing/sysfs-bus-cxl b/Documentation/ABI/testing/sysfs-bus-cxl index 1ab69aca6152..223b8762d037 100644 --- a/Documentation/ABI/testing/sysfs-bus-cxl +++ b/Documentation/ABI/testing/sysfs-bus-cxl @@ -241,3 +241,30 @@ Description: allocations are enforced to occur in increasing 'decoderX.Y/id' order and frees are enforced to occur in decreasing 'decoderX.Y/id' order. + + +What: /sys/bus/cxl/devices/decoderX.Y/interleave_ways +Date: May, 2022 +KernelVersion: v5.20 +Contact: linux-cxl@vger.kernel.org +Description: + (RO) The number of targets across which this decoder's host + physical address (HPA) memory range is interleaved. The device + maps every Nth block of HPA (of size == + 'interleave_granularity') to consecutive DPA addresses. The + decoder's position in the interleave is determined by the + device's (endpoint or switch) switch ancestry. For root + decoders their interleave is specified by platform firmware and + they only specify a downstream target order for host bridges. + + +What: /sys/bus/cxl/devices/decoderX.Y/interleave_granularity +Date: May, 2022 +KernelVersion: v5.20 +Contact: linux-cxl@vger.kernel.org +Description: + (RO) The number of consecutive bytes of host physical address + space this decoder claims at address N before the decode rotates + to the next target in the interleave at address N + + interleave_granularity (assuming N is aligned to + interleave_granularity).
diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c index c7f1844d58de..719563d85ce1 100644 --- a/drivers/cxl/core/port.c +++ b/drivers/cxl/core/port.c @@ -260,10 +260,33 @@ static ssize_t dpa_size_store(struct device *dev, struct device_attribute *attr, } static DEVICE_ATTR_RW(dpa_size); +static ssize_t interleave_granularity_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct cxl_decoder *cxld = to_cxl_decoder(dev); + + return sysfs_emit(buf, "%d\n", cxld->interleave_granularity); +} + +static DEVICE_ATTR_RO(interleave_granularity); + +static ssize_t interleave_ways_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct cxl_decoder *cxld = to_cxl_decoder(dev); + + return sysfs_emit(buf, "%d\n", cxld->interleave_ways); +} + +static DEVICE_ATTR_RO(interleave_ways); + static struct attribute *cxl_decoder_base_attrs[] = { &dev_attr_start.attr, &dev_attr_size.attr, &dev_attr_locked.attr, + &dev_attr_interleave_granularity.attr, + &dev_attr_interleave_ways.attr, NULL, }; -- cgit v1.2.3 From 779dd20cfb56c510f89877cca45529fa9f8bc450 Mon Sep 17 00:00:00 2001 From: Ben Widawsky Date: Tue, 8 Jun 2021 10:28:34 -0700 Subject: cxl/region: Add region creation support CXL 2.0 allows for dynamic provisioning of new memory regions (system physical address resources like "System RAM" and "Persistent Memory"). Whereas DDR and PMEM resources are conveyed statically at boot, CXL allows for assembling and instantiating new regions from the available capacity of CXL memory expanders in the system. Sysfs with an "echo $region_name > $create_region_attribute" interface is chosen as the mechanism to initiate the provisioning process. This was chosen over ioctl() and netlink() to keep the configuration interface entirely in a pseudo-fs interface, and it was chosen over configfs since, aside from this one creation event, the interface is read-mostly. I.e. 
configfs supports cases where an object is designed to be provisioned each boot, like an iSCSI storage target, and CXL region creation is mostly for PMEM regions which are usually created once per lifetime of a server instance. This is an improvement over nvdimm that pre-created "seed" devices that tended to confuse users looking to determine which devices are active and which are idle. Recall that the major change that CXL brings over previous persistent memory architectures is the ability to dynamically define new regions. Compare that to drivers like 'nfit' where the region configuration is statically defined by platform firmware. Regions are created as a child of a root decoder that encompasses an address space with constraints. When created through sysfs, the root decoder is explicit. When created from an LSA's region structure a root decoder will possibly need to be inferred by the driver. Upon region creation through sysfs, a vacant region is created with a unique name. Regions have a number of attributes that must be configured before the region can be bound to the driver where HDM decoder programming is completed. An example of creating a new region: - Allocate a new region name: region=$(cat /sys/bus/cxl/devices/decoder0.0/create_pmem_region) - Create a new region by name: while region=$(cat /sys/bus/cxl/devices/decoder0.0/create_pmem_region) !
echo $region > /sys/bus/cxl/devices/decoder0.0/create_pmem_region do true; done - Region now exists in sysfs: stat -t /sys/bus/cxl/devices/decoder0.0/$region - Delete the region, and name: echo $region > /sys/bus/cxl/devices/decoder0.0/delete_region Signed-off-by: Ben Widawsky Reviewed-by: Jonathan Cameron Link: https://lore.kernel.org/r/165784333909.1758207.794374602146306032.stgit@dwillia2-xfh.jf.intel.com [djbw: simplify locking, reword changelog] Signed-off-by: Dan Williams --- Documentation/ABI/testing/sysfs-bus-cxl | 25 +++ Documentation/driver-api/cxl/memory-devices.rst | 11 ++ drivers/cxl/Kconfig | 5 + drivers/cxl/core/Makefile | 1 + drivers/cxl/core/core.h | 10 ++ drivers/cxl/core/port.c | 39 +++++ drivers/cxl/core/region.c | 201 ++++++++++++++++++++++++ drivers/cxl/cxl.h | 18 +++ tools/testing/cxl/Kbuild | 1 + 9 files changed, 311 insertions(+) create mode 100644 drivers/cxl/core/region.c (limited to 'Documentation/ABI') diff --git a/Documentation/ABI/testing/sysfs-bus-cxl b/Documentation/ABI/testing/sysfs-bus-cxl index 223b8762d037..6c2a501e0dd8 100644 --- a/Documentation/ABI/testing/sysfs-bus-cxl +++ b/Documentation/ABI/testing/sysfs-bus-cxl @@ -268,3 +268,28 @@ Description: to the next target in the interleave at address N + interleave_granularity (assuming N is aligned to interleave_granularity). + + +What: /sys/bus/cxl/devices/decoderX.Y/create_pmem_region +Date: May, 2022 +KernelVersion: v5.20 +Contact: linux-cxl@vger.kernel.org +Description: + (RW) Write a string in the form 'regionZ' to start the process + of defining a new persistent memory region (interleave-set) + within the decode range bounded by root decoder 'decoderX.Y'. + The value written must match the current value returned from + reading this attribute. An atomic compare exchange operation is + done on write to assign the requested id to a region and + allocate the region-id for the next creation attempt. 
EBUSY is + returned if the region name written does not match the current + cached value. + + +What: /sys/bus/cxl/devices/decoderX.Y/delete_region +Date: May, 2022 +KernelVersion: v5.20 +Contact: linux-cxl@vger.kernel.org +Description: + (WO) Write a string in the form 'regionZ' to delete that region, + provided it is currently idle / not bound to a driver. diff --git a/Documentation/driver-api/cxl/memory-devices.rst b/Documentation/driver-api/cxl/memory-devices.rst index db476bb170b6..66ddc58a21b1 100644 --- a/Documentation/driver-api/cxl/memory-devices.rst +++ b/Documentation/driver-api/cxl/memory-devices.rst @@ -362,6 +362,17 @@ CXL Core .. kernel-doc:: drivers/cxl/core/mbox.c :doc: cxl mbox +CXL Regions +----------- +.. kernel-doc:: drivers/cxl/region.h + :identifiers: + +.. kernel-doc:: drivers/cxl/core/region.c + :doc: cxl core region + +.. kernel-doc:: drivers/cxl/core/region.c + :identifiers: + External Interfaces =================== diff --git a/drivers/cxl/Kconfig b/drivers/cxl/Kconfig index 7adaaf80b302..184fdd4b8871 100644 --- a/drivers/cxl/Kconfig +++ b/drivers/cxl/Kconfig @@ -103,4 +103,9 @@ config CXL_SUSPEND def_bool y depends on SUSPEND && CXL_MEM +config CXL_REGION + bool + default CXL_BUS + select MEMREGION + endif diff --git a/drivers/cxl/core/Makefile b/drivers/cxl/core/Makefile index 9d35085d25af..79c7257f4107 100644 --- a/drivers/cxl/core/Makefile +++ b/drivers/cxl/core/Makefile @@ -10,3 +10,4 @@ cxl_core-y += memdev.o cxl_core-y += mbox.o cxl_core-y += pci.o cxl_core-y += hdm.o +cxl_core-$(CONFIG_CXL_REGION) += region.o diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h index 5551b82b2da0..29272df7e212 100644 --- a/drivers/cxl/core/core.h +++ b/drivers/cxl/core/core.h @@ -9,6 +9,16 @@ extern const struct device_type cxl_nvdimm_type; extern struct attribute_group cxl_base_attribute_group; +#ifdef CONFIG_CXL_REGION +extern struct device_attribute dev_attr_create_pmem_region; +extern struct device_attribute dev_attr_delete_region; 
+#define CXL_REGION_ATTR(x) (&dev_attr_##x.attr) +#define SET_CXL_REGION_ATTR(x) (&dev_attr_##x.attr), +#else +#define CXL_REGION_ATTR(x) NULL +#define SET_CXL_REGION_ATTR(x) +#endif + struct cxl_send_command; struct cxl_mem_query_commands; int cxl_query_cmd(struct cxl_memdev *cxlmd, diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c index 60c637a826a2..9a768822261b 100644 --- a/drivers/cxl/core/port.c +++ b/drivers/cxl/core/port.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-only /* Copyright(c) 2020 Intel Corporation. All rights reserved. */ #include +#include #include #include #include @@ -300,11 +301,35 @@ static struct attribute *cxl_decoder_root_attrs[] = { &dev_attr_cap_type2.attr, &dev_attr_cap_type3.attr, &dev_attr_target_list.attr, + SET_CXL_REGION_ATTR(create_pmem_region) + SET_CXL_REGION_ATTR(delete_region) NULL, }; +static bool can_create_pmem(struct cxl_root_decoder *cxlrd) +{ + unsigned long flags = CXL_DECODER_F_TYPE3 | CXL_DECODER_F_PMEM; + + return (cxlrd->cxlsd.cxld.flags & flags) == flags; +} + +static umode_t cxl_root_decoder_visible(struct kobject *kobj, struct attribute *a, int n) +{ + struct device *dev = kobj_to_dev(kobj); + struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(dev); + + if (a == CXL_REGION_ATTR(create_pmem_region) && !can_create_pmem(cxlrd)) + return 0; + + if (a == CXL_REGION_ATTR(delete_region) && !can_create_pmem(cxlrd)) + return 0; + + return a->mode; +} + static struct attribute_group cxl_decoder_root_attribute_group = { .attrs = cxl_decoder_root_attrs, + .is_visible = cxl_root_decoder_visible, }; static const struct attribute_group *cxl_decoder_root_attribute_groups[] = { @@ -387,6 +412,8 @@ static void cxl_root_decoder_release(struct device *dev) { struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(dev); + if (atomic_read(&cxlrd->region_id) >= 0) + memregion_free(atomic_read(&cxlrd->region_id)); __cxl_decoder_release(&cxlrd->cxlsd.cxld); kfree(cxlrd); } @@ -1484,6 +1511,18 @@ struct 
cxl_root_decoder *cxl_root_decoder_alloc(struct cxl_port *port, cxld = &cxlsd->cxld; cxld->dev.type = &cxl_decoder_root_type; + /* + * cxl_root_decoder_release() special cases negative ids to + * detect memregion_alloc() failures. + */ + atomic_set(&cxlrd->region_id, -1); + rc = memregion_alloc(GFP_KERNEL); + if (rc < 0) { + put_device(&cxld->dev); + return ERR_PTR(rc); + } + + atomic_set(&cxlrd->region_id, rc); return cxlrd; } EXPORT_SYMBOL_NS_GPL(cxl_root_decoder_alloc, CXL); diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c new file mode 100644 index 000000000000..4538756f295f --- /dev/null +++ b/drivers/cxl/core/region.c @@ -0,0 +1,201 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright(c) 2022 Intel Corporation. All rights reserved. */ +#include +#include +#include +#include +#include +#include +#include +#include "core.h" + +/** + * DOC: cxl core region + * + * CXL Regions represent mapped memory capacity in system physical address + * space. Whereas the CXL Root Decoders identify the bounds of potential CXL + * Memory ranges, Regions represent the active mapped capacity by the HDM + * Decoder Capability structures throughout the Host Bridges, Switches, and + * Endpoints in the topology. 
+ */ + +static struct cxl_region *to_cxl_region(struct device *dev); + +static void cxl_region_release(struct device *dev) +{ + struct cxl_region *cxlr = to_cxl_region(dev); + + memregion_free(cxlr->id); + kfree(cxlr); +} + +static const struct device_type cxl_region_type = { + .name = "cxl_region", + .release = cxl_region_release, +}; + +bool is_cxl_region(struct device *dev) +{ + return dev->type == &cxl_region_type; +} +EXPORT_SYMBOL_NS_GPL(is_cxl_region, CXL); + +static struct cxl_region *to_cxl_region(struct device *dev) +{ + if (dev_WARN_ONCE(dev, dev->type != &cxl_region_type, + "not a cxl_region device\n")) + return NULL; + + return container_of(dev, struct cxl_region, dev); +} + +static void unregister_region(void *dev) +{ + device_unregister(dev); +} + +static struct lock_class_key cxl_region_key; + +static struct cxl_region *cxl_region_alloc(struct cxl_root_decoder *cxlrd, int id) +{ + struct cxl_region *cxlr; + struct device *dev; + + cxlr = kzalloc(sizeof(*cxlr), GFP_KERNEL); + if (!cxlr) { + memregion_free(id); + return ERR_PTR(-ENOMEM); + } + + dev = &cxlr->dev; + device_initialize(dev); + lockdep_set_class(&dev->mutex, &cxl_region_key); + dev->parent = &cxlrd->cxlsd.cxld.dev; + device_set_pm_not_required(dev); + dev->bus = &cxl_bus_type; + dev->type = &cxl_region_type; + cxlr->id = id; + + return cxlr; +} + +/** + * devm_cxl_add_region - Adds a region to a decoder + * @cxlrd: root decoder + * @id: memregion id to create, or memregion_free() on failure + * @mode: mode for the endpoint decoders of this region + * @type: select whether this is an expander or accelerator (type-2 or type-3) + * + * This is the second step of region initialization. Regions exist within an + * address space which is mapped by a @cxlrd. + * + * Return: the region if it was added to the @cxlrd, else an ERR_PTR() error + * code. The region will be named "regionZ" where Z is the unique region number.
+ */ +static struct cxl_region *devm_cxl_add_region(struct cxl_root_decoder *cxlrd, + int id, + enum cxl_decoder_mode mode, + enum cxl_decoder_type type) +{ + struct cxl_port *port = to_cxl_port(cxlrd->cxlsd.cxld.dev.parent); + struct cxl_region *cxlr; + struct device *dev; + int rc; + + cxlr = cxl_region_alloc(cxlrd, id); + if (IS_ERR(cxlr)) + return cxlr; + cxlr->mode = mode; + cxlr->type = type; + + dev = &cxlr->dev; + rc = dev_set_name(dev, "region%d", id); + if (rc) + goto err; + + rc = device_add(dev); + if (rc) + goto err; + + rc = devm_add_action_or_reset(port->uport, unregister_region, cxlr); + if (rc) + return ERR_PTR(rc); + + dev_dbg(port->uport, "%s: created %s\n", + dev_name(&cxlrd->cxlsd.cxld.dev), dev_name(dev)); + return cxlr; + +err: + put_device(dev); + return ERR_PTR(rc); +} + +static ssize_t create_pmem_region_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(dev); + + return sysfs_emit(buf, "region%u\n", atomic_read(&cxlrd->region_id)); +} + +static ssize_t create_pmem_region_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t len) +{ + struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(dev); + struct cxl_region *cxlr; + int id, rc; + + rc = sscanf(buf, "region%d\n", &id); + if (rc != 1) + return -EINVAL; + + rc = memregion_alloc(GFP_KERNEL); + if (rc < 0) + return rc; + + if (atomic_cmpxchg(&cxlrd->region_id, id, rc) != id) { + memregion_free(rc); + return -EBUSY; + } + + cxlr = devm_cxl_add_region(cxlrd, id, CXL_DECODER_PMEM, + CXL_DECODER_EXPANDER); + if (IS_ERR(cxlr)) + return PTR_ERR(cxlr); + + return len; +} +DEVICE_ATTR_RW(create_pmem_region); + +static struct cxl_region * +cxl_find_region_by_name(struct cxl_root_decoder *cxlrd, const char *name) +{ + struct cxl_decoder *cxld = &cxlrd->cxlsd.cxld; + struct device *region_dev; + + region_dev = device_find_child_by_name(&cxld->dev, name); + if (!region_dev) + return 
ERR_PTR(-ENODEV); + + return to_cxl_region(region_dev); +} + +static ssize_t delete_region_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t len) +{ + struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(dev); + struct cxl_port *port = to_cxl_port(dev->parent); + struct cxl_region *cxlr; + + cxlr = cxl_find_region_by_name(cxlrd, buf); + if (IS_ERR(cxlr)) + return PTR_ERR(cxlr); + + devm_release_action(port->uport, unregister_region, cxlr); + put_device(&cxlr->dev); + + return len; +} +DEVICE_ATTR_WO(delete_region); diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index c3c62568d287..0b38ee99bddf 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -286,13 +286,29 @@ struct cxl_switch_decoder { /** * struct cxl_root_decoder - Static platform CXL address decoder * @res: host / parent resource for region allocations + * @region_id: region id for next region provisioning event * @cxlsd: base cxl switch decoder */ struct cxl_root_decoder { struct resource *res; + atomic_t region_id; struct cxl_switch_decoder cxlsd; }; +/** + * struct cxl_region - CXL region + * @dev: This region's device + * @id: This region's id. 
Id is globally unique across all regions + * @mode: Endpoint decoder allocation / access mode + * @type: Endpoint decoder target type + */ +struct cxl_region { + struct device dev; + int id; + enum cxl_decoder_mode mode; + enum cxl_decoder_type type; +}; + /** * enum cxl_nvdimm_brige_state - state machine for managing bus rescans * @CXL_NVB_NEW: Set at bridge create and after cxl_pmem_wq is destroyed @@ -447,6 +463,8 @@ struct cxl_hdm *devm_cxl_setup_hdm(struct cxl_port *port); int devm_cxl_enumerate_decoders(struct cxl_hdm *cxlhdm); int devm_cxl_add_passthrough_decoder(struct cxl_port *port); +bool is_cxl_region(struct device *dev); + extern struct bus_type cxl_bus_type; struct cxl_driver { diff --git a/tools/testing/cxl/Kbuild b/tools/testing/cxl/Kbuild index 33543231d453..500be85729cc 100644 --- a/tools/testing/cxl/Kbuild +++ b/tools/testing/cxl/Kbuild @@ -47,6 +47,7 @@ cxl_core-y += $(CXL_CORE_SRC)/memdev.o cxl_core-y += $(CXL_CORE_SRC)/mbox.o cxl_core-y += $(CXL_CORE_SRC)/pci.o cxl_core-y += $(CXL_CORE_SRC)/hdm.o +cxl_core-$(CONFIG_CXL_REGION) += $(CXL_CORE_SRC)/region.o cxl_core-y += config_check.o obj-m += test/ -- cgit v1.2.3 From dd5ba0ebbdc414f4dda4dc4ec076f46fb6f26ffd Mon Sep 17 00:00:00 2001 From: Ben Widawsky Date: Thu, 27 May 2021 13:30:41 -0700 Subject: cxl/region: Add a 'uuid' attribute The process of provisioning a region involves triggering the creation of a new region object, pouring in the configuration, and then binding that configured object to the region driver to start its operation. For persistent memory regions the CXL specification mandates that each one be identified by a uuid. Add an ABI for userspace to specify a region's uuid.
Signed-off-by: Ben Widawsky [djbw: simplify locking] Reviewed-by: Jonathan Cameron Link: https://lore.kernel.org/r/165784334465.1758207.8224025435884752570.stgit@dwillia2-xfh.jf.intel.com Signed-off-by: Dan Williams --- Documentation/ABI/testing/sysfs-bus-cxl | 10 +++ drivers/cxl/core/region.c | 118 ++++++++++++++++++++++++++++++++ drivers/cxl/cxl.h | 25 +++++++ 3 files changed, 153 insertions(+) (limited to 'Documentation/ABI') diff --git a/Documentation/ABI/testing/sysfs-bus-cxl b/Documentation/ABI/testing/sysfs-bus-cxl index 6c2a501e0dd8..f4515a1c74fe 100644 --- a/Documentation/ABI/testing/sysfs-bus-cxl +++ b/Documentation/ABI/testing/sysfs-bus-cxl @@ -293,3 +293,13 @@ Contact: linux-cxl@vger.kernel.org Description: (WO) Write a string in the form 'regionZ' to delete that region, provided it is currently idle / not bound to a driver. + + +What: /sys/bus/cxl/devices/regionZ/uuid +Date: May, 2022 +KernelVersion: v5.20 +Contact: linux-cxl@vger.kernel.org +Description: + (RW) Write a unique identifier for the region. This field must + be set for persistent regions and it must not conflict with the + UUID of another region. diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index 4538756f295f..98ce59a16143 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include "core.h" @@ -17,10 +18,126 @@ * Memory ranges, Regions represent the active mapped capacity by the HDM * Decoder Capability structures throughout the Host Bridges, Switches, and * Endpoints in the topology. + * + * Region configuration has ordering constraints. UUID may be set at any time + * but is only visible for persistent regions. + */ + +/* + * All changes to the interleave configuration occur with this lock held + * for write. 
*/ +static DECLARE_RWSEM(cxl_region_rwsem); static struct cxl_region *to_cxl_region(struct device *dev); +static ssize_t uuid_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct cxl_region *cxlr = to_cxl_region(dev); + struct cxl_region_params *p = &cxlr->params; + ssize_t rc; + + rc = down_read_interruptible(&cxl_region_rwsem); + if (rc) + return rc; + rc = sysfs_emit(buf, "%pUb\n", &p->uuid); + up_read(&cxl_region_rwsem); + + return rc; +} + +static int is_dup(struct device *match, void *data) +{ + struct cxl_region_params *p; + struct cxl_region *cxlr; + uuid_t *uuid = data; + + if (!is_cxl_region(match)) + return 0; + + lockdep_assert_held(&cxl_region_rwsem); + cxlr = to_cxl_region(match); + p = &cxlr->params; + + if (uuid_equal(&p->uuid, uuid)) { + dev_dbg(match, "already has uuid: %pUb\n", uuid); + return -EBUSY; + } + + return 0; +} + +static ssize_t uuid_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t len) +{ + struct cxl_region *cxlr = to_cxl_region(dev); + struct cxl_region_params *p = &cxlr->params; + uuid_t temp; + ssize_t rc; + + if (len != UUID_STRING_LEN + 1) + return -EINVAL; + + rc = uuid_parse(buf, &temp); + if (rc) + return rc; + + if (uuid_is_null(&temp)) + return -EINVAL; + + rc = down_write_killable(&cxl_region_rwsem); + if (rc) + return rc; + + if (uuid_equal(&p->uuid, &temp)) + goto out; + + rc = -EBUSY; + if (p->state >= CXL_CONFIG_ACTIVE) + goto out; + + rc = bus_for_each_dev(&cxl_bus_type, NULL, &temp, is_dup); + if (rc < 0) + goto out; + + uuid_copy(&p->uuid, &temp); +out: + up_write(&cxl_region_rwsem); + + if (rc) + return rc; + return len; +} +static DEVICE_ATTR_RW(uuid); + +static umode_t cxl_region_visible(struct kobject *kobj, struct attribute *a, + int n) +{ + struct device *dev = kobj_to_dev(kobj); + struct cxl_region *cxlr = to_cxl_region(dev); + + if (a == &dev_attr_uuid.attr && cxlr->mode != CXL_DECODER_PMEM) + return 0; + return a->mode; +} + +static struct 
attribute *cxl_region_attrs[] = { + &dev_attr_uuid.attr, + NULL, +}; + +static const struct attribute_group cxl_region_group = { + .attrs = cxl_region_attrs, + .is_visible = cxl_region_visible, +}; + +static const struct attribute_group *region_groups[] = { + &cxl_base_attribute_group, + &cxl_region_group, + NULL, +}; + static void cxl_region_release(struct device *dev) { struct cxl_region *cxlr = to_cxl_region(dev); @@ -32,6 +149,7 @@ static void cxl_region_release(struct device *dev) static const struct device_type cxl_region_type = { .name = "cxl_region", .release = cxl_region_release, + .groups = region_groups }; bool is_cxl_region(struct device *dev) diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index 0b38ee99bddf..ea7255020808 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -295,18 +295,43 @@ struct cxl_root_decoder { struct cxl_switch_decoder cxlsd; }; +/* + * enum cxl_config_state - State machine for region configuration + * @CXL_CONFIG_IDLE: Any sysfs attribute can be written freely + * @CXL_CONFIG_ACTIVE: All targets have been added the region is now + * active + */ +enum cxl_config_state { + CXL_CONFIG_IDLE, + CXL_CONFIG_ACTIVE, +}; + +/** + * struct cxl_region_params - region settings + * @state: allow the driver to lockdown further parameter changes + * @uuid: unique id for persistent regions + * + * State transitions are protected by the cxl_region_rwsem + */ +struct cxl_region_params { + enum cxl_config_state state; + uuid_t uuid; +}; + /** * struct cxl_region - CXL region * @dev: This region's device * @id: This region's id. 
Id is globally unique across all regions * @mode: Endpoint decoder allocation / access mode * @type: Endpoint decoder target type + * @params: active + config params for the region */ struct cxl_region { struct device dev; int id; enum cxl_decoder_mode mode; enum cxl_decoder_type type; + struct cxl_region_params params; }; /** -- cgit v1.2.3 From 80d10a6cee05029cae9d9d6e8ddb799ea6d01e0c Mon Sep 17 00:00:00 2001 From: Ben Widawsky Date: Mon, 25 Apr 2022 11:36:48 -0700 Subject: cxl/region: Add interleave geometry attributes Add ABI to allow the number of devices that comprise a region to be set as well as the interleave granularity for the region. Signed-off-by: Ben Widawsky [djbw: reword changelog] Reviewed-by: Jonathan Cameron Link: https://lore.kernel.org/r/20220624041950.559155-11-dan.j.williams@intel.com Signed-off-by: Dan Williams --- Documentation/ABI/testing/sysfs-bus-cxl | 21 +++++ drivers/cxl/core/region.c | 134 ++++++++++++++++++++++++++++++++ drivers/cxl/cxl.h | 33 ++++++++ 3 files changed, 188 insertions(+) (limited to 'Documentation/ABI') diff --git a/Documentation/ABI/testing/sysfs-bus-cxl b/Documentation/ABI/testing/sysfs-bus-cxl index f4515a1c74fe..df347174fb37 100644 --- a/Documentation/ABI/testing/sysfs-bus-cxl +++ b/Documentation/ABI/testing/sysfs-bus-cxl @@ -303,3 +303,24 @@ Description: (RW) Write a unique identifier for the region. This field must be set for persistent regions and it must not conflict with the UUID of another region. + + +What: /sys/bus/cxl/devices/regionZ/interleave_granularity +Date: May, 2022 +KernelVersion: v5.20 +Contact: linux-cxl@vger.kernel.org +Description: + (RW) Set the number of consecutive bytes each device in the + interleave set will claim. The possible interleave granularity + values are determined by the CXL spec and the participating + devices. 
+ + +What: /sys/bus/cxl/devices/regionZ/interleave_ways +Date: May, 2022 +KernelVersion: v5.20 +Contact: linux-cxl@vger.kernel.org +Description: + (RW) The number of devices participating in the region is set + by writing this value. Each device will provide + 1/interleave_ways of storage for the region. diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index 98ce59a16143..45bc2fa18837 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include "core.h" @@ -21,6 +22,8 @@ * * Region configuration has ordering constraints. UUID may be set at any time * but is only visible for persistent regions. + * 1. Interleave granularity + * 2. Interleave size */ /* @@ -122,8 +125,135 @@ static umode_t cxl_region_visible(struct kobject *kobj, struct attribute *a, return a->mode; } +static ssize_t interleave_ways_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct cxl_region *cxlr = to_cxl_region(dev); + struct cxl_region_params *p = &cxlr->params; + ssize_t rc; + + rc = down_read_interruptible(&cxl_region_rwsem); + if (rc) + return rc; + rc = sysfs_emit(buf, "%d\n", p->interleave_ways); + up_read(&cxl_region_rwsem); + + return rc; +} + +static ssize_t interleave_ways_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t len) +{ + struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(dev->parent); + struct cxl_decoder *cxld = &cxlrd->cxlsd.cxld; + struct cxl_region *cxlr = to_cxl_region(dev); + struct cxl_region_params *p = &cxlr->params; + int rc, val; + u8 iw; + + rc = kstrtoint(buf, 0, &val); + if (rc) + return rc; + + rc = ways_to_cxl(val, &iw); + if (rc) + return rc; + + /* + * Even for x3, x6, and x12 interleaves the region interleave must be a + * power of 2 multiple of the host bridge interleave. 
+ */ + if (!is_power_of_2(val / cxld->interleave_ways) || + (val % cxld->interleave_ways)) { + dev_dbg(&cxlr->dev, "invalid interleave: %d\n", val); + return -EINVAL; + } + + rc = down_write_killable(&cxl_region_rwsem); + if (rc) + return rc; + if (p->state >= CXL_CONFIG_INTERLEAVE_ACTIVE) { + rc = -EBUSY; + goto out; + } + + p->interleave_ways = val; +out: + up_write(&cxl_region_rwsem); + if (rc) + return rc; + return len; +} +static DEVICE_ATTR_RW(interleave_ways); + +static ssize_t interleave_granularity_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct cxl_region *cxlr = to_cxl_region(dev); + struct cxl_region_params *p = &cxlr->params; + ssize_t rc; + + rc = down_read_interruptible(&cxl_region_rwsem); + if (rc) + return rc; + rc = sysfs_emit(buf, "%d\n", p->interleave_granularity); + up_read(&cxl_region_rwsem); + + return rc; +} + +static ssize_t interleave_granularity_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t len) +{ + struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(dev->parent); + struct cxl_decoder *cxld = &cxlrd->cxlsd.cxld; + struct cxl_region *cxlr = to_cxl_region(dev); + struct cxl_region_params *p = &cxlr->params; + int rc, val; + u16 ig; + + rc = kstrtoint(buf, 0, &val); + if (rc) + return rc; + + rc = granularity_to_cxl(val, &ig); + if (rc) + return rc; + + /* + * Disallow region granularity less than root granularity to + * simplify the implementation. Otherwise, region's with a + * granularity less than the root interleave result in needing + * multiple endpoints to support a single slot in the + * interleave. 
+ */ + if (val < cxld->interleave_granularity) + return -EINVAL; + + rc = down_write_killable(&cxl_region_rwsem); + if (rc) + return rc; + if (p->state >= CXL_CONFIG_INTERLEAVE_ACTIVE) { + rc = -EBUSY; + goto out; + } + + p->interleave_granularity = val; +out: + up_write(&cxl_region_rwsem); + if (rc) + return rc; + return len; +} +static DEVICE_ATTR_RW(interleave_granularity); + static struct attribute *cxl_region_attrs[] = { &dev_attr_uuid.attr, + &dev_attr_interleave_ways.attr, + &dev_attr_interleave_granularity.attr, NULL, }; @@ -216,6 +346,8 @@ static struct cxl_region *devm_cxl_add_region(struct cxl_root_decoder *cxlrd, enum cxl_decoder_type type) { struct cxl_port *port = to_cxl_port(cxlrd->cxlsd.cxld.dev.parent); + struct cxl_decoder *cxld = &cxlrd->cxlsd.cxld; + struct cxl_region_params *p; struct cxl_region *cxlr; struct device *dev; int rc; @@ -223,8 +355,10 @@ static struct cxl_region *devm_cxl_add_region(struct cxl_root_decoder *cxlrd, cxlr = cxl_region_alloc(cxlrd, id); if (IS_ERR(cxlr)) return cxlr; + p = &cxlr->params; cxlr->mode = mode; cxlr->type = type; + p->interleave_granularity = cxld->interleave_granularity; dev = &cxlr->dev; rc = dev_set_name(dev, "region%d", id); diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index ea7255020808..e36f7577978b 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -7,6 +7,7 @@ #include #include #include +#include #include /** @@ -92,6 +93,31 @@ static inline int cxl_to_ways(u8 eniw, unsigned int *val) return 0; } +static inline int granularity_to_cxl(int g, u16 *ig) +{ + if (g > SZ_16K || g < 256 || !is_power_of_2(g)) + return -EINVAL; + *ig = ilog2(g) - 8; + return 0; +} + +static inline int ways_to_cxl(int ways, u8 *iw) +{ + if (ways > 16) + return -EINVAL; + if (is_power_of_2(ways)) { + *iw = ilog2(ways); + return 0; + } + if (ways % 3) + return -EINVAL; + ways /= 3; + if (!is_power_of_2(ways)) + return -EINVAL; + *iw = ilog2(ways) + 8; + return 0; +} + /* CXL 2.0 8.2.8.1 Device Capabilities Array 
Register */ #define CXLDEV_CAP_ARRAY_OFFSET 0x0 #define CXLDEV_CAP_ARRAY_CAP_ID 0 @@ -298,11 +324,14 @@ struct cxl_root_decoder { /* * enum cxl_config_state - State machine for region configuration * @CXL_CONFIG_IDLE: Any sysfs attribute can be written freely + * @CXL_CONFIG_INTERLEAVE_ACTIVE: region size has been set, no more + * changes to interleave_ways or interleave_granularity * @CXL_CONFIG_ACTIVE: All targets have been added the region is now * active */ enum cxl_config_state { CXL_CONFIG_IDLE, + CXL_CONFIG_INTERLEAVE_ACTIVE, CXL_CONFIG_ACTIVE, }; @@ -310,12 +339,16 @@ enum cxl_config_state { * struct cxl_region_params - region settings * @state: allow the driver to lockdown further parameter changes * @uuid: unique id for persistent regions + * @interleave_ways: number of endpoints in the region + * @interleave_granularity: capacity each endpoint contributes to a stripe * * State transitions are protected by the cxl_region_rwsem */ struct cxl_region_params { enum cxl_config_state state; uuid_t uuid; + int interleave_ways; + int interleave_granularity; }; /** -- cgit v1.2.3 From 23a22cd1c98be518774fe7f7e8a5203af050525a Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 25 Apr 2022 11:43:44 -0700 Subject: cxl/region: Allocate HPA capacity to regions After a region's interleave parameters (ways and granularity) are set, add a way for regions to allocate HPA (host physical address space) from the free capacity in their parent root-decoder. The allocator for this capacity reuses the 'struct resource' based allocator used for CONFIG_DEVICE_PRIVATE. Once the tuple of "ways, granularity, [uuid], and size" is set the region configuration transitions to the CXL_CONFIG_INTERLEAVE_ACTIVE state which is a precursor to allowing endpoint decoders to be added to a region. 
Co-developed-by: Ben Widawsky Signed-off-by: Ben Widawsky Reviewed-by: Jonathan Cameron Link: https://lore.kernel.org/r/165784335630.1758207.420216490941955417.stgit@dwillia2-xfh.jf.intel.com Signed-off-by: Dan Williams --- Documentation/ABI/testing/sysfs-bus-cxl | 29 ++++++ drivers/cxl/Kconfig | 3 + drivers/cxl/core/region.c | 150 +++++++++++++++++++++++++++++++- drivers/cxl/cxl.h | 2 + 4 files changed, 183 insertions(+), 1 deletion(-) (limited to 'Documentation/ABI') diff --git a/Documentation/ABI/testing/sysfs-bus-cxl b/Documentation/ABI/testing/sysfs-bus-cxl index df347174fb37..52764c172338 100644 --- a/Documentation/ABI/testing/sysfs-bus-cxl +++ b/Documentation/ABI/testing/sysfs-bus-cxl @@ -324,3 +324,32 @@ Description: (RW) Configures the number of devices participating in the region is set by writing this value. Each device will provide 1/interleave_ways of storage for the region. + + +What: /sys/bus/cxl/devices/regionZ/size +Date: May, 2022 +KernelVersion: v5.20 +Contact: linux-cxl@vger.kernel.org +Description: + (RW) System physical address space to be consumed by the region. + When written trigger the driver to allocate space out of the + parent root decoder's address space. When read the size of the + address space is reported and should match the span of the + region's resource attribute. Size shall be set after the + interleave configuration parameters. Once set it cannot be + changed, only freed by writing 0. The kernel makes no guarantees + that data is maintained over an address space freeing event, and + there is no guarantee that a free followed by an allocate + results in the same address being allocated. + + +What: /sys/bus/cxl/devices/regionZ/resource +Date: May, 2022 +KernelVersion: v5.20 +Contact: linux-cxl@vger.kernel.org +Description: + (RO) A region is a contiguous partition of a CXL root decoder + address space. 
Region capacity is allocated by writing to the + size attribute, the resulting physical address space determined + by the driver is reflected here. It is therefore not useful to + read this before writing a value to the size attribute. diff --git a/drivers/cxl/Kconfig b/drivers/cxl/Kconfig index 184fdd4b8871..768ced3d6fe8 100644 --- a/drivers/cxl/Kconfig +++ b/drivers/cxl/Kconfig @@ -106,6 +106,9 @@ config CXL_SUSPEND config CXL_REGION bool default CXL_BUS + # For MAX_PHYSMEM_BITS + depends on SPARSEMEM select MEMREGION + select GET_FREE_REGION endif diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index 45bc2fa18837..a84b007df4be 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -250,10 +250,152 @@ out: } static DEVICE_ATTR_RW(interleave_granularity); +static ssize_t resource_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct cxl_region *cxlr = to_cxl_region(dev); + struct cxl_region_params *p = &cxlr->params; + u64 resource = -1ULL; + ssize_t rc; + + rc = down_read_interruptible(&cxl_region_rwsem); + if (rc) + return rc; + if (p->res) + resource = p->res->start; + rc = sysfs_emit(buf, "%#llx\n", resource); + up_read(&cxl_region_rwsem); + + return rc; +} +static DEVICE_ATTR_RO(resource); + +static int alloc_hpa(struct cxl_region *cxlr, resource_size_t size) +{ + struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(cxlr->dev.parent); + struct cxl_region_params *p = &cxlr->params; + struct resource *res; + u64 remainder = 0; /* 64-bit: SZ_256M * 16 ways does not fit in u32 */ + + lockdep_assert_held_write(&cxl_region_rwsem); + + /* Nothing to do if the current allocation already matches @size 
*/ + if (p->res && resource_size(p->res) == size) + return 0; + + /* To change size the old size must be freed first */ + if (p->res) + return -EBUSY; + + if (p->state >= CXL_CONFIG_INTERLEAVE_ACTIVE) + return -EBUSY; + + /* ways, granularity and uuid (if PMEM) need to be set before HPA */ + if (!p->interleave_ways || !p->interleave_granularity || + (cxlr->mode == CXL_DECODER_PMEM && uuid_is_null(&p->uuid))) + return -ENXIO; + + div64_u64_rem(size, (u64)SZ_256M * p->interleave_ways, &remainder); + if (remainder) + return -EINVAL; + + res = alloc_free_mem_region(cxlrd->res, size, SZ_256M, + dev_name(&cxlr->dev)); + if (IS_ERR(res)) { + dev_dbg(&cxlr->dev, "failed to allocate HPA: %ld\n", + PTR_ERR(res)); + return PTR_ERR(res); + } + + p->res = res; + p->state = CXL_CONFIG_INTERLEAVE_ACTIVE; + + return 0; +} + +static void cxl_region_iomem_release(struct cxl_region *cxlr) +{ + struct cxl_region_params *p = &cxlr->params; + + if (device_is_registered(&cxlr->dev)) + lockdep_assert_held_write(&cxl_region_rwsem); + if (p->res) { + remove_resource(p->res); + kfree(p->res); + p->res = NULL; + } +} + +static int free_hpa(struct cxl_region *cxlr) +{ + struct cxl_region_params *p = &cxlr->params; + + lockdep_assert_held_write(&cxl_region_rwsem); + + if (!p->res) + return 0; + + if (p->state >= CXL_CONFIG_ACTIVE) + return -EBUSY; + + cxl_region_iomem_release(cxlr); + p->state = CXL_CONFIG_IDLE; + return 0; +} + +static ssize_t size_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t len) +{ + struct cxl_region *cxlr = to_cxl_region(dev); + u64 val; + int rc; + + rc = kstrtou64(buf, 0, &val); + if (rc) + return rc; + + rc = down_write_killable(&cxl_region_rwsem); + if (rc) + return rc; + + if (val) + rc = alloc_hpa(cxlr, val); + else + rc = free_hpa(cxlr); + up_write(&cxl_region_rwsem); + + if (rc) + return rc; + + return len; +} + +static ssize_t size_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct cxl_region *cxlr = 
to_cxl_region(dev); + struct cxl_region_params *p = &cxlr->params; + u64 size = 0; + ssize_t rc; + + rc = down_read_interruptible(&cxl_region_rwsem); + if (rc) + return rc; + if (p->res) + size = resource_size(p->res); + rc = sysfs_emit(buf, "%#llx\n", size); + up_read(&cxl_region_rwsem); + + return rc; +} +static DEVICE_ATTR_RW(size); + static struct attribute *cxl_region_attrs[] = { &dev_attr_uuid.attr, &dev_attr_interleave_ways.attr, &dev_attr_interleave_granularity.attr, + &dev_attr_resource.attr, + &dev_attr_size.attr, NULL, }; @@ -299,7 +441,11 @@ static struct cxl_region *to_cxl_region(struct device *dev) static void unregister_region(void *dev) { - device_unregister(dev); + struct cxl_region *cxlr = to_cxl_region(dev); + + device_del(dev); + cxl_region_iomem_release(cxlr); + put_device(dev); } static struct lock_class_key cxl_region_key; @@ -451,3 +597,5 @@ static ssize_t delete_region_store(struct device *dev, return len; } DEVICE_ATTR_WO(delete_region); + +MODULE_IMPORT_NS(CXL); diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index e36f7577978b..9518b57c19b6 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -341,6 +341,7 @@ enum cxl_config_state { * @uuid: unique id for persistent regions * @interleave_ways: number of endpoints in the region * @interleave_granularity: capacity each endpoint contributes to a stripe + * @res: allocated iomem capacity for this region * * State transitions are protected by the cxl_region_rwsem */ @@ -349,6 +350,7 @@ struct cxl_region_params { uuid_t uuid; int interleave_ways; int interleave_granularity; + struct resource *res; }; /** -- cgit v1.2.3 From b9686e8c8e39d4072081ef078c04915ee51c8af4 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sat, 4 Jun 2022 15:49:53 -0700 Subject: cxl/region: Enable the assignment of endpoint decoders to regions The region provisioning process involves allocating DPA to a set of endpoint decoders, and HPA plus the region geometry to a region device. 
Then the decoder is assigned to the region. At this point several validation steps can be performed to validate that the decoder is suitable to participate in the region. Co-developed-by: Ben Widawsky Signed-off-by: Ben Widawsky Reviewed-by: Jonathan Cameron Reported-by: kernel test robot Link: https://lore.kernel.org/r/165784336184.1758207.16403282029203949622.stgit@dwillia2-xfh.jf.intel.com Signed-off-by: Dan Williams --- Documentation/ABI/testing/sysfs-bus-cxl | 19 +++ drivers/cxl/core/core.h | 6 + drivers/cxl/core/hdm.c | 15 +- drivers/cxl/core/port.c | 9 + drivers/cxl/core/region.c | 282 +++++++++++++++++++++++++++++++- drivers/cxl/cxl.h | 11 ++ 6 files changed, 340 insertions(+), 2 deletions(-) (limited to 'Documentation/ABI') diff --git a/Documentation/ABI/testing/sysfs-bus-cxl b/Documentation/ABI/testing/sysfs-bus-cxl index 52764c172338..4ecf6cf40fd3 100644 --- a/Documentation/ABI/testing/sysfs-bus-cxl +++ b/Documentation/ABI/testing/sysfs-bus-cxl @@ -353,3 +353,22 @@ Description: size attribute, the resulting physical address space determined by the driver is reflected here. It is therefore not useful to read this before writing a value to the size attribute. + + +What: /sys/bus/cxl/devices/regionZ/target[0..N] +Date: May, 2022 +KernelVersion: v5.20 +Contact: linux-cxl@vger.kernel.org +Description: + (RW) Write an endpoint decoder object name to 'targetX' where X + is the intended position of the endpoint device in the region + interleave and N is the 'interleave_ways' setting for the + region. ENXIO is returned if the write results in an impossible + to map decode scenario, like the endpoint is unreachable at that + position relative to the root decoder interleave. EBUSY is + returned if the position in the region is already occupied, or + if the region is not in a state to accept interleave + configuration changes. EINVAL is returned if the object name is + not an endpoint decoder. 
Once all positions have been + successfully written a final validation for decode conflicts is + performed before activating the region. diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h index 29272df7e212..a60ad9f656fd 100644 --- a/drivers/cxl/core/core.h +++ b/drivers/cxl/core/core.h @@ -12,9 +12,14 @@ extern struct attribute_group cxl_base_attribute_group; #ifdef CONFIG_CXL_REGION extern struct device_attribute dev_attr_create_pmem_region; extern struct device_attribute dev_attr_delete_region; +extern struct device_attribute dev_attr_region; +void cxl_decoder_kill_region(struct cxl_endpoint_decoder *cxled); #define CXL_REGION_ATTR(x) (&dev_attr_##x.attr) #define SET_CXL_REGION_ATTR(x) (&dev_attr_##x.attr), #else +static inline void cxl_decoder_kill_region(struct cxl_endpoint_decoder *cxled) +{ +} #define CXL_REGION_ATTR(x) NULL #define SET_CXL_REGION_ATTR(x) #endif @@ -34,6 +39,7 @@ int cxl_dpa_alloc(struct cxl_endpoint_decoder *cxled, unsigned long long size); int cxl_dpa_free(struct cxl_endpoint_decoder *cxled); resource_size_t cxl_dpa_size(struct cxl_endpoint_decoder *cxled); resource_size_t cxl_dpa_resource_start(struct cxl_endpoint_decoder *cxled); +extern struct rw_semaphore cxl_dpa_rwsem; int cxl_memdev_init(void); void cxl_memdev_exit(void); diff --git a/drivers/cxl/core/hdm.c b/drivers/cxl/core/hdm.c index 96346c6be021..6b6d3be3a340 100644 --- a/drivers/cxl/core/hdm.c +++ b/drivers/cxl/core/hdm.c @@ -17,7 +17,7 @@ * for enumerating these registers and capabilities. 
*/ -static DECLARE_RWSEM(cxl_dpa_rwsem); +DECLARE_RWSEM(cxl_dpa_rwsem); static int add_hdm_decoder(struct cxl_port *port, struct cxl_decoder *cxld, int *target_map) @@ -321,6 +321,12 @@ int cxl_dpa_free(struct cxl_endpoint_decoder *cxled) rc = 0; goto out; } + if (cxled->cxld.region) { + dev_dbg(dev, "decoder assigned to: %s\n", + dev_name(&cxled->cxld.region->dev)); + rc = -EBUSY; + goto out; + } if (cxled->cxld.flags & CXL_DECODER_F_ENABLE) { dev_dbg(dev, "decoder enabled\n"); rc = -EBUSY; @@ -397,6 +403,13 @@ int cxl_dpa_alloc(struct cxl_endpoint_decoder *cxled, unsigned long long size) int rc; down_write(&cxl_dpa_rwsem); + if (cxled->cxld.region) { + dev_dbg(dev, "decoder attached to %s\n", + dev_name(&cxled->cxld.region->dev)); + rc = -EBUSY; + goto out; + } + if (cxled->cxld.flags & CXL_DECODER_F_ENABLE) { dev_dbg(dev, "decoder enabled\n"); rc = -EBUSY; diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c index 9a768822261b..ff6ea869fcc9 100644 --- a/drivers/cxl/core/port.c +++ b/drivers/cxl/core/port.c @@ -288,6 +288,7 @@ static struct attribute *cxl_decoder_base_attrs[] = { &dev_attr_locked.attr, &dev_attr_interleave_granularity.attr, &dev_attr_interleave_ways.attr, + SET_CXL_REGION_ATTR(region) NULL, }; @@ -1583,6 +1584,7 @@ struct cxl_endpoint_decoder *cxl_endpoint_decoder_alloc(struct cxl_port *port) if (!cxled) return ERR_PTR(-ENOMEM); + cxled->pos = -1; cxld = &cxled->cxld; rc = cxl_decoder_init(port, cxld); if (rc) { @@ -1687,6 +1689,13 @@ EXPORT_SYMBOL_NS_GPL(cxl_decoder_add, CXL); static void cxld_unregister(void *dev) { + struct cxl_endpoint_decoder *cxled; + + if (is_endpoint_decoder(dev)) { + cxled = to_cxl_endpoint_decoder(dev); + cxl_decoder_kill_region(cxled); + } + device_unregister(dev); } diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index a84b007df4be..51447458a522 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -24,6 +24,7 @@ * but is only visible for persistent regions. * 1. 
Interleave granularity * 2. Interleave size + * 3. Decoder targets */ /* @@ -141,6 +142,8 @@ static ssize_t interleave_ways_show(struct device *dev, return rc; } +static const struct attribute_group *get_cxl_region_target_group(void); + static ssize_t interleave_ways_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) @@ -149,7 +152,7 @@ static ssize_t interleave_ways_store(struct device *dev, struct cxl_decoder *cxld = &cxlrd->cxlsd.cxld; struct cxl_region *cxlr = to_cxl_region(dev); struct cxl_region_params *p = &cxlr->params; - int rc, val; + int rc, val, save; u8 iw; rc = kstrtoint(buf, 0, &val); @@ -178,7 +181,11 @@ static ssize_t interleave_ways_store(struct device *dev, goto out; } + save = p->interleave_ways; p->interleave_ways = val; + rc = sysfs_update_group(&cxlr->dev.kobj, get_cxl_region_target_group()); + if (rc) + p->interleave_ways = save; out: up_write(&cxl_region_rwsem); if (rc) @@ -404,9 +411,262 @@ static const struct attribute_group cxl_region_group = { .is_visible = cxl_region_visible, }; +static size_t show_targetN(struct cxl_region *cxlr, char *buf, int pos) +{ + struct cxl_region_params *p = &cxlr->params; + struct cxl_endpoint_decoder *cxled; + int rc; + + rc = down_read_interruptible(&cxl_region_rwsem); + if (rc) + return rc; + + if (pos >= p->interleave_ways) { + dev_dbg(&cxlr->dev, "position %d out of range %d\n", pos, + p->interleave_ways); + rc = -ENXIO; + goto out; + } + + cxled = p->targets[pos]; + if (!cxled) + rc = sysfs_emit(buf, "\n"); + else + rc = sysfs_emit(buf, "%s\n", dev_name(&cxled->cxld.dev)); +out: + up_read(&cxl_region_rwsem); + + return rc; +} + +/* + * - Check that the given endpoint is attached to a host-bridge identified + * in the root interleave. 
+ */ +static int cxl_region_attach(struct cxl_region *cxlr, + struct cxl_endpoint_decoder *cxled, int pos) +{ + struct cxl_region_params *p = &cxlr->params; + + if (cxled->mode == CXL_DECODER_DEAD) { + dev_dbg(&cxlr->dev, "%s dead\n", dev_name(&cxled->cxld.dev)); + return -ENODEV; + } + + if (pos >= p->interleave_ways) { + dev_dbg(&cxlr->dev, "position %d out of range %d\n", pos, + p->interleave_ways); + return -ENXIO; + } + + if (p->targets[pos] == cxled) + return 0; + + if (p->targets[pos]) { + struct cxl_endpoint_decoder *cxled_target = p->targets[pos]; + struct cxl_memdev *cxlmd_target = cxled_to_memdev(cxled_target); + + dev_dbg(&cxlr->dev, "position %d already assigned to %s:%s\n", + pos, dev_name(&cxlmd_target->dev), + dev_name(&cxled_target->cxld.dev)); + return -EBUSY; + } + + p->targets[pos] = cxled; + cxled->pos = pos; + p->nr_targets++; + + return 0; +} + +static void cxl_region_detach(struct cxl_endpoint_decoder *cxled) +{ + struct cxl_region *cxlr = cxled->cxld.region; + struct cxl_region_params *p; + + lockdep_assert_held_write(&cxl_region_rwsem); + + if (!cxlr) + return; + + p = &cxlr->params; + get_device(&cxlr->dev); + + if (cxled->pos < 0 || cxled->pos >= p->interleave_ways || + p->targets[cxled->pos] != cxled) { + struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); + + dev_WARN_ONCE(&cxlr->dev, 1, "expected %s:%s at position %d\n", + dev_name(&cxlmd->dev), dev_name(&cxled->cxld.dev), + cxled->pos); + goto out; + } + + p->targets[cxled->pos] = NULL; + p->nr_targets--; + + /* notify the region driver that one of its targets has departed */ + up_write(&cxl_region_rwsem); + device_release_driver(&cxlr->dev); + down_write(&cxl_region_rwsem); +out: + put_device(&cxlr->dev); +} + +void cxl_decoder_kill_region(struct cxl_endpoint_decoder *cxled) +{ + down_write(&cxl_region_rwsem); + cxled->mode = CXL_DECODER_DEAD; + cxl_region_detach(cxled); + up_write(&cxl_region_rwsem); +} + +static int attach_target(struct cxl_region *cxlr, const char *decoder, int 
pos) +{ + struct device *dev; + int rc; + + dev = bus_find_device_by_name(&cxl_bus_type, NULL, decoder); + if (!dev) + return -ENODEV; + + if (!is_endpoint_decoder(dev)) { + put_device(dev); + return -EINVAL; + } + + rc = down_write_killable(&cxl_region_rwsem); + if (rc) + goto out; + down_read(&cxl_dpa_rwsem); + rc = cxl_region_attach(cxlr, to_cxl_endpoint_decoder(dev), pos); + up_read(&cxl_dpa_rwsem); + up_write(&cxl_region_rwsem); +out: + put_device(dev); + return rc; +} + +static int detach_target(struct cxl_region *cxlr, int pos) +{ + struct cxl_region_params *p = &cxlr->params; + int rc; + + rc = down_write_killable(&cxl_region_rwsem); + if (rc) + return rc; + + if (pos >= p->interleave_ways) { + dev_dbg(&cxlr->dev, "position %d out of range %d\n", pos, + p->interleave_ways); + rc = -ENXIO; + goto out; + } + + if (!p->targets[pos]) { + rc = 0; + goto out; + } + + cxl_region_detach(p->targets[pos]); + rc = 0; +out: + up_write(&cxl_region_rwsem); + return rc; +} + +static size_t store_targetN(struct cxl_region *cxlr, const char *buf, int pos, + size_t len) +{ + int rc; + + if (sysfs_streq(buf, "\n")) + rc = detach_target(cxlr, pos); + else + rc = attach_target(cxlr, buf, pos); + + if (rc < 0) + return rc; + return len; +} + +#define TARGET_ATTR_RW(n) \ +static ssize_t target##n##_show( \ + struct device *dev, struct device_attribute *attr, char *buf) \ +{ \ + return show_targetN(to_cxl_region(dev), buf, (n)); \ +} \ +static ssize_t target##n##_store(struct device *dev, \ + struct device_attribute *attr, \ + const char *buf, size_t len) \ +{ \ + return store_targetN(to_cxl_region(dev), buf, (n), len); \ +} \ +static DEVICE_ATTR_RW(target##n) + +TARGET_ATTR_RW(0); +TARGET_ATTR_RW(1); +TARGET_ATTR_RW(2); +TARGET_ATTR_RW(3); +TARGET_ATTR_RW(4); +TARGET_ATTR_RW(5); +TARGET_ATTR_RW(6); +TARGET_ATTR_RW(7); +TARGET_ATTR_RW(8); +TARGET_ATTR_RW(9); +TARGET_ATTR_RW(10); +TARGET_ATTR_RW(11); +TARGET_ATTR_RW(12); +TARGET_ATTR_RW(13); +TARGET_ATTR_RW(14); 
+TARGET_ATTR_RW(15); + +static struct attribute *target_attrs[] = { + &dev_attr_target0.attr, + &dev_attr_target1.attr, + &dev_attr_target2.attr, + &dev_attr_target3.attr, + &dev_attr_target4.attr, + &dev_attr_target5.attr, + &dev_attr_target6.attr, + &dev_attr_target7.attr, + &dev_attr_target8.attr, + &dev_attr_target9.attr, + &dev_attr_target10.attr, + &dev_attr_target11.attr, + &dev_attr_target12.attr, + &dev_attr_target13.attr, + &dev_attr_target14.attr, + &dev_attr_target15.attr, + NULL, +}; + +static umode_t cxl_region_target_visible(struct kobject *kobj, + struct attribute *a, int n) +{ + struct device *dev = kobj_to_dev(kobj); + struct cxl_region *cxlr = to_cxl_region(dev); + struct cxl_region_params *p = &cxlr->params; + + if (n < p->interleave_ways) + return a->mode; + return 0; +} + +static const struct attribute_group cxl_region_target_group = { + .attrs = target_attrs, + .is_visible = cxl_region_target_visible, +}; + +static const struct attribute_group *get_cxl_region_target_group(void) +{ + return &cxl_region_target_group; +} + static const struct attribute_group *region_groups[] = { &cxl_base_attribute_group, &cxl_region_group, + &cxl_region_target_group, NULL, }; @@ -566,6 +826,26 @@ static ssize_t create_pmem_region_store(struct device *dev, } DEVICE_ATTR_RW(create_pmem_region); +static ssize_t region_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct cxl_decoder *cxld = to_cxl_decoder(dev); + ssize_t rc; + + rc = down_read_interruptible(&cxl_region_rwsem); + if (rc) + return rc; + + if (cxld->region) + rc = sysfs_emit(buf, "%s\n", dev_name(&cxld->region->dev)); + else + rc = sysfs_emit(buf, "\n"); + up_read(&cxl_region_rwsem); + + return rc; +} +DEVICE_ATTR_RO(region); + static struct cxl_region * cxl_find_region_by_name(struct cxl_root_decoder *cxlrd, const char *name) { diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index 9518b57c19b6..5e84aa2d09e2 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ 
-255,6 +255,7 @@ enum cxl_decoder_type { * @interleave_ways: number of cxl_dports in this decode * @interleave_granularity: data stride per dport * @target_type: accelerator vs expander (type2 vs type3) selector + * @region: currently assigned region for this decoder * @flags: memory type capabilities and locking */ struct cxl_decoder { @@ -264,14 +265,20 @@ struct cxl_decoder { int interleave_ways; int interleave_granularity; enum cxl_decoder_type target_type; + struct cxl_region *region; unsigned long flags; }; +/* + * CXL_DECODER_DEAD prevents endpoints from being reattached to regions + * while cxld_unregister() is running + */ enum cxl_decoder_mode { CXL_DECODER_NONE, CXL_DECODER_RAM, CXL_DECODER_PMEM, CXL_DECODER_MIXED, + CXL_DECODER_DEAD, }; /** @@ -280,12 +287,14 @@ enum cxl_decoder_mode { * @dpa_res: actively claimed DPA span of this decoder * @skip: offset into @dpa_res where @cxld.hpa_range maps * @mode: which memory type / access-mode-partition this decoder targets + * @pos: interleave position in @cxld.region */ struct cxl_endpoint_decoder { struct cxl_decoder cxld; struct resource *dpa_res; resource_size_t skip; enum cxl_decoder_mode mode; + int pos; }; /** @@ -351,6 +360,8 @@ struct cxl_region_params { int interleave_ways; int interleave_granularity; struct resource *res; + struct cxl_endpoint_decoder *targets[CXL_DECODER_MAX_INTERLEAVE]; + int nr_targets; }; /** -- cgit v1.2.3 From 176baefb2eb5d7a3ddebe3ff803db1fce44574b5 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 8 Jun 2022 22:56:37 -0700 Subject: cxl/hdm: Commit decoder state to hardware After all the soft validation of the region has completed, convey the region configuration to hardware while being careful to commit decoders in specification mandated order. In addition to programming the endpoint decoder base-address, interleave ways and granularity, the switch decoder target lists are also established. 
While the kernel can enforce spec-mandated commit order, it can not enforce spec-mandated reset order. For example, the kernel can't stop someone from removing an endpoint device that is occupying decoderN in a switch decoder where decoderN+1 is also committed. To reset decoderN, decoderN+1 must be torn down first. That "tear down the world" implementation is saved for a follow-on patch. Callback operations are provided for the 'commit' and 'reset' operations. While those callbacks may prove useful for CXL accelerators (Type-2 devices with memory) the primary motivation is to enable a simple way for cxl_test to intercept those operations. Reviewed-by: Jonathan Cameron Link: https://lore.kernel.org/r/165784338418.1758207.14659830845389904356.stgit@dwillia2-xfh.jf.intel.com Signed-off-by: Dan Williams --- Documentation/ABI/testing/sysfs-bus-cxl | 16 +++ drivers/cxl/core/hdm.c | 227 ++++++++++++++++++++++++++++++++ drivers/cxl/core/port.c | 1 + drivers/cxl/core/region.c | 194 +++++++++++++++++++++++++-- drivers/cxl/cxl.h | 13 +- tools/testing/cxl/test/cxl.c | 46 +++++++ 6 files changed, 486 insertions(+), 11 deletions(-) (limited to 'Documentation/ABI') diff --git a/Documentation/ABI/testing/sysfs-bus-cxl b/Documentation/ABI/testing/sysfs-bus-cxl index 4ecf6cf40fd3..8494ef27e8d2 100644 --- a/Documentation/ABI/testing/sysfs-bus-cxl +++ b/Documentation/ABI/testing/sysfs-bus-cxl @@ -372,3 +372,19 @@ Description: not an endpoint decoder. Once all positions have been successfully written a final validation for decode conflicts is performed before activating the region. + + +What: /sys/bus/cxl/devices/regionZ/commit +Date: May, 2022 +KernelVersion: v5.20 +Contact: linux-cxl@vger.kernel.org +Description: + (RW) Write a boolean 'true' string value to this attribute to + trigger the region to transition from the software programmed + state to the actively decoding in hardware state. 
The commit + operation, in addition to validating that the region is in a properly + configured state, validates that the decoders are being + committed in spec mandated order (last committed decoder id + + 1), and checks that the hardware accepts the commit request. + Reading this value indicates whether the region is committed or + not. diff --git a/drivers/cxl/core/hdm.c b/drivers/cxl/core/hdm.c index 6b6d3be3a340..ee53e8ac5c96 100644 --- a/drivers/cxl/core/hdm.c +++ b/drivers/cxl/core/hdm.c @@ -129,6 +129,8 @@ struct cxl_hdm *devm_cxl_setup_hdm(struct cxl_port *port) return ERR_PTR(-ENXIO); } + dev_set_drvdata(dev, cxlhdm); + return cxlhdm; } EXPORT_SYMBOL_NS_GPL(devm_cxl_setup_hdm, CXL); @@ -466,6 +468,222 @@ out: return devm_add_action_or_reset(&port->dev, cxl_dpa_release, cxled); } +static void cxld_set_interleave(struct cxl_decoder *cxld, u32 *ctrl) +{ + u16 eig; + u8 eiw; + + /* + * Input validation ensures these warns never fire, but otherwise + * suppress uninitialized variable usage warnings. 
+ */ + if (WARN_ONCE(ways_to_cxl(cxld->interleave_ways, &eiw), + "invalid interleave_ways: %d\n", cxld->interleave_ways)) + return; + if (WARN_ONCE(granularity_to_cxl(cxld->interleave_granularity, &eig), + "invalid interleave_granularity: %d\n", + cxld->interleave_granularity)) + return; + + u32p_replace_bits(ctrl, eig, CXL_HDM_DECODER0_CTRL_IG_MASK); + u32p_replace_bits(ctrl, eiw, CXL_HDM_DECODER0_CTRL_IW_MASK); + *ctrl |= CXL_HDM_DECODER0_CTRL_COMMIT; +} + +static void cxld_set_type(struct cxl_decoder *cxld, u32 *ctrl) +{ + u32p_replace_bits(ctrl, !!(cxld->target_type == 3), + CXL_HDM_DECODER0_CTRL_TYPE); +} + +static void cxld_set_hpa(struct cxl_decoder *cxld, u64 *base, u64 *size) +{ + struct cxl_region *cxlr = cxld->region; + struct cxl_region_params *p = &cxlr->params; + + cxld->hpa_range = (struct range) { + .start = p->res->start, + .end = p->res->end, + }; + + *base = p->res->start; + *size = resource_size(p->res); +} + +static void cxld_clear_hpa(struct cxl_decoder *cxld) +{ + cxld->hpa_range = (struct range) { + .start = 0, + .end = -1, + }; +} + +static int cxlsd_set_targets(struct cxl_switch_decoder *cxlsd, u64 *tgt) +{ + struct cxl_dport **t = &cxlsd->target[0]; + int ways = cxlsd->cxld.interleave_ways; + + if (dev_WARN_ONCE(&cxlsd->cxld.dev, + ways > 8 || ways > cxlsd->nr_targets, + "ways: %d overflows targets: %d\n", ways, + cxlsd->nr_targets)) + return -ENXIO; + + *tgt = FIELD_PREP(GENMASK(7, 0), t[0]->port_id); + if (ways > 1) + *tgt |= FIELD_PREP(GENMASK(15, 8), t[1]->port_id); + if (ways > 2) + *tgt |= FIELD_PREP(GENMASK(23, 16), t[2]->port_id); + if (ways > 3) + *tgt |= FIELD_PREP(GENMASK(31, 24), t[3]->port_id); + if (ways > 4) + *tgt |= FIELD_PREP(GENMASK_ULL(39, 32), t[4]->port_id); + if (ways > 5) + *tgt |= FIELD_PREP(GENMASK_ULL(47, 40), t[5]->port_id); + if (ways > 6) + *tgt |= FIELD_PREP(GENMASK_ULL(55, 48), t[6]->port_id); + if (ways > 7) + *tgt |= FIELD_PREP(GENMASK_ULL(63, 56), t[7]->port_id); + + return 0; +} + +/* + * Per CXL 2.0 
8.2.5.12.20 Committing Decoder Programming, hardware must set + * committed or error within 10ms, but just be generous with 20ms to account for + * clock skew and other marginal behavior + */ +#define COMMIT_TIMEOUT_MS 20 +static int cxld_await_commit(void __iomem *hdm, int id) +{ + u32 ctrl; + int i; + + for (i = 0; i < COMMIT_TIMEOUT_MS; i++) { + ctrl = readl(hdm + CXL_HDM_DECODER0_CTRL_OFFSET(id)); + if (FIELD_GET(CXL_HDM_DECODER0_CTRL_COMMIT_ERROR, ctrl)) { + ctrl &= ~CXL_HDM_DECODER0_CTRL_COMMIT; + writel(ctrl, hdm + CXL_HDM_DECODER0_CTRL_OFFSET(id)); + return -EIO; + } + if (FIELD_GET(CXL_HDM_DECODER0_CTRL_COMMITTED, ctrl)) + return 0; + fsleep(1000); + } + + return -ETIMEDOUT; +} + +static int cxl_decoder_commit(struct cxl_decoder *cxld) +{ + struct cxl_port *port = to_cxl_port(cxld->dev.parent); + struct cxl_hdm *cxlhdm = dev_get_drvdata(&port->dev); + void __iomem *hdm = cxlhdm->regs.hdm_decoder; + int id = cxld->id, rc; + u64 base, size; + u32 ctrl; + + if (cxld->flags & CXL_DECODER_F_ENABLE) + return 0; + + if (port->commit_end + 1 != id) { + dev_dbg(&port->dev, + "%s: out of order commit, expected decoder%d.%d\n", + dev_name(&cxld->dev), port->id, port->commit_end + 1); + return -EBUSY; + } + + down_read(&cxl_dpa_rwsem); + /* common decoder settings */ + ctrl = readl(hdm + CXL_HDM_DECODER0_CTRL_OFFSET(cxld->id)); + cxld_set_interleave(cxld, &ctrl); + cxld_set_type(cxld, &ctrl); + cxld_set_hpa(cxld, &base, &size); + + writel(upper_32_bits(base), hdm + CXL_HDM_DECODER0_BASE_HIGH_OFFSET(id)); + writel(lower_32_bits(base), hdm + CXL_HDM_DECODER0_BASE_LOW_OFFSET(id)); + writel(upper_32_bits(size), hdm + CXL_HDM_DECODER0_SIZE_HIGH_OFFSET(id)); + writel(lower_32_bits(size), hdm + CXL_HDM_DECODER0_SIZE_LOW_OFFSET(id)); + + if (is_switch_decoder(&cxld->dev)) { + struct cxl_switch_decoder *cxlsd = + to_cxl_switch_decoder(&cxld->dev); + void __iomem *tl_hi = hdm + CXL_HDM_DECODER0_TL_HIGH(id); + void __iomem *tl_lo = hdm + CXL_HDM_DECODER0_TL_LOW(id); + u64 
targets; + + rc = cxlsd_set_targets(cxlsd, &targets); + if (rc) { + dev_dbg(&port->dev, "%s: target configuration error\n", + dev_name(&cxld->dev)); + goto err; + } + + writel(upper_32_bits(targets), tl_hi); + writel(lower_32_bits(targets), tl_lo); + } else { + struct cxl_endpoint_decoder *cxled = + to_cxl_endpoint_decoder(&cxld->dev); + void __iomem *sk_hi = hdm + CXL_HDM_DECODER0_SKIP_HIGH(id); + void __iomem *sk_lo = hdm + CXL_HDM_DECODER0_SKIP_LOW(id); + + writel(upper_32_bits(cxled->skip), sk_hi); + writel(lower_32_bits(cxled->skip), sk_lo); + } + + writel(ctrl, hdm + CXL_HDM_DECODER0_CTRL_OFFSET(id)); + up_read(&cxl_dpa_rwsem); + + port->commit_end++; + rc = cxld_await_commit(hdm, cxld->id); +err: + if (rc) { + dev_dbg(&port->dev, "%s: error %d committing decoder\n", + dev_name(&cxld->dev), rc); + cxld->reset(cxld); + return rc; + } + cxld->flags |= CXL_DECODER_F_ENABLE; + + return 0; +} + +static int cxl_decoder_reset(struct cxl_decoder *cxld) +{ + struct cxl_port *port = to_cxl_port(cxld->dev.parent); + struct cxl_hdm *cxlhdm = dev_get_drvdata(&port->dev); + void __iomem *hdm = cxlhdm->regs.hdm_decoder; + int id = cxld->id; + u32 ctrl; + + if ((cxld->flags & CXL_DECODER_F_ENABLE) == 0) + return 0; + + if (port->commit_end != id) { + dev_dbg(&port->dev, + "%s: out of order reset, expected decoder%d.%d\n", + dev_name(&cxld->dev), port->id, port->commit_end); + return -EBUSY; + } + + down_read(&cxl_dpa_rwsem); + ctrl = readl(hdm + CXL_HDM_DECODER0_CTRL_OFFSET(id)); + ctrl &= ~CXL_HDM_DECODER0_CTRL_COMMIT; + writel(ctrl, hdm + CXL_HDM_DECODER0_CTRL_OFFSET(id)); + + cxld_clear_hpa(cxld); + writel(0, hdm + CXL_HDM_DECODER0_SIZE_HIGH_OFFSET(id)); + writel(0, hdm + CXL_HDM_DECODER0_SIZE_LOW_OFFSET(id)); + writel(0, hdm + CXL_HDM_DECODER0_BASE_HIGH_OFFSET(id)); + writel(0, hdm + CXL_HDM_DECODER0_BASE_LOW_OFFSET(id)); + up_read(&cxl_dpa_rwsem); + + port->commit_end--; + cxld->flags &= ~CXL_DECODER_F_ENABLE; + + return 0; +} + static int init_hdm_decoder(struct 
cxl_port *port, struct cxl_decoder *cxld, int *target_map, void __iomem *hdm, int which, u64 *dpa_base) @@ -488,6 +706,8 @@ static int init_hdm_decoder(struct cxl_port *port, struct cxl_decoder *cxld, base = ioread64_hi_lo(hdm + CXL_HDM_DECODER0_BASE_LOW_OFFSET(which)); size = ioread64_hi_lo(hdm + CXL_HDM_DECODER0_SIZE_LOW_OFFSET(which)); committed = !!(ctrl & CXL_HDM_DECODER0_CTRL_COMMITTED); + cxld->commit = cxl_decoder_commit; + cxld->reset = cxl_decoder_reset; if (!committed) size = 0; @@ -511,6 +731,13 @@ static int init_hdm_decoder(struct cxl_port *port, struct cxl_decoder *cxld, cxld->target_type = CXL_DECODER_EXPANDER; else cxld->target_type = CXL_DECODER_ACCELERATOR; + if (cxld->id != port->commit_end + 1) { + dev_warn(&port->dev, + "decoder%d.%d: Committed out of order\n", + port->id, cxld->id); + return -ENXIO; + } + port->commit_end = cxld->id; } else { /* unless / until type-2 drivers arrive, assume type-3 */ if (FIELD_GET(CXL_HDM_DECODER0_CTRL_TYPE, ctrl) == 0) { diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c index 9b9ac08ecbc7..40b4a72d6539 100644 --- a/drivers/cxl/core/port.c +++ b/drivers/cxl/core/port.c @@ -632,6 +632,7 @@ static struct cxl_port *cxl_port_alloc(struct device *uport, port->component_reg_phys = component_reg_phys; ida_init(&port->decoder_ida); port->hdm_end = -1; + port->commit_end = -1; xa_init(&port->dports); xa_init(&port->endpoints); xa_init(&port->regions); diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c index bd661d025a23..800085300779 100644 --- a/drivers/cxl/core/region.c +++ b/drivers/cxl/core/region.c @@ -115,6 +115,173 @@ out: } static DEVICE_ATTR_RW(uuid); +static struct cxl_region_ref *cxl_rr_load(struct cxl_port *port, + struct cxl_region *cxlr) +{ + return xa_load(&port->regions, (unsigned long)cxlr); +} + +static int cxl_region_decode_reset(struct cxl_region *cxlr, int count) +{ + struct cxl_region_params *p = &cxlr->params; + int i; + + for (i = count - 1; i >= 0; i--) { + struct 
cxl_endpoint_decoder *cxled = p->targets[i]; + struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); + struct cxl_port *iter = cxled_to_port(cxled); + struct cxl_ep *ep; + int rc; + + while (!is_cxl_root(to_cxl_port(iter->dev.parent))) + iter = to_cxl_port(iter->dev.parent); + + for (ep = cxl_ep_load(iter, cxlmd); iter; + iter = ep->next, ep = cxl_ep_load(iter, cxlmd)) { + struct cxl_region_ref *cxl_rr; + struct cxl_decoder *cxld; + + cxl_rr = cxl_rr_load(iter, cxlr); + cxld = cxl_rr->decoder; + rc = cxld->reset(cxld); + if (rc) + return rc; + } + + rc = cxled->cxld.reset(&cxled->cxld); + if (rc) + return rc; + } + + return 0; +} + +static int cxl_region_decode_commit(struct cxl_region *cxlr) +{ + struct cxl_region_params *p = &cxlr->params; + int i, rc; + + for (i = 0; i < p->nr_targets; i++) { + struct cxl_endpoint_decoder *cxled = p->targets[i]; + struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); + struct cxl_region_ref *cxl_rr; + struct cxl_decoder *cxld; + struct cxl_port *iter; + struct cxl_ep *ep; + + /* commit bottom up */ + for (iter = cxled_to_port(cxled); !is_cxl_root(iter); + iter = to_cxl_port(iter->dev.parent)) { + cxl_rr = cxl_rr_load(iter, cxlr); + cxld = cxl_rr->decoder; + rc = cxld->commit(cxld); + if (rc) + break; + } + + /* success, all decoders up to the root are programmed */ + if (is_cxl_root(iter)) + continue; + + /* programming @iter failed, teardown */ + for (ep = cxl_ep_load(iter, cxlmd); ep && iter; + iter = ep->next, ep = cxl_ep_load(iter, cxlmd)) { + cxl_rr = cxl_rr_load(iter, cxlr); + cxld = cxl_rr->decoder; + cxld->reset(cxld); + } + + cxled->cxld.reset(&cxled->cxld); + if (i == 0) + return rc; + break; + } + + if (i >= p->nr_targets) + return 0; + + /* undo the targets that were successfully committed */ + cxl_region_decode_reset(cxlr, i); + return rc; +} + +static ssize_t commit_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t len) +{ + struct cxl_region *cxlr = to_cxl_region(dev); + struct 
cxl_region_params *p = &cxlr->params; + bool commit; + ssize_t rc; + + rc = kstrtobool(buf, &commit); + if (rc) + return rc; + + rc = down_write_killable(&cxl_region_rwsem); + if (rc) + return rc; + + /* Already in the requested state? */ + if (commit && p->state >= CXL_CONFIG_COMMIT) + goto out; + if (!commit && p->state < CXL_CONFIG_COMMIT) + goto out; + + /* Not ready to commit? */ + if (commit && p->state < CXL_CONFIG_ACTIVE) { + rc = -ENXIO; + goto out; + } + + if (commit) + rc = cxl_region_decode_commit(cxlr); + else { + p->state = CXL_CONFIG_RESET_PENDING; + up_write(&cxl_region_rwsem); + device_release_driver(&cxlr->dev); + down_write(&cxl_region_rwsem); + + /* + * The lock was dropped, so need to revalidate that the reset is + * still pending. + */ + if (p->state == CXL_CONFIG_RESET_PENDING) + rc = cxl_region_decode_reset(cxlr, p->interleave_ways); + } + + if (rc) + goto out; + + if (commit) + p->state = CXL_CONFIG_COMMIT; + else if (p->state == CXL_CONFIG_RESET_PENDING) + p->state = CXL_CONFIG_ACTIVE; + +out: + up_write(&cxl_region_rwsem); + + if (rc) + return rc; + return len; +} + +static ssize_t commit_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct cxl_region *cxlr = to_cxl_region(dev); + struct cxl_region_params *p = &cxlr->params; + ssize_t rc; + + rc = down_read_interruptible(&cxl_region_rwsem); + if (rc) + return rc; + rc = sysfs_emit(buf, "%d\n", p->state >= CXL_CONFIG_COMMIT); + up_read(&cxl_region_rwsem); + + return rc; +} +static DEVICE_ATTR_RW(commit); + static umode_t cxl_region_visible(struct kobject *kobj, struct attribute *a, int n) { @@ -399,6 +566,7 @@ static DEVICE_ATTR_RW(size); static struct attribute *cxl_region_attrs[] = { &dev_attr_uuid.attr, + &dev_attr_commit.attr, &dev_attr_interleave_ways.attr, &dev_attr_interleave_granularity.attr, &dev_attr_resource.attr, @@ -675,12 +843,6 @@ out_erase: return rc; } -static struct cxl_region_ref *cxl_rr_load(struct cxl_port *port, - struct cxl_region *cxlr) 
-{ - return xa_load(&port->regions, (unsigned long)cxlr); -} - static void cxl_port_detach_region(struct cxl_port *port, struct cxl_region *cxlr, struct cxl_endpoint_decoder *cxled) @@ -1068,20 +1230,32 @@ err: return rc; } -static void cxl_region_detach(struct cxl_endpoint_decoder *cxled) +static int cxl_region_detach(struct cxl_endpoint_decoder *cxled) { struct cxl_port *iter, *ep_port = cxled_to_port(cxled); struct cxl_region *cxlr = cxled->cxld.region; struct cxl_region_params *p; + int rc = 0; lockdep_assert_held_write(&cxl_region_rwsem); if (!cxlr) - return; + return 0; p = &cxlr->params; get_device(&cxlr->dev); + if (p->state > CXL_CONFIG_ACTIVE) { + /* + * TODO: tear down all impacted regions if a device is + * removed out of order + */ + rc = cxl_region_decode_reset(cxlr, p->interleave_ways); + if (rc) + goto out; + p->state = CXL_CONFIG_ACTIVE; + } + for (iter = ep_port; !is_cxl_root(iter); iter = to_cxl_port(iter->dev.parent)) cxl_port_detach_region(iter, cxlr, cxled); @@ -1109,6 +1283,7 @@ static void cxl_region_detach(struct cxl_endpoint_decoder *cxled) down_write(&cxl_region_rwsem); out: put_device(&cxlr->dev); + return rc; } void cxl_decoder_kill_region(struct cxl_endpoint_decoder *cxled) @@ -1166,8 +1341,7 @@ static int detach_target(struct cxl_region *cxlr, int pos) goto out; } - cxl_region_detach(p->targets[pos]); - rc = 0; + rc = cxl_region_detach(p->targets[pos]); out: up_write(&cxl_region_rwsem); return rc; diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index 2ed5369c0974..01095d78784d 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -54,6 +54,7 @@ #define CXL_HDM_DECODER0_CTRL_LOCK BIT(8) #define CXL_HDM_DECODER0_CTRL_COMMIT BIT(9) #define CXL_HDM_DECODER0_CTRL_COMMITTED BIT(10) +#define CXL_HDM_DECODER0_CTRL_COMMIT_ERROR BIT(11) #define CXL_HDM_DECODER0_CTRL_TYPE BIT(12) #define CXL_HDM_DECODER0_TL_LOW(i) (0x20 * (i) + 0x24) #define CXL_HDM_DECODER0_TL_HIGH(i) (0x20 * (i) + 0x28) @@ -257,7 +258,9 @@ enum cxl_decoder_type { * 
@target_type: accelerator vs expander (type2 vs type3) selector * @region: currently assigned region for this decoder * @flags: memory type capabilities and locking - */ + * @commit: device/decoder-type specific callback to commit settings to hw + * @reset: device/decoder-type specific callback to reset hw settings +*/ struct cxl_decoder { struct device dev; int id; @@ -267,6 +270,8 @@ struct cxl_decoder { enum cxl_decoder_type target_type; struct cxl_region *region; unsigned long flags; + int (*commit)(struct cxl_decoder *cxld); + int (*reset)(struct cxl_decoder *cxld); }; /* @@ -339,11 +344,15 @@ struct cxl_root_decoder { * changes to interleave_ways or interleave_granularity * @CXL_CONFIG_ACTIVE: All targets have been added the region is now * active + * @CXL_CONFIG_RESET_PENDING: see commit_store() + * @CXL_CONFIG_COMMIT: Soft-config has been committed to hardware */ enum cxl_config_state { CXL_CONFIG_IDLE, CXL_CONFIG_INTERLEAVE_ACTIVE, CXL_CONFIG_ACTIVE, + CXL_CONFIG_RESET_PENDING, + CXL_CONFIG_COMMIT, }; /** @@ -425,6 +434,7 @@ struct cxl_nvdimm { * @parent_dport: dport that points to this port in the parent * @decoder_ida: allocator for decoder ids * @hdm_end: track last allocated HDM decoder instance for allocation ordering + * @commit_end: cursor to track highest committed decoder for commit ordering * @component_reg_phys: component register capability base address (optional) * @dead: last ep has been removed, force port re-creation * @depth: How deep this port is relative to the root. depth 0 is the root. 
@@ -442,6 +452,7 @@ struct cxl_port { struct cxl_dport *parent_dport; struct ida decoder_ida; int hdm_end; + int commit_end; resource_size_t component_reg_phys; bool dead; unsigned int depth; diff --git a/tools/testing/cxl/test/cxl.c b/tools/testing/cxl/test/cxl.c index 4dad0fa7ac4c..a072b2d3e726 100644 --- a/tools/testing/cxl/test/cxl.c +++ b/tools/testing/cxl/test/cxl.c @@ -429,6 +429,50 @@ static int map_targets(struct device *dev, void *data) return 0; } +static int mock_decoder_commit(struct cxl_decoder *cxld) +{ + struct cxl_port *port = to_cxl_port(cxld->dev.parent); + int id = cxld->id; + + if (cxld->flags & CXL_DECODER_F_ENABLE) + return 0; + + dev_dbg(&port->dev, "%s commit\n", dev_name(&cxld->dev)); + if (port->commit_end + 1 != id) { + dev_dbg(&port->dev, + "%s: out of order commit, expected decoder%d.%d\n", + dev_name(&cxld->dev), port->id, port->commit_end + 1); + return -EBUSY; + } + + port->commit_end++; + cxld->flags |= CXL_DECODER_F_ENABLE; + + return 0; +} + +static int mock_decoder_reset(struct cxl_decoder *cxld) +{ + struct cxl_port *port = to_cxl_port(cxld->dev.parent); + int id = cxld->id; + + if ((cxld->flags & CXL_DECODER_F_ENABLE) == 0) + return 0; + + dev_dbg(&port->dev, "%s reset\n", dev_name(&cxld->dev)); + if (port->commit_end != id) { + dev_dbg(&port->dev, + "%s: out of order reset, expected decoder%d.%d\n", + dev_name(&cxld->dev), port->id, port->commit_end); + return -EBUSY; + } + + port->commit_end--; + cxld->flags &= ~CXL_DECODER_F_ENABLE; + + return 0; +} + static int mock_cxl_enumerate_decoders(struct cxl_hdm *cxlhdm) { struct cxl_port *port = cxlhdm->port; @@ -482,6 +526,8 @@ static int mock_cxl_enumerate_decoders(struct cxl_hdm *cxlhdm) cxld->interleave_ways = min_not_zero(target_count, 1); cxld->interleave_granularity = SZ_4K; cxld->target_type = CXL_DECODER_EXPANDER; + cxld->commit = mock_decoder_commit; + cxld->reset = mock_decoder_reset; if (target_count) { rc = device_for_each_child(port->uport, &ctx, -- cgit v1.2.3