author    Linus Torvalds <torvalds@linux-foundation.org> 2022-03-24 18:07:03 -0700
committer Linus Torvalds <torvalds@linux-foundation.org> 2022-03-24 18:07:03 -0700
commit    b9132c32e01976686efa26252cc246944a0d2cab (patch)
tree      fe93979a569002f6f9a4188b1f21d475bbdec82b /drivers/cxl/pci.c
parent    b14ffae378aa1db993e62b01392e70d1e585fb23 (diff)
parent    05e815539f3f161585c13a9ab023341bade2c52f (diff)
download  linux-b9132c32e01976686efa26252cc246944a0d2cab.tar.bz2

Merge tag 'cxl-for-5.18' of git://git.kernel.org/pub/scm/linux/kernel/git/cxl/cxl

Pull CXL (Compute Express Link) updates from Dan Williams:
 "This development cycle extends the subsystem to discover CXL
  resources throughout a CXL/PCIe switch topology and respond to hot
  add/remove events anywhere in that topology. This is more
  foundational infrastructure in preparation for dynamic memory region
  provisioning support. Recall that CXL memory regions, as the new
  "Theory of Operation" section of
  Documentation/driver-api/cxl/memory-devices.rst describes, bring
  storage volume striping semantics to memory.

  The hot add/remove behavior is validated with extensions to the
  cxl_test unit test environment and this test in the cxl-cli test
  suite:

    https://github.com/pmem/ndctl/blob/djbw/for-74/cxl/test/cxl-topology.sh

  Summary:

   - Add a driver for 'struct cxl_memdev' objects responsible for
     CXL.mem operation as distinct from 'cxl_pci' mailbox operations.
     Its primary responsibility is enumerating an endpoint 'struct
     cxl_port' and all the 'struct cxl_port' instances between an
     endpoint and the CXL platform root.

   - Add a driver for 'struct cxl_port' objects responsible for
     enumerating and operating all Host-managed Device Memory (HDM)
     decoder resources between the platform-level CXL memory
     description, all intervening host bridges / switches, and the HDM
     resources in endpoints.

   - Update the cxl_pci driver to validate CXL.mem operation precursors
     to HDM decoder operation like ready-polling, and legacy CXL 1.1
     DVSEC based CXL.mem configuration.

   - Add basic lockdep coverage for usage of device_lock() on CXL
     subsystem objects similar to what exists for LIBNVDIMM. Include a
     compile-time switch for which subsystem to validate at run-time.

   - Update cxl_test to emulate a one level switch topology.

   - Document a "Theory of Operation" for the subsystem.

   - Add 'numa_node' and 'serial' attributes to cxl_memdev sysfs

   - Include miscellaneous fixes for spec / QEMU CXL emulation
     compatibility and static analysis reports"

* tag 'cxl-for-5.18' of git://git.kernel.org/pub/scm/linux/kernel/git/cxl/cxl: (48 commits)
  cxl/core/port: Fix NULL but dereferenced coccicheck error
  cxl/port: Hold port reference until decoder release
  cxl/port: Fix endpoint refcount leak
  cxl/core: Fix cxl_device_lock() class detection
  cxl/core/port: Fix unregister_port() lock assertion
  cxl/regs: Fix size of CXL Capability Header Register
  cxl/core/port: Handle invalid decoders
  cxl/core/port: Fix / relax decoder target enumeration
  tools/testing/cxl: Add a physical_node link
  tools/testing/cxl: Enumerate mock decoders
  tools/testing/cxl: Mock one level of switches
  tools/testing/cxl: Fix root port to host bridge assignment
  tools/testing/cxl: Mock dvsec_ranges()
  cxl/core/port: Add endpoint decoders
  cxl/core: Move target_list out of base decoder attributes
  cxl/mem: Add the cxl_mem driver
  cxl/core/port: Add switch port enumeration
  cxl/memdev: Add numa_node attribute
  cxl/pci: Emit device serial number
  cxl/pci: Implement wait for media active
  ...
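As a quick illustration of the new sysfs surface mentioned above, the sketch below reads the 'serial' and 'numa_node' attributes from userspace. This is a hedged example: the memdev name "mem0" and the /sys/bus/cxl/devices path are assumptions for illustration, not something this merge guarantees on any given system.

    /*
     * Minimal userspace sketch: dump the new cxl_memdev attributes.
     * Assumes a memdev enumerated as "mem0".
     */
    #include <stdio.h>

    static void show(const char *attr)
    {
            char path[128], buf[64];
            FILE *f;

            snprintf(path, sizeof(path), "/sys/bus/cxl/devices/mem0/%s", attr);
            f = fopen(path, "r");
            if (!f || !fgets(buf, sizeof(buf), f)) {
                    fprintf(stderr, "failed to read %s\n", path);
                    if (f)
                            fclose(f);
                    return;
            }
            printf("%s: %s", attr, buf);
            fclose(f);
    }

    int main(void)
    {
            show("serial");
            show("numa_node");
            return 0;
    }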
Diffstat (limited to 'drivers/cxl/pci.c')
-rw-r--r--  drivers/cxl/pci.c  383
1 file changed, 241 insertions(+), 142 deletions(-)
diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
index 8dc91fd3396a..8a7267d116b7 100644
--- a/drivers/cxl/pci.c
+++ b/drivers/cxl/pci.c
@@ -1,14 +1,16 @@
// SPDX-License-Identifier: GPL-2.0-only
/* Copyright(c) 2020 Intel Corporation. All rights reserved. */
#include <linux/io-64-nonatomic-lo-hi.h>
+#include <linux/moduleparam.h>
#include <linux/module.h>
+#include <linux/delay.h>
#include <linux/sizes.h>
#include <linux/mutex.h>
#include <linux/list.h>
#include <linux/pci.h>
#include <linux/io.h>
#include "cxlmem.h"
-#include "pci.h"
+#include "cxlpci.h"
#include "cxl.h"
/**
@@ -35,6 +37,20 @@
/* CXL 2.0 - 8.2.8.4 */
#define CXL_MAILBOX_TIMEOUT_MS (2 * HZ)
+/*
+ * CXL 2.0 ECN "Add Mailbox Ready Time" defines a capability field to
+ * dictate how long to wait for the mailbox to become ready. The new
+ * field allows the device to tell software the amount of time to wait
+ * before mailbox ready. This field per the spec theoretically allows
+ * for up to 255 seconds. 255 seconds is unreasonably long, it's longer
+ * than the maximum SATA port link recovery wait. Default to 60 seconds
+ * until someone builds a CXL device that needs more time in practice.
+ */
+static unsigned short mbox_ready_timeout = 60;
+module_param(mbox_ready_timeout, ushort, 0644);
+MODULE_PARM_DESC(mbox_ready_timeout,
+ "seconds to wait for mailbox ready / memory active status");
+
static int cxl_pci_mbox_wait_for_doorbell(struct cxl_dev_state *cxlds)
{
const unsigned long start = jiffies;
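Since mbox_ready_timeout is declared with 0644 permissions, it can be adjusted at runtime. Below is a minimal userspace sketch, assuming the driver is loaded as the cxl_pci module so the parameter appears under the standard /sys/module/<name>/parameters/ layout, and that the caller has root privileges:

    /* Sketch: raise the mailbox-ready timeout to 120 seconds. */
    #include <stdio.h>

    int main(void)
    {
            const char *path =
                    "/sys/module/cxl_pci/parameters/mbox_ready_timeout";
            FILE *f = fopen(path, "w");

            if (!f) {
                    perror(path);
                    return 1;
            }
            fprintf(f, "120\n");    /* seconds to wait for mailbox ready */
            return fclose(f) ? 1 : 0;
    }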
@@ -57,14 +73,16 @@ static int cxl_pci_mbox_wait_for_doorbell(struct cxl_dev_state *cxlds)
return 0;
}
-static void cxl_pci_mbox_timeout(struct cxl_dev_state *cxlds,
- struct cxl_mbox_cmd *mbox_cmd)
-{
- struct device *dev = cxlds->dev;
+#define cxl_err(dev, status, msg) \
+ dev_err_ratelimited(dev, msg ", device state %s%s\n", \
+ status & CXLMDEV_DEV_FATAL ? " fatal" : "", \
+ status & CXLMDEV_FW_HALT ? " firmware-halt" : "")
- dev_dbg(dev, "Mailbox command (opcode: %#x size: %zub) timed out\n",
- mbox_cmd->opcode, mbox_cmd->size_in);
-}
+#define cxl_cmd_err(dev, cmd, status, msg) \
+ dev_err_ratelimited(dev, msg " (opcode: %#x), device state %s%s\n", \
+ (cmd)->opcode, \
+ status & CXLMDEV_DEV_FATAL ? " fatal" : "", \
+ status & CXLMDEV_FW_HALT ? " firmware-halt" : "")
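For reference, a standalone sketch that mimics the cxl_cmd_err() format string, showing what a report looks like when only the firmware-halt bit is set. The bit positions and the opcode value are illustrative, not the real CXLMDEV_* layout:

    #include <stdint.h>
    #include <stdio.h>

    /* Illustrative bit positions, not the real register layout. */
    #define DEV_FATAL   (1u << 0)
    #define FW_HALT     (1u << 1)

    int main(void)
    {
            uint64_t status = FW_HALT;
            uint16_t opcode = 0x4000;       /* hypothetical opcode */

            /* Same format-string shape as cxl_cmd_err() above; note the
             * status strings carry their own leading space. */
            printf("mailbox timeout (opcode: %#x), device state %s%s\n",
                   opcode,
                   status & DEV_FATAL ? " fatal" : "",
                   status & FW_HALT ? " firmware-halt" : "");
            return 0;
    }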
/**
* __cxl_pci_mbox_send_cmd() - Execute a mailbox command
@@ -118,7 +136,11 @@ static int __cxl_pci_mbox_send_cmd(struct cxl_dev_state *cxlds,
/* #1 */
if (cxl_doorbell_busy(cxlds)) {
- dev_err_ratelimited(dev, "Mailbox re-busy after acquiring\n");
+ u64 md_status =
+ readq(cxlds->regs.memdev + CXLMDEV_STATUS_OFFSET);
+
+ cxl_cmd_err(cxlds->dev, mbox_cmd, md_status,
+ "mailbox queue busy");
return -EBUSY;
}
@@ -144,7 +166,9 @@ static int __cxl_pci_mbox_send_cmd(struct cxl_dev_state *cxlds,
/* #5 */
rc = cxl_pci_mbox_wait_for_doorbell(cxlds);
if (rc == -ETIMEDOUT) {
- cxl_pci_mbox_timeout(cxlds, mbox_cmd);
+ u64 md_status = readq(cxlds->regs.memdev + CXLMDEV_STATUS_OFFSET);
+
+ cxl_cmd_err(cxlds->dev, mbox_cmd, md_status, "mailbox timeout");
return rc;
}
@@ -182,98 +206,13 @@ static int __cxl_pci_mbox_send_cmd(struct cxl_dev_state *cxlds,
return 0;
}
-/**
- * cxl_pci_mbox_get() - Acquire exclusive access to the mailbox.
- * @cxlds: The device state to gain access to.
- *
- * Context: Any context. Takes the mbox_mutex.
- * Return: 0 if exclusive access was acquired.
- */
-static int cxl_pci_mbox_get(struct cxl_dev_state *cxlds)
-{
- struct device *dev = cxlds->dev;
- u64 md_status;
- int rc;
-
- mutex_lock_io(&cxlds->mbox_mutex);
-
- /*
- * XXX: There is some amount of ambiguity in the 2.0 version of the spec
- * around the mailbox interface ready (8.2.8.5.1.1). The purpose of the
- * bit is to allow firmware running on the device to notify the driver
- * that it's ready to receive commands. It is unclear if the bit needs
- * to be read for each transaction mailbox, ie. the firmware can switch
- * it on and off as needed. Second, there is no defined timeout for
- * mailbox ready, like there is for the doorbell interface.
- *
- * Assumptions:
- * 1. The firmware might toggle the Mailbox Interface Ready bit, check
- * it for every command.
- *
- * 2. If the doorbell is clear, the firmware should have first set the
- * Mailbox Interface Ready bit. Therefore, waiting for the doorbell
- * to be ready is sufficient.
- */
- rc = cxl_pci_mbox_wait_for_doorbell(cxlds);
- if (rc) {
- dev_warn(dev, "Mailbox interface not ready\n");
- goto out;
- }
-
- md_status = readq(cxlds->regs.memdev + CXLMDEV_STATUS_OFFSET);
- if (!(md_status & CXLMDEV_MBOX_IF_READY && CXLMDEV_READY(md_status))) {
- dev_err(dev, "mbox: reported doorbell ready, but not mbox ready\n");
- rc = -EBUSY;
- goto out;
- }
-
- /*
- * Hardware shouldn't allow a ready status but also have failure bits
- * set. Spit out an error, this should be a bug report
- */
- rc = -EFAULT;
- if (md_status & CXLMDEV_DEV_FATAL) {
- dev_err(dev, "mbox: reported ready, but fatal\n");
- goto out;
- }
- if (md_status & CXLMDEV_FW_HALT) {
- dev_err(dev, "mbox: reported ready, but halted\n");
- goto out;
- }
- if (CXLMDEV_RESET_NEEDED(md_status)) {
- dev_err(dev, "mbox: reported ready, but reset needed\n");
- goto out;
- }
-
- /* with lock held */
- return 0;
-
-out:
- mutex_unlock(&cxlds->mbox_mutex);
- return rc;
-}
-
-/**
- * cxl_pci_mbox_put() - Release exclusive access to the mailbox.
- * @cxlds: The device state to communicate with.
- *
- * Context: Any context. Expects mbox_mutex to be held.
- */
-static void cxl_pci_mbox_put(struct cxl_dev_state *cxlds)
-{
- mutex_unlock(&cxlds->mbox_mutex);
-}
-
static int cxl_pci_mbox_send(struct cxl_dev_state *cxlds, struct cxl_mbox_cmd *cmd)
{
int rc;
- rc = cxl_pci_mbox_get(cxlds);
- if (rc)
- return rc;
-
+ mutex_lock_io(&cxlds->mbox_mutex);
rc = __cxl_pci_mbox_send_cmd(cxlds, cmd);
- cxl_pci_mbox_put(cxlds);
+ mutex_unlock(&cxlds->mbox_mutex);
return rc;
}
@@ -281,6 +220,34 @@ static int cxl_pci_mbox_send(struct cxl_dev_state *cxlds, struct cxl_mbox_cmd *c
static int cxl_pci_setup_mailbox(struct cxl_dev_state *cxlds)
{
const int cap = readl(cxlds->regs.mbox + CXLDEV_MBOX_CAPS_OFFSET);
+ unsigned long timeout;
+ u64 md_status;
+
+ timeout = jiffies + mbox_ready_timeout * HZ;
+ do {
+ md_status = readq(cxlds->regs.memdev + CXLMDEV_STATUS_OFFSET);
+ if (md_status & CXLMDEV_MBOX_IF_READY)
+ break;
+ if (msleep_interruptible(100))
+ break;
+ } while (!time_after(jiffies, timeout));
+
+ if (!(md_status & CXLMDEV_MBOX_IF_READY)) {
+ cxl_err(cxlds->dev, md_status,
+ "timeout awaiting mailbox ready");
+ return -ETIMEDOUT;
+ }
+
+ /*
+ * A command may be in flight from a previous driver instance,
+ * think kexec, do one doorbell wait so that
+ * __cxl_pci_mbox_send_cmd() can assume that it is the only
+ * source for future doorbell busy events.
+ */
+ if (cxl_pci_mbox_wait_for_doorbell(cxlds) != 0) {
+ cxl_err(cxlds->dev, md_status, "timeout awaiting mailbox idle");
+ return -ETIMEDOUT;
+ }
cxlds->mbox_send = cxl_pci_mbox_send;
cxlds->payload_size =
@@ -400,75 +367,182 @@ static int cxl_map_regs(struct cxl_dev_state *cxlds, struct cxl_register_map *ma
return 0;
}
-static void cxl_decode_regblock(u32 reg_lo, u32 reg_hi,
- struct cxl_register_map *map)
+static int cxl_setup_regs(struct pci_dev *pdev, enum cxl_regloc_type type,
+ struct cxl_register_map *map)
{
- map->block_offset =
- ((u64)reg_hi << 32) | (reg_lo & CXL_REGLOC_ADDR_MASK);
- map->barno = FIELD_GET(CXL_REGLOC_BIR_MASK, reg_lo);
- map->reg_type = FIELD_GET(CXL_REGLOC_RBI_MASK, reg_lo);
+ int rc;
+
+ rc = cxl_find_regblock(pdev, type, map);
+ if (rc)
+ return rc;
+
+ rc = cxl_map_regblock(pdev, map);
+ if (rc)
+ return rc;
+
+ rc = cxl_probe_regs(pdev, map);
+ cxl_unmap_regblock(pdev, map);
+
+ return rc;
}
-/**
- * cxl_find_regblock() - Locate register blocks by type
- * @pdev: The CXL PCI device to enumerate.
- * @type: Register Block Indicator id
- * @map: Enumeration output, clobbered on error
- *
- * Return: 0 if register block enumerated, negative error code otherwise
- *
- * A CXL DVSEC may point to one or more register blocks, search for them
- * by @type.
- */
-static int cxl_find_regblock(struct pci_dev *pdev, enum cxl_regloc_type type,
- struct cxl_register_map *map)
+static int wait_for_valid(struct cxl_dev_state *cxlds)
{
- u32 regloc_size, regblocks;
- int regloc, i;
+ struct pci_dev *pdev = to_pci_dev(cxlds->dev);
+ int d = cxlds->cxl_dvsec, rc;
+ u32 val;
- regloc = pci_find_dvsec_capability(pdev, PCI_DVSEC_VENDOR_ID_CXL,
- PCI_DVSEC_ID_CXL_REGLOC_DVSEC_ID);
- if (!regloc)
- return -ENXIO;
+ /*
+ * Memory_Info_Valid: When set, indicates that the CXL Range 1 Size high
+ * and Size Low registers are valid. Must be set within 1 second of
+ * deassertion of reset to CXL device. Likely it is already set by the
+ * time this runs, but otherwise give a 1.5 second timeout in case of
+ * clock skew.
+ */
+ rc = pci_read_config_dword(pdev, d + CXL_DVSEC_RANGE_SIZE_LOW(0), &val);
+ if (rc)
+ return rc;
+
+ if (val & CXL_DVSEC_MEM_INFO_VALID)
+ return 0;
+
+ msleep(1500);
+
+ rc = pci_read_config_dword(pdev, d + CXL_DVSEC_RANGE_SIZE_LOW(0), &val);
+ if (rc)
+ return rc;
- pci_read_config_dword(pdev, regloc + PCI_DVSEC_HEADER1, &regloc_size);
- regloc_size = FIELD_GET(PCI_DVSEC_HEADER1_LENGTH_MASK, regloc_size);
+ if (val & CXL_DVSEC_MEM_INFO_VALID)
+ return 0;
- regloc += PCI_DVSEC_ID_CXL_REGLOC_BLOCK1_OFFSET;
- regblocks = (regloc_size - PCI_DVSEC_ID_CXL_REGLOC_BLOCK1_OFFSET) / 8;
+ return -ETIMEDOUT;
+}
+
+/*
+ * Wait up to @mbox_ready_timeout for the device to report memory
+ * active.
+ */
+static int wait_for_media_ready(struct cxl_dev_state *cxlds)
+{
+ struct pci_dev *pdev = to_pci_dev(cxlds->dev);
+ int d = cxlds->cxl_dvsec;
+ bool active = false;
+ u64 md_status;
+ int rc, i;
+
+ rc = wait_for_valid(cxlds);
+ if (rc)
+ return rc;
- for (i = 0; i < regblocks; i++, regloc += 8) {
- u32 reg_lo, reg_hi;
+ for (i = mbox_ready_timeout; i; i--) {
+ u32 temp;
+ int rc;
- pci_read_config_dword(pdev, regloc, &reg_lo);
- pci_read_config_dword(pdev, regloc + 4, &reg_hi);
+ rc = pci_read_config_dword(
+ pdev, d + CXL_DVSEC_RANGE_SIZE_LOW(0), &temp);
+ if (rc)
+ return rc;
- cxl_decode_regblock(reg_lo, reg_hi, map);
+ active = FIELD_GET(CXL_DVSEC_MEM_ACTIVE, temp);
+ if (active)
+ break;
+ msleep(1000);
+ }
- if (map->reg_type == type)
- return 0;
+ if (!active) {
+ dev_err(&pdev->dev,
+ "timeout awaiting memory active after %d seconds\n",
+ mbox_ready_timeout);
+ return -ETIMEDOUT;
}
- return -ENODEV;
+ md_status = readq(cxlds->regs.memdev + CXLMDEV_STATUS_OFFSET);
+ if (!CXLMDEV_READY(md_status))
+ return -EIO;
+
+ return 0;
}
-static int cxl_setup_regs(struct pci_dev *pdev, enum cxl_regloc_type type,
- struct cxl_register_map *map)
+static int cxl_dvsec_ranges(struct cxl_dev_state *cxlds)
{
- int rc;
+ struct cxl_endpoint_dvsec_info *info = &cxlds->info;
+ struct pci_dev *pdev = to_pci_dev(cxlds->dev);
+ int d = cxlds->cxl_dvsec;
+ int hdm_count, rc, i;
+ u16 cap, ctrl;
- rc = cxl_find_regblock(pdev, type, map);
+ if (!d)
+ return -ENXIO;
+
+ rc = pci_read_config_word(pdev, d + CXL_DVSEC_CAP_OFFSET, &cap);
if (rc)
return rc;
- rc = cxl_map_regblock(pdev, map);
+ rc = pci_read_config_word(pdev, d + CXL_DVSEC_CTRL_OFFSET, &ctrl);
if (rc)
return rc;
- rc = cxl_probe_regs(pdev, map);
- cxl_unmap_regblock(pdev, map);
+ if (!(cap & CXL_DVSEC_MEM_CAPABLE))
+ return -ENXIO;
- return rc;
+ /*
+ * It is not allowed by spec for MEM.capable to be set and have 0 legacy
+ * HDM decoders (values > 2 are also undefined as of CXL 2.0). As this
+ * driver is for a spec defined class code which must be CXL.mem
+ * capable, there is no point in continuing to enable CXL.mem.
+ */
+ hdm_count = FIELD_GET(CXL_DVSEC_HDM_COUNT_MASK, cap);
+ if (!hdm_count || hdm_count > 2)
+ return -EINVAL;
+
+ rc = wait_for_valid(cxlds);
+ if (rc)
+ return rc;
+
+ info->mem_enabled = FIELD_GET(CXL_DVSEC_MEM_ENABLE, ctrl);
+
+ for (i = 0; i < hdm_count; i++) {
+ u64 base, size;
+ u32 temp;
+
+ rc = pci_read_config_dword(
+ pdev, d + CXL_DVSEC_RANGE_SIZE_HIGH(i), &temp);
+ if (rc)
+ return rc;
+
+ size = (u64)temp << 32;
+
+ rc = pci_read_config_dword(
+ pdev, d + CXL_DVSEC_RANGE_SIZE_LOW(i), &temp);
+ if (rc)
+ return rc;
+
+ size |= temp & CXL_DVSEC_MEM_SIZE_LOW_MASK;
+
+ rc = pci_read_config_dword(
+ pdev, d + CXL_DVSEC_RANGE_BASE_HIGH(i), &temp);
+ if (rc)
+ return rc;
+
+ base = (u64)temp << 32;
+
+ rc = pci_read_config_dword(
+ pdev, d + CXL_DVSEC_RANGE_BASE_LOW(i), &temp);
+ if (rc)
+ return rc;
+
+ base |= temp & CXL_DVSEC_MEM_BASE_LOW_MASK;
+
+ info->dvsec_range[i] = (struct range) {
+ .start = base,
+ .end = base + size - 1
+ };
+
+ if (size)
+ info->ranges++;
+ }
+
+ return 0;
}
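The hi/lo register combine above is easy to sanity-check in isolation. A standalone sketch with made-up register values follows; the mask is illustrative, standing in for CXL_DVSEC_MEM_SIZE_LOW_MASK on the assumption that range sizes are 256MB-aligned, so the low dword's bottom bits carry flags/reserved fields rather than size:

    #include <stdint.h>
    #include <stdio.h>

    #define SIZE_LOW_MASK   0xf0000000u     /* illustrative mask */

    int main(void)
    {
            uint32_t size_hi = 0x00000001;  /* made-up register reads */
            uint32_t size_lo = 0x80000000;
            uint64_t size = ((uint64_t)size_hi << 32) |
                            (size_lo & SIZE_LOW_MASK);

            /* prints: decoded range size: 0x180000000 bytes (6 GiB) */
            printf("decoded range size: %#llx bytes (%llu GiB)\n",
                   (unsigned long long)size,
                   (unsigned long long)(size >> 30));
            return 0;
    }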
static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
@@ -493,6 +567,15 @@ static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
if (IS_ERR(cxlds))
return PTR_ERR(cxlds);
+ cxlds->serial = pci_get_dsn(pdev);
+ cxlds->cxl_dvsec = pci_find_dvsec_capability(
+ pdev, PCI_DVSEC_VENDOR_ID_CXL, CXL_DVSEC_PCIE_DEVICE);
+ if (!cxlds->cxl_dvsec)
+ dev_warn(&pdev->dev,
+ "Device DVSEC not present, skip CXL.mem init\n");
+
+ cxlds->wait_media_ready = wait_for_media_ready;
+
rc = cxl_setup_regs(pdev, CXL_REGLOC_RBI_MEMDEV, &map);
if (rc)
return rc;
@@ -501,6 +584,17 @@ static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
if (rc)
return rc;
+ /*
+ * If the component registers can't be found, the cxl_pci driver may
+ * still be useful for management functions so don't return an error.
+ */
+ cxlds->component_reg_phys = CXL_RESOURCE_NONE;
+ rc = cxl_setup_regs(pdev, CXL_REGLOC_RBI_COMPONENT, &map);
+ if (rc)
+ dev_warn(&pdev->dev, "No component registers (%d)\n", rc);
+
+ cxlds->component_reg_phys = cxl_regmap_to_base(pdev, &map);
+
rc = cxl_pci_setup_mailbox(cxlds);
if (rc)
return rc;
@@ -517,6 +611,11 @@ static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
if (rc)
return rc;
+ rc = cxl_dvsec_ranges(cxlds);
+ if (rc)
+ dev_warn(&pdev->dev,
+ "Failed to get DVSEC range information (%d)\n", rc);
+
cxlmd = devm_cxl_add_memdev(cxlds);
if (IS_ERR(cxlmd))
return PTR_ERR(cxlmd);