From 2f6e9c305127f8dea4e2d697b4bdd33e126ccbf7 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Tue, 29 Nov 2022 10:48:53 -0700 Subject: cxl/pci: add tracepoint events for CXL RAS Add tracepoint events for recording the CXL uncorrectable and correctable errors. For uncorrectable errors, there is additional data of 512B from the header log register (CXL spec rev3 8.2.4.16.7). The trace event will intake a dynamic array that will dump the entire Header Log data. If multiple errors are set in the status register, then the 'first error' field (CXL spec rev3 v8.2.4.16.6) is read from the Error Capabilities and Control Register in order to determine the error. This implementation does not include CXL IDE Error details. Cc: Steven Rostedt Reviewed-by: Jonathan Cameron Signed-off-by: Dave Jiang Reviewed-by: Steven Rostedt (Google) Link: https://lore.kernel.org/r/166974413388.1608150.5875712482260436188.stgit@djiang5-desk3.ch.intel.com Signed-off-by: Dan Williams --- include/trace/events/cxl.h | 112 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 112 insertions(+) create mode 100644 include/trace/events/cxl.h (limited to 'include/trace') diff --git a/include/trace/events/cxl.h b/include/trace/events/cxl.h new file mode 100644 index 000000000000..72c3e2870a9e --- /dev/null +++ b/include/trace/events/cxl.h @@ -0,0 +1,112 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM cxl + +#if !defined(_CXL_EVENTS_H) || defined(TRACE_HEADER_MULTI_READ) +#define _CXL_EVENTS_H + +#include + +#define CXL_HEADERLOG_SIZE SZ_512 +#define CXL_HEADERLOG_SIZE_U32 SZ_512 / sizeof(u32) + +#define CXL_RAS_UC_CACHE_DATA_PARITY BIT(0) +#define CXL_RAS_UC_CACHE_ADDR_PARITY BIT(1) +#define CXL_RAS_UC_CACHE_BE_PARITY BIT(2) +#define CXL_RAS_UC_CACHE_DATA_ECC BIT(3) +#define CXL_RAS_UC_MEM_DATA_PARITY BIT(4) +#define CXL_RAS_UC_MEM_ADDR_PARITY BIT(5) +#define CXL_RAS_UC_MEM_BE_PARITY BIT(6) +#define CXL_RAS_UC_MEM_DATA_ECC BIT(7) +#define CXL_RAS_UC_REINIT_THRESH BIT(8) +#define CXL_RAS_UC_RSVD_ENCODE BIT(9) +#define CXL_RAS_UC_POISON BIT(10) +#define CXL_RAS_UC_RECV_OVERFLOW BIT(11) +#define CXL_RAS_UC_INTERNAL_ERR BIT(14) +#define CXL_RAS_UC_IDE_TX_ERR BIT(15) +#define CXL_RAS_UC_IDE_RX_ERR BIT(16) + +#define show_uc_errs(status) __print_flags(status, " | ", \ + { CXL_RAS_UC_CACHE_DATA_PARITY, "Cache Data Parity Error" }, \ + { CXL_RAS_UC_CACHE_ADDR_PARITY, "Cache Address Parity Error" }, \ + { CXL_RAS_UC_CACHE_BE_PARITY, "Cache Byte Enable Parity Error" }, \ + { CXL_RAS_UC_CACHE_DATA_ECC, "Cache Data ECC Error" }, \ + { CXL_RAS_UC_MEM_DATA_PARITY, "Memory Data Parity Error" }, \ + { CXL_RAS_UC_MEM_ADDR_PARITY, "Memory Address Parity Error" }, \ + { CXL_RAS_UC_MEM_BE_PARITY, "Memory Byte Enable Parity Error" }, \ + { CXL_RAS_UC_MEM_DATA_ECC, "Memory Data ECC Error" }, \ + { CXL_RAS_UC_REINIT_THRESH, "REINIT Threshold Hit" }, \ + { CXL_RAS_UC_RSVD_ENCODE, "Received Unrecognized Encoding" }, \ + { CXL_RAS_UC_POISON, "Received Poison From Peer" }, \ + { CXL_RAS_UC_RECV_OVERFLOW, "Receiver Overflow" }, \ + { CXL_RAS_UC_INTERNAL_ERR, "Component Specific Error" }, \ + { CXL_RAS_UC_IDE_TX_ERR, "IDE Tx Error" }, \ + { CXL_RAS_UC_IDE_RX_ERR, "IDE Rx Error" } \ +) + +TRACE_EVENT(cxl_aer_uncorrectable_error, + TP_PROTO(const char *dev_name, u32 status, u32 fe, u32 *hl), + TP_ARGS(dev_name, status, fe, hl), + TP_STRUCT__entry( + __string(dev_name, dev_name) + __field(u32, status) + __field(u32, first_error) + __array(u32, header_log, CXL_HEADERLOG_SIZE_U32) + ), + TP_fast_assign( + __assign_str(dev_name, dev_name); + __entry->status = status; + __entry->first_error = fe; + /* + * Embed the 512B headerlog data for user app retrieval and + * parsing, but no need to print this in the trace buffer. + */ + memcpy(__entry->header_log, hl, CXL_HEADERLOG_SIZE); + ), + TP_printk("%s: status: '%s' first_error: '%s'", + __get_str(dev_name), + show_uc_errs(__entry->status), + show_uc_errs(__entry->first_error) + ) +); + +#define CXL_RAS_CE_CACHE_DATA_ECC BIT(0) +#define CXL_RAS_CE_MEM_DATA_ECC BIT(1) +#define CXL_RAS_CE_CRC_THRESH BIT(2) +#define CLX_RAS_CE_RETRY_THRESH BIT(3) +#define CXL_RAS_CE_CACHE_POISON BIT(4) +#define CXL_RAS_CE_MEM_POISON BIT(5) +#define CXL_RAS_CE_PHYS_LAYER_ERR BIT(6) + +#define show_ce_errs(status) __print_flags(status, " | ", \ + { CXL_RAS_CE_CACHE_DATA_ECC, "Cache Data ECC Error" }, \ + { CXL_RAS_CE_MEM_DATA_ECC, "Memory Data ECC Error" }, \ + { CXL_RAS_CE_CRC_THRESH, "CRC Threshold Hit" }, \ + { CLX_RAS_CE_RETRY_THRESH, "Retry Threshold" }, \ + { CXL_RAS_CE_CACHE_POISON, "Received Cache Poison From Peer" }, \ + { CXL_RAS_CE_MEM_POISON, "Received Memory Poison From Peer" }, \ + { CXL_RAS_CE_PHYS_LAYER_ERR, "Received Error From Physical Layer" } \ +) + +TRACE_EVENT(cxl_aer_correctable_error, + TP_PROTO(const char *dev_name, u32 status), + TP_ARGS(dev_name, status), + TP_STRUCT__entry( + __string(dev_name, dev_name) + __field(u32, status) + ), + TP_fast_assign( + __assign_str(dev_name, dev_name); + __entry->status = status; + ), + TP_printk("%s: status: '%s'", + __get_str(dev_name), show_ce_errs(__entry->status) + ) +); + +#endif /* _CXL_EVENTS_H */ + +/* This part must be outside protection */ +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE cxl +#include -- cgit v1.2.3 From 372ab3bc3711db46ae1205401c2aac2ed16fc348 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 5 Dec 2022 20:28:34 -0800 Subject: cxl/pci: Add some type-safety to the AER trace points The first argument to the CXL AER trace points is the source device. Pass a 'const struct device *' rather than a 'const char *' for more type precision / safety. Cc: Jonathan Cameron Cc: Dave Jiang Cc: Steven Rostedt Reviewed-by: Dave Jiang Reviewed-by: Jonathan Cameron Reviewed-by: Ira Weiny Link: https://lore.kernel.org/r/167030091477.4045167.15174636482098463885.stgit@dwillia2-xfh.jf.intel.com Signed-off-by: Dan Williams --- drivers/cxl/pci.c | 4 ++-- include/trace/events/cxl.h | 16 ++++++++-------- 2 files changed, 10 insertions(+), 10 deletions(-) (limited to 'include/trace') diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c index 6cec9fa9326c..cced4a0df3d1 100644 --- a/drivers/cxl/pci.c +++ b/drivers/cxl/pci.c @@ -562,7 +562,7 @@ static bool cxl_report_and_clear(struct cxl_dev_state *cxlds) } header_log_copy(cxlds, hl); - trace_cxl_aer_uncorrectable_error(dev_name(dev), status, fe, hl); + trace_cxl_aer_uncorrectable_error(dev, status, fe, hl); writel(status & CXL_RAS_UNCORRECTABLE_STATUS_MASK, addr); return true; @@ -644,7 +644,7 @@ static void cxl_cor_error_detected(struct pci_dev *pdev) status = le32_to_cpu(readl(addr)); if (status & CXL_RAS_CORRECTABLE_STATUS_MASK) { writel(status & CXL_RAS_CORRECTABLE_STATUS_MASK, addr); - trace_cxl_aer_correctable_error(dev_name(dev), status); + trace_cxl_aer_correctable_error(dev, status); } } diff --git a/include/trace/events/cxl.h b/include/trace/events/cxl.h index 72c3e2870a9e..ad085a2534ef 100644 --- a/include/trace/events/cxl.h +++ b/include/trace/events/cxl.h @@ -45,16 +45,16 @@ ) TRACE_EVENT(cxl_aer_uncorrectable_error, - TP_PROTO(const char *dev_name, u32 status, u32 fe, u32 *hl), - TP_ARGS(dev_name, status, fe, hl), + TP_PROTO(const struct device *dev, u32 status, u32 fe, u32 *hl), + TP_ARGS(dev, status, fe, hl), TP_STRUCT__entry( - __string(dev_name, dev_name) + __string(dev_name, dev_name(dev)) __field(u32, status) __field(u32, first_error) __array(u32, header_log, CXL_HEADERLOG_SIZE_U32) ), TP_fast_assign( - __assign_str(dev_name, dev_name); + __assign_str(dev_name, dev_name(dev)); __entry->status = status; __entry->first_error = fe; /* @@ -89,14 +89,14 @@ TRACE_EVENT(cxl_aer_uncorrectable_error, ) TRACE_EVENT(cxl_aer_correctable_error, - TP_PROTO(const char *dev_name, u32 status), - TP_ARGS(dev_name, status), + TP_PROTO(const struct device *dev, u32 status), + TP_ARGS(dev, status), TP_STRUCT__entry( - __string(dev_name, dev_name) + __string(dev_name, dev_name(dev)) __field(u32, status) ), TP_fast_assign( - __assign_str(dev_name, dev_name); + __assign_str(dev_name, dev_name(dev)); __entry->status = status; ), TP_printk("%s: status: '%s'", -- cgit v1.2.3