From 982a84455e94bf195f2c35f221a6b4fe239d74d2 Mon Sep 17 00:00:00 2001 From: Bo Liu Date: Mon, 31 Oct 2022 02:35:57 -0400 Subject: misc: genwqe: card_base: Fix some kernel-doc warnings Fixes the following W=1 kernel build warning(s): drivers/misc/genwqe/card_base.c:3: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst Signed-off-by: Bo Liu Link: https://lore.kernel.org/r/20221031063557.2710-1-liubo03@inspur.com Signed-off-by: Greg Kroah-Hartman --- drivers/misc/genwqe/card_base.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/misc') diff --git a/drivers/misc/genwqe/card_base.c b/drivers/misc/genwqe/card_base.c index 693981891870..bae8114f2805 100644 --- a/drivers/misc/genwqe/card_base.c +++ b/drivers/misc/genwqe/card_base.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only -/** +/* * IBM Accelerator Family 'GenWQE' * * (C) Copyright IBM Corp. 2013 -- cgit v1.2.3 From 4a4a4e9ebaa3ce903a3cdf8bb173eeaf87828cea Mon Sep 17 00:00:00 2001 From: Quan Nguyen Date: Mon, 31 Oct 2022 09:44:41 +0700 Subject: misc: smpro-errmon: Add Ampere's SMpro error monitor driver Add Ampere's SMpro error monitor driver for monitoring and reporting RAS-related errors as reported by SMpro co-processor found on Ampere's Altra processor family. Signed-off-by: Quan Nguyen Link: https://lore.kernel.org/r/20221031024442.2490881-3-quan@os.amperecomputing.com Signed-off-by: Greg Kroah-Hartman --- .../sysfs-bus-platform-devices-ampere-smpro | 264 ++++++++++ drivers/misc/Kconfig | 12 + drivers/misc/Makefile | 1 + drivers/misc/smpro-errmon.c | 529 +++++++++++++++++++++ 4 files changed, 806 insertions(+) create mode 100644 Documentation/ABI/testing/sysfs-bus-platform-devices-ampere-smpro create mode 100644 drivers/misc/smpro-errmon.c (limited to 'drivers/misc') diff --git a/Documentation/ABI/testing/sysfs-bus-platform-devices-ampere-smpro b/Documentation/ABI/testing/sysfs-bus-platform-devices-ampere-smpro new file mode 100644 index 000000000000..2b84dc8c3149 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-bus-platform-devices-ampere-smpro @@ -0,0 +1,264 @@ +What: /sys/bus/platform/devices/smpro-errmon.*/error_[core|mem|pcie|other]_[ce|ue] +KernelVersion: 6.1 +Contact: Quan Nguyen +Description: + (RO) Contains the 48-byte Ampere (Vendor-Specific) Error Record printed + in hex format according to the table below: + + +--------+---------------+-------------+------------------------------------------------------------+ + | Offset | Field | Size (byte) | Description | + +--------+---------------+-------------+------------------------------------------------------------+ + | 00 | Error Type | 1 | See :ref:`the table below ` for details | + +--------+---------------+-------------+------------------------------------------------------------+ + | 01 | Subtype | 1 | See :ref:`the table below ` for details | + +--------+---------------+-------------+------------------------------------------------------------+ + | 02 | Instance | 2 | See :ref:`the table below ` for details | + +--------+---------------+-------------+------------------------------------------------------------+ + | 04 | Error status | 4 | See ARM RAS specification for details | + +--------+---------------+-------------+------------------------------------------------------------+ + | 08 | Error Address | 8 | See ARM RAS specification for details | + +--------+---------------+-------------+------------------------------------------------------------+ + | 16 | Error Misc 0 | 8 | See ARM RAS specification for details | + +--------+---------------+-------------+------------------------------------------------------------+ + | 24 | Error Misc 1 | 8 | See ARM RAS specification for details | + +--------+---------------+-------------+------------------------------------------------------------+ + | 32 | Error Misc 2 | 8 | See ARM RAS specification for details | + +--------+---------------+-------------+------------------------------------------------------------+ + | 40 | Error Misc 3 | 8 | See ARM RAS specification for details | + +--------+---------------+-------------+------------------------------------------------------------+ + + The table below defines the value of error types, their subtype, subcomponent and instance: + + .. _smpro-error-types: + + +-----------------+------------+----------+----------------+----------------------------------------+ + | Error Group | Error Type | Sub type | Sub component | Instance | + +-----------------+------------+----------+----------------+----------------------------------------+ + | CPM (core) | 0 | 0 | Snoop-Logic | CPM # | + +-----------------+------------+----------+----------------+----------------------------------------+ + | CPM (core) | 0 | 2 | Armv8 Core 1 | CPM # | + +-----------------+------------+----------+----------------+----------------------------------------+ + | MCU (mem) | 1 | 1 | ERR1 | MCU # \| SLOT << 11 | + +-----------------+------------+----------+----------------+----------------------------------------+ + | MCU (mem) | 1 | 2 | ERR2 | MCU # \| SLOT << 11 | + +-----------------+------------+----------+----------------+----------------------------------------+ + | MCU (mem) | 1 | 3 | ERR3 | MCU # | + +-----------------+------------+----------+----------------+----------------------------------------+ + | MCU (mem) | 1 | 4 | ERR4 | MCU # | + +-----------------+------------+----------+----------------+----------------------------------------+ + | MCU (mem) | 1 | 5 | ERR5 | MCU # | + +-----------------+------------+----------+----------------+----------------------------------------+ + | MCU (mem) | 1 | 6 | ERR6 | MCU # | + +-----------------+------------+----------+----------------+----------------------------------------+ + | MCU (mem) | 1 | 7 | Link Error | MCU # | + +-----------------+------------+----------+----------------+----------------------------------------+ + | Mesh (other) | 2 | 0 | Cross Point | X \| (Y << 5) \| NS <<11 | + +-----------------+------------+----------+----------------+----------------------------------------+ + | Mesh (other) | 2 | 1 | Home Node(IO) | X \| (Y << 5) \| NS <<11 | + +-----------------+------------+----------+----------------+----------------------------------------+ + | Mesh (other) | 2 | 2 | Home Node(Mem) | X \| (Y << 5) \| NS <<11 \| device<<12 | + +-----------------+------------+----------+----------------+----------------------------------------+ + | Mesh (other) | 2 | 4 | CCIX Node | X \| (Y << 5) \| NS <<11 | + +-----------------+------------+----------+----------------+----------------------------------------+ + | 2P Link (other) | 3 | 0 | N/A | Altra 2P Link # | + +-----------------+------------+----------+----------------+----------------------------------------+ + | GIC (other) | 5 | 0 | ERR0 | 0 | + +-----------------+------------+----------+----------------+----------------------------------------+ + | GIC (other) | 5 | 1 | ERR1 | 0 | + +-----------------+------------+----------+----------------+----------------------------------------+ + | GIC (other) | 5 | 2 | ERR2 | 0 | + +-----------------+------------+----------+----------------+----------------------------------------+ + | GIC (other) | 5 | 3 | ERR3 | 0 | + +-----------------+------------+----------+----------------+----------------------------------------+ + | GIC (other) | 5 | 4 | ERR4 | 0 | + +-----------------+------------+----------+----------------+----------------------------------------+ + | GIC (other) | 5 | 5 | ERR5 | 0 | + +-----------------+------------+----------+----------------+----------------------------------------+ + | GIC (other) | 5 | 6 | ERR6 | 0 | + +-----------------+------------+----------+----------------+----------------------------------------+ + | GIC (other) | 5 | 7 | ERR7 | 0 | + +-----------------+------------+----------+----------------+----------------------------------------+ + | GIC (other) | 5 | 8 | ERR8 | 0 | + +-----------------+------------+----------+----------------+----------------------------------------+ + | GIC (other) | 5 | 9 | ERR9 | 0 | + +-----------------+------------+----------+----------------+----------------------------------------+ + | GIC (other) | 5 | 10 | ERR10 | 0 | + +-----------------+------------+----------+----------------+----------------------------------------+ + | GIC (other) | 5 | 11 | ERR11 | 0 | + +-----------------+------------+----------+----------------+----------------------------------------+ + | GIC (other) | 5 | 12 | ERR12 | 0 | + +-----------------+------------+----------+----------------+----------------------------------------+ + | GIC (other) | 5 | 13-21 | ERR13 | RC # + 1 | + +-----------------+------------+----------+----------------+----------------------------------------+ + | SMMU (other) | 6 | TCU | 100 | RC # | + +-----------------+------------+----------+----------------+----------------------------------------+ + | SMMU (other) | 6 | TBU0 | 0 | RC # | + +-----------------+------------+----------+----------------+----------------------------------------+ + | SMMU (other) | 6 | TBU1 | 1 | RC # | + +-----------------+------------+----------+----------------+----------------------------------------+ + | SMMU (other) | 6 | TBU2 | 2 | RC # | + +-----------------+------------+----------+----------------+----------------------------------------+ + | SMMU (other) | 6 | TBU3 | 3 | RC # | + +-----------------+------------+----------+----------------+----------------------------------------+ + | SMMU (other) | 6 | TBU4 | 4 | RC # | + +-----------------+------------+----------+----------------+----------------------------------------+ + | SMMU (other) | 6 | TBU5 | 5 | RC # | + +-----------------+------------+----------+----------------+----------------------------------------+ + | SMMU (other) | 6 | TBU6 | 6 | RC # | + +-----------------+------------+----------+----------------+----------------------------------------+ + | SMMU (other) | 6 | TBU7 | 7 | RC # | + +-----------------+------------+----------+----------------+----------------------------------------+ + | SMMU (other) | 6 | TBU8 | 8 | RC # | + +-----------------+------------+----------+----------------+----------------------------------------+ + | SMMU (other) | 6 | TBU9 | 9 | RC # | + +-----------------+------------+----------+----------------+----------------------------------------+ + | PCIe AER (pcie) | 7 | Root | 0 | RC # | + +-----------------+------------+----------+----------------+----------------------------------------+ + | PCIe AER (pcie) | 7 | Device | 1 | RC # | + +-----------------+------------+----------+----------------+----------------------------------------+ + | PCIe RC (pcie) | 8 | RCA HB | 0 | RC # | + +-----------------+------------+----------+----------------+----------------------------------------+ + | PCIe RC (pcie) | 8 | RCB HB | 1 | RC # | + +-----------------+------------+----------+----------------+----------------------------------------+ + | PCIe RC (pcie) | 8 | RASDP | 8 | RC # | + +-----------------+------------+----------+----------------+----------------------------------------+ + | OCM (other) | 9 | ERR0 | 0 | 0 | + +-----------------+------------+----------+----------------+----------------------------------------+ + | OCM (other) | 9 | ERR1 | 1 | 0 | + +-----------------+------------+----------+----------------+----------------------------------------+ + | OCM (other) | 9 | ERR2 | 2 | 0 | + +-----------------+------------+----------+----------------+----------------------------------------+ + | SMpro (other) | 10 | ERR0 | 0 | 0 | + +-----------------+------------+----------+----------------+----------------------------------------+ + | SMpro (other) | 10 | ERR1 | 1 | 0 | + +-----------------+------------+----------+----------------+----------------------------------------+ + | SMpro (other) | 10 | MPA_ERR | 2 | 0 | + +-----------------+------------+----------+----------------+----------------------------------------+ + | PMpro (other) | 11 | ERR0 | 0 | 0 | + +-----------------+------------+----------+----------------+----------------------------------------+ + | PMpro (other) | 11 | ERR1 | 1 | 0 | + +-----------------+------------+----------+----------------+----------------------------------------+ + | PMpro (other) | 11 | MPA_ERR | 2 | 0 | + +-----------------+------------+----------+----------------+----------------------------------------+ + + Example:: + + # cat error_other_ue + 880807001e004010401040101500000001004010401040100c0000000000000000000000000000000000000000000000 + + The detail of each sysfs entries is as below: + + +-------------+---------------------------------------------------------+----------------------------------+ + | Error | Sysfs entry | Description (when triggered) | + +-------------+---------------------------------------------------------+----------------------------------+ + | Core's CE | /sys/bus/platform/devices/smpro-errmon.*/error_core_ce | Core has CE error | + +-------------+---------------------------------------------------------+----------------------------------+ + | Core's UE | /sys/bus/platform/devices/smpro-errmon.*/error_core_ue | Core has UE error | + +-------------+---------------------------------------------------------+----------------------------------+ + | Memory's CE | /sys/bus/platform/devices/smpro-errmon.*/error_mem_ce | Memory has CE error | + +-------------+---------------------------------------------------------+----------------------------------+ + | Memory's UE | /sys/bus/platform/devices/smpro-errmon.*/error_mem_ue | Memory has UE error | + +-------------+---------------------------------------------------------+----------------------------------+ + | PCIe's CE | /sys/bus/platform/devices/smpro-errmon.*/error_pcie_ce | any PCIe controller has CE error | + +-------------+---------------------------------------------------------+----------------------------------+ + | PCIe's UE | /sys/bus/platform/devices/smpro-errmon.*/error_pcie_ue | any PCIe controller has UE error | + +-------------+---------------------------------------------------------+----------------------------------+ + | Other's CE | /sys/bus/platform/devices/smpro-errmon.*/error_other_ce | any other CE error | + +-------------+---------------------------------------------------------+----------------------------------+ + | Other's UE | /sys/bus/platform/devices/smpro-errmon.*/error_other_ue | any other UE error | + +-------------+---------------------------------------------------------+----------------------------------+ + + UE: Uncorrect-able Error + CE: Correct-able Error + + For details, see section `3.3 Ampere (Vendor-Specific) Error Record Formats, + Altra Family RAS Supplement`. + + +What: /sys/bus/platform/devices/smpro-errmon.*/overflow_[core|mem|pcie|other]_[ce|ue] +KernelVersion: 6.1 +Contact: Quan Nguyen +Description: + (RO) Return the overflow status of each type HW error reported: + + - 0 : No overflow + - 1 : There is an overflow and the oldest HW errors are dropped + + The detail of each sysfs entries is as below: + + +-------------+-----------------------------------------------------------+---------------------------------------+ + | Overflow | Sysfs entry | Description | + +-------------+-----------------------------------------------------------+---------------------------------------+ + | Core's CE | /sys/bus/platform/devices/smpro-errmon.*/overflow_core_ce | Core CE error overflow | + +-------------+-----------------------------------------------------------+---------------------------------------+ + | Core's UE | /sys/bus/platform/devices/smpro-errmon.*/overflow_core_ue | Core UE error overflow | + +-------------+-----------------------------------------------------------+---------------------------------------+ + | Memory's CE | /sys/bus/platform/devices/smpro-errmon.*/overflow_mem_ce | Memory CE error overflow | + +-------------+-----------------------------------------------------------+---------------------------------------+ + | Memory's UE | /sys/bus/platform/devices/smpro-errmon.*/overflow_mem_ue | Memory UE error overflow | + +-------------+-----------------------------------------------------------+---------------------------------------+ + | PCIe's CE | /sys/bus/platform/devices/smpro-errmon.*/overflow_pcie_ce | any PCIe controller CE error overflow | + +-------------+-----------------------------------------------------------+---------------------------------------+ + | PCIe's UE | /sys/bus/platform/devices/smpro-errmon.*/overflow_pcie_ue | any PCIe controller UE error overflow | + +-------------+-----------------------------------------------------------+---------------------------------------+ + | Other's CE | /sys/bus/platform/devices/smpro-errmon.*/overflow_other_ce| any other CE error overflow | + +-------------+-----------------------------------------------------------+---------------------------------------+ + | Other's UE | /sys/bus/platform/devices/smpro-errmon.*/overflow_other_ue| other UE error overflow | + +-------------+-----------------------------------------------------------+---------------------------------------+ + + where: + + - UE: Uncorrect-able Error + - CE: Correct-able Error + +What: /sys/bus/platform/devices/smpro-errmon.*/[error|warn]_[smpro|pmpro] +KernelVersion: 6.1 +Contact: Quan Nguyen +Description: + (RO) Contains the internal firmware error/warning printed as hex format. + + The detail of each sysfs entries is as below: + + +---------------+------------------------------------------------------+--------------------------+ + | Error | Sysfs entry | Description | + +---------------+------------------------------------------------------+--------------------------+ + | SMpro error | /sys/bus/platform/devices/smpro-errmon.*/error_smpro | system has SMpro error | + +---------------+------------------------------------------------------+--------------------------+ + | SMpro warning | /sys/bus/platform/devices/smpro-errmon.*/warn_smpro | system has SMpro warning | + +---------------+------------------------------------------------------+--------------------------+ + | PMpro error | /sys/bus/platform/devices/smpro-errmon.*/error_pmpro | system has PMpro error | + +---------------+------------------------------------------------------+--------------------------+ + | PMpro warning | /sys/bus/platform/devices/smpro-errmon.*/warn_pmpro | system has PMpro warning | + +---------------+------------------------------------------------------+--------------------------+ + + For details, see section `5.10 RAS Internal Error Register Definitions, + Altra Family Soc BMC Interface Specification`. + +What: /sys/bus/platform/devices/smpro-errmon.*/event_[vrd_warn_fault|vrd_hot|dimm_hot] +KernelVersion: 6.1 +Contact: Quan Nguyen +Description: + (RO) Contains the detail information in case of VRD/DIMM warning/hot events + in hex format as below:: + + AAAA + + where: + + - ``AAAA``: The event detail information data + + The detail of each sysfs entries is as below: + + +---------------+---------------------------------------------------------------+---------------------+ + | Event | Sysfs entry | Description | + +---------------+---------------------------------------------------------------+---------------------+ + | VRD HOT | /sys/bus/platform/devices/smpro-errmon.*/event_vrd_hot | VRD Hot | + +---------------+---------------------------------------------------------------+---------------------+ + | VR Warn/Fault | /sys/bus/platform/devices/smpro-errmon.*/event_vrd_warn_fault | VR Warning or Fault | + +---------------+---------------------------------------------------------------+---------------------+ + | DIMM HOT | /sys/bus/platform/devices/smpro-errmon.*/event_dimm_hot | DIMM Hot | + +---------------+---------------------------------------------------------------+---------------------+ + + For more details, see section `5.7 GPI Status Registers, + Altra Family Soc BMC Interface Specification`. + diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig index 358ad56f6524..b9ceee949dab 100644 --- a/drivers/misc/Kconfig +++ b/drivers/misc/Kconfig @@ -176,6 +176,18 @@ config SGI_XP this feature will allow for direct communication between SSIs based on a network adapter and DMA messaging. +config SMPRO_ERRMON + tristate "Ampere Computing SMPro error monitor driver" + depends on MFD_SMPRO || COMPILE_TEST + help + Say Y here to get support for the SMpro error monitor function + provided by Ampere Computing's Altra and Altra Max SoCs. Upon + loading, the driver creates sysfs files which can be use to gather + multiple HW error data reported via read and write system calls. + + To compile this driver as a module, say M here. The driver will be + called smpro-errmon. + config CS5535_MFGPT tristate "CS5535/CS5536 Geode Multi-Function General Purpose Timer (MFGPT) support" depends on MFD_CS5535 diff --git a/drivers/misc/Makefile b/drivers/misc/Makefile index ac9b3e757ba1..bbe24d4511a3 100644 --- a/drivers/misc/Makefile +++ b/drivers/misc/Makefile @@ -23,6 +23,7 @@ obj-$(CONFIG_ENCLOSURE_SERVICES) += enclosure.o obj-$(CONFIG_KGDB_TESTS) += kgdbts.o obj-$(CONFIG_SGI_XP) += sgi-xp/ obj-$(CONFIG_SGI_GRU) += sgi-gru/ +obj-$(CONFIG_SMPRO_ERRMON) += smpro-errmon.o obj-$(CONFIG_CS5535_MFGPT) += cs5535-mfgpt.o obj-$(CONFIG_GEHC_ACHC) += gehc-achc.o obj-$(CONFIG_HP_ILO) += hpilo.o diff --git a/drivers/misc/smpro-errmon.c b/drivers/misc/smpro-errmon.c new file mode 100644 index 000000000000..d1431d419aa4 --- /dev/null +++ b/drivers/misc/smpro-errmon.c @@ -0,0 +1,529 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Ampere Computing SoC's SMpro Error Monitoring Driver + * + * Copyright (c) 2022, Ampere Computing LLC + * + */ + +#include +#include +#include +#include +#include + +/* GPI RAS Error Registers */ +#define GPI_RAS_ERR 0x7E + +/* Core and L2C Error Registers */ +#define CORE_CE_ERR_CNT 0x80 +#define CORE_CE_ERR_LEN 0x81 +#define CORE_CE_ERR_DATA 0x82 +#define CORE_UE_ERR_CNT 0x83 +#define CORE_UE_ERR_LEN 0x84 +#define CORE_UE_ERR_DATA 0x85 + +/* Memory Error Registers */ +#define MEM_CE_ERR_CNT 0x90 +#define MEM_CE_ERR_LEN 0x91 +#define MEM_CE_ERR_DATA 0x92 +#define MEM_UE_ERR_CNT 0x93 +#define MEM_UE_ERR_LEN 0x94 +#define MEM_UE_ERR_DATA 0x95 + +/* RAS Error/Warning Registers */ +#define ERR_SMPRO_TYPE 0xA0 +#define ERR_PMPRO_TYPE 0xA1 +#define ERR_SMPRO_INFO_LO 0xA2 +#define ERR_SMPRO_INFO_HI 0xA3 +#define ERR_SMPRO_DATA_LO 0xA4 +#define ERR_SMPRO_DATA_HI 0xA5 +#define WARN_SMPRO_INFO_LO 0xAA +#define WARN_SMPRO_INFO_HI 0xAB +#define ERR_PMPRO_INFO_LO 0xA6 +#define ERR_PMPRO_INFO_HI 0xA7 +#define ERR_PMPRO_DATA_LO 0xA8 +#define ERR_PMPRO_DATA_HI 0xA9 +#define WARN_PMPRO_INFO_LO 0xAC +#define WARN_PMPRO_INFO_HI 0xAD + +/* PCIE Error Registers */ +#define PCIE_CE_ERR_CNT 0xC0 +#define PCIE_CE_ERR_LEN 0xC1 +#define PCIE_CE_ERR_DATA 0xC2 +#define PCIE_UE_ERR_CNT 0xC3 +#define PCIE_UE_ERR_LEN 0xC4 +#define PCIE_UE_ERR_DATA 0xC5 + +/* Other Error Registers */ +#define OTHER_CE_ERR_CNT 0xD0 +#define OTHER_CE_ERR_LEN 0xD1 +#define OTHER_CE_ERR_DATA 0xD2 +#define OTHER_UE_ERR_CNT 0xD8 +#define OTHER_UE_ERR_LEN 0xD9 +#define OTHER_UE_ERR_DATA 0xDA + +/* Event Data Registers */ +#define VRD_WARN_FAULT_EVENT_DATA 0x78 +#define VRD_HOT_EVENT_DATA 0x79 +#define DIMM_HOT_EVENT_DATA 0x7A + +#define MAX_READ_BLOCK_LENGTH 48 + +#define RAS_SMPRO_ERR 0 +#define RAS_PMPRO_ERR 1 + +enum RAS_48BYTES_ERR_TYPES { + CORE_CE_ERR, + CORE_UE_ERR, + MEM_CE_ERR, + MEM_UE_ERR, + PCIE_CE_ERR, + PCIE_UE_ERR, + OTHER_CE_ERR, + OTHER_UE_ERR, + NUM_48BYTES_ERR_TYPE, +}; + +struct smpro_error_hdr { + u8 count; /* Number of the RAS errors */ + u8 len; /* Number of data bytes */ + u8 data; /* Start of 48-byte data */ + u8 max_cnt; /* Max num of errors */ +}; + +/* + * Included Address of registers to get Count, Length of data and Data + * of the 48 bytes error data + */ +static struct smpro_error_hdr smpro_error_table[] = { + [CORE_CE_ERR] = { + .count = CORE_CE_ERR_CNT, + .len = CORE_CE_ERR_LEN, + .data = CORE_CE_ERR_DATA, + .max_cnt = 32 + }, + [CORE_UE_ERR] = { + .count = CORE_UE_ERR_CNT, + .len = CORE_UE_ERR_LEN, + .data = CORE_UE_ERR_DATA, + .max_cnt = 32 + }, + [MEM_CE_ERR] = { + .count = MEM_CE_ERR_CNT, + .len = MEM_CE_ERR_LEN, + .data = MEM_CE_ERR_DATA, + .max_cnt = 16 + }, + [MEM_UE_ERR] = { + .count = MEM_UE_ERR_CNT, + .len = MEM_UE_ERR_LEN, + .data = MEM_UE_ERR_DATA, + .max_cnt = 16 + }, + [PCIE_CE_ERR] = { + .count = PCIE_CE_ERR_CNT, + .len = PCIE_CE_ERR_LEN, + .data = PCIE_CE_ERR_DATA, + .max_cnt = 96 + }, + [PCIE_UE_ERR] = { + .count = PCIE_UE_ERR_CNT, + .len = PCIE_UE_ERR_LEN, + .data = PCIE_UE_ERR_DATA, + .max_cnt = 96 + }, + [OTHER_CE_ERR] = { + .count = OTHER_CE_ERR_CNT, + .len = OTHER_CE_ERR_LEN, + .data = OTHER_CE_ERR_DATA, + .max_cnt = 8 + }, + [OTHER_UE_ERR] = { + .count = OTHER_UE_ERR_CNT, + .len = OTHER_UE_ERR_LEN, + .data = OTHER_UE_ERR_DATA, + .max_cnt = 8 + }, +}; + +/* + * List of SCP registers which are used to get + * one type of RAS Internal errors. + */ +struct smpro_int_error_hdr { + u8 type; + u8 info_l; + u8 info_h; + u8 data_l; + u8 data_h; + u8 warn_l; + u8 warn_h; +}; + +static struct smpro_int_error_hdr list_smpro_int_error_hdr[] = { + [RAS_SMPRO_ERR] = { + .type = ERR_SMPRO_TYPE, + .info_l = ERR_SMPRO_INFO_LO, + .info_h = ERR_SMPRO_INFO_HI, + .data_l = ERR_SMPRO_DATA_LO, + .data_h = ERR_SMPRO_DATA_HI, + .warn_l = WARN_SMPRO_INFO_LO, + .warn_h = WARN_SMPRO_INFO_HI, + }, + [RAS_PMPRO_ERR] = { + .type = ERR_PMPRO_TYPE, + .info_l = ERR_PMPRO_INFO_LO, + .info_h = ERR_PMPRO_INFO_HI, + .data_l = ERR_PMPRO_DATA_LO, + .data_h = ERR_PMPRO_DATA_HI, + .warn_l = WARN_PMPRO_INFO_LO, + .warn_h = WARN_PMPRO_INFO_HI, + }, +}; + +struct smpro_errmon { + struct regmap *regmap; +}; + +enum EVENT_TYPES { + VRD_WARN_FAULT_EVENT, + VRD_HOT_EVENT, + DIMM_HOT_EVENT, + NUM_EVENTS_TYPE, +}; + +/* Included Address of event source and data registers */ +static u8 smpro_event_table[NUM_EVENTS_TYPE] = { + VRD_WARN_FAULT_EVENT_DATA, + VRD_HOT_EVENT_DATA, + DIMM_HOT_EVENT_DATA, +}; + +static ssize_t smpro_event_data_read(struct device *dev, + struct device_attribute *da, char *buf, + int channel) +{ + struct smpro_errmon *errmon = dev_get_drvdata(dev); + s32 event_data; + int ret; + + ret = regmap_read(errmon->regmap, smpro_event_table[channel], &event_data); + if (ret) + return ret; + /* Clear event after read */ + if (event_data != 0) + regmap_write(errmon->regmap, smpro_event_table[channel], event_data); + + return sysfs_emit(buf, "%04x\n", event_data); +} + +static ssize_t smpro_overflow_data_read(struct device *dev, struct device_attribute *da, + char *buf, int channel) +{ + struct smpro_errmon *errmon = dev_get_drvdata(dev); + struct smpro_error_hdr *err_info; + s32 err_count; + int ret; + + err_info = &smpro_error_table[channel]; + + ret = regmap_read(errmon->regmap, err_info->count, &err_count); + if (ret) + return ret; + + /* Bit 8 indicates the overflow status */ + return sysfs_emit(buf, "%d\n", (err_count & BIT(8)) ? 1 : 0); +} + +static ssize_t smpro_error_data_read(struct device *dev, struct device_attribute *da, + char *buf, int channel) +{ + struct smpro_errmon *errmon = dev_get_drvdata(dev); + unsigned char err_data[MAX_READ_BLOCK_LENGTH]; + struct smpro_error_hdr *err_info; + s32 err_count, err_length; + int ret; + + err_info = &smpro_error_table[channel]; + + ret = regmap_read(errmon->regmap, err_info->count, &err_count); + /* Error count is the low byte */ + err_count &= 0xff; + if (ret || !err_count || err_count > err_info->max_cnt) + return ret; + + ret = regmap_read(errmon->regmap, err_info->len, &err_length); + if (ret || err_length <= 0) + return ret; + + if (err_length > MAX_READ_BLOCK_LENGTH) + err_length = MAX_READ_BLOCK_LENGTH; + + memset(err_data, 0x00, MAX_READ_BLOCK_LENGTH); + ret = regmap_noinc_read(errmon->regmap, err_info->data, err_data, err_length); + if (ret < 0) + return ret; + + /* clear the error */ + ret = regmap_write(errmon->regmap, err_info->count, 0x100); + if (ret) + return ret; + /* + * The output of Core/Memory/PCIe/Others UE/CE errors follows the format + * specified in section 5.8.1 CE/UE Error Data record in + * Altra SOC BMC Interface specification. + */ + return sysfs_emit(buf, "%*phN\n", MAX_READ_BLOCK_LENGTH, err_data); +} + +/* + * Output format: + * <4-byte hex value of error info><4-byte hex value of error extensive data> + * Where: + * + error info : The error information + * + error data : Extensive data (32 bits) + * Reference to section 5.10 RAS Internal Error Register Definition in + * Altra SOC BMC Interface specification + */ +static ssize_t smpro_internal_err_read(struct device *dev, struct device_attribute *da, + char *buf, int channel) +{ + struct smpro_errmon *errmon = dev_get_drvdata(dev); + struct smpro_int_error_hdr *err_info; + unsigned int err[4] = { 0 }; + unsigned int err_type; + unsigned int val; + int ret; + + /* read error status */ + ret = regmap_read(errmon->regmap, GPI_RAS_ERR, &val); + if (ret) + return ret; + + if ((channel == RAS_SMPRO_ERR && !(val & BIT(0))) || + (channel == RAS_PMPRO_ERR && !(val & BIT(1)))) + return 0; + + err_info = &list_smpro_int_error_hdr[channel]; + ret = regmap_read(errmon->regmap, err_info->type, &val); + if (ret) + return ret; + + err_type = (val & BIT(1)) ? BIT(1) : + (val & BIT(2)) ? BIT(2) : 0; + + if (!err_type) + return 0; + + ret = regmap_read(errmon->regmap, err_info->info_l, err + 1); + if (ret) + return ret; + + ret = regmap_read(errmon->regmap, err_info->info_h, err); + if (ret) + return ret; + + if (err_type & BIT(2)) { + /* Error with data type */ + ret = regmap_read(errmon->regmap, err_info->data_l, err + 3); + if (ret) + return ret; + + ret = regmap_read(errmon->regmap, err_info->data_h, err + 2); + if (ret) + return ret; + } + + /* clear the read errors */ + ret = regmap_write(errmon->regmap, err_info->type, err_type); + if (ret) + return ret; + + return sysfs_emit(buf, "%*phN\n", (int)sizeof(err), err); +} + +/* + * Output format: + * <4-byte hex value of warining info> + * Reference to section 5.10 RAS Internal Error Register Definition in + * Altra SOC BMC Interface specification + */ +static ssize_t smpro_internal_warn_read(struct device *dev, struct device_attribute *da, + char *buf, int channel) +{ + struct smpro_errmon *errmon = dev_get_drvdata(dev); + struct smpro_int_error_hdr *err_info; + unsigned int warn[2] = { 0 }; + unsigned int val; + int ret; + + /* read error status */ + ret = regmap_read(errmon->regmap, GPI_RAS_ERR, &val); + if (ret) + return ret; + + if ((channel == RAS_SMPRO_ERR && !(val & BIT(0))) || + (channel == RAS_PMPRO_ERR && !(val & BIT(1)))) + return 0; + + err_info = &list_smpro_int_error_hdr[channel]; + ret = regmap_read(errmon->regmap, err_info->type, &val); + if (ret) + return ret; + + if (!(val & BIT(0))) + return 0; + + ret = regmap_read(errmon->regmap, err_info->warn_l, warn + 1); + if (ret) + return ret; + + ret = regmap_read(errmon->regmap, err_info->warn_h, warn); + if (ret) + return ret; + + /* clear the warning */ + ret = regmap_write(errmon->regmap, err_info->type, BIT(0)); + if (ret) + return ret; + + return sysfs_emit(buf, "%*phN\n", (int)sizeof(warn), warn); +} + +#define ERROR_OVERFLOW_RO(_error, _index) \ + static ssize_t overflow_##_error##_show(struct device *dev, \ + struct device_attribute *da, \ + char *buf) \ + { \ + return smpro_overflow_data_read(dev, da, buf, _index); \ + } \ + static DEVICE_ATTR_RO(overflow_##_error) + +ERROR_OVERFLOW_RO(core_ce, CORE_CE_ERR); +ERROR_OVERFLOW_RO(core_ue, CORE_UE_ERR); +ERROR_OVERFLOW_RO(mem_ce, MEM_CE_ERR); +ERROR_OVERFLOW_RO(mem_ue, MEM_UE_ERR); +ERROR_OVERFLOW_RO(pcie_ce, PCIE_CE_ERR); +ERROR_OVERFLOW_RO(pcie_ue, PCIE_UE_ERR); +ERROR_OVERFLOW_RO(other_ce, OTHER_CE_ERR); +ERROR_OVERFLOW_RO(other_ue, OTHER_UE_ERR); + +#define ERROR_RO(_error, _index) \ + static ssize_t error_##_error##_show(struct device *dev, \ + struct device_attribute *da, \ + char *buf) \ + { \ + return smpro_error_data_read(dev, da, buf, _index); \ + } \ + static DEVICE_ATTR_RO(error_##_error) + +ERROR_RO(core_ce, CORE_CE_ERR); +ERROR_RO(core_ue, CORE_UE_ERR); +ERROR_RO(mem_ce, MEM_CE_ERR); +ERROR_RO(mem_ue, MEM_UE_ERR); +ERROR_RO(pcie_ce, PCIE_CE_ERR); +ERROR_RO(pcie_ue, PCIE_UE_ERR); +ERROR_RO(other_ce, OTHER_CE_ERR); +ERROR_RO(other_ue, OTHER_UE_ERR); + +static ssize_t error_smpro_show(struct device *dev, struct device_attribute *da, char *buf) +{ + return smpro_internal_err_read(dev, da, buf, RAS_SMPRO_ERR); +} +static DEVICE_ATTR_RO(error_smpro); + +static ssize_t error_pmpro_show(struct device *dev, struct device_attribute *da, char *buf) +{ + return smpro_internal_err_read(dev, da, buf, RAS_PMPRO_ERR); +} +static DEVICE_ATTR_RO(error_pmpro); + +static ssize_t warn_smpro_show(struct device *dev, struct device_attribute *da, char *buf) +{ + return smpro_internal_warn_read(dev, da, buf, RAS_SMPRO_ERR); +} +static DEVICE_ATTR_RO(warn_smpro); + +static ssize_t warn_pmpro_show(struct device *dev, struct device_attribute *da, char *buf) +{ + return smpro_internal_warn_read(dev, da, buf, RAS_PMPRO_ERR); +} +static DEVICE_ATTR_RO(warn_pmpro); + +#define EVENT_RO(_event, _index) \ + static ssize_t event_##_event##_show(struct device *dev, \ + struct device_attribute *da, \ + char *buf) \ + { \ + return smpro_event_data_read(dev, da, buf, _index); \ + } \ + static DEVICE_ATTR_RO(event_##_event) + +EVENT_RO(vrd_warn_fault, VRD_WARN_FAULT_EVENT); +EVENT_RO(vrd_hot, VRD_HOT_EVENT); +EVENT_RO(dimm_hot, DIMM_HOT_EVENT); + +static struct attribute *smpro_errmon_attrs[] = { + &dev_attr_overflow_core_ce.attr, + &dev_attr_overflow_core_ue.attr, + &dev_attr_overflow_mem_ce.attr, + &dev_attr_overflow_mem_ue.attr, + &dev_attr_overflow_pcie_ce.attr, + &dev_attr_overflow_pcie_ue.attr, + &dev_attr_overflow_other_ce.attr, + &dev_attr_overflow_other_ue.attr, + &dev_attr_error_core_ce.attr, + &dev_attr_error_core_ue.attr, + &dev_attr_error_mem_ce.attr, + &dev_attr_error_mem_ue.attr, + &dev_attr_error_pcie_ce.attr, + &dev_attr_error_pcie_ue.attr, + &dev_attr_error_other_ce.attr, + &dev_attr_error_other_ue.attr, + &dev_attr_error_smpro.attr, + &dev_attr_error_pmpro.attr, + &dev_attr_warn_smpro.attr, + &dev_attr_warn_pmpro.attr, + &dev_attr_event_vrd_warn_fault.attr, + &dev_attr_event_vrd_hot.attr, + &dev_attr_event_dimm_hot.attr, + NULL +}; + +ATTRIBUTE_GROUPS(smpro_errmon); + +static int smpro_errmon_probe(struct platform_device *pdev) +{ + struct smpro_errmon *errmon; + + errmon = devm_kzalloc(&pdev->dev, sizeof(struct smpro_errmon), GFP_KERNEL); + if (!errmon) + return -ENOMEM; + + platform_set_drvdata(pdev, errmon); + + errmon->regmap = dev_get_regmap(pdev->dev.parent, NULL); + if (!errmon->regmap) + return -ENODEV; + + return 0; +} + +static struct platform_driver smpro_errmon_driver = { + .probe = smpro_errmon_probe, + .driver = { + .name = "smpro-errmon", + .dev_groups = smpro_errmon_groups, + }, +}; + +module_platform_driver(smpro_errmon_driver); + +MODULE_AUTHOR("Tung Nguyen "); +MODULE_AUTHOR("Thinh Pham "); +MODULE_AUTHOR("Hoang Nguyen "); +MODULE_AUTHOR("Thu Nguyen "); +MODULE_AUTHOR("Quan Nguyen "); +MODULE_DESCRIPTION("Ampere Altra SMpro driver"); +MODULE_LICENSE("GPL"); -- cgit v1.2.3 From 763dc90e9a4332f82ad43c866c6878742b15d4ab Mon Sep 17 00:00:00 2001 From: Quan Nguyen Date: Mon, 31 Oct 2022 09:44:42 +0700 Subject: misc: smpro-misc: Add Ampere's Altra SMpro misc driver Add driver support for accessing various information reported by Ampere's SMpro co-processor such as Boot Progress and other miscellaneous data. Signed-off-by: Quan Nguyen Link: https://lore.kernel.org/r/20221031024442.2490881-4-quan@os.amperecomputing.com Signed-off-by: Greg Kroah-Hartman --- .../sysfs-bus-platform-devices-ampere-smpro | 48 +++++++ drivers/misc/Kconfig | 10 ++ drivers/misc/Makefile | 1 + drivers/misc/smpro-misc.c | 145 +++++++++++++++++++++ 4 files changed, 204 insertions(+) create mode 100644 drivers/misc/smpro-misc.c (limited to 'drivers/misc') diff --git a/Documentation/ABI/testing/sysfs-bus-platform-devices-ampere-smpro b/Documentation/ABI/testing/sysfs-bus-platform-devices-ampere-smpro index 2b84dc8c3149..ca93c215ef99 100644 --- a/Documentation/ABI/testing/sysfs-bus-platform-devices-ampere-smpro +++ b/Documentation/ABI/testing/sysfs-bus-platform-devices-ampere-smpro @@ -262,3 +262,51 @@ Description: For more details, see section `5.7 GPI Status Registers, Altra Family Soc BMC Interface Specification`. +What: /sys/bus/platform/devices/smpro-misc.*/boot_progress +KernelVersion: 6.1 +Contact: Quan Nguyen +Description: + (RO) Contains the boot stages information in hex as format below:: + + AABBCCCCCCCC + + where: + + - ``AA`` : The boot stages + + - 00: SMpro firmware booting + - 01: PMpro firmware booting + - 02: ATF BL1 firmware booting + - 03: DDR initialization + - 04: DDR training report status + - 05: ATF BL2 firmware booting + - 06: ATF BL31 firmware booting + - 07: ATF BL32 firmware booting + - 08: UEFI firmware booting + - 09: OS booting + + - ``BB`` : Boot status + + - 00: Not started + - 01: Started + - 02: Completed without error + - 03: Failed. + + - ``CCCCCCCC``: Boot status information defined for each boot stages + + For details, see section `5.11 Boot Stage Register Definitions` + and section `6. Processor Boot Progress Codes, Altra Family Soc BMC + Interface Specification`. + + +What: /sys/bus/platform/devices/smpro-misc*/soc_power_limit +KernelVersion: 6.1 +Contact: Quan Nguyen +Description: + (RW) Contains the desired SoC power limit in Watt. + Writes to this sysfs set the desired SoC power limit (W). + Reads from this register return the current SoC power limit (W). + The value ranges: + + - Minimum: 120 W + - Maximum: Socket TDP power diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig index b9ceee949dab..9947b7892bd5 100644 --- a/drivers/misc/Kconfig +++ b/drivers/misc/Kconfig @@ -188,6 +188,16 @@ config SMPRO_ERRMON To compile this driver as a module, say M here. The driver will be called smpro-errmon. +config SMPRO_MISC + tristate "Ampere Computing SMPro miscellaneous driver" + depends on MFD_SMPRO || COMPILE_TEST + help + Say Y here to get support for the SMpro error miscellalenous function + provided by Ampere Computing's Altra and Altra Max SoCs. + + To compile this driver as a module, say M here. The driver will be + called smpro-misc. + config CS5535_MFGPT tristate "CS5535/CS5536 Geode Multi-Function General Purpose Timer (MFGPT) support" depends on MFD_CS5535 diff --git a/drivers/misc/Makefile b/drivers/misc/Makefile index bbe24d4511a3..87b54a4a4422 100644 --- a/drivers/misc/Makefile +++ b/drivers/misc/Makefile @@ -24,6 +24,7 @@ obj-$(CONFIG_KGDB_TESTS) += kgdbts.o obj-$(CONFIG_SGI_XP) += sgi-xp/ obj-$(CONFIG_SGI_GRU) += sgi-gru/ obj-$(CONFIG_SMPRO_ERRMON) += smpro-errmon.o +obj-$(CONFIG_SMPRO_MISC) += smpro-misc.o obj-$(CONFIG_CS5535_MFGPT) += cs5535-mfgpt.o obj-$(CONFIG_GEHC_ACHC) += gehc-achc.o obj-$(CONFIG_HP_ILO) += hpilo.o diff --git a/drivers/misc/smpro-misc.c b/drivers/misc/smpro-misc.c new file mode 100644 index 000000000000..6c427141e51b --- /dev/null +++ b/drivers/misc/smpro-misc.c @@ -0,0 +1,145 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Ampere Computing SoC's SMpro Misc Driver + * + * Copyright (c) 2022, Ampere Computing LLC + */ +#include +#include +#include +#include + +/* Boot Stage/Progress Registers */ +#define BOOTSTAGE 0xB0 +#define BOOTSTAGE_LO 0xB1 +#define CUR_BOOTSTAGE 0xB2 +#define BOOTSTAGE_HI 0xB3 + +/* SOC State Registers */ +#define SOC_POWER_LIMIT 0xE5 + +struct smpro_misc { + struct regmap *regmap; +}; + +static ssize_t boot_progress_show(struct device *dev, struct device_attribute *da, char *buf) +{ + struct smpro_misc *misc = dev_get_drvdata(dev); + u16 boot_progress[3] = { 0 }; + u32 bootstage; + u8 boot_stage; + u8 cur_stage; + u32 reg_lo; + u32 reg; + int ret; + + /* Read current boot stage */ + ret = regmap_read(misc->regmap, CUR_BOOTSTAGE, ®); + if (ret) + return ret; + + cur_stage = reg & 0xff; + + ret = regmap_read(misc->regmap, BOOTSTAGE, &bootstage); + if (ret) + return ret; + + boot_stage = (bootstage >> 8) & 0xff; + + if (boot_stage > cur_stage) + return -EINVAL; + + ret = regmap_read(misc->regmap, BOOTSTAGE_LO, ®_lo); + if (!ret) + ret = regmap_read(misc->regmap, BOOTSTAGE_HI, ®); + if (ret) + return ret; + + /* Firmware to report new boot stage next time */ + if (boot_stage < cur_stage) { + ret = regmap_write(misc->regmap, BOOTSTAGE, ((bootstage & 0xff00) | 0x1)); + if (ret) + return ret; + } + + boot_progress[0] = bootstage; + boot_progress[1] = swab16(reg); + boot_progress[2] = swab16(reg_lo); + + return sysfs_emit(buf, "%*phN\n", (int)sizeof(boot_progress), boot_progress); +} + +static DEVICE_ATTR_RO(boot_progress); + +static ssize_t soc_power_limit_show(struct device *dev, struct device_attribute *da, char *buf) +{ + struct smpro_misc *misc = dev_get_drvdata(dev); + unsigned int value; + int ret; + + ret = regmap_read(misc->regmap, SOC_POWER_LIMIT, &value); + if (ret) + return ret; + + return sysfs_emit(buf, "%d\n", value); +} + +static ssize_t soc_power_limit_store(struct device *dev, struct device_attribute *da, + const char *buf, size_t count) +{ + struct smpro_misc *misc = dev_get_drvdata(dev); + unsigned long val; + s32 ret; + + ret = kstrtoul(buf, 0, &val); + if (ret) + return ret; + + ret = regmap_write(misc->regmap, SOC_POWER_LIMIT, (unsigned int)val); + if (ret) + return -EPROTO; + + return count; +} + +static DEVICE_ATTR_RW(soc_power_limit); + +static struct attribute *smpro_misc_attrs[] = { + &dev_attr_boot_progress.attr, + &dev_attr_soc_power_limit.attr, + NULL +}; + +ATTRIBUTE_GROUPS(smpro_misc); + +static int smpro_misc_probe(struct platform_device *pdev) +{ + struct smpro_misc *misc; + + misc = devm_kzalloc(&pdev->dev, sizeof(struct smpro_misc), GFP_KERNEL); + if (!misc) + return -ENOMEM; + + platform_set_drvdata(pdev, misc); + + misc->regmap = dev_get_regmap(pdev->dev.parent, NULL); + if (!misc->regmap) + return -ENODEV; + + return 0; +} + +static struct platform_driver smpro_misc_driver = { + .probe = smpro_misc_probe, + .driver = { + .name = "smpro-misc", + .dev_groups = smpro_misc_groups, + }, +}; + +module_platform_driver(smpro_misc_driver); + +MODULE_AUTHOR("Tung Nguyen "); +MODULE_AUTHOR("Quan Nguyen "); +MODULE_DESCRIPTION("Ampere Altra SMpro Misc driver"); +MODULE_LICENSE("GPL"); -- cgit v1.2.3 From 8749c27895a369a99e4a21709b3e3bec4785778f Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Fri, 23 Sep 2022 22:39:13 +0800 Subject: habanalabs: fix return value check in hl_fw_get_sec_attest_data() If hl_cpu_accessible_dma_pool_alloc() fails, we should check 'req_cpu_addr', fix it. Fixes: 0c88760f8f5e ("habanalabs/gaudi2: add secured attestation info uapi") Signed-off-by: Yang Yingliang Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/firmware_if.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/common/firmware_if.c b/drivers/misc/habanalabs/common/firmware_if.c index 2de6a9bd564d..f18e53bbba6b 100644 --- a/drivers/misc/habanalabs/common/firmware_if.c +++ b/drivers/misc/habanalabs/common/firmware_if.c @@ -2983,7 +2983,7 @@ static int hl_fw_get_sec_attest_data(struct hl_device *hdev, u32 packet_id, void int rc; req_cpu_addr = hl_cpu_accessible_dma_pool_alloc(hdev, size, &req_dma_addr); - if (!data) { + if (!req_cpu_addr) { dev_err(hdev->dev, "Failed to allocate DMA memory for CPU-CP packet %u\n", packet_id); return -ENOMEM; -- cgit v1.2.3 From a925d90b365aa38565191857bddc3c12d80fda96 Mon Sep 17 00:00:00 2001 From: Ofir Bitton Date: Tue, 23 Aug 2022 15:14:14 +0300 Subject: habanalabs: allow control device open during reset Monitoring apps would like to query device state at any time so we should allow it also during reset because it doesn't involve accessing the h/w. Signed-off-by: Ofir Bitton Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/device.c | 22 ++++++++++++++++++++++ drivers/misc/habanalabs/common/habanalabs.h | 2 ++ drivers/misc/habanalabs/common/habanalabs_drv.c | 4 ++-- 3 files changed, 26 insertions(+), 2 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c index 233d8b46c831..1aaaa2004e34 100644 --- a/drivers/misc/habanalabs/common/device.c +++ b/drivers/misc/habanalabs/common/device.c @@ -355,6 +355,28 @@ bool hl_device_operational(struct hl_device *hdev, } } +bool hl_ctrl_device_operational(struct hl_device *hdev, + enum hl_device_status *status) +{ + enum hl_device_status current_status; + + current_status = hl_device_status(hdev); + if (status) + *status = current_status; + + switch (current_status) { + case HL_DEVICE_STATUS_MALFUNCTION: + return false; + case HL_DEVICE_STATUS_IN_RESET: + case HL_DEVICE_STATUS_IN_RESET_AFTER_DEVICE_RELEASE: + case HL_DEVICE_STATUS_NEEDS_RESET: + case HL_DEVICE_STATUS_OPERATIONAL: + case HL_DEVICE_STATUS_IN_DEVICE_CREATION: + default: + return true; + } +} + static void hpriv_release(struct kref *ref) { u64 idle_mask[HL_BUSY_ENGINES_MASK_EXT_SIZE] = {0}; diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h index 58c95b13be69..2ffb8378f565 100644 --- a/drivers/misc/habanalabs/common/habanalabs.h +++ b/drivers/misc/habanalabs/common/habanalabs.h @@ -3496,6 +3496,8 @@ int hl_device_open(struct inode *inode, struct file *filp); int hl_device_open_ctrl(struct inode *inode, struct file *filp); bool hl_device_operational(struct hl_device *hdev, enum hl_device_status *status); +bool hl_ctrl_device_operational(struct hl_device *hdev, + enum hl_device_status *status); enum hl_device_status hl_device_status(struct hl_device *hdev); int hl_device_set_debug_mode(struct hl_device *hdev, struct hl_ctx *ctx, bool enable); int hl_hw_queues_create(struct hl_device *hdev); diff --git a/drivers/misc/habanalabs/common/habanalabs_drv.c b/drivers/misc/habanalabs/common/habanalabs_drv.c index 112632afe7d5..3ee44ea58d5c 100644 --- a/drivers/misc/habanalabs/common/habanalabs_drv.c +++ b/drivers/misc/habanalabs/common/habanalabs_drv.c @@ -270,9 +270,9 @@ int hl_device_open_ctrl(struct inode *inode, struct file *filp) mutex_lock(&hdev->fpriv_ctrl_list_lock); - if (!hl_device_operational(hdev, NULL)) { + if (!hl_ctrl_device_operational(hdev, NULL)) { dev_dbg_ratelimited(hdev->dev_ctrl, - "Can't open %s because it is disabled or in reset\n", + "Can't open %s because it is disabled\n", dev_name(hdev->dev_ctrl)); rc = -EPERM; goto out_err; -- cgit v1.2.3 From ea73ef14ddf93b8b1ae6ce1963846f43a81bb510 Mon Sep 17 00:00:00 2001 From: Oded Gabbay Date: Thu, 22 Sep 2022 12:30:32 +0300 Subject: habanalabs: Use simplified API for p2p dist calc Use the simplified API that calculates distance between two devices. Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/common/memory.c b/drivers/misc/habanalabs/common/memory.c index ef28f3b37b93..99b1d6ce26ae 100644 --- a/drivers/misc/habanalabs/common/memory.c +++ b/drivers/misc/habanalabs/common/memory.c @@ -1689,7 +1689,7 @@ static int hl_dmabuf_attach(struct dma_buf *dmabuf, hl_dmabuf = dmabuf->priv; hdev = hl_dmabuf->ctx->hdev; - rc = pci_p2pdma_distance_many(hdev->pdev, &attachment->dev, 1, true); + rc = pci_p2pdma_distance(hdev->pdev, attachment->dev, true); if (rc < 0) attachment->peer2peer = false; -- cgit v1.2.3 From 52d5e5469526216bcc418f26a2796d5af6226023 Mon Sep 17 00:00:00 2001 From: Dani Liberman Date: Mon, 19 Sep 2022 18:51:59 +0300 Subject: habanalabs: refactor razwi event notification This event notification was compatible only with gaudi, where razwi and page fault happens together. To make it compatible with all ASICs, this refactor contains: 1. Razwi notification will only notify about razwi info. New notification will be added in future patch, to retrieve data about page fault error. 2. Changed razwi info structure to support all ASICs. Signed-off-by: Dani Liberman Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/device.c | 22 ++++++++ drivers/misc/habanalabs/common/habanalabs.h | 31 +++-------- drivers/misc/habanalabs/common/habanalabs_drv.c | 2 +- drivers/misc/habanalabs/common/habanalabs_ioctl.c | 12 ++--- drivers/misc/habanalabs/gaudi/gaudi.c | 64 ++++++++++------------- include/uapi/misc/habanalabs.h | 45 ++++++++++------ 6 files changed, 90 insertions(+), 86 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c index 1aaaa2004e34..30ddaaae67e5 100644 --- a/drivers/misc/habanalabs/common/device.c +++ b/drivers/misc/habanalabs/common/device.c @@ -2253,3 +2253,25 @@ inline void hl_wreg(struct hl_device *hdev, u32 reg, u32 val) { writel(val, hdev->rmmio + reg); } + +void hl_capture_razwi(struct hl_device *hdev, u64 addr, u16 *engine_id, u16 num_of_engines, + u8 flags) +{ + if (num_of_engines > HL_RAZWI_MAX_NUM_OF_ENGINES_PER_RTR) { + dev_err(hdev->dev, + "Number of possible razwi initiators (%u) exceeded limit (%u)\n", + num_of_engines, HL_RAZWI_MAX_NUM_OF_ENGINES_PER_RTR); + return; + } + + /* In case it's the first razwi since the device was opened, capture its parameters */ + if (atomic_cmpxchg(&hdev->captured_err_info.razwi_info_recorded, 0, 1)) + return; + + hdev->captured_err_info.razwi.timestamp = ktime_to_ns(ktime_get()); + hdev->captured_err_info.razwi.addr = addr; + hdev->captured_err_info.razwi.num_of_possible_engines = num_of_engines; + memcpy(&hdev->captured_err_info.razwi.engine_id[0], &engine_id[0], + num_of_engines * sizeof(u16)); + hdev->captured_err_info.razwi.flags = flags; +} diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h index 2ffb8378f565..cdc50c2c4de8 100644 --- a/drivers/misc/habanalabs/common/habanalabs.h +++ b/drivers/misc/habanalabs/common/habanalabs.h @@ -2925,30 +2925,6 @@ struct cs_timeout_info { u64 seq; }; -/** - * struct razwi_info - info about last razwi error occurred. - * @timestamp: razwi timestamp. - * @write_enable: if set writing to razwi parameters in the structure is enabled. - * otherwise - disabled, so the first (root cause) razwi will not be overwritten. - * @addr: address that caused razwi. - * @engine_id_1: engine id of the razwi initiator, if it was initiated by engine that does - * not have engine id it will be set to U16_MAX. - * @engine_id_2: second engine id of razwi initiator. Might happen that razwi have 2 possible - * engines which one them caused the razwi. In that case, it will contain the - * second possible engine id, otherwise it will be set to U16_MAX. - * @non_engine_initiator: in case the initiator of the razwi does not have engine id. - * @type: cause of razwi, page fault or access error, otherwise it will be set to U8_MAX. - */ -struct razwi_info { - ktime_t timestamp; - atomic_t write_enable; - u64 addr; - u16 engine_id_1; - u16 engine_id_2; - u8 non_engine_initiator; - u8 type; -}; - #define MAX_QMAN_STREAMS_INFO 4 #define OPCODE_INFO_MAX_ADDR_SIZE 8 /** @@ -2985,11 +2961,14 @@ struct undefined_opcode_info { * struct hl_error_info - holds information collected during an error. * @cs_timeout: CS timeout error information. * @razwi: razwi information. + * @razwi_info_recorded: if set writing to razwi information is enabled. + * otherwise - disabled, so the first (root cause) razwi will not be overwritten. * @undef_opcode: undefined opcode information */ struct hl_error_info { struct cs_timeout_info cs_timeout; - struct razwi_info razwi; + struct hl_info_razwi_event razwi; + atomic_t razwi_info_recorded; struct undefined_opcode_info undef_opcode; }; @@ -3800,6 +3779,8 @@ hl_mmap_mem_buf_alloc(struct hl_mem_mgr *mmg, struct hl_mmap_mem_buf_behavior *behavior, gfp_t gfp, void *args); __printf(2, 3) void hl_engine_data_sprintf(struct engines_data *e, const char *fmt, ...); +void hl_capture_razwi(struct hl_device *hdev, u64 addr, u16 *engine_id, u16 num_of_engines, + u8 flags); #ifdef CONFIG_DEBUG_FS diff --git a/drivers/misc/habanalabs/common/habanalabs_drv.c b/drivers/misc/habanalabs/common/habanalabs_drv.c index 3ee44ea58d5c..d87434b9bc16 100644 --- a/drivers/misc/habanalabs/common/habanalabs_drv.c +++ b/drivers/misc/habanalabs/common/habanalabs_drv.c @@ -212,7 +212,7 @@ int hl_device_open(struct inode *inode, struct file *filp) hl_debugfs_add_file(hpriv); atomic_set(&hdev->captured_err_info.cs_timeout.write_enable, 1); - atomic_set(&hdev->captured_err_info.razwi.write_enable, 1); + atomic_set(&hdev->captured_err_info.razwi_info_recorded, 0); hdev->captured_err_info.undef_opcode.write_enable = true; hdev->open_counter++; diff --git a/drivers/misc/habanalabs/common/habanalabs_ioctl.c b/drivers/misc/habanalabs/common/habanalabs_ioctl.c index 43afe40966e5..6aef4e24d122 100644 --- a/drivers/misc/habanalabs/common/habanalabs_ioctl.c +++ b/drivers/misc/habanalabs/common/habanalabs_ioctl.c @@ -603,20 +603,14 @@ static int razwi_info(struct hl_fpriv *hpriv, struct hl_info_args *args) { struct hl_device *hdev = hpriv->hdev; u32 max_size = args->return_size; - struct hl_info_razwi_event info = {0}; + struct hl_info_razwi_event *info = &hdev->captured_err_info.razwi; void __user *out = (void __user *) (uintptr_t) args->return_pointer; if ((!max_size) || (!out)) return -EINVAL; - info.timestamp = ktime_to_ns(hdev->captured_err_info.razwi.timestamp); - info.addr = hdev->captured_err_info.razwi.addr; - info.engine_id_1 = hdev->captured_err_info.razwi.engine_id_1; - info.engine_id_2 = hdev->captured_err_info.razwi.engine_id_2; - info.no_engine_id = hdev->captured_err_info.razwi.non_engine_initiator; - info.error_type = hdev->captured_err_info.razwi.type; - - return copy_to_user(out, &info, min_t(size_t, max_size, sizeof(info))) ? -EFAULT : 0; + return copy_to_user(out, info, min_t(size_t, max_size, sizeof(struct hl_info_razwi_event))) + ? -EFAULT : 0; } static int undefined_opcode_info(struct hl_fpriv *hpriv, struct hl_info_args *args) diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c index 92560414e843..f856ac51fde1 100644 --- a/drivers/misc/habanalabs/gaudi/gaudi.c +++ b/drivers/misc/habanalabs/gaudi/gaudi.c @@ -6505,8 +6505,8 @@ event_not_supported: } static const char *gaudi_get_razwi_initiator_dma_name(struct hl_device *hdev, u32 x_y, - bool is_write, s32 *engine_id_1, - s32 *engine_id_2) + bool is_write, u16 *engine_id_1, + u16 *engine_id_2) { u32 dma_id[2], dma_offset, err_cause[2], mask, i; @@ -6603,7 +6603,7 @@ unknown_initiator: } static const char *gaudi_get_razwi_initiator_name(struct hl_device *hdev, bool is_write, - u32 *engine_id_1, u32 *engine_id_2) + u16 *engine_id_1, u16 *engine_id_2) { u32 val, x_y, axi_id; @@ -6719,8 +6719,8 @@ static const char *gaudi_get_razwi_initiator_name(struct hl_device *hdev, bool i return "unknown initiator"; } -static void gaudi_print_and_get_razwi_info(struct hl_device *hdev, u32 *engine_id_1, - u32 *engine_id_2) +static void gaudi_print_and_get_razwi_info(struct hl_device *hdev, u16 *engine_id_1, + u16 *engine_id_2, bool *is_read, bool *is_write) { if (RREG32(mmMMU_UP_RAZWI_WRITE_VLD)) { @@ -6728,6 +6728,7 @@ static void gaudi_print_and_get_razwi_info(struct hl_device *hdev, u32 *engine_i "RAZWI event caused by illegal write of %s\n", gaudi_get_razwi_initiator_name(hdev, true, engine_id_1, engine_id_2)); WREG32(mmMMU_UP_RAZWI_WRITE_VLD, 0); + *is_write = true; } if (RREG32(mmMMU_UP_RAZWI_READ_VLD)) { @@ -6735,10 +6736,11 @@ static void gaudi_print_and_get_razwi_info(struct hl_device *hdev, u32 *engine_i "RAZWI event caused by illegal read of %s\n", gaudi_get_razwi_initiator_name(hdev, false, engine_id_1, engine_id_2)); WREG32(mmMMU_UP_RAZWI_READ_VLD, 0); + *is_read = true; } } -static void gaudi_print_and_get_mmu_error_info(struct hl_device *hdev, u64 *addr, u8 *type) +static void gaudi_print_and_get_mmu_error_info(struct hl_device *hdev, u64 *addr) { struct gaudi_device *gaudi = hdev->asic_specific; u32 val; @@ -6753,8 +6755,6 @@ static void gaudi_print_and_get_mmu_error_info(struct hl_device *hdev, u64 *addr *addr |= RREG32(mmMMU_UP_PAGE_ERROR_CAPTURE_VA); dev_err_ratelimited(hdev->dev, "MMU page fault on va 0x%llx\n", *addr); - *type = HL_RAZWI_PAGE_FAULT; - WREG32(mmMMU_UP_PAGE_ERROR_CAPTURE, 0); } @@ -6765,7 +6765,6 @@ static void gaudi_print_and_get_mmu_error_info(struct hl_device *hdev, u64 *addr *addr |= RREG32(mmMMU_UP_ACCESS_ERROR_CAPTURE_VA); dev_err_ratelimited(hdev->dev, "MMU access error on va 0x%llx\n", *addr); - *type = HL_RAZWI_MMU_ACCESS_ERROR; WREG32(mmMMU_UP_ACCESS_ERROR_CAPTURE, 0); } @@ -7302,46 +7301,41 @@ static void gaudi_handle_qman_err(struct hl_device *hdev, u16 event_type, u64 *e static void gaudi_print_irq_info(struct hl_device *hdev, u16 event_type, bool razwi) { - u32 engine_id_1, engine_id_2; + bool is_read = false, is_write = false; + u16 engine_id[2], num_of_razwi_eng = 0; char desc[64] = ""; u64 razwi_addr = 0; - u8 razwi_type; - int rc; + u8 razwi_flags = 0; /* * Init engine id by default as not valid and only if razwi initiated from engine with * engine id it will get valid value. - * Init razwi type to default, will be changed only if razwi caused by page fault of - * MMU access error */ - engine_id_1 = U16_MAX; - engine_id_2 = U16_MAX; - razwi_type = U8_MAX; + engine_id[0] = HL_RAZWI_NA_ENG_ID; + engine_id[1] = HL_RAZWI_NA_ENG_ID; gaudi_get_event_desc(event_type, desc, sizeof(desc)); dev_err_ratelimited(hdev->dev, "Received H/W interrupt %d [\"%s\"]\n", event_type, desc); if (razwi) { - gaudi_print_and_get_razwi_info(hdev, &engine_id_1, &engine_id_2); - gaudi_print_and_get_mmu_error_info(hdev, &razwi_addr, &razwi_type); - - /* In case it's the first razwi, save its parameters*/ - rc = atomic_cmpxchg(&hdev->captured_err_info.razwi.write_enable, 1, 0); - if (rc) { - hdev->captured_err_info.razwi.timestamp = ktime_get(); - hdev->captured_err_info.razwi.addr = razwi_addr; - hdev->captured_err_info.razwi.engine_id_1 = engine_id_1; - hdev->captured_err_info.razwi.engine_id_2 = engine_id_2; - /* - * If first engine id holds non valid value the razwi initiator - * does not have engine id - */ - hdev->captured_err_info.razwi.non_engine_initiator = - (engine_id_1 == U16_MAX); - hdev->captured_err_info.razwi.type = razwi_type; - + gaudi_print_and_get_razwi_info(hdev, &engine_id[0], &engine_id[1], &is_read, + &is_write); + gaudi_print_and_get_mmu_error_info(hdev, &razwi_addr); + + if (is_read) + razwi_flags |= HL_RAZWI_READ; + if (is_write) + razwi_flags |= HL_RAZWI_WRITE; + + if (engine_id[0] != HL_RAZWI_NA_ENG_ID) { + if (engine_id[1] != HL_RAZWI_NA_ENG_ID) + num_of_razwi_eng = 2; + else + num_of_razwi_eng = 1; } + + hl_capture_razwi(hdev, razwi_addr, engine_id, num_of_razwi_eng, razwi_flags); } } diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h index e00ebe05097d..d6f84cb35e3d 100644 --- a/include/uapi/misc/habanalabs.h +++ b/include/uapi/misc/habanalabs.h @@ -1071,31 +1071,44 @@ struct hl_info_cs_timeout_event { __u64 seq; }; -#define HL_RAZWI_PAGE_FAULT 0 -#define HL_RAZWI_MMU_ACCESS_ERROR 1 +#define HL_RAZWI_NA_ENG_ID U16_MAX +#define HL_RAZWI_MAX_NUM_OF_ENGINES_PER_RTR 128 +#define HL_RAZWI_READ BIT(0) +#define HL_RAZWI_WRITE BIT(1) +#define HL_RAZWI_LBW BIT(2) +#define HL_RAZWI_HBW BIT(3) +#define HL_RAZWI_RR BIT(4) +#define HL_RAZWI_ADDR_DEC BIT(5) /** * struct hl_info_razwi_event - razwi information. * @timestamp: timestamp of razwi. * @addr: address which accessing it caused razwi. - * @engine_id_1: engine id of the razwi initiator, if it was initiated by engine that does not - * have engine id it will be set to U16_MAX. - * @engine_id_2: second engine id of razwi initiator. Might happen that razwi have 2 possible - * engines which one them caused the razwi. In that case, it will contain the - * second possible engine id, otherwise it will be set to U16_MAX. - * @no_engine_id: if razwi initiator does not have engine id, this field will be set to 1, - * otherwise 0. - * @error_type: cause of razwi, page fault or access error, otherwise it will be set to U8_MAX. - * @pad: padding to 64 bit. + * @engine_id: engine id of the razwi initiator, if it was initiated by engine that does not + * have engine id it will be set to HL_RAZWI_NA_ENG_ID. If there are several possible + * engines which caused the razwi, it will hold all of them. + * @num_of_possible_engines: contains number of possible engine ids. In some asics, razwi indication + * might be common for several engines and there is no way to get the + * exact engine. In this way, engine_id array will be filled with all + * possible engines caused this razwi. Also, there might be possibility + * in gaudi, where we don't indication on specific engine, in that case + * the value of this parameter will be zero. + * @flags: bitmask for additional data: HL_RAZWI_READ - razwi caused by read operation + * HL_RAZWI_WRITE - razwi caused by write operation + * HL_RAZWI_LBW - razwi caused by lbw fabric transaction + * HL_RAZWI_HBW - razwi caused by hbw fabric transaction + * HL_RAZWI_RR - razwi caused by range register + * HL_RAZWI_ADDR_DEC - razwi caused by address decode error + * Note: this data is not supported by all asics, in that case the relevant bits will not + * be set. */ struct hl_info_razwi_event { __s64 timestamp; __u64 addr; - __u16 engine_id_1; - __u16 engine_id_2; - __u8 no_engine_id; - __u8 error_type; - __u8 pad[2]; + __u16 engine_id[HL_RAZWI_MAX_NUM_OF_ENGINES_PER_RTR]; + __u16 num_of_possible_engines; + __u8 flags; + __u8 pad[5]; }; #define MAX_QMAN_STREAMS_INFO 4 -- cgit v1.2.3 From 0502df9bbea0bd0e77c4a283e4cc34801038899a Mon Sep 17 00:00:00 2001 From: Bharat Jauhari Date: Tue, 27 Sep 2022 14:38:38 +0300 Subject: habanalabs: use lower_32_bits() This fixes sparse warning on doing cast to 32-bits Signed-off-by: Bharat Jauhari Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/habanalabs.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h index cdc50c2c4de8..f4b3fa4b0976 100644 --- a/drivers/misc/habanalabs/common/habanalabs.h +++ b/drivers/misc/habanalabs/common/habanalabs.h @@ -2528,7 +2528,7 @@ void hl_wreg(struct hl_device *hdev, u32 reg, u32 val); break; \ (val) = __elbi_read; \ } else {\ - (val) = RREG32((u32)(addr)); \ + (val) = RREG32(lower_32_bits(addr)); \ } \ if (cond) \ break; \ @@ -2539,7 +2539,7 @@ void hl_wreg(struct hl_device *hdev, u32 reg, u32 val); break; \ (val) = __elbi_read; \ } else {\ - (val) = RREG32((u32)(addr)); \ + (val) = RREG32(lower_32_bits(addr)); \ } \ break; \ } \ @@ -2594,7 +2594,7 @@ void hl_wreg(struct hl_device *hdev, u32 reg, u32 val); if (__rc) \ break; \ } else { \ - __read_val = RREG32((u32)(addr_arr)[__arr_idx]); \ + __read_val = RREG32(lower_32_bits(addr_arr[__arr_idx])); \ } \ if (__read_val == (expected_val)) \ __elem_bitmask &= ~BIT_ULL(__arr_idx); \ -- cgit v1.2.3 From 6d1c567f2ac66391edf5423247f27c82f6b82d86 Mon Sep 17 00:00:00 2001 From: Tomer Tayar Date: Thu, 22 Sep 2022 15:25:46 +0300 Subject: habanalabs/gaudi2: fix module ID for RAZWI handling RAZWI is optionally handled as part of the generic QM SEI error handling, but it always uses PDMA as the module ID. Fix it to use the suitable module ID according to the specific event. Signed-off-by: Tomer Tayar Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/gaudi2/gaudi2.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/gaudi2/gaudi2.c b/drivers/misc/habanalabs/gaudi2/gaudi2.c index 65e6cae6100a..b3685978f6ae 100644 --- a/drivers/misc/habanalabs/gaudi2/gaudi2.c +++ b/drivers/misc/habanalabs/gaudi2/gaudi2.c @@ -7602,6 +7602,7 @@ static void _gaudi2_handle_qm_sei_err(struct hl_device *hdev, u64 qman_base) static void gaudi2_handle_qm_sei_err(struct hl_device *hdev, u16 event_type, struct hl_eq_razwi_info *razwi_info) { + enum razwi_event_sources module; u64 qman_base; u8 index; @@ -7611,9 +7612,11 @@ static void gaudi2_handle_qm_sei_err(struct hl_device *hdev, u16 event_type, qman_base = mmDCORE0_TPC0_QM_BASE + (index / NUM_OF_TPC_PER_DCORE) * DCORE_OFFSET + (index % NUM_OF_TPC_PER_DCORE) * DCORE_TPC_OFFSET; + module = RAZWI_TPC; break; case GAUDI2_EVENT_TPC24_AXI_ERR_RSP: qman_base = mmDCORE0_TPC6_QM_BASE; + module = RAZWI_TPC; break; case GAUDI2_EVENT_MME0_CTRL_AXI_ERROR_RESPONSE: case GAUDI2_EVENT_MME1_CTRL_AXI_ERROR_RESPONSE: @@ -7623,16 +7626,19 @@ static void gaudi2_handle_qm_sei_err(struct hl_device *hdev, u16 event_type, (GAUDI2_EVENT_MME1_CTRL_AXI_ERROR_RESPONSE - GAUDI2_EVENT_MME0_CTRL_AXI_ERROR_RESPONSE); qman_base = mmDCORE0_MME_QM_BASE + index * DCORE_OFFSET; + module = RAZWI_MME; break; case GAUDI2_EVENT_PDMA_CH0_AXI_ERR_RSP: case GAUDI2_EVENT_PDMA_CH1_AXI_ERR_RSP: index = event_type - GAUDI2_EVENT_PDMA_CH0_AXI_ERR_RSP; qman_base = mmPDMA0_QM_BASE + index * PDMA_OFFSET; + module = RAZWI_PDMA; break; case GAUDI2_EVENT_ROTATOR0_AXI_ERROR_RESPONSE: case GAUDI2_EVENT_ROTATOR1_AXI_ERROR_RESPONSE: index = event_type - GAUDI2_EVENT_ROTATOR0_AXI_ERROR_RESPONSE; qman_base = mmROT0_QM_BASE + index * ROT_OFFSET; + module = RAZWI_ROT; break; default: return; @@ -7647,7 +7653,7 @@ static void gaudi2_handle_qm_sei_err(struct hl_device *hdev, u16 event_type, /* check if RAZWI happened */ if (razwi_info) - gaudi2_ack_module_razwi_event_handler(hdev, RAZWI_PDMA, 0, 0, razwi_info); + gaudi2_ack_module_razwi_event_handler(hdev, module, 0, 0, razwi_info); } static void gaudi2_handle_qman_err(struct hl_device *hdev, u16 event_type) -- cgit v1.2.3 From dd600db47ba60c3c69d4d24c73c43133c6040118 Mon Sep 17 00:00:00 2001 From: Dani Liberman Date: Sun, 18 Sep 2022 21:37:31 +0300 Subject: habanalabs: add page fault info uapi Only the first page fault will be saved. Besides the address which caused the page fault, the driver captures all of the mmu user mappings. User can retrieve this data via the new uapi (new opcode in INFO ioctl). Signed-off-by: Dani Liberman Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/device.c | 58 +++++++++++++++++++++++ drivers/misc/habanalabs/common/habanalabs.h | 22 ++++++++- drivers/misc/habanalabs/common/habanalabs_drv.c | 1 + drivers/misc/habanalabs/common/habanalabs_ioctl.c | 42 ++++++++++++++++ drivers/misc/habanalabs/gaudi/gaudi.c | 2 + include/uapi/misc/habanalabs.h | 31 ++++++++++++ 6 files changed, 155 insertions(+), 1 deletion(-) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c index 30ddaaae67e5..5dc6c77b4721 100644 --- a/drivers/misc/habanalabs/common/device.c +++ b/drivers/misc/habanalabs/common/device.c @@ -12,6 +12,7 @@ #include #include +#include #include @@ -2199,6 +2200,8 @@ void hl_device_fini(struct hl_device *hdev) hl_mmu_fini(hdev); + vfree(hdev->captured_err_info.pgf_info.user_mappings); + hl_eq_fini(hdev, &hdev->event_queue); kfree(hdev->shadow_cs_queue); @@ -2275,3 +2278,58 @@ void hl_capture_razwi(struct hl_device *hdev, u64 addr, u16 *engine_id, u16 num_ num_of_engines * sizeof(u16)); hdev->captured_err_info.razwi.flags = flags; } +static void hl_capture_user_mappings(struct hl_device *hdev) +{ + struct page_fault_info *pgf_info = &hdev->captured_err_info.pgf_info; + struct hl_vm_hash_node *hnode; + struct hl_userptr *userptr; + struct hl_ctx *ctx; + u32 map_idx = 0; + int i; + + ctx = hl_get_compute_ctx(hdev); + if (!ctx) { + dev_err(hdev->dev, "Can't get user context for user mappings\n"); + return; + } + + mutex_lock(&ctx->mem_hash_lock); + hash_for_each(ctx->mem_hash, i, hnode, node) + pgf_info->num_of_user_mappings++; + + if (!pgf_info->num_of_user_mappings) + goto finish; + + /* In case we already allocated in previous session, need to release it before + * allocating new buffer. + */ + vfree(pgf_info->user_mappings); + pgf_info->user_mappings = + vmalloc(pgf_info->num_of_user_mappings * sizeof(struct hl_user_mapping)); + if (!pgf_info->user_mappings) { + pgf_info->num_of_user_mappings = 0; + goto finish; + } + + hash_for_each(ctx->mem_hash, i, hnode, node) { + userptr = hnode->ptr; + pgf_info->user_mappings[map_idx].dev_va = hnode->vaddr; + pgf_info->user_mappings[map_idx].size = userptr->size; + map_idx++; + } +finish: + mutex_unlock(&ctx->mem_hash_lock); + hl_ctx_put(ctx); +} + +void hl_capture_page_fault(struct hl_device *hdev, u64 addr, u16 eng_id, bool is_pmmu) +{ + /* Capture only the first page fault */ + if (atomic_cmpxchg(&hdev->captured_err_info.pgf_info_recorded, 0, 1)) + return; + + hdev->captured_err_info.pgf_info.pgf.timestamp = ktime_to_ns(ktime_get()); + hdev->captured_err_info.pgf_info.pgf.addr = addr; + hdev->captured_err_info.pgf_info.pgf.engine_id = eng_id; + hl_capture_user_mappings(hdev); +} diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h index f4b3fa4b0976..1489240d5a3a 100644 --- a/drivers/misc/habanalabs/common/habanalabs.h +++ b/drivers/misc/habanalabs/common/habanalabs.h @@ -2957,19 +2957,38 @@ struct undefined_opcode_info { bool write_enable; }; +/** + * struct page_fault_info - info about page fault + * @pgf_info: page fault information. + * @user_mappings: buffer containing user mappings. + * @num_of_user_mappings: number of user mappings. + */ +struct page_fault_info { + struct hl_page_fault_info pgf; + struct hl_user_mapping *user_mappings; + u64 num_of_user_mappings; +}; + /** * struct hl_error_info - holds information collected during an error. * @cs_timeout: CS timeout error information. * @razwi: razwi information. * @razwi_info_recorded: if set writing to razwi information is enabled. - * otherwise - disabled, so the first (root cause) razwi will not be overwritten. + * otherwise - disabled, so the first (root cause) razwi will not be + * overwritten. * @undef_opcode: undefined opcode information + * @pgf_info: page fault information. + * @pgf_info_recorded: if set writing to page fault information is enabled. + * otherwise - disabled, so the first (root cause) page fault will not be + * overwritten. */ struct hl_error_info { struct cs_timeout_info cs_timeout; struct hl_info_razwi_event razwi; atomic_t razwi_info_recorded; struct undefined_opcode_info undef_opcode; + struct page_fault_info pgf_info; + atomic_t pgf_info_recorded; }; /** @@ -3781,6 +3800,7 @@ hl_mmap_mem_buf_alloc(struct hl_mem_mgr *mmg, __printf(2, 3) void hl_engine_data_sprintf(struct engines_data *e, const char *fmt, ...); void hl_capture_razwi(struct hl_device *hdev, u64 addr, u16 *engine_id, u16 num_of_engines, u8 flags); +void hl_capture_page_fault(struct hl_device *hdev, u64 addr, u16 eng_id, bool is_pmmu); #ifdef CONFIG_DEBUG_FS diff --git a/drivers/misc/habanalabs/common/habanalabs_drv.c b/drivers/misc/habanalabs/common/habanalabs_drv.c index d87434b9bc16..714994725224 100644 --- a/drivers/misc/habanalabs/common/habanalabs_drv.c +++ b/drivers/misc/habanalabs/common/habanalabs_drv.c @@ -213,6 +213,7 @@ int hl_device_open(struct inode *inode, struct file *filp) atomic_set(&hdev->captured_err_info.cs_timeout.write_enable, 1); atomic_set(&hdev->captured_err_info.razwi_info_recorded, 0); + atomic_set(&hdev->captured_err_info.pgf_info_recorded, 0); hdev->captured_err_info.undef_opcode.write_enable = true; hdev->open_counter++; diff --git a/drivers/misc/habanalabs/common/habanalabs_ioctl.c b/drivers/misc/habanalabs/common/habanalabs_ioctl.c index 6aef4e24d122..cac2c7fb14f1 100644 --- a/drivers/misc/habanalabs/common/habanalabs_ioctl.c +++ b/drivers/misc/habanalabs/common/habanalabs_ioctl.c @@ -778,6 +778,42 @@ static int engine_status_info(struct hl_fpriv *hpriv, struct hl_info_args *args) return rc; } +static int page_fault_info(struct hl_fpriv *hpriv, struct hl_info_args *args) +{ + struct hl_device *hdev = hpriv->hdev; + u32 max_size = args->return_size; + struct hl_page_fault_info *info = &hdev->captured_err_info.pgf_info.pgf; + void __user *out = (void __user *) (uintptr_t) args->return_pointer; + + if ((!max_size) || (!out)) + return -EINVAL; + + return copy_to_user(out, info, min_t(size_t, max_size, sizeof(struct hl_page_fault_info))) + ? -EFAULT : 0; +} + +static int user_mappings_info(struct hl_fpriv *hpriv, struct hl_info_args *args) +{ + void __user *out = (void __user *) (uintptr_t) args->return_pointer; + u32 user_buf_size = args->return_size; + struct hl_device *hdev = hpriv->hdev; + struct page_fault_info *pgf_info; + u64 actual_size; + + pgf_info = &hdev->captured_err_info.pgf_info; + args->array_size = pgf_info->num_of_user_mappings; + + if (!out) + return -EINVAL; + + actual_size = pgf_info->num_of_user_mappings * sizeof(struct hl_user_mapping); + if (user_buf_size < actual_size) + return -ENOMEM; + + return copy_to_user(out, pgf_info->user_mappings, min_t(size_t, user_buf_size, actual_size)) + ? -EFAULT : 0; +} + static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data, struct device *dev) { @@ -837,6 +873,12 @@ static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data, case HL_INFO_GET_EVENTS: return events_info(hpriv, args); + case HL_INFO_PAGE_FAULT_EVENT: + return page_fault_info(hpriv, args); + + case HL_INFO_USER_MAPPINGS: + return user_mappings_info(hpriv, args); + default: break; } diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c index f856ac51fde1..1a99f7be8b60 100644 --- a/drivers/misc/habanalabs/gaudi/gaudi.c +++ b/drivers/misc/habanalabs/gaudi/gaudi.c @@ -6755,6 +6755,8 @@ static void gaudi_print_and_get_mmu_error_info(struct hl_device *hdev, u64 *addr *addr |= RREG32(mmMMU_UP_PAGE_ERROR_CAPTURE_VA); dev_err_ratelimited(hdev->dev, "MMU page fault on va 0x%llx\n", *addr); + hl_capture_page_fault(hdev, *addr, 0, true); + WREG32(mmMMU_UP_PAGE_ERROR_CAPTURE, 0); } diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h index d6f84cb35e3d..2b794f54e2ed 100644 --- a/include/uapi/misc/habanalabs.h +++ b/include/uapi/misc/habanalabs.h @@ -778,6 +778,9 @@ enum hl_server_type { * HL_INFO_UNREGISTER_EVENTFD - Unregister eventfd * HL_INFO_GET_EVENTS - Retrieve the last occurred events * HL_INFO_UNDEFINED_OPCODE_EVENT - Retrieve last undefined opcode error information. + * HL_INFO_ENGINE_STATUS - Retrieve the status of all the h/w engines in the asic. + * HL_INFO_PAGE_FAULT_EVENT - Retrieve parameters of captured page fault. + * HL_INFO_USER_MAPPINGS - Retrieve user mappings, captured after page fault event. */ #define HL_INFO_HW_IP_INFO 0 #define HL_INFO_HW_EVENTS 1 @@ -809,6 +812,8 @@ enum hl_server_type { #define HL_INFO_GET_EVENTS 30 #define HL_INFO_UNDEFINED_OPCODE_EVENT 31 #define HL_INFO_ENGINE_STATUS 32 +#define HL_INFO_PAGE_FAULT_EVENT 33 +#define HL_INFO_USER_MAPPINGS 34 #define HL_INFO_VERSION_MAX_LEN 128 #define HL_INFO_CARD_NAME_MAX_LEN 16 @@ -1187,6 +1192,29 @@ struct hl_info_sec_attest { __u8 pad0[2]; }; +/** + * struct hl_page_fault_info - page fault information. + * @timestamp: timestamp of page fault. + * @addr: address which accessing it caused page fault. + * @engine_id: engine id which caused the page fault, supported only in gaudi3. + */ +struct hl_page_fault_info { + __s64 timestamp; + __u64 addr; + __u16 engine_id; + __u8 pad[6]; +}; + +/** + * struct hl_user_mapping - user mapping information. + * @dev_va: device virtual address. + * @size: virtual address mapping size. + */ +struct hl_user_mapping { + __u64 dev_va; + __u64 size; +}; + enum gaudi_dcores { HL_GAUDI_WS_DCORE, HL_GAUDI_WN_DCORE, @@ -1213,6 +1241,8 @@ enum gaudi_dcores { * needed, hence updating this variable so user will know the exact amount * of bytes copied by the kernel to the buffer. * @sec_attest_nonce: Nonce number used for attestation report. + * @array_size: Number of array members copied to user buffer. + * Relevant for HL_INFO_USER_MAPPINGS info ioctl. * @pad: Padding to 64 bit. */ struct hl_info_args { @@ -1228,6 +1258,7 @@ struct hl_info_args { __u32 eventfd; __u32 user_buffer_actual_size; __u32 sec_attest_nonce; + __u32 array_size; }; __u32 pad; -- cgit v1.2.3 From 189b203ebbea181d678b4d8bf3547eb78c8ae44a Mon Sep 17 00:00:00 2001 From: Dafna Hirschfeld Date: Wed, 28 Sep 2022 11:38:00 +0300 Subject: habanalabs: replace 'pf' to 'prefetch' pf was an abbreviation for prefetch but because pf already stands for 'physical function', we decided to change it to 'prefetch'. Signed-off-by: Dafna Hirschfeld Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/device.c | 14 +++++++------- drivers/misc/habanalabs/common/habanalabs.h | 8 ++++---- drivers/misc/habanalabs/common/mmu/mmu.c | 22 +++++++++++----------- 3 files changed, 22 insertions(+), 22 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c index 5dc6c77b4721..bf675cf39f71 100644 --- a/drivers/misc/habanalabs/common/device.c +++ b/drivers/misc/habanalabs/common/device.c @@ -783,8 +783,8 @@ static int device_early_init(struct hl_device *hdev) goto free_cs_cmplt_wq; } - hdev->pf_wq = alloc_workqueue("hl-prefetch", WQ_UNBOUND, 0); - if (!hdev->pf_wq) { + hdev->prefetch_wq = alloc_workqueue("hl-prefetch", WQ_UNBOUND, 0); + if (!hdev->prefetch_wq) { dev_err(hdev->dev, "Failed to allocate MMU prefetch workqueue\n"); rc = -ENOMEM; goto free_ts_free_wq; @@ -794,7 +794,7 @@ static int device_early_init(struct hl_device *hdev) GFP_KERNEL); if (!hdev->hl_chip_info) { rc = -ENOMEM; - goto free_pf_wq; + goto free_prefetch_wq; } rc = hl_mmu_if_set_funcs(hdev); @@ -833,8 +833,8 @@ free_cb_mgr: hl_mem_mgr_fini(&hdev->kernel_mem_mgr); free_chip_info: kfree(hdev->hl_chip_info); -free_pf_wq: - destroy_workqueue(hdev->pf_wq); +free_prefetch_wq: + destroy_workqueue(hdev->prefetch_wq); free_ts_free_wq: destroy_workqueue(hdev->ts_free_obj_wq); free_cs_cmplt_wq: @@ -877,7 +877,7 @@ static void device_early_fini(struct hl_device *hdev) kfree(hdev->hl_chip_info); - destroy_workqueue(hdev->pf_wq); + destroy_workqueue(hdev->prefetch_wq); destroy_workqueue(hdev->ts_free_obj_wq); destroy_workqueue(hdev->cs_cmplt_wq); destroy_workqueue(hdev->eq_wq); @@ -1076,7 +1076,7 @@ static void cleanup_resources(struct hl_device *hdev, bool hard_reset, bool fw_r hl_cs_rollback_all(hdev, skip_wq_flush); /* flush the MMU prefetch workqueue */ - flush_workqueue(hdev->pf_wq); + flush_workqueue(hdev->prefetch_wq); /* Release all pending user interrupts, each pending user interrupt * holds a reference to user context diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h index 1489240d5a3a..6d8ce4a1dbb1 100644 --- a/drivers/misc/habanalabs/common/habanalabs.h +++ b/drivers/misc/habanalabs/common/habanalabs.h @@ -2811,7 +2811,7 @@ struct hl_mmu_funcs { /** * struct hl_prefetch_work - prefetch work structure handler - * @pf_work: actual work struct. + * @prefetch_work: actual work struct. * @ctx: compute context. * @va: virtual address to pre-fetch. * @size: pre-fetch size. @@ -2819,7 +2819,7 @@ struct hl_mmu_funcs { * @asid: ASID for maintenance operation. */ struct hl_prefetch_work { - struct work_struct pf_work; + struct work_struct prefetch_work; struct hl_ctx *ctx; u64 va; u64 size; @@ -3060,7 +3060,7 @@ struct hl_reset_info { * @cs_cmplt_wq: work queue of CS completions for executing work in process * context. * @ts_free_obj_wq: work queue for timestamp registration objects release. - * @pf_wq: work queue for MMU pre-fetch operations. + * @prefetch_wq: work queue for MMU pre-fetch operations. * @kernel_ctx: Kernel driver context structure. * @kernel_queues: array of hl_hw_queue. * @cs_mirror_list: CS mirror list for TDR. @@ -3231,7 +3231,7 @@ struct hl_device { struct workqueue_struct *eq_wq; struct workqueue_struct *cs_cmplt_wq; struct workqueue_struct *ts_free_obj_wq; - struct workqueue_struct *pf_wq; + struct workqueue_struct *prefetch_wq; struct hl_ctx *kernel_ctx; struct hl_hw_queue *kernel_queues; struct list_head cs_mirror_list; diff --git a/drivers/misc/habanalabs/common/mmu/mmu.c b/drivers/misc/habanalabs/common/mmu/mmu.c index cf8946266615..589179f8cd41 100644 --- a/drivers/misc/habanalabs/common/mmu/mmu.c +++ b/drivers/misc/habanalabs/common/mmu/mmu.c @@ -699,7 +699,7 @@ int hl_mmu_invalidate_cache_range(struct hl_device *hdev, bool is_hard, static void hl_mmu_prefetch_work_function(struct work_struct *work) { - struct hl_prefetch_work *pfw = container_of(work, struct hl_prefetch_work, pf_work); + struct hl_prefetch_work *pfw = container_of(work, struct hl_prefetch_work, prefetch_work); struct hl_ctx *ctx = pfw->ctx; struct hl_device *hdev = ctx->hdev; @@ -723,25 +723,25 @@ put_ctx: int hl_mmu_prefetch_cache_range(struct hl_ctx *ctx, u32 flags, u32 asid, u64 va, u64 size) { - struct hl_prefetch_work *handle_pf_work; + struct hl_prefetch_work *handle_prefetch_work; - handle_pf_work = kmalloc(sizeof(*handle_pf_work), GFP_KERNEL); - if (!handle_pf_work) + handle_prefetch_work = kmalloc(sizeof(*handle_prefetch_work), GFP_KERNEL); + if (!handle_prefetch_work) return -ENOMEM; - INIT_WORK(&handle_pf_work->pf_work, hl_mmu_prefetch_work_function); - handle_pf_work->ctx = ctx; - handle_pf_work->va = va; - handle_pf_work->size = size; - handle_pf_work->flags = flags; - handle_pf_work->asid = asid; + INIT_WORK(&handle_prefetch_work->prefetch_work, hl_mmu_prefetch_work_function); + handle_prefetch_work->ctx = ctx; + handle_prefetch_work->va = va; + handle_prefetch_work->size = size; + handle_prefetch_work->flags = flags; + handle_prefetch_work->asid = asid; /* * as actual prefetch is done in a WQ we must get the context (and put it * at the end of the work function) */ hl_ctx_get(ctx); - queue_work(ctx->hdev->pf_wq, &handle_pf_work->pf_work); + queue_work(ctx->hdev->prefetch_wq, &handle_prefetch_work->prefetch_work); return 0; } -- cgit v1.2.3 From 16448d644404351e685466ab14e7e043ad67673c Mon Sep 17 00:00:00 2001 From: Koby Elbaz Date: Wed, 28 Sep 2022 15:56:13 +0300 Subject: habanalabs/gaudi2: remove privileged MME clock configuration Privileged MME clock configuration is removed as it is done by the f/w. Signed-off-by: Koby Elbaz Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/gaudi2/gaudi2.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/gaudi2/gaudi2.c b/drivers/misc/habanalabs/gaudi2/gaudi2.c index b3685978f6ae..cb048920ffc8 100644 --- a/drivers/misc/habanalabs/gaudi2/gaudi2.c +++ b/drivers/misc/habanalabs/gaudi2/gaudi2.c @@ -4535,7 +4535,7 @@ static void gaudi2_init_mme_acc(struct hl_device *hdev, u32 reg_base) static void gaudi2_init_dcore_mme(struct hl_device *hdev, int dcore_id, bool config_qman_only) { - u32 queue_id_base, reg_base, clk_en_addr = 0; + u32 queue_id_base, reg_base; switch (dcore_id) { case 0: @@ -4543,23 +4543,18 @@ static void gaudi2_init_dcore_mme(struct hl_device *hdev, int dcore_id, break; case 1: queue_id_base = GAUDI2_QUEUE_ID_DCORE1_MME_0_0; - clk_en_addr = mmDCORE1_MME_CTRL_LO_QM_SLV_CLK_EN; break; case 2: queue_id_base = GAUDI2_QUEUE_ID_DCORE2_MME_0_0; break; case 3: queue_id_base = GAUDI2_QUEUE_ID_DCORE3_MME_0_0; - clk_en_addr = mmDCORE3_MME_CTRL_LO_QM_SLV_CLK_EN; break; default: dev_err(hdev->dev, "Invalid dcore id %u\n", dcore_id); return; } - if (clk_en_addr && !(hdev->fw_components & FW_TYPE_BOOT_CPU)) - WREG32(clk_en_addr, 0x1); - if (!config_qman_only) { reg_base = gaudi2_mme_acc_blocks_bases[dcore_id]; gaudi2_init_mme_acc(hdev, reg_base); -- cgit v1.2.3 From 5731b6e6f08a4a3adf944fd0436f77f3e9ce1725 Mon Sep 17 00:00:00 2001 From: Tal Cohen Date: Wed, 28 Sep 2022 18:33:19 +0300 Subject: habanalabs/gaudi2: add device unavailable notification Device unavailable notifies the user that there isn't an option to retrieve debug information from the device. When a critical device error occurs and the f/w performs the device reset, a device unavailable notification shall be sent to the user process. Signed-off-by: Tal Cohen Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/gaudi2/gaudi2.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/gaudi2/gaudi2.c b/drivers/misc/habanalabs/gaudi2/gaudi2.c index cb048920ffc8..e9c4ec429bae 100644 --- a/drivers/misc/habanalabs/gaudi2/gaudi2.c +++ b/drivers/misc/habanalabs/gaudi2/gaudi2.c @@ -8576,7 +8576,7 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent { u32 ctl, reset_flags = HL_DRV_RESET_HARD | HL_DRV_RESET_DELAY; struct gaudi2_device *gaudi2 = hdev->asic_specific; - bool reset_required = false, skip_reset = false; + bool reset_required = false, skip_reset = false, is_critical = false; int index, sbte_index; u64 event_mask = 0; u16 event_type; @@ -8602,6 +8602,7 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent reset_flags |= HL_DRV_RESET_FW_FATAL_ERR; event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR; reset_required = gaudi2_handle_ecc_event(hdev, event_type, &eq_entry->ecc_data); + is_critical = eq_entry->ecc_data.is_critical; break; case GAUDI2_EVENT_TPC0_QM ... GAUDI2_EVENT_PDMA1_QM: @@ -8976,9 +8977,16 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent return; reset_device: - if (hdev->hard_reset_on_fw_events) { + if (hdev->asic_prop.fw_security_enabled && is_critical) { + reset_flags = HL_DRV_RESET_HARD | HL_DRV_RESET_BYPASS_REQ_TO_FW; + + /* notify on device unavailable while the reset triggered by fw */ + event_mask |= (HL_NOTIFIER_EVENT_DEVICE_RESET | + HL_NOTIFIER_EVENT_DEVICE_UNAVAILABLE); hl_device_reset(hdev, reset_flags); + } else if (hdev->hard_reset_on_fw_events) { event_mask |= HL_NOTIFIER_EVENT_DEVICE_RESET; + hl_device_reset(hdev, reset_flags); } else { if (!gaudi2_irq_map_table[event_type].msg) hl_fw_unmask_irq(hdev, event_type); -- cgit v1.2.3 From 3a83ebc521b2e57af070b5667c60ac2d50347658 Mon Sep 17 00:00:00 2001 From: Tomer Tayar Date: Fri, 30 Sep 2022 14:09:32 +0300 Subject: habanalabs: skip idle status check if reset on device release If reset upon device release is enabled, there is no need to check the device idle status in hpriv_release(), because device is going to be reset in any case. Signed-off-by: Tomer Tayar Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/device.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c index bf675cf39f71..e60ed0c8a9db 100644 --- a/drivers/misc/habanalabs/common/device.c +++ b/drivers/misc/habanalabs/common/device.c @@ -398,16 +398,14 @@ static void hpriv_release(struct kref *ref) mutex_destroy(&hpriv->ctx_lock); mutex_destroy(&hpriv->restore_phase_mutex); - if ((!hdev->pldm) && (hdev->pdev) && - (!hdev->asic_funcs->is_device_idle(hdev, - idle_mask, - HL_BUSY_ENGINES_MASK_EXT_SIZE, NULL))) { - dev_err(hdev->dev, - "device not idle after user context is closed (0x%llx_%llx)\n", - idle_mask[1], idle_mask[0]); + /* No need for idle status check if device is going to be reset in any case */ + if (!hdev->reset_upon_device_release && hdev->pdev && !hdev->pldm) + device_is_idle = hdev->asic_funcs->is_device_idle(hdev, idle_mask, + HL_BUSY_ENGINES_MASK_EXT_SIZE, NULL); - device_is_idle = false; - } + if (!device_is_idle) + dev_err(hdev->dev, "device not idle after user context is closed (0x%llx_%llx)\n", + idle_mask[1], idle_mask[0]); /* We need to remove the user from the list to make sure the reset process won't * try to kill the user process. Because, if we got here, it means there are no -- cgit v1.2.3 From 51236cd95e7bcea41e57fb2cf238312be21dcf58 Mon Sep 17 00:00:00 2001 From: Tomer Tayar Date: Fri, 30 Sep 2022 14:19:21 +0300 Subject: habanalabs: allow unregistering eventfd when device non-operational Unregistering eventfd is for releasing host resources and doesn't involve an access to the device. As such, there is no reason to disallow it when device isn't operational. Signed-off-by: Tomer Tayar Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/habanalabs_ioctl.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/common/habanalabs_ioctl.c b/drivers/misc/habanalabs/common/habanalabs_ioctl.c index cac2c7fb14f1..5ce5c42e2731 100644 --- a/drivers/misc/habanalabs/common/habanalabs_ioctl.c +++ b/drivers/misc/habanalabs/common/habanalabs_ioctl.c @@ -879,6 +879,9 @@ static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data, case HL_INFO_USER_MAPPINGS: return user_mappings_info(hpriv, args); + case HL_INFO_UNREGISTER_EVENTFD: + return eventfd_unregister(hpriv, args); + default: break; } @@ -935,9 +938,6 @@ static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data, case HL_INFO_REGISTER_EVENTFD: return eventfd_register(hpriv, args); - case HL_INFO_UNREGISTER_EVENTFD: - return eventfd_unregister(hpriv, args); - case HL_INFO_ENGINE_STATUS: return engine_status_info(hpriv, args); -- cgit v1.2.3 From 1eebb259290b1be5398fec953bdd7923a5cbf33e Mon Sep 17 00:00:00 2001 From: Tomer Tayar Date: Fri, 30 Sep 2022 14:36:27 +0300 Subject: habanalabs: move reset workqueue to be under hl_device 'struct hl_device_reset_work' is used as a wrapper for the reset work and its parameters, including the reset workqueue on which it runs. In a future commit, another reset related work with similar parameters is going to be added, but it won't use the reset workqueue. As in any case there is a single reset workqueue, and to allow the resue of this structure, move the reset workqueue to 'struct hl_device'. Signed-off-by: Tomer Tayar Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/device.c | 15 ++++++--------- drivers/misc/habanalabs/common/habanalabs.h | 12 ++++++------ 2 files changed, 12 insertions(+), 15 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c index e60ed0c8a9db..e9b373a8cdad 100644 --- a/drivers/misc/habanalabs/common/device.c +++ b/drivers/misc/habanalabs/common/device.c @@ -684,9 +684,8 @@ static void device_hard_reset_pending(struct work_struct *work) "Could not reset device. will try again in %u seconds", HL_PENDING_RESET_PER_SEC); - queue_delayed_work(device_reset_work->wq, - &device_reset_work->reset_work, - msecs_to_jiffies(HL_PENDING_RESET_PER_SEC * 1000)); + queue_delayed_work(hdev->reset_wq, &device_reset_work->reset_work, + msecs_to_jiffies(HL_PENDING_RESET_PER_SEC * 1000)); } } @@ -801,9 +800,8 @@ static int device_early_init(struct hl_device *hdev) hl_mem_mgr_init(hdev->dev, &hdev->kernel_mem_mgr); - hdev->device_reset_work.wq = - create_singlethread_workqueue("hl_device_reset"); - if (!hdev->device_reset_work.wq) { + hdev->reset_wq = create_singlethread_workqueue("hl_device_reset"); + if (!hdev->reset_wq) { rc = -ENOMEM; dev_err(hdev->dev, "Failed to create device reset WQ\n"); goto free_cb_mgr; @@ -879,7 +877,7 @@ static void device_early_fini(struct hl_device *hdev) destroy_workqueue(hdev->ts_free_obj_wq); destroy_workqueue(hdev->cs_cmplt_wq); destroy_workqueue(hdev->eq_wq); - destroy_workqueue(hdev->device_reset_work.wq); + destroy_workqueue(hdev->reset_wq); for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++) destroy_workqueue(hdev->cq_wq[i]); @@ -1460,8 +1458,7 @@ again: * Because the reset function can't run from heartbeat work, * we need to call the reset function from a dedicated work. */ - queue_delayed_work(hdev->device_reset_work.wq, - &hdev->device_reset_work.reset_work, 0); + queue_delayed_work(hdev->reset_wq, &hdev->device_reset_work.reset_work, 0); return 0; } diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h index 6d8ce4a1dbb1..4913197c433e 100644 --- a/drivers/misc/habanalabs/common/habanalabs.h +++ b/drivers/misc/habanalabs/common/habanalabs.h @@ -2682,17 +2682,15 @@ void hl_wreg(struct hl_device *hdev, u32 reg, u32 val); struct hwmon_chip_info; /** - * struct hl_device_reset_work - reset workqueue task wrapper. - * @wq: work queue for device reset procedure. + * struct hl_device_reset_work - reset work wrapper. * @reset_work: reset work to be done. * @hdev: habanalabs device structure. * @flags: reset flags. */ struct hl_device_reset_work { - struct workqueue_struct *wq; - struct delayed_work reset_work; - struct hl_device *hdev; - u32 flags; + struct delayed_work reset_work; + struct hl_device *hdev; + u32 flags; }; /** @@ -3061,6 +3059,7 @@ struct hl_reset_info { * context. * @ts_free_obj_wq: work queue for timestamp registration objects release. * @prefetch_wq: work queue for MMU pre-fetch operations. + * @reset_wq: work queue for device reset procedure. * @kernel_ctx: Kernel driver context structure. * @kernel_queues: array of hl_hw_queue. * @cs_mirror_list: CS mirror list for TDR. @@ -3232,6 +3231,7 @@ struct hl_device { struct workqueue_struct *cs_cmplt_wq; struct workqueue_struct *ts_free_obj_wq; struct workqueue_struct *prefetch_wq; + struct workqueue_struct *reset_wq; struct hl_ctx *kernel_ctx; struct hl_hw_queue *kernel_queues; struct list_head cs_mirror_list; -- cgit v1.2.3 From 17f3f42af2bcddc38ff08b355e007f3b6d5ce70c Mon Sep 17 00:00:00 2001 From: Dani Liberman Date: Thu, 29 Sep 2022 10:21:28 +0300 Subject: habanalabs: handle HBM MMU when capturing page fault data In case of HBM MMU page fault, capture its relevant mappings. Signed-off-by: Dani Liberman Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/device.c | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c index e9b373a8cdad..b8b32285720d 100644 --- a/drivers/misc/habanalabs/common/device.c +++ b/drivers/misc/habanalabs/common/device.c @@ -2273,15 +2273,20 @@ void hl_capture_razwi(struct hl_device *hdev, u64 addr, u16 *engine_id, u16 num_ num_of_engines * sizeof(u16)); hdev->captured_err_info.razwi.flags = flags; } -static void hl_capture_user_mappings(struct hl_device *hdev) +static void hl_capture_user_mappings(struct hl_device *hdev, bool is_pmmu) { struct page_fault_info *pgf_info = &hdev->captured_err_info.pgf_info; + struct hl_vm_phys_pg_pack *phys_pg_pack = NULL; struct hl_vm_hash_node *hnode; struct hl_userptr *userptr; + enum vm_type *vm_type; struct hl_ctx *ctx; u32 map_idx = 0; int i; + /* Reset previous session count*/ + pgf_info->num_of_user_mappings = 0; + ctx = hl_get_compute_ctx(hdev); if (!ctx) { dev_err(hdev->dev, "Can't get user context for user mappings\n"); @@ -2290,7 +2295,7 @@ static void hl_capture_user_mappings(struct hl_device *hdev) mutex_lock(&ctx->mem_hash_lock); hash_for_each(ctx->mem_hash, i, hnode, node) - pgf_info->num_of_user_mappings++; + pgf_info->num_of_user_mappings++; if (!pgf_info->num_of_user_mappings) goto finish; @@ -2300,17 +2305,25 @@ static void hl_capture_user_mappings(struct hl_device *hdev) */ vfree(pgf_info->user_mappings); pgf_info->user_mappings = - vmalloc(pgf_info->num_of_user_mappings * sizeof(struct hl_user_mapping)); + vzalloc(pgf_info->num_of_user_mappings * sizeof(struct hl_user_mapping)); if (!pgf_info->user_mappings) { pgf_info->num_of_user_mappings = 0; goto finish; } hash_for_each(ctx->mem_hash, i, hnode, node) { - userptr = hnode->ptr; - pgf_info->user_mappings[map_idx].dev_va = hnode->vaddr; - pgf_info->user_mappings[map_idx].size = userptr->size; - map_idx++; + vm_type = hnode->ptr; + if ((*vm_type == VM_TYPE_USERPTR) && (is_pmmu)) { + userptr = hnode->ptr; + pgf_info->user_mappings[map_idx].dev_va = hnode->vaddr; + pgf_info->user_mappings[map_idx].size = userptr->size; + map_idx++; + } else if ((*vm_type == VM_TYPE_PHYS_PACK) && (!is_pmmu)) { + phys_pg_pack = hnode->ptr; + pgf_info->user_mappings[map_idx].dev_va = hnode->vaddr; + pgf_info->user_mappings[map_idx].size = phys_pg_pack->total_size; + map_idx++; + } } finish: mutex_unlock(&ctx->mem_hash_lock); @@ -2326,5 +2339,5 @@ void hl_capture_page_fault(struct hl_device *hdev, u64 addr, u16 eng_id, bool is hdev->captured_err_info.pgf_info.pgf.timestamp = ktime_to_ns(ktime_get()); hdev->captured_err_info.pgf_info.pgf.addr = addr; hdev->captured_err_info.pgf_info.pgf.engine_id = eng_id; - hl_capture_user_mappings(hdev); + hl_capture_user_mappings(hdev, is_pmmu); } -- cgit v1.2.3 From 15ac503cdc0d9a1275d82a926c673359cf69ebef Mon Sep 17 00:00:00 2001 From: Dani Liberman Date: Wed, 28 Sep 2022 22:14:55 +0300 Subject: habanalabs/gaudi2: capture RAZWI information Added function to calculate possible engines which caused RAZWI (read-only zero, write ignored), from a given router id or module index. When getting RAZWI via PSOC IP, first the router id is calculated and then the possible engines that caused the RAZWI are calculated. There is a possibility that the RAZWI initiator is not an engine. In that case, it will not be included in possible engines as it doesn't have an engine id. RAZWI information is captured when receiving event from engine or via PSOC IP. Signed-off-by: Dani Liberman Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/gaudi2/gaudi2.c | 255 +++++++++++++++++++++++++++++--- include/uapi/misc/habanalabs.h | 4 + 2 files changed, 242 insertions(+), 17 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/gaudi2/gaudi2.c b/drivers/misc/habanalabs/gaudi2/gaudi2.c index e9c4ec429bae..1058c8a0e644 100644 --- a/drivers/misc/habanalabs/gaudi2/gaudi2.c +++ b/drivers/misc/habanalabs/gaudi2/gaudi2.c @@ -128,6 +128,8 @@ #define GAUDI2_VDEC_MSIX_ENTRIES (GAUDI2_IRQ_NUM_SHARED_DEC1_ABNRM - \ GAUDI2_IRQ_NUM_DCORE0_DEC0_NRM + 1) +#define ENGINE_ID_DCORE_OFFSET (GAUDI2_DCORE1_ENGINE_ID_EDMA_0 - GAUDI2_DCORE0_ENGINE_ID_EDMA_0) + enum hl_pmmu_fatal_cause { LATENCY_RD_OUT_FIFO_OVERRUN, LATENCY_WR_OUT_FIFO_OVERRUN, @@ -7092,9 +7094,12 @@ static void gaudi2_handle_qman_err_generic(struct hl_device *hdev, const char *q static void gaudi2_razwi_rr_hbw_shared_printf_info(struct hl_device *hdev, u64 rtr_mstr_if_base_addr, bool is_write, char *name, - bool read_razwi_regs, struct hl_eq_razwi_info *razwi_info) + bool read_razwi_regs, struct hl_eq_razwi_info *razwi_info, + enum gaudi2_engine_id id) { u32 razwi_hi, razwi_lo, razwi_xy; + u16 eng_id = id; + u8 rd_wr_flag; if (is_write) { if (read_razwi_regs) { @@ -7106,6 +7111,7 @@ static void gaudi2_razwi_rr_hbw_shared_printf_info(struct hl_device *hdev, razwi_lo = le32_to_cpu(razwi_info->hbw.rr_aw_razwi_lo_reg); razwi_xy = le32_to_cpu(razwi_info->hbw.rr_aw_razwi_id_reg); } + rd_wr_flag = HL_RAZWI_WRITE; } else { if (read_razwi_regs) { razwi_hi = RREG32(rtr_mstr_if_base_addr + RR_SHRD_HBW_AR_RAZWI_HI); @@ -7116,8 +7122,12 @@ static void gaudi2_razwi_rr_hbw_shared_printf_info(struct hl_device *hdev, razwi_lo = le32_to_cpu(razwi_info->hbw.rr_ar_razwi_lo_reg); razwi_xy = le32_to_cpu(razwi_info->hbw.rr_ar_razwi_id_reg); } + rd_wr_flag = HL_RAZWI_READ; } + hl_capture_razwi(hdev, (u64)razwi_hi << 32 | razwi_lo, &eng_id, 1, + rd_wr_flag | HL_RAZWI_HBW); + dev_err_ratelimited(hdev->dev, "%s-RAZWI SHARED RR HBW %s error, address %#llx, Initiator coordinates 0x%x\n", name, is_write ? "WR" : "RD", (u64)razwi_hi << 32 | razwi_lo, razwi_xy); @@ -7125,9 +7135,12 @@ static void gaudi2_razwi_rr_hbw_shared_printf_info(struct hl_device *hdev, static void gaudi2_razwi_rr_lbw_shared_printf_info(struct hl_device *hdev, u64 rtr_mstr_if_base_addr, bool is_write, char *name, - bool read_razwi_regs, struct hl_eq_razwi_info *razwi_info) + bool read_razwi_regs, struct hl_eq_razwi_info *razwi_info, + enum gaudi2_engine_id id) { u32 razwi_addr, razwi_xy; + u16 eng_id = id; + u8 rd_wr_flag; if (is_write) { if (read_razwi_regs) { @@ -7138,9 +7151,7 @@ static void gaudi2_razwi_rr_lbw_shared_printf_info(struct hl_device *hdev, razwi_xy = le32_to_cpu(razwi_info->lbw.rr_aw_razwi_id_reg); } - dev_err_ratelimited(hdev->dev, - "%s-RAZWI SHARED RR LBW WR error, mstr_if 0x%llx, captured address 0x%x, Initiator coordinates 0x%x\n", - name, rtr_mstr_if_base_addr, razwi_addr, razwi_xy); + rd_wr_flag = HL_RAZWI_WRITE; } else { if (read_razwi_regs) { razwi_addr = RREG32(rtr_mstr_if_base_addr + RR_SHRD_LBW_AR_RAZWI); @@ -7150,9 +7161,57 @@ static void gaudi2_razwi_rr_lbw_shared_printf_info(struct hl_device *hdev, razwi_xy = le32_to_cpu(razwi_info->lbw.rr_ar_razwi_id_reg); } - dev_err_ratelimited(hdev->dev, - "%s-RAZWI SHARED RR LBW AR error, mstr_if 0x%llx, captured address 0x%x Initiator coordinates 0x%x\n", - name, rtr_mstr_if_base_addr, razwi_addr, razwi_xy); + rd_wr_flag = HL_RAZWI_READ; + } + + hl_capture_razwi(hdev, razwi_addr, &eng_id, 1, rd_wr_flag | HL_RAZWI_LBW); + dev_err_ratelimited(hdev->dev, + "%s-RAZWI SHARED RR LBW %s error, mstr_if 0x%llx, captured address 0x%x Initiator coordinates 0x%x\n", + name, is_write ? "WR" : "RD", rtr_mstr_if_base_addr, razwi_addr, + razwi_xy); +} + +static enum gaudi2_engine_id gaudi2_razwi_calc_engine_id(struct hl_device *hdev, + enum razwi_event_sources module, u8 module_idx) +{ + switch (module) { + case RAZWI_TPC: + if (module_idx == (NUM_OF_TPC_PER_DCORE * NUM_OF_DCORES)) + return GAUDI2_DCORE0_ENGINE_ID_TPC_6; + return (((module_idx / NUM_OF_TPC_PER_DCORE) * ENGINE_ID_DCORE_OFFSET) + + (module_idx % NUM_OF_TPC_PER_DCORE) + + (GAUDI2_DCORE0_ENGINE_ID_TPC_0 - GAUDI2_DCORE0_ENGINE_ID_EDMA_0)); + + case RAZWI_MME: + return ((GAUDI2_DCORE0_ENGINE_ID_MME - GAUDI2_DCORE0_ENGINE_ID_EDMA_0) + + (module_idx * ENGINE_ID_DCORE_OFFSET)); + + case RAZWI_EDMA: + return (((module_idx / NUM_OF_EDMA_PER_DCORE) * ENGINE_ID_DCORE_OFFSET) + + (module_idx % NUM_OF_EDMA_PER_DCORE)); + + case RAZWI_PDMA: + return (GAUDI2_ENGINE_ID_PDMA_0 + module_idx); + + case RAZWI_NIC: + return (GAUDI2_ENGINE_ID_NIC0_0 + (NIC_NUMBER_OF_QM_PER_MACRO * module_idx)); + + case RAZWI_DEC: + if (module_idx == 8) + return GAUDI2_PCIE_ENGINE_ID_DEC_0; + + if (module_idx == 9) + return GAUDI2_PCIE_ENGINE_ID_DEC_1; + ; + return (((module_idx / NUM_OF_DEC_PER_DCORE) * ENGINE_ID_DCORE_OFFSET) + + (module_idx % NUM_OF_DEC_PER_DCORE) + + (GAUDI2_DCORE0_ENGINE_ID_DEC_0 - GAUDI2_DCORE0_ENGINE_ID_EDMA_0)); + + case RAZWI_ROT: + return GAUDI2_ENGINE_ID_ROT_0 + module_idx; + + default: + return GAUDI2_ENGINE_ID_SIZE; } } @@ -7165,7 +7224,7 @@ static void gaudi2_ack_module_razwi_event_handler(struct hl_device *hdev, u8 module_sub_idx, struct hl_eq_razwi_info *razwi_info) { bool via_sft = false, read_razwi_regs = false; - u32 rtr_id, dcore_id, dcore_rtr_id, sft_id; + u32 rtr_id, dcore_id, dcore_rtr_id, sft_id, eng_id; u64 rtr_mstr_if_base_addr; u32 hbw_shrd_aw = 0, hbw_shrd_ar = 0; u32 lbw_shrd_aw = 0, lbw_shrd_ar = 0; @@ -7299,9 +7358,11 @@ dump_info: if (!hbw_shrd_aw && !hbw_shrd_ar && !lbw_shrd_aw && !lbw_shrd_ar) return; + eng_id = gaudi2_razwi_calc_engine_id(hdev, module, module_idx); if (hbw_shrd_aw) { gaudi2_razwi_rr_hbw_shared_printf_info(hdev, rtr_mstr_if_base_addr, true, - initiator_name, read_razwi_regs, razwi_info); + initiator_name, read_razwi_regs, razwi_info, + eng_id); /* Clear event indication */ if (read_razwi_regs) @@ -7310,7 +7371,8 @@ dump_info: if (hbw_shrd_ar) { gaudi2_razwi_rr_hbw_shared_printf_info(hdev, rtr_mstr_if_base_addr, false, - initiator_name, read_razwi_regs, razwi_info); + initiator_name, read_razwi_regs, razwi_info, + eng_id); /* Clear event indication */ if (read_razwi_regs) @@ -7319,7 +7381,8 @@ dump_info: if (lbw_shrd_aw) { gaudi2_razwi_rr_lbw_shared_printf_info(hdev, rtr_mstr_if_base_addr, true, - initiator_name, read_razwi_regs, razwi_info); + initiator_name, read_razwi_regs, razwi_info, + eng_id); /* Clear event indication */ if (read_razwi_regs) @@ -7328,7 +7391,8 @@ dump_info: if (lbw_shrd_ar) { gaudi2_razwi_rr_lbw_shared_printf_info(hdev, rtr_mstr_if_base_addr, false, - initiator_name, read_razwi_regs, razwi_info); + initiator_name, read_razwi_regs, razwi_info, + eng_id); /* Clear event indication */ if (read_razwi_regs) @@ -7450,25 +7514,175 @@ static const char *gaudi2_get_initiators_name(u32 rtr_id) } } +static u16 gaudi2_get_razwi_initiators(u32 rtr_id, u16 *engines) +{ + switch (rtr_id) { + case DCORE0_RTR0: + engines[0] = GAUDI2_DCORE0_ENGINE_ID_DEC_0; + engines[1] = GAUDI2_DCORE0_ENGINE_ID_DEC_1; + engines[2] = GAUDI2_PCIE_ENGINE_ID_DEC_0; + engines[3] = GAUDI2_PCIE_ENGINE_ID_DEC_1; + engines[4] = GAUDI2_DCORE0_ENGINE_ID_TPC_6; + engines[5] = GAUDI2_ENGINE_ID_PDMA_0; + engines[6] = GAUDI2_ENGINE_ID_PDMA_1; + engines[7] = GAUDI2_ENGINE_ID_PCIE; + engines[8] = GAUDI2_DCORE0_ENGINE_ID_EDMA_0; + engines[9] = GAUDI2_DCORE1_ENGINE_ID_EDMA_0; + engines[10] = GAUDI2_ENGINE_ID_PSOC; + return 11; + + case DCORE0_RTR1: + engines[0] = GAUDI2_DCORE0_ENGINE_ID_TPC_0; + engines[1] = GAUDI2_DCORE0_ENGINE_ID_TPC_1; + return 2; + + case DCORE0_RTR2: + engines[0] = GAUDI2_DCORE0_ENGINE_ID_TPC_2; + engines[1] = GAUDI2_DCORE0_ENGINE_ID_TPC_3; + return 2; + + case DCORE0_RTR3: + engines[0] = GAUDI2_DCORE0_ENGINE_ID_TPC_4; + engines[1] = GAUDI2_DCORE0_ENGINE_ID_TPC_5; + return 2; + + case DCORE0_RTR4: + case DCORE0_RTR5: + case DCORE0_RTR6: + case DCORE0_RTR7: + engines[0] = GAUDI2_DCORE0_ENGINE_ID_MME; + return 1; + + case DCORE1_RTR0: + case DCORE1_RTR1: + case DCORE1_RTR2: + case DCORE1_RTR3: + engines[0] = GAUDI2_DCORE1_ENGINE_ID_MME; + return 1; + + case DCORE1_RTR4: + engines[0] = GAUDI2_DCORE1_ENGINE_ID_TPC_4; + engines[1] = GAUDI2_DCORE1_ENGINE_ID_TPC_5; + return 2; + + case DCORE1_RTR5: + engines[0] = GAUDI2_DCORE1_ENGINE_ID_TPC_2; + engines[1] = GAUDI2_DCORE1_ENGINE_ID_TPC_3; + return 2; + + case DCORE1_RTR6: + engines[0] = GAUDI2_DCORE1_ENGINE_ID_TPC_0; + engines[1] = GAUDI2_DCORE1_ENGINE_ID_TPC_1; + return 2; + + case DCORE1_RTR7: + engines[0] = GAUDI2_DCORE1_ENGINE_ID_DEC_0; + engines[1] = GAUDI2_DCORE1_ENGINE_ID_DEC_1; + engines[2] = GAUDI2_ENGINE_ID_NIC0_0; + engines[3] = GAUDI2_ENGINE_ID_NIC1_0; + engines[4] = GAUDI2_ENGINE_ID_NIC2_0; + engines[5] = GAUDI2_ENGINE_ID_NIC3_0; + engines[6] = GAUDI2_ENGINE_ID_NIC4_0; + engines[7] = GAUDI2_ENGINE_ID_ARC_FARM; + engines[8] = GAUDI2_ENGINE_ID_KDMA; + engines[9] = GAUDI2_DCORE0_ENGINE_ID_EDMA_1; + engines[10] = GAUDI2_DCORE1_ENGINE_ID_EDMA_1; + return 11; + + case DCORE2_RTR0: + engines[0] = GAUDI2_DCORE2_ENGINE_ID_DEC_0; + engines[1] = GAUDI2_DCORE2_ENGINE_ID_DEC_1; + engines[2] = GAUDI2_ENGINE_ID_NIC5_0; + engines[3] = GAUDI2_ENGINE_ID_NIC6_0; + engines[4] = GAUDI2_ENGINE_ID_NIC7_0; + engines[5] = GAUDI2_ENGINE_ID_NIC8_0; + engines[6] = GAUDI2_DCORE2_ENGINE_ID_EDMA_0; + engines[7] = GAUDI2_DCORE3_ENGINE_ID_EDMA_0; + engines[8] = GAUDI2_ENGINE_ID_ROT_0; + return 9; + + case DCORE2_RTR1: + engines[0] = GAUDI2_DCORE2_ENGINE_ID_TPC_4; + engines[1] = GAUDI2_DCORE2_ENGINE_ID_TPC_5; + return 2; + + case DCORE2_RTR2: + engines[0] = GAUDI2_DCORE2_ENGINE_ID_TPC_2; + engines[1] = GAUDI2_DCORE2_ENGINE_ID_TPC_3; + return 2; + + case DCORE2_RTR3: + engines[0] = GAUDI2_DCORE2_ENGINE_ID_TPC_0; + engines[1] = GAUDI2_DCORE2_ENGINE_ID_TPC_1; + return 2; + + case DCORE2_RTR4: + case DCORE2_RTR5: + case DCORE2_RTR6: + case DCORE2_RTR7: + engines[0] = GAUDI2_DCORE2_ENGINE_ID_MME; + return 1; + case DCORE3_RTR0: + case DCORE3_RTR1: + case DCORE3_RTR2: + case DCORE3_RTR3: + engines[0] = GAUDI2_DCORE3_ENGINE_ID_MME; + return 1; + case DCORE3_RTR4: + engines[0] = GAUDI2_DCORE3_ENGINE_ID_TPC_0; + engines[1] = GAUDI2_DCORE3_ENGINE_ID_TPC_1; + return 2; + case DCORE3_RTR5: + engines[0] = GAUDI2_DCORE3_ENGINE_ID_TPC_2; + engines[1] = GAUDI2_DCORE3_ENGINE_ID_TPC_3; + return 2; + case DCORE3_RTR6: + engines[0] = GAUDI2_DCORE3_ENGINE_ID_TPC_4; + engines[1] = GAUDI2_DCORE3_ENGINE_ID_TPC_5; + return 2; + case DCORE3_RTR7: + engines[0] = GAUDI2_DCORE3_ENGINE_ID_DEC_0; + engines[1] = GAUDI2_DCORE3_ENGINE_ID_DEC_1; + engines[2] = GAUDI2_ENGINE_ID_NIC9_0; + engines[3] = GAUDI2_ENGINE_ID_NIC10_0; + engines[4] = GAUDI2_ENGINE_ID_NIC11_0; + engines[5] = GAUDI2_DCORE2_ENGINE_ID_EDMA_1; + engines[6] = GAUDI2_DCORE3_ENGINE_ID_EDMA_1; + engines[7] = GAUDI2_ENGINE_ID_ROT_1; + engines[8] = GAUDI2_ENGINE_ID_ROT_0; + return 9; + default: + return 0; + } +} + static void gaudi2_razwi_unmapped_addr_hbw_printf_info(struct hl_device *hdev, u32 rtr_id, u64 rtr_ctrl_base_addr, bool is_write) { + u16 engines[HL_RAZWI_MAX_NUM_OF_ENGINES_PER_RTR], num_of_eng; u32 razwi_hi, razwi_lo; + u8 rd_wr_flag; + + num_of_eng = gaudi2_get_razwi_initiators(rtr_id, &engines[0]); if (is_write) { razwi_hi = RREG32(rtr_ctrl_base_addr + DEC_RAZWI_HBW_AW_ADDR_HI); razwi_lo = RREG32(rtr_ctrl_base_addr + DEC_RAZWI_HBW_AW_ADDR_LO); + rd_wr_flag = HL_RAZWI_WRITE; /* Clear set indication */ WREG32(rtr_ctrl_base_addr + DEC_RAZWI_HBW_AW_SET, 0x1); } else { razwi_hi = RREG32(rtr_ctrl_base_addr + DEC_RAZWI_HBW_AR_ADDR_HI); razwi_lo = RREG32(rtr_ctrl_base_addr + DEC_RAZWI_HBW_AR_ADDR_LO); + rd_wr_flag = HL_RAZWI_READ; /* Clear set indication */ WREG32(rtr_ctrl_base_addr + DEC_RAZWI_HBW_AR_SET, 0x1); } + hl_capture_razwi(hdev, (u64)razwi_hi << 32 | razwi_lo, &engines[0], num_of_eng, + rd_wr_flag | HL_RAZWI_HBW); dev_err_ratelimited(hdev->dev, "RAZWI PSOC unmapped HBW %s error, rtr id %u, address %#llx\n", is_write ? "WR" : "RD", rtr_id, (u64)razwi_hi << 32 | razwi_lo); @@ -7480,20 +7694,27 @@ static void gaudi2_razwi_unmapped_addr_hbw_printf_info(struct hl_device *hdev, u static void gaudi2_razwi_unmapped_addr_lbw_printf_info(struct hl_device *hdev, u32 rtr_id, u64 rtr_ctrl_base_addr, bool is_write) { + u16 engines[HL_RAZWI_MAX_NUM_OF_ENGINES_PER_RTR], num_of_eng; u32 razwi_addr; + u8 rd_wr_flag; + + num_of_eng = gaudi2_get_razwi_initiators(rtr_id, &engines[0]); if (is_write) { razwi_addr = RREG32(rtr_ctrl_base_addr + DEC_RAZWI_LBW_AW_ADDR); + rd_wr_flag = HL_RAZWI_WRITE; /* Clear set indication */ WREG32(rtr_ctrl_base_addr + DEC_RAZWI_LBW_AW_SET, 0x1); } else { razwi_addr = RREG32(rtr_ctrl_base_addr + DEC_RAZWI_LBW_AR_ADDR); + rd_wr_flag = HL_RAZWI_READ; /* Clear set indication */ WREG32(rtr_ctrl_base_addr + DEC_RAZWI_LBW_AR_SET, 0x1); } + hl_capture_razwi(hdev, razwi_addr, &engines[0], num_of_eng, rd_wr_flag | HL_RAZWI_LBW); dev_err_ratelimited(hdev->dev, "RAZWI PSOC unmapped LBW %s error, rtr id %u, address %#x\n", is_write ? "WR" : "RD", rtr_id, razwi_addr); @@ -7974,28 +8195,28 @@ static void gaudi2_print_pcie_mstr_rr_mstr_if_razwi_info(struct hl_device *hdev) razwi_happened_addr = mstr_if_base_addr + RR_SHRD_HBW_AW_RAZWI_HAPPENED; if (RREG32(razwi_happened_addr)) { gaudi2_razwi_rr_hbw_shared_printf_info(hdev, mstr_if_base_addr, true, "PCIE", true, - NULL); + NULL, GAUDI2_ENGINE_ID_PCIE); WREG32(razwi_happened_addr, 0x1); } razwi_happened_addr = mstr_if_base_addr + RR_SHRD_HBW_AR_RAZWI_HAPPENED; if (RREG32(razwi_happened_addr)) { gaudi2_razwi_rr_hbw_shared_printf_info(hdev, mstr_if_base_addr, false, "PCIE", true, - NULL); + NULL, GAUDI2_ENGINE_ID_PCIE); WREG32(razwi_happened_addr, 0x1); } razwi_happened_addr = mstr_if_base_addr + RR_SHRD_LBW_AW_RAZWI_HAPPENED; if (RREG32(razwi_happened_addr)) { gaudi2_razwi_rr_lbw_shared_printf_info(hdev, mstr_if_base_addr, true, "PCIE", true, - NULL); + NULL, GAUDI2_ENGINE_ID_PCIE); WREG32(razwi_happened_addr, 0x1); } razwi_happened_addr = mstr_if_base_addr + RR_SHRD_LBW_AR_RAZWI_HAPPENED; if (RREG32(razwi_happened_addr)) { gaudi2_razwi_rr_lbw_shared_printf_info(hdev, mstr_if_base_addr, false, "PCIE", true, - NULL); + NULL, GAUDI2_ENGINE_ID_PCIE); WREG32(razwi_happened_addr, 0x1); } } diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h index 2b794f54e2ed..a4ceee681898 100644 --- a/include/uapi/misc/habanalabs.h +++ b/include/uapi/misc/habanalabs.h @@ -597,6 +597,10 @@ enum gaudi2_engine_id { GAUDI2_ENGINE_ID_NIC10_1, GAUDI2_ENGINE_ID_NIC11_0, GAUDI2_ENGINE_ID_NIC11_1, + GAUDI2_ENGINE_ID_PCIE, + GAUDI2_ENGINE_ID_PSOC, + GAUDI2_ENGINE_ID_ARC_FARM, + GAUDI2_ENGINE_ID_KDMA, GAUDI2_ENGINE_ID_SIZE }; -- cgit v1.2.3 From 4f11694f27582fa0875c4be7d133e0ae88ad36f8 Mon Sep 17 00:00:00 2001 From: Dani Liberman Date: Thu, 29 Sep 2022 10:28:36 +0300 Subject: habanalabs/gaudi2: capture page fault data Capture page fault data when it happens. Signed-off-by: Dani Liberman Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/gaudi2/gaudi2.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/gaudi2/gaudi2.c b/drivers/misc/habanalabs/gaudi2/gaudi2.c index 1058c8a0e644..a4e3586f1a12 100644 --- a/drivers/misc/habanalabs/gaudi2/gaudi2.c +++ b/drivers/misc/habanalabs/gaudi2/gaudi2.c @@ -8286,6 +8286,7 @@ static void gaudi2_handle_page_error(struct hl_device *hdev, u64 mmu_base, bool dev_err_ratelimited(hdev->dev, "%s page fault on va 0x%llx\n", is_pmmu ? "PMMU" : "HMMU", addr); + hl_capture_page_fault(hdev, addr, 0, is_pmmu); WREG32(mmu_base + MMU_OFFSET(mmDCORE0_HMMU0_MMU_PAGE_ERROR_CAPTURE), 0); } -- cgit v1.2.3 From 27cd39afde454ca8f9a438cfc84d676e96b36bd7 Mon Sep 17 00:00:00 2001 From: Tal Cohen Date: Mon, 3 Oct 2022 13:55:50 +0300 Subject: habanalabs: verify no zero event is sent The event notifier mechanism should not raise an empty event (event equals zero). Signed-off-by: Tal Cohen Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/device.c | 5 +++++ drivers/misc/habanalabs/gaudi/gaudi.c | 4 +++- 2 files changed, 8 insertions(+), 1 deletion(-) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c index b8b32285720d..9b54d1df5302 100644 --- a/drivers/misc/habanalabs/common/device.c +++ b/drivers/misc/habanalabs/common/device.c @@ -1746,6 +1746,11 @@ void hl_notifier_event_send_all(struct hl_device *hdev, u64 event_mask) { struct hl_fpriv *hpriv; + if (!event_mask) { + dev_warn(hdev->dev, "Skip sending zero event"); + return; + } + mutex_lock(&hdev->fpriv_list_lock); list_for_each_entry(hpriv, &hdev->fpriv_list, dev_node) diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c index 1a99f7be8b60..337123f73501 100644 --- a/drivers/misc/habanalabs/gaudi/gaudi.c +++ b/drivers/misc/habanalabs/gaudi/gaudi.c @@ -7945,7 +7945,9 @@ reset_device: /* despite reset doesn't execute. a notification on * occurred event needs to be sent here */ - hl_notifier_event_send_all(hdev, event_mask); + if (event_mask) + hl_notifier_event_send_all(hdev, event_mask); + if (reset_required) hl_device_reset(hdev, flags); else -- cgit v1.2.3 From dc8d243caea8056bd2580b0f1703fe019d3b4419 Mon Sep 17 00:00:00 2001 From: Dilip Puri Date: Wed, 12 Oct 2022 11:06:48 +0300 Subject: habanalabs/gaudi2: unsecure CBU_EARLY_BRESP registers NIC ARCs need to have access to CBU_EARLY_BRESP, hence we unsecure those registers. Signed-off-by: Dilip Puri Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/gaudi2/gaudi2_security.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/gaudi2/gaudi2_security.c b/drivers/misc/habanalabs/gaudi2/gaudi2_security.c index c6906fb14229..768c2f3dc900 100644 --- a/drivers/misc/habanalabs/gaudi2/gaudi2_security.c +++ b/drivers/misc/habanalabs/gaudi2/gaudi2_security.c @@ -1764,6 +1764,7 @@ static const struct range gaudi2_pb_nic0_qm_arc_aux0_unsecured_regs[] = { {mmNIC0_QM_ARC_AUX0_CLUSTER_NUM, mmNIC0_QM_ARC_AUX0_WAKE_UP_EVENT}, {mmNIC0_QM_ARC_AUX0_ARC_RST_REQ, mmNIC0_QM_ARC_AUX0_CID_OFFSET_7}, {mmNIC0_QM_ARC_AUX0_SCRATCHPAD_0, mmNIC0_QM_ARC_AUX0_INFLIGHT_LBU_RD_CNT}, + {mmNIC0_QM_ARC_AUX0_CBU_EARLY_BRESP_EN, mmNIC0_QM_ARC_AUX0_CBU_EARLY_BRESP_EN}, {mmNIC0_QM_ARC_AUX0_LBU_EARLY_BRESP_EN, mmNIC0_QM_ARC_AUX0_LBU_EARLY_BRESP_EN}, {mmNIC0_QM_ARC_AUX0_DCCM_QUEUE_BASE_ADDR_0, mmNIC0_QM_ARC_AUX0_DCCM_QUEUE_ALERT_MSG}, {mmNIC0_QM_ARC_AUX0_DCCM_Q_PUSH_FIFO_CNT, mmNIC0_QM_ARC_AUX0_QMAN_ARC_CQ_SHADOW_CI}, -- cgit v1.2.3 From 24fdfb359cadd222de8ba9d2d6a3f4dfc514878a Mon Sep 17 00:00:00 2001 From: Ohad Sharabi Date: Tue, 18 Oct 2022 08:51:33 +0300 Subject: habanalabs: fix using freed pointer The code uses the pointer for trace purpose (without actually dereference it) but still get static analysis warning. This patch eliminate the warning. Signed-off-by: Ohad Sharabi Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/device.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c index 9b54d1df5302..dd01be5c4ba3 100644 --- a/drivers/misc/habanalabs/common/device.c +++ b/drivers/misc/habanalabs/common/device.c @@ -135,6 +135,9 @@ static void hl_asic_dma_free_common(struct hl_device *hdev, size_t size, void *c dma_addr_t dma_handle, enum dma_alloc_type alloc_type, const char *caller) { + /* this is needed to avoid warning on using freed pointer */ + u64 store_cpu_addr = (u64) (uintptr_t) cpu_addr; + switch (alloc_type) { case DMA_ALLOC_COHERENT: hdev->asic_funcs->asic_dma_free_coherent(hdev, size, cpu_addr, dma_handle); @@ -147,7 +150,7 @@ static void hl_asic_dma_free_common(struct hl_device *hdev, size_t size, void *c break; } - trace_habanalabs_dma_free(hdev->dev, (u64) (uintptr_t) cpu_addr, dma_handle, size, caller); + trace_habanalabs_dma_free(hdev->dev, store_cpu_addr, dma_handle, size, caller); } void *hl_asic_dma_alloc_coherent_caller(struct hl_device *hdev, size_t size, dma_addr_t *dma_handle, -- cgit v1.2.3 From e325d5dbf34500fd42d5847d5b8c4e097f8030af Mon Sep 17 00:00:00 2001 From: Ohad Sharabi Date: Wed, 14 Sep 2022 08:53:29 +0300 Subject: habanalabs: allow setting HBM BAR to other regions Up until now the use-case in the driver was that the HBM is accessed using the HBM BAR, yet the BAR sometimes cannot cover the whole HBM and so we needed to set the BAR to other HBM offset. Now we are facing the need to access other PCI memory regions that can be covered by the HBM BAR. To answer that we are allowing the caller to determine if the HBM BAR need to be set or not regardless of the PCI memory region. Signed-off-by: Ohad Sharabi Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/device.c | 29 ++++++++++++++++------------- drivers/misc/habanalabs/common/habanalabs.h | 2 ++ 2 files changed, 18 insertions(+), 13 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c index dd01be5c4ba3..0026fe42b3d2 100644 --- a/drivers/misc/habanalabs/common/device.c +++ b/drivers/misc/habanalabs/common/device.c @@ -32,6 +32,7 @@ enum dma_alloc_type { * @hdev: pointer to habanalabs device structure. * @addr: the address the caller wants to access. * @region: the PCI region. + * @new_bar_region_base: the new BAR region base address. * * @return: the old BAR base address on success, U64_MAX for failure. * The caller should set it back to the old address after use. @@ -41,7 +42,8 @@ enum dma_alloc_type { * This function can be called also if the bar doesn't need to be set, * in that case it just won't change the base. */ -static u64 hl_set_dram_bar(struct hl_device *hdev, u64 addr, struct pci_mem_region *region) +static u64 hl_set_dram_bar(struct hl_device *hdev, u64 addr, struct pci_mem_region *region, + u64 *new_bar_region_base) { struct asic_fixed_properties *prop = &hdev->asic_prop; u64 bar_base_addr, old_base; @@ -55,27 +57,28 @@ static u64 hl_set_dram_bar(struct hl_device *hdev, u64 addr, struct pci_mem_regi old_base = hdev->asic_funcs->set_dram_bar_base(hdev, bar_base_addr); /* in case of success we need to update the new BAR base */ - if (old_base != U64_MAX) - region->region_base = bar_base_addr; + if ((old_base != U64_MAX) && new_bar_region_base) + *new_bar_region_base = bar_base_addr; return old_base; } -static int hl_access_sram_dram_region(struct hl_device *hdev, u64 addr, u64 *val, - enum debugfs_access_type acc_type, enum pci_region region_type) +int hl_access_sram_dram_region(struct hl_device *hdev, u64 addr, u64 *val, + enum debugfs_access_type acc_type, enum pci_region region_type, bool set_dram_bar) { struct pci_mem_region *region = &hdev->pci_mem_region[region_type]; + u64 old_base = 0, rc, new_bar_region_base = 0; void __iomem *acc_addr; - u64 old_base = 0, rc; - if (region_type == PCI_REGION_DRAM) { - old_base = hl_set_dram_bar(hdev, addr, region); + if (set_dram_bar) { + old_base = hl_set_dram_bar(hdev, addr, region, &new_bar_region_base); if (old_base == U64_MAX) return -EIO; } - acc_addr = hdev->pcie_bar[region->bar_id] + addr - region->region_base + - region->offset_in_bar; + acc_addr = hdev->pcie_bar[region->bar_id] + region->offset_in_bar + + (addr - new_bar_region_base); + switch (acc_type) { case DEBUGFS_READ8: *val = readb(acc_addr); @@ -97,8 +100,8 @@ static int hl_access_sram_dram_region(struct hl_device *hdev, u64 addr, u64 *val break; } - if (region_type == PCI_REGION_DRAM) { - rc = hl_set_dram_bar(hdev, old_base, region); + if (set_dram_bar) { + rc = hl_set_dram_bar(hdev, old_base, region, NULL); if (rc == U64_MAX) return -EIO; } @@ -283,7 +286,7 @@ int hl_access_dev_mem(struct hl_device *hdev, enum pci_region region_type, case PCI_REGION_SRAM: case PCI_REGION_DRAM: return hl_access_sram_dram_region(hdev, addr, val, acc_type, - region_type); + region_type, (region_type == PCI_REGION_DRAM)); default: return -EFAULT; } diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h index 4913197c433e..c8347eac09ed 100644 --- a/drivers/misc/habanalabs/common/habanalabs.h +++ b/drivers/misc/habanalabs/common/habanalabs.h @@ -3486,6 +3486,8 @@ void hl_asic_dma_pool_free_caller(struct hl_device *hdev, void *vaddr, dma_addr_ int hl_dma_map_sgtable(struct hl_device *hdev, struct sg_table *sgt, enum dma_data_direction dir); void hl_dma_unmap_sgtable(struct hl_device *hdev, struct sg_table *sgt, enum dma_data_direction dir); +int hl_access_sram_dram_region(struct hl_device *hdev, u64 addr, u64 *val, + enum debugfs_access_type acc_type, enum pci_region region_type, bool set_dram_bar); int hl_access_cfg_region(struct hl_device *hdev, u64 addr, u64 *val, enum debugfs_access_type acc_type); int hl_access_dev_mem(struct hl_device *hdev, enum pci_region region_type, -- cgit v1.2.3 From 5ad06bb1d2c073c8b071016226fb9ebe2163e660 Mon Sep 17 00:00:00 2001 From: Tomer Tayar Date: Thu, 20 Oct 2022 11:29:03 +0300 Subject: habanalabs/gaudi2: remove configurations to access the MSI-X doorbell The virtual MSI-X doorbell is supported now in F/W, so all configurations to access the PCIE_DBI MSI-X doorbell can be removed. Signed-off-by: Tomer Tayar Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/gaudi2/gaudi2.c | 34 +++------------------------------ 1 file changed, 3 insertions(+), 31 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/gaudi2/gaudi2.c b/drivers/misc/habanalabs/gaudi2/gaudi2.c index a4e3586f1a12..9208f69dd7f8 100644 --- a/drivers/misc/habanalabs/gaudi2/gaudi2.c +++ b/drivers/misc/habanalabs/gaudi2/gaudi2.c @@ -4473,23 +4473,9 @@ static void gaudi2_init_sm(struct hl_device *hdev) reg_val = FIELD_PREP(DCORE0_SYNC_MNGR_OBJS_MON_CONFIG_CQ_EN_MASK, 1); WREG32(mmDCORE0_SYNC_MNGR_OBJS_MON_CONFIG_0 + (4 * i), reg_val); - /* Init CQ0 DB */ - /* Configure the monitor to trigger MSI-X interrupt */ - /* TODO: - * Remove the if statement when virtual MSI-X doorbell is supported in simulator (SW-93022) - * and in F/W (SW-93024). - */ - if (!hdev->pdev || hdev->asic_prop.fw_security_enabled) { - u64 msix_db_reg = CFG_BASE + mmPCIE_DBI_MSIX_DOORBELL_OFF; - - WREG32(mmDCORE0_SYNC_MNGR_GLBL_LBW_ADDR_L_0, lower_32_bits(msix_db_reg)); - WREG32(mmDCORE0_SYNC_MNGR_GLBL_LBW_ADDR_H_0, upper_32_bits(msix_db_reg)); - } else { - WREG32(mmDCORE0_SYNC_MNGR_GLBL_LBW_ADDR_L_0, - lower_32_bits(gaudi2->virt_msix_db_dma_addr)); - WREG32(mmDCORE0_SYNC_MNGR_GLBL_LBW_ADDR_H_0, - upper_32_bits(gaudi2->virt_msix_db_dma_addr)); - } + /* Init CQ0 DB - configure the monitor to trigger MSI-X interrupt */ + WREG32(mmDCORE0_SYNC_MNGR_GLBL_LBW_ADDR_L_0, lower_32_bits(gaudi2->virt_msix_db_dma_addr)); + WREG32(mmDCORE0_SYNC_MNGR_GLBL_LBW_ADDR_H_0, upper_32_bits(gaudi2->virt_msix_db_dma_addr)); WREG32(mmDCORE0_SYNC_MNGR_GLBL_LBW_DATA_0, GAUDI2_IRQ_NUM_COMPLETION); for (i = 0 ; i < GAUDI2_RESERVED_CQ_NUMBER ; i++) { @@ -4657,20 +4643,6 @@ static void gaudi2_init_vdec_brdg_ctrl(struct hl_device *hdev, u64 base_addr, u3 { u32 sob_id; - /* TODO: - * Remove when virtual MSI-X doorbell is supported in simulator (SW-93022) and in F/W - * (SW-93024). - */ - if (!hdev->pdev || hdev->asic_prop.fw_security_enabled) { - u32 interrupt_id = GAUDI2_IRQ_NUM_DCORE0_DEC0_NRM + 2 * decoder_id; - - WREG32(base_addr + BRDG_CTRL_NRM_MSIX_LBW_AWADDR, mmPCIE_DBI_MSIX_DOORBELL_OFF); - WREG32(base_addr + BRDG_CTRL_NRM_MSIX_LBW_WDATA, interrupt_id); - WREG32(base_addr + BRDG_CTRL_ABNRM_MSIX_LBW_AWADDR, mmPCIE_DBI_MSIX_DOORBELL_OFF); - WREG32(base_addr + BRDG_CTRL_ABNRM_MSIX_LBW_WDATA, interrupt_id + 1); - return; - } - /* VCMD normal interrupt */ sob_id = GAUDI2_RESERVED_SOB_DEC_NRM_FIRST + decoder_id; WREG32(base_addr + BRDG_CTRL_NRM_MSIX_LBW_AWADDR, -- cgit v1.2.3 From 6bcb2d05a59b3534821a194f8642808ae56f2d10 Mon Sep 17 00:00:00 2001 From: Dani Liberman Date: Wed, 19 Oct 2022 20:24:55 +0300 Subject: habanalabs: fix user mappings calculation in case of page fault As there are 2 types of user mappings, pmmu and hmmu, calculate only the relevant mappings for the requested type. Signed-off-by: Dani Liberman Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/device.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c index 0026fe42b3d2..0e88396744a1 100644 --- a/drivers/misc/habanalabs/common/device.c +++ b/drivers/misc/habanalabs/common/device.c @@ -2305,8 +2305,13 @@ static void hl_capture_user_mappings(struct hl_device *hdev, bool is_pmmu) } mutex_lock(&ctx->mem_hash_lock); - hash_for_each(ctx->mem_hash, i, hnode, node) - pgf_info->num_of_user_mappings++; + hash_for_each(ctx->mem_hash, i, hnode, node) { + vm_type = hnode->ptr; + if (((*vm_type == VM_TYPE_USERPTR) && is_pmmu) || + ((*vm_type == VM_TYPE_PHYS_PACK) && !is_pmmu)) + pgf_info->num_of_user_mappings++; + + } if (!pgf_info->num_of_user_mappings) goto finish; -- cgit v1.2.3 From d1e0ac37ed41e581c030a8fffe4ad1d0bb987872 Mon Sep 17 00:00:00 2001 From: Ohad Sharabi Date: Sun, 23 Oct 2022 14:46:08 +0300 Subject: habanalabs: avoid divide by zero in device utilization Currently there is no verification whether the divisor is legal. Signed-off-by: Ohad Sharabi Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/device.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c index 0e88396744a1..b71303ba11d0 100644 --- a/drivers/misc/habanalabs/common/device.c +++ b/drivers/misc/habanalabs/common/device.c @@ -987,11 +987,16 @@ static void device_late_fini(struct hl_device *hdev) int hl_device_utilization(struct hl_device *hdev, u32 *utilization) { - u64 max_power, curr_power, dc_power, dividend; + u64 max_power, curr_power, dc_power, dividend, divisor; int rc; max_power = hdev->max_power; dc_power = hdev->asic_prop.dc_power_default; + divisor = max_power - dc_power; + if (!divisor) { + dev_warn(hdev->dev, "device utilization is not supported\n"); + return -EOPNOTSUPP; + } rc = hl_fw_cpucp_power_get(hdev, &curr_power); if (rc) @@ -1000,7 +1005,7 @@ int hl_device_utilization(struct hl_device *hdev, u32 *utilization) curr_power = clamp(curr_power, dc_power, max_power); dividend = (curr_power - dc_power) * 100; - *utilization = (u32) div_u64(dividend, (max_power - dc_power)); + *utilization = (u32) div_u64(dividend, divisor); return 0; } -- cgit v1.2.3 From a88a6f5f5cdfce21aaf988370287e0e78970c8ad Mon Sep 17 00:00:00 2001 From: Tomer Tayar Date: Fri, 30 Sep 2022 15:08:13 +0300 Subject: habanalabs: add support for graceful hard reset Calling hl_device_reset() for a hard reset will lead to a quite immediate device reset and to killing user process. For resets that follow errors, it disables the option to debug the errors on both the device side and the user application side. This patch adds a 'graceful hard reset' option and a new hl_device_cond_reset() function. Under some conditions, mainly if there is no user process or if he is not registered to driver notifications, this function will execute hard reset as usual. Otherwise, the reset will be postponed and a notification will be sent to user, to let him perform post-error actions and then to release the device, after which reset will take place. If device is not released by user in some defined time, a watchdog work will execute the reset in any case. Signed-off-by: Tomer Tayar Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/device.c | 141 +++++++++++++++++++++++++--- drivers/misc/habanalabs/common/habanalabs.h | 14 ++- 2 files changed, 140 insertions(+), 15 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c index b71303ba11d0..bcd959924971 100644 --- a/drivers/misc/habanalabs/common/device.c +++ b/drivers/misc/habanalabs/common/device.c @@ -16,7 +16,9 @@ #include -#define HL_RESET_DELAY_USEC 10000 /* 10ms */ +#define HL_RESET_DELAY_USEC 10000 /* 10ms */ + +#define HL_DEVICE_RELEASE_WATCHDOG_TIMEOUT_SEC 5 enum dma_alloc_type { DMA_ALLOC_COHERENT, @@ -387,7 +389,7 @@ bool hl_ctrl_device_operational(struct hl_device *hdev, static void hpriv_release(struct kref *ref) { u64 idle_mask[HL_BUSY_ENGINES_MASK_EXT_SIZE] = {0}; - bool device_is_idle = true; + bool reset_device, device_is_idle = true; struct hl_fpriv *hpriv; struct hl_device *hdev; @@ -404,14 +406,20 @@ static void hpriv_release(struct kref *ref) mutex_destroy(&hpriv->ctx_lock); mutex_destroy(&hpriv->restore_phase_mutex); - /* No need for idle status check if device is going to be reset in any case */ - if (!hdev->reset_upon_device_release && hdev->pdev && !hdev->pldm) + /* Device should be reset if reset-upon-device-release is enabled, or if there is a pending + * reset that waits for device release. + */ + reset_device = hdev->reset_upon_device_release || hdev->reset_info.watchdog_active; + + /* Unless device is reset in any case, check idle status and reset if device is not idle */ + if (!reset_device && hdev->pdev && !hdev->pldm) device_is_idle = hdev->asic_funcs->is_device_idle(hdev, idle_mask, HL_BUSY_ENGINES_MASK_EXT_SIZE, NULL); - - if (!device_is_idle) + if (!device_is_idle) { dev_err(hdev->dev, "device not idle after user context is closed (0x%llx_%llx)\n", idle_mask[1], idle_mask[0]); + reset_device = true; + } /* We need to remove the user from the list to make sure the reset process won't * try to kill the user process. Because, if we got here, it means there are no @@ -426,9 +434,10 @@ static void hpriv_release(struct kref *ref) list_del(&hpriv->dev_node); mutex_unlock(&hdev->fpriv_list_lock); - if (!device_is_idle || hdev->reset_upon_device_release) { + if (reset_device) { hl_device_reset(hdev, HL_DRV_RESET_DEV_RELEASE); } else { + /* Scrubbing is handled within hl_device_reset(), so here need to do it directly */ int rc = hdev->asic_funcs->scrub_device_mem(hdev); if (rc) @@ -695,6 +704,20 @@ static void device_hard_reset_pending(struct work_struct *work) } } +static void device_release_watchdog_func(struct work_struct *work) +{ + struct hl_device_reset_work *device_release_watchdog_work = + container_of(work, struct hl_device_reset_work, reset_work.work); + struct hl_device *hdev = device_release_watchdog_work->hdev; + u32 flags; + + dev_dbg(hdev->dev, "Device wasn't released in time. Initiate device reset.\n"); + + flags = device_release_watchdog_work->flags | HL_DRV_RESET_FROM_WD_THR; + + hl_device_reset(hdev, flags); +} + /* * device_early_init - do some early initialization for the habanalabs device * @@ -813,11 +836,14 @@ static int device_early_init(struct hl_device *hdev) goto free_cb_mgr; } - INIT_DELAYED_WORK(&hdev->device_reset_work.reset_work, - device_hard_reset_pending); + INIT_DELAYED_WORK(&hdev->device_reset_work.reset_work, device_hard_reset_pending); hdev->device_reset_work.hdev = hdev; hdev->device_fini_pending = 0; + INIT_DELAYED_WORK(&hdev->device_release_watchdog_work.reset_work, + device_release_watchdog_func); + hdev->device_release_watchdog_work.hdev = hdev; + mutex_init(&hdev->send_cpu_message_lock); mutex_init(&hdev->debug_lock); INIT_LIST_HEAD(&hdev->cs_mirror_list); @@ -1367,8 +1393,8 @@ static void handle_reset_trigger(struct hl_device *hdev, u32 flags) int hl_device_reset(struct hl_device *hdev, u32 flags) { bool hard_reset, from_hard_reset_thread, fw_reset, hard_instead_soft = false, - reset_upon_device_release = false, schedule_hard_reset = false, - skip_wq_flush, delay_reset; + reset_upon_device_release = false, schedule_hard_reset = false, delay_reset, + from_dev_release, from_watchdog_thread; u64 idle_mask[HL_BUSY_ENGINES_MASK_EXT_SIZE] = {0}; struct hl_ctx *ctx; int i, rc; @@ -1381,8 +1407,9 @@ int hl_device_reset(struct hl_device *hdev, u32 flags) hard_reset = !!(flags & HL_DRV_RESET_HARD); from_hard_reset_thread = !!(flags & HL_DRV_RESET_FROM_RESET_THR); fw_reset = !!(flags & HL_DRV_RESET_BYPASS_REQ_TO_FW); - skip_wq_flush = !!(flags & HL_DRV_RESET_DEV_RELEASE); + from_dev_release = !!(flags & HL_DRV_RESET_DEV_RELEASE); delay_reset = !!(flags & HL_DRV_RESET_DELAY); + from_watchdog_thread = !!(flags & HL_DRV_RESET_FROM_WD_THR); if (!hard_reset && !hdev->asic_prop.supports_compute_reset) { hard_instead_soft = true; @@ -1439,6 +1466,23 @@ do_reset: spin_unlock(&hdev->reset_info.lock); + /* Cancel the device release watchdog work if required. + * In case of reset-upon-device-release while the release watchdog work is + * scheduled, do hard-reset instead of compute-reset. + */ + if ((hard_reset || from_dev_release) && hdev->reset_info.watchdog_active) { + hdev->reset_info.watchdog_active = 0; + if (!from_watchdog_thread) + cancel_delayed_work_sync( + &hdev->device_release_watchdog_work.reset_work); + + if (from_dev_release) { + flags |= HL_DRV_RESET_HARD; + flags &= ~HL_DRV_RESET_DEV_RELEASE; + hard_reset = true; + } + } + if (delay_reset) usleep_range(HL_RESET_DELAY_USEC, HL_RESET_DELAY_USEC << 1); @@ -1474,7 +1518,7 @@ again: return 0; } - cleanup_resources(hdev, hard_reset, fw_reset, skip_wq_flush); + cleanup_resources(hdev, hard_reset, fw_reset, from_dev_release); kill_processes: if (hard_reset) { @@ -1735,6 +1779,73 @@ out_err: return rc; } +/* + * hl_device_cond_reset() - conditionally reset the device. + * @hdev: pointer to habanalabs device structure. + * @reset_flags: reset flags. + * @event_mask: events to notify user about. + * + * Conditionally reset the device, or alternatively schedule a watchdog work to reset the device + * unless another reset precedes it. + */ +int hl_device_cond_reset(struct hl_device *hdev, u32 flags, u64 event_mask) +{ + struct hl_ctx *ctx = NULL; + + /* Device release watchdog is only for hard reset */ + if (!(flags & HL_DRV_RESET_HARD) && hdev->asic_prop.allow_inference_soft_reset) + goto device_reset; + + /* F/W reset cannot be postponed */ + if (flags & HL_DRV_RESET_BYPASS_REQ_TO_FW) + goto device_reset; + + /* Device release watchdog is relevant only if user exists and gets a reset notification */ + if (!(event_mask & HL_NOTIFIER_EVENT_DEVICE_RESET)) { + dev_err(hdev->dev, "Resetting device without a reset indication to user\n"); + goto device_reset; + } + + ctx = hl_get_compute_ctx(hdev); + if (!ctx || !ctx->hpriv->notifier_event.eventfd) + goto device_reset; + + /* Schedule the device release watchdog work unless reset is already in progress or if the + * work is already scheduled. + */ + spin_lock(&hdev->reset_info.lock); + if (hdev->reset_info.in_reset) { + spin_unlock(&hdev->reset_info.lock); + goto device_reset; + } + + if (hdev->reset_info.watchdog_active) + goto out; + + hdev->device_release_watchdog_work.flags = flags; + dev_dbg(hdev->dev, "Device is going to be reset in %u sec unless being released\n", + hdev->device_release_watchdog_timeout_sec); + schedule_delayed_work(&hdev->device_release_watchdog_work.reset_work, + msecs_to_jiffies(hdev->device_release_watchdog_timeout_sec * 1000)); + hdev->reset_info.watchdog_active = 1; +out: + spin_unlock(&hdev->reset_info.lock); + + hl_notifier_event_send_all(hdev, event_mask); + + hl_ctx_put(ctx); + + return 0; + +device_reset: + if (event_mask) + hl_notifier_event_send_all(hdev, event_mask); + if (ctx) + hl_ctx_put(ctx); + + return hl_device_reset(hdev, flags); +} + static void hl_notifier_event_send(struct hl_notifier_event *notifier_event, u64 event_mask) { mutex_lock(¬ifier_event->lock); @@ -1932,6 +2043,8 @@ int hl_device_init(struct hl_device *hdev, struct class *hclass) hdev->asic_funcs->state_dump_init(hdev); + hdev->device_release_watchdog_timeout_sec = HL_DEVICE_RELEASE_WATCHDOG_TIMEOUT_SEC; + hdev->memory_scrub_val = MEM_SCRUB_DEFAULT_VAL; hl_debugfs_add_device(hdev); @@ -2152,6 +2265,8 @@ void hl_device_fini(struct hl_device *hdev) } } + cancel_delayed_work_sync(&hdev->device_release_watchdog_work.reset_work); + /* Disable PCI access from device F/W so it won't send us additional * interrupts. We disable MSI/MSI-X at the halt_engines function and we * can't have the F/W sending us interrupts after that. We need to diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h index c8347eac09ed..bfaaa9daa750 100644 --- a/drivers/misc/habanalabs/common/habanalabs.h +++ b/drivers/misc/habanalabs/common/habanalabs.h @@ -191,6 +191,9 @@ enum hl_mmu_enablement { * * - HL_DRV_RESET_DELAY * Set if a delay should be added before the reset + * + * - HL_DRV_RESET_FROM_WD_THR + * Set if the caller is the device release watchdog thread */ #define HL_DRV_RESET_HARD (1 << 0) @@ -201,6 +204,7 @@ enum hl_mmu_enablement { #define HL_DRV_RESET_BYPASS_REQ_TO_FW (1 << 5) #define HL_DRV_RESET_FW_FATAL_ERR (1 << 6) #define HL_DRV_RESET_DELAY (1 << 7) +#define HL_DRV_RESET_FROM_WD_THR (1 << 8) /* * Security @@ -3009,6 +3013,7 @@ struct hl_error_info { * same cause. * @skip_reset_on_timeout: Skip device reset if CS has timed out, wait for it to * complete instead. + * @watchdog_active: true if a device release watchdog work is scheduled. */ struct hl_reset_info { spinlock_t lock; @@ -3019,12 +3024,11 @@ struct hl_reset_info { u8 in_compute_reset; u8 needs_reset; u8 hard_reset_pending; - u8 curr_reset_cause; u8 prev_reset_trigger; u8 reset_trigger_repeated; - u8 skip_reset_on_timeout; + u8 watchdog_active; }; /** @@ -3040,6 +3044,8 @@ struct hl_reset_info { * @dev_ctrl: related kernel device structure for the control device * @work_heartbeat: delayed work for CPU-CP is-alive check. * @device_reset_work: delayed work which performs hard reset + * @device_release_watchdog_work: watchdog work that performs hard reset if user doesn't release + * device upon certain error cases. * @asic_name: ASIC specific name. * @asic_type: ASIC specific type. * @completion_queue: array of hl_cq. @@ -3149,6 +3155,7 @@ struct hl_reset_info { * indicates which decoder engines are binned-out * @edma_binning: contains mask of edma engines that is received from the f/w which * indicates which edma engines are binned-out + * @device_release_watchdog_timeout_sec: device release watchdog timeout value in seconds. * @id: device minor. * @id_control: minor of the control device. * @cdev_idx: char device index. Used for setting its name. @@ -3218,6 +3225,7 @@ struct hl_device { struct device *dev_ctrl; struct delayed_work work_heartbeat; struct hl_device_reset_work device_reset_work; + struct hl_device_reset_work device_release_watchdog_work; char asic_name[HL_STR_MAX]; char status[HL_DEV_STS_MAX][HL_STR_MAX]; enum hl_asic_type asic_type; @@ -3312,6 +3320,7 @@ struct hl_device { u32 high_pll; u32 decoder_binning; u32 edma_binning; + u32 device_release_watchdog_timeout_sec; u16 id; u16 id_control; u16 cdev_idx; @@ -3551,6 +3560,7 @@ void hl_device_fini(struct hl_device *hdev); int hl_device_suspend(struct hl_device *hdev); int hl_device_resume(struct hl_device *hdev); int hl_device_reset(struct hl_device *hdev, u32 flags); +int hl_device_cond_reset(struct hl_device *hdev, u32 flags, u64 event_mask); void hl_hpriv_get(struct hl_fpriv *hpriv); int hl_hpriv_put(struct hl_fpriv *hpriv); int hl_device_utilization(struct hl_device *hdev, u32 *utilization); -- cgit v1.2.3 From 11669b58fa1cee8442ae31ad4ba71398729727b5 Mon Sep 17 00:00:00 2001 From: Tomer Tayar Date: Fri, 30 Sep 2022 16:37:41 +0300 Subject: habanalabs: add an option to control watchdog timeout via debugfs Add an option to control the timeout value for the driver's watchdog of the reset process. The timeout represents the amount of the user has to close his process once he gets a device reset notification from the driver. Signed-off-by: Tomer Tayar Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- Documentation/ABI/testing/debugfs-driver-habanalabs | 7 +++++++ drivers/misc/habanalabs/common/debugfs.c | 5 +++++ 2 files changed, 12 insertions(+) (limited to 'drivers/misc') diff --git a/Documentation/ABI/testing/debugfs-driver-habanalabs b/Documentation/ABI/testing/debugfs-driver-habanalabs index c915bf17b293..85f6d04f528b 100644 --- a/Documentation/ABI/testing/debugfs-driver-habanalabs +++ b/Documentation/ABI/testing/debugfs-driver-habanalabs @@ -91,6 +91,13 @@ Description: Enables the root user to set the device to specific state. Valid values are "disable", "enable", "suspend", "resume". User can read this property to see the valid values +What: /sys/kernel/debug/habanalabs/hl/device_release_watchdog_timeout +Date: Oct 2022 +KernelVersion: 6.2 +Contact: ttayar@habana.ai +Description: The watchdog timeout value in seconds for a device relese upon + certain error cases, after which the device is reset. + What: /sys/kernel/debug/habanalabs/hl/dma_size Date: Apr 2021 KernelVersion: 5.13 diff --git a/drivers/misc/habanalabs/common/debugfs.c b/drivers/misc/habanalabs/common/debugfs.c index 48d3ec8b5c82..945c0e6758ca 100644 --- a/drivers/misc/habanalabs/common/debugfs.c +++ b/drivers/misc/habanalabs/common/debugfs.c @@ -1769,6 +1769,11 @@ void hl_debugfs_add_device(struct hl_device *hdev) dev_entry, &hl_timeout_locked_fops); + debugfs_create_u32("device_release_watchdog_timeout", + 0644, + dev_entry->root, + &hdev->device_release_watchdog_timeout_sec); + for (i = 0, entry = dev_entry->entry_arr ; i < count ; i++, entry++) { debugfs_create_file(hl_debugfs_list[i].name, 0444, -- cgit v1.2.3 From 5b8873b39c5d4ee93e382389b199d553b38b19f3 Mon Sep 17 00:00:00 2001 From: Tomer Tayar Date: Fri, 30 Sep 2022 16:43:47 +0300 Subject: habanalabs/gaudi: use graceful hard reset for F/W events Use graceful hard reset for F/W events on Gaudi device that require a device reset. Signed-off-by: Tomer Tayar Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/gaudi/gaudi.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c index 337123f73501..3dfb9ecf7db3 100644 --- a/drivers/misc/habanalabs/gaudi/gaudi.c +++ b/drivers/misc/habanalabs/gaudi/gaudi.c @@ -7942,16 +7942,14 @@ reset_device: reset_required = false; } - /* despite reset doesn't execute. a notification on - * occurred event needs to be sent here - */ - if (event_mask) - hl_notifier_event_send_all(hdev, event_mask); - - if (reset_required) - hl_device_reset(hdev, flags); - else + if (reset_required) { + hl_device_cond_reset(hdev, flags, event_mask); + } else { hl_fw_unmask_irq(hdev, event_type); + /* Notification on occurred event needs to be sent although reset is not executed */ + if (event_mask) + hl_notifier_event_send_all(hdev, event_mask); + } } static void *gaudi_get_events_stat(struct hl_device *hdev, bool aggregate, u32 *size) -- cgit v1.2.3 From d1ce7e5ea140bb01d8c6faded09b9264bb83f722 Mon Sep 17 00:00:00 2001 From: Tomer Tayar Date: Fri, 30 Sep 2022 16:57:54 +0300 Subject: habanalabs/gaudi2: use graceful hard reset for F/W events Use graceful hard reset for F/W events on Gaudi2 device that require a device reset. While at it, do a small refactor of the checks and function calls, to simplify it and to avoid code duplication. Signed-off-by: Tomer Tayar Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/gaudi2/gaudi2.c | 27 ++++++++++----------------- 1 file changed, 10 insertions(+), 17 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/gaudi2/gaudi2.c b/drivers/misc/habanalabs/gaudi2/gaudi2.c index 9208f69dd7f8..22f5445fe71c 100644 --- a/drivers/misc/habanalabs/gaudi2/gaudi2.c +++ b/drivers/misc/habanalabs/gaudi2/gaudi2.c @@ -8768,9 +8768,9 @@ static void hl_arc_event_handle(struct hl_device *hdev, static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entry) { - u32 ctl, reset_flags = HL_DRV_RESET_HARD | HL_DRV_RESET_DELAY; - struct gaudi2_device *gaudi2 = hdev->asic_specific; bool reset_required = false, skip_reset = false, is_critical = false; + struct gaudi2_device *gaudi2 = hdev->asic_specific; + u32 ctl, reset_flags = HL_DRV_RESET_HARD; int index, sbte_index; u64 event_mask = 0; u16 event_type; @@ -9158,7 +9158,9 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent event_type); } - if ((gaudi2_irq_map_table[event_type].reset || reset_required) && !skip_reset) + if ((gaudi2_irq_map_table[event_type].reset || reset_required) && !skip_reset && + (hdev->hard_reset_on_fw_events || + (hdev->asic_prop.fw_security_enabled && is_critical))) goto reset_device; /* Send unmask irq only for interrupts not classified as MSG */ @@ -9172,22 +9174,13 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent reset_device: if (hdev->asic_prop.fw_security_enabled && is_critical) { - reset_flags = HL_DRV_RESET_HARD | HL_DRV_RESET_BYPASS_REQ_TO_FW; - - /* notify on device unavailable while the reset triggered by fw */ - event_mask |= (HL_NOTIFIER_EVENT_DEVICE_RESET | - HL_NOTIFIER_EVENT_DEVICE_UNAVAILABLE); - hl_device_reset(hdev, reset_flags); - } else if (hdev->hard_reset_on_fw_events) { - event_mask |= HL_NOTIFIER_EVENT_DEVICE_RESET; - hl_device_reset(hdev, reset_flags); + reset_flags |= HL_DRV_RESET_BYPASS_REQ_TO_FW; + event_mask |= HL_NOTIFIER_EVENT_DEVICE_UNAVAILABLE; } else { - if (!gaudi2_irq_map_table[event_type].msg) - hl_fw_unmask_irq(hdev, event_type); + reset_flags |= HL_DRV_RESET_DELAY; } - - if (event_mask) - hl_notifier_event_send_all(hdev, event_mask); + event_mask |= HL_NOTIFIER_EVENT_DEVICE_RESET; + hl_device_cond_reset(hdev, reset_flags, event_mask); } static int gaudi2_memset_device_memory(struct hl_device *hdev, u64 addr, u64 size, u64 val) -- cgit v1.2.3 From 1b363adc7fbe37c4b6c18864c1f7043d85b4af6e Mon Sep 17 00:00:00 2001 From: Tomer Tayar Date: Fri, 30 Sep 2022 17:02:19 +0300 Subject: habanalabs: use graceful hard reset for CS timeouts Use graceful hard reset when detecting a CS timeout that requires a device reset. Signed-off-by: Tomer Tayar Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/command_submission.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/common/command_submission.c b/drivers/misc/habanalabs/common/command_submission.c index fa05770865c6..f1c69c8ed74a 100644 --- a/drivers/misc/habanalabs/common/command_submission.c +++ b/drivers/misc/habanalabs/common/command_submission.c @@ -798,7 +798,7 @@ out: static void cs_timedout(struct work_struct *work) { struct hl_device *hdev; - u64 event_mask; + u64 event_mask = 0x0; int rc; struct hl_cs *cs = container_of(work, struct hl_cs, work_tdr.work); @@ -830,11 +830,7 @@ static void cs_timedout(struct work_struct *work) if (rc) { hdev->captured_err_info.cs_timeout.timestamp = ktime_get(); hdev->captured_err_info.cs_timeout.seq = cs->sequence; - - event_mask = device_reset ? (HL_NOTIFIER_EVENT_CS_TIMEOUT | - HL_NOTIFIER_EVENT_DEVICE_RESET) : HL_NOTIFIER_EVENT_CS_TIMEOUT; - - hl_notifier_event_send_all(hdev, event_mask); + event_mask |= HL_NOTIFIER_EVENT_CS_TIMEOUT; } switch (cs->type) { @@ -869,8 +865,12 @@ static void cs_timedout(struct work_struct *work) cs_put(cs); - if (device_reset) - hl_device_reset(hdev, HL_DRV_RESET_TDR); + if (device_reset) { + event_mask |= HL_NOTIFIER_EVENT_DEVICE_RESET; + hl_device_cond_reset(hdev, HL_DRV_RESET_TDR, event_mask); + } else if (event_mask) { + hl_notifier_event_send_all(hdev, event_mask); + } } static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx, -- cgit v1.2.3 From 4a9c6e2cdf2b4128f5204b9cf14e3a788a8511df Mon Sep 17 00:00:00 2001 From: Tal Cohen Date: Tue, 18 Oct 2022 17:35:06 +0300 Subject: habanalabs: no consecutive err when user context is enabled Consecutive error protects a device reset loop from being triggered due to h/w issues and enters the device into an unavailable state. When user may cause the error, an unavailable state will prevent the user from running its workloads. The commit prevents entering consecutive state when a user context is enabled. Signed-off-by: Tal Cohen Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/device.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c index bcd959924971..61ddcb1ce508 100644 --- a/drivers/misc/habanalabs/common/device.c +++ b/drivers/misc/habanalabs/common/device.c @@ -1320,6 +1320,10 @@ static void handle_reset_trigger(struct hl_device *hdev, u32 flags) { u32 cur_reset_trigger = HL_RESET_TRIGGER_DEFAULT; + /* No consecutive mechanism when user context exists */ + if (hdev->is_compute_ctx_active) + return; + /* * 'reset cause' is being updated here, because getting here * means that it's the 1st time and the last time we're here -- cgit v1.2.3 From 679e968908a4997d02c2a7df294e97b066f9149f Mon Sep 17 00:00:00 2001 From: farah kassabri Date: Tue, 20 Sep 2022 11:48:40 +0300 Subject: habanalabs: zero ts registration buff when allocated To avoid memory corruption in kernel memory while using timestamp registration nodes, zero the kernel buff memory when its allocated. Signed-off-by: farah kassabri Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/common/memory.c b/drivers/misc/habanalabs/common/memory.c index 99b1d6ce26ae..541e1b6a2176 100644 --- a/drivers/misc/habanalabs/common/memory.c +++ b/drivers/misc/habanalabs/common/memory.c @@ -2109,7 +2109,7 @@ static int hl_ts_alloc_buf(struct hl_mmap_mem_buf *buf, gfp_t gfp, void *args) /* Allocate the internal kernel buffer */ size = num_elements * sizeof(struct hl_user_pending_interrupt); - p = vmalloc(size); + p = vzalloc(size); if (!p) goto free_user_buff; -- cgit v1.2.3 From fc69aa8640f8baf9c1246c17ca858bab9aea98b0 Mon Sep 17 00:00:00 2001 From: Tomer Tayar Date: Mon, 24 Oct 2022 01:14:18 +0300 Subject: habanalabs: fix PCIe access to SRAM via debugfs hl_access_sram_dram_region() uses a region base which is set within the hl_set_dram_bar() function. However, for SRAM access this function is not called, and we end up with a wrong value of region base and with a bad calculated address. Fix it by initializing the region base value independently of whether hl_set_dram_bar() is called or not. Signed-off-by: Tomer Tayar Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/device.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c index 61ddcb1ce508..cb8ecc17bba1 100644 --- a/drivers/misc/habanalabs/common/device.c +++ b/drivers/misc/habanalabs/common/device.c @@ -69,17 +69,17 @@ int hl_access_sram_dram_region(struct hl_device *hdev, u64 addr, u64 *val, enum debugfs_access_type acc_type, enum pci_region region_type, bool set_dram_bar) { struct pci_mem_region *region = &hdev->pci_mem_region[region_type]; - u64 old_base = 0, rc, new_bar_region_base = 0; + u64 old_base = 0, rc, bar_region_base = region->region_base; void __iomem *acc_addr; if (set_dram_bar) { - old_base = hl_set_dram_bar(hdev, addr, region, &new_bar_region_base); + old_base = hl_set_dram_bar(hdev, addr, region, &bar_region_base); if (old_base == U64_MAX) return -EIO; } acc_addr = hdev->pcie_bar[region->bar_id] + region->offset_in_bar + - (addr - new_bar_region_base); + (addr - bar_region_base); switch (acc_type) { case DEBUGFS_READ8: -- cgit v1.2.3 From bdfef91e7c9c2bae083ce1965f53115d88329773 Mon Sep 17 00:00:00 2001 From: Ofir Bitton Date: Wed, 19 Oct 2022 14:05:18 +0300 Subject: habanalabs: add warning print upon a PCI error In order to know if driver catches PCI errors correctly, we need to print a warning per each error. Signed-off-by: Ofir Bitton Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/habanalabs_drv.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/common/habanalabs_drv.c b/drivers/misc/habanalabs/common/habanalabs_drv.c index 714994725224..e82af8989700 100644 --- a/drivers/misc/habanalabs/common/habanalabs_drv.c +++ b/drivers/misc/habanalabs/common/habanalabs_drv.c @@ -595,15 +595,16 @@ hl_pci_err_detected(struct pci_dev *pdev, pci_channel_state_t state) switch (state) { case pci_channel_io_normal: + dev_warn(hdev->dev, "PCI normal state error detected\n"); return PCI_ERS_RESULT_CAN_RECOVER; case pci_channel_io_frozen: - dev_warn(hdev->dev, "frozen state error detected\n"); + dev_warn(hdev->dev, "PCI frozen state error detected\n"); result = PCI_ERS_RESULT_NEED_RESET; break; case pci_channel_io_perm_failure: - dev_warn(hdev->dev, "failure state error detected\n"); + dev_warn(hdev->dev, "PCI failure state error detected\n"); result = PCI_ERS_RESULT_DISCONNECT; break; @@ -639,6 +640,10 @@ static void hl_pci_err_resume(struct pci_dev *pdev) */ static pci_ers_result_t hl_pci_err_slot_reset(struct pci_dev *pdev) { + struct hl_device *hdev = pci_get_drvdata(pdev); + + dev_warn(hdev->dev, "PCI slot reset detected\n"); + return PCI_ERS_RESULT_RECOVERED; } -- cgit v1.2.3 From 306206985a4bcfc12b45596d56c7bd8ba6f0f6b1 Mon Sep 17 00:00:00 2001 From: Ofir Bitton Date: Wed, 26 Oct 2022 18:20:49 +0300 Subject: habanalabs: remove redundant gaudi2_sec asic type As Gaudi2 has a single PCI id, the secured asic type is redundant. Signed-off-by: Ofir Bitton Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/device.c | 3 --- drivers/misc/habanalabs/common/habanalabs.h | 2 -- drivers/misc/habanalabs/common/mmu/mmu.c | 1 - drivers/misc/habanalabs/common/sysfs.c | 2 -- drivers/misc/habanalabs/gaudi2/gaudi2.c | 2 +- 5 files changed, 1 insertion(+), 9 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c index cb8ecc17bba1..3ea1ee1ec8ef 100644 --- a/drivers/misc/habanalabs/common/device.c +++ b/drivers/misc/habanalabs/common/device.c @@ -748,9 +748,6 @@ static int device_early_init(struct hl_device *hdev) gaudi2_set_asic_funcs(hdev); strscpy(hdev->asic_name, "GAUDI2", sizeof(hdev->asic_name)); break; - case ASIC_GAUDI2_SEC: - gaudi2_set_asic_funcs(hdev); - strscpy(hdev->asic_name, "GAUDI2 SEC", sizeof(hdev->asic_name)); break; default: dev_err(hdev->dev, "Unrecognized ASIC type %d\n", diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h index bfaaa9daa750..7d191f388953 100644 --- a/drivers/misc/habanalabs/common/habanalabs.h +++ b/drivers/misc/habanalabs/common/habanalabs.h @@ -1192,7 +1192,6 @@ struct hl_dec { * @ASIC_GAUDI: Gaudi device (HL-2000). * @ASIC_GAUDI_SEC: Gaudi secured device (HL-2000). * @ASIC_GAUDI2: Gaudi2 device. - * @ASIC_GAUDI2_SEC: Gaudi2 secured device. */ enum hl_asic_type { ASIC_INVALID, @@ -1200,7 +1199,6 @@ enum hl_asic_type { ASIC_GAUDI, ASIC_GAUDI_SEC, ASIC_GAUDI2, - ASIC_GAUDI2_SEC, }; struct hl_cs_parser; diff --git a/drivers/misc/habanalabs/common/mmu/mmu.c b/drivers/misc/habanalabs/common/mmu/mmu.c index 589179f8cd41..67d3e70cf571 100644 --- a/drivers/misc/habanalabs/common/mmu/mmu.c +++ b/drivers/misc/habanalabs/common/mmu/mmu.c @@ -635,7 +635,6 @@ int hl_mmu_if_set_funcs(struct hl_device *hdev) hl_mmu_v1_set_funcs(hdev, &hdev->mmu_func[MMU_DR_PGT]); break; case ASIC_GAUDI2: - case ASIC_GAUDI2_SEC: /* MMUs in Gaudi2 are always host resident */ hl_mmu_v2_hr_set_funcs(hdev, &hdev->mmu_func[MMU_HR_PGT]); break; diff --git a/drivers/misc/habanalabs/common/sysfs.c b/drivers/misc/habanalabs/common/sysfs.c index 36e9814139d1..c924fc994bd9 100644 --- a/drivers/misc/habanalabs/common/sysfs.c +++ b/drivers/misc/habanalabs/common/sysfs.c @@ -248,8 +248,6 @@ static ssize_t device_type_show(struct device *dev, case ASIC_GAUDI2: str = "GAUDI2"; break; - case ASIC_GAUDI2_SEC: - str = "GAUDI2 SEC"; break; default: dev_err(hdev->dev, "Unrecognized ASIC type %d\n", diff --git a/drivers/misc/habanalabs/gaudi2/gaudi2.c b/drivers/misc/habanalabs/gaudi2/gaudi2.c index 22f5445fe71c..03f8cf9bb136 100644 --- a/drivers/misc/habanalabs/gaudi2/gaudi2.c +++ b/drivers/misc/habanalabs/gaudi2/gaudi2.c @@ -3969,7 +3969,7 @@ static void gaudi2_init_firmware_loader(struct hl_device *hdev) fw_loader->sram_bar_id = SRAM_CFG_BAR_ID; fw_loader->dram_bar_id = DRAM_BAR_ID; - if (hdev->asic_type == ASIC_GAUDI2 || hdev->asic_type == ASIC_GAUDI2_SEC) + if (hdev->asic_type == ASIC_GAUDI2) fw_loader->cpu_timeout = GAUDI2_CPU_TIMEOUT_USEC; else /* ASIC_GAUDI2_FPGA */ fw_loader->cpu_timeout = GAUDI2_FPGA_CPU_TIMEOUT; -- cgit v1.2.3 From 841cd2d7658d92e09354640c1887797f0da3d444 Mon Sep 17 00:00:00 2001 From: Ofir Bitton Date: Wed, 26 Oct 2022 16:20:45 +0300 Subject: habanalabs/gaudi2: add PCI revision 2 support Add support for Gaudi2 Device with PCI revision 2. Functionality is exactly the same as revision 1, the only difference is device name exposed to user. Signed-off-by: Ofir Bitton Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/device.c | 4 ++++ drivers/misc/habanalabs/common/habanalabs.h | 2 ++ drivers/misc/habanalabs/common/habanalabs_drv.c | 26 +++++++++++++++------- drivers/misc/habanalabs/common/habanalabs_ioctl.c | 6 +++-- drivers/misc/habanalabs/common/mmu/mmu.c | 1 + drivers/misc/habanalabs/common/sysfs.c | 2 ++ drivers/misc/habanalabs/gaudi2/gaudi2.c | 6 +---- drivers/misc/habanalabs/gaudi2/gaudi2P.h | 2 -- .../habanalabs/include/hw_ip/pci/pci_general.h | 7 ++++++ include/uapi/misc/habanalabs.h | 7 ++++++ 10 files changed, 46 insertions(+), 17 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c index 3ea1ee1ec8ef..35ed494fcfdf 100644 --- a/drivers/misc/habanalabs/common/device.c +++ b/drivers/misc/habanalabs/common/device.c @@ -748,6 +748,10 @@ static int device_early_init(struct hl_device *hdev) gaudi2_set_asic_funcs(hdev); strscpy(hdev->asic_name, "GAUDI2", sizeof(hdev->asic_name)); break; + case ASIC_GAUDI2B: + gaudi2_set_asic_funcs(hdev); + strscpy(hdev->asic_name, "GAUDI2B", sizeof(hdev->asic_name)); + break; break; default: dev_err(hdev->dev, "Unrecognized ASIC type %d\n", diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h index 7d191f388953..e391e7951fb7 100644 --- a/drivers/misc/habanalabs/common/habanalabs.h +++ b/drivers/misc/habanalabs/common/habanalabs.h @@ -1192,6 +1192,7 @@ struct hl_dec { * @ASIC_GAUDI: Gaudi device (HL-2000). * @ASIC_GAUDI_SEC: Gaudi secured device (HL-2000). * @ASIC_GAUDI2: Gaudi2 device. + * @ASIC_GAUDI2B: Gaudi2B device. */ enum hl_asic_type { ASIC_INVALID, @@ -1199,6 +1200,7 @@ enum hl_asic_type { ASIC_GAUDI, ASIC_GAUDI_SEC, ASIC_GAUDI2, + ASIC_GAUDI2B, }; struct hl_cs_parser; diff --git a/drivers/misc/habanalabs/common/habanalabs_drv.c b/drivers/misc/habanalabs/common/habanalabs_drv.c index e82af8989700..7815c60df54e 100644 --- a/drivers/misc/habanalabs/common/habanalabs_drv.c +++ b/drivers/misc/habanalabs/common/habanalabs_drv.c @@ -9,6 +9,7 @@ #define pr_fmt(fmt) "habanalabs: " fmt #include "habanalabs.h" +#include "../include/hw_ip/pci/pci_general.h" #include #include @@ -74,16 +75,17 @@ MODULE_DEVICE_TABLE(pci, ids); /* * get_asic_type - translate device id to asic type * - * @device: id of the PCI device + * @hdev: pointer to habanalabs device structure. * - * Translate device id to asic type. + * Translate device id and revision id to asic type. * In case of unidentified device, return -1 */ -static enum hl_asic_type get_asic_type(u16 device) +static enum hl_asic_type get_asic_type(struct hl_device *hdev) { - enum hl_asic_type asic_type; + struct pci_dev *pdev = hdev->pdev; + enum hl_asic_type asic_type = ASIC_INVALID; - switch (device) { + switch (pdev->device) { case PCI_IDS_GOYA: asic_type = ASIC_GOYA; break; @@ -94,10 +96,18 @@ static enum hl_asic_type get_asic_type(u16 device) asic_type = ASIC_GAUDI_SEC; break; case PCI_IDS_GAUDI2: - asic_type = ASIC_GAUDI2; + switch (pdev->revision) { + case REV_ID_A: + asic_type = ASIC_GAUDI2; + break; + case REV_ID_B: + asic_type = ASIC_GAUDI2B; + break; + default: + break; + } break; default: - asic_type = ASIC_INVALID; break; } @@ -416,7 +426,7 @@ static int create_hdev(struct hl_device **dev, struct pci_dev *pdev) /* First, we must find out which ASIC are we handling. This is needed * to configure the behavior of the driver (kernel parameters) */ - hdev->asic_type = get_asic_type(pdev->device); + hdev->asic_type = get_asic_type(hdev); if (hdev->asic_type == ASIC_INVALID) { dev_err(&pdev->dev, "Unsupported ASIC\n"); rc = -ENODEV; diff --git a/drivers/misc/habanalabs/common/habanalabs_ioctl.c b/drivers/misc/habanalabs/common/habanalabs_ioctl.c index 5ce5c42e2731..ee43017eb563 100644 --- a/drivers/misc/habanalabs/common/habanalabs_ioctl.c +++ b/drivers/misc/habanalabs/common/habanalabs_ioctl.c @@ -10,10 +10,11 @@ #include #include "habanalabs.h" -#include #include -#include +#include +#include #include +#include #include static u32 hl_debug_struct_size[HL_DEBUG_OP_TIMESTAMP + 1] = { @@ -105,6 +106,7 @@ static int hw_ip_info(struct hl_device *hdev, struct hl_info_args *args) hw_ip.edma_enabled_mask = prop->edma_enabled_mask; hw_ip.server_type = prop->server_type; hw_ip.security_enabled = prop->fw_security_enabled; + hw_ip.revision_id = hdev->pdev->revision; return copy_to_user(out, &hw_ip, min((size_t) size, sizeof(hw_ip))) ? -EFAULT : 0; diff --git a/drivers/misc/habanalabs/common/mmu/mmu.c b/drivers/misc/habanalabs/common/mmu/mmu.c index 67d3e70cf571..2c1005f74cf4 100644 --- a/drivers/misc/habanalabs/common/mmu/mmu.c +++ b/drivers/misc/habanalabs/common/mmu/mmu.c @@ -635,6 +635,7 @@ int hl_mmu_if_set_funcs(struct hl_device *hdev) hl_mmu_v1_set_funcs(hdev, &hdev->mmu_func[MMU_DR_PGT]); break; case ASIC_GAUDI2: + case ASIC_GAUDI2B: /* MMUs in Gaudi2 are always host resident */ hl_mmu_v2_hr_set_funcs(hdev, &hdev->mmu_func[MMU_HR_PGT]); break; diff --git a/drivers/misc/habanalabs/common/sysfs.c b/drivers/misc/habanalabs/common/sysfs.c index c924fc994bd9..735d8bed0066 100644 --- a/drivers/misc/habanalabs/common/sysfs.c +++ b/drivers/misc/habanalabs/common/sysfs.c @@ -248,6 +248,8 @@ static ssize_t device_type_show(struct device *dev, case ASIC_GAUDI2: str = "GAUDI2"; break; + case ASIC_GAUDI2B: + str = "GAUDI2B"; break; default: dev_err(hdev->dev, "Unrecognized ASIC type %d\n", diff --git a/drivers/misc/habanalabs/gaudi2/gaudi2.c b/drivers/misc/habanalabs/gaudi2/gaudi2.c index 03f8cf9bb136..f21b68be6d20 100644 --- a/drivers/misc/habanalabs/gaudi2/gaudi2.c +++ b/drivers/misc/habanalabs/gaudi2/gaudi2.c @@ -3968,11 +3968,7 @@ static void gaudi2_init_firmware_loader(struct hl_device *hdev) fw_loader->skip_bmc = false; fw_loader->sram_bar_id = SRAM_CFG_BAR_ID; fw_loader->dram_bar_id = DRAM_BAR_ID; - - if (hdev->asic_type == ASIC_GAUDI2) - fw_loader->cpu_timeout = GAUDI2_CPU_TIMEOUT_USEC; - else /* ASIC_GAUDI2_FPGA */ - fw_loader->cpu_timeout = GAUDI2_FPGA_CPU_TIMEOUT; + fw_loader->cpu_timeout = GAUDI2_CPU_TIMEOUT_USEC; /* here we update initial values for few specific dynamic regs (as * before reading the first descriptor from FW those value has to be diff --git a/drivers/misc/habanalabs/gaudi2/gaudi2P.h b/drivers/misc/habanalabs/gaudi2/gaudi2P.h index a99c348bbf39..b4383c199bbb 100644 --- a/drivers/misc/habanalabs/gaudi2/gaudi2P.h +++ b/drivers/misc/habanalabs/gaudi2/gaudi2P.h @@ -23,8 +23,6 @@ #define GAUDI2_CPU_TIMEOUT_USEC 30000000 /* 30s */ -#define GAUDI2_FPGA_CPU_TIMEOUT 100000000 /* 100s */ - #define NUMBER_OF_PDMA_QUEUES 2 #define NUMBER_OF_EDMA_QUEUES 8 #define NUMBER_OF_MME_QUEUES 4 diff --git a/drivers/misc/habanalabs/include/hw_ip/pci/pci_general.h b/drivers/misc/habanalabs/include/hw_ip/pci/pci_general.h index d232081d4e0f..f5d497dc9bdc 100644 --- a/drivers/misc/habanalabs/include/hw_ip/pci/pci_general.h +++ b/drivers/misc/habanalabs/include/hw_ip/pci/pci_general.h @@ -20,4 +20,11 @@ #define PCI_CONFIG_ELBI_STS_MASK (PCI_CONFIG_ELBI_STS_ERR | \ PCI_CONFIG_ELBI_STS_DONE) +enum hl_revision_id { + /* PCI revision ID 0 is not legal */ + REV_ID_INVALID = 0x00, + REV_ID_A = 0x01, + REV_ID_B = 0x02, +}; + #endif /* INCLUDE_PCI_GENERAL_H_ */ diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h index a4ceee681898..58343998bd63 100644 --- a/include/uapi/misc/habanalabs.h +++ b/include/uapi/misc/habanalabs.h @@ -868,6 +868,7 @@ enum hl_server_type { * @number_of_user_interrupts: The number of interrupts that are available to the userspace * application to use. Relevant for Gaudi2 and later. * @device_mem_alloc_default_page_size: default page size used in device memory allocation. + * @revision_id: PCI revision ID of the ASIC. */ struct hl_info_hw_ip_info { __u64 sram_base_address; @@ -898,6 +899,12 @@ struct hl_info_hw_ip_info { __u16 pad2; __u64 reserved4; __u64 device_mem_alloc_default_page_size; + __u64 reserved5; + __u64 reserved6; + __u32 reserved7; + __u8 reserved8; + __u8 revision_id; + __u8 pad[2]; }; struct hl_info_dram_usage { -- cgit v1.2.3 From cb5fb665f30388cf8cb9becae86dcb84ace0ca88 Mon Sep 17 00:00:00 2001 From: Dani Liberman Date: Sun, 30 Oct 2022 13:08:37 +0200 Subject: habanalabs/gaudi: add razwi notify event Each time razwi (read-only zero, write ignore) happens, besides capturing its data, also notify the user about it. Signed-off-by: Dani Liberman Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/device.c | 8 +++++++ drivers/misc/habanalabs/common/habanalabs.h | 2 ++ drivers/misc/habanalabs/gaudi/gaudi.c | 37 +++++++++++++++-------------- include/uapi/misc/habanalabs.h | 2 ++ 4 files changed, 31 insertions(+), 18 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c index 35ed494fcfdf..d1a609589558 100644 --- a/drivers/misc/habanalabs/common/device.c +++ b/drivers/misc/habanalabs/common/device.c @@ -2409,6 +2409,14 @@ void hl_capture_razwi(struct hl_device *hdev, u64 addr, u16 *engine_id, u16 num_ num_of_engines * sizeof(u16)); hdev->captured_err_info.razwi.flags = flags; } + +void hl_handle_razwi(struct hl_device *hdev, u64 addr, u16 *engine_id, u16 num_of_engines, + u8 flags, u64 *event_mask) +{ + hl_capture_razwi(hdev, addr, engine_id, num_of_engines, flags); + *event_mask |= HL_NOTIFIER_EVENT_RAZWI; +} + static void hl_capture_user_mappings(struct hl_device *hdev, bool is_pmmu) { struct page_fault_info *pgf_info = &hdev->captured_err_info.pgf_info; diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h index e391e7951fb7..d9335f3769b8 100644 --- a/drivers/misc/habanalabs/common/habanalabs.h +++ b/drivers/misc/habanalabs/common/habanalabs.h @@ -3812,6 +3812,8 @@ hl_mmap_mem_buf_alloc(struct hl_mem_mgr *mmg, __printf(2, 3) void hl_engine_data_sprintf(struct engines_data *e, const char *fmt, ...); void hl_capture_razwi(struct hl_device *hdev, u64 addr, u16 *engine_id, u16 num_of_engines, u8 flags); +void hl_handle_razwi(struct hl_device *hdev, u64 addr, u16 *engine_id, u16 num_of_engines, + u8 flags, u64 *event_mask); void hl_capture_page_fault(struct hl_device *hdev, u64 addr, u16 eng_id, bool is_pmmu); #ifdef CONFIG_DEBUG_FS diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c index 3dfb9ecf7db3..035865cb097c 100644 --- a/drivers/misc/habanalabs/gaudi/gaudi.c +++ b/drivers/misc/habanalabs/gaudi/gaudi.c @@ -7301,7 +7301,7 @@ static void gaudi_handle_qman_err(struct hl_device *hdev, u16 event_type, u64 *e } static void gaudi_print_irq_info(struct hl_device *hdev, u16 event_type, - bool razwi) + bool razwi, u64 *event_mask) { bool is_read = false, is_write = false; u16 engine_id[2], num_of_razwi_eng = 0; @@ -7337,7 +7337,8 @@ static void gaudi_print_irq_info(struct hl_device *hdev, u16 event_type, num_of_razwi_eng = 1; } - hl_capture_razwi(hdev, razwi_addr, engine_id, num_of_razwi_eng, razwi_flags); + hl_handle_razwi(hdev, razwi_addr, engine_id, num_of_razwi_eng, razwi_flags, + event_mask); } } @@ -7675,7 +7676,7 @@ static void gaudi_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entr case GAUDI_EVENT_HBM_0_DERR ... GAUDI_EVENT_HBM_3_DERR: case GAUDI_EVENT_MMU_DERR: case GAUDI_EVENT_NIC0_CS_DBG_DERR ... GAUDI_EVENT_NIC4_CS_DBG_DERR: - gaudi_print_irq_info(hdev, event_type, true); + gaudi_print_irq_info(hdev, event_type, true, &event_mask); gaudi_handle_ecc_event(hdev, event_type, &eq_entry->ecc_data); event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR; fw_fatal_err_flag = HL_DRV_RESET_FW_FATAL_ERR; @@ -7685,7 +7686,7 @@ static void gaudi_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entr case GAUDI_EVENT_AXI_ECC: case GAUDI_EVENT_L2_RAM_ECC: case GAUDI_EVENT_PLL0 ... GAUDI_EVENT_PLL17: - gaudi_print_irq_info(hdev, event_type, false); + gaudi_print_irq_info(hdev, event_type, false, &event_mask); fw_fatal_err_flag = HL_DRV_RESET_FW_FATAL_ERR; event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR; goto reset_device; @@ -7694,7 +7695,7 @@ static void gaudi_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entr case GAUDI_EVENT_HBM1_SPI_0: case GAUDI_EVENT_HBM2_SPI_0: case GAUDI_EVENT_HBM3_SPI_0: - gaudi_print_irq_info(hdev, event_type, false); + gaudi_print_irq_info(hdev, event_type, false, &event_mask); gaudi_hbm_read_interrupts(hdev, gaudi_hbm_event_to_dev(event_type), &eq_entry->hbm_ecc_data); @@ -7706,7 +7707,7 @@ static void gaudi_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entr case GAUDI_EVENT_HBM1_SPI_1: case GAUDI_EVENT_HBM2_SPI_1: case GAUDI_EVENT_HBM3_SPI_1: - gaudi_print_irq_info(hdev, event_type, false); + gaudi_print_irq_info(hdev, event_type, false, &event_mask); gaudi_hbm_read_interrupts(hdev, gaudi_hbm_event_to_dev(event_type), &eq_entry->hbm_ecc_data); @@ -7728,7 +7729,7 @@ static void gaudi_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entr * if the event is a TPC Assertion or a "real" TPC DEC. */ event_mask |= HL_NOTIFIER_EVENT_TPC_ASSERT; - gaudi_print_irq_info(hdev, event_type, true); + gaudi_print_irq_info(hdev, event_type, true, &event_mask); reset_required = gaudi_tpc_read_interrupts(hdev, tpc_dec_event_to_tpc_id(event_type), "AXI_SLV_DEC_Error"); @@ -7753,7 +7754,7 @@ static void gaudi_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entr case GAUDI_EVENT_TPC5_KRN_ERR: case GAUDI_EVENT_TPC6_KRN_ERR: case GAUDI_EVENT_TPC7_KRN_ERR: - gaudi_print_irq_info(hdev, event_type, true); + gaudi_print_irq_info(hdev, event_type, true, &event_mask); reset_required = gaudi_tpc_read_interrupts(hdev, tpc_krn_event_to_tpc_id(event_type), "KRN_ERR"); @@ -7792,7 +7793,7 @@ static void gaudi_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entr case GAUDI_EVENT_HBM_0_SERR ... GAUDI_EVENT_HBM_3_SERR: fallthrough; case GAUDI_EVENT_MMU_SERR: - gaudi_print_irq_info(hdev, event_type, true); + gaudi_print_irq_info(hdev, event_type, true, &event_mask); gaudi_handle_ecc_event(hdev, event_type, &eq_entry->ecc_data); hl_fw_unmask_irq(hdev, event_type); event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR; @@ -7802,14 +7803,14 @@ static void gaudi_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entr case GAUDI_EVENT_CPU_AXI_SPLITTER: case GAUDI_EVENT_PSOC_AXI_DEC: case GAUDI_EVENT_PSOC_PRSTN_FALL: - gaudi_print_irq_info(hdev, event_type, true); + gaudi_print_irq_info(hdev, event_type, true, &event_mask); hl_fw_unmask_irq(hdev, event_type); event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR; break; case GAUDI_EVENT_MMU_PAGE_FAULT: case GAUDI_EVENT_MMU_WR_PERM: - gaudi_print_irq_info(hdev, event_type, true); + gaudi_print_irq_info(hdev, event_type, true, &event_mask); hl_fw_unmask_irq(hdev, event_type); event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR; break; @@ -7838,14 +7839,14 @@ static void gaudi_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entr case GAUDI_EVENT_NIC4_QM1: case GAUDI_EVENT_DMA0_CORE ... GAUDI_EVENT_DMA7_CORE: case GAUDI_EVENT_TPC0_QM ... GAUDI_EVENT_TPC7_QM: - gaudi_print_irq_info(hdev, event_type, true); + gaudi_print_irq_info(hdev, event_type, true, &event_mask); gaudi_handle_qman_err(hdev, event_type, &event_mask); hl_fw_unmask_irq(hdev, event_type); event_mask |= (HL_NOTIFIER_EVENT_USER_ENGINE_ERR | HL_NOTIFIER_EVENT_DEVICE_RESET); break; case GAUDI_EVENT_RAZWI_OR_ADC_SW: - gaudi_print_irq_info(hdev, event_type, true); + gaudi_print_irq_info(hdev, event_type, true, &event_mask); event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR; goto reset_device; @@ -7858,7 +7859,7 @@ static void gaudi_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entr case GAUDI_EVENT_TPC6_BMON_SPMU: case GAUDI_EVENT_TPC7_BMON_SPMU: case GAUDI_EVENT_DMA_BM_CH0 ... GAUDI_EVENT_DMA_BM_CH7: - gaudi_print_irq_info(hdev, event_type, false); + gaudi_print_irq_info(hdev, event_type, false, &event_mask); hl_fw_unmask_irq(hdev, event_type); event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR; break; @@ -7870,7 +7871,7 @@ static void gaudi_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entr break; case GAUDI_EVENT_DMA_IF_SEI_0 ... GAUDI_EVENT_DMA_IF_SEI_3: - gaudi_print_irq_info(hdev, event_type, false); + gaudi_print_irq_info(hdev, event_type, false, &event_mask); gaudi_print_sm_sei_info(hdev, event_type, &eq_entry->sm_sei_data); rc = hl_state_dump(hdev); @@ -7899,18 +7900,18 @@ static void gaudi_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entr break; case GAUDI_EVENT_DEV_RESET_REQ: - gaudi_print_irq_info(hdev, event_type, false); + gaudi_print_irq_info(hdev, event_type, false, &event_mask); event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR; goto reset_device; case GAUDI_EVENT_PKT_QUEUE_OUT_SYNC: - gaudi_print_irq_info(hdev, event_type, false); + gaudi_print_irq_info(hdev, event_type, false, &event_mask); gaudi_print_out_of_sync_info(hdev, &eq_entry->pkt_sync_err); event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR; goto reset_device; case GAUDI_EVENT_FW_ALIVE_S: - gaudi_print_irq_info(hdev, event_type, false); + gaudi_print_irq_info(hdev, event_type, false, &event_mask); gaudi_print_fw_alive_info(hdev, &eq_entry->fw_alive); event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR; goto reset_device; diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h index 58343998bd63..7747e19e81fe 100644 --- a/include/uapi/misc/habanalabs.h +++ b/include/uapi/misc/habanalabs.h @@ -721,6 +721,7 @@ enum hl_server_type { * HL_NOTIFIER_EVENT_DEVICE_UNAVAILABLE - Indicates device is unavailable * HL_NOTIFIER_EVENT_USER_ENGINE_ERR - Indicates device engine in error state * HL_NOTIFIER_EVENT_GENERAL_HW_ERR - Indicates device HW error + * HL_NOTIFIER_EVENT_RAZWI - Indicates razwi happened */ #define HL_NOTIFIER_EVENT_TPC_ASSERT (1ULL << 0) #define HL_NOTIFIER_EVENT_UNDEFINED_OPCODE (1ULL << 1) @@ -729,6 +730,7 @@ enum hl_server_type { #define HL_NOTIFIER_EVENT_DEVICE_UNAVAILABLE (1ULL << 4) #define HL_NOTIFIER_EVENT_USER_ENGINE_ERR (1ULL << 5) #define HL_NOTIFIER_EVENT_GENERAL_HW_ERR (1ULL << 6) +#define HL_NOTIFIER_EVENT_RAZWI (1ULL << 7) /* Opcode for management ioctl * -- cgit v1.2.3 From cd21701cde33123fc53c6401192219ba14832da3 Mon Sep 17 00:00:00 2001 From: Dani Liberman Date: Thu, 27 Oct 2022 20:38:26 +0300 Subject: habanalabs: use single threaded WQ for event handling Creating event queue workqueue using alloc_workqueue made it run in multi threaded mode, which caused parallel dumping of events as well as parallel events notifying to user, causing logs with multiple events to be out of order. Fixed by creating event queue workqueue as single threaded work queue. Signed-off-by: Dani Liberman Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/device.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c index d1a609589558..65bb40f81901 100644 --- a/drivers/misc/habanalabs/common/device.c +++ b/drivers/misc/habanalabs/common/device.c @@ -787,7 +787,7 @@ static int device_early_init(struct hl_device *hdev) } } - hdev->eq_wq = alloc_workqueue("hl-events", WQ_UNBOUND, 0); + hdev->eq_wq = create_singlethread_workqueue("hl-events"); if (hdev->eq_wq == NULL) { dev_err(hdev->dev, "Failed to allocate EQ workqueue\n"); rc = -ENOMEM; -- cgit v1.2.3 From aff6354afd1f9eae1e10658c157c26e316806f56 Mon Sep 17 00:00:00 2001 From: Dani Liberman Date: Mon, 31 Oct 2022 11:44:45 +0200 Subject: habanalabs/gaudi: add page fault notify event Each time page fault happens, besides capturing its data, also notify the user about it. Signed-off-by: Dani Liberman Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/device.c | 9 +++++++++ drivers/misc/habanalabs/common/habanalabs.h | 2 ++ drivers/misc/habanalabs/gaudi/gaudi.c | 6 +++--- include/uapi/misc/habanalabs.h | 2 ++ 4 files changed, 16 insertions(+), 3 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c index 65bb40f81901..31818121ef4d 100644 --- a/drivers/misc/habanalabs/common/device.c +++ b/drivers/misc/habanalabs/common/device.c @@ -2490,3 +2490,12 @@ void hl_capture_page_fault(struct hl_device *hdev, u64 addr, u16 eng_id, bool is hdev->captured_err_info.pgf_info.pgf.engine_id = eng_id; hl_capture_user_mappings(hdev, is_pmmu); } + +void hl_handle_page_fault(struct hl_device *hdev, u64 addr, u16 eng_id, bool is_pmmu, + u64 *event_mask) +{ + hl_capture_page_fault(hdev, addr, eng_id, is_pmmu); + + if (event_mask) + *event_mask |= HL_NOTIFIER_EVENT_PAGE_FAULT; +} diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h index d9335f3769b8..0781b8698f74 100644 --- a/drivers/misc/habanalabs/common/habanalabs.h +++ b/drivers/misc/habanalabs/common/habanalabs.h @@ -3815,6 +3815,8 @@ void hl_capture_razwi(struct hl_device *hdev, u64 addr, u16 *engine_id, u16 num_ void hl_handle_razwi(struct hl_device *hdev, u64 addr, u16 *engine_id, u16 num_of_engines, u8 flags, u64 *event_mask); void hl_capture_page_fault(struct hl_device *hdev, u64 addr, u16 eng_id, bool is_pmmu); +void hl_handle_page_fault(struct hl_device *hdev, u64 addr, u16 eng_id, bool is_pmmu, + u64 *event_mask); #ifdef CONFIG_DEBUG_FS diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c index 035865cb097c..cbe1daf5a793 100644 --- a/drivers/misc/habanalabs/gaudi/gaudi.c +++ b/drivers/misc/habanalabs/gaudi/gaudi.c @@ -6740,7 +6740,7 @@ static void gaudi_print_and_get_razwi_info(struct hl_device *hdev, u16 *engine_i } } -static void gaudi_print_and_get_mmu_error_info(struct hl_device *hdev, u64 *addr) +static void gaudi_print_and_get_mmu_error_info(struct hl_device *hdev, u64 *addr, u64 *event_mask) { struct gaudi_device *gaudi = hdev->asic_specific; u32 val; @@ -6755,7 +6755,7 @@ static void gaudi_print_and_get_mmu_error_info(struct hl_device *hdev, u64 *addr *addr |= RREG32(mmMMU_UP_PAGE_ERROR_CAPTURE_VA); dev_err_ratelimited(hdev->dev, "MMU page fault on va 0x%llx\n", *addr); - hl_capture_page_fault(hdev, *addr, 0, true); + hl_handle_page_fault(hdev, *addr, 0, true, event_mask); WREG32(mmMMU_UP_PAGE_ERROR_CAPTURE, 0); } @@ -7323,7 +7323,7 @@ static void gaudi_print_irq_info(struct hl_device *hdev, u16 event_type, if (razwi) { gaudi_print_and_get_razwi_info(hdev, &engine_id[0], &engine_id[1], &is_read, &is_write); - gaudi_print_and_get_mmu_error_info(hdev, &razwi_addr); + gaudi_print_and_get_mmu_error_info(hdev, &razwi_addr, event_mask); if (is_read) razwi_flags |= HL_RAZWI_READ; diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h index 7747e19e81fe..e50cb71df081 100644 --- a/include/uapi/misc/habanalabs.h +++ b/include/uapi/misc/habanalabs.h @@ -722,6 +722,7 @@ enum hl_server_type { * HL_NOTIFIER_EVENT_USER_ENGINE_ERR - Indicates device engine in error state * HL_NOTIFIER_EVENT_GENERAL_HW_ERR - Indicates device HW error * HL_NOTIFIER_EVENT_RAZWI - Indicates razwi happened + * HL_NOTIFIER_EVENT_PAGE_FAULT - Indicates page fault happened */ #define HL_NOTIFIER_EVENT_TPC_ASSERT (1ULL << 0) #define HL_NOTIFIER_EVENT_UNDEFINED_OPCODE (1ULL << 1) @@ -731,6 +732,7 @@ enum hl_server_type { #define HL_NOTIFIER_EVENT_USER_ENGINE_ERR (1ULL << 5) #define HL_NOTIFIER_EVENT_GENERAL_HW_ERR (1ULL << 6) #define HL_NOTIFIER_EVENT_RAZWI (1ULL << 7) +#define HL_NOTIFIER_EVENT_PAGE_FAULT (1ULL << 8) /* Opcode for management ioctl * -- cgit v1.2.3 From 91bd822448e57a55d12dc0461909b5c585485a6c Mon Sep 17 00:00:00 2001 From: Ofir Bitton Date: Sun, 30 Oct 2022 15:10:13 +0200 Subject: habanalabs/gaudi2: implement fp32 not supported event Due to binning, Gaudi2 does not always support fp32. We add support for such an event in case fp32 is used by the user in such a device. Signed-off-by: Ofir Bitton Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/gaudi2/gaudi2.c | 5 +++++ drivers/misc/habanalabs/include/gaudi2/gaudi2_async_events.h | 1 + .../misc/habanalabs/include/gaudi2/gaudi2_async_ids_map_extended.h | 4 +++- 3 files changed, 9 insertions(+), 1 deletion(-) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/gaudi2/gaudi2.c b/drivers/misc/habanalabs/gaudi2/gaudi2.c index f21b68be6d20..77bdbab41e6c 100644 --- a/drivers/misc/habanalabs/gaudi2/gaudi2.c +++ b/drivers/misc/habanalabs/gaudi2/gaudi2.c @@ -9148,6 +9148,11 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR; break; + case GAUDI2_EVENT_CPU_FP32_NOT_SUPPORTED: + event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR; + is_critical = true; + break; + default: if (gaudi2_irq_map_table[event_type].valid) dev_err_ratelimited(hdev->dev, "Cannot find handler for event %d\n", diff --git a/drivers/misc/habanalabs/include/gaudi2/gaudi2_async_events.h b/drivers/misc/habanalabs/include/gaudi2/gaudi2_async_events.h index 34406770a76a..305b576222e6 100644 --- a/drivers/misc/habanalabs/include/gaudi2/gaudi2_async_events.h +++ b/drivers/misc/habanalabs/include/gaudi2/gaudi2_async_events.h @@ -957,6 +957,7 @@ enum gaudi2_async_event_id { GAUDI2_EVENT_CPU11_STATUS_NIC11_ENG0 = 1317, GAUDI2_EVENT_CPU11_STATUS_NIC11_ENG1 = 1318, GAUDI2_EVENT_ARC_DCCM_FULL = 1319, + GAUDI2_EVENT_CPU_FP32_NOT_SUPPORTED = 1320, GAUDI2_EVENT_SIZE, }; diff --git a/drivers/misc/habanalabs/include/gaudi2/gaudi2_async_ids_map_extended.h b/drivers/misc/habanalabs/include/gaudi2/gaudi2_async_ids_map_extended.h index 5bd4383c9f2c..d510cb10c883 100644 --- a/drivers/misc/habanalabs/include/gaudi2/gaudi2_async_ids_map_extended.h +++ b/drivers/misc/habanalabs/include/gaudi2/gaudi2_async_ids_map_extended.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 * - * Copyright 2018-2021 HabanaLabs, Ltd. + * Copyright 2018-2022 HabanaLabs, Ltd. * All Rights Reserved. * */ @@ -2663,6 +2663,8 @@ static struct gaudi2_async_events_ids_map gaudi2_irq_map_table[] = { .msg = 1, .reset = 0, .name = "STATUS_NIC11_ENG1" }, { .fc_id = 1319, .cpu_id = 625, .valid = 1, .msg = 1, .reset = 0, .name = "ARC_DCCM_FULL" }, + { .fc_id = 1320, .cpu_id = 626, .valid = 1, + .msg = 1, .reset = 1, .name = "FP32_NOT_SUPPORTED" }, }; #endif /* __GAUDI2_ASYNC_IDS_MAP_EVENTS_EXT_H_ */ -- cgit v1.2.3 From 413bdb176eaa7d02c979a3c738738aea91fe6ed7 Mon Sep 17 00:00:00 2001 From: Dani Liberman Date: Sun, 30 Oct 2022 14:46:19 +0200 Subject: habanalabs/gaudi2: add razwi notify event Each time razwi (read-only zero, write ignored) event happens, besides capturing its data, also notify the user about it. Signed-off-by: Dani Liberman Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/device.c | 4 +- drivers/misc/habanalabs/gaudi2/gaudi2.c | 140 ++++++++++++++++++-------------- 2 files changed, 82 insertions(+), 62 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c index 31818121ef4d..708db0f48ee0 100644 --- a/drivers/misc/habanalabs/common/device.c +++ b/drivers/misc/habanalabs/common/device.c @@ -2414,7 +2414,9 @@ void hl_handle_razwi(struct hl_device *hdev, u64 addr, u16 *engine_id, u16 num_o u8 flags, u64 *event_mask) { hl_capture_razwi(hdev, addr, engine_id, num_of_engines, flags); - *event_mask |= HL_NOTIFIER_EVENT_RAZWI; + + if (event_mask) + *event_mask |= HL_NOTIFIER_EVENT_RAZWI; } static void hl_capture_user_mappings(struct hl_device *hdev, bool is_pmmu) diff --git a/drivers/misc/habanalabs/gaudi2/gaudi2.c b/drivers/misc/habanalabs/gaudi2/gaudi2.c index 77bdbab41e6c..59940c8df2d2 100644 --- a/drivers/misc/habanalabs/gaudi2/gaudi2.c +++ b/drivers/misc/habanalabs/gaudi2/gaudi2.c @@ -7063,7 +7063,7 @@ static void gaudi2_handle_qman_err_generic(struct hl_device *hdev, const char *q static void gaudi2_razwi_rr_hbw_shared_printf_info(struct hl_device *hdev, u64 rtr_mstr_if_base_addr, bool is_write, char *name, bool read_razwi_regs, struct hl_eq_razwi_info *razwi_info, - enum gaudi2_engine_id id) + enum gaudi2_engine_id id, u64 *event_mask) { u32 razwi_hi, razwi_lo, razwi_xy; u16 eng_id = id; @@ -7093,8 +7093,8 @@ static void gaudi2_razwi_rr_hbw_shared_printf_info(struct hl_device *hdev, rd_wr_flag = HL_RAZWI_READ; } - hl_capture_razwi(hdev, (u64)razwi_hi << 32 | razwi_lo, &eng_id, 1, - rd_wr_flag | HL_RAZWI_HBW); + hl_handle_razwi(hdev, (u64)razwi_hi << 32 | razwi_lo, &eng_id, 1, + rd_wr_flag | HL_RAZWI_HBW, event_mask); dev_err_ratelimited(hdev->dev, "%s-RAZWI SHARED RR HBW %s error, address %#llx, Initiator coordinates 0x%x\n", @@ -7104,7 +7104,7 @@ static void gaudi2_razwi_rr_hbw_shared_printf_info(struct hl_device *hdev, static void gaudi2_razwi_rr_lbw_shared_printf_info(struct hl_device *hdev, u64 rtr_mstr_if_base_addr, bool is_write, char *name, bool read_razwi_regs, struct hl_eq_razwi_info *razwi_info, - enum gaudi2_engine_id id) + enum gaudi2_engine_id id, u64 *event_mask) { u32 razwi_addr, razwi_xy; u16 eng_id = id; @@ -7132,7 +7132,7 @@ static void gaudi2_razwi_rr_lbw_shared_printf_info(struct hl_device *hdev, rd_wr_flag = HL_RAZWI_READ; } - hl_capture_razwi(hdev, razwi_addr, &eng_id, 1, rd_wr_flag | HL_RAZWI_LBW); + hl_handle_razwi(hdev, razwi_addr, &eng_id, 1, rd_wr_flag | HL_RAZWI_LBW, event_mask); dev_err_ratelimited(hdev->dev, "%s-RAZWI SHARED RR LBW %s error, mstr_if 0x%llx, captured address 0x%x Initiator coordinates 0x%x\n", name, is_write ? "WR" : "RD", rtr_mstr_if_base_addr, razwi_addr, @@ -7189,7 +7189,8 @@ static enum gaudi2_engine_id gaudi2_razwi_calc_engine_id(struct hl_device *hdev, */ static void gaudi2_ack_module_razwi_event_handler(struct hl_device *hdev, enum razwi_event_sources module, u8 module_idx, - u8 module_sub_idx, struct hl_eq_razwi_info *razwi_info) + u8 module_sub_idx, struct hl_eq_razwi_info *razwi_info, + u64 *event_mask) { bool via_sft = false, read_razwi_regs = false; u32 rtr_id, dcore_id, dcore_rtr_id, sft_id, eng_id; @@ -7330,7 +7331,7 @@ dump_info: if (hbw_shrd_aw) { gaudi2_razwi_rr_hbw_shared_printf_info(hdev, rtr_mstr_if_base_addr, true, initiator_name, read_razwi_regs, razwi_info, - eng_id); + eng_id, event_mask); /* Clear event indication */ if (read_razwi_regs) @@ -7340,7 +7341,7 @@ dump_info: if (hbw_shrd_ar) { gaudi2_razwi_rr_hbw_shared_printf_info(hdev, rtr_mstr_if_base_addr, false, initiator_name, read_razwi_regs, razwi_info, - eng_id); + eng_id, event_mask); /* Clear event indication */ if (read_razwi_regs) @@ -7350,7 +7351,7 @@ dump_info: if (lbw_shrd_aw) { gaudi2_razwi_rr_lbw_shared_printf_info(hdev, rtr_mstr_if_base_addr, true, initiator_name, read_razwi_regs, razwi_info, - eng_id); + eng_id, event_mask); /* Clear event indication */ if (read_razwi_regs) @@ -7360,7 +7361,7 @@ dump_info: if (lbw_shrd_ar) { gaudi2_razwi_rr_lbw_shared_printf_info(hdev, rtr_mstr_if_base_addr, false, initiator_name, read_razwi_regs, razwi_info, - eng_id); + eng_id, event_mask); /* Clear event indication */ if (read_razwi_regs) @@ -7376,38 +7377,42 @@ static void gaudi2_check_if_razwi_happened(struct hl_device *hdev) /* check all TPCs */ for (mod_idx = 0 ; mod_idx < (NUM_OF_TPC_PER_DCORE * NUM_OF_DCORES + 1) ; mod_idx++) { if (prop->tpc_enabled_mask & BIT(mod_idx)) - gaudi2_ack_module_razwi_event_handler(hdev, RAZWI_TPC, mod_idx, 0, NULL); + gaudi2_ack_module_razwi_event_handler(hdev, RAZWI_TPC, mod_idx, 0, NULL, + NULL); } /* check all MMEs */ for (mod_idx = 0 ; mod_idx < (NUM_OF_MME_PER_DCORE * NUM_OF_DCORES) ; mod_idx++) for (sub_mod = MME_WAP0 ; sub_mod < MME_INITIATORS_MAX ; sub_mod++) gaudi2_ack_module_razwi_event_handler(hdev, RAZWI_MME, mod_idx, - sub_mod, NULL); + sub_mod, NULL, NULL); /* check all EDMAs */ for (mod_idx = 0 ; mod_idx < (NUM_OF_EDMA_PER_DCORE * NUM_OF_DCORES) ; mod_idx++) if (prop->edma_enabled_mask & BIT(mod_idx)) - gaudi2_ack_module_razwi_event_handler(hdev, RAZWI_EDMA, mod_idx, 0, NULL); + gaudi2_ack_module_razwi_event_handler(hdev, RAZWI_EDMA, mod_idx, 0, NULL, + NULL); /* check all PDMAs */ for (mod_idx = 0 ; mod_idx < NUM_OF_PDMA ; mod_idx++) - gaudi2_ack_module_razwi_event_handler(hdev, RAZWI_PDMA, mod_idx, 0, NULL); + gaudi2_ack_module_razwi_event_handler(hdev, RAZWI_PDMA, mod_idx, 0, NULL, + NULL); /* check all NICs */ for (mod_idx = 0 ; mod_idx < NIC_NUMBER_OF_PORTS ; mod_idx++) if (hdev->nic_ports_mask & BIT(mod_idx)) gaudi2_ack_module_razwi_event_handler(hdev, RAZWI_NIC, mod_idx >> 1, 0, - NULL); + NULL, NULL); /* check all DECs */ for (mod_idx = 0 ; mod_idx < NUMBER_OF_DEC ; mod_idx++) if (prop->decoder_enabled_mask & BIT(mod_idx)) - gaudi2_ack_module_razwi_event_handler(hdev, RAZWI_DEC, mod_idx, 0, NULL); + gaudi2_ack_module_razwi_event_handler(hdev, RAZWI_DEC, mod_idx, 0, NULL, + NULL); /* check all ROTs */ for (mod_idx = 0 ; mod_idx < NUM_OF_ROT ; mod_idx++) - gaudi2_ack_module_razwi_event_handler(hdev, RAZWI_ROT, mod_idx, 0, NULL); + gaudi2_ack_module_razwi_event_handler(hdev, RAZWI_ROT, mod_idx, 0, NULL, NULL); } static const char *gaudi2_get_initiators_name(u32 rtr_id) @@ -7625,7 +7630,8 @@ static u16 gaudi2_get_razwi_initiators(u32 rtr_id, u16 *engines) } static void gaudi2_razwi_unmapped_addr_hbw_printf_info(struct hl_device *hdev, u32 rtr_id, - u64 rtr_ctrl_base_addr, bool is_write) + u64 rtr_ctrl_base_addr, bool is_write, + u64 *event_mask) { u16 engines[HL_RAZWI_MAX_NUM_OF_ENGINES_PER_RTR], num_of_eng; u32 razwi_hi, razwi_lo; @@ -7649,8 +7655,8 @@ static void gaudi2_razwi_unmapped_addr_hbw_printf_info(struct hl_device *hdev, u WREG32(rtr_ctrl_base_addr + DEC_RAZWI_HBW_AR_SET, 0x1); } - hl_capture_razwi(hdev, (u64)razwi_hi << 32 | razwi_lo, &engines[0], num_of_eng, - rd_wr_flag | HL_RAZWI_HBW); + hl_handle_razwi(hdev, (u64)razwi_hi << 32 | razwi_lo, &engines[0], num_of_eng, + rd_wr_flag | HL_RAZWI_HBW, event_mask); dev_err_ratelimited(hdev->dev, "RAZWI PSOC unmapped HBW %s error, rtr id %u, address %#llx\n", is_write ? "WR" : "RD", rtr_id, (u64)razwi_hi << 32 | razwi_lo); @@ -7660,7 +7666,8 @@ static void gaudi2_razwi_unmapped_addr_hbw_printf_info(struct hl_device *hdev, u } static void gaudi2_razwi_unmapped_addr_lbw_printf_info(struct hl_device *hdev, u32 rtr_id, - u64 rtr_ctrl_base_addr, bool is_write) + u64 rtr_ctrl_base_addr, bool is_write, + u64 *event_mask) { u16 engines[HL_RAZWI_MAX_NUM_OF_ENGINES_PER_RTR], num_of_eng; u32 razwi_addr; @@ -7682,7 +7689,8 @@ static void gaudi2_razwi_unmapped_addr_lbw_printf_info(struct hl_device *hdev, u WREG32(rtr_ctrl_base_addr + DEC_RAZWI_LBW_AR_SET, 0x1); } - hl_capture_razwi(hdev, razwi_addr, &engines[0], num_of_eng, rd_wr_flag | HL_RAZWI_LBW); + hl_handle_razwi(hdev, razwi_addr, &engines[0], num_of_eng, rd_wr_flag | HL_RAZWI_LBW, + event_mask); dev_err_ratelimited(hdev->dev, "RAZWI PSOC unmapped LBW %s error, rtr id %u, address %#x\n", is_write ? "WR" : "RD", rtr_id, razwi_addr); @@ -7692,7 +7700,7 @@ static void gaudi2_razwi_unmapped_addr_lbw_printf_info(struct hl_device *hdev, u } /* PSOC RAZWI interrupt occurs only when trying to access a bad address */ -static void gaudi2_ack_psoc_razwi_event_handler(struct hl_device *hdev) +static void gaudi2_ack_psoc_razwi_event_handler(struct hl_device *hdev, u64 *event_mask) { u32 hbw_aw_set, hbw_ar_set, lbw_aw_set, lbw_ar_set, rtr_id, dcore_id, dcore_rtr_id, xy, razwi_mask_info, razwi_intr = 0; @@ -7746,19 +7754,19 @@ static void gaudi2_ack_psoc_razwi_event_handler(struct hl_device *hdev) if (hbw_aw_set) gaudi2_razwi_unmapped_addr_hbw_printf_info(hdev, rtr_id, - rtr_ctrl_base_addr, true); + rtr_ctrl_base_addr, true, event_mask); if (hbw_ar_set) gaudi2_razwi_unmapped_addr_hbw_printf_info(hdev, rtr_id, - rtr_ctrl_base_addr, false); + rtr_ctrl_base_addr, false, event_mask); if (lbw_aw_set) gaudi2_razwi_unmapped_addr_lbw_printf_info(hdev, rtr_id, - rtr_ctrl_base_addr, true); + rtr_ctrl_base_addr, true, event_mask); if (lbw_ar_set) gaudi2_razwi_unmapped_addr_lbw_printf_info(hdev, rtr_id, - rtr_ctrl_base_addr, false); + rtr_ctrl_base_addr, false, event_mask); clear: /* Clear Interrupts only on pldm or if f/w doesn't handle interrupts */ @@ -7784,7 +7792,7 @@ static void _gaudi2_handle_qm_sei_err(struct hl_device *hdev, u64 qman_base) } static void gaudi2_handle_qm_sei_err(struct hl_device *hdev, u16 event_type, - struct hl_eq_razwi_info *razwi_info) + struct hl_eq_razwi_info *razwi_info, u64 *event_mask) { enum razwi_event_sources module; u64 qman_base; @@ -7837,7 +7845,7 @@ static void gaudi2_handle_qm_sei_err(struct hl_device *hdev, u16 event_type, /* check if RAZWI happened */ if (razwi_info) - gaudi2_ack_module_razwi_event_handler(hdev, module, 0, 0, razwi_info); + gaudi2_ack_module_razwi_event_handler(hdev, module, 0, 0, razwi_info, event_mask); } static void gaudi2_handle_qman_err(struct hl_device *hdev, u16 event_type) @@ -8003,7 +8011,8 @@ static void gaudi2_handle_cpu_sei_err(struct hl_device *hdev) } static void gaudi2_handle_rot_err(struct hl_device *hdev, u8 rot_index, - struct hl_eq_razwi_with_intr_cause *razwi_with_intr_cause) + struct hl_eq_razwi_with_intr_cause *razwi_with_intr_cause, + u64 *event_mask) { u64 intr_cause_data = le64_to_cpu(razwi_with_intr_cause->intr_cause.intr_cause_data); int i; @@ -8015,11 +8024,12 @@ static void gaudi2_handle_rot_err(struct hl_device *hdev, u8 rot_index, /* check if RAZWI happened */ gaudi2_ack_module_razwi_event_handler(hdev, RAZWI_ROT, rot_index, 0, - &razwi_with_intr_cause->razwi_info); + &razwi_with_intr_cause->razwi_info, event_mask); } static void gaudi2_tpc_ack_interrupts(struct hl_device *hdev, u8 tpc_index, char *interrupt_name, - struct hl_eq_razwi_with_intr_cause *razwi_with_intr_cause) + struct hl_eq_razwi_with_intr_cause *razwi_with_intr_cause, + u64 *event_mask) { u64 intr_cause_data = le64_to_cpu(razwi_with_intr_cause->intr_cause.intr_cause_data); int i; @@ -8031,11 +8041,11 @@ static void gaudi2_tpc_ack_interrupts(struct hl_device *hdev, u8 tpc_index, char /* check if RAZWI happened */ gaudi2_ack_module_razwi_event_handler(hdev, RAZWI_TPC, tpc_index, 0, - &razwi_with_intr_cause->razwi_info); + &razwi_with_intr_cause->razwi_info, event_mask); } static void gaudi2_handle_dec_err(struct hl_device *hdev, u8 dec_index, const char *interrupt_name, - struct hl_eq_razwi_info *razwi_info) + struct hl_eq_razwi_info *razwi_info, u64 *event_mask) { u32 sts_addr, sts_val, sts_clr_val = 0; int i; @@ -8061,14 +8071,15 @@ static void gaudi2_handle_dec_err(struct hl_device *hdev, u8 dec_index, const ch } /* check if RAZWI happened */ - gaudi2_ack_module_razwi_event_handler(hdev, RAZWI_DEC, dec_index, 0, razwi_info); + gaudi2_ack_module_razwi_event_handler(hdev, RAZWI_DEC, dec_index, 0, razwi_info, + event_mask); /* Write 1 clear errors */ WREG32(sts_addr, sts_clr_val); } static void gaudi2_handle_mme_err(struct hl_device *hdev, u8 mme_index, const char *interrupt_name, - struct hl_eq_razwi_info *razwi_info) + struct hl_eq_razwi_info *razwi_info, u64 *event_mask) { u32 sts_addr, sts_val, sts_clr_addr, sts_clr_val = 0; int i; @@ -8088,7 +8099,8 @@ static void gaudi2_handle_mme_err(struct hl_device *hdev, u8 mme_index, const ch /* check if RAZWI happened */ for (i = MME_WRITE ; i < MME_INITIATORS_MAX ; i++) - gaudi2_ack_module_razwi_event_handler(hdev, RAZWI_MME, mme_index, i, razwi_info); + gaudi2_ack_module_razwi_event_handler(hdev, RAZWI_MME, mme_index, i, razwi_info, + event_mask); WREG32(sts_clr_addr, sts_clr_val); } @@ -8105,7 +8117,7 @@ static void gaudi2_handle_mme_sbte_err(struct hl_device *hdev, u8 mme_index, u8 } static void gaudi2_handle_mme_wap_err(struct hl_device *hdev, u8 mme_index, - struct hl_eq_razwi_info *razwi_info) + struct hl_eq_razwi_info *razwi_info, u64 *event_mask) { u32 sts_addr, sts_val, sts_clr_addr, sts_clr_val = 0; int i; @@ -8125,8 +8137,10 @@ static void gaudi2_handle_mme_wap_err(struct hl_device *hdev, u8 mme_index, } /* check if RAZWI happened on WAP0/1 */ - gaudi2_ack_module_razwi_event_handler(hdev, RAZWI_MME, mme_index, MME_WAP0, razwi_info); - gaudi2_ack_module_razwi_event_handler(hdev, RAZWI_MME, mme_index, MME_WAP1, razwi_info); + gaudi2_ack_module_razwi_event_handler(hdev, RAZWI_MME, mme_index, MME_WAP0, razwi_info, + event_mask); + gaudi2_ack_module_razwi_event_handler(hdev, RAZWI_MME, mme_index, MME_WAP1, razwi_info, + event_mask); WREG32(sts_clr_addr, sts_clr_val); } @@ -8156,40 +8170,41 @@ static void gaudi2_handle_dma_core_event(struct hl_device *hdev, u64 intr_cause_ gaudi2_dma_core_interrupts_cause[i]); } -static void gaudi2_print_pcie_mstr_rr_mstr_if_razwi_info(struct hl_device *hdev) +static void gaudi2_print_pcie_mstr_rr_mstr_if_razwi_info(struct hl_device *hdev, u64 *event_mask) { u32 mstr_if_base_addr = mmPCIE_MSTR_RR_MSTR_IF_RR_SHRD_HBW_BASE, razwi_happened_addr; razwi_happened_addr = mstr_if_base_addr + RR_SHRD_HBW_AW_RAZWI_HAPPENED; if (RREG32(razwi_happened_addr)) { gaudi2_razwi_rr_hbw_shared_printf_info(hdev, mstr_if_base_addr, true, "PCIE", true, - NULL, GAUDI2_ENGINE_ID_PCIE); + NULL, GAUDI2_ENGINE_ID_PCIE, event_mask); WREG32(razwi_happened_addr, 0x1); } razwi_happened_addr = mstr_if_base_addr + RR_SHRD_HBW_AR_RAZWI_HAPPENED; if (RREG32(razwi_happened_addr)) { gaudi2_razwi_rr_hbw_shared_printf_info(hdev, mstr_if_base_addr, false, "PCIE", true, - NULL, GAUDI2_ENGINE_ID_PCIE); + NULL, GAUDI2_ENGINE_ID_PCIE, event_mask); WREG32(razwi_happened_addr, 0x1); } razwi_happened_addr = mstr_if_base_addr + RR_SHRD_LBW_AW_RAZWI_HAPPENED; if (RREG32(razwi_happened_addr)) { gaudi2_razwi_rr_lbw_shared_printf_info(hdev, mstr_if_base_addr, true, "PCIE", true, - NULL, GAUDI2_ENGINE_ID_PCIE); + NULL, GAUDI2_ENGINE_ID_PCIE, event_mask); WREG32(razwi_happened_addr, 0x1); } razwi_happened_addr = mstr_if_base_addr + RR_SHRD_LBW_AR_RAZWI_HAPPENED; if (RREG32(razwi_happened_addr)) { gaudi2_razwi_rr_lbw_shared_printf_info(hdev, mstr_if_base_addr, false, "PCIE", true, - NULL, GAUDI2_ENGINE_ID_PCIE); + NULL, GAUDI2_ENGINE_ID_PCIE, event_mask); WREG32(razwi_happened_addr, 0x1); } } -static void gaudi2_print_pcie_addr_dec_info(struct hl_device *hdev, u64 intr_cause_data) +static void gaudi2_print_pcie_addr_dec_info(struct hl_device *hdev, u64 intr_cause_data, + u64 *event_mask) { int i; @@ -8204,7 +8219,7 @@ static void gaudi2_print_pcie_addr_dec_info(struct hl_device *hdev, u64 intr_cau case PCIE_WRAP_PCIE_IC_SEI_INTR_IND_AXI_LBW_ERR_INTR_MASK: break; case PCIE_WRAP_PCIE_IC_SEI_INTR_IND_BAD_ACCESS_INTR_MASK: - gaudi2_print_pcie_mstr_rr_mstr_if_razwi_info(hdev); + gaudi2_print_pcie_mstr_rr_mstr_if_razwi_info(hdev, event_mask); break; } } @@ -8818,29 +8833,30 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent case GAUDI2_EVENT_PDMA_CH0_AXI_ERR_RSP: case GAUDI2_EVENT_PDMA_CH1_AXI_ERR_RSP: reset_flags |= HL_DRV_RESET_FW_FATAL_ERR; - gaudi2_handle_qm_sei_err(hdev, event_type, &eq_entry->razwi_info); + gaudi2_handle_qm_sei_err(hdev, event_type, &eq_entry->razwi_info, &event_mask); event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR; break; case GAUDI2_EVENT_ROTATOR0_AXI_ERROR_RESPONSE: case GAUDI2_EVENT_ROTATOR1_AXI_ERROR_RESPONSE: index = event_type - GAUDI2_EVENT_ROTATOR0_AXI_ERROR_RESPONSE; - gaudi2_handle_rot_err(hdev, index, &eq_entry->razwi_with_intr_cause); - gaudi2_handle_qm_sei_err(hdev, event_type, NULL); + gaudi2_handle_rot_err(hdev, index, &eq_entry->razwi_with_intr_cause, &event_mask); + gaudi2_handle_qm_sei_err(hdev, event_type, NULL, &event_mask); event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR; break; case GAUDI2_EVENT_TPC0_AXI_ERR_RSP ... GAUDI2_EVENT_TPC24_AXI_ERR_RSP: index = event_type - GAUDI2_EVENT_TPC0_AXI_ERR_RSP; gaudi2_tpc_ack_interrupts(hdev, index, "AXI_ERR_RSP", - &eq_entry->razwi_with_intr_cause); - gaudi2_handle_qm_sei_err(hdev, event_type, NULL); + &eq_entry->razwi_with_intr_cause, &event_mask); + gaudi2_handle_qm_sei_err(hdev, event_type, NULL, &event_mask); event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR; break; case GAUDI2_EVENT_DEC0_AXI_ERR_RSPONSE ... GAUDI2_EVENT_DEC9_AXI_ERR_RSPONSE: index = event_type - GAUDI2_EVENT_DEC0_AXI_ERR_RSPONSE; - gaudi2_handle_dec_err(hdev, index, "AXI_ERR_RESPONSE", &eq_entry->razwi_info); + gaudi2_handle_dec_err(hdev, index, "AXI_ERR_RESPONSE", &eq_entry->razwi_info, + &event_mask); event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR; break; @@ -8871,7 +8887,8 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent case GAUDI2_EVENT_TPC24_KERNEL_ERR: index = (event_type - GAUDI2_EVENT_TPC0_KERNEL_ERR) / (GAUDI2_EVENT_TPC1_KERNEL_ERR - GAUDI2_EVENT_TPC0_KERNEL_ERR); - gaudi2_tpc_ack_interrupts(hdev, index, "KRN_ERR", &eq_entry->razwi_with_intr_cause); + gaudi2_tpc_ack_interrupts(hdev, index, "KRN_ERR", &eq_entry->razwi_with_intr_cause, + &event_mask); event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR; break; @@ -8887,7 +8904,7 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent case GAUDI2_EVENT_DEC9_SPI: index = (event_type - GAUDI2_EVENT_DEC0_SPI) / (GAUDI2_EVENT_DEC1_SPI - GAUDI2_EVENT_DEC0_SPI); - gaudi2_handle_dec_err(hdev, index, "SPI", &eq_entry->razwi_info); + gaudi2_handle_dec_err(hdev, index, "SPI", &eq_entry->razwi_info, &event_mask); event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR; break; @@ -8899,8 +8916,8 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent (GAUDI2_EVENT_MME1_CTRL_AXI_ERROR_RESPONSE - GAUDI2_EVENT_MME0_CTRL_AXI_ERROR_RESPONSE); gaudi2_handle_mme_err(hdev, index, - "CTRL_AXI_ERROR_RESPONSE", &eq_entry->razwi_info); - gaudi2_handle_qm_sei_err(hdev, event_type, NULL); + "CTRL_AXI_ERROR_RESPONSE", &eq_entry->razwi_info, &event_mask); + gaudi2_handle_qm_sei_err(hdev, event_type, NULL, &event_mask); event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR; break; @@ -8911,7 +8928,8 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent index = (event_type - GAUDI2_EVENT_MME0_QMAN_SW_ERROR) / (GAUDI2_EVENT_MME1_QMAN_SW_ERROR - GAUDI2_EVENT_MME0_QMAN_SW_ERROR); - gaudi2_handle_mme_err(hdev, index, "QMAN_SW_ERROR", &eq_entry->razwi_info); + gaudi2_handle_mme_err(hdev, index, "QMAN_SW_ERROR", &eq_entry->razwi_info, + &event_mask); event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR; break; @@ -8922,7 +8940,7 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent index = (event_type - GAUDI2_EVENT_MME0_WAP_SOURCE_RESULT_INVALID) / (GAUDI2_EVENT_MME1_WAP_SOURCE_RESULT_INVALID - GAUDI2_EVENT_MME0_WAP_SOURCE_RESULT_INVALID); - gaudi2_handle_mme_wap_err(hdev, index, &eq_entry->razwi_info); + gaudi2_handle_mme_wap_err(hdev, index, &eq_entry->razwi_info, &event_mask); event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR; break; @@ -8941,7 +8959,7 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent case GAUDI2_EVENT_PCIE_ADDR_DEC_ERR: gaudi2_print_pcie_addr_dec_info(hdev, - le64_to_cpu(eq_entry->intr_cause.intr_cause_data)); + le64_to_cpu(eq_entry->intr_cause.intr_cause_data), &event_mask); reset_flags |= HL_DRV_RESET_FW_FATAL_ERR; event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR; break; @@ -8970,7 +8988,7 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent break; case GAUDI2_EVENT_PSOC63_RAZWI_OR_PID_MIN_MAX_INTERRUPT: - gaudi2_ack_psoc_razwi_event_handler(hdev); + gaudi2_ack_psoc_razwi_event_handler(hdev, &event_mask); event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR; break; -- cgit v1.2.3 From 3daa64eea1fb219c8cfb3bb6948dc2993652e201 Mon Sep 17 00:00:00 2001 From: farah kassabri Date: Thu, 22 Sep 2022 14:24:35 +0300 Subject: habanalabs: fix firmware descriptor copy operation This is needed to allow adding more data to the lkd_fw_comms_desc structure. Signed-off-by: farah kassabri Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/firmware_if.c | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/common/firmware_if.c b/drivers/misc/habanalabs/common/firmware_if.c index f18e53bbba6b..01c4ffba6e97 100644 --- a/drivers/misc/habanalabs/common/firmware_if.c +++ b/drivers/misc/habanalabs/common/firmware_if.c @@ -12,6 +12,7 @@ #include #include #include +#include #define FW_FILE_MAX_SIZE 0x1400000 /* maximum size of 20MB */ @@ -1988,10 +1989,11 @@ static int hl_fw_dynamic_read_and_validate_descriptor(struct hl_device *hdev, struct fw_load_mgr *fw_loader) { struct lkd_fw_comms_desc *fw_desc; + void __iomem *src, *temp_fw_desc; struct pci_mem_region *region; struct fw_response *response; + u16 fw_data_size; enum pci_region region_id; - void __iomem *src; int rc; fw_desc = &fw_loader->dynamic_loader.comm_desc; @@ -2018,9 +2020,29 @@ static int hl_fw_dynamic_read_and_validate_descriptor(struct hl_device *hdev, fw_loader->dynamic_loader.fw_desc_valid = false; src = hdev->pcie_bar[region->bar_id] + region->offset_in_bar + response->ram_offset; + + /* + * We do the copy of the fw descriptor in 2 phases: + * 1. copy the header + data info according to our lkd_fw_comms_desc definition. + * then we're able to read the actual data size provided by fw. + * this is needed for cases where data in descriptor was changed(add/remove) + * in embedded specs header file before updating lkd copy of the header file + * 2. copy descriptor to temporary buffer with aligned size and send it to validation + */ memcpy_fromio(fw_desc, src, sizeof(struct lkd_fw_comms_desc)); + fw_data_size = le16_to_cpu(fw_desc->header.size); + + temp_fw_desc = vzalloc(sizeof(struct comms_desc_header) + fw_data_size); + if (!temp_fw_desc) + return -ENOMEM; + + memcpy_fromio(temp_fw_desc, src, sizeof(struct comms_desc_header) + fw_data_size); - return hl_fw_dynamic_validate_descriptor(hdev, fw_loader, fw_desc); + rc = hl_fw_dynamic_validate_descriptor(hdev, fw_loader, + (struct lkd_fw_comms_desc *) temp_fw_desc); + vfree(temp_fw_desc); + + return rc; } /** -- cgit v1.2.3 From b829e01025f8936bb85bdc39cbd1faddcca290d0 Mon Sep 17 00:00:00 2001 From: Ohad Sharabi Date: Sun, 6 Nov 2022 09:26:01 +0200 Subject: habanalabs: skip events info ioctl if not supported Some ASICs haven't yet implemented this functionality and so the ioctl call should fail and the user should be notified of the reason. Signed-off-by: Ohad Sharabi Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/habanalabs_ioctl.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/common/habanalabs_ioctl.c b/drivers/misc/habanalabs/common/habanalabs_ioctl.c index ee43017eb563..b6abfa7761a7 100644 --- a/drivers/misc/habanalabs/common/habanalabs_ioctl.c +++ b/drivers/misc/habanalabs/common/habanalabs_ioctl.c @@ -123,6 +123,10 @@ static int hw_events_info(struct hl_device *hdev, bool aggregate, return -EINVAL; arr = hdev->asic_funcs->get_events_stat(hdev, aggregate, &size); + if (!arr) { + dev_err(hdev->dev, "Events info not supported\n"); + return -EOPNOTSUPP; + } return copy_to_user(out, arr, min(max_size, size)) ? -EFAULT : 0; } -- cgit v1.2.3 From a63de89bee7ff01dc184fbe289eade5b5ab5f49a Mon Sep 17 00:00:00 2001 From: Ofir Bitton Date: Sun, 6 Nov 2022 12:07:03 +0200 Subject: habanalabs/gaudi2: classify power/thermal events as info As power and thermal envelope events are pure informative and not indicating an error, we reduce the print level to info only. Signed-off-by: Ofir Bitton Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/gaudi2/gaudi2.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/gaudi2/gaudi2.c b/drivers/misc/habanalabs/gaudi2/gaudi2.c index 59940c8df2d2..61960fa059e0 100644 --- a/drivers/misc/habanalabs/gaudi2/gaudi2.c +++ b/drivers/misc/habanalabs/gaudi2/gaudi2.c @@ -6828,6 +6828,7 @@ static inline bool is_info_event(u32 event) { switch (event) { case GAUDI2_EVENT_CPU_CPLD_SHUTDOWN_CAUSE: + case GAUDI2_EVENT_CPU_FIX_POWER_ENV_S ... GAUDI2_EVENT_CPU_FIX_THERMAL_ENV_E: return true; default: return false; -- cgit v1.2.3 From d3027f4a625063c18becd6953b4a2a273033b071 Mon Sep 17 00:00:00 2001 From: Dani Liberman Date: Mon, 31 Oct 2022 23:04:14 +0200 Subject: habanalabs/gaudi2: add page fault notify event Each time page fault happens, besides capturing its data, also notify the user about it. Signed-off-by: Dani Liberman Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/gaudi2/gaudi2.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/gaudi2/gaudi2.c b/drivers/misc/habanalabs/gaudi2/gaudi2.c index 61960fa059e0..65c9b535aa69 100644 --- a/drivers/misc/habanalabs/gaudi2/gaudi2.c +++ b/drivers/misc/habanalabs/gaudi2/gaudi2.c @@ -8253,7 +8253,8 @@ static void gaudi2_handle_hif_fatal(struct hl_device *hdev, u16 event_type, u64 } } -static void gaudi2_handle_page_error(struct hl_device *hdev, u64 mmu_base, bool is_pmmu) +static void gaudi2_handle_page_error(struct hl_device *hdev, u64 mmu_base, bool is_pmmu, + u64 *event_mask) { u32 valid, val; u64 addr; @@ -8270,7 +8271,7 @@ static void gaudi2_handle_page_error(struct hl_device *hdev, u64 mmu_base, bool dev_err_ratelimited(hdev->dev, "%s page fault on va 0x%llx\n", is_pmmu ? "PMMU" : "HMMU", addr); - hl_capture_page_fault(hdev, addr, 0, is_pmmu); + hl_handle_page_fault(hdev, addr, 0, is_pmmu, event_mask); WREG32(mmu_base + MMU_OFFSET(mmDCORE0_HMMU0_MMU_PAGE_ERROR_CAPTURE), 0); } @@ -8296,7 +8297,7 @@ static void gaudi2_handle_access_error(struct hl_device *hdev, u64 mmu_base, boo } static void gaudi2_handle_mmu_spi_sei_generic(struct hl_device *hdev, const char *mmu_name, - u64 mmu_base, bool is_pmmu) + u64 mmu_base, bool is_pmmu, u64 *event_mask) { u32 spi_sei_cause, interrupt_clr = 0x0; int i; @@ -8309,7 +8310,7 @@ static void gaudi2_handle_mmu_spi_sei_generic(struct hl_device *hdev, const char mmu_name, gaudi2_mmu_spi_sei[i].cause); if (i == 0) - gaudi2_handle_page_error(hdev, mmu_base, is_pmmu); + gaudi2_handle_page_error(hdev, mmu_base, is_pmmu, event_mask); else if (i == 1) gaudi2_handle_access_error(hdev, mmu_base, is_pmmu); @@ -8381,7 +8382,7 @@ static bool gaudi2_handle_sm_err(struct hl_device *hdev, u8 sm_index) return reset; } -static void gaudi2_handle_mmu_spi_sei_err(struct hl_device *hdev, u16 event_type) +static void gaudi2_handle_mmu_spi_sei_err(struct hl_device *hdev, u16 event_type, u64 *event_mask) { bool is_pmmu = false; char desc[32]; @@ -8439,7 +8440,7 @@ static void gaudi2_handle_mmu_spi_sei_err(struct hl_device *hdev, u16 event_type return; } - gaudi2_handle_mmu_spi_sei_generic(hdev, desc, mmu_base, is_pmmu); + gaudi2_handle_mmu_spi_sei_generic(hdev, desc, mmu_base, is_pmmu, event_mask); } @@ -8969,7 +8970,7 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent case GAUDI2_EVENT_HMMU_0_AXI_ERR_RSP ... GAUDI2_EVENT_HMMU_12_AXI_ERR_RSP: case GAUDI2_EVENT_PMMU0_PAGE_FAULT_WR_PERM ... GAUDI2_EVENT_PMMU0_SECURITY_ERROR: case GAUDI2_EVENT_PMMU_AXI_ERR_RSP_0: - gaudi2_handle_mmu_spi_sei_err(hdev, event_type); + gaudi2_handle_mmu_spi_sei_err(hdev, event_type, &event_mask); reset_flags |= HL_DRV_RESET_FW_FATAL_ERR; event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR; break; @@ -10206,7 +10207,7 @@ static void gaudi2_ack_mmu_error(struct hl_device *hdev, u64 mmu_id) if (gaudi2_get_mmu_base(hdev, mmu_id, &mmu_base)) return; - gaudi2_handle_page_error(hdev, mmu_base, is_pmmu); + gaudi2_handle_page_error(hdev, mmu_base, is_pmmu, NULL); gaudi2_handle_access_error(hdev, mmu_base, is_pmmu); } -- cgit v1.2.3 From 5f8981d699ed33017ff2212ec17f6cde89212756 Mon Sep 17 00:00:00 2001 From: Tomer Tayar Date: Mon, 7 Nov 2022 16:20:03 +0200 Subject: habanalabs: fix print for out-of-sync and pkt-failure events Add missing le32_to_cpu() conversions, and use %d for the value returned from atomic_read(). Signed-off-by: Tomer Tayar Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/gaudi/gaudi.c | 4 ++-- drivers/misc/habanalabs/gaudi2/gaudi2.c | 8 ++++---- drivers/misc/habanalabs/goya/goya.c | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c index cbe1daf5a793..7b93f0d26dd0 100644 --- a/drivers/misc/habanalabs/gaudi/gaudi.c +++ b/drivers/misc/habanalabs/gaudi/gaudi.c @@ -7347,8 +7347,8 @@ static void gaudi_print_out_of_sync_info(struct hl_device *hdev, { struct hl_hw_queue *q = &hdev->kernel_queues[GAUDI_QUEUE_ID_CPU_PQ]; - dev_err(hdev->dev, "Out of sync with FW, FW: pi=%u, ci=%u, LKD: pi=%u, ci=%u\n", - sync_err->pi, sync_err->ci, q->pi, atomic_read(&q->ci)); + dev_err(hdev->dev, "Out of sync with FW, FW: pi=%u, ci=%u, LKD: pi=%u, ci=%d\n", + le32_to_cpu(sync_err->pi), le32_to_cpu(sync_err->ci), q->pi, atomic_read(&q->ci)); } static void gaudi_print_fw_alive_info(struct hl_device *hdev, diff --git a/drivers/misc/habanalabs/gaudi2/gaudi2.c b/drivers/misc/habanalabs/gaudi2/gaudi2.c index 65c9b535aa69..bdb5782afb7e 100644 --- a/drivers/misc/habanalabs/gaudi2/gaudi2.c +++ b/drivers/misc/habanalabs/gaudi2/gaudi2.c @@ -8684,8 +8684,8 @@ static void gaudi2_print_out_of_sync_info(struct hl_device *hdev, { struct hl_hw_queue *q = &hdev->kernel_queues[GAUDI2_QUEUE_ID_CPU_PQ]; - dev_err(hdev->dev, "Out of sync with FW, FW: pi=%u, ci=%u, LKD: pi=%u, ci=%u\n", - sync_err->pi, sync_err->ci, q->pi, atomic_read(&q->ci)); + dev_err(hdev->dev, "Out of sync with FW, FW: pi=%u, ci=%u, LKD: pi=%u, ci=%d\n", + le32_to_cpu(sync_err->pi), le32_to_cpu(sync_err->ci), q->pi, atomic_read(&q->ci)); } static void gaudi2_handle_pcie_p2p_msix(struct hl_device *hdev) @@ -8751,8 +8751,8 @@ static void gaudi2_print_cpu_pkt_failure_info(struct hl_device *hdev, struct hl_hw_queue *q = &hdev->kernel_queues[GAUDI2_QUEUE_ID_CPU_PQ]; dev_warn(hdev->dev, - "FW reported sanity check failure, FW: pi=%u, ci=%u, LKD: pi=%u, ci=%u\n", - sync_err->pi, sync_err->ci, q->pi, atomic_read(&q->ci)); + "FW reported sanity check failure, FW: pi=%u, ci=%u, LKD: pi=%u, ci=%d\n", + le32_to_cpu(sync_err->pi), le32_to_cpu(sync_err->ci), q->pi, atomic_read(&q->ci)); } static void hl_arc_event_handle(struct hl_device *hdev, diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c index 5ef9e3ca97a6..0f083fcf81a6 100644 --- a/drivers/misc/habanalabs/goya/goya.c +++ b/drivers/misc/habanalabs/goya/goya.c @@ -4475,8 +4475,8 @@ static void goya_print_out_of_sync_info(struct hl_device *hdev, { struct hl_hw_queue *q = &hdev->kernel_queues[GOYA_QUEUE_ID_CPU_PQ]; - dev_err(hdev->dev, "Out of sync with FW, FW: pi=%u, ci=%u, LKD: pi=%u, ci=%u\n", - sync_err->pi, sync_err->ci, q->pi, atomic_read(&q->ci)); + dev_err(hdev->dev, "Out of sync with FW, FW: pi=%u, ci=%u, LKD: pi=%u, ci=%d\n", + le32_to_cpu(sync_err->pi), le32_to_cpu(sync_err->ci), q->pi, atomic_read(&q->ci)); } static void goya_print_irq_info(struct hl_device *hdev, u16 event_type, -- cgit v1.2.3 From fe3e88c9470ceb2ea67651aa397f29e80453eed1 Mon Sep 17 00:00:00 2001 From: Tomer Tayar Date: Mon, 7 Nov 2022 16:34:32 +0200 Subject: habanalabs/gaudi: fix print for firmware-alive event Add missing le{32,64}_to_cpu conversions. Signed-off-by: Tomer Tayar Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/gaudi/gaudi.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c index 7b93f0d26dd0..9f5e208701ba 100644 --- a/drivers/misc/habanalabs/gaudi/gaudi.c +++ b/drivers/misc/habanalabs/gaudi/gaudi.c @@ -7356,9 +7356,10 @@ static void gaudi_print_fw_alive_info(struct hl_device *hdev, { dev_err(hdev->dev, "FW alive report: severity=%s, process_id=%u, thread_id=%u, uptime=%llu seconds\n", - (fw_alive->severity == FW_ALIVE_SEVERITY_MINOR) ? - "Minor" : "Critical", fw_alive->process_id, - fw_alive->thread_id, fw_alive->uptime_seconds); + (fw_alive->severity == FW_ALIVE_SEVERITY_MINOR) ? "Minor" : "Critical", + le32_to_cpu(fw_alive->process_id), + le32_to_cpu(fw_alive->thread_id), + le64_to_cpu(fw_alive->uptime_seconds)); } static void gaudi_print_nic_axi_irq_info(struct hl_device *hdev, u16 event_type, -- cgit v1.2.3 From 24c983c88f5e7865de972e3b395baf2c237485ca Mon Sep 17 00:00:00 2001 From: farah kassabri Date: Tue, 8 Nov 2022 13:23:17 +0200 Subject: habanalabs/gaudi2: remove redundant firmware version check Firmware 1.7 is the first official firmware, so no need to check if we are running a version below it. Signed-off-by: farah kassabri Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/gaudi2/gaudi2.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/gaudi2/gaudi2.c b/drivers/misc/habanalabs/gaudi2/gaudi2.c index bdb5782afb7e..36f0ea1100bb 100644 --- a/drivers/misc/habanalabs/gaudi2/gaudi2.c +++ b/drivers/misc/habanalabs/gaudi2/gaudi2.c @@ -10358,10 +10358,9 @@ int gaudi2_send_device_activity(struct hl_device *hdev, bool open) { struct gaudi2_device *gaudi2 = hdev->asic_specific; - if (!(gaudi2->hw_cap_initialized & HW_CAP_CPU_Q) || hdev->fw_major_version < 37) + if (!(gaudi2->hw_cap_initialized & HW_CAP_CPU_Q)) return 0; - /* TODO: add check for FW version using minor ver once it's known */ return hl_fw_send_device_activity(hdev, open); } -- cgit v1.2.3 From 2c77ec14c2db228f76a74e9123aecbb5b8c994f5 Mon Sep 17 00:00:00 2001 From: Tomer Tayar Date: Thu, 20 Oct 2022 12:05:09 +0300 Subject: habanalabs/gaudi2: don't enable entries in the MSIX_GW table User should use the virtual MSI-X doorbell to generate interrupts from the device, so there is no need to enable entries in the MSIX_GW table. Signed-off-by: Tomer Tayar Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/gaudi2/gaudi2.c | 26 -------------------------- 1 file changed, 26 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/gaudi2/gaudi2.c b/drivers/misc/habanalabs/gaudi2/gaudi2.c index 36f0ea1100bb..d5efec347bc1 100644 --- a/drivers/misc/habanalabs/gaudi2/gaudi2.c +++ b/drivers/misc/habanalabs/gaudi2/gaudi2.c @@ -4695,30 +4695,6 @@ static void gaudi2_init_dec(struct hl_device *hdev) } } -static void gaudi2_init_msix_gw_table(struct hl_device *hdev) -{ - u32 first_reg_offset, last_reg_offset, msix_gw_table_base; - u8 first_bit, last_bit; - int i; - - msix_gw_table_base = mmPCIE_WRAP_MSIX_GW_TABLE_0; - first_reg_offset = (GAUDI2_IRQ_NUM_USER_FIRST >> 5) << 2; - first_bit = GAUDI2_IRQ_NUM_USER_FIRST % 32; - last_reg_offset = (GAUDI2_IRQ_NUM_USER_LAST >> 5) << 2; - last_bit = GAUDI2_IRQ_NUM_USER_LAST % 32; - - if (first_reg_offset == last_reg_offset) { - WREG32(msix_gw_table_base + first_reg_offset, GENMASK(last_bit, first_bit)); - return; - } - - WREG32(msix_gw_table_base + first_reg_offset, GENMASK(31, first_bit)); - WREG32(msix_gw_table_base + last_reg_offset, GENMASK(last_bit, 0)); - - for (i = first_reg_offset + 4; i < last_reg_offset ; i += 4) - WREG32(msix_gw_table_base + i, 0xFFFFFFFF); -} - static int gaudi2_mmu_update_asid_hop0_addr(struct hl_device *hdev, u32 stlb_base, u32 asid, u64 phys_addr) { @@ -5232,8 +5208,6 @@ static int gaudi2_hw_init(struct hl_device *hdev) return rc; } - gaudi2_init_msix_gw_table(hdev); - gaudi2_init_scrambler_hbm(hdev); gaudi2_init_kdma(hdev); -- cgit v1.2.3 From 9c604af0c9d4efe4f308761229186768b3f3a6a9 Mon Sep 17 00:00:00 2001 From: Tomer Tayar Date: Thu, 20 Oct 2022 14:40:16 +0300 Subject: habanalabs/gaudi2: return to reset upon SM SEI BRESP error Due to a H/W issue in the LBW path to the PCIE_DBI MSI-X doorbell, there were false sporadic error responses in SM when it was configured to write to there, and hence no reset was done as part of handling the relevant event. Now that the virtual MSI-X doorbell is used, such errors in SM are not expected and reset shouldn't be skipped. Signed-off-by: Tomer Tayar Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/gaudi2/gaudi2.c | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/gaudi2/gaudi2.c b/drivers/misc/habanalabs/gaudi2/gaudi2.c index d5efec347bc1..f0f2f77f56de 100644 --- a/drivers/misc/habanalabs/gaudi2/gaudi2.c +++ b/drivers/misc/habanalabs/gaudi2/gaudi2.c @@ -8300,11 +8300,10 @@ static void gaudi2_handle_mmu_spi_sei_generic(struct hl_device *hdev, const char WREG32(mmu_base + MMU_INTERRUPT_CLR_OFFSET, interrupt_clr); } -static bool gaudi2_handle_sm_err(struct hl_device *hdev, u8 sm_index) +static void gaudi2_handle_sm_err(struct hl_device *hdev, u8 sm_index) { u32 sei_cause_addr, sei_cause_val, sei_cause_cause, sei_cause_log; u32 cq_intr_addr, cq_intr_val, cq_intr_queue_index; - bool reset = true; int i; sei_cause_addr = mmDCORE0_SYNC_MNGR_GLBL_SM_SEI_CAUSE + DCORE_OFFSET * sm_index; @@ -8329,10 +8328,6 @@ static bool gaudi2_handle_sm_err(struct hl_device *hdev, u8 sm_index) gaudi2_sm_sei_cause[i].cause_name, gaudi2_sm_sei_cause[i].log_name, sei_cause_log & gaudi2_sm_sei_cause[i].log_mask); - - /* Due to a potential H/W issue, do not reset upon BRESP errors */ - if (i == 2) - reset = false; break; } @@ -8352,8 +8347,6 @@ static bool gaudi2_handle_sm_err(struct hl_device *hdev, u8 sm_index) /* Clear CQ_INTR */ WREG32(cq_intr_addr, 0); } - - return reset; } static void gaudi2_handle_mmu_spi_sei_err(struct hl_device *hdev, u16 event_type, u64 *event_mask) @@ -8755,8 +8748,8 @@ static void hl_arc_event_handle(struct hl_device *hdev, static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entry) { - bool reset_required = false, skip_reset = false, is_critical = false; struct gaudi2_device *gaudi2 = hdev->asic_specific; + bool reset_required = false, is_critical = false; u32 ctl, reset_flags = HL_DRV_RESET_HARD; int index, sbte_index; u64 event_mask = 0; @@ -9113,7 +9106,7 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent case GAUDI2_EVENT_SM0_AXI_ERROR_RESPONSE ... GAUDI2_EVENT_SM3_AXI_ERROR_RESPONSE: index = event_type - GAUDI2_EVENT_SM0_AXI_ERROR_RESPONSE; - skip_reset = !gaudi2_handle_sm_err(hdev, index); + gaudi2_handle_sm_err(hdev, index); event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR; break; @@ -9153,9 +9146,9 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent event_type); } - if ((gaudi2_irq_map_table[event_type].reset || reset_required) && !skip_reset && - (hdev->hard_reset_on_fw_events || - (hdev->asic_prop.fw_security_enabled && is_critical))) + if ((gaudi2_irq_map_table[event_type].reset || reset_required) && + (hdev->hard_reset_on_fw_events || + (hdev->asic_prop.fw_security_enabled && is_critical))) goto reset_device; /* Send unmask irq only for interrupts not classified as MSG */ -- cgit v1.2.3 From bc8e4bae70237f4671e07f83bcfb726eb14d86ed Mon Sep 17 00:00:00 2001 From: Tomer Tayar Date: Wed, 9 Nov 2022 18:08:38 +0200 Subject: habanalabs: reset device if still in use when released If the device file is released while a context is still held, it won't be possible to reopen it until the context is eventually released. If that doesn't happen, only a device reset will revert it back to an operational state, i.e. need to wait for a CS timeout or an error, or to wait for an external intervention of injecting a reset via sysfs. At this stage, after the device was released by user, context is held either because of CS which were left running on the device and are not relevant anymore, or due to missing cleanup steps from user side. All of this is in any case handled in the device reset flow, so initiate the reset at this point instead of waiting for it. Signed-off-by: Tomer Tayar Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/device.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c index 708db0f48ee0..49640c8ca910 100644 --- a/drivers/misc/habanalabs/common/device.c +++ b/drivers/misc/habanalabs/common/device.c @@ -504,9 +504,10 @@ static int hl_device_release(struct inode *inode, struct file *filp) hdev->compute_ctx_in_release = 1; - if (!hl_hpriv_put(hpriv)) - dev_notice(hdev->dev, - "User process closed FD but device still in use\n"); + if (!hl_hpriv_put(hpriv)) { + dev_notice(hdev->dev, "User process closed FD but device still in use\n"); + hl_device_reset(hdev, HL_DRV_RESET_HARD); + } hdev->last_open_session_duration_jif = jiffies - hdev->last_successful_open_jif; -- cgit v1.2.3 From f69c3e460a614cba8939f7c623f7b77f0bcb3584 Mon Sep 17 00:00:00 2001 From: Oded Gabbay Date: Thu, 10 Nov 2022 17:05:24 +0200 Subject: habanalabs: check schedule_hard_reset correctly schedule_hard_reset can be true only if we didn't do hard-reset. Therefore, no point of checking it in case hard_reset is true. Signed-off-by: Oded Gabbay Reviewed-by: Tomer Tayar --- drivers/misc/habanalabs/common/device.c | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c index 49640c8ca910..0650e511a0f5 100644 --- a/drivers/misc/habanalabs/common/device.c +++ b/drivers/misc/habanalabs/common/device.c @@ -1737,18 +1737,19 @@ kill_processes: * the device will be operational although it shouldn't be */ hdev->asic_funcs->enable_events_from_fw(hdev); - } else if (!reset_upon_device_release) { - hdev->reset_info.compute_reset_cnt++; - } - - if (schedule_hard_reset) { - dev_info(hdev->dev, "Performing hard reset scheduled during compute reset\n"); - flags = hdev->reset_info.hard_reset_schedule_flags; - hdev->reset_info.hard_reset_schedule_flags = 0; - hdev->disabled = true; - hard_reset = true; - handle_reset_trigger(hdev, flags); - goto again; + } else { + if (!reset_upon_device_release) + hdev->reset_info.compute_reset_cnt++; + + if (schedule_hard_reset) { + dev_info(hdev->dev, "Performing hard reset scheduled during compute reset\n"); + flags = hdev->reset_info.hard_reset_schedule_flags; + hdev->reset_info.hard_reset_schedule_flags = 0; + hdev->disabled = true; + hard_reset = true; + handle_reset_trigger(hdev, flags); + goto again; + } } return 0; -- cgit v1.2.3 From b585daa89d572210a94c7f11a746bd5489017003 Mon Sep 17 00:00:00 2001 From: Oded Gabbay Date: Thu, 10 Nov 2022 17:24:02 +0200 Subject: habanalabs: extend process wait timeout in device fine Processes that use our device are likely to use at the same time other devices such as remote storage. In case our device is removed and a user process is still using the device, we need to kill the user process. However, if that process has a thread waiting for i/o to complete on remote storage, for example, the process won't terminate. Let's give it enough time to terminate before giving up. Signed-off-by: Oded Gabbay Reviewed-by: Tomer Tayar --- drivers/misc/habanalabs/common/device.c | 6 ++++-- drivers/misc/habanalabs/common/habanalabs.h | 11 ++++++++--- 2 files changed, 12 insertions(+), 5 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c index 0650e511a0f5..63d0cb7087e8 100644 --- a/drivers/misc/habanalabs/common/device.c +++ b/drivers/misc/habanalabs/common/device.c @@ -2300,14 +2300,16 @@ void hl_device_fini(struct hl_device *hdev) */ dev_info(hdev->dev, "Waiting for all processes to exit (timeout of %u seconds)", - HL_PENDING_RESET_LONG_SEC); + HL_WAIT_PROCESS_KILL_ON_DEVICE_FINI); - rc = device_kill_open_processes(hdev, HL_PENDING_RESET_LONG_SEC, false); + hdev->process_kill_trial_cnt = 0; + rc = device_kill_open_processes(hdev, HL_WAIT_PROCESS_KILL_ON_DEVICE_FINI, false); if (rc) { dev_crit(hdev->dev, "Failed to kill all open processes\n"); device_disable_open_processes(hdev, false); } + hdev->process_kill_trial_cnt = 0; rc = device_kill_open_processes(hdev, 0, true); if (rc) { dev_crit(hdev->dev, "Failed to kill all control device open processes\n"); diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h index 0781b8698f74..e7f89868428d 100644 --- a/drivers/misc/habanalabs/common/habanalabs.h +++ b/drivers/misc/habanalabs/common/habanalabs.h @@ -50,9 +50,14 @@ struct hl_fpriv; #define HL_MMAP_OFFSET_VALUE_MASK (0x1FFFFFFFFFFFull >> PAGE_SHIFT) #define HL_MMAP_OFFSET_VALUE_GET(off) (off & HL_MMAP_OFFSET_VALUE_MASK) -#define HL_PENDING_RESET_PER_SEC 10 -#define HL_PENDING_RESET_MAX_TRIALS 60 /* 10 minutes */ -#define HL_PENDING_RESET_LONG_SEC 60 +#define HL_PENDING_RESET_PER_SEC 10 +#define HL_PENDING_RESET_MAX_TRIALS 60 /* 10 minutes */ +#define HL_PENDING_RESET_LONG_SEC 60 +/* + * In device fini, wait 10 minutes for user processes to be terminated after we kill them. + * This is needed to prevent situation of clearing resources while user processes are still alive. + */ +#define HL_WAIT_PROCESS_KILL_ON_DEVICE_FINI 600 #define HL_HARD_RESET_MAX_TIMEOUT 120 #define HL_PLDM_HARD_RESET_MAX_TIMEOUT (HL_HARD_RESET_MAX_TIMEOUT * 3) -- cgit v1.2.3 From 18cd948204fffa61660d3f8454fc9d275c1f6c94 Mon Sep 17 00:00:00 2001 From: farah kassabri Date: Tue, 8 Nov 2022 15:24:33 +0200 Subject: habanalabs/gaudi2: change memory scrub mechanism Currently the scrubbing mechanism used the EDMA engines by directly setting the engine core registers to scrub a chunk of memory. Due to a sporadic failure with this mechanism, it was decided to initiate the engines via its QMAN using LIN-DMA packets. Signed-off-by: farah kassabri Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/gaudi2/gaudi2.c | 129 ++++++++++++++++++++------------ 1 file changed, 83 insertions(+), 46 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/gaudi2/gaudi2.c b/drivers/misc/habanalabs/gaudi2/gaudi2.c index f0f2f77f56de..c14e63164a84 100644 --- a/drivers/misc/habanalabs/gaudi2/gaudi2.c +++ b/drivers/misc/habanalabs/gaudi2/gaudi2.c @@ -9171,34 +9171,74 @@ reset_device: hl_device_cond_reset(hdev, reset_flags, event_mask); } +static int gaudi2_memset_memory_chunk_using_edma_qm(struct hl_device *hdev, + struct packet_lin_dma *lin_dma_pkt, dma_addr_t pkt_dma_addr, + u32 hw_queue_id, u32 size, u64 addr, u32 val) +{ + u32 ctl, pkt_size; + int rc = 0; + + ctl = FIELD_PREP(GAUDI2_PKT_CTL_OPCODE_MASK, PACKET_LIN_DMA); + ctl |= FIELD_PREP(GAUDI2_PKT_LIN_DMA_CTL_MEMSET_MASK, 1); + ctl |= FIELD_PREP(GAUDI2_PKT_LIN_DMA_CTL_WRCOMP_MASK, 1); + ctl |= FIELD_PREP(GAUDI2_PKT_CTL_EB_MASK, 1); + + lin_dma_pkt->ctl = cpu_to_le32(ctl); + lin_dma_pkt->src_addr = cpu_to_le64(val); + lin_dma_pkt->dst_addr = cpu_to_le64(addr); + lin_dma_pkt->tsize = cpu_to_le32(size); + + pkt_size = sizeof(struct packet_lin_dma); + + rc = hl_hw_queue_send_cb_no_cmpl(hdev, hw_queue_id, pkt_size, pkt_dma_addr); + if (rc) + dev_err(hdev->dev, "Failed to send lin dma packet to H/W queue %d\n", + hw_queue_id); + + return rc; +} + static int gaudi2_memset_device_memory(struct hl_device *hdev, u64 addr, u64 size, u64 val) { - struct asic_fixed_properties *prop = &hdev->asic_prop; + u32 edma_queues_id[] = {GAUDI2_QUEUE_ID_DCORE0_EDMA_0_0, + GAUDI2_QUEUE_ID_DCORE1_EDMA_0_0, + GAUDI2_QUEUE_ID_DCORE2_EDMA_0_0, + GAUDI2_QUEUE_ID_DCORE3_EDMA_0_0}; + u32 chunk_size, dcore, edma_idx, sob_offset, sob_addr, comp_val, + old_mmubp, mmubp, num_of_pkts, busy, pkt_size; u64 comp_addr, cur_addr = addr, end_addr = addr + size; - u32 chunk_size, busy, dcore, edma_idx, sob_offset, sob_addr, comp_val, edma_commit; - u32 old_mmubp, mmubp; - int rc = 0; + struct asic_fixed_properties *prop = &hdev->asic_prop; + void *lin_dma_pkts_arr; + dma_addr_t pkt_dma_addr; + int rc = 0, dma_num = 0; + + if (prop->edma_enabled_mask == 0) { + dev_info(hdev->dev, "non of the EDMA engines is enabled - skip dram scrubbing\n"); + return -EIO; + } sob_offset = hdev->asic_prop.first_available_user_sob[0] * 4; sob_addr = mmDCORE0_SYNC_MNGR_OBJS_SOB_OBJ_0 + sob_offset; comp_addr = CFG_BASE + sob_addr; comp_val = FIELD_PREP(DCORE0_SYNC_MNGR_OBJS_SOB_OBJ_INC_MASK, 1) | FIELD_PREP(DCORE0_SYNC_MNGR_OBJS_SOB_OBJ_VAL_MASK, 1); - - edma_commit = FIELD_PREP(ARC_FARM_KDMA_CTX_COMMIT_LIN_MASK, 1) | - FIELD_PREP(ARC_FARM_KDMA_CTX_COMMIT_MEM_SET_MASK, 1) | - FIELD_PREP(ARC_FARM_KDMA_CTX_COMMIT_WR_COMP_EN_MASK, 1); mmubp = FIELD_PREP(ARC_FARM_KDMA_CTX_AXUSER_HB_MMU_BP_WR_MASK, 1) | FIELD_PREP(ARC_FARM_KDMA_CTX_AXUSER_HB_MMU_BP_RD_MASK, 1); - if (prop->edma_enabled_mask == 0) { - dev_info(hdev->dev, "non of the EDMA engines is enabled - skip dram scrubbing\n"); - return -EIO; - } + /* Calculate how many lin dma pkts we'll need */ + num_of_pkts = div64_u64(round_up(size, SZ_2G), SZ_2G); + pkt_size = sizeof(struct packet_lin_dma); + + lin_dma_pkts_arr = hl_asic_dma_alloc_coherent(hdev, pkt_size * num_of_pkts, + &pkt_dma_addr, GFP_KERNEL); + if (!lin_dma_pkts_arr) + return -ENOMEM; /* * set mmu bypass for the scrubbing - all ddmas are configured the same so save * only the first one to restore later + * also set the sob addr for all edma cores for completion. + * set QM as trusted to allow it to access physical address with MMU bp. */ old_mmubp = RREG32(mmDCORE0_EDMA0_CORE_CTX_AXUSER_HB_MMU_BP); for (dcore = 0 ; dcore < NUM_OF_DCORES ; dcore++) { @@ -9211,17 +9251,22 @@ static int gaudi2_memset_device_memory(struct hl_device *hdev, u64 addr, u64 siz WREG32(mmDCORE0_EDMA0_CORE_CTX_AXUSER_HB_MMU_BP + edma_offset, mmubp); + WREG32(mmDCORE0_EDMA0_CORE_CTX_WR_COMP_ADDR_LO + edma_offset, + lower_32_bits(comp_addr)); + WREG32(mmDCORE0_EDMA0_CORE_CTX_WR_COMP_ADDR_HI + edma_offset, + upper_32_bits(comp_addr)); + WREG32(mmDCORE0_EDMA0_CORE_CTX_WR_COMP_WDATA + edma_offset, + comp_val); + gaudi2_qman_set_test_mode(hdev, + edma_queues_id[dcore] + 4 * edma_idx, true); } } - while (cur_addr < end_addr) { - int dma_num = 0; + WREG32(sob_addr, 0); - WREG32(sob_addr, 0); + while (cur_addr < end_addr) { for (dcore = 0 ; dcore < NUM_OF_DCORES ; dcore++) { for (edma_idx = 0 ; edma_idx < NUM_OF_EDMA_PER_DCORE ; edma_idx++) { - u32 edma_offset = dcore * DCORE_OFFSET + - edma_idx * DCORE_EDMA_OFFSET; u32 edma_bit = dcore * NUM_OF_EDMA_PER_DCORE + edma_idx; if (!(prop->edma_enabled_mask & BIT(edma_bit))) @@ -9229,41 +9274,26 @@ static int gaudi2_memset_device_memory(struct hl_device *hdev, u64 addr, u64 siz chunk_size = min_t(u64, SZ_2G, end_addr - cur_addr); - WREG32(mmDCORE0_EDMA0_CORE_CTX_SRC_BASE_LO + edma_offset, - lower_32_bits(val)); - WREG32(mmDCORE0_EDMA0_CORE_CTX_SRC_BASE_HI + edma_offset, - upper_32_bits(val)); - - WREG32(mmDCORE0_EDMA0_CORE_CTX_DST_BASE_LO + edma_offset, - lower_32_bits(cur_addr)); - WREG32(mmDCORE0_EDMA0_CORE_CTX_DST_BASE_HI + edma_offset, - upper_32_bits(cur_addr)); - - WREG32(mmDCORE0_EDMA0_CORE_CTX_WR_COMP_ADDR_LO + edma_offset, - lower_32_bits(comp_addr)); - WREG32(mmDCORE0_EDMA0_CORE_CTX_WR_COMP_ADDR_HI + edma_offset, - upper_32_bits(comp_addr)); - WREG32(mmDCORE0_EDMA0_CORE_CTX_WR_COMP_WDATA + edma_offset, - comp_val); - - WREG32(mmDCORE0_EDMA0_CORE_CTX_DST_TSIZE_0 + edma_offset, - chunk_size); - WREG32(mmDCORE0_EDMA0_CORE_CTX_COMMIT + edma_offset, edma_commit); + rc = gaudi2_memset_memory_chunk_using_edma_qm(hdev, + (struct packet_lin_dma *)lin_dma_pkts_arr + dma_num, + pkt_dma_addr + dma_num * pkt_size, + edma_queues_id[dcore] + edma_idx * 4, + chunk_size, cur_addr, val); + if (rc) + goto end; dma_num++; - cur_addr += chunk_size; - if (cur_addr == end_addr) - goto poll; + break; } } -poll: - rc = hl_poll_timeout(hdev, sob_addr, busy, (busy == dma_num), 1000, 1000000); - if (rc) { - dev_err(hdev->dev, "DMA Timeout during HBM scrubbing\n"); - goto end; - } + } + + rc = hl_poll_timeout(hdev, sob_addr, busy, (busy == dma_num), 1000, 1000000); + if (rc) { + dev_err(hdev->dev, "DMA Timeout during HBM scrubbing\n"); + goto end; } end: for (dcore = 0 ; dcore < NUM_OF_DCORES ; dcore++) { @@ -9275,10 +9305,17 @@ end: continue; WREG32(mmDCORE0_EDMA0_CORE_CTX_AXUSER_HB_MMU_BP + edma_offset, old_mmubp); + WREG32(mmDCORE0_EDMA0_CORE_CTX_WR_COMP_ADDR_LO + edma_offset, 0); + WREG32(mmDCORE0_EDMA0_CORE_CTX_WR_COMP_ADDR_HI + edma_offset, 0); + WREG32(mmDCORE0_EDMA0_CORE_CTX_WR_COMP_WDATA + edma_offset, 0); + gaudi2_qman_set_test_mode(hdev, + edma_queues_id[dcore] + 4 * edma_idx, false); } } WREG32(sob_addr, 0); + hl_asic_dma_free_coherent(hdev, pkt_size * num_of_pkts, lin_dma_pkts_arr, pkt_dma_addr); + return rc; } -- cgit v1.2.3 From 01907ba5252164ca6bf0de670660cd94d77c378c Mon Sep 17 00:00:00 2001 From: Tomer Tayar Date: Sun, 23 Oct 2022 12:55:21 +0300 Subject: habanalabs: increase the size of busy engines mask Increase the size of the busy engines mask in 'struct hl_info_hw_idle', for future ASICs with more than 128 engines. Signed-off-by: Tomer Tayar Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/device.c | 9 +++++---- include/uapi/misc/habanalabs.h | 2 +- 2 files changed, 6 insertions(+), 5 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c index 63d0cb7087e8..f5864893237c 100644 --- a/drivers/misc/habanalabs/common/device.c +++ b/drivers/misc/habanalabs/common/device.c @@ -416,8 +416,9 @@ static void hpriv_release(struct kref *ref) device_is_idle = hdev->asic_funcs->is_device_idle(hdev, idle_mask, HL_BUSY_ENGINES_MASK_EXT_SIZE, NULL); if (!device_is_idle) { - dev_err(hdev->dev, "device not idle after user context is closed (0x%llx_%llx)\n", - idle_mask[1], idle_mask[0]); + dev_err(hdev->dev, + "device not idle after user context is closed (0x%llx_%llx_%llx_%llx)\n", + idle_mask[3], idle_mask[2], idle_mask[1], idle_mask[0]); reset_device = true; } @@ -1661,8 +1662,8 @@ kill_processes: /* If device is not idle fail the reset process */ if (!hdev->asic_funcs->is_device_idle(hdev, idle_mask, HL_BUSY_ENGINES_MASK_EXT_SIZE, NULL)) { - dev_err(hdev->dev, "device is not idle (mask 0x%llx_%llx) after reset\n", - idle_mask[1], idle_mask[0]); + dev_err(hdev->dev, "device is not idle (mask 0x%llx_%llx_%llx_%llx) after reset\n", + idle_mask[3], idle_mask[2], idle_mask[1], idle_mask[0]); rc = -EIO; goto out_err; } diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h index e50cb71df081..3b995e841eb8 100644 --- a/include/uapi/misc/habanalabs.h +++ b/include/uapi/misc/habanalabs.h @@ -916,7 +916,7 @@ struct hl_info_dram_usage { __u64 ctx_dram_mem; }; -#define HL_BUSY_ENGINES_MASK_EXT_SIZE 2 +#define HL_BUSY_ENGINES_MASK_EXT_SIZE 4 struct hl_info_hw_idle { __u32 is_idle; -- cgit v1.2.3 From 5908560a7f14171d1100f4a357deda659dc26868 Mon Sep 17 00:00:00 2001 From: Marco Pagani Date: Wed, 16 Nov 2022 14:41:25 +0100 Subject: habanalabs: added return value check for hl_fw_dynamic_send_clear_cmd() The clang-analyzer reported a warning: "Value stored to 'rc' is never read". The return value check for the first hl_fw_dynamic_send_clear_cmd() call in hl_fw_dynamic_send_protocol_cmd() appears to be missing. Signed-off-by: Marco Pagani Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/firmware_if.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/common/firmware_if.c b/drivers/misc/habanalabs/common/firmware_if.c index 01c4ffba6e97..c0909d76d6eb 100644 --- a/drivers/misc/habanalabs/common/firmware_if.c +++ b/drivers/misc/habanalabs/common/firmware_if.c @@ -1783,6 +1783,8 @@ int hl_fw_dynamic_send_protocol_cmd(struct hl_device *hdev, /* first send clear command to clean former commands */ rc = hl_fw_dynamic_send_clear_cmd(hdev, fw_loader); + if (rc) + return rc; /* send the actual command */ hl_fw_dynamic_send_cmd(hdev, fw_loader, cmd, size); -- cgit v1.2.3 From 6825b5f81f273fcc1ec61e7e203b0ea40d9987fc Mon Sep 17 00:00:00 2001 From: Marco Pagani Date: Wed, 23 Nov 2022 09:56:39 +0100 Subject: habanalabs/gaudi2: added memset for the cq_size register The clang-analyzer reported a warning: "Value stored to 'cq_size_addr' is never read". The cq_size register of dcore0 is not being zeroed using gaudi2_memset_device_lbw(), along with the other cq_* registers, even though the corresponding cq_size_addr variable is set. Signed-off-by: Marco Pagani Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/gaudi2/gaudi2.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/gaudi2/gaudi2.c b/drivers/misc/habanalabs/gaudi2/gaudi2.c index c14e63164a84..a33a9072fca4 100644 --- a/drivers/misc/habanalabs/gaudi2/gaudi2.c +++ b/drivers/misc/habanalabs/gaudi2/gaudi2.c @@ -9386,6 +9386,7 @@ static void gaudi2_restore_user_sm_registers(struct hl_device *hdev) gaudi2_memset_device_lbw(hdev, cq_lbw_data_addr, size, 0); gaudi2_memset_device_lbw(hdev, cq_base_l_addr, size, 0); gaudi2_memset_device_lbw(hdev, cq_base_h_addr, size, 0); + gaudi2_memset_device_lbw(hdev, cq_size_addr, size, 0); cq_lbw_l_addr = mmDCORE0_SYNC_MNGR_GLBL_LBW_ADDR_L_0 + DCORE_OFFSET; cq_lbw_h_addr = mmDCORE0_SYNC_MNGR_GLBL_LBW_ADDR_H_0 + DCORE_OFFSET; -- cgit v1.2.3 From 56fb517775f4d71dbca2b1fb3562276138361072 Mon Sep 17 00:00:00 2001 From: Tomer Tayar Date: Fri, 18 Nov 2022 15:08:33 +0200 Subject: habanalabs: fix rc when new CPUCP opcodes are not supported When the new CPUCP opcodes are not supported and a CPUCP packet fails, the return value is the F/W error resposone which is a positive value. If this packet is sent from IOCTL and the positive value is used, the ICOTL will not be considered as unsuccessful. Signed-off-by: Tomer Tayar Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/firmware_if.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/common/firmware_if.c b/drivers/misc/habanalabs/common/firmware_if.c index c0909d76d6eb..cf8147e43833 100644 --- a/drivers/misc/habanalabs/common/firmware_if.c +++ b/drivers/misc/habanalabs/common/firmware_if.c @@ -324,6 +324,7 @@ int hl_fw_send_cpu_message(struct hl_device *hdev, u32 hw_queue_id, u32 *msg, if (!prop->supports_advanced_cpucp_rc) { dev_dbg(hdev->dev, "F/W ERROR %d for CPU packet %d\n", rc, opcode); + rc = -EIO; goto scrub_descriptor; } -- cgit v1.2.3 From 0abcae8b48850e0f488d0eb7232323d93bdc4b13 Mon Sep 17 00:00:00 2001 From: Dafna Hirschfeld Date: Sun, 30 Oct 2022 16:49:42 +0200 Subject: habanalabs: add RMWREG32_SHIFTED to set a val within a mask This is similar to RMWREG32, but the given 'val' is already shifted according to the mask. This allows several 'ORed' vals and masks to be set at once The patch also fixes wrong usage of RMWREG32 by replacing it with RMWREG32_SHIFTED Signed-off-by: Dafna Hirschfeld Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/habanalabs.h | 10 +++------- drivers/misc/habanalabs/gaudi2/gaudi2.c | 6 +++--- 2 files changed, 6 insertions(+), 10 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h index e7f89868428d..0329a0980bb7 100644 --- a/drivers/misc/habanalabs/common/habanalabs.h +++ b/drivers/misc/habanalabs/common/habanalabs.h @@ -2498,13 +2498,9 @@ void hl_wreg(struct hl_device *hdev, u32 reg, u32 val); #define WREG32_AND(reg, and) WREG32_P(reg, 0, and) #define WREG32_OR(reg, or) WREG32_P(reg, or, ~(or)) -#define RMWREG32(reg, val, mask) \ - do { \ - u32 tmp_ = RREG32(reg); \ - tmp_ &= ~(mask); \ - tmp_ |= ((val) << __ffs(mask)); \ - WREG32(reg, tmp_); \ - } while (0) +#define RMWREG32_SHIFTED(reg, val, mask) WREG32_P(reg, val, ~(mask)) + +#define RMWREG32(reg, val, mask) RMWREG32_SHIFTED(reg, (val) << __ffs(mask), mask) #define RREG32_MASK(reg, mask) ((RREG32(reg) & mask) >> __ffs(mask)) diff --git a/drivers/misc/habanalabs/gaudi2/gaudi2.c b/drivers/misc/habanalabs/gaudi2/gaudi2.c index a33a9072fca4..e793fb2bdcbe 100644 --- a/drivers/misc/habanalabs/gaudi2/gaudi2.c +++ b/drivers/misc/habanalabs/gaudi2/gaudi2.c @@ -5052,7 +5052,7 @@ static int gaudi2_pci_mmu_init(struct hl_device *hdev) mmu_base = mmPMMU_HBW_MMU_BASE; stlb_base = mmPMMU_HBW_STLB_BASE; - RMWREG32(stlb_base + STLB_HOP_CONFIGURATION_OFFSET, + RMWREG32_SHIFTED(stlb_base + STLB_HOP_CONFIGURATION_OFFSET, (0 << PMMU_HBW_STLB_HOP_CONFIGURATION_FIRST_HOP_SHIFT) | (5 << PMMU_HBW_STLB_HOP_CONFIGURATION_FIRST_LOOKUP_HOP_SMALL_P_SHIFT) | (4 << PMMU_HBW_STLB_HOP_CONFIGURATION_FIRST_LOOKUP_HOP_LARGE_P_SHIFT) | @@ -5068,7 +5068,7 @@ static int gaudi2_pci_mmu_init(struct hl_device *hdev) if (PAGE_SIZE == SZ_64K) { /* Set page sizes to 64K on hop5 and 16M on hop4 + enable 8 bit hops */ - RMWREG32(mmu_base + MMU_STATIC_MULTI_PAGE_SIZE_OFFSET, + RMWREG32_SHIFTED(mmu_base + MMU_STATIC_MULTI_PAGE_SIZE_OFFSET, FIELD_PREP(DCORE0_HMMU0_MMU_STATIC_MULTI_PAGE_SIZE_HOP5_PAGE_SIZE_MASK, 4) | FIELD_PREP(DCORE0_HMMU0_MMU_STATIC_MULTI_PAGE_SIZE_HOP4_PAGE_SIZE_MASK, 3) | FIELD_PREP( @@ -5116,7 +5116,7 @@ static int gaudi2_dcore_hmmu_init(struct hl_device *hdev, int dcore_id, RMWREG32(mmu_base + MMU_STATIC_MULTI_PAGE_SIZE_OFFSET, 5 /* 64MB */, MMU_STATIC_MULTI_PAGE_SIZE_HOP4_PAGE_SIZE_MASK); - RMWREG32(stlb_base + STLB_HOP_CONFIGURATION_OFFSET, + RMWREG32_SHIFTED(stlb_base + STLB_HOP_CONFIGURATION_OFFSET, FIELD_PREP(DCORE0_HMMU0_STLB_HOP_CONFIGURATION_FIRST_HOP_MASK, 0) | FIELD_PREP(DCORE0_HMMU0_STLB_HOP_CONFIGURATION_FIRST_LOOKUP_HOP_SMALL_P_MASK, 3) | FIELD_PREP(DCORE0_HMMU0_STLB_HOP_CONFIGURATION_FIRST_LOOKUP_HOP_LARGE_P_MASK, 3) | -- cgit v1.2.3 From 408c46bd6eb7a4e2fb9fd686218e4a13b9de844c Mon Sep 17 00:00:00 2001 From: Tomer Tayar Date: Mon, 14 Nov 2022 13:26:21 +0200 Subject: habanalabs: print context refcount value if hard reset fails Failing to kill a user process during a hard reset can be due to a reference to the user context which isn't released. To make it easier to understand if this the reason for the failure and not something else, add a print of the context refcount value. Signed-off-by: Tomer Tayar Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/device.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c index f5864893237c..926f230def56 100644 --- a/drivers/misc/habanalabs/common/device.c +++ b/drivers/misc/habanalabs/common/device.c @@ -696,10 +696,22 @@ static void device_hard_reset_pending(struct work_struct *work) flags = device_reset_work->flags | HL_DRV_RESET_FROM_RESET_THR; rc = hl_device_reset(hdev, flags); + if ((rc == -EBUSY) && !hdev->device_fini_pending) { - dev_info(hdev->dev, - "Could not reset device. will try again in %u seconds", - HL_PENDING_RESET_PER_SEC); + struct hl_ctx *ctx = hl_get_compute_ctx(hdev); + + if (ctx) { + /* The read refcount value should subtracted by one, because the read is + * protected with hl_get_compute_ctx(). + */ + dev_info(hdev->dev, + "Could not reset device (compute_ctx refcount %u). will try again in %u seconds", + kref_read(&ctx->refcount) - 1, HL_PENDING_RESET_PER_SEC); + hl_ctx_put(ctx); + } else { + dev_info(hdev->dev, "Could not reset device. will try again in %u seconds", + HL_PENDING_RESET_PER_SEC); + } queue_delayed_work(hdev->reset_wq, &device_reset_work->reset_work, msecs_to_jiffies(HL_PENDING_RESET_PER_SEC * 1000)); -- cgit v1.2.3 From 1f615120fc9d24a8df7f14b0d1e79f3402330855 Mon Sep 17 00:00:00 2001 From: Tomer Tayar Date: Wed, 16 Nov 2022 13:14:02 +0200 Subject: habanalabs: don't put context in hl_encaps_handle_do_release_sob() hl_encaps_handle_do_release_sob() can be called only when the last reference to the context object is released and hl_ctx_do_release() is initiated, and therefore it shouldn't call hl_ctx_put(). Signed-off-by: Tomer Tayar Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/context.c | 1 - 1 file changed, 1 deletion(-) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/common/context.c b/drivers/misc/habanalabs/common/context.c index 2f4620b7990c..ba6675960203 100644 --- a/drivers/misc/habanalabs/common/context.c +++ b/drivers/misc/habanalabs/common/context.c @@ -39,7 +39,6 @@ static void hl_encaps_handle_do_release_sob(struct kref *ref) idr_remove(&mgr->handles, handle->id); spin_unlock(&mgr->lock); - hl_ctx_put(handle->ctx); kfree(handle); } -- cgit v1.2.3 From 893afb248c7a1f24d17719a5e5f4fe4174ecb60c Mon Sep 17 00:00:00 2001 From: Tomer Tayar Date: Thu, 17 Nov 2022 15:22:31 +0200 Subject: habanalabs: clear non-released encapsulated signals Reserved encapsulated signals which were not released hold the context refcount, leading to a failure when killing the user process on device reset or device fini. Add the release of these left signals in the CS roll-back process. Signed-off-by: Tomer Tayar Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- .../misc/habanalabs/common/command_submission.c | 46 +++++++++++++++---- drivers/misc/habanalabs/common/context.c | 53 +++++++++++++--------- drivers/misc/habanalabs/common/habanalabs.h | 3 +- 3 files changed, 71 insertions(+), 31 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/common/command_submission.c b/drivers/misc/habanalabs/common/command_submission.c index f1c69c8ed74a..ea0e5101c10e 100644 --- a/drivers/misc/habanalabs/common/command_submission.c +++ b/drivers/misc/habanalabs/common/command_submission.c @@ -742,13 +742,11 @@ static void cs_do_release(struct kref *ref) */ if (hl_cs_cmpl->encaps_signals) kref_put(&hl_cs_cmpl->encaps_sig_hdl->refcount, - hl_encaps_handle_do_release); + hl_encaps_release_handle_and_put_ctx); } - if ((cs->type == CS_TYPE_WAIT || cs->type == CS_TYPE_COLLECTIVE_WAIT) - && cs->encaps_signals) - kref_put(&cs->encaps_sig_hdl->refcount, - hl_encaps_handle_do_release); + if ((cs->type == CS_TYPE_WAIT || cs->type == CS_TYPE_COLLECTIVE_WAIT) && cs->encaps_signals) + kref_put(&cs->encaps_sig_hdl->refcount, hl_encaps_release_handle_and_put_ctx); out: /* Must be called before hl_ctx_put because inside we use ctx to get @@ -1011,6 +1009,34 @@ static void cs_rollback(struct hl_device *hdev, struct hl_cs *cs) hl_complete_job(hdev, job); } +/* + * release_reserved_encaps_signals() - release reserved encapsulated signals. + * @hdev: pointer to habanalabs device structure + * + * Release reserved encapsulated signals which weren't un-reserved, or for which a CS with + * encapsulated signals wasn't submitted and thus weren't released as part of CS roll-back. + * For these signals need also to put the refcount of the H/W SOB which was taken at the + * reservation. + */ +static void release_reserved_encaps_signals(struct hl_device *hdev) +{ + struct hl_ctx *ctx = hl_get_compute_ctx(hdev); + struct hl_cs_encaps_sig_handle *handle; + struct hl_encaps_signals_mgr *mgr; + u32 id; + + if (!ctx) + return; + + mgr = &ctx->sig_mgr; + + idr_for_each_entry(&mgr->handles, handle, id) + if (handle->cs_seq == ULLONG_MAX) + kref_put(&handle->refcount, hl_encaps_release_handle_and_put_sob_ctx); + + hl_ctx_put(ctx); +} + void hl_cs_rollback_all(struct hl_device *hdev, bool skip_wq_flush) { int i; @@ -1039,6 +1065,8 @@ void hl_cs_rollback_all(struct hl_device *hdev, bool skip_wq_flush) } force_complete_multi_cs(hdev); + + release_reserved_encaps_signals(hdev); } static void @@ -2001,6 +2029,8 @@ static int cs_ioctl_reserve_signals(struct hl_fpriv *hpriv, */ handle->pre_sob_val = prop->next_sob_val - handle->count; + handle->cs_seq = ULLONG_MAX; + *signals_count = prop->next_sob_val; hdev->asic_funcs->hw_queues_unlock(hdev); @@ -2350,10 +2380,8 @@ put_cs: /* We finished with the CS in this function, so put the ref */ cs_put(cs); free_cs_chunk_array: - if (!wait_cs_submitted && cs_encaps_signals && handle_found && - is_wait_cs) - kref_put(&encaps_sig_hdl->refcount, - hl_encaps_handle_do_release); + if (!wait_cs_submitted && cs_encaps_signals && handle_found && is_wait_cs) + kref_put(&encaps_sig_hdl->refcount, hl_encaps_release_handle_and_put_ctx); kfree(cs_chunk_array); out: return rc; diff --git a/drivers/misc/habanalabs/common/context.c b/drivers/misc/habanalabs/common/context.c index ba6675960203..9c8b1b37b510 100644 --- a/drivers/misc/habanalabs/common/context.c +++ b/drivers/misc/habanalabs/common/context.c @@ -9,37 +9,46 @@ #include -void hl_encaps_handle_do_release(struct kref *ref) +static void encaps_handle_do_release(struct hl_cs_encaps_sig_handle *handle, bool put_hw_sob, + bool put_ctx) { - struct hl_cs_encaps_sig_handle *handle = - container_of(ref, struct hl_cs_encaps_sig_handle, refcount); struct hl_encaps_signals_mgr *mgr = &handle->ctx->sig_mgr; + if (put_hw_sob) + hw_sob_put(handle->hw_sob); + spin_lock(&mgr->lock); idr_remove(&mgr->handles, handle->id); spin_unlock(&mgr->lock); - hl_ctx_put(handle->ctx); + if (put_ctx) + hl_ctx_put(handle->ctx); + kfree(handle); } -static void hl_encaps_handle_do_release_sob(struct kref *ref) +void hl_encaps_release_handle_and_put_ctx(struct kref *ref) { struct hl_cs_encaps_sig_handle *handle = - container_of(ref, struct hl_cs_encaps_sig_handle, refcount); - struct hl_encaps_signals_mgr *mgr = &handle->ctx->sig_mgr; + container_of(ref, struct hl_cs_encaps_sig_handle, refcount); - /* if we're here, then there was a signals reservation but cs with - * encaps signals wasn't submitted, so need to put refcount - * to hw_sob taken at the reservation. - */ - hw_sob_put(handle->hw_sob); + encaps_handle_do_release(handle, false, true); +} - spin_lock(&mgr->lock); - idr_remove(&mgr->handles, handle->id); - spin_unlock(&mgr->lock); +static void hl_encaps_release_handle_and_put_sob(struct kref *ref) +{ + struct hl_cs_encaps_sig_handle *handle = + container_of(ref, struct hl_cs_encaps_sig_handle, refcount); - kfree(handle); + encaps_handle_do_release(handle, true, false); +} + +void hl_encaps_release_handle_and_put_sob_ctx(struct kref *ref) +{ + struct hl_cs_encaps_sig_handle *handle = + container_of(ref, struct hl_cs_encaps_sig_handle, refcount); + + encaps_handle_do_release(handle, true, true); } static void hl_encaps_sig_mgr_init(struct hl_encaps_signals_mgr *mgr) @@ -48,8 +57,7 @@ static void hl_encaps_sig_mgr_init(struct hl_encaps_signals_mgr *mgr) idr_init(&mgr->handles); } -static void hl_encaps_sig_mgr_fini(struct hl_device *hdev, - struct hl_encaps_signals_mgr *mgr) +static void hl_encaps_sig_mgr_fini(struct hl_device *hdev, struct hl_encaps_signals_mgr *mgr) { struct hl_cs_encaps_sig_handle *handle; struct idr *idp; @@ -57,11 +65,14 @@ static void hl_encaps_sig_mgr_fini(struct hl_device *hdev, idp = &mgr->handles; + /* The IDR is expected to be empty at this stage, because any left signal should have been + * released as part of CS roll-back. + */ if (!idr_is_empty(idp)) { - dev_warn(hdev->dev, "device released while some encaps signals handles are still allocated\n"); + dev_warn(hdev->dev, + "device released while some encaps signals handles are still allocated\n"); idr_for_each_entry(idp, handle, id) - kref_put(&handle->refcount, - hl_encaps_handle_do_release_sob); + kref_put(&handle->refcount, hl_encaps_release_handle_and_put_sob); } idr_destroy(&mgr->handles); diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h index 0329a0980bb7..e2527d976ee0 100644 --- a/drivers/misc/habanalabs/common/habanalabs.h +++ b/drivers/misc/habanalabs/common/habanalabs.h @@ -3775,7 +3775,8 @@ void hl_sysfs_add_dev_vrm_attr(struct hl_device *hdev, struct attribute_group *d void hw_sob_get(struct hl_hw_sob *hw_sob); void hw_sob_put(struct hl_hw_sob *hw_sob); -void hl_encaps_handle_do_release(struct kref *ref); +void hl_encaps_release_handle_and_put_ctx(struct kref *ref); +void hl_encaps_release_handle_and_put_sob_ctx(struct kref *ref); void hl_hw_queue_encaps_sig_set_sob_info(struct hl_device *hdev, struct hl_cs *cs, struct hl_cs_job *job, struct hl_cs_compl *cs_cmpl); -- cgit v1.2.3 From 1b18cf33d6ce63a9f5fe3764d7b20c4738dd1245 Mon Sep 17 00:00:00 2001 From: Tomer Tayar Date: Thu, 17 Nov 2022 18:57:49 +0200 Subject: habanalabs: make print of engines idle mask more readable The engines idle mask was increased to be an array of 4 u64 entries. To make the print of this mask more readable, remove the "0x" prefix, and zero-pad each u64 to 16 bytes if either it isn't zero or if any of the higher-order u64's is not zero. Signed-off-by: Tomer Tayar Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/device.c | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c index 926f230def56..87ab329e65d4 100644 --- a/drivers/misc/habanalabs/common/device.c +++ b/drivers/misc/habanalabs/common/device.c @@ -386,6 +386,23 @@ bool hl_ctrl_device_operational(struct hl_device *hdev, } } +static void print_idle_status_mask(struct hl_device *hdev, const char *message, + u64 idle_mask[HL_BUSY_ENGINES_MASK_EXT_SIZE]) +{ + u32 pad_width[HL_BUSY_ENGINES_MASK_EXT_SIZE] = {}; + + BUILD_BUG_ON(HL_BUSY_ENGINES_MASK_EXT_SIZE != 4); + + pad_width[3] = idle_mask[3] ? 16 : 0; + pad_width[2] = idle_mask[2] || pad_width[3] ? 16 : 0; + pad_width[1] = idle_mask[1] || pad_width[2] ? 16 : 0; + pad_width[0] = idle_mask[0] || pad_width[1] ? 16 : 0; + + dev_err(hdev->dev, "%s (mask %0*llx_%0*llx_%0*llx_%0*llx)\n", + message, pad_width[3], idle_mask[3], pad_width[2], idle_mask[2], + pad_width[1], idle_mask[1], pad_width[0], idle_mask[0]); +} + static void hpriv_release(struct kref *ref) { u64 idle_mask[HL_BUSY_ENGINES_MASK_EXT_SIZE] = {0}; @@ -416,9 +433,8 @@ static void hpriv_release(struct kref *ref) device_is_idle = hdev->asic_funcs->is_device_idle(hdev, idle_mask, HL_BUSY_ENGINES_MASK_EXT_SIZE, NULL); if (!device_is_idle) { - dev_err(hdev->dev, - "device not idle after user context is closed (0x%llx_%llx_%llx_%llx)\n", - idle_mask[3], idle_mask[2], idle_mask[1], idle_mask[0]); + print_idle_status_mask(hdev, "device is not idle after user context is closed", + idle_mask); reset_device = true; } @@ -1673,9 +1689,8 @@ kill_processes: /* If device is not idle fail the reset process */ if (!hdev->asic_funcs->is_device_idle(hdev, idle_mask, - HL_BUSY_ENGINES_MASK_EXT_SIZE, NULL)) { - dev_err(hdev->dev, "device is not idle (mask 0x%llx_%llx_%llx_%llx) after reset\n", - idle_mask[3], idle_mask[2], idle_mask[1], idle_mask[0]); + HL_BUSY_ENGINES_MASK_EXT_SIZE, NULL)) { + print_idle_status_mask(hdev, "device is not idle after reset", idle_mask); rc = -EIO; goto out_err; } -- cgit v1.2.3 From 5354a2a0018345774ab2517fc2fe107a6cd894fa Mon Sep 17 00:00:00 2001 From: Ofir Bitton Date: Tue, 22 Nov 2022 09:59:27 +0200 Subject: habanalabs: fail driver load if EEPROM errors detected In case EEPROM is not burned, firmware sets default EEPROM values. As this is not valid in production, driver should fail load upon any EEPROM error reported by firmware. Signed-off-by: Ofir Bitton Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/firmware_if.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/common/firmware_if.c b/drivers/misc/habanalabs/common/firmware_if.c index cf8147e43833..228b92278e48 100644 --- a/drivers/misc/habanalabs/common/firmware_if.c +++ b/drivers/misc/habanalabs/common/firmware_if.c @@ -617,16 +617,12 @@ static bool fw_report_boot_dev0(struct hl_device *hdev, u32 err_val, if (sts_val & CPU_BOOT_DEV_STS0_ENABLED) dev_dbg(hdev->dev, "Device status0 %#x\n", sts_val); - /* All warnings should go here in order not to reach the unknown error validation */ if (err_val & CPU_BOOT_ERR0_EEPROM_FAIL) { - dev_warn(hdev->dev, - "Device boot warning - EEPROM failure detected, default settings applied\n"); - /* This is a warning so we don't want it to disable the - * device - */ - err_val &= ~CPU_BOOT_ERR0_EEPROM_FAIL; + dev_err(hdev->dev, "Device boot error - EEPROM failure detected\n"); + err_exists = true; } + /* All warnings should go here in order not to reach the unknown error validation */ if (err_val & CPU_BOOT_ERR0_DRAM_SKIPPED) { dev_warn(hdev->dev, "Device boot warning - Skipped DRAM initialization\n"); @@ -2532,7 +2528,7 @@ static int hl_fw_dynamic_init_cpu(struct hl_device *hdev, struct fw_load_mgr *fw_loader) { struct cpu_dyn_regs *dyn_regs; - int rc; + int rc, fw_error_rc; dev_info(hdev->dev, "Loading %sfirmware to device, may take some time...\n", @@ -2632,14 +2628,17 @@ static int hl_fw_dynamic_init_cpu(struct hl_device *hdev, hl_fw_dynamic_update_linux_interrupt_if(hdev); - return 0; - protocol_err: - if (fw_loader->dynamic_loader.fw_desc_valid) - fw_read_errors(hdev, le32_to_cpu(dyn_regs->cpu_boot_err0), + if (fw_loader->dynamic_loader.fw_desc_valid) { + fw_error_rc = fw_read_errors(hdev, le32_to_cpu(dyn_regs->cpu_boot_err0), le32_to_cpu(dyn_regs->cpu_boot_err1), le32_to_cpu(dyn_regs->cpu_boot_dev_sts0), le32_to_cpu(dyn_regs->cpu_boot_dev_sts1)); + + if (fw_error_rc) + return fw_error_rc; + } + return rc; } -- cgit v1.2.3 From 19a17a9fb486b2961dbd7f3fff0d79a144c9a3b6 Mon Sep 17 00:00:00 2001 From: Ohad Sharabi Date: Sun, 20 Nov 2022 15:12:26 +0200 Subject: habanalabs: fix VA range calculation Current implementation is fixing the page size to PAGE_SIZE whereas the input page size may be different. Signed-off-by: Ohad Sharabi Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/memory.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/habanalabs/common/memory.c b/drivers/misc/habanalabs/common/memory.c index 541e1b6a2176..7c5c18be294a 100644 --- a/drivers/misc/habanalabs/common/memory.c +++ b/drivers/misc/habanalabs/common/memory.c @@ -2508,24 +2508,20 @@ static int va_range_init(struct hl_device *hdev, struct hl_va_range **va_ranges, /* * PAGE_SIZE alignment - * it is the callers responsibility to align the addresses if the + * it is the caller's responsibility to align the addresses if the * page size is not a power of 2 */ if (is_power_of_2(page_size)) { - if (start & (PAGE_SIZE - 1)) { - start &= PAGE_MASK; - start += PAGE_SIZE; - } + start = round_up(start, page_size); /* * The end of the range is inclusive, hence we need to align it * to the end of the last full page in the range. For example if * end = 0x3ff5 with page size 0x1000, we need to align it to - * 0x2fff. The remainig 0xff5 bytes do not form a full page. + * 0x2fff. The remaining 0xff5 bytes do not form a full page. */ - if ((end + 1) & (PAGE_SIZE - 1)) - end = ((end + 1) & PAGE_MASK) - 1; + end = round_down(end + 1, page_size) - 1; } if (start >= end) { -- cgit v1.2.3 From 83f47eea742c1152c237398fc040ceba04fc5d76 Mon Sep 17 00:00:00 2001 From: Alexander Usyskin Date: Wed, 16 Nov 2022 14:47:34 +0200 Subject: mei: add timeout to send When driver wakes up the firmware from the low power state, it is sending a memory ready message. The send is done via synchronous/blocking function to ensure that firmware is in ready state. However, in case of firmware undergoing reset send might be block forever. To address this issue a timeout is added to blocking write command on the internal bus. Introduce the __mei_cl_send_timeout function to use instead of __mei_cl_send in cases where timeout is required. The mei_cl_write has only two callers and there is no need to split it into two functions. Signed-off-by: Alexander Usyskin Link: https://lore.kernel.org/r/20221116124735.2493847-2-alexander.usyskin@intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/misc/mei/bus-fixup.c | 7 +++++-- drivers/misc/mei/bus.c | 22 +++++++++++++++++++++- drivers/misc/mei/client.c | 20 ++++++++++++++++---- drivers/misc/mei/client.h | 2 +- drivers/misc/mei/main.c | 2 +- drivers/misc/mei/mei_dev.h | 2 ++ 6 files changed, 46 insertions(+), 9 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/mei/bus-fixup.c b/drivers/misc/mei/bus-fixup.c index 71fbf0bc8453..90023c34666e 100644 --- a/drivers/misc/mei/bus-fixup.c +++ b/drivers/misc/mei/bus-fixup.c @@ -188,17 +188,20 @@ static int mei_fwver(struct mei_cl_device *cldev) return ret; } +#define GFX_MEMORY_READY_TIMEOUT 200 /* timeout in milliseconds */ + static int mei_gfx_memory_ready(struct mei_cl_device *cldev) { struct mkhi_gfx_mem_ready req = {0}; - unsigned int mode = MEI_CL_IO_TX_INTERNAL; + unsigned int mode = MEI_CL_IO_TX_INTERNAL | MEI_CL_IO_TX_BLOCKING; req.hdr.group_id = MKHI_GROUP_ID_GFX; req.hdr.command = MKHI_GFX_MEMORY_READY_CMD_REQ; req.flags = MKHI_GFX_MEM_READY_PXP_ALLOWED; dev_dbg(&cldev->dev, "Sending memory ready command\n"); - return __mei_cl_send(cldev->cl, (u8 *)&req, sizeof(req), 0, mode); + return __mei_cl_send_timeout(cldev->cl, (u8 *)&req, sizeof(req), 0, + mode, GFX_MEMORY_READY_TIMEOUT); } static void mei_mkhi_fix(struct mei_cl_device *cldev) diff --git a/drivers/misc/mei/bus.c b/drivers/misc/mei/bus.c index 46aa3554e97b..fdb5f7331695 100644 --- a/drivers/misc/mei/bus.c +++ b/drivers/misc/mei/bus.c @@ -33,6 +33,26 @@ */ ssize_t __mei_cl_send(struct mei_cl *cl, const u8 *buf, size_t length, u8 vtag, unsigned int mode) +{ + return __mei_cl_send_timeout(cl, buf, length, vtag, mode, MAX_SCHEDULE_TIMEOUT); +} + +/** + * __mei_cl_send_timeout - internal client send (write) + * + * @cl: host client + * @buf: buffer to send + * @length: buffer length + * @vtag: virtual tag + * @mode: sending mode + * @timeout: send timeout in milliseconds. + * effective only for blocking writes: the MEI_CL_IO_TX_BLOCKING mode bit is set. + * set timeout to the MAX_SCHEDULE_TIMEOUT to maixum allowed wait. + * + * Return: written size bytes or < 0 on error + */ +ssize_t __mei_cl_send_timeout(struct mei_cl *cl, const u8 *buf, size_t length, u8 vtag, + unsigned int mode, unsigned long timeout) { struct mei_device *bus; struct mei_cl_cb *cb; @@ -101,7 +121,7 @@ ssize_t __mei_cl_send(struct mei_cl *cl, const u8 *buf, size_t length, u8 vtag, cb->blocking = !!(mode & MEI_CL_IO_TX_BLOCKING); memcpy(cb->buf.data, buf, length); - rets = mei_cl_write(cl, cb); + rets = mei_cl_write(cl, cb, timeout); out: mutex_unlock(&bus->device_lock); diff --git a/drivers/misc/mei/client.c b/drivers/misc/mei/client.c index 0b2fbe1335a7..b4c104907ce2 100644 --- a/drivers/misc/mei/client.c +++ b/drivers/misc/mei/client.c @@ -1926,10 +1926,13 @@ err: * * @cl: host client * @cb: write callback with filled data + * @timeout: send timeout in milliseconds. + * effective only for blocking writes: the cb->blocking is set. + * set timeout to the MAX_SCHEDULE_TIMEOUT to maixum allowed wait. * * Return: number of bytes sent on success, <0 on failure. */ -ssize_t mei_cl_write(struct mei_cl *cl, struct mei_cl_cb *cb) +ssize_t mei_cl_write(struct mei_cl *cl, struct mei_cl_cb *cb, unsigned long timeout) { struct mei_device *dev; struct mei_msg_data *buf; @@ -2056,11 +2059,20 @@ out: if (blocking && cl->writing_state != MEI_WRITE_COMPLETE) { mutex_unlock(&dev->device_lock); - rets = wait_event_interruptible(cl->tx_wait, - cl->writing_state == MEI_WRITE_COMPLETE || - (!mei_cl_is_connected(cl))); + rets = wait_event_interruptible_timeout(cl->tx_wait, + cl->writing_state == MEI_WRITE_COMPLETE || + (!mei_cl_is_connected(cl)), + msecs_to_jiffies(timeout)); mutex_lock(&dev->device_lock); + /* clean all queue on timeout as something fatal happened */ + if (rets == 0) { + rets = -ETIME; + mei_io_tx_list_free_cl(&dev->write_list, cl, NULL); + mei_io_tx_list_free_cl(&dev->write_waiting_list, cl, NULL); + } /* wait_event_interruptible returns -ERESTARTSYS */ + if (rets > 0) + rets = 0; if (rets) { if (signal_pending(current)) rets = -EINTR; diff --git a/drivers/misc/mei/client.h b/drivers/misc/mei/client.h index 418056fb1489..9052860bcfe0 100644 --- a/drivers/misc/mei/client.h +++ b/drivers/misc/mei/client.h @@ -246,7 +246,7 @@ int mei_cl_connect(struct mei_cl *cl, struct mei_me_client *me_cl, int mei_cl_irq_connect(struct mei_cl *cl, struct mei_cl_cb *cb, struct list_head *cmpl_list); int mei_cl_read_start(struct mei_cl *cl, size_t length, const struct file *fp); -ssize_t mei_cl_write(struct mei_cl *cl, struct mei_cl_cb *cb); +ssize_t mei_cl_write(struct mei_cl *cl, struct mei_cl_cb *cb, unsigned long timeout); int mei_cl_irq_write(struct mei_cl *cl, struct mei_cl_cb *cb, struct list_head *cmpl_list); diff --git a/drivers/misc/mei/main.c b/drivers/misc/mei/main.c index 930887e7e38d..632d4ae21e46 100644 --- a/drivers/misc/mei/main.c +++ b/drivers/misc/mei/main.c @@ -383,7 +383,7 @@ static ssize_t mei_write(struct file *file, const char __user *ubuf, goto out; } - rets = mei_cl_write(cl, cb); + rets = mei_cl_write(cl, cb, MAX_SCHEDULE_TIMEOUT); out: mutex_unlock(&dev->device_lock); return rets; diff --git a/drivers/misc/mei/mei_dev.h b/drivers/misc/mei/mei_dev.h index 6bb3e1ba9ded..c1618a44c5a4 100644 --- a/drivers/misc/mei/mei_dev.h +++ b/drivers/misc/mei/mei_dev.h @@ -373,6 +373,8 @@ void mei_cl_bus_rescan_work(struct work_struct *work); void mei_cl_bus_dev_fixup(struct mei_cl_device *dev); ssize_t __mei_cl_send(struct mei_cl *cl, const u8 *buf, size_t length, u8 vtag, unsigned int mode); +ssize_t __mei_cl_send_timeout(struct mei_cl *cl, const u8 *buf, size_t length, u8 vtag, + unsigned int mode, unsigned long timeout); ssize_t __mei_cl_recv(struct mei_cl *cl, u8 *buf, size_t length, u8 *vtag, unsigned int mode, unsigned long timeout); bool mei_cl_bus_rx_event(struct mei_cl *cl); -- cgit v1.2.3 From 0ef77698b85603d21453daf32ae70f76ae62ccae Mon Sep 17 00:00:00 2001 From: Alexander Usyskin Date: Wed, 16 Nov 2022 14:47:35 +0200 Subject: mei: bus-fixup: change pxp mode only if message was sent Move PXP mode state machine to SETUP mode only if memory ready message sent successfully to the firmware. Leave it in INIT mode otherwise to allow try to send message later. Signed-off-by: Alexander Usyskin Link: https://lore.kernel.org/r/20221116124735.2493847-3-alexander.usyskin@intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/misc/mei/bus-fixup.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/mei/bus-fixup.c b/drivers/misc/mei/bus-fixup.c index 90023c34666e..6df7679d9739 100644 --- a/drivers/misc/mei/bus-fixup.c +++ b/drivers/misc/mei/bus-fixup.c @@ -266,12 +266,13 @@ static void mei_gsc_mkhi_fix_ver(struct mei_cl_device *cldev) if (cldev->bus->pxp_mode == MEI_DEV_PXP_INIT) { ret = mei_gfx_memory_ready(cldev); - if (ret < 0) + if (ret < 0) { dev_err(&cldev->dev, "memory ready command failed %d\n", ret); - else + } else { dev_dbg(&cldev->dev, "memory ready command sent\n"); + cldev->bus->pxp_mode = MEI_DEV_PXP_SETUP; + } /* we go to reset after that */ - cldev->bus->pxp_mode = MEI_DEV_PXP_SETUP; goto out; } -- cgit v1.2.3 From a4cb1004aeed2ab893a058fad00a5b41a12c4691 Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Fri, 11 Nov 2022 22:59:29 +0800 Subject: misc: ocxl: fix possible name leak in ocxl_file_register_afu() If device_register() returns error in ocxl_file_register_afu(), the name allocated by dev_set_name() need be freed. As comment of device_register() says, it should use put_device() to give up the reference in the error path. So fix this by calling put_device(), then the name can be freed in kobject_cleanup(), and info is freed in info_release(). Fixes: 75ca758adbaf ("ocxl: Create a clear delineation between ocxl backend & frontend") Signed-off-by: Yang Yingliang Acked-by: Andrew Donnellan Acked-by: Frederic Barrat Link: https://lore.kernel.org/r/20221111145929.2429271-1-yangyingliang@huawei.com Signed-off-by: Greg Kroah-Hartman --- drivers/misc/ocxl/file.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/ocxl/file.c b/drivers/misc/ocxl/file.c index d46dba2df5a1..452d5777a0e4 100644 --- a/drivers/misc/ocxl/file.c +++ b/drivers/misc/ocxl/file.c @@ -541,8 +541,11 @@ int ocxl_file_register_afu(struct ocxl_afu *afu) goto err_put; rc = device_register(&info->dev); - if (rc) - goto err_put; + if (rc) { + free_minor(info); + put_device(&info->dev); + return rc; + } rc = ocxl_sysfs_register_afu(info); if (rc) -- cgit v1.2.3 From 27158c72678b39ee01cc01de1aba6b51c71abe2f Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Mon, 21 Nov 2022 23:43:39 +0800 Subject: ocxl: fix pci device refcount leak when calling get_function_0() get_function_0() calls pci_get_domain_bus_and_slot(), as comment says, it returns a pci device with refcount increment, so after using it, pci_dev_put() needs be called. Get the device reference when get_function_0() is not called, so pci_dev_put() can be called in the error path and callers unconditionally. And add comment above get_dvsec_vendor0() to tell callers to call pci_dev_put(). Fixes: 87db7579ebd5 ("ocxl: control via sysfs whether the FPGA is reloaded on a link reset") Suggested-by: Andrew Donnellan Signed-off-by: Yang Yingliang Acked-by: Andrew Donnellan Link: https://lore.kernel.org/r/20221121154339.4088935-1-yangyingliang@huawei.com Signed-off-by: Greg Kroah-Hartman --- drivers/misc/ocxl/config.c | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) (limited to 'drivers/misc') diff --git a/drivers/misc/ocxl/config.c b/drivers/misc/ocxl/config.c index e401a51596b9..92ab49705f64 100644 --- a/drivers/misc/ocxl/config.c +++ b/drivers/misc/ocxl/config.c @@ -193,6 +193,18 @@ static int read_dvsec_vendor(struct pci_dev *dev) return 0; } +/** + * get_dvsec_vendor0() - Find a related PCI device (function 0) + * @dev: PCI device to match + * @dev0: The PCI device (function 0) found + * @out_pos: The position of PCI device (function 0) + * + * Returns 0 on success, negative on failure. + * + * NOTE: If it's successful, the reference of dev0 is increased, + * so after using it, the callers must call pci_dev_put() to give + * up the reference. + */ static int get_dvsec_vendor0(struct pci_dev *dev, struct pci_dev **dev0, int *out_pos) { @@ -202,10 +214,14 @@ static int get_dvsec_vendor0(struct pci_dev *dev, struct pci_dev **dev0, dev = get_function_0(dev); if (!dev) return -1; + } else { + dev = pci_dev_get(dev); } pos = find_dvsec(dev, OCXL_DVSEC_VENDOR_ID); - if (!pos) + if (!pos) { + pci_dev_put(dev); return -1; + } *dev0 = dev; *out_pos = pos; return 0; @@ -222,6 +238,7 @@ int ocxl_config_get_reset_reload(struct pci_dev *dev, int *val) pci_read_config_dword(dev0, pos + OCXL_DVSEC_VENDOR_RESET_RELOAD, &reset_reload); + pci_dev_put(dev0); *val = !!(reset_reload & BIT(0)); return 0; } @@ -243,6 +260,7 @@ int ocxl_config_set_reset_reload(struct pci_dev *dev, int val) reset_reload &= ~BIT(0); pci_write_config_dword(dev0, pos + OCXL_DVSEC_VENDOR_RESET_RELOAD, reset_reload); + pci_dev_put(dev0); return 0; } -- cgit v1.2.3 From fd2c930cf6a5b9176382c15f9acb1996e76e25ad Mon Sep 17 00:00:00 2001 From: ruanjinjie Date: Thu, 17 Nov 2022 14:47:25 +0800 Subject: misc: tifm: fix possible memory leak in tifm_7xx1_switch_media() If device_register() returns error in tifm_7xx1_switch_media(), name of kobject which is allocated in dev_set_name() called in device_add() is leaked. Never directly free @dev after calling device_register(), even if it returned an error! Always use put_device() to give up the reference initialized. Fixes: 2428a8fe2261 ("tifm: move common device management tasks from tifm_7xx1 to tifm_core") Signed-off-by: ruanjinjie Link: https://lore.kernel.org/r/20221117064725.3478402-1-ruanjinjie@huawei.com Signed-off-by: Greg Kroah-Hartman --- drivers/misc/tifm_7xx1.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/misc') diff --git a/drivers/misc/tifm_7xx1.c b/drivers/misc/tifm_7xx1.c index 017c2f7d6287..7dd86a9858ab 100644 --- a/drivers/misc/tifm_7xx1.c +++ b/drivers/misc/tifm_7xx1.c @@ -190,7 +190,7 @@ static void tifm_7xx1_switch_media(struct work_struct *work) spin_unlock_irqrestore(&fm->lock, flags); } if (sock) - tifm_free_device(&sock->dev); + put_device(&sock->dev); } spin_lock_irqsave(&fm->lock, flags); } -- cgit v1.2.3 From 643a16a0eb1d6ac23744bb6e90a00fc21148a9dc Mon Sep 17 00:00:00 2001 From: Zheng Wang Date: Thu, 10 Nov 2022 11:50:33 +0800 Subject: misc: sgi-gru: fix use-after-free error in gru_set_context_option, gru_fault and gru_handle_user_call_os In some bad situation, the gts may be freed gru_check_chiplet_assignment. The call chain can be gru_unload_context->gru_free_gru_context->gts_drop and kfree finally. However, the caller didn't know if the gts is freed or not and use it afterwards. This will trigger a Use after Free bug. Fix it by introducing a return value to see if it's in error path or not. Free the gts in caller if gru_check_chiplet_assignment check failed. Fixes: 55484c45dbec ("gru: allow users to specify gru chiplet 2") Signed-off-by: Zheng Wang Acked-by: Dimitri Sivanich Link: https://lore.kernel.org/r/20221110035033.19498-1-zyytlz.wz@163.com Signed-off-by: Greg Kroah-Hartman --- drivers/misc/sgi-gru/grufault.c | 13 +++++++++++-- drivers/misc/sgi-gru/grumain.c | 22 ++++++++++++++++++---- drivers/misc/sgi-gru/grutables.h | 2 +- 3 files changed, 30 insertions(+), 7 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/sgi-gru/grufault.c b/drivers/misc/sgi-gru/grufault.c index d7ef61e602ed..b836936e9747 100644 --- a/drivers/misc/sgi-gru/grufault.c +++ b/drivers/misc/sgi-gru/grufault.c @@ -648,6 +648,7 @@ int gru_handle_user_call_os(unsigned long cb) if ((cb & (GRU_HANDLE_STRIDE - 1)) || ucbnum >= GRU_NUM_CB) return -EINVAL; +again: gts = gru_find_lock_gts(cb); if (!gts) return -EINVAL; @@ -656,7 +657,11 @@ int gru_handle_user_call_os(unsigned long cb) if (ucbnum >= gts->ts_cbr_au_count * GRU_CBR_AU_SIZE) goto exit; - gru_check_context_placement(gts); + if (gru_check_context_placement(gts)) { + gru_unlock_gts(gts); + gru_unload_context(gts, 1); + goto again; + } /* * CCH may contain stale data if ts_force_cch_reload is set. @@ -874,7 +879,11 @@ int gru_set_context_option(unsigned long arg) } else { gts->ts_user_blade_id = req.val1; gts->ts_user_chiplet_id = req.val0; - gru_check_context_placement(gts); + if (gru_check_context_placement(gts)) { + gru_unlock_gts(gts); + gru_unload_context(gts, 1); + return ret; + } } break; case sco_gseg_owner: diff --git a/drivers/misc/sgi-gru/grumain.c b/drivers/misc/sgi-gru/grumain.c index 6706ef3c5977..4eb4b9455139 100644 --- a/drivers/misc/sgi-gru/grumain.c +++ b/drivers/misc/sgi-gru/grumain.c @@ -716,9 +716,10 @@ static int gru_check_chiplet_assignment(struct gru_state *gru, * chiplet. Misassignment can occur if the process migrates to a different * blade or if the user changes the selected blade/chiplet. */ -void gru_check_context_placement(struct gru_thread_state *gts) +int gru_check_context_placement(struct gru_thread_state *gts) { struct gru_state *gru; + int ret = 0; /* * If the current task is the context owner, verify that the @@ -726,15 +727,23 @@ void gru_check_context_placement(struct gru_thread_state *gts) * references. Pthread apps use non-owner references to the CBRs. */ gru = gts->ts_gru; + /* + * If gru or gts->ts_tgid_owner isn't initialized properly, return + * success to indicate that the caller does not need to unload the + * gru context.The caller is responsible for their inspection and + * reinitialization if needed. + */ if (!gru || gts->ts_tgid_owner != current->tgid) - return; + return ret; if (!gru_check_chiplet_assignment(gru, gts)) { STAT(check_context_unload); - gru_unload_context(gts, 1); + ret = -EINVAL; } else if (gru_retarget_intr(gts)) { STAT(check_context_retarget_intr); } + + return ret; } @@ -934,7 +943,12 @@ again: mutex_lock(>s->ts_ctxlock); preempt_disable(); - gru_check_context_placement(gts); + if (gru_check_context_placement(gts)) { + preempt_enable(); + mutex_unlock(>s->ts_ctxlock); + gru_unload_context(gts, 1); + return VM_FAULT_NOPAGE; + } if (!gts->ts_gru) { STAT(load_user_context); diff --git a/drivers/misc/sgi-gru/grutables.h b/drivers/misc/sgi-gru/grutables.h index 8c52776db234..640daf1994df 100644 --- a/drivers/misc/sgi-gru/grutables.h +++ b/drivers/misc/sgi-gru/grutables.h @@ -632,7 +632,7 @@ extern int gru_user_flush_tlb(unsigned long arg); extern int gru_user_unload_context(unsigned long arg); extern int gru_get_exception_detail(unsigned long arg); extern int gru_set_context_option(unsigned long address); -extern void gru_check_context_placement(struct gru_thread_state *gts); +extern int gru_check_context_placement(struct gru_thread_state *gts); extern int gru_cpu_fault_map_id(void); extern struct vm_area_struct *gru_find_vma(unsigned long vaddr); extern void gru_flush_all_tlb(struct gru_state *gru); -- cgit v1.2.3 From 7198cf0f1ca90581f27452664216a662ad72aed5 Mon Sep 17 00:00:00 2001 From: Uwe Kleine-König Date: Fri, 18 Nov 2022 23:43:48 +0100 Subject: misc: lis3lv02d/lis3lv02d_i2c: Convert to i2c's .probe_new() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The probe function doesn't make use of the i2c_device_id * parameter so it can be trivially converted. Signed-off-by: Uwe Kleine-König Link: https://lore.kernel.org/r/20221118224540.619276-495-uwe@kleine-koenig.org Signed-off-by: Greg Kroah-Hartman --- drivers/misc/lis3lv02d/lis3lv02d_i2c.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/lis3lv02d/lis3lv02d_i2c.c b/drivers/misc/lis3lv02d/lis3lv02d_i2c.c index d7daa01fe7ca..7071412d6bf6 100644 --- a/drivers/misc/lis3lv02d/lis3lv02d_i2c.c +++ b/drivers/misc/lis3lv02d/lis3lv02d_i2c.c @@ -100,8 +100,7 @@ static const struct of_device_id lis3lv02d_i2c_dt_ids[] = { MODULE_DEVICE_TABLE(of, lis3lv02d_i2c_dt_ids); #endif -static int lis3lv02d_i2c_probe(struct i2c_client *client, - const struct i2c_device_id *id) +static int lis3lv02d_i2c_probe(struct i2c_client *client) { int ret = 0; struct lis3lv02d_platform_data *pdata = client->dev.platform_data; @@ -263,7 +262,7 @@ static struct i2c_driver lis3lv02d_i2c_driver = { .pm = &lis3_pm_ops, .of_match_table = of_match_ptr(lis3lv02d_i2c_dt_ids), }, - .probe = lis3lv02d_i2c_probe, + .probe_new = lis3lv02d_i2c_probe, .remove = lis3lv02d_i2c_remove, .id_table = lis3lv02d_id, }; -- cgit v1.2.3 From 59ee8ca4eeda35d850e4c81ec3065dba10023842 Mon Sep 17 00:00:00 2001 From: Uwe Kleine-König Date: Fri, 18 Nov 2022 23:43:41 +0100 Subject: misc: eeprom/eeprom: Convert to i2c's .probe_new() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The probe function doesn't make use of the i2c_device_id * parameter so it can be trivially converted. Signed-off-by: Uwe Kleine-König Link: https://lore.kernel.org/r/20221118224540.619276-488-uwe@kleine-koenig.org Signed-off-by: Greg Kroah-Hartman --- drivers/misc/eeprom/eeprom.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/eeprom/eeprom.c b/drivers/misc/eeprom/eeprom.c index 8a841a75d893..32611100d5cd 100644 --- a/drivers/misc/eeprom/eeprom.c +++ b/drivers/misc/eeprom/eeprom.c @@ -141,8 +141,7 @@ static int eeprom_detect(struct i2c_client *client, struct i2c_board_info *info) return 0; } -static int eeprom_probe(struct i2c_client *client, - const struct i2c_device_id *id) +static int eeprom_probe(struct i2c_client *client) { struct i2c_adapter *adapter = client->adapter; struct eeprom_data *data; @@ -197,7 +196,7 @@ static struct i2c_driver eeprom_driver = { .driver = { .name = "eeprom", }, - .probe = eeprom_probe, + .probe_new = eeprom_probe, .remove = eeprom_remove, .id_table = eeprom_id, -- cgit v1.2.3 From 8427bd8bdee8f35797cade56fe173fbea990e38c Mon Sep 17 00:00:00 2001 From: Uwe Kleine-König Date: Fri, 18 Nov 2022 23:43:49 +0100 Subject: misc: tsl2550: Convert to i2c's .probe_new() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The probe function doesn't make use of the i2c_device_id * parameter so it can be trivially converted. Signed-off-by: Uwe Kleine-König Link: https://lore.kernel.org/r/20221118224540.619276-496-uwe@kleine-koenig.org Signed-off-by: Greg Kroah-Hartman --- drivers/misc/tsl2550.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/tsl2550.c b/drivers/misc/tsl2550.c index 1652fb9b3856..6c62b94e0acd 100644 --- a/drivers/misc/tsl2550.c +++ b/drivers/misc/tsl2550.c @@ -331,8 +331,7 @@ static int tsl2550_init_client(struct i2c_client *client) */ static struct i2c_driver tsl2550_driver; -static int tsl2550_probe(struct i2c_client *client, - const struct i2c_device_id *id) +static int tsl2550_probe(struct i2c_client *client) { struct i2c_adapter *adapter = client->adapter; struct tsl2550_data *data; @@ -438,7 +437,7 @@ static struct i2c_driver tsl2550_driver = { .of_match_table = tsl2550_of_match, .pm = TSL2550_PM_OPS, }, - .probe = tsl2550_probe, + .probe_new = tsl2550_probe, .remove = tsl2550_remove, .id_table = tsl2550_id, }; -- cgit v1.2.3 From 327e1ad186d91b12b6ece0b21178c07edef01806 Mon Sep 17 00:00:00 2001 From: Uwe Kleine-König Date: Fri, 18 Nov 2022 23:43:47 +0100 Subject: misc: isl29020: Convert to i2c's .probe_new() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The probe function doesn't make use of the i2c_device_id * parameter so it can be trivially converted. Signed-off-by: Uwe Kleine-König Link: https://lore.kernel.org/r/20221118224540.619276-494-uwe@kleine-koenig.org Signed-off-by: Greg Kroah-Hartman --- drivers/misc/isl29020.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/isl29020.c b/drivers/misc/isl29020.c index c6f2a94f501a..3be02093368c 100644 --- a/drivers/misc/isl29020.c +++ b/drivers/misc/isl29020.c @@ -151,8 +151,7 @@ static int als_set_default_config(struct i2c_client *client) return 0; } -static int isl29020_probe(struct i2c_client *client, - const struct i2c_device_id *id) +static int isl29020_probe(struct i2c_client *client) { int res; @@ -215,7 +214,7 @@ static struct i2c_driver isl29020_driver = { .name = "isl29020", .pm = ISL29020_PM_OPS, }, - .probe = isl29020_probe, + .probe_new = isl29020_probe, .remove = isl29020_remove, .id_table = isl29020_id, }; -- cgit v1.2.3 From 99b0cb3f5f8d67f4552c24d9b0aa6cda38f558aa Mon Sep 17 00:00:00 2001 From: Uwe Kleine-König Date: Fri, 18 Nov 2022 23:43:43 +0100 Subject: misc: eeprom/max6875: Convert to i2c's .probe_new() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The probe function doesn't make use of the i2c_device_id * parameter so it can be trivially converted. Signed-off-by: Uwe Kleine-König Link: https://lore.kernel.org/r/20221118224540.619276-490-uwe@kleine-koenig.org Signed-off-by: Greg Kroah-Hartman --- drivers/misc/eeprom/max6875.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/eeprom/max6875.c b/drivers/misc/eeprom/max6875.c index 6bd4f4339af4..79cf8afcef2e 100644 --- a/drivers/misc/eeprom/max6875.c +++ b/drivers/misc/eeprom/max6875.c @@ -130,8 +130,7 @@ static const struct bin_attribute user_eeprom_attr = { .read = max6875_read, }; -static int max6875_probe(struct i2c_client *client, - const struct i2c_device_id *id) +static int max6875_probe(struct i2c_client *client) { struct i2c_adapter *adapter = client->adapter; struct max6875_data *data; @@ -193,7 +192,7 @@ static struct i2c_driver max6875_driver = { .driver = { .name = "max6875", }, - .probe = max6875_probe, + .probe_new = max6875_probe, .remove = max6875_remove, .id_table = max6875_id, }; -- cgit v1.2.3 From 654700c9fc2860d33d57b42fd39cae2310dbc2ba Mon Sep 17 00:00:00 2001 From: Uwe Kleine-König Date: Fri, 18 Nov 2022 23:43:44 +0100 Subject: misc: hmc6352: Convert to i2c's .probe_new() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The probe function doesn't make use of the i2c_device_id * parameter so it can be trivially converted. Signed-off-by: Uwe Kleine-König Link: https://lore.kernel.org/r/20221118224540.619276-491-uwe@kleine-koenig.org Signed-off-by: Greg Kroah-Hartman --- drivers/misc/hmc6352.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/hmc6352.c b/drivers/misc/hmc6352.c index 42b9adef28a3..8967940ecd1e 100644 --- a/drivers/misc/hmc6352.c +++ b/drivers/misc/hmc6352.c @@ -101,8 +101,7 @@ static const struct attribute_group m_compass_gr = { .attrs = mid_att_compass }; -static int hmc6352_probe(struct i2c_client *client, - const struct i2c_device_id *id) +static int hmc6352_probe(struct i2c_client *client) { int res; @@ -132,7 +131,7 @@ static struct i2c_driver hmc6352_driver = { .driver = { .name = "hmc6352", }, - .probe = hmc6352_probe, + .probe_new = hmc6352_probe, .remove = hmc6352_remove, .id_table = hmc6352_id, }; -- cgit v1.2.3 From 9c18dad44dc1de202a69c8ccef983e6070740acd Mon Sep 17 00:00:00 2001 From: Uwe Kleine-König Date: Fri, 18 Nov 2022 23:43:45 +0100 Subject: misc: ics932s401: Convert to i2c's .probe_new() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The probe function doesn't make use of the i2c_device_id * parameter so it can be trivially converted. Signed-off-by: Uwe Kleine-König Link: https://lore.kernel.org/r/20221118224540.619276-492-uwe@kleine-koenig.org Signed-off-by: Greg Kroah-Hartman --- drivers/misc/ics932s401.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/ics932s401.c b/drivers/misc/ics932s401.c index 1cb71df966a4..12108a7b9b40 100644 --- a/drivers/misc/ics932s401.c +++ b/drivers/misc/ics932s401.c @@ -89,8 +89,7 @@ struct ics932s401_data { u8 regs[NUM_REGS]; }; -static int ics932s401_probe(struct i2c_client *client, - const struct i2c_device_id *id); +static int ics932s401_probe(struct i2c_client *client); static int ics932s401_detect(struct i2c_client *client, struct i2c_board_info *info); static void ics932s401_remove(struct i2c_client *client); @@ -106,7 +105,7 @@ static struct i2c_driver ics932s401_driver = { .driver = { .name = "ics932s401", }, - .probe = ics932s401_probe, + .probe_new = ics932s401_probe, .remove = ics932s401_remove, .id_table = ics932s401_id, .detect = ics932s401_detect, @@ -429,8 +428,7 @@ static int ics932s401_detect(struct i2c_client *client, return 0; } -static int ics932s401_probe(struct i2c_client *client, - const struct i2c_device_id *id) +static int ics932s401_probe(struct i2c_client *client) { struct ics932s401_data *data; int err; -- cgit v1.2.3 From db687ce71845aeb639be7452f4d8a272cf190cd1 Mon Sep 17 00:00:00 2001 From: Uwe Kleine-König Date: Fri, 18 Nov 2022 23:43:46 +0100 Subject: misc: isl29003: Convert to i2c's .probe_new() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The probe function doesn't make use of the i2c_device_id * parameter so it can be trivially converted. Signed-off-by: Uwe Kleine-König Link: https://lore.kernel.org/r/20221118224540.619276-493-uwe@kleine-koenig.org Signed-off-by: Greg Kroah-Hartman --- drivers/misc/isl29003.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/isl29003.c b/drivers/misc/isl29003.c index 8ab61be79c76..aeda2fa89e61 100644 --- a/drivers/misc/isl29003.c +++ b/drivers/misc/isl29003.c @@ -374,8 +374,7 @@ static int isl29003_init_client(struct i2c_client *client) * I2C layer */ -static int isl29003_probe(struct i2c_client *client, - const struct i2c_device_id *id) +static int isl29003_probe(struct i2c_client *client) { struct i2c_adapter *adapter = client->adapter; struct isl29003_data *data; @@ -460,7 +459,7 @@ static struct i2c_driver isl29003_driver = { .name = ISL29003_DRV_NAME, .pm = ISL29003_PM_OPS, }, - .probe = isl29003_probe, + .probe_new = isl29003_probe, .remove = isl29003_remove, .id_table = isl29003_id, }; -- cgit v1.2.3 From 244179dbe11e707a0ef596246a9b80327492fc35 Mon Sep 17 00:00:00 2001 From: Uwe Kleine-König Date: Fri, 18 Nov 2022 23:43:42 +0100 Subject: misc: eeprom/idt_89hpesx: Convert to i2c's .probe_new() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The probe function doesn't make use of the i2c_device_id * parameter so it can be trivially converted. Signed-off-by: Uwe Kleine-König Link: https://lore.kernel.org/r/20221118224540.619276-489-uwe@kleine-koenig.org Signed-off-by: Greg Kroah-Hartman --- drivers/misc/eeprom/idt_89hpesx.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/eeprom/idt_89hpesx.c b/drivers/misc/eeprom/idt_89hpesx.c index bb3ed352b95f..4e07ee9cb500 100644 --- a/drivers/misc/eeprom/idt_89hpesx.c +++ b/drivers/misc/eeprom/idt_89hpesx.c @@ -1366,7 +1366,7 @@ static void idt_remove_dbgfs_files(struct idt_89hpesx_dev *pdev) /* * idt_probe() - IDT 89HPESx driver probe() callback method */ -static int idt_probe(struct i2c_client *client, const struct i2c_device_id *id) +static int idt_probe(struct i2c_client *client) { struct idt_89hpesx_dev *pdev; int ret; @@ -1556,7 +1556,7 @@ static struct i2c_driver idt_driver = { .name = IDT_NAME, .of_match_table = idt_of_match, }, - .probe = idt_probe, + .probe_new = idt_probe, .remove = idt_remove, .id_table = idt_ids, }; -- cgit v1.2.3 From 6757c6480d7f34cb272d28339dfac096b94c8638 Mon Sep 17 00:00:00 2001 From: Uwe Kleine-König Date: Fri, 18 Nov 2022 23:43:38 +0100 Subject: misc: apds990x: Convert to i2c's .probe_new() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The probe function doesn't make use of the i2c_device_id * parameter so it can be trivially converted. Signed-off-by: Uwe Kleine-König Link: https://lore.kernel.org/r/20221118224540.619276-485-uwe@kleine-koenig.org Signed-off-by: Greg Kroah-Hartman --- drivers/misc/apds990x.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/apds990x.c b/drivers/misc/apds990x.c index e2100cc42ce8..0024503ea6db 100644 --- a/drivers/misc/apds990x.c +++ b/drivers/misc/apds990x.c @@ -1051,8 +1051,7 @@ static const struct attribute_group apds990x_attribute_group[] = { {.attrs = sysfs_attrs_ctrl }, }; -static int apds990x_probe(struct i2c_client *client, - const struct i2c_device_id *id) +static int apds990x_probe(struct i2c_client *client) { struct apds990x_chip *chip; int err; @@ -1272,7 +1271,7 @@ static struct i2c_driver apds990x_driver = { .name = "apds990x", .pm = &apds990x_pm_ops, }, - .probe = apds990x_probe, + .probe_new = apds990x_probe, .remove = apds990x_remove, .id_table = apds990x_id, }; -- cgit v1.2.3 From 9f28b675c160519c79daed9f73bc38ab3d6c9015 Mon Sep 17 00:00:00 2001 From: Uwe Kleine-König Date: Fri, 18 Nov 2022 23:43:37 +0100 Subject: misc: apds9802als: Convert to i2c's .probe_new() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The probe function doesn't make use of the i2c_device_id * parameter so it can be trivially converted. Signed-off-by: Uwe Kleine-König Link: https://lore.kernel.org/r/20221118224540.619276-484-uwe@kleine-koenig.org Signed-off-by: Greg Kroah-Hartman --- drivers/misc/apds9802als.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/apds9802als.c b/drivers/misc/apds9802als.c index a32431f4b370..0526c55d5cd5 100644 --- a/drivers/misc/apds9802als.c +++ b/drivers/misc/apds9802als.c @@ -212,8 +212,7 @@ static int als_set_default_config(struct i2c_client *client) return ret_val; } -static int apds9802als_probe(struct i2c_client *client, - const struct i2c_device_id *id) +static int apds9802als_probe(struct i2c_client *client) { int res; struct als_data *data; @@ -297,7 +296,7 @@ static struct i2c_driver apds9802als_driver = { .name = DRIVER_NAME, .pm = APDS9802ALS_PM_OPS, }, - .probe = apds9802als_probe, + .probe_new = apds9802als_probe, .remove = apds9802als_remove, .id_table = apds9802als_id, }; -- cgit v1.2.3 From 781edb0530a1009f89e7888726ca87b255d2526b Mon Sep 17 00:00:00 2001 From: Uwe Kleine-König Date: Fri, 18 Nov 2022 23:43:39 +0100 Subject: misc: bh1770glc: Convert to i2c's .probe_new() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The probe function doesn't make use of the i2c_device_id * parameter so it can be trivially converted. Signed-off-by: Uwe Kleine-König Link: https://lore.kernel.org/r/20221118224540.619276-486-uwe@kleine-koenig.org Signed-off-by: Greg Kroah-Hartman --- drivers/misc/bh1770glc.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/bh1770glc.c b/drivers/misc/bh1770glc.c index d0dfa674414c..bedbe0efb330 100644 --- a/drivers/misc/bh1770glc.c +++ b/drivers/misc/bh1770glc.c @@ -1162,8 +1162,7 @@ static const struct attribute_group bh1770_attribute_group = { .attrs = sysfs_attrs }; -static int bh1770_probe(struct i2c_client *client, - const struct i2c_device_id *id) +static int bh1770_probe(struct i2c_client *client) { struct bh1770_chip *chip; int err; @@ -1379,7 +1378,7 @@ static struct i2c_driver bh1770_driver = { .name = "bh1770glc", .pm = &bh1770_pm_ops, }, - .probe = bh1770_probe, + .probe_new = bh1770_probe, .remove = bh1770_remove, .id_table = bh1770_id, }; -- cgit v1.2.3 From 3127a86a3702bd3a2ff43503d49919d666739ae8 Mon Sep 17 00:00:00 2001 From: Uwe Kleine-König Date: Fri, 18 Nov 2022 23:43:40 +0100 Subject: misc: ds1682: Convert to i2c's .probe_new() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The probe function doesn't make use of the i2c_device_id * parameter so it can be trivially converted. Signed-off-by: Uwe Kleine-König Link: https://lore.kernel.org/r/20221118224540.619276-487-uwe@kleine-koenig.org Signed-off-by: Greg Kroah-Hartman --- drivers/misc/ds1682.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/ds1682.c b/drivers/misc/ds1682.c index 0698ddc5f4d5..d517eed32971 100644 --- a/drivers/misc/ds1682.c +++ b/drivers/misc/ds1682.c @@ -200,8 +200,7 @@ static const struct bin_attribute ds1682_eeprom_attr = { /* * Called when a ds1682 device is matched with this driver */ -static int ds1682_probe(struct i2c_client *client, - const struct i2c_device_id *id) +static int ds1682_probe(struct i2c_client *client) { int rc; @@ -251,7 +250,7 @@ static struct i2c_driver ds1682_driver = { .name = "ds1682", .of_match_table = ds1682_of_match, }, - .probe = ds1682_probe, + .probe_new = ds1682_probe, .remove = ds1682_remove, .id_table = ds1682_id, }; -- cgit v1.2.3 From 61c80d1c3833e196256fb060382db94f24d3d9a7 Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Fri, 11 Nov 2022 22:54:39 +0800 Subject: cxl: fix possible null-ptr-deref in cxl_guest_init_afu|adapter() If device_register() fails in cxl_register_afu|adapter(), the device is not added, device_unregister() can not be called in the error path, otherwise it will cause a null-ptr-deref because of removing not added device. As comment of device_register() says, it should use put_device() to give up the reference in the error path. So split device_unregister() into device_del() and put_device(), then goes to put dev when register fails. Fixes: 14baf4d9c739 ("cxl: Add guest-specific code") Signed-off-by: Yang Yingliang Acked-by: Andrew Donnellan Acked-by: Frederic Barrat Link: https://lore.kernel.org/r/20221111145440.2426970-1-yangyingliang@huawei.com Signed-off-by: Greg Kroah-Hartman --- drivers/misc/cxl/guest.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/cxl/guest.c b/drivers/misc/cxl/guest.c index 375f692ae9d6..fb95a2d5cef4 100644 --- a/drivers/misc/cxl/guest.c +++ b/drivers/misc/cxl/guest.c @@ -965,10 +965,10 @@ int cxl_guest_init_afu(struct cxl *adapter, int slice, struct device_node *afu_n * if it returns an error! */ if ((rc = cxl_register_afu(afu))) - goto err_put1; + goto err_put_dev; if ((rc = cxl_sysfs_afu_add(afu))) - goto err_put1; + goto err_del_dev; /* * pHyp doesn't expose the programming models supported by the @@ -984,7 +984,7 @@ int cxl_guest_init_afu(struct cxl *adapter, int slice, struct device_node *afu_n afu->modes_supported = CXL_MODE_DIRECTED; if ((rc = cxl_afu_select_best_mode(afu))) - goto err_put2; + goto err_remove_sysfs; adapter->afu[afu->slice] = afu; @@ -1004,10 +1004,12 @@ int cxl_guest_init_afu(struct cxl *adapter, int slice, struct device_node *afu_n return 0; -err_put2: +err_remove_sysfs: cxl_sysfs_afu_remove(afu); -err_put1: - device_unregister(&afu->dev); +err_del_dev: + device_del(&afu->dev); +err_put_dev: + put_device(&afu->dev); free = false; guest_release_serr_irq(afu); err2: @@ -1141,18 +1143,20 @@ struct cxl *cxl_guest_init_adapter(struct device_node *np, struct platform_devic * even if it returns an error! */ if ((rc = cxl_register_adapter(adapter))) - goto err_put1; + goto err_put_dev; if ((rc = cxl_sysfs_adapter_add(adapter))) - goto err_put1; + goto err_del_dev; /* release the context lock as the adapter is configured */ cxl_adapter_context_unlock(adapter); return adapter; -err_put1: - device_unregister(&adapter->dev); +err_del_dev: + device_del(&adapter->dev); +err_put_dev: + put_device(&adapter->dev); free = false; cxl_guest_remove_chardev(adapter); err1: -- cgit v1.2.3 From 02cd3032b154fa02fdf90e7467abaeed889330b2 Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Fri, 11 Nov 2022 22:54:40 +0800 Subject: cxl: fix possible null-ptr-deref in cxl_pci_init_afu|adapter() If device_register() fails in cxl_pci_afu|adapter(), the device is not added, device_unregister() can not be called in the error path, otherwise it will cause a null-ptr-deref because of removing not added device. As comment of device_register() says, it should use put_device() to give up the reference in the error path. So split device_unregister() into device_del() and put_device(), then goes to put dev when register fails. Fixes: f204e0b8cedd ("cxl: Driver code for powernv PCIe based cards for userspace access") Signed-off-by: Yang Yingliang Acked-by: Frederic Barrat Acked-by: Andrew Donnellan Link: https://lore.kernel.org/r/20221111145440.2426970-2-yangyingliang@huawei.com Signed-off-by: Greg Kroah-Hartman --- drivers/misc/cxl/pci.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/cxl/pci.c b/drivers/misc/cxl/pci.c index 3de0aea62ade..6d495d641c95 100644 --- a/drivers/misc/cxl/pci.c +++ b/drivers/misc/cxl/pci.c @@ -1164,10 +1164,10 @@ static int pci_init_afu(struct cxl *adapter, int slice, struct pci_dev *dev) * if it returns an error! */ if ((rc = cxl_register_afu(afu))) - goto err_put1; + goto err_put_dev; if ((rc = cxl_sysfs_afu_add(afu))) - goto err_put1; + goto err_del_dev; adapter->afu[afu->slice] = afu; @@ -1176,10 +1176,12 @@ static int pci_init_afu(struct cxl *adapter, int slice, struct pci_dev *dev) return 0; -err_put1: +err_del_dev: + device_del(&afu->dev); +err_put_dev: pci_deconfigure_afu(afu); cxl_debugfs_afu_remove(afu); - device_unregister(&afu->dev); + put_device(&afu->dev); return rc; err_free_native: @@ -1667,23 +1669,25 @@ static struct cxl *cxl_pci_init_adapter(struct pci_dev *dev) * even if it returns an error! */ if ((rc = cxl_register_adapter(adapter))) - goto err_put1; + goto err_put_dev; if ((rc = cxl_sysfs_adapter_add(adapter))) - goto err_put1; + goto err_del_dev; /* Release the context lock as adapter is configured */ cxl_adapter_context_unlock(adapter); return adapter; -err_put1: +err_del_dev: + device_del(&adapter->dev); +err_put_dev: /* This should mirror cxl_remove_adapter, except without the * sysfs parts */ cxl_debugfs_adapter_remove(adapter); cxl_deconfigure_adapter(adapter); - device_unregister(&adapter->dev); + put_device(&adapter->dev); return ERR_PTR(rc); err_release: -- cgit v1.2.3 From 1959ab9edccd3de4bc8a876f97ce269bb9beeb31 Mon Sep 17 00:00:00 2001 From: Abel Vesa Date: Fri, 25 Nov 2022 07:13:57 +0000 Subject: misc: fastrpc: Rename audio protection domain to root The AUDIO_PD will be done via static pd, so the proper name here is actually ROOT_PD. Co-developed-by: Srinivas Kandagatla Signed-off-by: Abel Vesa Signed-off-by: Srinivas Kandagatla Link: https://lore.kernel.org/r/20221125071405.148786-3-srinivas.kandagatla@linaro.org Signed-off-by: Greg Kroah-Hartman --- drivers/misc/fastrpc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/fastrpc.c b/drivers/misc/fastrpc.c index 7ff0b63c25e3..f2bda08adca7 100644 --- a/drivers/misc/fastrpc.c +++ b/drivers/misc/fastrpc.c @@ -83,7 +83,7 @@ #define FASTRPC_RMID_INIT_MEM_UNMAP 11 /* Protection Domain(PD) ids */ -#define AUDIO_PD (0) /* also GUEST_OS PD? */ +#define ROOT_PD (0) #define USER_PD (1) #define SENSORS_PD (2) @@ -1886,7 +1886,7 @@ static long fastrpc_device_ioctl(struct file *file, unsigned int cmd, err = fastrpc_invoke(fl, argp); break; case FASTRPC_IOCTL_INIT_ATTACH: - err = fastrpc_init_attach(fl, AUDIO_PD); + err = fastrpc_init_attach(fl, ROOT_PD); break; case FASTRPC_IOCTL_INIT_ATTACH_SNS: err = fastrpc_init_attach(fl, SENSORS_PD); -- cgit v1.2.3 From 1ce91d45ba77a4f6bf9209d142d5c89c42cf877a Mon Sep 17 00:00:00 2001 From: Abel Vesa Date: Fri, 25 Nov 2022 07:13:58 +0000 Subject: misc: fastrpc: Add reserved mem support The reserved mem support is needed for CMA heap support, which will be used by AUDIOPD. Co-developed-by: Srinivas Kandagatla Signed-off-by: Abel Vesa Signed-off-by: Srinivas Kandagatla Link: https://lore.kernel.org/r/20221125071405.148786-4-srinivas.kandagatla@linaro.org Signed-off-by: Greg Kroah-Hartman --- drivers/misc/fastrpc.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'drivers/misc') diff --git a/drivers/misc/fastrpc.c b/drivers/misc/fastrpc.c index f2bda08adca7..3d5809622a6b 100644 --- a/drivers/misc/fastrpc.c +++ b/drivers/misc/fastrpc.c @@ -19,6 +19,7 @@ #include #include #include +#include #define ADSP_DOMAIN_ID (0) #define MDSP_DOMAIN_ID (1) @@ -2065,6 +2066,9 @@ static int fastrpc_rpmsg_probe(struct rpmsg_device *rpdev) return -EINVAL; } + if (of_reserved_mem_device_init_by_idx(rdev, rdev->of_node, 0)) + dev_info(rdev, "no reserved DMA memory for FASTRPC\n"); + vmcount = of_property_read_variable_u32_array(rdev->of_node, "qcom,vmids", &vmids[0], 0, FASTRPC_MAX_VMIDS); if (vmcount < 0) -- cgit v1.2.3 From 6f18c7e845346f365e08613fdc47a60fc201aedb Mon Sep 17 00:00:00 2001 From: Abel Vesa Date: Fri, 25 Nov 2022 07:13:59 +0000 Subject: misc: fastrpc: Add fastrpc_remote_heap_alloc Split fastrpc_buf_alloc in such a way it allows allocation of remote heap too and add fastrpc_remote_heap_alloc to do so. Co-developed-by: Srinivas Kandagatla Signed-off-by: Abel Vesa Signed-off-by: Srinivas Kandagatla Link: https://lore.kernel.org/r/20221125071405.148786-5-srinivas.kandagatla@linaro.org Signed-off-by: Greg Kroah-Hartman --- drivers/misc/fastrpc.c | 29 ++++++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/fastrpc.c b/drivers/misc/fastrpc.c index 3d5809622a6b..8b43fe5207fb 100644 --- a/drivers/misc/fastrpc.c +++ b/drivers/misc/fastrpc.c @@ -369,7 +369,7 @@ static void fastrpc_buf_free(struct fastrpc_buf *buf) kfree(buf); } -static int fastrpc_buf_alloc(struct fastrpc_user *fl, struct device *dev, +static int __fastrpc_buf_alloc(struct fastrpc_user *fl, struct device *dev, u64 size, struct fastrpc_buf **obuf) { struct fastrpc_buf *buf; @@ -397,14 +397,37 @@ static int fastrpc_buf_alloc(struct fastrpc_user *fl, struct device *dev, return -ENOMEM; } + *obuf = buf; + + return 0; +} + +static int fastrpc_buf_alloc(struct fastrpc_user *fl, struct device *dev, + u64 size, struct fastrpc_buf **obuf) +{ + int ret; + struct fastrpc_buf *buf; + + ret = __fastrpc_buf_alloc(fl, dev, size, obuf); + if (ret) + return ret; + + buf = *obuf; + if (fl->sctx && fl->sctx->sid) buf->phys += ((u64)fl->sctx->sid << 32); - *obuf = buf; - return 0; } +static int fastrpc_remote_heap_alloc(struct fastrpc_user *fl, struct device *dev, + u64 size, struct fastrpc_buf **obuf) +{ + struct device *rdev = &fl->cctx->rpdev->dev; + + return __fastrpc_buf_alloc(fl, rdev, size, obuf); +} + static void fastrpc_channel_ctx_free(struct kref *ref) { struct fastrpc_channel_ctx *cctx; -- cgit v1.2.3 From 334f1a1cbe032d85fd58e771629e3a3b373b96d5 Mon Sep 17 00:00:00 2001 From: Abel Vesa Date: Fri, 25 Nov 2022 07:14:00 +0000 Subject: misc: fastrpc: Use fastrpc_map_put in fastrpc_map_create on fail Move the kref_init right after the allocation so that we can use fastrpc_map_put on any following error case. Signed-off-by: Abel Vesa Signed-off-by: Srinivas Kandagatla Link: https://lore.kernel.org/r/20221125071405.148786-6-srinivas.kandagatla@linaro.org Signed-off-by: Greg Kroah-Hartman --- drivers/misc/fastrpc.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/fastrpc.c b/drivers/misc/fastrpc.c index 8b43fe5207fb..332626df5d39 100644 --- a/drivers/misc/fastrpc.c +++ b/drivers/misc/fastrpc.c @@ -735,6 +735,8 @@ static int fastrpc_map_create(struct fastrpc_user *fl, int fd, return -ENOMEM; INIT_LIST_HEAD(&map->node); + kref_init(&map->refcount); + map->fl = fl; map->fd = fd; map->buf = dma_buf_get(fd); @@ -761,7 +763,6 @@ static int fastrpc_map_create(struct fastrpc_user *fl, int fd, map->size = len; map->va = sg_virt(map->table->sgl); map->len = len; - kref_init(&map->refcount); if (attr & FASTRPC_ATTR_SECUREMAP) { /* @@ -791,7 +792,7 @@ map_err: attach_err: dma_buf_put(map->buf); get_err: - kfree(map); + fastrpc_map_put(map); return err; } -- cgit v1.2.3 From 72fa6f7820c4cf96c5f7aabc4e54bdf52d1e2ac2 Mon Sep 17 00:00:00 2001 From: Abel Vesa Date: Fri, 25 Nov 2022 07:14:01 +0000 Subject: misc: fastrpc: Rework fastrpc_req_munmap Move the lookup of the munmap request to the fastrpc_req_munmap and pass on only the buf to the lower level fastrpc_req_munmap_impl. That way we can use the lower level fastrpc_req_munmap_impl on error path in fastrpc_req_mmap to free the buf without searching for the munmap request it belongs to. Co-developed-by: Srinivas Kandagatla Signed-off-by: Abel Vesa Signed-off-by: Srinivas Kandagatla Link: https://lore.kernel.org/r/20221125071405.148786-7-srinivas.kandagatla@linaro.org Signed-off-by: Greg Kroah-Hartman --- drivers/misc/fastrpc.c | 47 +++++++++++++++++++++++------------------------ 1 file changed, 23 insertions(+), 24 deletions(-) (limited to 'drivers/misc') diff --git a/drivers/misc/fastrpc.c b/drivers/misc/fastrpc.c index 332626df5d39..4590a11f7316 100644 --- a/drivers/misc/fastrpc.c +++ b/drivers/misc/fastrpc.c @@ -1627,30 +1627,14 @@ static int fastrpc_get_dsp_info(struct fastrpc_user *fl, char __user *argp) return 0; } -static int fastrpc_req_munmap_impl(struct fastrpc_user *fl, - struct fastrpc_req_munmap *req) +static int fastrpc_req_munmap_impl(struct fastrpc_user *fl, struct fastrpc_buf *buf) { struct fastrpc_invoke_args args[1] = { [0] = { 0 } }; - struct fastrpc_buf *buf = NULL, *iter, *b; struct fastrpc_munmap_req_msg req_msg; struct device *dev = fl->sctx->dev; int err; u32 sc; - spin_lock(&fl->lock); - list_for_each_entry_safe(iter, b, &fl->mmaps, node) { - if ((iter->raddr == req->vaddrout) && (iter->size == req->size)) { - buf = iter; - break; - } - } - spin_unlock(&fl->lock); - - if (!buf) { - dev_err(dev, "mmap not in list\n"); - return -EINVAL; - } - req_msg.pgid = fl->tgid; req_msg.size = buf->size; req_msg.vaddr = buf->raddr; @@ -1676,12 +1660,29 @@ static int fastrpc_req_munmap_impl(struct fastrpc_user *fl, static int fastrpc_req_munmap(struct fastrpc_user *fl, char __user *argp) { + struct fastrpc_buf *buf = NULL, *iter, *b; struct fastrpc_req_munmap req; + struct device *dev = fl->sctx->dev; if (copy_from_user(&req, argp, sizeof(req))) return -EFAULT; - return fastrpc_req_munmap_impl(fl, &req); + spin_lock(&fl->lock); + list_for_each_entry_safe(iter, b, &fl->mmaps, node) { + if ((iter->raddr == req.vaddrout) && (iter->size == req.size)) { + buf = iter; + break; + } + } + spin_unlock(&fl->lock); + + if (!buf) { + dev_err(dev, "mmap\t\tpt 0x%09llx [len 0x%08llx] not in list\n", + req.vaddrout, req.size); + return -EINVAL; + } + + return fastrpc_req_munmap_impl(fl, buf); } static int fastrpc_req_mmap(struct fastrpc_user *fl, char __user *argp) @@ -1690,7 +1691,6 @@ static int fastrpc_req_mmap(struct fastrpc_user *fl, char __user *argp) struct fastrpc_buf *buf = NULL; struct fastrpc_mmap_req_msg req_msg; struct fastrpc_mmap_rsp_msg rsp_msg; - struct fastrpc_req_munmap req_unmap; struct fastrpc_phy_page pages; struct fastrpc_req_mmap req; struct device *dev = fl->sctx->dev; @@ -1752,11 +1752,8 @@ static int fastrpc_req_mmap(struct fastrpc_user *fl, char __user *argp) spin_unlock(&fl->lock); if (copy_to_user((void __user *)argp, &req, sizeof(req))) { - /* unmap the memory and release the buffer */ - req_unmap.vaddrout = buf->raddr; - req_unmap.size = buf->size; - fastrpc_req_munmap_impl(fl, &req_unmap); - return -EFAULT; + err = -EFAULT; + goto err_assign; } dev_dbg(dev, "mmap\t\tpt 0x%09lx OK [len 0x%08llx]\n", @@ -1764,6 +1761,8 @@ static int fastrpc_req_mmap(struct fastrpc_user *fl, char __user *argp) return 0; +err_assign: + fastrpc_req_munmap_impl(fl, buf); err_invoke: fastrpc_buf_free(buf); -- cgit v1.2.3 From 0871561055e666da421d779397efcc1e5e964cab Mon Sep 17 00:00:00 2001 From: Abel Vesa Date: Fri, 25 Nov 2022 07:14:02 +0000 Subject: misc: fastrpc: Add support for audiopd In order to be able to start the adsp listener for audiopd using adsprpcd, we need to add the corresponding ioctl for creating a static process. On that ioctl call we need to allocate the heap. Allocating the heap needs to be happening only once and needs to be kept between different device open calls, so attach it to the channel context to make sure that remains until the RPMSG driver is removed. Then, if there are any VMIDs associated with the static ADSP process, do a call to SCM to assign it. And then, send all the necessary info related to heap to the DSP. Co-developed-by: Srinivas Kandagatla Signed-off-by: Abel Vesa Signed-off-by: Srinivas Kandagatla Link: https://lore.kernel.org/r/20221125071405.148786-8-srinivas.kandagatla@linaro.org Signed-off-by: Greg Kroah-Hartman --- drivers/misc/fastrpc.c | 135 ++++++++++++++++++++++++++++++++++++++++++++ include/uapi/misc/fastrpc.h | 7 +++ 2 files changed, 142 insertions(+) (limited to 'drivers/misc') diff --git a/drivers/misc/fastrpc.c b/drivers/misc/fastrpc.c index 4590a11f7316..adc44ba0eff6 100644 --- a/drivers/misc/fastrpc.c +++ b/drivers/misc/fastrpc.c @@ -37,8 +37,20 @@ #define FASTRPC_DSP_UTILITIES_HANDLE 2 #define FASTRPC_CTXID_MASK (0xFF0) #define INIT_FILELEN_MAX (2 * 1024 * 1024) +#define INIT_FILE_NAMELEN_MAX (128) #define FASTRPC_DEVICE_NAME "fastrpc" + +/* Add memory to static PD pool, protection thru XPU */ +#define ADSP_MMAP_HEAP_ADDR 4 +/* MAP static DMA buffer on DSP User PD */ +#define ADSP_MMAP_DMA_BUFFER 6 +/* Add memory to static PD pool protection thru hypervisor */ +#define ADSP_MMAP_REMOTE_HEAP_ADDR 8 +/* Add memory to userPD pool, for user heap */ #define ADSP_MMAP_ADD_PAGES 0x1000 +/* Add memory to userPD pool, for LLC heap */ +#define ADSP_MMAP_ADD_PAGES_LLC 0x3000, + #define DSP_UNSUPPORTED_API (0x80000414) /* MAX NUMBER of DSP ATTRIBUTES SUPPORTED */ #define FASTRPC_MAX_DSP_ATTRIBUTES (256) @@ -72,6 +84,7 @@ FASTRPC_BUILD_SCALARS(0, method, in, out, 0, 0) #define FASTRPC_CREATE_PROCESS_NARGS 6 +#define FASTRPC_CREATE_STATIC_PROCESS_NARGS 3 /* Remote Method id table */ #define FASTRPC_RMID_INIT_ATTACH 0 #define FASTRPC_RMID_INIT_RELEASE 1 @@ -261,6 +274,7 @@ struct fastrpc_channel_ctx { u32 dsp_attributes[FASTRPC_MAX_DSP_ATTRIBUTES]; struct fastrpc_device *secure_fdevice; struct fastrpc_device *fdevice; + struct fastrpc_buf *remote_heap; bool secure; bool unsigned_support; }; @@ -1157,6 +1171,7 @@ bail: spin_unlock(&fl->lock); fastrpc_context_put(ctx); } + if (err) dev_dbg(fl->sctx->dev, "Error: Invoke Failed %d\n", err); @@ -1181,6 +1196,120 @@ static bool is_session_rejected(struct fastrpc_user *fl, bool unsigned_pd_reques return false; } +static int fastrpc_init_create_static_process(struct fastrpc_user *fl, + char __user *argp) +{ + struct fastrpc_init_create_static init; + struct fastrpc_invoke_args *args; + struct fastrpc_phy_page pages[1]; + char *name; + int err; + struct { + int pgid; + u32 namelen; + u32 pageslen; + } inbuf; + u32 sc; + + args = kcalloc(FASTRPC_CREATE_STATIC_PROCESS_NARGS, sizeof(*args), GFP_KERNEL); + if (!args) + return -ENOMEM; + + if (copy_from_user(&init, argp, sizeof(init))) { + err = -EFAULT; + goto err; + } + + if (init.namelen > INIT_FILE_NAMELEN_MAX) { + err = -EINVAL; + goto err; + } + + name = kzalloc(init.namelen, GFP_KERNEL); + if (!name) { + err = -ENOMEM; + goto err; + } + + if (copy_from_user(name, (void __user *)(uintptr_t)init.name, init.namelen)) { + err = -EFAULT; + goto err_name; + } + + if (!fl->cctx->remote_heap) { + err = fastrpc_remote_heap_alloc(fl, fl->sctx->dev, init.memlen, + &fl->cctx->remote_heap); + if (err) + goto err_name; + + /* Map if we have any heap VMIDs associated with this ADSP Static Process. */ + if (fl->cctx->vmcount) { + unsigned int perms = BIT(QCOM_SCM_VMID_HLOS); + + err = qcom_scm_assign_mem(fl->cctx->remote_heap->phys, + (u64)fl->cctx->remote_heap->size, &perms, + fl->cctx->vmperms, fl->cctx->vmcount); + if (err) { + dev_err(fl->sctx->dev, "Failed to assign memory with phys 0x%llx size 0x%llx err %d", + fl->cctx->remote_heap->phys, fl->cctx->remote_heap->size, err); + goto err_map; + } + } + } + + inbuf.pgid = fl->tgid; + inbuf.namelen = init.namelen; + inbuf.pageslen = 0; + fl->pd = USER_PD; + + args[0].ptr = (u64)(uintptr_t)&inbuf; + args[0].length = sizeof(inbuf); + args[0].fd = -1; + + args[1].ptr = (u64)(uintptr_t)name; + args[1].length = inbuf.namelen; + args[1].fd = -1; + + pages[0].addr = fl->cctx->remote_heap->phys; + pages[0].size = fl->cctx->remote_heap->size; + + args[2].ptr = (u64)(uintptr_t) pages; + args[2].length = sizeof(*pages); + args[2].fd = -1; + + sc = FASTRPC_SCALARS(FASTRPC_RMID_INIT_CREATE_STATIC, 3, 0); + + err = fastrpc_internal_invoke(fl, true, FASTRPC_INIT_HANDLE, + sc, args); + if (err) + goto err_invoke; + + kfree(args); + + return 0; +err_invoke: + if (fl->cctx->vmcount) { + struct qcom_scm_vmperm perm; + + perm.vmid = QCOM_SCM_VMID_HLOS; + perm.perm = QCOM_SCM_PERM_RWX; + err = qcom_scm_assign_mem(fl->cctx->remote_heap->phys, + (u64)fl->cctx->remote_heap->size, + &(fl->cctx->vmperms[0].vmid), &perm, 1); + if (err) + dev_err(fl->sctx->dev, "Failed to assign memory phys 0x%llx size 0x%llx err %d", + fl->cctx->remote_heap->phys, fl->cctx->remote_heap->size, err); + } +err_map: + fastrpc_buf_free(fl->cctx->remote_heap); +err_name: + kfree(name); +err: + kfree(args); + + return err; +} + static int fastrpc_init_create_process(struct fastrpc_user *fl, char __user *argp) { @@ -1915,6 +2044,9 @@ static long fastrpc_device_ioctl(struct file *file, unsigned int cmd, case FASTRPC_IOCTL_INIT_ATTACH_SNS: err = fastrpc_init_attach(fl, SENSORS_PD); break; + case FASTRPC_IOCTL_INIT_CREATE_STATIC: + err = fastrpc_init_create_static_process(fl, argp); + break; case FASTRPC_IOCTL_INIT_CREATE: err = fastrpc_init_create_process(fl, argp); break; @@ -2184,6 +2316,9 @@ static void fastrpc_rpmsg_remove(struct rpmsg_device *rpdev) if (cctx->secure_fdevice) misc_deregister(&cctx->secure_fdevice->miscdev); + if (cctx->remote_heap) + fastrpc_buf_free(cctx->remote_heap); + of_platform_depopulate(&rpdev->dev); cctx->rpdev = NULL; diff --git a/include/uapi/misc/fastrpc.h b/include/uapi/misc/fastrpc.h index 5e29f2cfa42d..f33d914d8f46 100644 --- a/include/uapi/misc/fastrpc.h +++ b/include/uapi/misc/fastrpc.h @@ -13,6 +13,7 @@ #define FASTRPC_IOCTL_MMAP _IOWR('R', 6, struct fastrpc_req_mmap) #define FASTRPC_IOCTL_MUNMAP _IOWR('R', 7, struct fastrpc_req_munmap) #define FASTRPC_IOCTL_INIT_ATTACH_SNS _IO('R', 8) +#define FASTRPC_IOCTL_INIT_CREATE_STATIC _IOWR('R', 9, struct fastrpc_init_create_static) #define FASTRPC_IOCTL_MEM_MAP _IOWR('R', 10, struct fastrpc_mem_map) #define FASTRPC_IOCTL_MEM_UNMAP _IOWR('R', 11, struct fastrpc_mem_unmap) #define FASTRPC_IOCTL_GET_DSP_INFO _IOWR('R', 13, struct fastrpc_ioctl_capability) @@ -87,6 +88,12 @@ struct fastrpc_init_create { __u64 file; /* pointer to elf file */ }; +struct fastrpc_init_create_static { + __u32 namelen; /* length of pd process name */ + __u32 memlen; + __u64 name; /* pd process name */ +}; + struct fastrpc_alloc_dma_buf { __s32 fd; /* fd */ __u32 flags; /* flags to map with */ -- cgit v1.2.3 From 76e8e4ace1ed2c97dba3b1370e0e105e07c572bc Mon Sep 17 00:00:00 2001 From: Abel Vesa Date: Fri, 25 Nov 2022 07:14:03 +0000 Subject: misc: fastrpc: Safekeep mmaps on interrupted invoke If the userspace daemon is killed in the middle of an invoke (e.g. audiopd listerner invoke), we need to skip the unmapping on device release, otherwise the DSP will crash. So lets safekeep all the maps only if there is in invoke interrupted, by attaching them to the channel context (which is resident until RPMSG driver is removed), and free them on RPMSG driver remove. Signed-off-by: Abel Vesa Signed-off-by: Srinivas Kandagatla Link: https://lore.kernel.org/r/20221125071405.148786-9-srinivas.kandagatla@linaro.org Signed-off-by: Greg Kroah-Hartman --- drivers/misc/fastrpc.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'drivers/misc') diff --git a/drivers/misc/fastrpc.c b/drivers/misc/fastrpc.c index adc44ba0eff6..d7a98396ee94 100644 --- a/drivers/misc/fastrpc.c +++ b/drivers/misc/fastrpc.c @@ -275,6 +275,7 @@ struct fastrpc_channel_ctx { struct fastrpc_device *secure_fdevice; struct fastrpc_device *fdevice; struct fastrpc_buf *remote_heap; + struct list_head invoke_interrupted_mmaps; bool secure; bool unsigned_support; }; @@ -1109,6 +1110,8 @@ static int fastrpc_internal_invoke(struct fastrpc_user *fl, u32 kernel, struct fastrpc_invoke_args *args) { struct fastrpc_invoke_ctx *ctx = NULL; + struct fastrpc_buf *buf, *b; + int err = 0; if (!fl->sctx) @@ -1172,6 +1175,13 @@ bail: fastrpc_context_put(ctx); } + if (err == -ERESTARTSYS) { + list_for_each_entry_safe(buf, b, &fl->mmaps, node) { + list_del(&buf->node); + list_add_tail(&buf->node, &fl->cctx->invoke_interrupted_mmaps); + } + } + if (err) dev_dbg(fl->sctx->dev, "Error: Invoke Failed %d\n", err); @@ -2278,6 +2288,7 @@ static int fastrpc_rpmsg_probe(struct rpmsg_device *rpdev) dev_set_drvdata(&rpdev->dev, data); dma_set_mask_and_coherent(rdev, DMA_BIT_MASK(32)); INIT_LIST_HEAD(&data->users); + INIT_LIST_HEAD(&data->invoke_interrupted_mmaps); spin_lock_init(&data->lock); idr_init(&data->ctx_idr); data->domain_id = domain_id; @@ -2302,6 +2313,7 @@ static void fastrpc_notify_users(struct fastrpc_user *user) static void fastrpc_rpmsg_remove(struct rpmsg_device *rpdev) { struct fastrpc_channel_ctx *cctx = dev_get_drvdata(&rpdev->dev); + struct fastrpc_buf *buf, *b; struct fastrpc_user *user; unsigned long flags; @@ -2316,6 +2328,9 @@ static void fastrpc_rpmsg_remove(struct rpmsg_device *rpdev) if (cctx->secure_fdevice) misc_deregister(&cctx->secure_fdevice->miscdev); + list_for_each_entry_safe(buf, b, &cctx->invoke_interrupted_mmaps, node) + list_del(&buf->node); + if (cctx->remote_heap) fastrpc_buf_free(cctx->remote_heap); -- cgit v1.2.3 From 532ad70c6d449029cfa3eac8408f427e31334f33 Mon Sep 17 00:00:00 2001 From: Abel Vesa Date: Fri, 25 Nov 2022 07:14:04 +0000 Subject: misc: fastrpc: Add mmap request assigning for static PD pool If the mmap request is to add pages and thre are VMIDs associated with that context, do a call to SCM to reassign that memory. Do not do this for remote heap allocation, that is done on init create static process only. Co-developed-by: Srinivas Kandagatla Signed-off-by: Abel Vesa Signed-off-by: Srinivas Kandagatla Link: https://lore.kernel.org/r/20221125071405.148786-10-srinivas.kandagatla@linaro.org Signed-off-by: Greg Kroah-Hartman --- drivers/misc/fastrpc.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) (limited to 'drivers/misc') diff --git a/drivers/misc/fastrpc.c b/drivers/misc/fastrpc.c index d7a98396ee94..2e49bf1c31ef 100644 --- a/drivers/misc/fastrpc.c +++ b/drivers/misc/fastrpc.c @@ -1839,8 +1839,9 @@ static int fastrpc_req_mmap(struct fastrpc_user *fl, char __user *argp) if (copy_from_user(&req, argp, sizeof(req))) return -EFAULT; - if (req.flags != ADSP_MMAP_ADD_PAGES) { + if (req.flags != ADSP_MMAP_ADD_PAGES && req.flags != ADSP_MMAP_REMOTE_HEAP_ADDR) { dev_err(dev, "flag not supported 0x%x\n", req.flags); + return -EINVAL; } @@ -1886,6 +1887,22 @@ static int fastrpc_req_mmap(struct fastrpc_user *fl, char __user *argp) /* let the client know the address to use */ req.vaddrout = rsp_msg.vaddr; + /* Add memory to static PD pool, protection thru hypervisor */ + if (req.flags != ADSP_MMAP_REMOTE_HEAP_ADDR && fl->cctx->vmcount) { + struct qcom_scm_vmperm perm; + int err = 0; + + perm.vmid = QCOM_SCM_VMID_HLOS; + perm.perm = QCOM_SCM_PERM_RWX; + err = qcom_scm_assign_mem(buf->phys, buf->size, + &(fl->cctx->vmperms[0].vmid), &perm, 1); + if (err) { + dev_err(fl->sctx->dev, "Failed to assign memory phys 0x%llx size 0x%llx err %d", + buf->phys, buf->size, err); + goto err_assign; + } + } + spin_lock(&fl->lock); list_add_tail(&buf->node, &fl->mmaps); spin_unlock(&fl->lock); -- cgit v1.2.3 From 9bde43a0e2f469961e18d0a3496a9a74379c22bf Mon Sep 17 00:00:00 2001 From: Abel Vesa Date: Fri, 25 Nov 2022 07:14:05 +0000 Subject: misc: fastrpc: Add dma_mask to fastrpc_channel_ctx dma_set_mask_and_coherent only updates the mask to which the device dma_mask pointer points to. Add a dma_mask to the channel ctx and set the device dma_mask to point to that, otherwise the dma_set_mask will return an error and the dma_set_coherent_mask will be skipped too. Co-developed-by: Srinivas Kandagatla Signed-off-by: Abel Vesa Signed-off-by: Srinivas Kandagatla Link: https://lore.kernel.org/r/20221125071405.148786-11-srinivas.kandagatla@linaro.org Signed-off-by: Greg Kroah-Hartman --- drivers/misc/fastrpc.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'drivers/misc') diff --git a/drivers/misc/fastrpc.c b/drivers/misc/fastrpc.c index 2e49bf1c31ef..278ab6ca1a5a 100644 --- a/drivers/misc/fastrpc.c +++ b/drivers/misc/fastrpc.c @@ -278,6 +278,7 @@ struct fastrpc_channel_ctx { struct list_head invoke_interrupted_mmaps; bool secure; bool unsigned_support; + u64 dma_mask; }; struct fastrpc_device { @@ -2303,6 +2304,7 @@ static int fastrpc_rpmsg_probe(struct rpmsg_device *rpdev) kref_init(&data->refcount); dev_set_drvdata(&rpdev->dev, data); + rdev->dma_mask = &data->dma_mask; dma_set_mask_and_coherent(rdev, DMA_BIT_MASK(32)); INIT_LIST_HEAD(&data->users); INIT_LIST_HEAD(&data->invoke_interrupted_mmaps); -- cgit v1.2.3