summaryrefslogtreecommitdiffstats
path: root/include
diff options
context:
space:
mode:
authorGreg Kroah-Hartman <gregkh@linuxfoundation.org>2021-12-27 10:28:13 +0100
committerGreg Kroah-Hartman <gregkh@linuxfoundation.org>2021-12-27 10:28:13 +0100
commit651425fb24b2152637685886fab9108cda9096a7 (patch)
tree08a7cad090df85101a174b726a54982acf1e9e6c /include
parent372c73b469e4d519d5cac7b52defa202668ba8d3 (diff)
parentce80098db2439ee44403ec6fccd3a10be21c7aff (diff)
downloadlinux-651425fb24b2152637685886fab9108cda9096a7.tar.bz2
Merge tag 'misc-habanalabs-next-2021-12-27' of https://git.kernel.org/pub/scm/linux/kernel/git/ogabbay/linux into char-misc-next
Oded writes: This tag contains habanalabs driver changes for v5.17: - Support reset-during-reset. In case the f/w notifies the driver that the f/w is going to reset the device, the driver should support that even if it is in the middle of doing another reset - Support events from f/w that arrive during device resets. These events would be ignored which is bad as critical errors would not be reported and treated by the driver. - Don't kill processes that hold the control device open during hard-reset of the device. The control device operations can't crash if done during hard-reset. And usually, only monitoring applications are using the control device, so killing them defies their purpose. - Fix handling of hwmon nodes when working with legacy f/w - Change the compute context pointer to be boolean. This pointer was abused by multiple code paths that wanted fast access to the compute context structure. - Add uapi to fetch historical errors. This is necessary as errors sometimes result in hard-reset where the user application is being terminated. - Optimize GAUDI's MMU cache invalidation. - Add support for loading the latest f/w. - Add uapi to fetch HBM replacement and pending rows information. - Multiple bug fixes to the reset code. - Multiple bug fixes for Multi-CS ioctl code. - Multiple bug fixes for wait-for-interrupt ioctl code. - Many small bug fixes and cleanups. * tag 'misc-habanalabs-next-2021-12-27' of https://git.kernel.org/pub/scm/linux/kernel/git/ogabbay/linux: (70 commits) habanalabs: support hard-reset scheduling during soft-reset habanalabs: add a lock to protect multiple reset variables habanalabs: refactor reset information variables habanalabs: handle skip multi-CS if handling not done habanalabs: add CPU-CP packet for engine core ASID cfg habanalabs: replace some -ENOTTY with -EINVAL habanalabs: fix comments according to kernel-doc habanalabs: fix endianness when reading cpld version habanalabs: change wait_for_interrupt implementation habanalabs: prevent wait if CS in multi-CS list completed habanalabs: modify cpu boot status error print habanalabs: clean MMU headers definitions habanalabs: expose soft reset sysfs nodes for inference ASIC habanalabs: sysfs support for two infineon versions habanalabs: keep control device alive during hard reset habanalabs: fix hwmon handling for legacy f/w habanalabs: add current PI value to cpu packets habanalabs: remove in_debug check in device open habanalabs: return correct clock throttling period habanalabs: wait again for multi-CS if no CS completed ...
Diffstat (limited to 'include')
-rw-r--r--include/uapi/misc/habanalabs.h166
1 files changed, 139 insertions, 27 deletions
diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h
index 00b309590499..371dfc4243b3 100644
--- a/include/uapi/misc/habanalabs.h
+++ b/include/uapi/misc/habanalabs.h
@@ -333,7 +333,18 @@ enum hl_server_type {
* HL_INFO_SYNC_MANAGER - Retrieve sync manager info per dcore
* HL_INFO_TOTAL_ENERGY - Retrieve total energy consumption
* HL_INFO_PLL_FREQUENCY - Retrieve PLL frequency
+ * HL_INFO_POWER - Retrieve power information
* HL_INFO_OPEN_STATS - Retrieve info regarding recent device open calls
+ * HL_INFO_DRAM_REPLACED_ROWS - Retrieve DRAM replaced rows info
+ * HL_INFO_DRAM_PENDING_ROWS - Retrieve DRAM pending rows num
+ * HL_INFO_LAST_ERR_OPEN_DEV_TIME - Retrieve timestamp of the last time the device was opened
+ * and CS timeout or razwi error occurred.
+ * HL_INFO_CS_TIMEOUT_EVENT - Retrieve CS timeout timestamp and its related CS sequence number.
+ * HL_INFO_RAZWI_EVENT - Retrieve parameters of razwi:
+ * Timestamp of razwi.
+ * The address which accessing it caused the razwi.
+ * Razwi initiator.
+ * Razwi cause, was it a page fault or MMU access error.
*/
#define HL_INFO_HW_IP_INFO 0
#define HL_INFO_HW_EVENTS 1
@@ -353,8 +364,13 @@ enum hl_server_type {
#define HL_INFO_PLL_FREQUENCY 16
#define HL_INFO_POWER 17
#define HL_INFO_OPEN_STATS 18
+#define HL_INFO_DRAM_REPLACED_ROWS 21
+#define HL_INFO_DRAM_PENDING_ROWS 22
+#define HL_INFO_LAST_ERR_OPEN_DEV_TIME 23
+#define HL_INFO_CS_TIMEOUT_EVENT 24
+#define HL_INFO_RAZWI_EVENT 25
-#define HL_INFO_VERSION_MAX_LEN 128
+#define HL_INFO_VERSION_MAX_LEN 128
#define HL_INFO_CARD_NAME_MAX_LEN 16
/**
@@ -473,15 +489,27 @@ struct hl_info_pci_counters {
__u64 replay_cnt;
};
-#define HL_CLK_THROTTLE_POWER 0x1
-#define HL_CLK_THROTTLE_THERMAL 0x2
+enum hl_clk_throttling_type {
+ HL_CLK_THROTTLE_TYPE_POWER,
+ HL_CLK_THROTTLE_TYPE_THERMAL,
+ HL_CLK_THROTTLE_TYPE_MAX
+};
+
+/* clk_throttling_reason masks */
+#define HL_CLK_THROTTLE_POWER (1 << HL_CLK_THROTTLE_TYPE_POWER)
+#define HL_CLK_THROTTLE_THERMAL (1 << HL_CLK_THROTTLE_TYPE_THERMAL)
/**
* struct hl_info_clk_throttle - clock throttling reason
* @clk_throttling_reason: each bit represents a clk throttling reason
+ * @clk_throttling_timestamp_us: represents CPU timestamp in microseconds of the start-event
+ * @clk_throttling_duration_ns: the clock throttle time in nanosec
*/
struct hl_info_clk_throttle {
__u32 clk_throttling_reason;
+ __u32 pad;
+ __u64 clk_throttling_timestamp_us[HL_CLK_THROTTLE_TYPE_MAX];
+ __u64 clk_throttling_duration_ns[HL_CLK_THROTTLE_TYPE_MAX];
};
/**
@@ -559,6 +587,51 @@ struct hl_info_cs_counters {
__u64 ctx_validation_drop_cnt;
};
+/**
+ * struct hl_info_last_err_open_dev_time - last error boot information.
+ * @timestamp: timestamp of last time the device was opened and error occurred.
+ */
+struct hl_info_last_err_open_dev_time {
+ __s64 timestamp;
+};
+
+/**
+ * struct hl_info_cs_timeout_event - last CS timeout information.
+ * @timestamp: timestamp when last CS timeout event occurred.
+ * @seq: sequence number of last CS timeout event.
+ */
+struct hl_info_cs_timeout_event {
+ __s64 timestamp;
+ __u64 seq;
+};
+
+#define HL_RAZWI_PAGE_FAULT 0
+#define HL_RAZWI_MMU_ACCESS_ERROR 1
+
+/**
+ * struct hl_info_razwi_event - razwi information.
+ * @timestamp: timestamp of razwi.
+ * @addr: address which accessing it caused razwi.
+ * @engine_id_1: engine id of the razwi initiator, if it was initiated by engine that does not
+ * have engine id it will be set to U16_MAX.
+ * @engine_id_2: second engine id of razwi initiator. Might happen that razwi have 2 possible
+ * engines which one them caused the razwi. In that case, it will contain the
+ * second possible engine id, otherwise it will be set to U16_MAX.
+ * @no_engine_id: if razwi initiator does not have engine id, this field will be set to 1,
+ * otherwise 0.
+ * @error_type: cause of razwi, page fault or access error, otherwise it will be set to U8_MAX.
+ * @pad: padding to 64 bit.
+ */
+struct hl_info_razwi_event {
+ __s64 timestamp;
+ __u64 addr;
+ __u16 engine_id_1;
+ __u16 engine_id_2;
+ __u8 no_engine_id;
+ __u8 error_type;
+ __u8 pad[2];
+};
+
enum gaudi_dcores {
HL_GAUDI_WS_DCORE,
HL_GAUDI_WN_DCORE,
@@ -607,7 +680,10 @@ struct hl_info_args {
#define HL_MAX_CB_SIZE (0x200000 - 32)
/* Indicates whether the command buffer should be mapped to the device's MMU */
-#define HL_CB_FLAGS_MAP 0x1
+#define HL_CB_FLAGS_MAP 0x1
+
+/* Used with HL_CB_OP_INFO opcode to get the device va address for kernel mapped CB */
+#define HL_CB_FLAGS_GET_DEVICE_VA 0x2
struct hl_cb_in {
/* Handle of CB or 0 if we want to create one */
@@ -629,11 +705,16 @@ struct hl_cb_out {
/* Handle of CB */
__u64 cb_handle;
- /* Information about CB */
- struct {
- /* Usage count of CB */
- __u32 usage_cnt;
- __u32 pad;
+ union {
+ /* Information about CB */
+ struct {
+ /* Usage count of CB */
+ __u32 usage_cnt;
+ __u32 pad;
+ };
+
+ /* CB mapped address to device MMU */
+ __u64 device_va;
};
};
};
@@ -856,9 +937,17 @@ struct hl_cs_out {
/*
* SOB base address offset
- * Valid only when HL_CS_FLAGS_RESERVE_SIGNALS_ONLY is set
+ * Valid only when HL_CS_FLAGS_RESERVE_SIGNALS_ONLY or HL_CS_FLAGS_SIGNAL is set
*/
__u32 sob_base_addr_offset;
+
+ /*
+ * Count of completed signals in SOB before current signal submission.
+ * Valid only when (HL_CS_FLAGS_ENCAP_SIGNALS & HL_CS_FLAGS_STAGED_SUBMISSION)
+ * or HL_CS_FLAGS_SIGNAL is set
+ */
+ __u16 sob_count_before_submission;
+ __u16 pad[3];
};
union hl_cs_args {
@@ -866,9 +955,10 @@ union hl_cs_args {
struct hl_cs_out out;
};
-#define HL_WAIT_CS_FLAGS_INTERRUPT 0x2
-#define HL_WAIT_CS_FLAGS_INTERRUPT_MASK 0xFFF00000
-#define HL_WAIT_CS_FLAGS_MULTI_CS 0x4
+#define HL_WAIT_CS_FLAGS_INTERRUPT 0x2
+#define HL_WAIT_CS_FLAGS_INTERRUPT_MASK 0xFFF00000
+#define HL_WAIT_CS_FLAGS_MULTI_CS 0x4
+#define HL_WAIT_CS_FLAGS_INTERRUPT_KERNEL_CQ 0x10
#define HL_WAIT_MULTI_CS_LIST_MAX_LEN 32
@@ -888,14 +978,23 @@ struct hl_wait_cs_in {
};
struct {
- /* User address for completion comparison.
- * upon interrupt, driver will compare the value pointed
- * by this address with the supplied target value.
- * in order not to perform any comparison, set address
- * to all 1s.
- * Relevant only when HL_WAIT_CS_FLAGS_INTERRUPT is set
- */
- __u64 addr;
+ union {
+ /* User address for completion comparison.
+ * upon interrupt, driver will compare the value pointed
+ * by this address with the supplied target value.
+ * in order not to perform any comparison, set address
+ * to all 1s.
+ * Relevant only when HL_WAIT_CS_FLAGS_INTERRUPT is set
+ */
+ __u64 addr;
+
+ /* cq_counters_handle to a kernel mapped cb which contains
+ * cq counters.
+ * Relevant only when HL_WAIT_CS_FLAGS_INTERRUPT_KERNEL_CQ is set
+ */
+ __u64 cq_counters_handle;
+ };
+
/* Target value for completion comparison */
__u64 target;
};
@@ -911,14 +1010,27 @@ struct hl_wait_cs_in {
*/
__u32 flags;
- /* Multi CS API info- valid entries in multi-CS array */
- __u8 seq_arr_len;
- __u8 pad[3];
+ union {
+ struct {
+ /* Multi CS API info- valid entries in multi-CS array */
+ __u8 seq_arr_len;
+ __u8 pad[7];
+ };
+
+ /* Absolute timeout to wait for an interrupt in microseconds.
+ * Relevant only when HL_WAIT_CS_FLAGS_INTERRUPT is set
+ */
+ __u64 interrupt_timeout_us;
+ };
- /* Absolute timeout to wait for an interrupt in microseconds.
- * Relevant only when HL_WAIT_CS_FLAGS_INTERRUPT is set
+ /*
+ * cq counter offset inside the counters cb pointed by cq_counters_handle above.
+ * upon interrupt, driver will compare the value pointed
+ * by this address (cq_counters_handle + cq_counters_offset)
+ * with the supplied target value.
+ * relevant only when HL_WAIT_CS_FLAGS_INTERRUPT_KERNEL_CQ is set
*/
- __u32 interrupt_timeout_us;
+ __u64 cq_counters_offset;
};
#define HL_WAIT_CS_STATUS_COMPLETED 0