diff options
author | Greg Kroah-Hartman <gregkh@linuxfoundation.org> | 2021-12-27 10:28:13 +0100 |
---|---|---|
committer | Greg Kroah-Hartman <gregkh@linuxfoundation.org> | 2021-12-27 10:28:13 +0100 |
commit | 651425fb24b2152637685886fab9108cda9096a7 (patch) | |
tree | 08a7cad090df85101a174b726a54982acf1e9e6c /include | |
parent | 372c73b469e4d519d5cac7b52defa202668ba8d3 (diff) | |
parent | ce80098db2439ee44403ec6fccd3a10be21c7aff (diff) | |
download | linux-651425fb24b2152637685886fab9108cda9096a7.tar.bz2 |
Merge tag 'misc-habanalabs-next-2021-12-27' of https://git.kernel.org/pub/scm/linux/kernel/git/ogabbay/linux into char-misc-next
Oded writes:
This tag contains habanalabs driver changes for v5.17:
- Support reset-during-reset. In case the f/w notifies the driver
that the f/w is going to reset the device, the driver should
support that even if it is in the middle of doing another
reset
- Support events from f/w that arrive during device resets.
These events would be ignored which is bad as critical errors
would not be reported and treated by the driver.
- Don't kill processes that hold the control device open during
hard-reset of the device. The control device operations can't
crash if done during hard-reset. And usually, only monitoring
applications are using the control device, so killing them
defies their purpose.
- Fix handling of hwmon nodes when working with legacy f/w
- Change the compute context pointer to be boolean. This pointer
was abused by multiple code paths that wanted fast access to
the compute context structure.
- Add uapi to fetch historical errors. This is necessary as errors
sometimes result in hard-reset where the user application is
being terminated.
- Optimize GAUDI's MMU cache invalidation.
- Add support for loading the latest f/w.
- Add uapi to fetch HBM replacement and pending rows information.
- Multiple bug fixes to the reset code.
- Multiple bug fixes for Multi-CS ioctl code.
- Multiple bug fixes for wait-for-interrupt ioctl code.
- Many small bug fixes and cleanups.
* tag 'misc-habanalabs-next-2021-12-27' of https://git.kernel.org/pub/scm/linux/kernel/git/ogabbay/linux: (70 commits)
habanalabs: support hard-reset scheduling during soft-reset
habanalabs: add a lock to protect multiple reset variables
habanalabs: refactor reset information variables
habanalabs: handle skip multi-CS if handling not done
habanalabs: add CPU-CP packet for engine core ASID cfg
habanalabs: replace some -ENOTTY with -EINVAL
habanalabs: fix comments according to kernel-doc
habanalabs: fix endianness when reading cpld version
habanalabs: change wait_for_interrupt implementation
habanalabs: prevent wait if CS in multi-CS list completed
habanalabs: modify cpu boot status error print
habanalabs: clean MMU headers definitions
habanalabs: expose soft reset sysfs nodes for inference ASIC
habanalabs: sysfs support for two infineon versions
habanalabs: keep control device alive during hard reset
habanalabs: fix hwmon handling for legacy f/w
habanalabs: add current PI value to cpu packets
habanalabs: remove in_debug check in device open
habanalabs: return correct clock throttling period
habanalabs: wait again for multi-CS if no CS completed
...
Diffstat (limited to 'include')
-rw-r--r-- | include/uapi/misc/habanalabs.h | 166 |
1 files changed, 139 insertions, 27 deletions
diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h index 00b309590499..371dfc4243b3 100644 --- a/include/uapi/misc/habanalabs.h +++ b/include/uapi/misc/habanalabs.h @@ -333,7 +333,18 @@ enum hl_server_type { * HL_INFO_SYNC_MANAGER - Retrieve sync manager info per dcore * HL_INFO_TOTAL_ENERGY - Retrieve total energy consumption * HL_INFO_PLL_FREQUENCY - Retrieve PLL frequency + * HL_INFO_POWER - Retrieve power information * HL_INFO_OPEN_STATS - Retrieve info regarding recent device open calls + * HL_INFO_DRAM_REPLACED_ROWS - Retrieve DRAM replaced rows info + * HL_INFO_DRAM_PENDING_ROWS - Retrieve DRAM pending rows num + * HL_INFO_LAST_ERR_OPEN_DEV_TIME - Retrieve timestamp of the last time the device was opened + * and CS timeout or razwi error occurred. + * HL_INFO_CS_TIMEOUT_EVENT - Retrieve CS timeout timestamp and its related CS sequence number. + * HL_INFO_RAZWI_EVENT - Retrieve parameters of razwi: + * Timestamp of razwi. + * The address which accessing it caused the razwi. + * Razwi initiator. + * Razwi cause, was it a page fault or MMU access error. */ #define HL_INFO_HW_IP_INFO 0 #define HL_INFO_HW_EVENTS 1 @@ -353,8 +364,13 @@ enum hl_server_type { #define HL_INFO_PLL_FREQUENCY 16 #define HL_INFO_POWER 17 #define HL_INFO_OPEN_STATS 18 +#define HL_INFO_DRAM_REPLACED_ROWS 21 +#define HL_INFO_DRAM_PENDING_ROWS 22 +#define HL_INFO_LAST_ERR_OPEN_DEV_TIME 23 +#define HL_INFO_CS_TIMEOUT_EVENT 24 +#define HL_INFO_RAZWI_EVENT 25 -#define HL_INFO_VERSION_MAX_LEN 128 +#define HL_INFO_VERSION_MAX_LEN 128 #define HL_INFO_CARD_NAME_MAX_LEN 16 /** @@ -473,15 +489,27 @@ struct hl_info_pci_counters { __u64 replay_cnt; }; -#define HL_CLK_THROTTLE_POWER 0x1 -#define HL_CLK_THROTTLE_THERMAL 0x2 +enum hl_clk_throttling_type { + HL_CLK_THROTTLE_TYPE_POWER, + HL_CLK_THROTTLE_TYPE_THERMAL, + HL_CLK_THROTTLE_TYPE_MAX +}; + +/* clk_throttling_reason masks */ +#define HL_CLK_THROTTLE_POWER (1 << HL_CLK_THROTTLE_TYPE_POWER) +#define HL_CLK_THROTTLE_THERMAL (1 << HL_CLK_THROTTLE_TYPE_THERMAL) /** * struct hl_info_clk_throttle - clock throttling reason * @clk_throttling_reason: each bit represents a clk throttling reason + * @clk_throttling_timestamp_us: represents CPU timestamp in microseconds of the start-event + * @clk_throttling_duration_ns: the clock throttle time in nanosec */ struct hl_info_clk_throttle { __u32 clk_throttling_reason; + __u32 pad; + __u64 clk_throttling_timestamp_us[HL_CLK_THROTTLE_TYPE_MAX]; + __u64 clk_throttling_duration_ns[HL_CLK_THROTTLE_TYPE_MAX]; }; /** @@ -559,6 +587,51 @@ struct hl_info_cs_counters { __u64 ctx_validation_drop_cnt; }; +/** + * struct hl_info_last_err_open_dev_time - last error boot information. + * @timestamp: timestamp of last time the device was opened and error occurred. + */ +struct hl_info_last_err_open_dev_time { + __s64 timestamp; +}; + +/** + * struct hl_info_cs_timeout_event - last CS timeout information. + * @timestamp: timestamp when last CS timeout event occurred. + * @seq: sequence number of last CS timeout event. + */ +struct hl_info_cs_timeout_event { + __s64 timestamp; + __u64 seq; +}; + +#define HL_RAZWI_PAGE_FAULT 0 +#define HL_RAZWI_MMU_ACCESS_ERROR 1 + +/** + * struct hl_info_razwi_event - razwi information. + * @timestamp: timestamp of razwi. + * @addr: address which accessing it caused razwi. + * @engine_id_1: engine id of the razwi initiator, if it was initiated by engine that does not + * have engine id it will be set to U16_MAX. + * @engine_id_2: second engine id of razwi initiator. Might happen that razwi have 2 possible + * engines which one them caused the razwi. In that case, it will contain the + * second possible engine id, otherwise it will be set to U16_MAX. + * @no_engine_id: if razwi initiator does not have engine id, this field will be set to 1, + * otherwise 0. + * @error_type: cause of razwi, page fault or access error, otherwise it will be set to U8_MAX. + * @pad: padding to 64 bit. + */ +struct hl_info_razwi_event { + __s64 timestamp; + __u64 addr; + __u16 engine_id_1; + __u16 engine_id_2; + __u8 no_engine_id; + __u8 error_type; + __u8 pad[2]; +}; + enum gaudi_dcores { HL_GAUDI_WS_DCORE, HL_GAUDI_WN_DCORE, @@ -607,7 +680,10 @@ struct hl_info_args { #define HL_MAX_CB_SIZE (0x200000 - 32) /* Indicates whether the command buffer should be mapped to the device's MMU */ -#define HL_CB_FLAGS_MAP 0x1 +#define HL_CB_FLAGS_MAP 0x1 + +/* Used with HL_CB_OP_INFO opcode to get the device va address for kernel mapped CB */ +#define HL_CB_FLAGS_GET_DEVICE_VA 0x2 struct hl_cb_in { /* Handle of CB or 0 if we want to create one */ @@ -629,11 +705,16 @@ struct hl_cb_out { /* Handle of CB */ __u64 cb_handle; - /* Information about CB */ - struct { - /* Usage count of CB */ - __u32 usage_cnt; - __u32 pad; + union { + /* Information about CB */ + struct { + /* Usage count of CB */ + __u32 usage_cnt; + __u32 pad; + }; + + /* CB mapped address to device MMU */ + __u64 device_va; }; }; }; @@ -856,9 +937,17 @@ struct hl_cs_out { /* * SOB base address offset - * Valid only when HL_CS_FLAGS_RESERVE_SIGNALS_ONLY is set + * Valid only when HL_CS_FLAGS_RESERVE_SIGNALS_ONLY or HL_CS_FLAGS_SIGNAL is set */ __u32 sob_base_addr_offset; + + /* + * Count of completed signals in SOB before current signal submission. + * Valid only when (HL_CS_FLAGS_ENCAP_SIGNALS & HL_CS_FLAGS_STAGED_SUBMISSION) + * or HL_CS_FLAGS_SIGNAL is set + */ + __u16 sob_count_before_submission; + __u16 pad[3]; }; union hl_cs_args { @@ -866,9 +955,10 @@ union hl_cs_args { struct hl_cs_out out; }; -#define HL_WAIT_CS_FLAGS_INTERRUPT 0x2 -#define HL_WAIT_CS_FLAGS_INTERRUPT_MASK 0xFFF00000 -#define HL_WAIT_CS_FLAGS_MULTI_CS 0x4 +#define HL_WAIT_CS_FLAGS_INTERRUPT 0x2 +#define HL_WAIT_CS_FLAGS_INTERRUPT_MASK 0xFFF00000 +#define HL_WAIT_CS_FLAGS_MULTI_CS 0x4 +#define HL_WAIT_CS_FLAGS_INTERRUPT_KERNEL_CQ 0x10 #define HL_WAIT_MULTI_CS_LIST_MAX_LEN 32 @@ -888,14 +978,23 @@ struct hl_wait_cs_in { }; struct { - /* User address for completion comparison. - * upon interrupt, driver will compare the value pointed - * by this address with the supplied target value. - * in order not to perform any comparison, set address - * to all 1s. - * Relevant only when HL_WAIT_CS_FLAGS_INTERRUPT is set - */ - __u64 addr; + union { + /* User address for completion comparison. + * upon interrupt, driver will compare the value pointed + * by this address with the supplied target value. + * in order not to perform any comparison, set address + * to all 1s. + * Relevant only when HL_WAIT_CS_FLAGS_INTERRUPT is set + */ + __u64 addr; + + /* cq_counters_handle to a kernel mapped cb which contains + * cq counters. + * Relevant only when HL_WAIT_CS_FLAGS_INTERRUPT_KERNEL_CQ is set + */ + __u64 cq_counters_handle; + }; + /* Target value for completion comparison */ __u64 target; }; @@ -911,14 +1010,27 @@ struct hl_wait_cs_in { */ __u32 flags; - /* Multi CS API info- valid entries in multi-CS array */ - __u8 seq_arr_len; - __u8 pad[3]; + union { + struct { + /* Multi CS API info- valid entries in multi-CS array */ + __u8 seq_arr_len; + __u8 pad[7]; + }; + + /* Absolute timeout to wait for an interrupt in microseconds. + * Relevant only when HL_WAIT_CS_FLAGS_INTERRUPT is set + */ + __u64 interrupt_timeout_us; + }; - /* Absolute timeout to wait for an interrupt in microseconds. - * Relevant only when HL_WAIT_CS_FLAGS_INTERRUPT is set + /* + * cq counter offset inside the counters cb pointed by cq_counters_handle above. + * upon interrupt, driver will compare the value pointed + * by this address (cq_counters_handle + cq_counters_offset) + * with the supplied target value. + * relevant only when HL_WAIT_CS_FLAGS_INTERRUPT_KERNEL_CQ is set */ - __u32 interrupt_timeout_us; + __u64 cq_counters_offset; }; #define HL_WAIT_CS_STATUS_COMPLETED 0 |