diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2022-01-14 16:02:28 +0100 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2022-01-14 16:02:28 +0100 |
commit | 3bad80dab94a16c9b7991105e3bffd5fe5957e9a (patch) | |
tree | 78b38d6974bd023491cbb425875700cea4bad69f /drivers/misc/habanalabs/common/habanalabs.h | |
parent | 871bfa02d08d9c0ed981c50082b7afd367d3700b (diff) | |
parent | d47c7407b4c88cf66098eba8893bc38279f301fc (diff) | |
download | linux-3bad80dab94a16c9b7991105e3bffd5fe5957e9a.tar.bz2 |
Merge tag 'char-misc-5.17-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/char-misc
Pull char/misc and other driver updates from Greg KH:
"Here is the large set of char, misc, and other "small" driver
subsystem changes for 5.17-rc1.
Lots of different things are in here for char/misc drivers such as:
- habanalabs driver updates
- mei driver updates
- lkdtm driver updates
- vmw_vmci driver updates
- android binder driver updates
- other small char/misc driver updates
Also smaller driver subsystems have also been updated, including:
- fpga subsystem updates
- iio subsystem updates
- soundwire subsystem updates
- extcon subsystem updates
- gnss subsystem updates
- phy subsystem updates
- coresight subsystem updates
- firmware subsystem updates
- comedi subsystem updates
- mhi subsystem updates
- speakup subsystem updates
- rapidio subsystem updates
- spmi subsystem updates
- virtual driver updates
- counter subsystem updates
Too many individual changes to summarize, the shortlog contains the
full details.
All of these have been in linux-next for a while with no reported
issues"
* tag 'char-misc-5.17-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/char-misc: (406 commits)
counter: 104-quad-8: Fix use-after-free by quad8_irq_handler
dt-bindings: mux: Document mux-states property
dt-bindings: ti-serdes-mux: Add defines for J721S2 SoC
counter: remove old and now unused registration API
counter: ti-eqep: Convert to new counter registration
counter: stm32-lptimer-cnt: Convert to new counter registration
counter: stm32-timer-cnt: Convert to new counter registration
counter: microchip-tcb-capture: Convert to new counter registration
counter: ftm-quaddec: Convert to new counter registration
counter: intel-qep: Convert to new counter registration
counter: interrupt-cnt: Convert to new counter registration
counter: 104-quad-8: Convert to new counter registration
counter: Update documentation for new counter registration functions
counter: Provide alternative counter registration functions
counter: stm32-timer-cnt: Convert to counter_priv() wrapper
counter: stm32-lptimer-cnt: Convert to counter_priv() wrapper
counter: ti-eqep: Convert to counter_priv() wrapper
counter: ftm-quaddec: Convert to counter_priv() wrapper
counter: intel-qep: Convert to counter_priv() wrapper
counter: microchip-tcb-capture: Convert to counter_priv() wrapper
...
Diffstat (limited to 'drivers/misc/habanalabs/common/habanalabs.h')
-rw-r--r-- | drivers/misc/habanalabs/common/habanalabs.h | 301 |
1 files changed, 207 insertions, 94 deletions
diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h index a2002cbf794b..cb710fd478b6 100644 --- a/drivers/misc/habanalabs/common/habanalabs.h +++ b/drivers/misc/habanalabs/common/habanalabs.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 * - * Copyright 2016-2019 HabanaLabs, Ltd. + * Copyright 2016-2021 HabanaLabs, Ltd. * All Rights Reserved. * */ @@ -61,6 +61,8 @@ #define HL_CPUCP_INFO_TIMEOUT_USEC 10000000 /* 10s */ #define HL_CPUCP_EEPROM_TIMEOUT_USEC 10000000 /* 10s */ +#define HL_FW_STATUS_POLL_INTERVAL_USEC 10000 /* 10ms */ + #define HL_PCI_ELBI_TIMEOUT_MSEC 10 /* 10ms */ #define HL_SIM_MAX_TIMEOUT_US 10000000 /* 10s */ @@ -117,37 +119,37 @@ enum hl_mmu_page_table_location { /* * Reset Flags * - * - HL_RESET_HARD + * - HL_DRV_RESET_HARD * If set do hard reset to all engines. If not set reset just * compute/DMA engines. * - * - HL_RESET_FROM_RESET_THREAD + * - HL_DRV_RESET_FROM_RESET_THR * Set if the caller is the hard-reset thread * - * - HL_RESET_HEARTBEAT + * - HL_DRV_RESET_HEARTBEAT * Set if reset is due to heartbeat * - * - HL_RESET_TDR + * - HL_DRV_RESET_TDR * Set if reset is due to TDR * - * - HL_RESET_DEVICE_RELEASE + * - HL_DRV_RESET_DEV_RELEASE * Set if reset is due to device release * - * - HL_RESET_FW + * - HL_DRV_RESET_BYPASS_REQ_TO_FW * F/W will perform the reset. No need to ask it to reset the device. This is relevant * only when running with secured f/w * - * - HL_RESET_FW_FATAL_ERR + * - HL_DRV_RESET_FW_FATAL_ERR * Set if reset is due to a fatal error from FW */ -#define HL_RESET_HARD (1 << 0) -#define HL_RESET_FROM_RESET_THREAD (1 << 1) -#define HL_RESET_HEARTBEAT (1 << 2) -#define HL_RESET_TDR (1 << 3) -#define HL_RESET_DEVICE_RELEASE (1 << 4) -#define HL_RESET_FW (1 << 5) -#define HL_RESET_FW_FATAL_ERR (1 << 6) +#define HL_DRV_RESET_HARD (1 << 0) +#define HL_DRV_RESET_FROM_RESET_THR (1 << 1) +#define HL_DRV_RESET_HEARTBEAT (1 << 2) +#define HL_DRV_RESET_TDR (1 << 3) +#define HL_DRV_RESET_DEV_RELEASE (1 << 4) +#define HL_DRV_RESET_BYPASS_REQ_TO_FW (1 << 5) +#define HL_DRV_RESET_FW_FATAL_ERR (1 << 6) #define HL_MAX_SOBS_PER_MONITOR 8 @@ -219,6 +221,7 @@ enum hl_fw_component { /** * enum hl_fw_types - F/W types present in the system + * @FW_TYPE_NONE: no FW component indication * @FW_TYPE_LINUX: Linux image for device CPU * @FW_TYPE_BOOT_CPU: Boot image for device CPU * @FW_TYPE_PREBOOT_CPU: Indicates pre-loaded CPUs are present in the system @@ -226,6 +229,7 @@ enum hl_fw_component { * @FW_TYPE_ALL_TYPES: Mask for all types */ enum hl_fw_types { + FW_TYPE_NONE = 0x0, FW_TYPE_LINUX = 0x1, FW_TYPE_BOOT_CPU = 0x2, FW_TYPE_PREBOOT_CPU = 0x4, @@ -353,6 +357,21 @@ enum vm_type { }; /** + * enum mmu_op_flags - mmu operation relevant information. + * @MMU_OP_USERPTR: operation on user memory (host resident). + * @MMU_OP_PHYS_PACK: operation on DRAM (device resident). + * @MMU_OP_CLEAR_MEMCACHE: operation has to clear memcache. + * @MMU_OP_SKIP_LOW_CACHE_INV: operation is allowed to skip parts of cache invalidation. + */ +enum mmu_op_flags { + MMU_OP_USERPTR = 0x1, + MMU_OP_PHYS_PACK = 0x2, + MMU_OP_CLEAR_MEMCACHE = 0x4, + MMU_OP_SKIP_LOW_CACHE_INV = 0x8, +}; + + +/** * enum hl_device_hw_state - H/W device state. use this to understand whether * to do reset before hw_init or not * @HL_DEVICE_HW_STATE_CLEAN: H/W state is clean. i.e. after hard reset @@ -382,6 +401,7 @@ enum hl_device_hw_state { * @hop3_mask: mask to get the PTE address in hop 3. * @hop4_mask: mask to get the PTE address in hop 4. * @hop5_mask: mask to get the PTE address in hop 5. + * @last_mask: mask to get the bit indicating this is the last hop. * @page_size: default page size used to allocate memory. * @num_hops: The amount of hops supported by the translation table. * @host_resident: Should the MMU page table reside in host memory or in the @@ -402,6 +422,7 @@ struct hl_mmu_properties { u64 hop3_mask; u64 hop4_mask; u64 hop5_mask; + u64 last_mask; u32 page_size; u32 num_hops; u8 host_resident; @@ -524,6 +545,15 @@ struct hl_hints_range { * @dynamic_fw_load: is dynamic FW load is supported. * @gic_interrupts_enable: true if FW is not blocking GIC controller, * false otherwise. + * @use_get_power_for_reset_history: To support backward compatibility for Goya + * and Gaudi + * @supports_soft_reset: is soft reset supported. + * @allow_inference_soft_reset: true if the ASIC supports soft reset that is + * initiated by user or TDR. This is only true + * in inference ASICs, as there is no real-world + * use-case of doing soft-reset in training (due + * to the fact that training runs on multiple + * devices) */ struct asic_fixed_properties { struct hw_queue_properties *hw_queues_props; @@ -604,6 +634,9 @@ struct asic_fixed_properties { u8 iatu_done_by_fw; u8 dynamic_fw_load; u8 gic_interrupts_enable; + u8 use_get_power_for_reset_history; + u8 supports_soft_reset; + u8 allow_inference_soft_reset; }; /** @@ -852,10 +885,15 @@ struct hl_user_interrupt { * pending on an interrupt * @wait_list_node: node in the list of user threads pending on an interrupt * @fence: hl fence object for interrupt completion + * @cq_target_value: CQ target value + * @cq_kernel_addr: CQ kernel address, to be used in the cq interrupt + * handler for taget value comparison */ struct hl_user_pending_interrupt { struct list_head wait_list_node; struct hl_fence fence; + u64 cq_target_value; + u64 *cq_kernel_addr; }; /** @@ -1010,6 +1048,7 @@ struct fw_response { * @image_region: region to copy the FW image to * @fw_image_size: size of FW image to load * @wait_for_bl_timeout: timeout for waiting for boot loader to respond + * @fw_desc_valid: true if FW descriptor has been validated and hence the data can be used */ struct dynamic_fw_load_mgr { struct fw_response response; @@ -1017,6 +1056,7 @@ struct dynamic_fw_load_mgr { struct pci_mem_region *image_region; size_t fw_image_size; u32 wait_for_bl_timeout; + bool fw_desc_valid; }; /** @@ -1042,7 +1082,8 @@ struct fw_image_props { * @skip_bmc: should BMC be skipped * @sram_bar_id: SRAM bar ID * @dram_bar_id: DRAM bar ID - * @linux_loaded: true if linux was loaded so far + * @fw_comp_loaded: bitmask of loaded FW components. set bit meaning loaded + * component. values are set according to enum hl_fw_types. */ struct fw_load_mgr { union { @@ -1056,7 +1097,7 @@ struct fw_load_mgr { u8 skip_bmc; u8 sram_bar_id; u8 dram_bar_id; - u8 linux_loaded; + u8 fw_comp_loaded; }; /** @@ -1128,7 +1169,7 @@ struct fw_load_mgr { * @disable_clock_gating: disable clock gating completely * @debug_coresight: perform certain actions on Coresight for debugging. * @is_device_idle: return true if device is idle, false otherwise. - * @soft_reset_late_init: perform certain actions needed after soft reset. + * @non_hard_reset_late_init: perform certain actions needed after a reset which is not hard-reset * @hw_queues_lock: acquire H/W queues lock. * @hw_queues_unlock: release H/W queues lock. * @get_pci_id: retrieve PCI ID. @@ -1261,10 +1302,10 @@ struct hl_asic_funcs { int (*send_heartbeat)(struct hl_device *hdev); void (*set_clock_gating)(struct hl_device *hdev); void (*disable_clock_gating)(struct hl_device *hdev); - int (*debug_coresight)(struct hl_device *hdev, void *data); + int (*debug_coresight)(struct hl_device *hdev, struct hl_ctx *ctx, void *data); bool (*is_device_idle)(struct hl_device *hdev, u64 *mask_arr, u8 mask_len, struct seq_file *s); - int (*soft_reset_late_init)(struct hl_device *hdev); + int (*non_hard_reset_late_init)(struct hl_device *hdev); void (*hw_queues_lock)(struct hl_device *hdev); void (*hw_queues_unlock)(struct hl_device *hdev); u32 (*get_pci_id)(struct hl_device *hdev); @@ -1276,7 +1317,7 @@ struct hl_asic_funcs { int (*init_iatu)(struct hl_device *hdev); u32 (*rreg)(struct hl_device *hdev, u32 reg); void (*wreg)(struct hl_device *hdev, u32 reg, u32 val); - void (*halt_coresight)(struct hl_device *hdev); + void (*halt_coresight)(struct hl_device *hdev, struct hl_ctx *ctx); int (*ctx_init)(struct hl_ctx *ctx); void (*ctx_fini)(struct hl_ctx *ctx); int (*get_clk_rate)(struct hl_device *hdev, u32 *cur_clk, u32 *max_clk); @@ -1518,6 +1559,9 @@ struct hl_userptr { * @submission_time_jiffies: submission time of the cs * @type: CS_TYPE_*. * @encaps_sig_hdl_id: encaps signals handle id, set for the first staged cs. + * @sob_addr_offset: sob offset from the configuration base address. + * @initial_sob_count: count of completed signals in SOB before current submission of signal or + * cs with encaps signals. * @submitted: true if CS was submitted to H/W. * @completed: true if CS was completed by device. * @timedout : true if CS was timedout. @@ -1553,6 +1597,8 @@ struct hl_cs { u64 submission_time_jiffies; enum hl_cs_type type; u32 encaps_sig_hdl_id; + u32 sob_addr_offset; + u16 initial_sob_count; u8 submitted; u8 completed; u8 timedout; @@ -1792,7 +1838,6 @@ struct hl_debug_params { * @dev_node: node in the device list of file private data * @refcount: number of related contexts. * @restore_phase_mutex: lock for context switch and restore phase. - * @is_control: true for control device, false otherwise */ struct hl_fpriv { struct hl_device *hdev; @@ -1805,7 +1850,6 @@ struct hl_fpriv { struct list_head dev_node; struct kref refcount; struct mutex restore_phase_mutex; - u8 is_control; }; @@ -1864,6 +1908,7 @@ struct hl_debugfs_entry { * @i2c_bus: generic u8 debugfs file for bus value to use in i2c_data_read. * @i2c_addr: generic u8 debugfs file for address value to use in i2c_data_read. * @i2c_reg: generic u8 debugfs file for register value to use in i2c_data_read. + * @i2c_len: generic u8 debugfs file for length value to use in i2c_data_read. */ struct hl_dbg_device_entry { struct dentry *root; @@ -1892,6 +1937,7 @@ struct hl_dbg_device_entry { u8 i2c_bus; u8 i2c_addr; u8 i2c_reg; + u8 i2c_len; }; /** @@ -2180,13 +2226,13 @@ struct hwmon_chip_info; * @wq: work queue for device reset procedure. * @reset_work: reset work to be done. * @hdev: habanalabs device structure. - * @fw_reset: whether f/w will do the reset without us sending them a message to do it. + * @flags: reset flags. */ struct hl_device_reset_work { struct workqueue_struct *wq; struct delayed_work reset_work; struct hl_device *hdev; - bool fw_reset; + u32 flags; }; /** @@ -2328,12 +2374,10 @@ struct multi_cs_completion { * @ctx: pointer to the context structure * @fence_arr: array of fences of all CSs * @seq_arr: array of CS sequence numbers - * @timeout_us: timeout in usec for waiting for CS to complete + * @timeout_jiffies: timeout in jiffies for waiting for CS to complete * @timestamp: timestamp of first completed CS * @wait_status: wait for CS status * @completion_bitmap: bitmap of completed CSs (1- completed, otherwise 0) - * @stream_master_qid_map: bitmap of all stream master QIDs on which the - * multi-CS is waiting * @arr_len: fence_arr and seq_arr array length * @gone_cs: indication of gone CS (1- there was gone CS, otherwise 0) * @update_ts: update timestamp. 1- update the timestamp, otherwise 0. @@ -2342,17 +2386,114 @@ struct multi_cs_data { struct hl_ctx *ctx; struct hl_fence **fence_arr; u64 *seq_arr; - s64 timeout_us; + s64 timeout_jiffies; s64 timestamp; long wait_status; u32 completion_bitmap; - u32 stream_master_qid_map; u8 arr_len; u8 gone_cs; u8 update_ts; }; /** + * struct hl_clk_throttle_timestamp - current/last clock throttling timestamp + * @start: timestamp taken when 'start' event is received in driver + * @end: timestamp taken when 'end' event is received in driver + */ +struct hl_clk_throttle_timestamp { + ktime_t start; + ktime_t end; +}; + +/** + * struct hl_clk_throttle - keeps current/last clock throttling timestamps + * @timestamp: timestamp taken by driver and firmware, index 0 refers to POWER + * index 1 refers to THERMAL + * @lock: protects this structure as it can be accessed from both event queue + * context and info_ioctl context + * @current_reason: bitmask represents the current clk throttling reasons + * @aggregated_reason: bitmask represents aggregated clk throttling reasons since driver load + */ +struct hl_clk_throttle { + struct hl_clk_throttle_timestamp timestamp[HL_CLK_THROTTLE_TYPE_MAX]; + struct mutex lock; + u32 current_reason; + u32 aggregated_reason; +}; + +/** + * struct last_error_session_info - info about last session in which CS timeout or + * razwi error occurred. + * @open_dev_timestamp: device open timestamp. + * @cs_timeout_timestamp: CS timeout timestamp. + * @razwi_timestamp: razwi timestamp. + * @cs_write_disable: if set writing to CS parameters in the structure is disabled so the + * first (root cause) CS timeout will not be overwritten. + * @razwi_write_disable: if set writing to razwi parameters in the structure is disabled so the + * first (root cause) razwi will not be overwritten. + * @cs_timeout_seq: CS timeout sequence number. + * @razwi_addr: address that caused razwi. + * @razwi_engine_id_1: engine id of the razwi initiator, if it was initiated by engine that does + * not have engine id it will be set to U16_MAX. + * @razwi_engine_id_2: second engine id of razwi initiator. Might happen that razwi have 2 possible + * engines which one them caused the razwi. In that case, it will contain the + * second possible engine id, otherwise it will be set to U16_MAX. + * @razwi_non_engine_initiator: in case the initiator of the razwi does not have engine id. + * @razwi_type: cause of razwi, page fault or access error, otherwise it will be set to U8_MAX. + */ +struct last_error_session_info { + ktime_t open_dev_timestamp; + ktime_t cs_timeout_timestamp; + ktime_t razwi_timestamp; + atomic_t cs_write_disable; + atomic_t razwi_write_disable; + u64 cs_timeout_seq; + u64 razwi_addr; + u16 razwi_engine_id_1; + u16 razwi_engine_id_2; + u8 razwi_non_engine_initiator; + u8 razwi_type; +}; + +/** + * struct hl_reset_info - holds current device reset information. + * @lock: lock to protect critical reset flows. + * @soft_reset_cnt: number of soft reset since the driver was loaded. + * @hard_reset_cnt: number of hard reset since the driver was loaded. + * @hard_reset_schedule_flags: hard reset is scheduled to after current soft reset, + * here we hold the hard reset flags. + * @in_reset: is device in reset flow. + * @is_in_soft_reset: Device is currently in soft reset process. + * @needs_reset: true if reset_on_lockup is false and device should be reset + * due to lockup. + * @hard_reset_pending: is there a hard reset work pending. + * @curr_reset_cause: saves an enumerated reset cause when a hard reset is + * triggered, and cleared after it is shared with preboot. + * @prev_reset_trigger: saves the previous trigger which caused a reset, overidden + * with a new value on next reset + * @reset_trigger_repeated: set if device reset is triggered more than once with + * same cause. + * @skip_reset_on_timeout: Skip device reset if CS has timed out, wait for it to + * complete instead. + */ +struct hl_reset_info { + spinlock_t lock; + u32 soft_reset_cnt; + u32 hard_reset_cnt; + u32 hard_reset_schedule_flags; + u8 in_reset; + u8 is_in_soft_reset; + u8 needs_reset; + u8 hard_reset_pending; + + u8 curr_reset_cause; + u8 prev_reset_trigger; + u8 reset_trigger_repeated; + + u8 skip_reset_on_timeout; +}; + +/** * struct hl_device - habanalabs device structure. * @pdev: pointer to PCI device, can be NULL in case of simulator device. * @pcie_bar_phys: array of available PCIe bars physical addresses. @@ -2363,7 +2504,6 @@ struct multi_cs_data { * @cdev_ctrl: char device for control operations only (INFO IOCTL) * @dev: related kernel basic device structure. * @dev_ctrl: related kernel device structure for the control device - * @work_freq: delayed work to lower device frequency if possible. * @work_heartbeat: delayed work for CPU-CP is-alive check. * @device_reset_work: delayed work which performs hard reset * @asic_name: ASIC specific name. @@ -2398,7 +2538,6 @@ struct multi_cs_data { * @asic_specific: ASIC specific information to use only from ASIC files. * @vm: virtual memory manager for MMU. * @hwmon_dev: H/W monitor device. - * @pm_mng_profile: current power management profile. * @hl_chip_info: ASIC's sensors information. * @device_status_description: device status description. * @hl_debugfs: device's debugfs manager. @@ -2410,8 +2549,10 @@ struct multi_cs_data { * @internal_cb_va_base: internal cb pool mmu virtual address base * @fpriv_list: list of file private data structures. Each structure is created * when a user opens the device + * @fpriv_ctrl_list: list of file private data structures. Each structure is created + * when a user opens the control device * @fpriv_list_lock: protects the fpriv_list - * @compute_ctx: current compute context executing. + * @fpriv_ctrl_list_lock: protects the fpriv_ctrl_list * @aggregated_cs_counters: aggregated cs counters among all contexts * @mmu_priv: device-specific MMU data. * @mmu_func: device-related MMU functions. @@ -2419,6 +2560,10 @@ struct multi_cs_data { * @pci_mem_region: array of memory regions in the PCI * @state_dump_specs: constants and dictionaries needed to dump system state. * @multi_cs_completion: array of multi-CS completion. + * @clk_throttling: holds information about current/previous clock throttling events + * @reset_info: holds current device reset information. + * @last_error: holds information about last session in which CS timeout or razwi error occurred. + * @stream_master_qid_arr: pointer to array with QIDs of master streams. * @dram_used_mem: current DRAM memory consumption. * @timeout_jiffies: device CS timeout value. * @max_power: the max power of the device, as configured by the sysadmin. This @@ -2434,20 +2579,17 @@ struct multi_cs_data { * device initialization. Mainly used to debug and * workaround firmware bugs * @dram_pci_bar_start: start bus address of PCIe bar towards DRAM. + * @last_successful_open_ktime: timestamp (ktime) of the last successful device open. * @last_successful_open_jif: timestamp (jiffies) of the last successful * device open. * @last_open_session_duration_jif: duration (jiffies) of the last device open * session. * @open_counter: number of successful device open operations. - * @in_reset: is device in reset flow. - * @curr_pll_profile: current PLL profile. + * @fw_poll_interval_usec: FW status poll interval in usec. * @card_type: Various ASICs have several card types. This indicates the card * type of the current device. * @major: habanalabs kernel driver major. * @high_pll: high PLL profile frequency. - * @soft_reset_cnt: number of soft reset since the driver was loaded. - * @hard_reset_cnt: number of hard reset since the driver was loaded. - * @clk_throttling_reason: bitmask represents the current clk throttling reasons * @id: device minor. * @id_control: minor of the control device * @cpu_pci_msb_addr: 50-bit extension bits for the device CPU's 40-bit @@ -2455,7 +2597,6 @@ struct multi_cs_data { * @disabled: is device disabled. * @late_init_done: is late init stage was done during initialization. * @hwmon_initialized: is H/W monitor sensors was initialized. - * @hard_reset_pending: is there a hard reset work pending. * @heartbeat: is heartbeat sanity check towards CPU-CP enabled. * @reset_on_lockup: true if a reset should be done in case of stuck CS, false * otherwise. @@ -2467,8 +2608,9 @@ struct multi_cs_data { * @init_done: is the initialization of the device done. * @device_cpu_disabled: is the device CPU disabled (due to timeouts) * @dma_mask: the dma mask that was set for this device - * @in_debug: is device under debug. This, together with fpriv_list, enforces - * that only a single user is configuring the debug infrastructure. + * @in_debug: whether the device is in a state where the profiling/tracing infrastructure + * can be used. This indication is needed because in some ASICs we need to do + * specific operations to enable that infrastructure. * @power9_64bit_dma_enable: true to enable 64-bit DMA mask support. Relevant * only to POWER9 machines. * @cdev_sysfs_created: were char devices and sysfs nodes created. @@ -2477,34 +2619,18 @@ struct multi_cs_data { * @sync_stream_queue_idx: helper index for sync stream queues initialization. * @collective_mon_idx: helper index for collective initialization * @supports_coresight: is CoreSight supported. - * @supports_soft_reset: is soft reset supported. - * @allow_inference_soft_reset: true if the ASIC supports soft reset that is - * initiated by user or TDR. This is only true - * in inference ASICs, as there is no real-world - * use-case of doing soft-reset in training (due - * to the fact that training runs on multiple - * devices) * @supports_cb_mapping: is mapping a CB to the device's MMU supported. - * @needs_reset: true if reset_on_lockup is false and device should be reset - * due to lockup. * @process_kill_trial_cnt: number of trials reset thread tried killing * user processes * @device_fini_pending: true if device_fini was called and might be * waiting for the reset thread to finish * @supports_staged_submission: true if staged submissions are supported - * @curr_reset_cause: saves an enumerated reset cause when a hard reset is - * triggered, and cleared after it is shared with preboot. - * @prev_reset_trigger: saves the previous trigger which caused a reset, overidden - * with a new value on next reset - * @reset_trigger_repeated: set if device reset is triggered more than once with - * same cause. - * @skip_reset_on_timeout: Skip device reset if CS has timed out, wait for it to - * complete instead. * @device_cpu_is_halted: Flag to indicate whether the device CPU was already * halted. We can't halt it again because the COMMS * protocol will throw an error. Relevant only for * cases where Linux was not loaded to device CPU * @supports_wait_for_multi_cs: true if wait for multi CS is supported + * @is_compute_ctx_active: Whether there is an active compute context executing. */ struct hl_device { struct pci_dev *pdev; @@ -2515,7 +2641,6 @@ struct hl_device { struct cdev cdev_ctrl; struct device *dev; struct device *dev_ctrl; - struct delayed_work work_freq; struct delayed_work work_heartbeat; struct hl_device_reset_work device_reset_work; char asic_name[HL_STR_MAX]; @@ -2546,7 +2671,6 @@ struct hl_device { void *asic_specific; struct hl_vm vm; struct device *hwmon_dev; - enum hl_pm_mng_profile pm_mng_profile; struct hwmon_chip_info *hl_chip_info; struct hl_dbg_device_entry hl_debugfs; @@ -2560,9 +2684,9 @@ struct hl_device { u64 internal_cb_va_base; struct list_head fpriv_list; + struct list_head fpriv_ctrl_list; struct mutex fpriv_list_lock; - - struct hl_ctx *compute_ctx; + struct mutex fpriv_ctrl_list_lock; struct hl_cs_counters_atomic aggregated_cs_counters; @@ -2577,6 +2701,11 @@ struct hl_device { struct multi_cs_completion multi_cs_completion[ MULTI_CS_MAX_USER_CTX]; + struct hl_clk_throttle clk_throttling; + struct last_error_session_info last_error; + + struct hl_reset_info reset_info; + u32 *stream_master_qid_arr; atomic64_t dram_used_mem; u64 timeout_jiffies; @@ -2587,21 +2716,17 @@ struct hl_device { u64 last_successful_open_jif; u64 last_open_session_duration_jif; u64 open_counter; - atomic_t in_reset; - enum hl_pll_frequency curr_pll_profile; + u64 fw_poll_interval_usec; + ktime_t last_successful_open_ktime; enum cpucp_card_types card_type; u32 major; u32 high_pll; - u32 soft_reset_cnt; - u32 hard_reset_cnt; - u32 clk_throttling_reason; u16 id; u16 id_control; u16 cpu_pci_msb_addr; u8 disabled; u8 late_init_done; u8 hwmon_initialized; - u8 hard_reset_pending; u8 heartbeat; u8 reset_on_lockup; u8 dram_default_page_mapping; @@ -2618,20 +2743,14 @@ struct hl_device { u8 sync_stream_queue_idx; u8 collective_mon_idx; u8 supports_coresight; - u8 supports_soft_reset; - u8 allow_inference_soft_reset; u8 supports_cb_mapping; - u8 needs_reset; u8 process_kill_trial_cnt; u8 device_fini_pending; u8 supports_staged_submission; - u8 curr_reset_cause; - u8 prev_reset_trigger; - u8 reset_trigger_repeated; - u8 skip_reset_on_timeout; u8 device_cpu_is_halted; u8 supports_wait_for_multi_cs; u8 stream_master_qid_arr_size; + u8 is_compute_ctx_active; /* Parameters for bring-up */ u64 nic_ports_mask; @@ -2659,6 +2778,7 @@ struct hl_device { * wait cs are used to wait of the reserved encaps signals. * @hdev: pointer to habanalabs device structure. * @hw_sob: pointer to H/W SOB used in the reservation. + * @ctx: pointer to the user's context data structure * @cs_seq: staged cs sequence which contains encapsulated signals * @id: idr handler id to be used to fetch the handler info * @q_idx: stream queue index @@ -2669,6 +2789,7 @@ struct hl_cs_encaps_sig_handle { struct kref refcount; struct hl_device *hdev; struct hl_hw_sob *hw_sob; + struct hl_ctx *ctx; u64 cs_seq; u32 id; u32 q_idx; @@ -2757,21 +2878,9 @@ static inline bool hl_mem_area_inside_range(u64 address, u64 size, static inline bool hl_mem_area_crosses_range(u64 address, u32 size, u64 range_start_address, u64 range_end_address) { - u64 end_address = address + size; + u64 end_address = address + size - 1; - if ((address >= range_start_address) && - (address < range_end_address)) - return true; - - if ((end_address >= range_start_address) && - (end_address < range_end_address)) - return true; - - if ((address < range_start_address) && - (end_address >= range_end_address)) - return true; - - return false; + return ((address <= range_end_address) && (range_start_address <= end_address)); } int hl_device_open(struct inode *inode, struct file *filp); @@ -2779,10 +2888,7 @@ int hl_device_open_ctrl(struct inode *inode, struct file *filp); bool hl_device_operational(struct hl_device *hdev, enum hl_device_status *status); enum hl_device_status hl_device_status(struct hl_device *hdev); -int hl_device_set_debug_mode(struct hl_device *hdev, bool enable); -int create_hdev(struct hl_device **dev, struct pci_dev *pdev, - enum hl_asic_type asic_type, int minor); -void destroy_hdev(struct hl_device *hdev); +int hl_device_set_debug_mode(struct hl_device *hdev, struct hl_ctx *ctx, bool enable); int hl_hw_queues_create(struct hl_device *hdev); void hl_hw_queues_destroy(struct hl_device *hdev); int hl_hw_queue_send_cb_no_cmpl(struct hl_device *hdev, u32 hw_queue_id, @@ -2821,6 +2927,7 @@ int hl_ctx_init(struct hl_device *hdev, struct hl_ctx *ctx, bool is_kernel_ctx); void hl_ctx_do_release(struct kref *ref); void hl_ctx_get(struct hl_device *hdev, struct hl_ctx *ctx); int hl_ctx_put(struct hl_ctx *ctx); +struct hl_ctx *hl_get_compute_ctx(struct hl_device *hdev); struct hl_fence *hl_ctx_get_fence(struct hl_ctx *ctx, u64 seq); int hl_ctx_get_fences(struct hl_ctx *ctx, u64 *seq_arr, struct hl_fence **fence, u32 arr_len); @@ -2834,7 +2941,6 @@ int hl_device_resume(struct hl_device *hdev); int hl_device_reset(struct hl_device *hdev, u32 flags); void hl_hpriv_get(struct hl_fpriv *hpriv); int hl_hpriv_put(struct hl_fpriv *hpriv); -int hl_device_set_frequency(struct hl_device *hdev, enum hl_pll_frequency freq); int hl_device_utilization(struct hl_device *hdev, u32 *utilization); int hl_build_hwmon_channel_info(struct hl_device *hdev, @@ -2915,6 +3021,9 @@ int hl_mmu_unmap_page(struct hl_ctx *ctx, u64 virt_addr, u32 page_size, int hl_mmu_map_contiguous(struct hl_ctx *ctx, u64 virt_addr, u64 phys_addr, u32 size); int hl_mmu_unmap_contiguous(struct hl_ctx *ctx, u64 virt_addr, u32 size); +int hl_mmu_invalidate_cache(struct hl_device *hdev, bool is_hard, u32 flags); +int hl_mmu_invalidate_cache_range(struct hl_device *hdev, bool is_hard, + u32 flags, u32 asid, u64 va, u64 size); void hl_mmu_swap_out(struct hl_ctx *ctx); void hl_mmu_swap_in(struct hl_ctx *ctx); int hl_mmu_if_set_funcs(struct hl_device *hdev); @@ -2969,6 +3078,10 @@ int hl_fw_dynamic_send_protocol_cmd(struct hl_device *hdev, struct fw_load_mgr *fw_loader, enum comms_cmd cmd, unsigned int size, bool wait_ok, u32 timeout); +int hl_fw_dram_replaced_row_get(struct hl_device *hdev, + struct cpucp_hbm_row_info *info); +int hl_fw_dram_pending_row_get(struct hl_device *hdev, u32 *pend_rows_num); +int hl_fw_cpucp_engine_core_asid_set(struct hl_device *hdev, u32 asid); int hl_pci_bars_map(struct hl_device *hdev, const char * const name[3], bool is_wc[3]); int hl_pci_elbi_read(struct hl_device *hdev, u64 addr, u32 *data); |