summaryrefslogtreecommitdiffstats
path: root/drivers/misc/habanalabs/gaudi2/gaudi2.c
diff options
context:
space:
mode:
authorTal Cohen <talcohen@habana.ai>2022-08-18 12:54:23 +0300
committerOded Gabbay <ogabbay@kernel.org>2022-09-19 15:08:38 +0300
commit6f0818c9fc9b81d8a303a8d3fb1826d71777f7ed (patch)
treee5af509d98db6b589ebb35ddcb570e2519ec5af5 /drivers/misc/habanalabs/gaudi2/gaudi2.c
parentc833ac1a5f34a21e9e9f8605b2f3f9f8dcaab6a0 (diff)
downloadlinux-6f0818c9fc9b81d8a303a8d3fb1826d71777f7ed.tar.bz2
habanalabs: new notifier events for device state
Add new notifier events that inform several device states. General H/W error raised on device general H/W error occurs. User engine error is raised when a device engine informs of an error. Signed-off-by: Tal Cohen <talcohen@habana.ai> Reviewed-by: Oded Gabbay <ogabbay@kernel.org> Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
Diffstat (limited to 'drivers/misc/habanalabs/gaudi2/gaudi2.c')
-rw-r--r--drivers/misc/habanalabs/gaudi2/gaudi2.c54
1 files changed, 53 insertions, 1 deletions
diff --git a/drivers/misc/habanalabs/gaudi2/gaudi2.c b/drivers/misc/habanalabs/gaudi2/gaudi2.c
index 60694b8ed6fe..f749f7377ea6 100644
--- a/drivers/misc/habanalabs/gaudi2/gaudi2.c
+++ b/drivers/misc/habanalabs/gaudi2/gaudi2.c
@@ -8530,6 +8530,7 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent
struct gaudi2_device *gaudi2 = hdev->asic_specific;
bool reset_required = false, skip_reset = false;
int index, sbte_index;
+ u64 event_mask = 0;
u16 event_type;
ctl = le32_to_cpu(eq_entry->hdr.ctl);
@@ -8551,6 +8552,7 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent
fallthrough;
case GAUDI2_EVENT_ROTATOR0_SERR ... GAUDI2_EVENT_ROTATOR1_DERR:
reset_flags |= HL_DRV_RESET_FW_FATAL_ERR;
+ event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR;
reset_required = gaudi2_handle_ecc_event(hdev, event_type, &eq_entry->ecc_data);
break;
@@ -8560,21 +8562,25 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent
fallthrough;
case GAUDI2_EVENT_NIC0_QM0 ... GAUDI2_EVENT_NIC11_QM1:
gaudi2_handle_qman_err(hdev, event_type);
+ event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR;
break;
case GAUDI2_EVENT_ARC_AXI_ERROR_RESPONSE_0:
reset_flags |= HL_DRV_RESET_FW_FATAL_ERR;
gaudi2_handle_arc_farm_sei_err(hdev);
+ event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR;
break;
case GAUDI2_EVENT_CPU_AXI_ERR_RSP:
gaudi2_handle_cpu_sei_err(hdev);
+ event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR;
break;
case GAUDI2_EVENT_PDMA_CH0_AXI_ERR_RSP:
case GAUDI2_EVENT_PDMA_CH1_AXI_ERR_RSP:
reset_flags |= HL_DRV_RESET_FW_FATAL_ERR;
gaudi2_handle_qm_sei_err(hdev, event_type, &eq_entry->razwi_info);
+ event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR;
break;
case GAUDI2_EVENT_ROTATOR0_AXI_ERROR_RESPONSE:
@@ -8582,6 +8588,7 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent
index = event_type - GAUDI2_EVENT_ROTATOR0_AXI_ERROR_RESPONSE;
gaudi2_handle_rot_err(hdev, index, &eq_entry->razwi_with_intr_cause);
gaudi2_handle_qm_sei_err(hdev, event_type, NULL);
+ event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR;
break;
case GAUDI2_EVENT_TPC0_AXI_ERR_RSP ... GAUDI2_EVENT_TPC24_AXI_ERR_RSP:
@@ -8589,11 +8596,13 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent
gaudi2_tpc_ack_interrupts(hdev, index, "AXI_ERR_RSP",
&eq_entry->razwi_with_intr_cause);
gaudi2_handle_qm_sei_err(hdev, event_type, NULL);
+ event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR;
break;
case GAUDI2_EVENT_DEC0_AXI_ERR_RSPONSE ... GAUDI2_EVENT_DEC9_AXI_ERR_RSPONSE:
index = event_type - GAUDI2_EVENT_DEC0_AXI_ERR_RSPONSE;
gaudi2_handle_dec_err(hdev, index, "AXI_ERR_RESPONSE", &eq_entry->razwi_info);
+ event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR;
break;
case GAUDI2_EVENT_TPC0_KERNEL_ERR:
@@ -8624,6 +8633,7 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent
index = (event_type - GAUDI2_EVENT_TPC0_KERNEL_ERR) /
(GAUDI2_EVENT_TPC1_KERNEL_ERR - GAUDI2_EVENT_TPC0_KERNEL_ERR);
gaudi2_tpc_ack_interrupts(hdev, index, "KRN_ERR", &eq_entry->razwi_with_intr_cause);
+ event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR;
break;
case GAUDI2_EVENT_DEC0_SPI:
@@ -8639,6 +8649,7 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent
index = (event_type - GAUDI2_EVENT_DEC0_SPI) /
(GAUDI2_EVENT_DEC1_SPI - GAUDI2_EVENT_DEC0_SPI);
gaudi2_handle_dec_err(hdev, index, "SPI", &eq_entry->razwi_info);
+ event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR;
break;
case GAUDI2_EVENT_MME0_CTRL_AXI_ERROR_RESPONSE:
@@ -8651,6 +8662,7 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent
gaudi2_handle_mme_err(hdev, index,
"CTRL_AXI_ERROR_RESPONSE", &eq_entry->razwi_info);
gaudi2_handle_qm_sei_err(hdev, event_type, NULL);
+ event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR;
break;
case GAUDI2_EVENT_MME0_QMAN_SW_ERROR:
@@ -8661,6 +8673,7 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent
(GAUDI2_EVENT_MME1_QMAN_SW_ERROR -
GAUDI2_EVENT_MME0_QMAN_SW_ERROR);
gaudi2_handle_mme_err(hdev, index, "QMAN_SW_ERROR", &eq_entry->razwi_info);
+ event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR;
break;
case GAUDI2_EVENT_MME0_WAP_SOURCE_RESULT_INVALID:
@@ -8671,50 +8684,58 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent
(GAUDI2_EVENT_MME1_WAP_SOURCE_RESULT_INVALID -
GAUDI2_EVENT_MME0_WAP_SOURCE_RESULT_INVALID);
gaudi2_handle_mme_wap_err(hdev, index, &eq_entry->razwi_info);
+ event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR;
break;
case GAUDI2_EVENT_KDMA_CH0_AXI_ERR_RSP:
case GAUDI2_EVENT_KDMA0_CORE:
gaudi2_handle_kdma_core_event(hdev,
le64_to_cpu(eq_entry->intr_cause.intr_cause_data));
+ event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR;
break;
case GAUDI2_EVENT_HDMA2_CORE ... GAUDI2_EVENT_PDMA1_CORE:
gaudi2_handle_dma_core_event(hdev,
le64_to_cpu(eq_entry->intr_cause.intr_cause_data));
+ event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR;
break;
case GAUDI2_EVENT_PCIE_ADDR_DEC_ERR:
gaudi2_print_pcie_addr_dec_info(hdev,
le64_to_cpu(eq_entry->intr_cause.intr_cause_data));
reset_flags |= HL_DRV_RESET_FW_FATAL_ERR;
+ event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR;
break;
case GAUDI2_EVENT_HMMU0_PAGE_FAULT_OR_WR_PERM ... GAUDI2_EVENT_HMMU12_SECURITY_ERROR:
case GAUDI2_EVENT_HMMU_0_AXI_ERR_RSP ... GAUDI2_EVENT_HMMU_12_AXI_ERR_RSP:
- case GAUDI2_EVENT_PMMU0_PAGE_FAULT_WR_PERM ... GAUDI2_EVENT_PMMU0_SECURITY_ERROR:
case GAUDI2_EVENT_PMMU_AXI_ERR_RSP_0:
gaudi2_handle_mmu_spi_sei_err(hdev, event_type);
reset_flags |= HL_DRV_RESET_FW_FATAL_ERR;
+ event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR;
break;
case GAUDI2_EVENT_HIF0_FATAL ... GAUDI2_EVENT_HIF12_FATAL:
gaudi2_handle_hif_fatal(hdev, event_type,
le64_to_cpu(eq_entry->intr_cause.intr_cause_data));
reset_flags |= HL_DRV_RESET_FW_FATAL_ERR;
+ event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR;
break;
case GAUDI2_EVENT_PMMU_FATAL_0:
gaudi2_handle_pif_fatal(hdev,
le64_to_cpu(eq_entry->intr_cause.intr_cause_data));
reset_flags |= HL_DRV_RESET_FW_FATAL_ERR;
+ event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR;
break;
case GAUDI2_EVENT_PSOC63_RAZWI_OR_PID_MIN_MAX_INTERRUPT:
gaudi2_ack_psoc_razwi_event_handler(hdev);
+ event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR;
break;
case GAUDI2_EVENT_HBM0_MC0_SEI_SEVERE ... GAUDI2_EVENT_HBM5_MC1_SEI_NON_SEVERE:
+ event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR;
if (gaudi2_handle_hbm_mc_sei_err(hdev, event_type, &eq_entry->sei_data)) {
reset_flags |= HL_DRV_RESET_FW_FATAL_ERR;
reset_required = true;
@@ -8723,25 +8744,31 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent
case GAUDI2_EVENT_HBM_CATTRIP_0 ... GAUDI2_EVENT_HBM_CATTRIP_5:
gaudi2_handle_hbm_cattrip(hdev, le64_to_cpu(eq_entry->intr_cause.intr_cause_data));
+ event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR;
break;
case GAUDI2_EVENT_HBM0_MC0_SPI ... GAUDI2_EVENT_HBM5_MC1_SPI:
gaudi2_handle_hbm_mc_spi(hdev, le64_to_cpu(eq_entry->intr_cause.intr_cause_data));
+ event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR;
break;
case GAUDI2_EVENT_PCIE_DRAIN_COMPLETE:
gaudi2_handle_pcie_drain(hdev, &eq_entry->pcie_drain_ind_data);
+ event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR;
break;
case GAUDI2_EVENT_PSOC59_RPM_ERROR_OR_DRAIN:
gaudi2_handle_psoc_drain(hdev, le64_to_cpu(eq_entry->intr_cause.intr_cause_data));
+ event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR;
break;
case GAUDI2_EVENT_CPU_AXI_ECC:
reset_flags |= HL_DRV_RESET_FW_FATAL_ERR;
+ event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR;
break;
case GAUDI2_EVENT_CPU_L2_RAM_ECC:
reset_flags |= HL_DRV_RESET_FW_FATAL_ERR;
+ event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR;
break;
case GAUDI2_EVENT_MME0_SBTE0_AXI_ERR_RSP ... GAUDI2_EVENT_MME0_SBTE4_AXI_ERR_RSP:
case GAUDI2_EVENT_MME1_SBTE0_AXI_ERR_RSP ... GAUDI2_EVENT_MME1_SBTE4_AXI_ERR_RSP:
@@ -8755,17 +8782,24 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent
GAUDI2_EVENT_MME0_SBTE0_AXI_ERR_RSP);
gaudi2_handle_mme_sbte_err(hdev, index, sbte_index,
le64_to_cpu(eq_entry->intr_cause.intr_cause_data));
+ event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR;
break;
case GAUDI2_EVENT_VM0_ALARM_A ... GAUDI2_EVENT_VM3_ALARM_B:
reset_flags |= HL_DRV_RESET_FW_FATAL_ERR;
+ event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR;
break;
case GAUDI2_EVENT_PSOC_AXI_ERR_RSP:
+ event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR;
+ break;
case GAUDI2_EVENT_PSOC_PRSTN_FALL:
+ event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR;
break;
case GAUDI2_EVENT_PCIE_APB_TIMEOUT:
reset_flags |= HL_DRV_RESET_FW_FATAL_ERR;
+ event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR;
break;
case GAUDI2_EVENT_PCIE_FATAL_ERR:
+ event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR;
break;
case GAUDI2_EVENT_TPC0_BMON_SPMU:
case GAUDI2_EVENT_TPC1_BMON_SPMU:
@@ -8817,6 +8851,7 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent
case GAUDI2_EVENT_DEC8_BMON_SPMU:
case GAUDI2_EVENT_DEC9_BMON_SPMU:
case GAUDI2_EVENT_ROTATOR0_BMON_SPMU ... GAUDI2_EVENT_SM3_BMON_SPMU:
+ event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR;
break;
case GAUDI2_EVENT_CPU_FIX_POWER_ENV_S:
@@ -8824,43 +8859,53 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent
case GAUDI2_EVENT_CPU_FIX_THERMAL_ENV_S:
case GAUDI2_EVENT_CPU_FIX_THERMAL_ENV_E:
gaudi2_print_clk_change_info(hdev, event_type);
+ event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR;
break;
case GAUDI2_EVENT_CPU_PKT_QUEUE_OUT_SYNC:
gaudi2_print_out_of_sync_info(hdev, &eq_entry->pkt_sync_err);
+ event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR;
break;
case GAUDI2_EVENT_PCIE_FLR_REQUESTED:
+ event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR;
/* Do nothing- FW will handle it */
break;
case GAUDI2_EVENT_PCIE_P2P_MSIX:
gaudi2_handle_pcie_p2p_msix(hdev);
+ event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR;
break;
case GAUDI2_EVENT_SM0_AXI_ERROR_RESPONSE ... GAUDI2_EVENT_SM3_AXI_ERROR_RESPONSE:
index = event_type - GAUDI2_EVENT_SM0_AXI_ERROR_RESPONSE;
skip_reset = !gaudi2_handle_sm_err(hdev, index);
+ event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR;
break;
case GAUDI2_EVENT_PSOC_MME_PLL_LOCK_ERR ... GAUDI2_EVENT_DCORE2_HBM_PLL_LOCK_ERR:
+ event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR;
break;
case GAUDI2_EVENT_CPU_CPLD_SHUTDOWN_CAUSE:
dev_info(hdev->dev, "CPLD shutdown cause, reset reason: 0x%llx\n",
le64_to_cpu(eq_entry->data[0]));
+ event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR;
break;
case GAUDI2_EVENT_CPU_CPLD_SHUTDOWN_EVENT:
dev_err(hdev->dev, "CPLD shutdown event, reset reason: 0x%llx\n",
le64_to_cpu(eq_entry->data[0]));
+ event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR;
break;
case GAUDI2_EVENT_CPU_PKT_SANITY_FAILED:
gaudi2_print_cpu_pkt_failure_info(hdev, &eq_entry->pkt_sync_err);
+ event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR;
break;
case GAUDI2_EVENT_ARC_DCCM_FULL:
hl_arc_event_handle(hdev, &eq_entry->arc_data);
+ event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR;
break;
default:
@@ -8876,15 +8921,22 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent
if (!gaudi2_irq_map_table[event_type].msg)
hl_fw_unmask_irq(hdev, event_type);
+ if (event_mask)
+ hl_notifier_event_send_all(hdev, event_mask);
+
return;
reset_device:
if (hdev->hard_reset_on_fw_events) {
hl_device_reset(hdev, reset_flags);
+ event_mask |= HL_NOTIFIER_EVENT_DEVICE_RESET;
} else {
if (!gaudi2_irq_map_table[event_type].msg)
hl_fw_unmask_irq(hdev, event_type);
}
+
+ if (event_mask)
+ hl_notifier_event_send_all(hdev, event_mask);
}
static int gaudi2_memset_device_memory(struct hl_device *hdev, u64 addr, u64 size, u64 val)