From aa6df6533b8f9ead98889baa92e2b19793b1c77e Mon Sep 17 00:00:00 2001 From: Oded Gabbay Date: Mon, 11 Jan 2021 15:00:38 +0200 Subject: habanalabs: fix reset process in case of failures There are some points in the reset process where if the code fails for some reason, and the system admin tries to initiate the reset process again we will get a kernel panic. This is because there aren't any protections in different fini functions that are called during the reset process. The protections that are added in this patch make sure that if the fini functions are called multiple times, without calling init functions between them, there won't be double release of already released resources. Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/device.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/misc/habanalabs/common/device.c') diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c index 1456eabf9601..1ea57d86caa3 100644 --- a/drivers/misc/habanalabs/common/device.c +++ b/drivers/misc/habanalabs/common/device.c @@ -1037,7 +1037,7 @@ kill_processes: if (hard_reset) { /* Release kernel context */ - if (hl_ctx_put(hdev->kernel_ctx) == 1) + if (hdev->kernel_ctx && hl_ctx_put(hdev->kernel_ctx) == 1) hdev->kernel_ctx = NULL; hl_vm_fini(hdev); hl_mmu_fini(hdev); -- cgit v1.2.3 From 2dc4a6d79168e7e426e8ddf8e7219c9ffd13b2b1 Mon Sep 17 00:00:00 2001 From: Oded Gabbay Date: Mon, 18 Jan 2021 21:39:46 +0200 Subject: habanalabs: disable FW events on device removal When device is removed, we need to make sure the F/W won't send us any more events because during the remove process we disable the interrupts. Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/device.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'drivers/misc/habanalabs/common/device.c') diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c index 1ea57d86caa3..69d04eca767f 100644 --- a/drivers/misc/habanalabs/common/device.c +++ b/drivers/misc/habanalabs/common/device.c @@ -1487,6 +1487,15 @@ void hl_device_fini(struct hl_device *hdev) } } + /* Disable PCI access from device F/W so it won't send us additional + * interrupts. We disable MSI/MSI-X at the halt_engines function and we + * can't have the F/W sending us interrupts after that. We need to + * disable the access here because if the device is marked disable, the + * message won't be send. Also, in case of heartbeat, the device CPU is + * marked as disable so this message won't be sent + */ + hl_fw_send_pci_access_msg(hdev, CPUCP_PACKET_DISABLE_PCI_ACCESS); + /* Mark device as disabled */ hdev->disabled = true; -- cgit v1.2.3