Merge tag 'amd-drm-next-5.19-2022-04-29' of https://gitlab.freedesktop.org/agd5f/linux into drm-next

amd-drm-next-5.19-2022-04-29: amdgpu - RAS updates - SI dpm deadlock fix - Misc code cleanups - HDCP fixes - PSR fixes - DSC fixes - SDMA doorbell cleanups - S0ix fix - DC FP fix - Zen dom0 regression fix for APUs - IP discovery updates - Initial SoC21 support - Support for new vbios tables - Runtime PM fixes - Add PSP TA debugfs interface amdkfd: - Misc code cleanups - Ignore bogus MEC signals more efficiently - SVM fixes - Use bitmap helpers radeon: - Misc code cleanups - Spelling/grammer fixes From: Alex Deucher <alexander.deucher@amd.com> Link: https://patchwork.freedesktop.org/patch/msgid/20220429144853.5742-1-alexander.deucher@amd.com
author: Dave Airlie <airlied@redhat.com> 2022-05-06 15:05:27 +1000
committer: Dave Airlie <airlied@redhat.com> 2022-05-06 15:05:27 +1000
commit: b900352f9ddebc5c8dd30dc16218c4ff1b8c9147 (patch)
tree: b4b376bfa21a1dd65164b295f048eadc91f9ccec /drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
parent: 8d62a974ac5fa1609e57a54622eef71e87bace78 (diff)
parent: 3da2c38231a4c62dafdbd762a199cfacaccd0533 (diff)
download: linux-b900352f9ddebc5c8dd30dc16218c4ff1b8c9147.tar.bz2
1 files changed, 98 insertions, 27 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index ec709997c9c7..7e126dff004f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1515,12 +1515,97 @@ static int amdgpu_ras_fs_fini(struct amdgpu_device *adev)
 /* ras fs end */
 
 /* ih begin */
+
+/* For the hardware that cannot enable bif ring for both ras_controller_irq
+ * and ras_err_evnet_athub_irq ih cookies, the driver has to poll status
+ * register to check whether the interrupt is triggered or not, and properly
+ * ack the interrupt if it is there
+ */
+void amdgpu_ras_interrupt_fatal_error_handler(struct amdgpu_device *adev)
+{
+	if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__PCIE_BIF))
+		return;
+
+	if (adev->nbio.ras &&
+	    adev->nbio.ras->handle_ras_controller_intr_no_bifring)
+		adev->nbio.ras->handle_ras_controller_intr_no_bifring(adev);
+
+	if (adev->nbio.ras &&
+	    adev->nbio.ras->handle_ras_err_event_athub_intr_no_bifring)
+		adev->nbio.ras->handle_ras_err_event_athub_intr_no_bifring(adev);
+}
+
+static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *obj,
+				struct amdgpu_iv_entry *entry)
+{
+	bool poison_stat = true, need_reset = true;
+	struct amdgpu_device *adev = obj->adev;
+	struct ras_err_data err_data = {0, 0, 0, NULL};
+	struct amdgpu_ras_block_object *block_obj =
+		amdgpu_ras_get_ras_block(adev, obj->head.block, 0);
+
+	if (!adev->gmc.xgmi.connected_to_cpu)
+		amdgpu_umc_poison_handler(adev, &err_data, false);
+
+	/* both query_poison_status and handle_poison_consumption are optional */
+	if (block_obj && block_obj->hw_ops) {
+		if (block_obj->hw_ops->query_poison_status) {
+			poison_stat = block_obj->hw_ops->query_poison_status(adev);
+			if (!poison_stat)
+				dev_info(adev->dev, "No RAS poison status in %s poison IH.\n",
+						block_obj->ras_comm.name);
+		}
+
+		if (poison_stat && block_obj->hw_ops->handle_poison_consumption) {
+			poison_stat = block_obj->hw_ops->handle_poison_consumption(adev);
+			need_reset = poison_stat;
+		}
+	}
+
+	/* gpu reset is fallback for all failed cases */
+	if (need_reset)
+		amdgpu_ras_reset_gpu(adev);
+}
+
+static void amdgpu_ras_interrupt_poison_creation_handler(struct ras_manager *obj,
+				struct amdgpu_iv_entry *entry)
+{
+	dev_info(obj->adev->dev,
+		"Poison is created, no user action is needed.\n");
+}
+
+static void amdgpu_ras_interrupt_umc_handler(struct ras_manager *obj,
+				struct amdgpu_iv_entry *entry)
+{
+	struct ras_ih_data *data = &obj->ih_data;
+	struct ras_err_data err_data = {0, 0, 0, NULL};
+	int ret;
+
+	if (!data->cb)
+		return;
+
+	/* Let IP handle its data, maybe we need get the output
+	 * from the callback to update the error type/count, etc
+	 */
+	ret = data->cb(obj->adev, &err_data, entry);
+	/* ue will trigger an interrupt, and in that case
+	 * we need do a reset to recovery the whole system.
+	 * But leave IP do that recovery, here we just dispatch
+	 * the error.
+	 */
+	if (ret == AMDGPU_RAS_SUCCESS) {
+		/* these counts could be left as 0 if
+		 * some blocks do not count error number
+		 */
+		obj->err_data.ue_count += err_data.ue_count;
+		obj->err_data.ce_count += err_data.ce_count;
+	}
+}
+
 static void amdgpu_ras_interrupt_handler(struct ras_manager *obj)
 {
 	struct ras_ih_data *data = &obj->ih_data;
 	struct amdgpu_iv_entry entry;
-	int ret;
-	struct ras_err_data err_data = {0, 0, 0, NULL};
 
 	while (data->rptr != data->wptr) {
 		rmb();
@@ -1531,30 +1616,17 @@ static void amdgpu_ras_interrupt_handler(struct ras_manager *obj)
 		data->rptr = (data->aligned_element_size +
 				data->rptr) % data->ring_size;
 
-		if (data->cb) {
-			if (amdgpu_ras_is_poison_mode_supported(obj->adev) &&
-			    obj->head.block == AMDGPU_RAS_BLOCK__UMC)
-				dev_info(obj->adev->dev,
-						"Poison is created, no user action is needed.\n");
-			else {
-				/* Let IP handle its data, maybe we need get the output
-				 * from the callback to udpate the error type/count, etc
-				 */
-				memset(&err_data, 0, sizeof(err_data));
-				ret = data->cb(obj->adev, &err_data, &entry);
-				/* ue will trigger an interrupt, and in that case
-				 * we need do a reset to recovery the whole system.
-				 * But leave IP do that recovery, here we just dispatch
-				 * the error.
-				 */
-				if (ret == AMDGPU_RAS_SUCCESS) {
-					/* these counts could be left as 0 if
-					 * some blocks do not count error number
-					 */
-					obj->err_data.ue_count += err_data.ue_count;
-					obj->err_data.ce_count += err_data.ce_count;
-				}
-			}
+		if (amdgpu_ras_is_poison_mode_supported(obj->adev)) {
+			if (obj->head.block == AMDGPU_RAS_BLOCK__UMC)
+				amdgpu_ras_interrupt_poison_creation_handler(obj, &entry);
+			else
+				amdgpu_ras_interrupt_poison_consumption_handler(obj, &entry);
+		} else {
+			if (obj->head.block == AMDGPU_RAS_BLOCK__UMC)
+				amdgpu_ras_interrupt_umc_handler(obj, &entry);
+			else
+				dev_warn(obj->adev->dev,
+					"No RAS interrupt handler for non-UMC block with poison disabled.\n");
 		}
 	}
 }
@@ -1849,7 +1921,6 @@ static int amdgpu_ras_realloc_eh_data_space(struct amdgpu_device *adev,
 	void *bps = kmalloc(align_space * sizeof(*data->bps), GFP_KERNEL);
 
 	if (!bps) {
-		kfree(bps);
 		return -ENOMEM;
 	}
author	Dave Airlie <airlied@redhat.com>	2022-05-06 15:05:27 +1000
committer	Dave Airlie <airlied@redhat.com>	2022-05-06 15:05:27 +1000
commit	b900352f9ddebc5c8dd30dc16218c4ff1b8c9147 (patch)
tree	b4b376bfa21a1dd65164b295f048eadc91f9ccec /drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
parent	8d62a974ac5fa1609e57a54622eef71e87bace78 (diff)
parent	3da2c38231a4c62dafdbd762a199cfacaccd0533 (diff)
download	linux-b900352f9ddebc5c8dd30dc16218c4ff1b8c9147.tar.bz2