summaryrefslogtreecommitdiffstats
path: root/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
diff options
context:
space:
mode:
authorAndrey Grodzovsky <andrey.grodzovsky@amd.com>2019-09-13 17:40:32 -0500
committerAlex Deucher <alexander.deucher@amd.com>2019-09-13 17:41:05 -0500
commit7c6e68c777f109484559a35b125a773439bbd319 (patch)
treeaca19eee2211f8b1684a9ef789a53fe5c8ba1982 /drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
parent12ffa55da60f8355a5e485bc6d612257a303147e (diff)
downloadlinux-7c6e68c777f109484559a35b125a773439bbd319.tar.bz2
drm/amdgpu: Avoid HW GPU reset for RAS.
Problem: Under certain conditions, when some IP bocks take a RAS error, we can get into a situation where a GPU reset is not possible due to issues in RAS in SMU/PSP. Temporary fix until proper solution in PSP/SMU is ready: When uncorrectable error happens the DF will unconditionally broadcast error event packets to all its clients/slave upon receiving fatal error event and freeze all its outbound queues, err_event_athub interrupt will be triggered. In such case and we use this interrupt to issue GPU reset. THe GPU reset code is modified for such case to avoid HW reset, only stops schedulers, deatches all in progress and not yet scheduled job's fences, set error code on them and signals. Also reject any new incoming job submissions from user space. All this is done to notify the applications of the problem. v2: Extract amdgpu_amdkfd_pre/post_reset from amdgpu_device_lock/unlock_adev Move amdgpu_job_stop_all_jobs_on_sched to amdgpu_job.c Remove print param from amdgpu_ras_query_error_count v3: Update based on prevoius bug fixing patch to properly call amdgpu_amdkfd_pre_reset for other XGMI hive memebers. Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com> Acked-by: Felix Kuehling <Felix.Kuehling@amd.com> Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c')
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c22
1 files changed, 20 insertions, 2 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 0b466d101f53..d7bf8fc10869 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -25,6 +25,8 @@
#include <linux/list.h>
#include <linux/module.h>
#include <linux/uaccess.h>
+#include <linux/reboot.h>
+#include <linux/syscalls.h>
#include "amdgpu.h"
#include "amdgpu_ras.h"
@@ -66,6 +68,9 @@ const char *ras_block_string[] = {
/* inject address is 52 bits */
#define RAS_UMC_INJECT_ADDR_LIMIT (0x1ULL << 52)
+
+atomic_t amdgpu_ras_in_intr = ATOMIC_INIT(0);
+
static int amdgpu_ras_reserve_vram(struct amdgpu_device *adev,
uint64_t offset, uint64_t size,
struct amdgpu_bo **bo_ptr);
@@ -190,6 +195,10 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
return 0;
}
+
+static struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev,
+ struct ras_common_if *head);
+
/**
* DOC: AMDGPU RAS debugfs control interface
*
@@ -629,12 +638,14 @@ int amdgpu_ras_error_query(struct amdgpu_device *adev,
info->ue_count = obj->err_data.ue_count;
info->ce_count = obj->err_data.ce_count;
- if (err_data.ce_count)
+ if (err_data.ce_count) {
dev_info(adev->dev, "%ld correctable errors detected in %s block\n",
obj->err_data.ce_count, ras_block_str(info->head.block));
- if (err_data.ue_count)
+ }
+ if (err_data.ue_count) {
dev_info(adev->dev, "%ld uncorrectable errors detected in %s block\n",
obj->err_data.ue_count, ras_block_str(info->head.block));
+ }
return 0;
}
@@ -1731,3 +1742,10 @@ int amdgpu_ras_fini(struct amdgpu_device *adev)
return 0;
}
+
+void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev)
+{
+ if (atomic_cmpxchg(&amdgpu_ras_in_intr, 0, 1) == 0) {
+ DRM_WARN("RAS event of type ERREVENT_ATHUB_INTERRUPT detected! Stopping all GPU jobs.\n");
+ }
+}