author    Horace Chen <horace.chen@amd.com>    2021-01-20 22:03:28 +0800
committer Alex Deucher <alexander.deucher@amd.com>    2021-01-25 17:45:16 -0500
commit    91fb309d8294be5ab03746638e10bb3e5680f348 (patch)
tree      74e197780fc7941ae409f12a61540c7bae762153 /drivers/gpu/drm/amd/amdgpu
parent    eda1068dc995fbc87eee04496a3414372a8ef63d (diff)
drm/amdgpu: fix race issue when jobs on 2 rings time out
Fix a race issue when jobs on 2 rings time out simultaneously. If 2 rings time out at the same time, amdgpu_device_gpu_recover will be re-entered. The adev->gmc.xgmi.head node is then grabbed by 2 local linked lists, which may lead to wild pointers while iterating. Lock the device early to prevent the node from being added to 2 different lists. Also increase karma for the skipped job, since that job has also timed out and should be marked guilty.

Signed-off-by: Horace Chen <horace.chen@amd.com>
Reviewed-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
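To make the failure mode concrete, the sketch below is a minimal userspace illustration, not the driver code; fake_adev and recover_thread are invented names. It shows two concurrent timeout handlers splicing the same embedded list node into their own stack-local lists, which is the wild-pointer hazard described above:

/*
 * Minimal userspace sketch of the hazard: one embedded list node added
 * to two different stack-local lists by two concurrent "recovery" threads.
 */
#include <pthread.h>
#include <stdio.h>

struct list_head { struct list_head *next, *prev; };

static void INIT_LIST_HEAD(struct list_head *h) { h->next = h->prev = h; }

static void list_add_tail(struct list_head *new, struct list_head *head)
{
	new->prev = head->prev;
	new->next = head;
	head->prev->next = new;
	head->prev = new;
}

/* Stand-in for amdgpu_device: the list node is embedded in the device. */
struct fake_adev { struct list_head head; } shared_adev;

static void *recover_thread(void *arg)
{
	/* Each "gpu_recover" builds its own stack-local device list ... */
	struct list_head device_list;
	struct list_head *p;

	(void)arg;
	INIT_LIST_HEAD(&device_list);

	/*
	 * ... and links the *same* shared node into it.  The second add
	 * silently rewires shared_adev.head toward this thread's stack,
	 * so the other thread's iteration can wander into stack memory
	 * it does not own, or never terminate.
	 */
	list_add_tail(&shared_adev.head, &device_list);

	for (p = device_list.next; p != &device_list; p = p->next)
		; /* may already be following pointers into the other stack */

	return NULL;
}

int main(void)
{
	pthread_t a, b;

	INIT_LIST_HEAD(&shared_adev.head);
	pthread_create(&a, NULL, recover_thread, NULL);
	pthread_create(&b, NULL, recover_thread, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	puts("done (the misbehaviour is timing dependent)");
	return 0;
}

Building the list only after the per-device lock is held, as this patch does, prevents the second handler from rewiring the node while the first one is iterating.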
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu')
-rw-r--r--    drivers/gpu/drm/amd/amdgpu/amdgpu_device.c    69
1 file changed, 59 insertions, 10 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index e4e1f04d9164..9ae0f2d68281 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4461,6 +4461,46 @@ static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
up_write(&adev->reset_sem);
}
+/*
+ * Lock a list of amdgpu devices in a hive safely. If this is not a hive
+ * with multiple nodes, it behaves the same as amdgpu_device_lock_adev.
+ *
+ * Unlocking does not require a rollback.
+ */
+static int amdgpu_device_lock_hive_adev(struct amdgpu_device *adev, struct amdgpu_hive_info *hive)
+{
+ struct amdgpu_device *tmp_adev = NULL;
+
+ if (adev->gmc.xgmi.num_physical_nodes > 1) {
+ if (!hive) {
+ dev_err(adev->dev, "Hive is NULL while device has multiple xgmi nodes");
+ return -ENODEV;
+ }
+ list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
+ if (!amdgpu_device_lock_adev(tmp_adev, hive))
+ goto roll_back;
+ }
+ } else if (!amdgpu_device_lock_adev(adev, hive))
+ return -EAGAIN;
+
+ return 0;
+roll_back:
+ if (!list_is_first(&tmp_adev->gmc.xgmi.head, &hive->device_list)) {
+ /*
+ * If the lock iteration breaks in the middle of a hive, there may be
+ * a race, or a hive device may have been locked up independently.
+ * We may or may not be in trouble, so roll back the locks taken so
+ * far and print a warning.
+ */
+ dev_warn(tmp_adev->dev, "Hive lock iteration broke in the middle. Rolling back to unlock");
+ list_for_each_entry_continue_reverse(tmp_adev, &hive->device_list, gmc.xgmi.head) {
+ amdgpu_device_unlock_adev(tmp_adev);
+ }
+ }
+ return -EAGAIN;
+}
+
static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
{
struct pci_dev *p = NULL;
@@ -4574,20 +4614,36 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
job ? job->base.id : -1, hive->hive_id);
amdgpu_put_xgmi_hive(hive);
+ if (job)
+ drm_sched_increase_karma(&job->base);
return 0;
}
mutex_lock(&hive->hive_lock);
}
/*
+ * Lock the device before we try to operate on the linked list.
+ * If we didn't get the device lock, don't touch the linked list since
+ * others may be iterating over it.
+ */
+ r = amdgpu_device_lock_hive_adev(adev, hive);
+ if (r) {
+ dev_info(adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress",
+ job ? job->base.id : -1);
+
+ /* even though we skipped this reset, we still need to mark the job guilty */
+ if (job)
+ drm_sched_increase_karma(&job->base);
+ goto skip_recovery;
+ }
+
+ /*
* Build list of devices to reset.
* In case we are in XGMI hive mode, resort the device list
* to put adev in the 1st position.
*/
INIT_LIST_HEAD(&device_list);
if (adev->gmc.xgmi.num_physical_nodes > 1) {
- if (!hive)
- return -ENODEV;
if (!list_is_first(&adev->gmc.xgmi.head, &hive->device_list))
list_rotate_to_front(&adev->gmc.xgmi.head, &hive->device_list);
device_list_handle = &hive->device_list;
@@ -4598,13 +4654,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
/* block all schedulers and reset given job's ring */
list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
- if (!amdgpu_device_lock_adev(tmp_adev, hive)) {
- dev_info(tmp_adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress",
- job ? job->base.id : -1);
- r = 0;
- goto skip_recovery;
- }
-
/*
* Try to put the audio codec into suspend state
* before gpu reset started.
@@ -4742,7 +4791,7 @@ skip_recovery:
amdgpu_put_xgmi_hive(hive);
}
- if (r)
+ if (r && r != -EAGAIN)
dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
return r;
}
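For reference, the all-or-nothing locking that amdgpu_device_lock_hive_adev() performs can be sketched in plain C roughly as follows. This is a generic userspace illustration with hypothetical names (node, try_lock_node, lock_all_or_none), not the driver implementation:

#include <pthread.h>
#include <stdbool.h>
#include <stddef.h>

struct node {
	pthread_mutex_t lock;
	struct node *next;
};

static bool try_lock_node(struct node *n)
{
	return pthread_mutex_trylock(&n->lock) == 0;
}

static void unlock_node(struct node *n)
{
	pthread_mutex_unlock(&n->lock);
}

/* Lock every node in the list, or none of them. */
static int lock_all_or_none(struct node *head)
{
	struct node *n, *stop = NULL;

	for (n = head; n; n = n->next) {
		if (!try_lock_node(n)) {
			stop = n;	/* first node we failed to lock */
			goto roll_back;
		}
	}
	return 0;

roll_back:
	/*
	 * Release the locks taken before 'stop'.  The driver walks the hive
	 * list in reverse with list_for_each_entry_continue_reverse(); here a
	 * forward walk over a singly linked list has the same effect.
	 */
	for (n = head; n != stop; n = n->next)
		unlock_node(n);
	return -1;	/* caller bails out, like -EAGAIN in the driver */
}

As in the patch, a failed trylock releases whatever was already taken and reports failure, so the caller bails out instead of touching a device list that another recovery thread owns.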