drm/scheduler: rework job destruction

We now destroy finished jobs from the worker thread to make sure that we never destroy a job that is currently in timeout processing. This way we avoid holding the lock around the ring mirror list in drm_sched_stop, which should solve a deadlock reported by a user. v2: Remove unused variable. v4: Move guilty job free into sched code. v5: Move sched->hw_rq_count to drm_sched_start to account for the counter decrement in drm_sched_stop even when we don't call resubmit jobs because the guilty job did signal. v6: Remove unused variable. Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=109692 Acked-by: Chunming Zhou <david1.zhou@amd.com> Signed-off-by: Christian König <christian.koenig@amd.com> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> Link: https://patchwork.freedesktop.org/patch/msgid/1555599624-12285-3-git-send-email-andrey.grodzovsky@amd.com
author: Christian König <christian.koenig@amd.com> 2019-04-18 11:00:21 -0400
committer: Alex Deucher <alexander.deucher@amd.com> 2019-05-02 15:45:48 -0500
commit: 5918045c4ed492fb5813f980dcf89a90fefd0a4e (patch)
tree: a59c6f9bb006645b8fc46164d7fb53cb38b24294 /drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
parent: b3198c38f02d54a5e964258a2180d502abe6eaf0 (diff)
download: linux-5918045c4ed492fb5813f980dcf89a90fefd0a4e.tar.bz2
1 files changed, 3 insertions, 6 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 7cee269ec3e3..a0e165c91a78 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3334,7 +3334,7 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
 		if (!ring || !ring->sched.thread)
 			continue;
 
-		drm_sched_stop(&ring->sched);
+		drm_sched_stop(&ring->sched, &job->base);
 
 		/* after all hw jobs are reset, hw fence is meaningless, so force_completion */
 		amdgpu_fence_driver_force_completion(ring);
@@ -3343,8 +3343,6 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
 	if(job)
 		drm_sched_increase_karma(&job->base);
 
-
-
 	if (!amdgpu_sriov_vf(adev)) {
 
 		if (!need_full_reset)
@@ -3482,8 +3480,7 @@ end:
 	return r;
 }
 
-static void amdgpu_device_post_asic_reset(struct amdgpu_device *adev,
-					  struct amdgpu_job *job)
+static void amdgpu_device_post_asic_reset(struct amdgpu_device *adev)
 {
 	int i;
 
@@ -3623,7 +3620,7 @@ retry:	/* Rest of adevs pre asic reset from XGMI hive. */
 
 	/* Post ASIC reset for all devs .*/
 	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
-		amdgpu_device_post_asic_reset(tmp_adev, tmp_adev == adev ? job : NULL);
+		amdgpu_device_post_asic_reset(tmp_adev);
 
 		if (r) {
 			/* bad news, how to tell it to userspace ? */
author	Christian König <christian.koenig@amd.com>	2019-04-18 11:00:21 -0400
committer	Alex Deucher <alexander.deucher@amd.com>	2019-05-02 15:45:48 -0500
commit	5918045c4ed492fb5813f980dcf89a90fefd0a4e (patch)
tree	a59c6f9bb006645b8fc46164d7fb53cb38b24294 /drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
parent	b3198c38f02d54a5e964258a2180d502abe6eaf0 (diff)
download	linux-5918045c4ed492fb5813f980dcf89a90fefd0a4e.tar.bz2