drm/amdgpu: Rework reset domain to be refcounted.

The reset domain contains register access semaphor now and so needs to be present as long as each device in a hive needs it and so it cannot be binded to XGMI hive life cycle. Adress this by making reset domain refcounted and pointed by each member of the hive and the hive itself. v4: Fix crash on boot witrh XGMI hive by adding type to reset_domain. XGMI will only create a new reset_domain if prevoius was of single device type meaning it's first boot. Otherwsie it will take a refocunt to exsiting reset_domain from the amdgou device. Add a wrapper around reset_domain->refcount get/put and a wrapper around send to reset wq (Lijo) Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com> Acked-by: Christian König <christian.koenig@amd.com> Link: https://www.spinics.net/lists/amd-gfx/msg74121.html
author: Andrey Grodzovsky <andrey.grodzovsky@amd.com> 2022-01-21 17:23:32 -0500
committer: Andrey Grodzovsky <andrey.grodzovsky@amd.com> 2022-02-09 12:17:09 -0500
commit: cfbb6b0047448e2d986160d9f30d60f604d9ad0f (patch)
tree: fdb18fa843a69019c49e92887cee48034d6b2e5b /drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
parent: f287a3c5b03f51efa8d8f3e141a79177f91047e0 (diff)
download: linux-cfbb6b0047448e2d986160d9f30d60f604d9ad0f.tar.bz2
1 files changed, 27 insertions, 17 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index e3c0ec684a85..d61bc0a0457c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2316,7 +2316,7 @@ static int amdgpu_device_init_schedulers(struct amdgpu_device *adev)
 
 		r = drm_sched_init(&ring->sched, &amdgpu_sched_ops,
 				   ring->num_hw_submission, amdgpu_job_hang_limit,
-				   timeout, adev->reset_domain.wq, ring->sched_score, ring->name);
+				   timeout, adev->reset_domain->wq, ring->sched_score, ring->name);
 		if (r) {
 			DRM_ERROR("Failed to create scheduler on ring %s.\n",
 				  ring->name);
@@ -2439,24 +2439,22 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev)
 	if (r)
 		goto init_failed;
 
+	/**
+	 * In case of XGMI grab extra reference for reset domain for this device
+	 */
 	if (adev->gmc.xgmi.num_physical_nodes > 1) {
-		struct amdgpu_hive_info *hive;
+		if (amdgpu_xgmi_add_device(adev) == 0) {
+			struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
 
-		amdgpu_xgmi_add_device(adev);
+			if (!hive->reset_domain ||
+			    !amdgpu_reset_get_reset_domain(hive->reset_domain)) {
+				r = -ENOENT;
+				goto init_failed;
+			}
 
-		hive = amdgpu_get_xgmi_hive(adev);
-		if (!hive || !hive->reset_domain.wq) {
-			DRM_ERROR("Failed to obtain reset domain info for XGMI hive:%llx", hive->hive_id);
-			r = -EINVAL;
-			goto init_failed;
-		}
-
-		adev->reset_domain.wq = hive->reset_domain.wq;
-	} else {
-		adev->reset_domain.wq = alloc_ordered_workqueue("amdgpu-reset-dev", 0);
-		if (!adev->reset_domain.wq) {
-			r = -ENOMEM;
-			goto init_failed;
+			/* Drop the early temporary reset domain we created for device */
+			amdgpu_reset_put_reset_domain(adev->reset_domain);
+			adev->reset_domain = hive->reset_domain;
 		}
 	}
 
@@ -3640,6 +3638,15 @@ int amdgpu_device_init(struct amdgpu_device *adev,
 		return r;
 	}
 
+	/*
+	 * Reset domain needs to be present early, before XGMI hive discovered
+	 * (if any) and intitialized to use reset sem and in_gpu reset flag
+	 * early on during init.
+	 */
+	adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE ,"amdgpu-reset-dev");
+	if (!adev->reset_domain)
+		return -ENOMEM;
+
 	/* early init functions */
 	r = amdgpu_device_ip_early_init(adev);
 	if (r)
@@ -4016,6 +4023,9 @@ void amdgpu_device_fini_sw(struct amdgpu_device *adev)
 	if (adev->mman.discovery_bin)
 		amdgpu_discovery_fini(adev);
 
+	amdgpu_reset_put_reset_domain(adev->reset_domain);
+	adev->reset_domain = NULL;
+
 	kfree(adev->pci_state);
 
 }
@@ -5241,7 +5251,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 
 	INIT_WORK(&work.base, amdgpu_device_queue_gpu_recover_work);
 
-	if (!queue_work(adev->reset_domain.wq, &work.base))
+	if (!amdgpu_reset_domain_schedule(adev->reset_domain, &work.base))
 		return -EAGAIN;
 
 	flush_work(&work.base);
author	Andrey Grodzovsky <andrey.grodzovsky@amd.com>	2022-01-21 17:23:32 -0500
committer	Andrey Grodzovsky <andrey.grodzovsky@amd.com>	2022-02-09 12:17:09 -0500
commit	cfbb6b0047448e2d986160d9f30d60f604d9ad0f (patch)
tree	fdb18fa843a69019c49e92887cee48034d6b2e5b /drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
parent	f287a3c5b03f51efa8d8f3e141a79177f91047e0 (diff)
download	linux-cfbb6b0047448e2d986160d9f30d60f604d9ad0f.tar.bz2