drm/msm: Hangcheck progress detection

If the hangcheck timer expires, check if the fw's position in the cmdstream has advanced (changed) since last timer expiration, and allow it up to three additional "extensions" to it's alotted time. The intention is to continue to catch "shader stuck in a loop" type hangs quickly, but allow more time for things that are actually making forward progress. Because we need to sample the CP state twice to detect if there has not been progress, this also cuts the the timer's duration in half. v2: Fix typo (REG_A6XX_CP_CSQ_IB2_STAT), add comment v3: Only halve hangcheck timer duration for generations which support progress detection (hdanton); removed unused a5xx progress (without knowing how to adjust for data buffered in ROQ it is too likely to report a false negative) v4: Comment updates to better describe the total hangcheck duration when progress detection is applied Reviewed-by: Chia-I Wu <olvaffe@gmail.com> Tested-by: Chia-I Wu <olvaffe@gmail.com> # dEQP-GLES2.functional.flush_finish.wait Signed-off-by: Rob Clark <robdclark@chromium.org> Reviewed-by: Akhil P Oommen <quic_akhilpo@quicinc.com> Patchwork: https://patchwork.freedesktop.org/patch/511584/ Link: https://lore.kernel.org/r/20221114193049.1533391-3-robdclark@gmail.com
author: Rob Clark <robdclark@chromium.org> 2022-11-14 11:30:41 -0800
committer: Rob Clark <robdclark@chromium.org> 2022-11-17 10:39:12 -0800
commit: d73b1d02de0858b96f743e1e8b767fb092ae4c1b (patch)
tree: a5fedb0e54c0f5e721fa11619796c818a9963e01 /drivers/gpu/drm/msm/msm_gpu.c
parent: cade05b2a88558847984287dd389fae0c7de31d6 (diff)
download: linux-d73b1d02de0858b96f743e1e8b767fb092ae4c1b.tar.bz2
1 files changed, 30 insertions, 1 deletions
diff --git a/drivers/gpu/drm/msm/msm_gpu.c b/drivers/gpu/drm/msm/msm_gpu.c
index 0098ee8438aa..f7195b43cc6c 100644
--- a/drivers/gpu/drm/msm/msm_gpu.c
+++ b/drivers/gpu/drm/msm/msm_gpu.c
@@ -492,6 +492,21 @@ static void hangcheck_timer_reset(struct msm_gpu *gpu)
 			round_jiffies_up(jiffies + msecs_to_jiffies(priv->hangcheck_period)));
 }
 
+static bool made_progress(struct msm_gpu *gpu, struct msm_ringbuffer *ring)
+{
+	if (ring->hangcheck_progress_retries >= DRM_MSM_HANGCHECK_PROGRESS_RETRIES)
+		return false;
+
+	if (!gpu->funcs->progress)
+		return false;
+
+	if (!gpu->funcs->progress(gpu, ring))
+		return false;
+
+	ring->hangcheck_progress_retries++;
+	return true;
+}
+
 static void hangcheck_handler(struct timer_list *t)
 {
 	struct msm_gpu *gpu = from_timer(gpu, t, hangcheck_timer);
@@ -502,9 +517,12 @@ static void hangcheck_handler(struct timer_list *t)
 	if (fence != ring->hangcheck_fence) {
 		/* some progress has been made.. ya! */
 		ring->hangcheck_fence = fence;
-	} else if (fence_before(fence, ring->fctx->last_fence)) {
+		ring->hangcheck_progress_retries = 0;
+	} else if (fence_before(fence, ring->fctx->last_fence) &&
+			!made_progress(gpu, ring)) {
 		/* no progress and not done.. hung! */
 		ring->hangcheck_fence = fence;
+		ring->hangcheck_progress_retries = 0;
 		DRM_DEV_ERROR(dev->dev, "%s: hangcheck detected gpu lockup rb %d!\n",
 				gpu->name, ring->id);
 		DRM_DEV_ERROR(dev->dev, "%s:     completed fence: %u\n",
@@ -830,6 +848,7 @@ int msm_gpu_init(struct drm_device *drm, struct platform_device *pdev,
 		struct msm_gpu *gpu, const struct msm_gpu_funcs *funcs,
 		const char *name, struct msm_gpu_config *config)
 {
+	struct msm_drm_private *priv = drm->dev_private;
 	int i, ret, nr_rings = config->nr_rings;
 	void *memptrs;
 	uint64_t memptrs_iova;
@@ -857,6 +876,16 @@ int msm_gpu_init(struct drm_device *drm, struct platform_device *pdev,
 	kthread_init_work(&gpu->recover_work, recover_worker);
 	kthread_init_work(&gpu->fault_work, fault_worker);
 
+	priv->hangcheck_period = DRM_MSM_HANGCHECK_DEFAULT_PERIOD;
+
+	/*
+	 * If progress detection is supported, halve the hangcheck timer
+	 * duration, as it takes two iterations of the hangcheck handler
+	 * to detect a hang.
+	 */
+	if (funcs->progress)
+		priv->hangcheck_period /= 2;
+
 	timer_setup(&gpu->hangcheck_timer, hangcheck_handler, 0);
 
 	spin_lock_init(&gpu->perf_lock);
author	Rob Clark <robdclark@chromium.org>	2022-11-14 11:30:41 -0800
committer	Rob Clark <robdclark@chromium.org>	2022-11-17 10:39:12 -0800
commit	d73b1d02de0858b96f743e1e8b767fb092ae4c1b (patch)
tree	a5fedb0e54c0f5e721fa11619796c818a9963e01 /drivers/gpu/drm/msm/msm_gpu.c
parent	cade05b2a88558847984287dd389fae0c7de31d6 (diff)
download	linux-d73b1d02de0858b96f743e1e8b767fb092ae4c1b.tar.bz2