From 05adfd80cc52e0b4581e65bb5418de5dfd24d105 Mon Sep 17 00:00:00 2001 From: Luben Tuikov <luben.tuikov@amd.com> Date: Fri, 21 May 2021 11:53:09 -0400 Subject: drm/amdgpu: Use delayed work to collect RAS error counters MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On Context Query2 IOCTL return the correctable and uncorrectable errors in O(1) fashion, from cached values, and schedule a delayed work function to calculate and cache them for the next such IOCTL. v2: Cancel pending delayed work at ras_fini(). v3: Remove conditionals when dealing with delayed work manipulation as they're inherently racy. Cc: Alexander Deucher <Alexander.Deucher@amd.com> Cc: Christian König <christian.koenig@amd.com> Cc: John Clements <john.clements@amd.com> Cc: Hawking Zhang <Hawking.Zhang@amd.com> Signed-off-by: Luben Tuikov <luben.tuikov@amd.com> Reviewed-by: Alexander Deucher <Alexander.Deucher@amd.com> Reviewed-by: Christian König <christian.koenig@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> --- drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c | 31 +++++++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c index bb0cfe871aba..e7a010b7ca1f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c @@ -331,10 +331,13 @@ static int amdgpu_ctx_query(struct amdgpu_device *adev, return 0; } +#define AMDGPU_RAS_COUNTE_DELAY_MS 3000 + static int amdgpu_ctx_query2(struct amdgpu_device *adev, - struct amdgpu_fpriv *fpriv, uint32_t id, - union drm_amdgpu_ctx_out *out) + struct amdgpu_fpriv *fpriv, uint32_t id, + union drm_amdgpu_ctx_out *out) { + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); struct amdgpu_ctx *ctx; struct amdgpu_ctx_mgr *mgr; @@ -361,6 +364,30 @@ static int amdgpu_ctx_query2(struct amdgpu_device *adev, if (atomic_read(&ctx->guilty)) out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_GUILTY; + if (adev->ras_enabled && con) { + /* Return the cached values in O(1), + * and schedule delayed work to cache + * new vaues. + */ + int ce_count, ue_count; + + ce_count = atomic_read(&con->ras_ce_count); + ue_count = atomic_read(&con->ras_ue_count); + + if (ce_count != ctx->ras_counter_ce) { + ctx->ras_counter_ce = ce_count; + out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_RAS_CE; + } + + if (ue_count != ctx->ras_counter_ue) { + ctx->ras_counter_ue = ue_count; + out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_RAS_UE; + } + + schedule_delayed_work(&con->ras_counte_delay_work, + msecs_to_jiffies(AMDGPU_RAS_COUNTE_DELAY_MS)); + } + mutex_unlock(&mgr->lock); return 0; } -- cgit v1.2.3