From 19d0dfda4c75a23012dd714e57b4f569df61089d Mon Sep 17 00:00:00 2001 From: "Stanley.Yang" Date: Mon, 19 Apr 2021 11:33:10 +0800 Subject: drm/amdgpu: optimize gfx ras features flag clean Signed-off-by: Stanley.Yang Reviewed-by: Feifei Xu Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index b0d2fc9454ca..bb0d02755f11 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -658,11 +658,7 @@ static int __amdgpu_ras_feature_enable(struct amdgpu_device *adev, con->features |= BIT(head->block); } else { if (obj && amdgpu_ras_is_feature_enabled(adev, head)) { - /* skip clean gfx ras context feature for VEGA20 Gaming. - * will clean later - */ - if (!(!adev->ras_features && con->features & BIT(AMDGPU_RAS_BLOCK__GFX))) - con->features &= ~BIT(head->block); + con->features &= ~BIT(head->block); put_obj(obj); } } @@ -770,6 +766,10 @@ int amdgpu_ras_feature_enable_on_boot(struct amdgpu_device *adev, con->features |= BIT(head->block); ret = amdgpu_ras_feature_enable(adev, head, 0); + + /* clean gfx block ras features flag */ + if (adev->ras_features && head->block == AMDGPU_RAS_BLOCK__GFX) + con->features &= ~BIT(head->block); } } else ret = amdgpu_ras_feature_enable(adev, head, enable); -- cgit v1.2.3 From 502f0e28042b4ef271c70f54dc2e4feb230fdb64 Mon Sep 17 00:00:00 2001 From: Hawking Zhang Date: Thu, 22 Apr 2021 21:58:08 +0800 Subject: drm/amdgpu: disable gfx ras by default in aldebaran aldebaran gfx ras is still under development Signed-off-by: Hawking Zhang Reviewed-by: Dennis Li Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index bb0d02755f11..f62873fbf249 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -2066,8 +2066,7 @@ static void amdgpu_ras_check_supported(struct amdgpu_device *adev, } else { /* driver only manages a few IP blocks RAS feature * when GPU is connected cpu through XGMI */ - *hw_supported |= (1 << AMDGPU_RAS_BLOCK__GFX | - 1 << AMDGPU_RAS_BLOCK__SDMA | + *hw_supported |= (1 << AMDGPU_RAS_BLOCK__SDMA | 1 << AMDGPU_RAS_BLOCK__MMHUB); } -- cgit v1.2.3 From a30f128602001c986bb2b3ede074b6193d43905f Mon Sep 17 00:00:00 2001 From: Hawking Zhang Date: Sun, 25 Apr 2021 14:34:25 +0800 Subject: drm/amdgpu: provide socket/die id info in RAS message Add socket/die information in RAS messages for platforms that support query those information Signed-off-by: Hawking Zhang Reviewed-by: John Clements Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 29 +++++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index f62873fbf249..ae9fb2025259 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -901,17 +901,42 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev, info->ce_count = obj->err_data.ce_count; if (err_data.ce_count) { - dev_info(adev->dev, "%ld correctable hardware errors " + if (adev->smuio.funcs && + adev->smuio.funcs->get_socket_id && + adev->smuio.funcs->get_die_id) { + dev_info(adev->dev, "socket: %d, die: %d " + "%ld correctable hardware errors " "detected in %s block, no user " "action is needed.\n", + adev->smuio.funcs->get_socket_id(adev), + adev->smuio.funcs->get_die_id(adev), obj->err_data.ce_count, ras_block_str(info->head.block)); + } else { + dev_info(adev->dev, "%ld correctable hardware errors " + "detected in %s block, no user " + "action is needed.\n", + obj->err_data.ce_count, + ras_block_str(info->head.block)); + } } if (err_data.ue_count) { - dev_info(adev->dev, "%ld uncorrectable hardware errors " + if (adev->smuio.funcs && + adev->smuio.funcs->get_socket_id && + adev->smuio.funcs->get_die_id) { + dev_info(adev->dev, "socket: %d, die: %d " + "%ld uncorrectable hardware errors " "detected in %s block\n", + adev->smuio.funcs->get_socket_id(adev), + adev->smuio.funcs->get_die_id(adev), obj->err_data.ue_count, ras_block_str(info->head.block)); + } else { + dev_info(adev->dev, "%ld uncorrectable hardware errors " + "detected in %s block\n", + obj->err_data.ue_count, + ras_block_str(info->head.block)); + } } return 0; -- cgit v1.2.3 From 78871b6c8be303a8626bfc85f022b31362495c1b Mon Sep 17 00:00:00 2001 From: Hawking Zhang Date: Wed, 28 Apr 2021 22:51:22 +0800 Subject: drm/amdgpu: enable ras error count query and reset for HDP add hdp block ras error query and reset support in amdgpu ras error count query and reset interface Signed-off-by: Hawking Zhang Reviewed-by: John Clements Reviewed-by: Dennis Li Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 10 ++++++++++ drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 4 ++++ drivers/gpu/drm/amd/amdgpu/soc15.c | 3 --- 3 files changed, 14 insertions(+), 3 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index ae9fb2025259..984e8271e675 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -890,6 +890,11 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev, adev->gmc.xgmi.ras_funcs->query_ras_error_count) adev->gmc.xgmi.ras_funcs->query_ras_error_count(adev, &err_data); break; + case AMDGPU_RAS_BLOCK__HDP: + if (adev->hdp.ras_funcs && + adev->hdp.ras_funcs->query_ras_error_count) + adev->hdp.ras_funcs->query_ras_error_count(adev, &err_data); + break; default: break; } @@ -967,6 +972,11 @@ int amdgpu_ras_reset_error_status(struct amdgpu_device *adev, if (adev->sdma.funcs->reset_ras_error_count) adev->sdma.funcs->reset_ras_error_count(adev); break; + case AMDGPU_RAS_BLOCK__HDP: + if (adev->hdp.ras_funcs && + adev->hdp.ras_funcs->reset_ras_error_count) + adev->hdp.ras_funcs->reset_ras_error_count(adev); + break; default: break; } diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c index b53aa4dd87c0..6028b55ea52c 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c @@ -1272,6 +1272,10 @@ static int gmc_v9_0_late_init(void *handle) adev->mmhub.ras_funcs->reset_ras_error_count) adev->mmhub.ras_funcs->reset_ras_error_count(adev); + if (adev->hdp.ras_funcs && + adev->hdp.ras_funcs->reset_ras_error_count) + adev->hdp.ras_funcs->reset_ras_error_count(adev); + r = amdgpu_gmc_ras_late_init(adev); if (r) return r; diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c b/drivers/gpu/drm/amd/amdgpu/soc15.c index d80e12b80c7e..28e9f6b726a6 100644 --- a/drivers/gpu/drm/amd/amdgpu/soc15.c +++ b/drivers/gpu/drm/amd/amdgpu/soc15.c @@ -1521,9 +1521,6 @@ static int soc15_common_late_init(void *handle) if (amdgpu_sriov_vf(adev)) xgpu_ai_mailbox_get_irq(adev); - if (adev->hdp.funcs->reset_ras_error_count) - adev->hdp.funcs->reset_ras_error_count(adev); - if (adev->nbio.ras_funcs && adev->nbio.ras_funcs->ras_late_init) r = adev->nbio.ras_funcs->ras_late_init(adev); -- cgit v1.2.3 From 1f6e8eb153113c2da2027bea7793d88dc9c299c3 Mon Sep 17 00:00:00 2001 From: Hawking Zhang Date: Thu, 29 Apr 2021 14:28:13 +0800 Subject: drm/amdgpu: enable gfx ras in aldebran by default gfx ras now can be enabled by default in aldebaran Signed-off-by: Hawking Zhang Reviewed-by: John Clements Reviewed-by: Dennis Li Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 984e8271e675..9306e3925efe 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -2101,7 +2101,8 @@ static void amdgpu_ras_check_supported(struct amdgpu_device *adev, } else { /* driver only manages a few IP blocks RAS feature * when GPU is connected cpu through XGMI */ - *hw_supported |= (1 << AMDGPU_RAS_BLOCK__SDMA | + *hw_supported |= (1 << AMDGPU_RAS_BLOCK__GFX | + 1 << AMDGPU_RAS_BLOCK__SDMA | 1 << AMDGPU_RAS_BLOCK__MMHUB); } -- cgit v1.2.3 From f50160cf0f98b5bc3074c219a46af7305a14ffff Mon Sep 17 00:00:00 2001 From: "Stanley.Yang" Date: Thu, 29 Apr 2021 09:32:12 +0800 Subject: drm/amdgpu: force enable gfx ras for vega20 ws Signed-off-by: Stanley.Yang Reviewed-by: Feifei Xu Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 9306e3925efe..ebbe2c5190c4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -33,6 +33,7 @@ #include "amdgpu_atomfirmware.h" #include "amdgpu_xgmi.h" #include "ivsrcid/nbio/irqsrcs_nbif_7_4.h" +#include "atom.h" static const char *RAS_FS_NAME = "ras"; @@ -2063,6 +2064,24 @@ static bool amdgpu_ras_asic_supported(struct amdgpu_device *adev) adev->asic_type == CHIP_SIENNA_CICHLID; } +/* + * this is workaround for vega20 workstation sku, + * force enable gfx ras, ignore vbios gfx ras flag + * due to GC EDC can not write + */ +static void amdgpu_ras_get_quirks(struct amdgpu_device *adev, + uint32_t *hw_supported) +{ + struct atom_context *ctx = adev->mode_info.atom_context; + + if (!ctx) + return; + + if (strnstr(ctx->vbios_version, "D16406", + sizeof(ctx->vbios_version))) + *hw_supported |= (1 << AMDGPU_RAS_BLOCK__GFX); +} + /* * check hardware's ras ability which will be saved in hw_supported. * if hardware does not support ras, we can skip some ras initializtion and @@ -2106,6 +2125,8 @@ static void amdgpu_ras_check_supported(struct amdgpu_device *adev, 1 << AMDGPU_RAS_BLOCK__MMHUB); } + amdgpu_ras_get_quirks(adev, hw_supported); + /* hw_supported needs to be aligned with RAS block mask. */ *hw_supported &= AMDGPU_RAS_BLOCK_MASK; -- cgit v1.2.3 From acdae2169bae6993c61ded36ae0b2301c07c0a9e Mon Sep 17 00:00:00 2001 From: Luben Tuikov Date: Mon, 3 May 2021 20:02:22 -0400 Subject: drm/amdgpu: Remove redundant ras->supported MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove redundant ras->supported, as this value is also stored in adev->ras_features. Use adev->ras_features, as that supercedes "ras", since the latter is its member. The dependency goes like this: ras <== adev->ras_features <== hw_supported, and is read as "ras depends on ras_features, which depends on hw_supported." The arrows show the flow of information, i.e. the dependency update. "hw_supported" should also live in "adev". Cc: Alexander Deucher Cc: John Clements Cc: Hawking Zhang Signed-off-by: Luben Tuikov Acked-by: Christian König Reviewed-by: John Clements Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 6 ++++-- drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 9 ++++----- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 4 +--- drivers/gpu/drm/amd/amdgpu/soc15.c | 7 ++++--- drivers/gpu/drm/amd/pm/powerplay/hwmgr/vega20_baco.c | 2 +- drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c | 3 ++- 8 files changed, 18 insertions(+), 17 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 1b60f8205f15..0ed11428a57d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -5108,7 +5108,8 @@ int amdgpu_device_baco_enter(struct drm_device *dev) if (!amdgpu_device_supports_baco(adev_to_drm(adev))) return -ENOTSUPP; - if (ras && ras->supported && adev->nbio.funcs->enable_doorbell_interrupt) + if (ras && adev->ras_features && + adev->nbio.funcs->enable_doorbell_interrupt) adev->nbio.funcs->enable_doorbell_interrupt(adev, false); return amdgpu_dpm_baco_enter(adev); @@ -5127,7 +5128,8 @@ int amdgpu_device_baco_exit(struct drm_device *dev) if (ret) return ret; - if (ras && ras->supported && adev->nbio.funcs->enable_doorbell_interrupt) + if (ras && adev->ras_features && + adev->nbio.funcs->enable_doorbell_interrupt) adev->nbio.funcs->enable_doorbell_interrupt(adev, true); return 0; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c index 07e8a7c28561..7a6a87e5c79e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c @@ -986,7 +986,7 @@ int amdgpu_info_ioctl(struct drm_device *dev, void *data, struct drm_file *filp) if (!ras) return -EINVAL; - ras_mask = (uint64_t)ras->supported << 32 | ras->features; + ras_mask = (uint64_t)adev->ras_features << 32 | ras->features; return copy_to_user(out, &ras_mask, min_t(u64, size, sizeof(ras_mask))) ? diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c index 17b728d2c1f2..4885b718cb6c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c @@ -2146,7 +2146,7 @@ static int psp_load_smu_fw(struct psp_context *psp) return 0; if ((amdgpu_in_reset(adev) && - ras && ras->supported && + ras && adev->ras_features && (adev->asic_type == CHIP_ARCTURUS || adev->asic_type == CHIP_VEGA20)) || (adev->in_runpm && diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index ebbe2c5190c4..a484ac6a8399 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -2130,9 +2130,8 @@ static void amdgpu_ras_check_supported(struct amdgpu_device *adev, /* hw_supported needs to be aligned with RAS block mask. */ *hw_supported &= AMDGPU_RAS_BLOCK_MASK; - *supported = amdgpu_ras_enable == 0 ? - 0 : *hw_supported & amdgpu_ras_mask; - adev->ras_features = *supported; + *supported = amdgpu_ras_enable == 0 ? 0 : + *hw_supported & amdgpu_ras_mask; } int amdgpu_ras_init(struct amdgpu_device *adev) @@ -2154,7 +2153,7 @@ int amdgpu_ras_init(struct amdgpu_device *adev) amdgpu_ras_set_context(adev, con); amdgpu_ras_check_supported(adev, &con->hw_supported, - &con->supported); + &adev->ras_features); if (!con->hw_supported || (adev->asic_type == CHIP_VEGA10)) { /* set gfx block ras context feature for VEGA20 Gaming * send ras disable cmd to ras ta during ras late init. @@ -2210,7 +2209,7 @@ int amdgpu_ras_init(struct amdgpu_device *adev) dev_info(adev->dev, "RAS INFO: ras initialized successfully, " "hardware ability[%x] ras_mask[%x]\n", - con->hw_supported, con->supported); + con->hw_supported, adev->ras_features); return 0; release_con: amdgpu_ras_set_context(adev, NULL); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index 60df268a0c66..3e830dc1a33d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -314,8 +314,6 @@ struct amdgpu_ras { /* ras infrastructure */ /* for ras itself. */ uint32_t hw_supported; - /* for IP to check its ras ability. */ - uint32_t supported; uint32_t features; struct list_head head; /* sysfs */ @@ -478,7 +476,7 @@ static inline int amdgpu_ras_is_supported(struct amdgpu_device *adev, if (block >= AMDGPU_RAS_BLOCK_COUNT) return 0; - return ras && (ras->supported & (1 << block)); + return ras && (adev->ras_features & (1 << block)); } int amdgpu_ras_recovery_init(struct amdgpu_device *adev); diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c b/drivers/gpu/drm/amd/amdgpu/soc15.c index e3f42ad1e6bc..301695f3f237 100644 --- a/drivers/gpu/drm/amd/amdgpu/soc15.c +++ b/drivers/gpu/drm/amd/amdgpu/soc15.c @@ -655,7 +655,7 @@ static int soc15_asic_baco_reset(struct amdgpu_device *adev) int ret = 0; /* avoid NBIF got stuck when do RAS recovery in BACO reset */ - if (ras && ras->supported) + if (ras && adev->ras_features) adev->nbio.funcs->enable_doorbell_interrupt(adev, false); ret = amdgpu_dpm_baco_reset(adev); @@ -663,7 +663,7 @@ static int soc15_asic_baco_reset(struct amdgpu_device *adev) return ret; /* re-enable doorbell interrupt after BACO exit */ - if (ras && ras->supported) + if (ras && adev->ras_features) adev->nbio.funcs->enable_doorbell_interrupt(adev, true); return 0; @@ -710,7 +710,8 @@ soc15_asic_reset_method(struct amdgpu_device *adev) * 1. PMFW version > 0x284300: all cases use baco * 2. PMFW version <= 0x284300: only sGPU w/o RAS use baco */ - if ((ras && ras->supported) && adev->pm.fw_version <= 0x283400) + if (ras && adev->ras_features && + adev->pm.fw_version <= 0x283400) baco_reset = false; break; case CHIP_ALDEBARAN: diff --git a/drivers/gpu/drm/amd/pm/powerplay/hwmgr/vega20_baco.c b/drivers/gpu/drm/amd/pm/powerplay/hwmgr/vega20_baco.c index 2a28c9df15a0..f6b1efce450c 100644 --- a/drivers/gpu/drm/amd/pm/powerplay/hwmgr/vega20_baco.c +++ b/drivers/gpu/drm/amd/pm/powerplay/hwmgr/vega20_baco.c @@ -85,7 +85,7 @@ int vega20_baco_set_state(struct pp_hwmgr *hwmgr, enum BACO_STATE state) return 0; if (state == BACO_STATE_IN) { - if (!ras || !ras->supported) { + if (!ras || !adev->ras_features) { data = RREG32_SOC15(THM, 0, mmTHM_BACO_CNTL); data |= 0x80000000; WREG32_SOC15(THM, 0, mmTHM_BACO_CNTL, data); diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c b/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c index 6274cae4a065..72581e43d83e 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c @@ -1531,7 +1531,8 @@ int smu_v11_0_baco_set_state(struct smu_context *smu, enum smu_baco_state state) NULL); break; default: - if (!ras || !ras->supported || adev->gmc.xgmi.pending_reset) { + if (!ras || !adev->ras_features || + adev->gmc.xgmi.pending_reset) { if (adev->asic_type == CHIP_ARCTURUS) { data = RREG32_SOC15(THM, 0, mmTHM_BACO_CNTL_ARCT); data |= 0x80000000; -- cgit v1.2.3 From e509965e58ab2c96418e1a5e756e25d9a54e0765 Mon Sep 17 00:00:00 2001 From: Luben Tuikov Date: Mon, 3 May 2021 20:43:00 -0400 Subject: drm/amdgpu: Move up ras_hw_supported MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move ras_hw_supported into struct amdgpu_dev. The dependency is: struct amdgpu_ras <== struct amdgpu_dev <== ASIC, read as "struct amdgpu_ras depends on struct amdgpu_dev, which depends on the hardware." This can be loosely understood as, "if RAS is supported, which is property of the ASIC (struct amdgpu_dev), then we can access struct amdgpu_ras." v2: Fix a typo: must binary AND in ternary cond in amdgpu_ras.c Cc: Alexander Deucher Cc: John Clements Cc: Hawking Zhang Signed-off-by: Luben Tuikov Acked-by: Christian König Reviewed-by: John Clements Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 3 +- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 58 +++++++++++++++------------------ drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 1 - 3 files changed, 28 insertions(+), 34 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index dc3a69296321..ba740544eb77 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -1073,7 +1073,8 @@ struct amdgpu_device { atomic_t throttling_logging_enabled; struct ratelimit_state throttling_logging_rs; - uint32_t ras_features; + uint32_t ras_hw_supported; + uint32_t ras_features; bool in_pci_err_recovery; struct pci_saved_state *pci_state; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index a484ac6a8399..e7940e813f32 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -611,11 +611,9 @@ static void amdgpu_ras_parse_status_code(struct amdgpu_device *adev, /* feature ctl begin */ static int amdgpu_ras_is_feature_allowed(struct amdgpu_device *adev, - struct ras_common_if *head) + struct ras_common_if *head) { - struct amdgpu_ras *con = amdgpu_ras_get_context(adev); - - return con->hw_supported & BIT(head->block); + return adev->ras_hw_supported & BIT(head->block); } static int amdgpu_ras_is_feature_enabled(struct amdgpu_device *adev, @@ -2069,8 +2067,7 @@ static bool amdgpu_ras_asic_supported(struct amdgpu_device *adev) * force enable gfx ras, ignore vbios gfx ras flag * due to GC EDC can not write */ -static void amdgpu_ras_get_quirks(struct amdgpu_device *adev, - uint32_t *hw_supported) +static void amdgpu_ras_get_quirks(struct amdgpu_device *adev) { struct atom_context *ctx = adev->mode_info.atom_context; @@ -2078,8 +2075,8 @@ static void amdgpu_ras_get_quirks(struct amdgpu_device *adev, return; if (strnstr(ctx->vbios_version, "D16406", - sizeof(ctx->vbios_version))) - *hw_supported |= (1 << AMDGPU_RAS_BLOCK__GFX); + sizeof(ctx->vbios_version))) + adev->ras_hw_supported |= (1 << AMDGPU_RAS_BLOCK__GFX); } /* @@ -2091,11 +2088,9 @@ static void amdgpu_ras_get_quirks(struct amdgpu_device *adev, * we have to initialize ras as normal. but need check if operation is * allowed or not in each function. */ -static void amdgpu_ras_check_supported(struct amdgpu_device *adev, - uint32_t *hw_supported, uint32_t *supported) +static void amdgpu_ras_check_supported(struct amdgpu_device *adev) { - *hw_supported = 0; - *supported = 0; + adev->ras_hw_supported = adev->ras_features = 0; if (amdgpu_sriov_vf(adev) || !adev->is_atom_fw || !amdgpu_ras_asic_supported(adev)) @@ -2104,34 +2099,34 @@ static void amdgpu_ras_check_supported(struct amdgpu_device *adev, if (!adev->gmc.xgmi.connected_to_cpu) { if (amdgpu_atomfirmware_mem_ecc_supported(adev)) { dev_info(adev->dev, "MEM ECC is active.\n"); - *hw_supported |= (1 << AMDGPU_RAS_BLOCK__UMC | - 1 << AMDGPU_RAS_BLOCK__DF); + adev->ras_hw_supported |= (1 << AMDGPU_RAS_BLOCK__UMC | + 1 << AMDGPU_RAS_BLOCK__DF); } else { dev_info(adev->dev, "MEM ECC is not presented.\n"); } if (amdgpu_atomfirmware_sram_ecc_supported(adev)) { dev_info(adev->dev, "SRAM ECC is active.\n"); - *hw_supported |= ~(1 << AMDGPU_RAS_BLOCK__UMC | - 1 << AMDGPU_RAS_BLOCK__DF); + adev->ras_hw_supported |= ~(1 << AMDGPU_RAS_BLOCK__UMC | + 1 << AMDGPU_RAS_BLOCK__DF); } else { dev_info(adev->dev, "SRAM ECC is not presented.\n"); } } else { /* driver only manages a few IP blocks RAS feature * when GPU is connected cpu through XGMI */ - *hw_supported |= (1 << AMDGPU_RAS_BLOCK__GFX | - 1 << AMDGPU_RAS_BLOCK__SDMA | - 1 << AMDGPU_RAS_BLOCK__MMHUB); + adev->ras_hw_supported |= (1 << AMDGPU_RAS_BLOCK__GFX | + 1 << AMDGPU_RAS_BLOCK__SDMA | + 1 << AMDGPU_RAS_BLOCK__MMHUB); } - amdgpu_ras_get_quirks(adev, hw_supported); + amdgpu_ras_get_quirks(adev); /* hw_supported needs to be aligned with RAS block mask. */ - *hw_supported &= AMDGPU_RAS_BLOCK_MASK; + adev->ras_hw_supported &= AMDGPU_RAS_BLOCK_MASK; - *supported = amdgpu_ras_enable == 0 ? 0 : - *hw_supported & amdgpu_ras_mask; + adev->ras_features = amdgpu_ras_enable == 0 ? 0 : + adev->ras_hw_supported & amdgpu_ras_mask; } int amdgpu_ras_init(struct amdgpu_device *adev) @@ -2152,9 +2147,9 @@ int amdgpu_ras_init(struct amdgpu_device *adev) amdgpu_ras_set_context(adev, con); - amdgpu_ras_check_supported(adev, &con->hw_supported, - &adev->ras_features); - if (!con->hw_supported || (adev->asic_type == CHIP_VEGA10)) { + amdgpu_ras_check_supported(adev); + + if (!adev->ras_hw_supported || adev->asic_type == CHIP_VEGA10) { /* set gfx block ras context feature for VEGA20 Gaming * send ras disable cmd to ras ta during ras late init. */ @@ -2208,8 +2203,9 @@ int amdgpu_ras_init(struct amdgpu_device *adev) } dev_info(adev->dev, "RAS INFO: ras initialized successfully, " - "hardware ability[%x] ras_mask[%x]\n", - con->hw_supported, adev->ras_features); + "hardware ability[%x] ras_mask[%x]\n", + adev->ras_hw_supported, adev->ras_features); + return 0; release_con: amdgpu_ras_set_context(adev, NULL); @@ -2415,10 +2411,8 @@ int amdgpu_ras_fini(struct amdgpu_device *adev) void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev) { - uint32_t hw_supported, supported; - - amdgpu_ras_check_supported(adev, &hw_supported, &supported); - if (!hw_supported) + amdgpu_ras_check_supported(adev); + if (!adev->ras_hw_supported) return; if (atomic_cmpxchg(&amdgpu_ras_in_intr, 0, 1) == 0) { diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index 3e830dc1a33d..f60d1cfafa3e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -313,7 +313,6 @@ struct ras_common_if { struct amdgpu_ras { /* ras infrastructure */ /* for ras itself. */ - uint32_t hw_supported; uint32_t features; struct list_head head; /* sysfs */ -- cgit v1.2.3 From 8ab0d6f030bab4d8c7310e9894f6fdb59817d241 Mon Sep 17 00:00:00 2001 From: Luben Tuikov Date: Tue, 4 May 2021 02:25:29 -0400 Subject: drm/amdgpu: Rename to ras_*_enabled MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rename, ras_hw_supported --> ras_hw_enabled, and ras_features --> ras_enabled, to show that ras_enabled is a subset of ras_hw_enabled, which itself is a subset of the ASIC capability. Cc: Alexander Deucher Cc: John Clements Cc: Hawking Zhang Signed-off-by: Luben Tuikov Acked-by: Christian König Reviewed-by: John Clements Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 4 +- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 4 +- drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 50 +++++++++++----------- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 2 +- drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 2 +- drivers/gpu/drm/amd/amdgpu/soc15.c | 6 +-- drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 6 +-- .../gpu/drm/amd/pm/powerplay/hwmgr/vega20_baco.c | 2 +- drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c | 2 +- 11 files changed, 41 insertions(+), 41 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index ba740544eb77..cef7bbe850e7 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -1073,8 +1073,8 @@ struct amdgpu_device { atomic_t throttling_logging_enabled; struct ratelimit_state throttling_logging_rs; - uint32_t ras_hw_supported; - uint32_t ras_features; + uint32_t ras_hw_enabled; + uint32_t ras_enabled; bool in_pci_err_recovery; struct pci_saved_state *pci_state; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 0ed11428a57d..b0543f409039 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -5108,7 +5108,7 @@ int amdgpu_device_baco_enter(struct drm_device *dev) if (!amdgpu_device_supports_baco(adev_to_drm(adev))) return -ENOTSUPP; - if (ras && adev->ras_features && + if (ras && adev->ras_enabled && adev->nbio.funcs->enable_doorbell_interrupt) adev->nbio.funcs->enable_doorbell_interrupt(adev, false); @@ -5128,7 +5128,7 @@ int amdgpu_device_baco_exit(struct drm_device *dev) if (ret) return ret; - if (ras && adev->ras_features && + if (ras && adev->ras_enabled && adev->nbio.funcs->enable_doorbell_interrupt) adev->nbio.funcs->enable_doorbell_interrupt(adev, true); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c index 7a6a87e5c79e..8d12e474745a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c @@ -986,7 +986,7 @@ int amdgpu_info_ioctl(struct drm_device *dev, void *data, struct drm_file *filp) if (!ras) return -EINVAL; - ras_mask = (uint64_t)adev->ras_features << 32 | ras->features; + ras_mask = (uint64_t)adev->ras_enabled << 32 | ras->features; return copy_to_user(out, &ras_mask, min_t(u64, size, sizeof(ras_mask))) ? diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c index 4885b718cb6c..3179ca9fc03f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c @@ -2146,7 +2146,7 @@ static int psp_load_smu_fw(struct psp_context *psp) return 0; if ((amdgpu_in_reset(adev) && - ras && adev->ras_features && + ras && adev->ras_enabled && (adev->asic_type == CHIP_ARCTURUS || adev->asic_type == CHIP_VEGA20)) || (adev->in_runpm && diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index e7940e813f32..444f5325f092 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -532,7 +532,7 @@ static struct ras_manager *amdgpu_ras_create_obj(struct amdgpu_device *adev, struct amdgpu_ras *con = amdgpu_ras_get_context(adev); struct ras_manager *obj; - if (!adev->ras_features || !con) + if (!adev->ras_enabled || !con) return NULL; if (head->block >= AMDGPU_RAS_BLOCK_COUNT) @@ -559,7 +559,7 @@ struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev, struct ras_manager *obj; int i; - if (!adev->ras_features || !con) + if (!adev->ras_enabled || !con) return NULL; if (head) { @@ -613,7 +613,7 @@ static void amdgpu_ras_parse_status_code(struct amdgpu_device *adev, static int amdgpu_ras_is_feature_allowed(struct amdgpu_device *adev, struct ras_common_if *head) { - return adev->ras_hw_supported & BIT(head->block); + return adev->ras_hw_enabled & BIT(head->block); } static int amdgpu_ras_is_feature_enabled(struct amdgpu_device *adev, @@ -767,7 +767,7 @@ int amdgpu_ras_feature_enable_on_boot(struct amdgpu_device *adev, ret = amdgpu_ras_feature_enable(adev, head, 0); /* clean gfx block ras features flag */ - if (adev->ras_features && head->block == AMDGPU_RAS_BLOCK__GFX) + if (adev->ras_enabled && head->block == AMDGPU_RAS_BLOCK__GFX) con->features &= ~BIT(head->block); } } else @@ -1072,7 +1072,7 @@ unsigned long amdgpu_ras_query_error_count(struct amdgpu_device *adev, struct ras_manager *obj; struct ras_err_data data = {0, 0}; - if (!adev->ras_features || !con) + if (!adev->ras_enabled || !con) return 0; list_for_each_entry(obj, &con->head, node) { @@ -1595,7 +1595,7 @@ static void amdgpu_ras_log_on_err_counter(struct amdgpu_device *adev) struct amdgpu_ras *con = amdgpu_ras_get_context(adev); struct ras_manager *obj; - if (!adev->ras_features || !con) + if (!adev->ras_enabled || !con) return; list_for_each_entry(obj, &con->head, node) { @@ -1645,7 +1645,7 @@ static void amdgpu_ras_query_err_status(struct amdgpu_device *adev) struct amdgpu_ras *con = amdgpu_ras_get_context(adev); struct ras_manager *obj; - if (!adev->ras_features || !con) + if (!adev->ras_enabled || !con) return; list_for_each_entry(obj, &con->head, node) { @@ -1959,7 +1959,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev) bool exc_err_limit = false; int ret; - if (adev->ras_features && con) + if (adev->ras_enabled && con) data = &con->eh_data; else return 0; @@ -2076,7 +2076,7 @@ static void amdgpu_ras_get_quirks(struct amdgpu_device *adev) if (strnstr(ctx->vbios_version, "D16406", sizeof(ctx->vbios_version))) - adev->ras_hw_supported |= (1 << AMDGPU_RAS_BLOCK__GFX); + adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__GFX); } /* @@ -2090,7 +2090,7 @@ static void amdgpu_ras_get_quirks(struct amdgpu_device *adev) */ static void amdgpu_ras_check_supported(struct amdgpu_device *adev) { - adev->ras_hw_supported = adev->ras_features = 0; + adev->ras_hw_enabled = adev->ras_enabled = 0; if (amdgpu_sriov_vf(adev) || !adev->is_atom_fw || !amdgpu_ras_asic_supported(adev)) @@ -2099,7 +2099,7 @@ static void amdgpu_ras_check_supported(struct amdgpu_device *adev) if (!adev->gmc.xgmi.connected_to_cpu) { if (amdgpu_atomfirmware_mem_ecc_supported(adev)) { dev_info(adev->dev, "MEM ECC is active.\n"); - adev->ras_hw_supported |= (1 << AMDGPU_RAS_BLOCK__UMC | + adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__UMC | 1 << AMDGPU_RAS_BLOCK__DF); } else { dev_info(adev->dev, "MEM ECC is not presented.\n"); @@ -2107,7 +2107,7 @@ static void amdgpu_ras_check_supported(struct amdgpu_device *adev) if (amdgpu_atomfirmware_sram_ecc_supported(adev)) { dev_info(adev->dev, "SRAM ECC is active.\n"); - adev->ras_hw_supported |= ~(1 << AMDGPU_RAS_BLOCK__UMC | + adev->ras_hw_enabled |= ~(1 << AMDGPU_RAS_BLOCK__UMC | 1 << AMDGPU_RAS_BLOCK__DF); } else { dev_info(adev->dev, "SRAM ECC is not presented.\n"); @@ -2115,7 +2115,7 @@ static void amdgpu_ras_check_supported(struct amdgpu_device *adev) } else { /* driver only manages a few IP blocks RAS feature * when GPU is connected cpu through XGMI */ - adev->ras_hw_supported |= (1 << AMDGPU_RAS_BLOCK__GFX | + adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__GFX | 1 << AMDGPU_RAS_BLOCK__SDMA | 1 << AMDGPU_RAS_BLOCK__MMHUB); } @@ -2123,10 +2123,10 @@ static void amdgpu_ras_check_supported(struct amdgpu_device *adev) amdgpu_ras_get_quirks(adev); /* hw_supported needs to be aligned with RAS block mask. */ - adev->ras_hw_supported &= AMDGPU_RAS_BLOCK_MASK; + adev->ras_hw_enabled &= AMDGPU_RAS_BLOCK_MASK; - adev->ras_features = amdgpu_ras_enable == 0 ? 0 : - adev->ras_hw_supported & amdgpu_ras_mask; + adev->ras_enabled = amdgpu_ras_enable == 0 ? 0 : + adev->ras_hw_enabled & amdgpu_ras_mask; } int amdgpu_ras_init(struct amdgpu_device *adev) @@ -2149,11 +2149,11 @@ int amdgpu_ras_init(struct amdgpu_device *adev) amdgpu_ras_check_supported(adev); - if (!adev->ras_hw_supported || adev->asic_type == CHIP_VEGA10) { + if (!adev->ras_hw_enabled || adev->asic_type == CHIP_VEGA10) { /* set gfx block ras context feature for VEGA20 Gaming * send ras disable cmd to ras ta during ras late init. */ - if (!adev->ras_features && adev->asic_type == CHIP_VEGA20) { + if (!adev->ras_enabled && adev->asic_type == CHIP_VEGA20) { con->features |= BIT(AMDGPU_RAS_BLOCK__GFX); return 0; @@ -2204,7 +2204,7 @@ int amdgpu_ras_init(struct amdgpu_device *adev) dev_info(adev->dev, "RAS INFO: ras initialized successfully, " "hardware ability[%x] ras_mask[%x]\n", - adev->ras_hw_supported, adev->ras_features); + adev->ras_hw_enabled, adev->ras_enabled); return 0; release_con: @@ -2319,7 +2319,7 @@ void amdgpu_ras_resume(struct amdgpu_device *adev) struct amdgpu_ras *con = amdgpu_ras_get_context(adev); struct ras_manager *obj, *tmp; - if (!adev->ras_features || !con) { + if (!adev->ras_enabled || !con) { /* clean ras context for VEGA20 Gaming after send ras disable cmd */ amdgpu_release_ras_context(adev); @@ -2365,7 +2365,7 @@ void amdgpu_ras_suspend(struct amdgpu_device *adev) { struct amdgpu_ras *con = amdgpu_ras_get_context(adev); - if (!adev->ras_features || !con) + if (!adev->ras_enabled || !con) return; amdgpu_ras_disable_all_features(adev, 0); @@ -2379,7 +2379,7 @@ int amdgpu_ras_pre_fini(struct amdgpu_device *adev) { struct amdgpu_ras *con = amdgpu_ras_get_context(adev); - if (!adev->ras_features || !con) + if (!adev->ras_enabled || !con) return 0; /* Need disable ras on all IPs here before ip [hw/sw]fini */ @@ -2392,7 +2392,7 @@ int amdgpu_ras_fini(struct amdgpu_device *adev) { struct amdgpu_ras *con = amdgpu_ras_get_context(adev); - if (!adev->ras_features || !con) + if (!adev->ras_enabled || !con) return 0; amdgpu_ras_fs_fini(adev); @@ -2412,7 +2412,7 @@ int amdgpu_ras_fini(struct amdgpu_device *adev) void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev) { amdgpu_ras_check_supported(adev); - if (!adev->ras_hw_supported) + if (!adev->ras_hw_enabled) return; if (atomic_cmpxchg(&amdgpu_ras_in_intr, 0, 1) == 0) { @@ -2441,7 +2441,7 @@ void amdgpu_release_ras_context(struct amdgpu_device *adev) if (!con) return; - if (!adev->ras_features && con->features & BIT(AMDGPU_RAS_BLOCK__GFX)) { + if (!adev->ras_enabled && con->features & BIT(AMDGPU_RAS_BLOCK__GFX)) { con->features &= ~BIT(AMDGPU_RAS_BLOCK__GFX); amdgpu_ras_set_context(adev, NULL); kfree(con); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index f60d1cfafa3e..201fbdee1d09 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -475,7 +475,7 @@ static inline int amdgpu_ras_is_supported(struct amdgpu_device *adev, if (block >= AMDGPU_RAS_BLOCK_COUNT) return 0; - return ras && (adev->ras_features & (1 << block)); + return ras && (adev->ras_enabled & (1 << block)); } int amdgpu_ras_recovery_init(struct amdgpu_device *adev); diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c index 6028b55ea52c..093ab98c31bb 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c @@ -1262,7 +1262,7 @@ static int gmc_v9_0_late_init(void *handle) * writes, while disables HBM ECC for vega10. */ if (!amdgpu_sriov_vf(adev) && (adev->asic_type == CHIP_VEGA10)) { - if (!(adev->ras_features & (1 << AMDGPU_RAS_BLOCK__UMC))) { + if (!(adev->ras_enabled & (1 << AMDGPU_RAS_BLOCK__UMC))) { if (adev->df.funcs->enable_ecc_force_par_wr_rmw) adev->df.funcs->enable_ecc_force_par_wr_rmw(adev, false); } diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c b/drivers/gpu/drm/amd/amdgpu/soc15.c index 301695f3f237..49ece2a7f9f0 100644 --- a/drivers/gpu/drm/amd/amdgpu/soc15.c +++ b/drivers/gpu/drm/amd/amdgpu/soc15.c @@ -655,7 +655,7 @@ static int soc15_asic_baco_reset(struct amdgpu_device *adev) int ret = 0; /* avoid NBIF got stuck when do RAS recovery in BACO reset */ - if (ras && adev->ras_features) + if (ras && adev->ras_enabled) adev->nbio.funcs->enable_doorbell_interrupt(adev, false); ret = amdgpu_dpm_baco_reset(adev); @@ -663,7 +663,7 @@ static int soc15_asic_baco_reset(struct amdgpu_device *adev) return ret; /* re-enable doorbell interrupt after BACO exit */ - if (ras && adev->ras_features) + if (ras && adev->ras_enabled) adev->nbio.funcs->enable_doorbell_interrupt(adev, true); return 0; @@ -710,7 +710,7 @@ soc15_asic_reset_method(struct amdgpu_device *adev) * 1. PMFW version > 0x284300: all cases use baco * 2. PMFW version <= 0x284300: only sGPU w/o RAS use baco */ - if (ras && adev->ras_features && + if (ras && adev->ras_enabled && adev->pm.fw_version <= 0x283400) baco_reset = false; break; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c index fb4f718a1148..7fae6a7e51f5 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c @@ -1430,13 +1430,13 @@ int kfd_topology_add_device(struct kfd_dev *gpu) adev = (struct amdgpu_device *)(dev->gpu->kgd); /* kfd only concerns sram ecc on GFX and HBM ecc on UMC */ dev->node_props.capability |= - ((adev->ras_features & BIT(AMDGPU_RAS_BLOCK__GFX)) != 0) ? + ((adev->ras_enabled & BIT(AMDGPU_RAS_BLOCK__GFX)) != 0) ? HSA_CAP_SRAM_EDCSUPPORTED : 0; - dev->node_props.capability |= ((adev->ras_features & BIT(AMDGPU_RAS_BLOCK__UMC)) != 0) ? + dev->node_props.capability |= ((adev->ras_enabled & BIT(AMDGPU_RAS_BLOCK__UMC)) != 0) ? HSA_CAP_MEM_EDCSUPPORTED : 0; if (adev->asic_type != CHIP_VEGA10) - dev->node_props.capability |= (adev->ras_features != 0) ? + dev->node_props.capability |= (adev->ras_enabled != 0) ? HSA_CAP_RASEVENTNOTIFY : 0; /* SVM API and HMM page migration work together, device memory type diff --git a/drivers/gpu/drm/amd/pm/powerplay/hwmgr/vega20_baco.c b/drivers/gpu/drm/amd/pm/powerplay/hwmgr/vega20_baco.c index f6b1efce450c..8d99c7a5abf8 100644 --- a/drivers/gpu/drm/amd/pm/powerplay/hwmgr/vega20_baco.c +++ b/drivers/gpu/drm/amd/pm/powerplay/hwmgr/vega20_baco.c @@ -85,7 +85,7 @@ int vega20_baco_set_state(struct pp_hwmgr *hwmgr, enum BACO_STATE state) return 0; if (state == BACO_STATE_IN) { - if (!ras || !adev->ras_features) { + if (!ras || !adev->ras_enabled) { data = RREG32_SOC15(THM, 0, mmTHM_BACO_CNTL); data |= 0x80000000; WREG32_SOC15(THM, 0, mmTHM_BACO_CNTL, data); diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c b/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c index 72581e43d83e..a06e6865507d 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c @@ -1531,7 +1531,7 @@ int smu_v11_0_baco_set_state(struct smu_context *smu, enum smu_baco_state state) NULL); break; default: - if (!ras || !adev->ras_features || + if (!ras || !adev->ras_enabled || adev->gmc.xgmi.pending_reset) { if (adev->asic_type == CHIP_ARCTURUS) { data = RREG32_SOC15(THM, 0, mmTHM_BACO_CNTL_ARCT); -- cgit v1.2.3 From ef0d7d2001c99eedb5a8ba71e9f7a055bfb029ea Mon Sep 17 00:00:00 2001 From: Luben Tuikov Date: Tue, 4 May 2021 02:32:20 -0400 Subject: drm/amdgpu: Export ras_*_enabled to debugfs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Export the runtime-set "ras_hw_enabled" and "ras_enabled" to debugfs, for debugging. Cc: Alexander Deucher Cc: John Clements Cc: Hawking Zhang Signed-off-by: Luben Tuikov Acked-by: Christian König Reviewed-by: John Clements Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 444f5325f092..609530c4a599 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1299,8 +1299,8 @@ static int amdgpu_ras_sysfs_remove_all(struct amdgpu_device *adev) static struct dentry *amdgpu_ras_debugfs_create_ctrl_node(struct amdgpu_device *adev) { struct amdgpu_ras *con = amdgpu_ras_get_context(adev); - struct dentry *dir; - struct drm_minor *minor = adev_to_drm(adev)->primary; + struct drm_minor *minor = adev_to_drm(adev)->primary; + struct dentry *dir; dir = debugfs_create_dir(RAS_FS_NAME, minor->debugfs_root); debugfs_create_file("ras_ctrl", S_IWUGO | S_IRUGO, dir, adev, @@ -1309,6 +1309,8 @@ static struct dentry *amdgpu_ras_debugfs_create_ctrl_node(struct amdgpu_device * &amdgpu_ras_debugfs_eeprom_ops); debugfs_create_u32("bad_page_cnt_threshold", 0444, dir, &con->bad_page_cnt_threshold); + debugfs_create_x32("ras_hw_enabled", 0444, dir, &adev->ras_hw_enabled); + debugfs_create_x32("ras_enabled", 0444, dir, &adev->ras_enabled); /* * After one uncorrectable error happens, usually GPU recovery will -- cgit v1.2.3 From 7ddd9770857e26e6c284475260afce2fe71b4cb5 Mon Sep 17 00:00:00 2001 From: Oak Zeng Date: Thu, 6 May 2021 11:24:17 -0500 Subject: drm/amdgpu: Quit RAS initialization earlier if RAS is disabled If RAS is disabled through amdgpu_ras_enable kernel parameter, we should quit the RAS initialization eariler to avoid initialization of some RAS data structure such as sysfs etc. Reviewed-by: Hawking Zhang Signed-off-by: Oak Zeng Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 609530c4a599..a94be181d066 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -2151,7 +2151,7 @@ int amdgpu_ras_init(struct amdgpu_device *adev) amdgpu_ras_check_supported(adev); - if (!adev->ras_hw_enabled || adev->asic_type == CHIP_VEGA10) { + if (!adev->ras_enabled || adev->asic_type == CHIP_VEGA10) { /* set gfx block ras context feature for VEGA20 Gaming * send ras disable cmd to ras ta during ras late init. */ -- cgit v1.2.3 From 011907fda3605177f4fc05bf08fcf0fceaabda49 Mon Sep 17 00:00:00 2001 From: Dennis Li Date: Mon, 10 May 2021 11:04:59 +0800 Subject: drm/amdgpu: covert ras status to kernel errno The original codes use ras status and kernl errno together in the same function, which is a wrong code style. Signed-off-by: Dennis Li Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 29 ++++++++++++++++++++-- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 43 ++++++--------------------------- 2 files changed, 34 insertions(+), 38 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c index 3179ca9fc03f..f7bbb04d01ee 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c @@ -1114,6 +1114,31 @@ int psp_ras_invoke(struct psp_context *psp, uint32_t ta_cmd_id) return ret; } +static int psp_ras_status_to_errno(struct amdgpu_device *adev, + enum ta_ras_status ras_status) +{ + int ret = -EINVAL; + + switch (ras_status) { + case TA_RAS_STATUS__SUCCESS: + ret = 0; + break; + case TA_RAS_STATUS__RESET_NEEDED: + ret = -EAGAIN; + break; + case TA_RAS_STATUS__ERROR_RAS_NOT_AVAILABLE: + dev_warn(adev->dev, "RAS WARN: ras function unavailable\n"); + break; + case TA_RAS_STATUS__ERROR_ASD_READ_WRITE: + dev_warn(adev->dev, "RAS WARN: asd read or write failed\n"); + break; + default: + dev_err(adev->dev, "RAS ERROR: ras function failed ret 0x%X\n", ret); + } + + return ret; +} + int psp_ras_enable_features(struct psp_context *psp, union ta_ras_cmd_input *info, bool enable) { @@ -1137,7 +1162,7 @@ int psp_ras_enable_features(struct psp_context *psp, if (ret) return -EINVAL; - return ras_cmd->ras_status; + return psp_ras_status_to_errno(psp->adev, ras_cmd->ras_status); } static int psp_ras_terminate(struct psp_context *psp) @@ -1220,7 +1245,7 @@ int psp_ras_trigger_error(struct psp_context *psp, if (amdgpu_ras_intr_triggered()) return 0; - return ras_cmd->ras_status; + return psp_ras_status_to_errno(psp->adev, ras_cmd->ras_status); } // ras end diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index a94be181d066..4eebb97994d6 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -586,29 +586,6 @@ struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev, } /* obj end */ -static void amdgpu_ras_parse_status_code(struct amdgpu_device *adev, - const char* invoke_type, - const char* block_name, - enum ta_ras_status ret) -{ - switch (ret) { - case TA_RAS_STATUS__SUCCESS: - return; - case TA_RAS_STATUS__ERROR_RAS_NOT_AVAILABLE: - dev_warn(adev->dev, - "RAS WARN: %s %s currently unavailable\n", - invoke_type, - block_name); - break; - default: - dev_err(adev->dev, - "RAS ERROR: %s %s error failed ret 0x%X\n", - invoke_type, - block_name, - ret); - } -} - /* feature ctl begin */ static int amdgpu_ras_is_feature_allowed(struct amdgpu_device *adev, struct ras_common_if *head) @@ -703,15 +680,10 @@ int amdgpu_ras_feature_enable(struct amdgpu_device *adev, if (!amdgpu_ras_intr_triggered()) { ret = psp_ras_enable_features(&adev->psp, info, enable); if (ret) { - amdgpu_ras_parse_status_code(adev, - enable ? "enable":"disable", - ras_block_str(head->block), - (enum ta_ras_status)ret); - if (ret == TA_RAS_STATUS__RESET_NEEDED) - ret = -EAGAIN; - else - ret = -EINVAL; - + dev_err(adev->dev, "ras %s %s failed %d\n", + enable ? "enable":"disable", + ras_block_str(head->block), + ret); goto out; } } @@ -1056,10 +1028,9 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev, ret = -EINVAL; } - amdgpu_ras_parse_status_code(adev, - "inject", - ras_block_str(info->head.block), - (enum ta_ras_status)ret); + if (ret) + dev_err(adev->dev, "ras inject %s failed %d\n", + ras_block_str(info->head.block), ret); return ret; } -- cgit v1.2.3 From 7780f50358ee386de3b42ec236755fa72d97bb36 Mon Sep 17 00:00:00 2001 From: Dennis Li Date: Mon, 10 May 2021 19:08:11 +0800 Subject: drm/amdgpu: add function to clear MMEA error status for aldebaran For aldebaran, hardware will not clear error status automatically when reading error status register, insteadly driver should set clear bit of the error status register explicitly to clear error status. Signed-off-by: Dennis Li Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 4 ++++ drivers/gpu/drm/amd/amdgpu/mmhub_v1_7.c | 19 +++++++++++++++++++ 3 files changed, 24 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h index 11aa29933c1f..b27fcbccce2b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mmhub.h @@ -28,6 +28,7 @@ struct amdgpu_mmhub_ras_funcs { void *ras_error_status); void (*query_ras_error_status)(struct amdgpu_device *adev); void (*reset_ras_error_count)(struct amdgpu_device *adev); + void (*reset_ras_error_status)(struct amdgpu_device *adev); }; struct amdgpu_mmhub_funcs { diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 4eebb97994d6..a324dc2da101 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -938,6 +938,10 @@ int amdgpu_ras_reset_error_status(struct amdgpu_device *adev, if (adev->mmhub.ras_funcs && adev->mmhub.ras_funcs->reset_ras_error_count) adev->mmhub.ras_funcs->reset_ras_error_count(adev); + + if (adev->mmhub.ras_funcs && + adev->mmhub.ras_funcs->reset_ras_error_status) + adev->mmhub.ras_funcs->reset_ras_error_status(adev); break; case AMDGPU_RAS_BLOCK__SDMA: if (adev->sdma.funcs->reset_ras_error_count) diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_7.c b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_7.c index 9aaa137662b5..6264934b67ac 100644 --- a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_7.c +++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_7.c @@ -1315,12 +1315,31 @@ static void mmhub_v1_7_query_ras_error_status(struct amdgpu_device *adev) } } +static void mmhub_v1_7_reset_ras_error_status(struct amdgpu_device *adev) +{ + int i; + uint32_t reg_value; + + if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__MMHUB)) + return; + + for (i = 0; i < ARRAY_SIZE(mmhub_v1_7_ea_err_status_regs); i++) { + reg_value = RREG32(SOC15_REG_ENTRY_OFFSET( + mmhub_v1_7_ea_err_status_regs[i])); + reg_value = REG_SET_FIELD(reg_value, MMEA0_ERR_STATUS, + CLEAR_ERROR_STATUS, 0x01); + WREG32(SOC15_REG_ENTRY_OFFSET(mmhub_v1_7_ea_err_status_regs[i]), + reg_value); + } +} + const struct amdgpu_mmhub_ras_funcs mmhub_v1_7_ras_funcs = { .ras_late_init = amdgpu_mmhub_ras_late_init, .ras_fini = amdgpu_mmhub_ras_fini, .query_ras_error_count = mmhub_v1_7_query_ras_error_count, .reset_ras_error_count = mmhub_v1_7_reset_ras_error_count, .query_ras_error_status = mmhub_v1_7_query_ras_error_status, + .reset_ras_error_status = mmhub_v1_7_reset_ras_error_status, }; const struct amdgpu_mmhub_funcs mmhub_v1_7_funcs = { -- cgit v1.2.3 From c666bbf0e9b56de479a8453fa5713d694db494ac Mon Sep 17 00:00:00 2001 From: Dwaipayan Ray Date: Sun, 9 May 2021 20:19:23 +0530 Subject: drm/amd/amdgpu: Fix errors in function documentation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix a couple of syntax errors and removed one excess parameter in the function documentations which lead to kernel docs build warning. Reviewed-by: Christian König Signed-off-by: Dwaipayan Ray Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index a324dc2da101..b1c57a5b6e89 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -321,11 +321,14 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f, * "disable" requires only the block. * "enable" requires the block and error type. * "inject" requires the block, error type, address, and value. + * * The block is one of: umc, sdma, gfx, etc. * see ras_block_string[] for details + * * The error type is one of: ue, ce, where, * ue is multi-uncorrectable * ce is single-correctable + * * The sub-block is a the sub-block index, pass 0 if there is no sub-block. * The address and value are hexadecimal numbers, leading 0x is optional. * -- cgit v1.2.3