From b82e65a93510465cb4c203c938245f137a4e95dc Mon Sep 17 00:00:00 2001
From: Guchun Chen
Date: Thu, 23 Jul 2020 15:42:19 +0800
Subject: drm/amdgpu: break driver init process when it's bad GPU(v5)

When retrieving bad gpu tag from eeprom, GPU init should
fail as the GPU needs to be retired for further check.

v2: Fix spelling typo, correct the condition to detect
    bad gpu tag and refine error message.

v3: Refine function argument name.

v4: Fix missing check of returning value of i2c
    initialization error case.

v5: Use dev_err to print PCI information in dmesg
    instead of DRM_ERROR.

Signed-off-by: Guchun Chen
Reviewed-by: Hawking Zhang
Signed-off-by: Alex Deucher
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

(limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c')

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 6660094a1063..1a1652ea76b0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1821,6 +1821,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 	struct ras_err_handler_data **data;
 	uint32_t max_eeprom_records_len = 0;
+	bool exc_err_limit = false;
 	int ret;
 
 	if (con)
@@ -1842,8 +1843,12 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
 	max_eeprom_records_len = amdgpu_ras_eeprom_get_record_max_length();
 	amdgpu_ras_validate_threshold(adev, max_eeprom_records_len);
 
-	ret = amdgpu_ras_eeprom_init(&con->eeprom_control);
-	if (ret)
+	ret = amdgpu_ras_eeprom_init(&con->eeprom_control, &exc_err_limit);
+	/*
+	 * This calling fails when exc_err_limit is true or
+	 * ret != 0.
+	 */
+	if (exc_err_limit || ret)
 		goto free;
 
 	if (con->eeprom_control.num_recs) {
@@ -1867,6 +1872,15 @@ free:
 out:
 	dev_warn(adev->dev, "Failed to initialize ras recovery!\n");
 
+	/*
+	 * Except error threshold exceeding case, other failure cases in this
+	 * function would not fail amdgpu driver init.
+	 */
+	if (!exc_err_limit)
+		ret = 0;
+	else
+		ret = -EINVAL;
+
 	return ret;
 }
 
-- 
cgit v1.2.3