Merge tag 'amd-drm-next-5.10-2020-09-03' of git://people.freedesktop.org/~agd5f/linux into drm-next

amd-drm-next-5.10-2020-09-03: amdgpu: - RAS fixes - Sienna Cichlid updates - Navy Flounder updates - DCE6 (SI) support in DC - Enable plane rotation - Rework pre-OS vram reservation handling during driver init - Add standard interface to dump GPU metrics table from SMU - Rework tiling and tmz state handling in atomic commits - Pstate fixes - Add voltage and power hwmon interfaces for renoir - SW CTF fixes - S/G display fix for Raven - Print client strings for vmfaults for vega and newer - Manual fan control fixes - Display updates - Reorg power management directory structure - Misc bug fixes - Misc code cleanups amdkfd: - Topology fixes - Add SMI events for thermal throttling and GPU resets radeon: - switch from pci_* to dma_* for dma allocations - PLL fix Scheduler: - Clean up priority levels UAPI: - amdgpu INFO IOCTL query update for TMZ state https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/6049 - amdkfd SMI event interface updates https://github.com/RadeonOpenCompute/rocm_smi_lib/tree/therm_thrott From: Alex Deucher <alexdeucher@gmail.com> Link: https://patchwork.freedesktop.org/patch/msgid/20200903222921.4152-1-alexander.deucher@amd.com
author: Dave Airlie <airlied@redhat.com> 2020-09-08 16:40:13 +1000
committer: Dave Airlie <airlied@redhat.com> 2020-09-08 16:40:13 +1000
commit: 0c8d22fcae2f9590a07b000e1724f665820b77f7 (patch)
tree: 5a2405fe298358d861a58dc933184ee6d3415eb4 /drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
parent: ce5c207c6b8dd9cdeaeeb2345b8a69335c0d98bf (diff)
parent: 11bc98bd71fe2e0cb572988519e51bca9d58a18a (diff)
download: linux-0c8d22fcae2f9590a07b000e1724f665820b77f7.tar.bz2
1 files changed, 19 insertions, 3 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index b2667342cf67..6b8d7bb83bb3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -31,6 +31,10 @@
 #include "ta_ras_if.h"
 #include "amdgpu_ras_eeprom.h"
 
+#define AMDGPU_RAS_FLAG_INIT_BY_VBIOS		(0x1 << 0)
+#define AMDGPU_RAS_FLAG_INIT_NEED_RESET		(0x1 << 1)
+#define AMDGPU_RAS_FLAG_SKIP_BAD_PAGE_RESV	(0x1 << 2)
+
 enum amdgpu_ras_block {
 	AMDGPU_RAS_BLOCK__UMC = 0,
 	AMDGPU_RAS_BLOCK__SDMA,
@@ -336,6 +340,12 @@ struct amdgpu_ras {
 	struct amdgpu_ras_eeprom_control eeprom_control;
 
 	bool error_query_ready;
+
+	/* bad page count threshold */
+	uint32_t bad_page_cnt_threshold;
+
+	/* disable ras error count harvest in recovery */
+	bool disable_ras_err_cnt_harvest;
 };
 
 struct ras_fs_data {
@@ -490,6 +500,8 @@ void amdgpu_ras_suspend(struct amdgpu_device *adev);
 unsigned long amdgpu_ras_query_error_count(struct amdgpu_device *adev,
 		bool is_ce);
 
+bool amdgpu_ras_check_err_threshold(struct amdgpu_device *adev);
+
 /* error handling functions */
 int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
 		struct eeprom_table_record *bps, int pages);
@@ -500,10 +512,14 @@ static inline int amdgpu_ras_reset_gpu(struct amdgpu_device *adev)
 {
 	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
 
-	/* save bad page to eeprom before gpu reset,
-	 * i2c may be unstable in gpu reset
+	/*
+	 * Save bad page to eeprom before gpu reset, i2c may be unstable
+	 * in gpu reset.
+	 *
+	 * Also, exclude the case when ras recovery issuer is
+	 * eeprom page write itself.
 	 */
-	if (in_task())
+	if (!(ras->flags & AMDGPU_RAS_FLAG_SKIP_BAD_PAGE_RESV) && in_task())
 		amdgpu_ras_reserve_bad_pages(adev);
 
 	if (atomic_cmpxchg(&ras->in_recovery, 0, 1) == 0)
author	Dave Airlie <airlied@redhat.com>	2020-09-08 16:40:13 +1000
committer	Dave Airlie <airlied@redhat.com>	2020-09-08 16:40:13 +1000
commit	0c8d22fcae2f9590a07b000e1724f665820b77f7 (patch)
tree	5a2405fe298358d861a58dc933184ee6d3415eb4 /drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
parent	ce5c207c6b8dd9cdeaeeb2345b8a69335c0d98bf (diff)
parent	11bc98bd71fe2e0cb572988519e51bca9d58a18a (diff)
download	linux-0c8d22fcae2f9590a07b000e1724f665820b77f7.tar.bz2