summaryrefslogtreecommitdiffstats
path: root/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
diff options
context:
space:
mode:
authorThomas Zimmermann <tzimmermann@suse.de>2022-07-14 09:57:37 +0200
committerThomas Zimmermann <tzimmermann@suse.de>2022-07-14 09:57:37 +0200
commitf83d9396d1f63048c423efa00e4e244da10a35fd (patch)
tree48da2cf9b20cc7049c92dd4a7dd74be11add86e5 /drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
parent5ee8c8f930ba7d20717c4fc2d9f1ce0e757d1155 (diff)
parent0180290abb5ce5c870f84a00ffeda5802f641dce (diff)
downloadlinux-f83d9396d1f63048c423efa00e4e244da10a35fd.tar.bz2
Merge drm/drm-next into drm-misc-next-fixes
Backmerging from drm/drm-next for the final fixes that will go into v5.20. Signed-off-by: Thomas Zimmermann <tzimmermann@suse.de>
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c')
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_device.c331
1 files changed, 204 insertions, 127 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 94666c2417c6..64f37713b270 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -32,6 +32,9 @@
#include <linux/slab.h>
#include <linux/iommu.h>
#include <linux/pci.h>
+#include <linux/devcoredump.h>
+#include <generated/utsrelease.h>
+#include <linux/pci-p2pdma.h>
#include <drm/drm_atomic_helper.h>
#include <drm/drm_probe_helper.h>
@@ -913,7 +916,10 @@ static int amdgpu_device_asic_init(struct amdgpu_device *adev)
{
amdgpu_asic_pre_asic_init(adev);
- return amdgpu_atom_asic_init(adev->mode_info.atom_context);
+ if (adev->ip_versions[GC_HWIP][0] >= IP_VERSION(11, 0, 0))
+ return amdgpu_atomfirmware_asic_init(adev, true);
+ else
+ return amdgpu_atom_asic_init(adev->mode_info.atom_context);
}
/**
@@ -1041,19 +1047,25 @@ static int amdgpu_device_doorbell_init(struct amdgpu_device *adev)
adev->doorbell.base = pci_resource_start(adev->pdev, 2);
adev->doorbell.size = pci_resource_len(adev->pdev, 2);
- adev->doorbell.num_doorbells = min_t(u32, adev->doorbell.size / sizeof(u32),
- adev->doorbell_index.max_assignment+1);
- if (adev->doorbell.num_doorbells == 0)
- return -EINVAL;
-
- /* For Vega, reserve and map two pages on doorbell BAR since SDMA
- * paging queue doorbell use the second page. The
- * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the
- * doorbells are in the first page. So with paging queue enabled,
- * the max num_doorbells should + 1 page (0x400 in dword)
- */
- if (adev->asic_type >= CHIP_VEGA10)
- adev->doorbell.num_doorbells += 0x400;
+ if (adev->enable_mes) {
+ adev->doorbell.num_doorbells =
+ adev->doorbell.size / sizeof(u32);
+ } else {
+ adev->doorbell.num_doorbells =
+ min_t(u32, adev->doorbell.size / sizeof(u32),
+ adev->doorbell_index.max_assignment+1);
+ if (adev->doorbell.num_doorbells == 0)
+ return -EINVAL;
+
+ /* For Vega, reserve and map two pages on doorbell BAR since SDMA
+ * paging queue doorbell use the second page. The
+ * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the
+ * doorbells are in the first page. So with paging queue enabled,
+ * the max num_doorbells should + 1 page (0x400 in dword)
+ */
+ if (adev->asic_type >= CHIP_VEGA10)
+ adev->doorbell.num_doorbells += 0x400;
+ }
adev->doorbell.ptr = ioremap(adev->doorbell.base,
adev->doorbell.num_doorbells *
@@ -1547,9 +1559,6 @@ static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);
- amdgpu_gmc_tmz_set(adev);
-
-
return 0;
}
@@ -1926,11 +1935,9 @@ static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
adev->firmware.gpu_info_fw = NULL;
if (adev->mman.discovery_bin) {
- amdgpu_discovery_get_gfx_info(adev);
-
/*
* FIXME: The bounding box is still needed by Navi12, so
- * temporarily read it from gpu_info firmware. Should be droped
+ * temporarily read it from gpu_info firmware. Should be dropped
* when DAL no longer needs it.
*/
if (adev->asic_type != CHIP_NAVI12)
@@ -1938,35 +1945,6 @@ static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
}
switch (adev->asic_type) {
-#ifdef CONFIG_DRM_AMDGPU_SI
- case CHIP_VERDE:
- case CHIP_TAHITI:
- case CHIP_PITCAIRN:
- case CHIP_OLAND:
- case CHIP_HAINAN:
-#endif
-#ifdef CONFIG_DRM_AMDGPU_CIK
- case CHIP_BONAIRE:
- case CHIP_HAWAII:
- case CHIP_KAVERI:
- case CHIP_KABINI:
- case CHIP_MULLINS:
-#endif
- case CHIP_TOPAZ:
- case CHIP_TONGA:
- case CHIP_FIJI:
- case CHIP_POLARIS10:
- case CHIP_POLARIS11:
- case CHIP_POLARIS12:
- case CHIP_VEGAM:
- case CHIP_CARRIZO:
- case CHIP_STONEY:
- case CHIP_VEGA20:
- case CHIP_ALDEBARAN:
- case CHIP_SIENNA_CICHLID:
- case CHIP_NAVY_FLOUNDER:
- case CHIP_DIMGREY_CAVEFISH:
- case CHIP_BEIGE_GOBY:
default:
return 0;
case CHIP_VEGA10:
@@ -3312,38 +3290,12 @@ bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
case CHIP_MULLINS:
/*
* We have systems in the wild with these ASICs that require
- * LVDS and VGA support which is not supported with DC.
+ * VGA support which is not supported with DC.
*
* Fallback to the non-DC driver here by default so as not to
* cause regressions.
*/
return amdgpu_dc > 0;
- case CHIP_HAWAII:
- case CHIP_CARRIZO:
- case CHIP_STONEY:
- case CHIP_POLARIS10:
- case CHIP_POLARIS11:
- case CHIP_POLARIS12:
- case CHIP_VEGAM:
- case CHIP_TONGA:
- case CHIP_FIJI:
- case CHIP_VEGA10:
- case CHIP_VEGA12:
- case CHIP_VEGA20:
-#if defined(CONFIG_DRM_AMD_DC_DCN)
- case CHIP_RAVEN:
- case CHIP_NAVI10:
- case CHIP_NAVI14:
- case CHIP_NAVI12:
- case CHIP_RENOIR:
- case CHIP_CYAN_SKILLFISH:
- case CHIP_SIENNA_CICHLID:
- case CHIP_NAVY_FLOUNDER:
- case CHIP_DIMGREY_CAVEFISH:
- case CHIP_BEIGE_GOBY:
- case CHIP_VANGOGH:
- case CHIP_YELLOW_CARP:
-#endif
default:
return amdgpu_dc != 0;
#else
@@ -3365,7 +3317,7 @@ bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
*/
bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
{
- if (amdgpu_sriov_vf(adev) ||
+ if (amdgpu_sriov_vf(adev) ||
adev->enable_virtual_display ||
(adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
return false;
@@ -3663,9 +3615,6 @@ int amdgpu_device_init(struct amdgpu_device *adev,
if (amdgpu_mcbp)
DRM_INFO("MCBP is enabled\n");
- if (amdgpu_mes && adev->asic_type >= CHIP_NAVI10)
- adev->enable_mes = true;
-
/*
* Reset domain needs to be present early, before XGMI hive discovered
* (if any) and intitialized to use reset sem and in_gpu reset flag
@@ -3689,6 +3638,9 @@ int amdgpu_device_init(struct amdgpu_device *adev,
if (r)
return r;
+ /* Enable TMZ based on IP_VERSION */
+ amdgpu_gmc_tmz_set(adev);
+
amdgpu_gmc_noretry_set(adev);
/* Need to get xgmi info early to decide the reset behavior*/
if (adev->gmc.xgmi.supported) {
@@ -3857,6 +3809,14 @@ fence_driver_init:
} else
adev->ucode_sysfs_en = true;
+ r = amdgpu_psp_sysfs_init(adev);
+ if (r) {
+ adev->psp_sysfs_en = false;
+ if (!amdgpu_sriov_vf(adev))
+ DRM_ERROR("Creating psp sysfs failed\n");
+ } else
+ adev->psp_sysfs_en = true;
+
/*
* Register gpu instance before amdgpu_device_enable_mgpu_fan_boost.
* Otherwise the mgpu fan boost feature will be skipped due to the
@@ -3960,10 +3920,6 @@ void amdgpu_device_fini_hw(struct amdgpu_device *adev)
{
dev_info(adev->dev, "amdgpu: finishing device.\n");
flush_delayed_work(&adev->delayed_init_work);
- if (adev->mman.initialized) {
- flush_delayed_work(&adev->mman.bdev.wq);
- ttm_bo_lock_delayed_workqueue(&adev->mman.bdev);
- }
adev->shutdown = true;
/* make sure IB test finished before entering exclusive mode
@@ -3984,10 +3940,17 @@ void amdgpu_device_fini_hw(struct amdgpu_device *adev)
}
amdgpu_fence_driver_hw_fini(adev);
+ if (adev->mman.initialized) {
+ flush_delayed_work(&adev->mman.bdev.wq);
+ ttm_bo_lock_delayed_workqueue(&adev->mman.bdev);
+ }
+
if (adev->pm_sysfs_en)
amdgpu_pm_sysfs_fini(adev);
if (adev->ucode_sysfs_en)
amdgpu_ucode_sysfs_fini(adev);
+ if (adev->psp_sysfs_en)
+ amdgpu_psp_sysfs_fini(adev);
sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes);
/* disable ras feature must before hw fini */
@@ -4486,6 +4449,7 @@ retry:
if (!r) {
amdgpu_irq_gpu_reset_resume_helper(adev);
r = amdgpu_ib_ring_tests(adev);
+
amdgpu_amdkfd_post_reset(adev);
}
@@ -4642,6 +4606,8 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
amdgpu_virt_fini_data_exchange(adev);
}
+ amdgpu_fence_driver_isr_toggle(adev, true);
+
/* block all schedulers and reset given job's ring */
for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
struct amdgpu_ring *ring = adev->rings[i];
@@ -4657,6 +4623,8 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
amdgpu_fence_driver_force_completion(ring);
}
+ amdgpu_fence_driver_isr_toggle(adev, false);
+
if (job && job->vm)
drm_sched_increase_karma(&job->base);
@@ -4697,20 +4665,73 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev)
{
- uint32_t reg_value;
int i;
lockdep_assert_held(&adev->reset_domain->sem);
dump_stack();
for (i = 0; i < adev->num_regs; i++) {
- reg_value = RREG32(adev->reset_dump_reg_list[i]);
- trace_amdgpu_reset_reg_dumps(adev->reset_dump_reg_list[i], reg_value);
+ adev->reset_dump_reg_value[i] = RREG32(adev->reset_dump_reg_list[i]);
+ trace_amdgpu_reset_reg_dumps(adev->reset_dump_reg_list[i],
+ adev->reset_dump_reg_value[i]);
}
return 0;
}
+#ifdef CONFIG_DEV_COREDUMP
+static ssize_t amdgpu_devcoredump_read(char *buffer, loff_t offset,
+ size_t count, void *data, size_t datalen)
+{
+ struct drm_printer p;
+ struct amdgpu_device *adev = data;
+ struct drm_print_iterator iter;
+ int i;
+
+ iter.data = buffer;
+ iter.offset = 0;
+ iter.start = offset;
+ iter.remain = count;
+
+ p = drm_coredump_printer(&iter);
+
+ drm_printf(&p, "**** AMDGPU Device Coredump ****\n");
+ drm_printf(&p, "kernel: " UTS_RELEASE "\n");
+ drm_printf(&p, "module: " KBUILD_MODNAME "\n");
+ drm_printf(&p, "time: %lld.%09ld\n", adev->reset_time.tv_sec, adev->reset_time.tv_nsec);
+ if (adev->reset_task_info.pid)
+ drm_printf(&p, "process_name: %s PID: %d\n",
+ adev->reset_task_info.process_name,
+ adev->reset_task_info.pid);
+
+ if (adev->reset_vram_lost)
+ drm_printf(&p, "VRAM is lost due to GPU reset!\n");
+ if (adev->num_regs) {
+ drm_printf(&p, "AMDGPU register dumps:\nOffset: Value:\n");
+
+ for (i = 0; i < adev->num_regs; i++)
+ drm_printf(&p, "0x%08x: 0x%08x\n",
+ adev->reset_dump_reg_list[i],
+ adev->reset_dump_reg_value[i]);
+ }
+
+ return count - iter.remain;
+}
+
+static void amdgpu_devcoredump_free(void *data)
+{
+}
+
+static void amdgpu_reset_capture_coredumpm(struct amdgpu_device *adev)
+{
+ struct drm_device *dev = adev_to_drm(adev);
+
+ ktime_get_ts64(&adev->reset_time);
+ dev_coredumpm(dev->dev, THIS_MODULE, adev, 0, GFP_KERNEL,
+ amdgpu_devcoredump_read, amdgpu_devcoredump_free);
+}
+#endif
+
int amdgpu_do_asic_reset(struct list_head *device_list_handle,
struct amdgpu_reset_context *reset_context)
{
@@ -4795,6 +4816,15 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,
goto out;
vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
+#ifdef CONFIG_DEV_COREDUMP
+ tmp_adev->reset_vram_lost = vram_lost;
+ memset(&tmp_adev->reset_task_info, 0,
+ sizeof(tmp_adev->reset_task_info));
+ if (reset_context->job && reset_context->job->vm)
+ tmp_adev->reset_task_info =
+ reset_context->job->vm->task_info;
+ amdgpu_reset_capture_coredumpm(tmp_adev);
+#endif
if (vram_lost) {
DRM_INFO("VRAM is lost due to GPU reset!\n");
amdgpu_inc_vram_lost(tmp_adev);
@@ -4980,16 +5010,32 @@ static void amdgpu_device_recheck_guilty_jobs(
/* clear job's guilty and depend the folowing step to decide the real one */
drm_sched_reset_karma(s_job);
- /* for the real bad job, it will be resubmitted twice, adding a dma_fence_get
- * to make sure fence is balanced */
- dma_fence_get(s_job->s_fence->parent);
drm_sched_resubmit_jobs_ext(&ring->sched, 1);
+ if (!s_job->s_fence->parent) {
+ DRM_WARN("Failed to get a HW fence for job!");
+ continue;
+ }
+
ret = dma_fence_wait_timeout(s_job->s_fence->parent, false, ring->sched.timeout);
if (ret == 0) { /* timeout */
DRM_ERROR("Found the real bad job! ring:%s, job_id:%llx\n",
ring->sched.name, s_job->id);
+
+ amdgpu_fence_driver_isr_toggle(adev, true);
+
+ /* Clear this failed job from fence array */
+ amdgpu_fence_driver_clear_job_fences(ring);
+
+ amdgpu_fence_driver_isr_toggle(adev, false);
+
+ /* Since the job won't signal and we go for
+ * another resubmit drop this parent pointer
+ */
+ dma_fence_put(s_job->s_fence->parent);
+ s_job->s_fence->parent = NULL;
+
/* set guilty */
drm_sched_increase_karma(s_job);
retry:
@@ -5018,7 +5064,6 @@ retry:
/* got the hw fence, signal finished fence */
atomic_dec(ring->sched.score);
- dma_fence_put(s_job->s_fence->parent);
dma_fence_get(&s_job->s_fence->finished);
dma_fence_signal(&s_job->s_fence->finished);
dma_fence_put(&s_job->s_fence->finished);
@@ -5031,8 +5076,29 @@ retry:
}
}
+static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
+{
+ struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+
+#if defined(CONFIG_DEBUG_FS)
+ if (!amdgpu_sriov_vf(adev))
+ cancel_work(&adev->reset_work);
+#endif
+
+ if (adev->kfd.dev)
+ cancel_work(&adev->kfd.reset_work);
+
+ if (amdgpu_sriov_vf(adev))
+ cancel_work(&adev->virt.flr_work);
+
+ if (con && adev->ras_enabled)
+ cancel_work(&con->recovery_work);
+
+}
+
+
/**
- * amdgpu_device_gpu_recover_imp - reset the asic and recover scheduler
+ * amdgpu_device_gpu_recover - reset the asic and recover scheduler
*
* @adev: amdgpu_device pointer
* @job: which job trigger hang
@@ -5042,7 +5108,7 @@ retry:
* Returns 0 for success or an error on failure.
*/
-int amdgpu_device_gpu_recover_imp(struct amdgpu_device *adev,
+int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
struct amdgpu_job *job)
{
struct list_head device_list, *device_list_handle = NULL;
@@ -5140,7 +5206,7 @@ int amdgpu_device_gpu_recover_imp(struct amdgpu_device *adev,
*/
amdgpu_unregister_gpu_instance(tmp_adev);
- drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, true);
+ drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, true);
/* disable ras on ALL IPs */
if (!need_emergency_restart &&
@@ -5170,8 +5236,8 @@ int amdgpu_device_gpu_recover_imp(struct amdgpu_device *adev,
*
* job->base holds a reference to parent fence
*/
- if (job && job->base.s_fence->parent &&
- dma_fence_is_signaled(job->base.s_fence->parent)) {
+ if (job && (job->hw_fence.ops != NULL) &&
+ dma_fence_is_signaled(&job->hw_fence)) {
job_signaled = true;
dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
goto skip_hw_reset;
@@ -5186,6 +5252,12 @@ retry: /* Rest of adevs pre asic reset from XGMI hive. */
r, adev_to_drm(tmp_adev)->unique);
tmp_adev->asic_reset_res = r;
}
+
+ /*
+ * Drop all pending non scheduler resets. Scheduler resets
+ * were already dropped during drm_sched_stop
+ */
+ amdgpu_device_stop_pending_resets(tmp_adev);
}
tmp_vram_lost_counter = atomic_read(&((adev)->vram_lost_counter));
@@ -5195,6 +5267,10 @@ retry: /* Rest of adevs pre asic reset from XGMI hive. */
r = amdgpu_device_reset_sriov(adev, job ? false : true);
if (r)
adev->asic_reset_res = r;
+
+ /* Aldebaran supports ras in SRIOV, so need resume ras during reset */
+ if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 2))
+ amdgpu_ras_resume(adev);
} else {
r = amdgpu_do_asic_reset(device_list_handle, &reset_context);
if (r && r == -EAGAIN)
@@ -5280,38 +5356,9 @@ skip_sched_resume:
if (r)
dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
- return r;
-}
-
-struct amdgpu_recover_work_struct {
- struct work_struct base;
- struct amdgpu_device *adev;
- struct amdgpu_job *job;
- int ret;
-};
-
-static void amdgpu_device_queue_gpu_recover_work(struct work_struct *work)
-{
- struct amdgpu_recover_work_struct *recover_work = container_of(work, struct amdgpu_recover_work_struct, base);
-
- recover_work->ret = amdgpu_device_gpu_recover_imp(recover_work->adev, recover_work->job);
-}
-/*
- * Serialize gpu recover into reset domain single threaded wq
- */
-int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
- struct amdgpu_job *job)
-{
- struct amdgpu_recover_work_struct work = {.adev = adev, .job = job};
-
- INIT_WORK(&work.base, amdgpu_device_queue_gpu_recover_work);
- if (!amdgpu_reset_domain_schedule(adev->reset_domain, &work.base))
- return -EAGAIN;
-
- flush_work(&work.base);
-
- return work.ret;
+ atomic_set(&adev->reset_domain->reset_res, r);
+ return r;
}
/**
@@ -5462,6 +5509,36 @@ static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
}
}
+/**
+ * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR
+ *
+ * @adev: amdgpu_device pointer
+ * @peer_adev: amdgpu_device pointer for peer device trying to access @adev
+ *
+ * Return true if @peer_adev can access (DMA) @adev through the PCIe
+ * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of
+ * @peer_adev.
+ */
+bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev,
+ struct amdgpu_device *peer_adev)
+{
+#ifdef CONFIG_HSA_AMD_P2P
+ uint64_t address_mask = peer_adev->dev->dma_mask ?
+ ~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1);
+ resource_size_t aper_limit =
+ adev->gmc.aper_base + adev->gmc.aper_size - 1;
+ bool p2p_access = !(pci_p2pdma_distance_many(adev->pdev,
+ &peer_adev->dev, 1, true) < 0);
+
+ return pcie_p2p && p2p_access && (adev->gmc.visible_vram_size &&
+ adev->gmc.real_vram_size == adev->gmc.visible_vram_size &&
+ !(adev->gmc.aper_base & address_mask ||
+ aper_limit & address_mask));
+#else
+ return false;
+#endif
+}
+
int amdgpu_device_baco_enter(struct drm_device *dev)
{
struct amdgpu_device *adev = drm_to_adev(dev);