author      Thomas Zimmermann <tzimmermann@suse.de>    2022-07-14 09:57:37 +0200
committer   Thomas Zimmermann <tzimmermann@suse.de>    2022-07-14 09:57:37 +0200
commit      f83d9396d1f63048c423efa00e4e244da10a35fd (patch)
tree        48da2cf9b20cc7049c92dd4a7dd74be11add86e5 /drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
parent      5ee8c8f930ba7d20717c4fc2d9f1ce0e757d1155 (diff)
parent      0180290abb5ce5c870f84a00ffeda5802f641dce (diff)
download    linux-f83d9396d1f63048c423efa00e4e244da10a35fd.tar.bz2
Merge drm/drm-next into drm-misc-next-fixes
Backmerging from drm/drm-next for the final fixes that will go
into v5.20.
Signed-off-by: Thomas Zimmermann <tzimmermann@suse.de>
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c')
-rw-r--r--   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c   331
1 file changed, 204 insertions, 127 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 94666c2417c6..64f37713b270 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -32,6 +32,9 @@
 #include <linux/slab.h>
 #include <linux/iommu.h>
 #include <linux/pci.h>
+#include <linux/devcoredump.h>
+#include <generated/utsrelease.h>
+#include <linux/pci-p2pdma.h>
 
 #include <drm/drm_atomic_helper.h>
 #include <drm/drm_probe_helper.h>
@@ -913,7 +916,10 @@ static int amdgpu_device_asic_init(struct amdgpu_device *adev)
 {
     amdgpu_asic_pre_asic_init(adev);
 
-    return amdgpu_atom_asic_init(adev->mode_info.atom_context);
+    if (adev->ip_versions[GC_HWIP][0] >= IP_VERSION(11, 0, 0))
+        return amdgpu_atomfirmware_asic_init(adev, true);
+    else
+        return amdgpu_atom_asic_init(adev->mode_info.atom_context);
 }
 
 /**
@@ -1041,19 +1047,25 @@ static int amdgpu_device_doorbell_init(struct amdgpu_device *adev)
     adev->doorbell.base = pci_resource_start(adev->pdev, 2);
     adev->doorbell.size = pci_resource_len(adev->pdev, 2);
 
-    adev->doorbell.num_doorbells = min_t(u32, adev->doorbell.size / sizeof(u32),
-                                         adev->doorbell_index.max_assignment+1);
-    if (adev->doorbell.num_doorbells == 0)
-        return -EINVAL;
-
-    /* For Vega, reserve and map two pages on doorbell BAR since SDMA
-     * paging queue doorbell use the second page. The
-     * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the
-     * doorbells are in the first page. So with paging queue enabled,
-     * the max num_doorbells should + 1 page (0x400 in dword)
-     */
-    if (adev->asic_type >= CHIP_VEGA10)
-        adev->doorbell.num_doorbells += 0x400;
+    if (adev->enable_mes) {
+        adev->doorbell.num_doorbells =
+            adev->doorbell.size / sizeof(u32);
+    } else {
+        adev->doorbell.num_doorbells =
+            min_t(u32, adev->doorbell.size / sizeof(u32),
+                  adev->doorbell_index.max_assignment+1);
+        if (adev->doorbell.num_doorbells == 0)
+            return -EINVAL;
+
+        /* For Vega, reserve and map two pages on doorbell BAR since SDMA
+         * paging queue doorbell use the second page. The
+         * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the
+         * doorbells are in the first page. So with paging queue enabled,
+         * the max num_doorbells should + 1 page (0x400 in dword)
+         */
+        if (adev->asic_type >= CHIP_VEGA10)
+            adev->doorbell.num_doorbells += 0x400;
+    }
 
     adev->doorbell.ptr = ioremap(adev->doorbell.base,
                                  adev->doorbell.num_doorbells *
@@ -1547,9 +1559,6 @@ static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
     adev->firmware.load_type = amdgpu_ucode_get_load_type(adev,
                          amdgpu_fw_load_type);
 
-    amdgpu_gmc_tmz_set(adev);
-
-
     return 0;
 }
@@ -1926,11 +1935,9 @@ static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
     adev->firmware.gpu_info_fw = NULL;
 
     if (adev->mman.discovery_bin) {
-        amdgpu_discovery_get_gfx_info(adev);
-
         /*
          * FIXME: The bounding box is still needed by Navi12, so
-         * temporarily read it from gpu_info firmware. Should be droped
+         * temporarily read it from gpu_info firmware. Should be dropped
          * when DAL no longer needs it.
          */
         if (adev->asic_type != CHIP_NAVI12)
@@ -1938,35 +1945,6 @@ static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
     }
 
     switch (adev->asic_type) {
-#ifdef CONFIG_DRM_AMDGPU_SI
-    case CHIP_VERDE:
-    case CHIP_TAHITI:
-    case CHIP_PITCAIRN:
-    case CHIP_OLAND:
-    case CHIP_HAINAN:
-#endif
-#ifdef CONFIG_DRM_AMDGPU_CIK
-    case CHIP_BONAIRE:
-    case CHIP_HAWAII:
-    case CHIP_KAVERI:
-    case CHIP_KABINI:
-    case CHIP_MULLINS:
-#endif
-    case CHIP_TOPAZ:
-    case CHIP_TONGA:
-    case CHIP_FIJI:
-    case CHIP_POLARIS10:
-    case CHIP_POLARIS11:
-    case CHIP_POLARIS12:
-    case CHIP_VEGAM:
-    case CHIP_CARRIZO:
-    case CHIP_STONEY:
-    case CHIP_VEGA20:
-    case CHIP_ALDEBARAN:
-    case CHIP_SIENNA_CICHLID:
-    case CHIP_NAVY_FLOUNDER:
-    case CHIP_DIMGREY_CAVEFISH:
-    case CHIP_BEIGE_GOBY:
     default:
         return 0;
     case CHIP_VEGA10:
@@ -3312,38 +3290,12 @@ bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
     case CHIP_MULLINS:
         /*
          * We have systems in the wild with these ASICs that require
-         * LVDS and VGA support which is not supported with DC.
+         * VGA support which is not supported with DC.
          *
          * Fallback to the non-DC driver here by default so as not to
          * cause regressions.
         */
         return amdgpu_dc > 0;
-    case CHIP_HAWAII:
-    case CHIP_CARRIZO:
-    case CHIP_STONEY:
-    case CHIP_POLARIS10:
-    case CHIP_POLARIS11:
-    case CHIP_POLARIS12:
-    case CHIP_VEGAM:
-    case CHIP_TONGA:
-    case CHIP_FIJI:
-    case CHIP_VEGA10:
-    case CHIP_VEGA12:
-    case CHIP_VEGA20:
-#if defined(CONFIG_DRM_AMD_DC_DCN)
-    case CHIP_RAVEN:
-    case CHIP_NAVI10:
-    case CHIP_NAVI14:
-    case CHIP_NAVI12:
-    case CHIP_RENOIR:
-    case CHIP_CYAN_SKILLFISH:
-    case CHIP_SIENNA_CICHLID:
-    case CHIP_NAVY_FLOUNDER:
-    case CHIP_DIMGREY_CAVEFISH:
-    case CHIP_BEIGE_GOBY:
-    case CHIP_VANGOGH:
-    case CHIP_YELLOW_CARP:
-#endif
     default:
         return amdgpu_dc != 0;
 #else
@@ -3365,7 +3317,7 @@ bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
  */
 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
 {
-    if (amdgpu_sriov_vf(adev) ||
+    if (amdgpu_sriov_vf(adev) || adev->enable_virtual_display ||
         (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
         return false;
 
@@ -3663,9 +3615,6 @@ int amdgpu_device_init(struct amdgpu_device *adev,
     if (amdgpu_mcbp)
         DRM_INFO("MCBP is enabled\n");
 
-    if (amdgpu_mes && adev->asic_type >= CHIP_NAVI10)
-        adev->enable_mes = true;
-
     /*
      * Reset domain needs to be present early, before XGMI hive discovered
      * (if any) and intitialized to use reset sem and in_gpu reset flag
@@ -3689,6 +3638,9 @@ int amdgpu_device_init(struct amdgpu_device *adev,
     if (r)
         return r;
 
+    /* Enable TMZ based on IP_VERSION */
+    amdgpu_gmc_tmz_set(adev);
+
     amdgpu_gmc_noretry_set(adev);
     /* Need to get xgmi info early to decide the reset behavior*/
     if (adev->gmc.xgmi.supported) {
@@ -3857,6 +3809,14 @@ fence_driver_init:
     } else
         adev->ucode_sysfs_en = true;
 
+    r = amdgpu_psp_sysfs_init(adev);
+    if (r) {
+        adev->psp_sysfs_en = false;
+        if (!amdgpu_sriov_vf(adev))
+            DRM_ERROR("Creating psp sysfs failed\n");
+    } else
+        adev->psp_sysfs_en = true;
+
     /*
      * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost.
      * Otherwise the mgpu fan boost feature will be skipped due to the
@@ -3960,10 +3920,6 @@ void amdgpu_device_fini_hw(struct amdgpu_device *adev)
 {
     dev_info(adev->dev, "amdgpu: finishing device.\n");
     flush_delayed_work(&adev->delayed_init_work);
-    if (adev->mman.initialized) {
-        flush_delayed_work(&adev->mman.bdev.wq);
-        ttm_bo_lock_delayed_workqueue(&adev->mman.bdev);
-    }
     adev->shutdown = true;
 
     /* make sure IB test finished before entering exclusive mode
@@ -3984,10 +3940,17 @@ void amdgpu_device_fini_hw(struct amdgpu_device *adev)
     }
     amdgpu_fence_driver_hw_fini(adev);
 
+    if (adev->mman.initialized) {
+        flush_delayed_work(&adev->mman.bdev.wq);
+        ttm_bo_lock_delayed_workqueue(&adev->mman.bdev);
+    }
+
     if (adev->pm_sysfs_en)
         amdgpu_pm_sysfs_fini(adev);
     if (adev->ucode_sysfs_en)
         amdgpu_ucode_sysfs_fini(adev);
+    if (adev->psp_sysfs_en)
+        amdgpu_psp_sysfs_fini(adev);
     sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes);
 
     /* disable ras feature must before hw fini */
@@ -4486,6 +4449,7 @@ retry:
     if (!r) {
         amdgpu_irq_gpu_reset_resume_helper(adev);
         r = amdgpu_ib_ring_tests(adev);
+        amdgpu_amdkfd_post_reset(adev);
     }
 
@@ -4642,6 +4606,8 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
         amdgpu_virt_fini_data_exchange(adev);
     }
 
+    amdgpu_fence_driver_isr_toggle(adev, true);
+
     /* block all schedulers and reset given job's ring */
     for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
         struct amdgpu_ring *ring = adev->rings[i];
@@ -4657,6 +4623,8 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
         amdgpu_fence_driver_force_completion(ring);
     }
 
+    amdgpu_fence_driver_isr_toggle(adev, false);
+
     if (job && job->vm)
         drm_sched_increase_karma(&job->base);
 
@@ -4697,20 +4665,73 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
 
 static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev)
 {
-    uint32_t reg_value;
     int i;
 
     lockdep_assert_held(&adev->reset_domain->sem);
     dump_stack();
     for (i = 0; i < adev->num_regs; i++) {
-        reg_value = RREG32(adev->reset_dump_reg_list[i]);
-        trace_amdgpu_reset_reg_dumps(adev->reset_dump_reg_list[i], reg_value);
+        adev->reset_dump_reg_value[i] = RREG32(adev->reset_dump_reg_list[i]);
+        trace_amdgpu_reset_reg_dumps(adev->reset_dump_reg_list[i],
+                                     adev->reset_dump_reg_value[i]);
     }
 
     return 0;
 }
 
+#ifdef CONFIG_DEV_COREDUMP
+static ssize_t amdgpu_devcoredump_read(char *buffer, loff_t offset,
+        size_t count, void *data, size_t datalen)
+{
+    struct drm_printer p;
+    struct amdgpu_device *adev = data;
+    struct drm_print_iterator iter;
+    int i;
+
+    iter.data = buffer;
+    iter.offset = 0;
+    iter.start = offset;
+    iter.remain = count;
+
+    p = drm_coredump_printer(&iter);
+
+    drm_printf(&p, "**** AMDGPU Device Coredump ****\n");
+    drm_printf(&p, "kernel: " UTS_RELEASE "\n");
+    drm_printf(&p, "module: " KBUILD_MODNAME "\n");
+    drm_printf(&p, "time: %lld.%09ld\n", adev->reset_time.tv_sec, adev->reset_time.tv_nsec);
+    if (adev->reset_task_info.pid)
+        drm_printf(&p, "process_name: %s PID: %d\n",
+               adev->reset_task_info.process_name,
+               adev->reset_task_info.pid);
+
+    if (adev->reset_vram_lost)
+        drm_printf(&p, "VRAM is lost due to GPU reset!\n");
+    if (adev->num_regs) {
+        drm_printf(&p, "AMDGPU register dumps:\nOffset: Value:\n");
+
+        for (i = 0; i < adev->num_regs; i++)
+            drm_printf(&p, "0x%08x: 0x%08x\n",
+                   adev->reset_dump_reg_list[i],
+                   adev->reset_dump_reg_value[i]);
+    }
+
+    return count - iter.remain;
+}
+
+static void amdgpu_devcoredump_free(void *data)
+{
+}
+
+static void amdgpu_reset_capture_coredumpm(struct amdgpu_device *adev)
+{
+    struct drm_device *dev = adev_to_drm(adev);
+
+    ktime_get_ts64(&adev->reset_time);
+    dev_coredumpm(dev->dev, THIS_MODULE, adev, 0, GFP_KERNEL,
+              amdgpu_devcoredump_read, amdgpu_devcoredump_free);
+}
+#endif
+
 int amdgpu_do_asic_reset(struct list_head *device_list_handle,
              struct amdgpu_reset_context *reset_context)
 {
@@ -4795,6 +4816,15 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,
                     goto out;
 
                 vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
+#ifdef CONFIG_DEV_COREDUMP
+                tmp_adev->reset_vram_lost = vram_lost;
+                memset(&tmp_adev->reset_task_info, 0,
+                        sizeof(tmp_adev->reset_task_info));
+                if (reset_context->job && reset_context->job->vm)
+                    tmp_adev->reset_task_info =
+                        reset_context->job->vm->task_info;
+                amdgpu_reset_capture_coredumpm(tmp_adev);
+#endif
                 if (vram_lost) {
                     DRM_INFO("VRAM is lost due to GPU reset!\n");
                     amdgpu_inc_vram_lost(tmp_adev);
@@ -4980,16 +5010,32 @@ static void amdgpu_device_recheck_guilty_jobs(
 
         /* clear job's guilty and depend the folowing step to decide the real one */
         drm_sched_reset_karma(s_job);
-        /* for the real bad job, it will be resubmitted twice, adding a dma_fence_get
-         * to make sure fence is balanced */
-        dma_fence_get(s_job->s_fence->parent);
         drm_sched_resubmit_jobs_ext(&ring->sched, 1);
 
+        if (!s_job->s_fence->parent) {
+            DRM_WARN("Failed to get a HW fence for job!");
+            continue;
+        }
+
         ret = dma_fence_wait_timeout(s_job->s_fence->parent, false, ring->sched.timeout);
         if (ret == 0) { /* timeout */
             DRM_ERROR("Found the real bad job! ring:%s, job_id:%llx\n",
                         ring->sched.name, s_job->id);
 
+            amdgpu_fence_driver_isr_toggle(adev, true);
+
+            /* Clear this failed job from fence array */
+            amdgpu_fence_driver_clear_job_fences(ring);
+
+            amdgpu_fence_driver_isr_toggle(adev, false);
+
+            /* Since the job won't signal and we go for
+             * another resubmit drop this parent pointer
+             */
+            dma_fence_put(s_job->s_fence->parent);
+            s_job->s_fence->parent = NULL;
+
             /* set guilty */
             drm_sched_increase_karma(s_job);
 retry:
@@ -5018,7 +5064,6 @@ retry:
 
         /* got the hw fence, signal finished fence */
         atomic_dec(ring->sched.score);
-        dma_fence_put(s_job->s_fence->parent);
         dma_fence_get(&s_job->s_fence->finished);
         dma_fence_signal(&s_job->s_fence->finished);
         dma_fence_put(&s_job->s_fence->finished);
@@ -5031,8 +5076,29 @@ retry:
     }
 }
 
+static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
+{
+    struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+
+#if defined(CONFIG_DEBUG_FS)
+    if (!amdgpu_sriov_vf(adev))
+        cancel_work(&adev->reset_work);
+#endif
+
+    if (adev->kfd.dev)
+        cancel_work(&adev->kfd.reset_work);
+
+    if (amdgpu_sriov_vf(adev))
+        cancel_work(&adev->virt.flr_work);
+
+    if (con && adev->ras_enabled)
+        cancel_work(&con->recovery_work);
+
+}
+
+
 /**
- * amdgpu_device_gpu_recover_imp - reset the asic and recover scheduler
+ * amdgpu_device_gpu_recover - reset the asic and recover scheduler
  *
  * @adev: amdgpu_device pointer
  * @job: which job trigger hang
@@ -5042,7 +5108,7 @@ retry:
  * Returns 0 for success or an error on failure.
  */
-int amdgpu_device_gpu_recover_imp(struct amdgpu_device *adev,
+int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
                   struct amdgpu_job *job)
 {
     struct list_head device_list, *device_list_handle = NULL;
@@ -5140,7 +5206,7 @@ int amdgpu_device_gpu_recover_imp(struct amdgpu_device *adev,
          */
         amdgpu_unregister_gpu_instance(tmp_adev);
 
-        drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, true);
+        drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, true);
 
         /* disable ras on ALL IPs */
         if (!need_emergency_restart &&
@@ -5170,8 +5236,8 @@ int amdgpu_device_gpu_recover_imp(struct amdgpu_device *adev,
      *
      * job->base holds a reference to parent fence
      */
-    if (job && job->base.s_fence->parent &&
-        dma_fence_is_signaled(job->base.s_fence->parent)) {
+    if (job && (job->hw_fence.ops != NULL) &&
+        dma_fence_is_signaled(&job->hw_fence)) {
         job_signaled = true;
         dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
         goto skip_hw_reset;
@@ -5186,6 +5252,12 @@ retry:	/* Rest of adevs pre asic reset from XGMI hive. */
                   r, adev_to_drm(tmp_adev)->unique);
             tmp_adev->asic_reset_res = r;
         }
+
+        /*
+         * Drop all pending non scheduler resets. Scheduler resets
+         * were already dropped during drm_sched_stop
+         */
+        amdgpu_device_stop_pending_resets(tmp_adev);
     }
 
     tmp_vram_lost_counter = atomic_read(&((adev)->vram_lost_counter));
@@ -5195,6 +5267,10 @@ retry:	/* Rest of adevs pre asic reset from XGMI hive. */
         r = amdgpu_device_reset_sriov(adev, job ? false : true);
         if (r)
             adev->asic_reset_res = r;
+
+        /* Aldebaran supports ras in SRIOV, so need resume ras during reset */
+        if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 2))
+            amdgpu_ras_resume(adev);
     } else {
         r = amdgpu_do_asic_reset(device_list_handle, &reset_context);
         if (r && r == -EAGAIN)
@@ -5280,38 +5356,9 @@ skip_sched_resume:
     if (r)
         dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
 
-    return r;
-}
-
-struct amdgpu_recover_work_struct {
-    struct work_struct base;
-    struct amdgpu_device *adev;
-    struct amdgpu_job *job;
-    int ret;
-};
-
-static void amdgpu_device_queue_gpu_recover_work(struct work_struct *work)
-{
-    struct amdgpu_recover_work_struct *recover_work = container_of(work, struct amdgpu_recover_work_struct, base);
-
-    recover_work->ret = amdgpu_device_gpu_recover_imp(recover_work->adev, recover_work->job);
-}
-/*
- * Serialize gpu recover into reset domain single threaded wq
- */
-int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
-                struct amdgpu_job *job)
-{
-    struct amdgpu_recover_work_struct work = {.adev = adev, .job = job};
-
-    INIT_WORK(&work.base, amdgpu_device_queue_gpu_recover_work);
-    if (!amdgpu_reset_domain_schedule(adev->reset_domain, &work.base))
-        return -EAGAIN;
-
-    flush_work(&work.base);
-
-    return work.ret;
+    atomic_set(&adev->reset_domain->reset_res, r);
+    return r;
 }
 
 /**
@@ -5462,6 +5509,36 @@ static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
     }
 }
 
+/**
+ * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR
+ *
+ * @adev: amdgpu_device pointer
+ * @peer_adev: amdgpu_device pointer for peer device trying to access @adev
+ *
+ * Return true if @peer_adev can access (DMA) @adev through the PCIe
+ * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of
+ * @peer_adev.
+ */
+bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev,
+                      struct amdgpu_device *peer_adev)
+{
+#ifdef CONFIG_HSA_AMD_P2P
+    uint64_t address_mask = peer_adev->dev->dma_mask ?
+        ~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1);
+    resource_size_t aper_limit =
+        adev->gmc.aper_base + adev->gmc.aper_size - 1;
+    bool p2p_access = !(pci_p2pdma_distance_many(adev->pdev,
+                    &peer_adev->dev, 1, true) < 0);
+
+    return pcie_p2p && p2p_access && (adev->gmc.visible_vram_size &&
+        adev->gmc.real_vram_size == adev->gmc.visible_vram_size &&
+        !(adev->gmc.aper_base & address_mask ||
+          aper_limit & address_mask));
+#else
+    return false;
+#endif
+}
+
 int amdgpu_device_baco_enter(struct drm_device *dev)
 {
     struct amdgpu_device *adev = drm_to_adev(dev);