diff options
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c')
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 145 |
1 files changed, 118 insertions, 27 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 90d22a376632..8219e40240ce 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -30,6 +30,7 @@ #include <linux/module.h> #include <linux/console.h> #include <linux/slab.h> +#include <linux/iommu.h> #include <drm/drm_atomic_helper.h> #include <drm/drm_probe_helper.h> @@ -331,7 +332,7 @@ void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos, } /** - * amdgpu_device_vram_access - access vram by vram aperature + * amdgpu_device_aper_access - access vram by vram aperature * * @adev: amdgpu_device pointer * @pos: offset of the buffer in vram @@ -550,11 +551,11 @@ void amdgpu_device_wreg(struct amdgpu_device *adev, trace_amdgpu_device_wreg(adev->pdev->device, reg, v); } -/* +/** * amdgpu_mm_wreg_mmio_rlc - write register either with mmio or with RLC path if in range * * this function is invoked only the debugfs register access - * */ + */ void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, uint32_t reg, uint32_t v) { @@ -1100,7 +1101,7 @@ static void amdgpu_device_wb_fini(struct amdgpu_device *adev) } /** - * amdgpu_device_wb_init- Init Writeback driver info and allocate memory + * amdgpu_device_wb_init - Init Writeback driver info and allocate memory * * @adev: amdgpu_device pointer * @@ -2657,6 +2658,36 @@ static int amdgpu_device_ip_late_init(struct amdgpu_device *adev) return 0; } +/** + * amdgpu_device_smu_fini_early - smu hw_fini wrapper + * + * @adev: amdgpu_device pointer + * + * For ASICs need to disable SMC first + */ +static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev) +{ + int i, r; + + if (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0)) + return; + + for (i = 0; i < adev->num_ip_blocks; i++) { + if (!adev->ip_blocks[i].status.hw) + continue; + if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { + r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); + /* XXX handle errors */ + if (r) { + DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", + adev->ip_blocks[i].version->funcs->name, r); + } + adev->ip_blocks[i].status.hw = false; + break; + } + } +} + static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev) { int i, r; @@ -2677,21 +2708,8 @@ static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev) amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); - /* need to disable SMC first */ - for (i = 0; i < adev->num_ip_blocks; i++) { - if (!adev->ip_blocks[i].status.hw) - continue; - if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { - r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev); - /* XXX handle errors */ - if (r) { - DRM_DEBUG("hw_fini of IP block <%s> failed %d\n", - adev->ip_blocks[i].version->funcs->name, r); - } - adev->ip_blocks[i].status.hw = false; - break; - } - } + /* Workaroud for ASICs need to disable SMC first */ + amdgpu_device_smu_fini_early(adev); for (i = adev->num_ip_blocks - 1; i >= 0; i--) { if (!adev->ip_blocks[i].status.hw) @@ -2733,8 +2751,6 @@ static int amdgpu_device_ip_fini(struct amdgpu_device *adev) if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done) amdgpu_virt_release_ras_err_handler_data(adev); - amdgpu_ras_pre_fini(adev); - if (adev->gmc.xgmi.num_physical_nodes > 1) amdgpu_xgmi_remove_device(adev); @@ -3367,6 +3383,22 @@ static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) return ret; } +/** + * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU + * + * @adev: amdgpu_device pointer + * + * RAM direct mapped to GPU if IOMMU is not enabled or is pass through mode + */ +static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev) +{ + struct iommu_domain *domain; + + domain = iommu_get_domain_for_dev(adev->dev); + if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY) + adev->ram_is_direct_mapped = true; +} + static const struct attribute *amdgpu_dev_attributes[] = { &dev_attr_product_name.attr, &dev_attr_product_number.attr, @@ -3770,6 +3802,8 @@ fence_driver_init: queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work, msecs_to_jiffies(AMDGPU_RESUME_MS)); + amdgpu_device_check_iommu_direct_map(adev); + return 0; release_ras_con: @@ -3803,7 +3837,7 @@ static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev) } /** - * amdgpu_device_fini - tear down the driver + * amdgpu_device_fini_hw - tear down the driver * * @adev: amdgpu_device pointer * @@ -3844,6 +3878,9 @@ void amdgpu_device_fini_hw(struct amdgpu_device *adev) amdgpu_ucode_sysfs_fini(adev); sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes); + /* disable ras feature must before hw fini */ + amdgpu_ras_pre_fini(adev); + amdgpu_device_ip_fini_early(adev); amdgpu_irq_fini_hw(adev); @@ -4284,6 +4321,9 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, bool from_hypervisor) { int r; + struct amdgpu_hive_info *hive = NULL; + + amdgpu_amdkfd_pre_reset(adev); amdgpu_amdkfd_pre_reset(adev); @@ -4312,8 +4352,19 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, if (r) goto error; - amdgpu_irq_gpu_reset_resume_helper(adev); - r = amdgpu_ib_ring_tests(adev); + hive = amdgpu_get_xgmi_hive(adev); + /* Update PSP FW topology after reset */ + if (hive && adev->gmc.xgmi.num_physical_nodes > 1) + r = amdgpu_xgmi_update_topology(hive, adev); + + if (hive) + amdgpu_put_xgmi_hive(hive); + + if (!r) { + amdgpu_irq_gpu_reset_resume_helper(adev); + r = amdgpu_ib_ring_tests(adev); + amdgpu_amdkfd_post_reset(adev); + } error: if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) { @@ -4745,7 +4796,7 @@ static int amdgpu_device_lock_hive_adev(struct amdgpu_device *adev, struct amdgp { struct amdgpu_device *tmp_adev = NULL; - if (adev->gmc.xgmi.num_physical_nodes > 1) { + if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) { if (!hive) { dev_err(adev->dev, "Hive is NULL while device has multiple xgmi nodes"); return -ENODEV; @@ -4957,7 +5008,8 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, * We always reset all schedulers for device and all devices for XGMI * hive so that should take care of them too. */ - hive = amdgpu_get_xgmi_hive(adev); + if (!amdgpu_sriov_vf(adev)) + hive = amdgpu_get_xgmi_hive(adev); if (hive) { if (atomic_cmpxchg(&hive->in_reset, 0, 1) != 0) { DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress", @@ -4998,7 +5050,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, * to put adev in the 1st position. */ INIT_LIST_HEAD(&device_list); - if (adev->gmc.xgmi.num_physical_nodes > 1) { + if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) { list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) list_add_tail(&tmp_adev->reset_list, &device_list); if (!list_is_first(&adev->reset_list, &device_list)) @@ -5632,3 +5684,42 @@ void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev, amdgpu_asic_invalidate_hdp(adev, ring); } + +/** + * amdgpu_device_halt() - bring hardware to some kind of halt state + * + * @adev: amdgpu_device pointer + * + * Bring hardware to some kind of halt state so that no one can touch it + * any more. It will help to maintain error context when error occurred. + * Compare to a simple hang, the system will keep stable at least for SSH + * access. Then it should be trivial to inspect the hardware state and + * see what's going on. Implemented as following: + * + * 1. drm_dev_unplug() makes device inaccessible to user space(IOCTLs, etc), + * clears all CPU mappings to device, disallows remappings through page faults + * 2. amdgpu_irq_disable_all() disables all interrupts + * 3. amdgpu_fence_driver_hw_fini() signals all HW fences + * 4. set adev->no_hw_access to avoid potential crashes after setp 5 + * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings + * 6. pci_disable_device() and pci_wait_for_pending_transaction() + * flush any in flight DMA operations + */ +void amdgpu_device_halt(struct amdgpu_device *adev) +{ + struct pci_dev *pdev = adev->pdev; + struct drm_device *ddev = adev_to_drm(adev); + + drm_dev_unplug(ddev); + + amdgpu_irq_disable_all(adev); + + amdgpu_fence_driver_hw_fini(adev); + + adev->no_hw_access = true; + + amdgpu_device_unmap_mmio(adev); + + pci_disable_device(pdev); + pci_wait_for_pending_transaction(pdev); +} |