From 81d0bcf9900932633d270d5bc4a54ff599c6ebdb Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Wed, 7 Dec 2022 11:08:53 -0500 Subject: drm/amdgpu: make display pinning more flexible (v2) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Only apply the static threshold for Stoney and Carrizo. This hardware has certain requirements that don't allow mixing of GTT and VRAM. Newer asics do not have these requirements so we should be able to be more flexible with where buffers end up. Bug: https://gitlab.freedesktop.org/drm/amd/-/issues/2270 Bug: https://gitlab.freedesktop.org/drm/amd/-/issues/2291 Bug: https://gitlab.freedesktop.org/drm/amd/-/issues/2255 Acked-by: Luben Tuikov Reviewed-by: Christian König Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers/gpu/drm/amd/amdgpu') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c index 919bbea2e3ac..2df55cc7e07f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c @@ -1506,7 +1506,8 @@ u64 amdgpu_bo_gpu_offset_no_check(struct amdgpu_bo *bo) uint32_t amdgpu_bo_get_preferred_domain(struct amdgpu_device *adev, uint32_t domain) { - if (domain == (AMDGPU_GEM_DOMAIN_VRAM | AMDGPU_GEM_DOMAIN_GTT)) { + if ((domain == (AMDGPU_GEM_DOMAIN_VRAM | AMDGPU_GEM_DOMAIN_GTT)) && + ((adev->asic_type == CHIP_CARRIZO) || (adev->asic_type == CHIP_STONEY))) { domain = AMDGPU_GEM_DOMAIN_VRAM; if (adev->gmc.real_vram_size <= AMDGPU_SG_THRESHOLD) domain = AMDGPU_GEM_DOMAIN_GTT; -- cgit v1.2.3 From 1d4624cd72b912b2680c08d0be48338a1629a858 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Mon, 21 Nov 2022 15:52:19 -0500 Subject: drm/amdgpu: handle polaris10/11 overlap asics (v2) Some special polaris 10 chips overlap with the polaris11 DID range. Handle this properly in the driver. v2: use local flags for other function calls. Acked-by: Luben Tuikov Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index 7383272c6a3a..b4f2d61ea0d5 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -2039,6 +2039,15 @@ static int amdgpu_pci_probe(struct pci_dev *pdev, "See modparam exp_hw_support\n"); return -ENODEV; } + /* differentiate between P10 and P11 asics with the same DID */ + if (pdev->device == 0x67FF && + (pdev->revision == 0xE3 || + pdev->revision == 0xE7 || + pdev->revision == 0xF3 || + pdev->revision == 0xF7)) { + flags &= ~AMD_ASIC_MASK; + flags |= CHIP_POLARIS10; + } /* Due to hardware bugs, S/G Display on raven requires a 1:1 IOMMU mapping, * however, SME requires an indirect IOMMU mapping because the encryption @@ -2108,12 +2117,12 @@ static int amdgpu_pci_probe(struct pci_dev *pdev, pci_set_drvdata(pdev, ddev); - ret = amdgpu_driver_load_kms(adev, ent->driver_data); + ret = amdgpu_driver_load_kms(adev, flags); if (ret) goto err_pci; retry_init: - ret = drm_dev_register(ddev, ent->driver_data); + ret = drm_dev_register(ddev, flags); if (ret == -EAGAIN && ++retry <= 3) { DRM_INFO("retry init %d\n", retry); /* Don't request EX mode too frequently which is attacking */ -- cgit v1.2.3 From 47ea20762bb7875a62e10433a3cd5d34e9133f47 Mon Sep 17 00:00:00 2001 From: Shikang Fan Date: Thu, 8 Dec 2022 19:53:14 +0800 Subject: drm/amdgpu: Add an extra evict_resource call during device_suspend. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - evict_resource is taking too long causing sriov full access mode timeout. So, add an extra evict_resource in the beginning as an early evict. Signed-off-by: Shikang Fan Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index cfa411c12072..64660a41d53c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -4112,6 +4112,11 @@ int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) adev->in_suspend = true; + /* Evict the majority of BOs before grabbing the full access */ + r = amdgpu_device_evict_resources(adev); + if (r) + return r; + if (amdgpu_sriov_vf(adev)) { amdgpu_virt_fini_data_exchange(adev); r = amdgpu_virt_request_full_gpu(adev, false); -- cgit v1.2.3 From 9c3db58bf8f7d0007049f686ce8c419eed4325d1 Mon Sep 17 00:00:00 2001 From: Christian König Date: Wed, 7 Dec 2022 08:47:30 +0100 Subject: drm/amdgpu: fixx NULL pointer deref in gmc_v9_0_get_vm_pte MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We not only need to make sure that we have a BO, but also that the BO has some backing store. Fixes: d1a372af1c3d ("drm/amdgpu: Set MTYPE in PTE based on BO flags") Signed-off-by: Christian König Reviewed-by: Felix Kuehling Reviewed-by: Alex Deucher Reviewed-by: Luben Tuikov Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'drivers/gpu/drm/amd/amdgpu') diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c index 50386eb2eec8..08d6cf79fb15 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c @@ -1185,6 +1185,8 @@ static void gmc_v9_0_get_vm_pte(struct amdgpu_device *adev, struct amdgpu_bo_va_mapping *mapping, uint64_t *flags) { + struct amdgpu_bo *bo = mapping->bo_va->base.bo; + *flags &= ~AMDGPU_PTE_EXECUTABLE; *flags |= mapping->flags & AMDGPU_PTE_EXECUTABLE; @@ -1196,7 +1198,7 @@ static void gmc_v9_0_get_vm_pte(struct amdgpu_device *adev, *flags &= ~AMDGPU_PTE_VALID; } - if (mapping->bo_va->base.bo) + if (bo && bo->tbo.resource) gmc_v9_0_get_coherence_flags(adev, mapping->bo_va->base.bo, mapping, flags); } -- cgit v1.2.3 From 4d2ccd96ac25846749fc58691f5142a966e65b3a Mon Sep 17 00:00:00 2001 From: Christian König Date: Wed, 16 Nov 2022 15:45:36 +0100 Subject: drm/amdgpu: WARN when freeing kernel memory during suspend MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When buffers are freed during suspend there is no guarantee that they can be re-allocated during resume. The PSP subsystem seems to be quite buggy regarding this, so add a WARN_ON() to point out those bugs. Signed-off-by: Christian König Reviewed-by: Alex Deucher Tested-by: Guilherme G. Piccoli Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c index 2df55cc7e07f..3393c1a6a0ff 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c @@ -422,6 +422,8 @@ void amdgpu_bo_free_kernel(struct amdgpu_bo **bo, u64 *gpu_addr, if (*bo == NULL) return; + WARN_ON(amdgpu_ttm_adev((*bo)->tbo.bdev)->in_suspend); + if (likely(amdgpu_bo_reserve(*bo, true) == 0)) { if (cpu_addr) amdgpu_bo_kunmap(*bo); -- cgit v1.2.3 From f95f51a4c3357eabf74fe14ab7daa5b5c0422b27 Mon Sep 17 00:00:00 2001 From: Felix Kuehling Date: Wed, 21 Apr 2021 21:09:54 -0400 Subject: drm/amdgpu: Add notifier lock for KFD userptrs Add a per-process MMU notifier lock for processing notifiers from userptrs. Use that lock to properly synchronize page table updates with MMU notifiers. Signed-off-by: Felix Kuehling Reviewed-by: Xiaogang Chen Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 13 +- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 212 +++++++++++++++-------- drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c | 12 +- drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.h | 3 +- drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 17 +- drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h | 6 + 6 files changed, 172 insertions(+), 91 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h index f50e3ba4d7a5..589939631ed4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include "amdgpu_sync.h" @@ -65,6 +66,7 @@ struct kgd_mem { struct mutex lock; struct amdgpu_bo *bo; struct dma_buf *dmabuf; + struct hmm_range *range; struct list_head attachments; /* protected by amdkfd_process_info.lock */ struct ttm_validate_buffer validate_list; @@ -75,7 +77,7 @@ struct kgd_mem { uint32_t alloc_flags; - atomic_t invalid; + uint32_t invalid; struct amdkfd_process_info *process_info; struct amdgpu_sync sync; @@ -131,7 +133,8 @@ struct amdkfd_process_info { struct amdgpu_amdkfd_fence *eviction_fence; /* MMU-notifier related fields */ - atomic_t evicted_bos; + struct mutex notifier_lock; + uint32_t evicted_bos; struct delayed_work restore_userptr_work; struct pid *pid; bool block_mmu_notifications; @@ -180,7 +183,8 @@ int kfd_debugfs_kfd_mem_limits(struct seq_file *m, void *data); bool amdkfd_fence_check_mm(struct dma_fence *f, struct mm_struct *mm); struct amdgpu_amdkfd_fence *to_amdgpu_amdkfd_fence(struct dma_fence *f); int amdgpu_amdkfd_remove_fence_on_pt_pd_bos(struct amdgpu_bo *bo); -int amdgpu_amdkfd_evict_userptr(struct kgd_mem *mem, struct mm_struct *mm); +int amdgpu_amdkfd_evict_userptr(struct mmu_interval_notifier *mni, + unsigned long cur_seq, struct kgd_mem *mem); #else static inline bool amdkfd_fence_check_mm(struct dma_fence *f, struct mm_struct *mm) @@ -201,7 +205,8 @@ int amdgpu_amdkfd_remove_fence_on_pt_pd_bos(struct amdgpu_bo *bo) } static inline -int amdgpu_amdkfd_evict_userptr(struct kgd_mem *mem, struct mm_struct *mm) +int amdgpu_amdkfd_evict_userptr(struct mmu_interval_notifier *mni, + unsigned long cur_seq, struct kgd_mem *mem) { return 0; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c index 8782916e64a0..0a854bb8b47e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c @@ -964,7 +964,9 @@ static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr, * later stage when it is scheduled by another ioctl called by * CRIU master process for the target pid for restore. */ - atomic_inc(&mem->invalid); + mutex_lock(&process_info->notifier_lock); + mem->invalid++; + mutex_unlock(&process_info->notifier_lock); mutex_unlock(&process_info->lock); return 0; } @@ -1301,6 +1303,7 @@ static int init_kfd_vm(struct amdgpu_vm *vm, void **process_info, return -ENOMEM; mutex_init(&info->lock); + mutex_init(&info->notifier_lock); INIT_LIST_HEAD(&info->vm_list_head); INIT_LIST_HEAD(&info->kfd_bo_list); INIT_LIST_HEAD(&info->userptr_valid_list); @@ -1317,7 +1320,6 @@ static int init_kfd_vm(struct amdgpu_vm *vm, void **process_info, } info->pid = get_task_pid(current->group_leader, PIDTYPE_PID); - atomic_set(&info->evicted_bos, 0); INIT_DELAYED_WORK(&info->restore_userptr_work, amdgpu_amdkfd_restore_userptr_worker); @@ -1372,6 +1374,7 @@ reserve_pd_fail: put_pid(info->pid); create_evict_fence_fail: mutex_destroy(&info->lock); + mutex_destroy(&info->notifier_lock); kfree(info); } return ret; @@ -1496,6 +1499,7 @@ void amdgpu_amdkfd_gpuvm_destroy_cb(struct amdgpu_device *adev, cancel_delayed_work_sync(&process_info->restore_userptr_work); put_pid(process_info->pid); mutex_destroy(&process_info->lock); + mutex_destroy(&process_info->notifier_lock); kfree(process_info); } } @@ -1548,7 +1552,9 @@ int amdgpu_amdkfd_criu_resume(void *p) mutex_lock(&pinfo->lock); pr_debug("scheduling work\n"); - atomic_inc(&pinfo->evicted_bos); + mutex_lock(&pinfo->notifier_lock); + pinfo->evicted_bos++; + mutex_unlock(&pinfo->notifier_lock); if (!READ_ONCE(pinfo->block_mmu_notifications)) { ret = -EINVAL; goto out_unlock; @@ -1773,8 +1779,13 @@ int amdgpu_amdkfd_gpuvm_free_memory_of_gpu( list_del(&bo_list_entry->head); mutex_unlock(&process_info->lock); - /* No more MMU notifiers */ - amdgpu_hmm_unregister(mem->bo); + /* Cleanup user pages and MMU notifiers */ + if (amdgpu_ttm_tt_get_usermm(mem->bo->tbo.ttm)) { + amdgpu_hmm_unregister(mem->bo); + mutex_lock(&process_info->notifier_lock); + amdgpu_ttm_tt_discard_user_pages(mem->bo->tbo.ttm, mem->range); + mutex_unlock(&process_info->notifier_lock); + } ret = reserve_bo_and_cond_vms(mem, NULL, BO_VM_ALL, &ctx); if (unlikely(ret)) @@ -1864,6 +1875,16 @@ int amdgpu_amdkfd_gpuvm_map_memory_to_gpu( */ mutex_lock(&mem->process_info->lock); + /* Lock notifier lock. If we find an invalid userptr BO, we can be + * sure that the MMU notifier is no longer running + * concurrently and the queues are actually stopped + */ + if (amdgpu_ttm_tt_get_usermm(bo->tbo.ttm)) { + mutex_lock(&mem->process_info->notifier_lock); + is_invalid_userptr = !!mem->invalid; + mutex_unlock(&mem->process_info->notifier_lock); + } + mutex_lock(&mem->lock); domain = mem->domain; @@ -2241,34 +2262,38 @@ int amdgpu_amdkfd_gpuvm_import_dmabuf(struct amdgpu_device *adev, * * Runs in MMU notifier, may be in RECLAIM_FS context. This means it * cannot do any memory allocations, and cannot take any locks that - * are held elsewhere while allocating memory. Therefore this is as - * simple as possible, using atomic counters. + * are held elsewhere while allocating memory. * * It doesn't do anything to the BO itself. The real work happens in * restore, where we get updated page addresses. This function only * ensures that GPU access to the BO is stopped. */ -int amdgpu_amdkfd_evict_userptr(struct kgd_mem *mem, - struct mm_struct *mm) +int amdgpu_amdkfd_evict_userptr(struct mmu_interval_notifier *mni, + unsigned long cur_seq, struct kgd_mem *mem) { struct amdkfd_process_info *process_info = mem->process_info; - int evicted_bos; int r = 0; - /* Do not process MMU notifications until stage-4 IOCTL is received */ + /* Do not process MMU notifications during CRIU restore until + * KFD_CRIU_OP_RESUME IOCTL is received + */ if (READ_ONCE(process_info->block_mmu_notifications)) return 0; - atomic_inc(&mem->invalid); - evicted_bos = atomic_inc_return(&process_info->evicted_bos); - if (evicted_bos == 1) { + mutex_lock(&process_info->notifier_lock); + mmu_interval_set_seq(mni, cur_seq); + + mem->invalid++; + if (++process_info->evicted_bos == 1) { /* First eviction, stop the queues */ - r = kgd2kfd_quiesce_mm(mm, KFD_QUEUE_EVICTION_TRIGGER_USERPTR); + r = kgd2kfd_quiesce_mm(mni->mm, + KFD_QUEUE_EVICTION_TRIGGER_USERPTR); if (r) pr_err("Failed to quiesce KFD\n"); schedule_delayed_work(&process_info->restore_userptr_work, msecs_to_jiffies(AMDGPU_USERPTR_RESTORE_DELAY_MS)); } + mutex_unlock(&process_info->notifier_lock); return r; } @@ -2285,54 +2310,58 @@ static int update_invalid_user_pages(struct amdkfd_process_info *process_info, struct kgd_mem *mem, *tmp_mem; struct amdgpu_bo *bo; struct ttm_operation_ctx ctx = { false, false }; - int invalid, ret; + uint32_t invalid; + int ret = 0; - /* Move all invalidated BOs to the userptr_inval_list and - * release their user pages by migration to the CPU domain - */ + mutex_lock(&process_info->notifier_lock); + + /* Move all invalidated BOs to the userptr_inval_list */ list_for_each_entry_safe(mem, tmp_mem, &process_info->userptr_valid_list, - validate_list.head) { - if (!atomic_read(&mem->invalid)) - continue; /* BO is still valid */ - - bo = mem->bo; - - if (amdgpu_bo_reserve(bo, true)) - return -EAGAIN; - amdgpu_bo_placement_from_domain(bo, AMDGPU_GEM_DOMAIN_CPU); - ret = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx); - amdgpu_bo_unreserve(bo); - if (ret) { - pr_err("%s: Failed to invalidate userptr BO\n", - __func__); - return -EAGAIN; - } - - list_move_tail(&mem->validate_list.head, - &process_info->userptr_inval_list); - } - - if (list_empty(&process_info->userptr_inval_list)) - return 0; /* All evicted userptr BOs were freed */ + validate_list.head) + if (mem->invalid) + list_move_tail(&mem->validate_list.head, + &process_info->userptr_inval_list); /* Go through userptr_inval_list and update any invalid user_pages */ list_for_each_entry(mem, &process_info->userptr_inval_list, validate_list.head) { - struct hmm_range *range; - - invalid = atomic_read(&mem->invalid); + invalid = mem->invalid; if (!invalid) /* BO hasn't been invalidated since the last - * revalidation attempt. Keep its BO list. + * revalidation attempt. Keep its page list. */ continue; bo = mem->bo; + amdgpu_ttm_tt_discard_user_pages(bo->tbo.ttm, mem->range); + mem->range = NULL; + + /* BO reservations and getting user pages (hmm_range_fault) + * must happen outside the notifier lock + */ + mutex_unlock(&process_info->notifier_lock); + + /* Move the BO to system (CPU) domain if necessary to unmap + * and free the SG table + */ + if (bo->tbo.resource->mem_type != TTM_PL_SYSTEM) { + if (amdgpu_bo_reserve(bo, true)) + return -EAGAIN; + amdgpu_bo_placement_from_domain(bo, AMDGPU_GEM_DOMAIN_CPU); + ret = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx); + amdgpu_bo_unreserve(bo); + if (ret) { + pr_err("%s: Failed to invalidate userptr BO\n", + __func__); + return -EAGAIN; + } + } + /* Get updated user pages */ ret = amdgpu_ttm_tt_get_user_pages(bo, bo->tbo.ttm->pages, - &range); + &mem->range); if (ret) { pr_debug("Failed %d to get user pages\n", ret); @@ -2345,30 +2374,32 @@ static int update_invalid_user_pages(struct amdkfd_process_info *process_info, */ if (ret != -EFAULT) return ret; - } else { - /* - * FIXME: Cannot ignore the return code, must hold - * notifier_lock - */ - amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm, range); + ret = 0; } + mutex_lock(&process_info->notifier_lock); + /* Mark the BO as valid unless it was invalidated * again concurrently. */ - if (atomic_cmpxchg(&mem->invalid, invalid, 0) != invalid) - return -EAGAIN; + if (mem->invalid != invalid) { + ret = -EAGAIN; + goto unlock_out; + } + mem->invalid = 0; } - return 0; +unlock_out: + mutex_unlock(&process_info->notifier_lock); + + return ret; } /* Validate invalid userptr BOs * - * Validates BOs on the userptr_inval_list, and moves them back to the - * userptr_valid_list. Also updates GPUVM page tables with new page - * addresses and waits for the page table updates to complete. + * Validates BOs on the userptr_inval_list. Also updates GPUVM page tables + * with new page addresses and waits for the page table updates to complete. */ static int validate_invalid_user_pages(struct amdkfd_process_info *process_info) { @@ -2439,9 +2470,6 @@ static int validate_invalid_user_pages(struct amdkfd_process_info *process_info) } } - list_move_tail(&mem->validate_list.head, - &process_info->userptr_valid_list); - /* Update mapping. If the BO was not validated * (because we couldn't get user pages), this will * clear the page table entries, which will result in @@ -2457,7 +2485,9 @@ static int validate_invalid_user_pages(struct amdkfd_process_info *process_info) if (ret) { pr_err("%s: update PTE failed\n", __func__); /* make sure this gets validated again */ - atomic_inc(&mem->invalid); + mutex_lock(&process_info->notifier_lock); + mem->invalid++; + mutex_unlock(&process_info->notifier_lock); goto unreserve_out; } } @@ -2477,6 +2507,36 @@ out_no_mem: return ret; } +/* Confirm that all user pages are valid while holding the notifier lock + * + * Moves valid BOs from the userptr_inval_list back to userptr_val_list. + */ +static int confirm_valid_user_pages_locked(struct amdkfd_process_info *process_info) +{ + struct kgd_mem *mem, *tmp_mem; + int ret = 0; + + list_for_each_entry_safe(mem, tmp_mem, + &process_info->userptr_inval_list, + validate_list.head) { + bool valid = amdgpu_ttm_tt_get_user_pages_done( + mem->bo->tbo.ttm, mem->range); + + mem->range = NULL; + if (!valid) { + WARN(!mem->invalid, "Invalid BO not marked invalid"); + ret = -EAGAIN; + continue; + } + WARN(mem->invalid, "Valid BO is marked invalid"); + + list_move_tail(&mem->validate_list.head, + &process_info->userptr_valid_list); + } + + return ret; +} + /* Worker callback to restore evicted userptr BOs * * Tries to update and validate all userptr BOs. If successful and no @@ -2491,9 +2551,11 @@ static void amdgpu_amdkfd_restore_userptr_worker(struct work_struct *work) restore_userptr_work); struct task_struct *usertask; struct mm_struct *mm; - int evicted_bos; + uint32_t evicted_bos; - evicted_bos = atomic_read(&process_info->evicted_bos); + mutex_lock(&process_info->notifier_lock); + evicted_bos = process_info->evicted_bos; + mutex_unlock(&process_info->notifier_lock); if (!evicted_bos) return; @@ -2516,9 +2578,6 @@ static void amdgpu_amdkfd_restore_userptr_worker(struct work_struct *work) * and we can just restart the queues. */ if (!list_empty(&process_info->userptr_inval_list)) { - if (atomic_read(&process_info->evicted_bos) != evicted_bos) - goto unlock_out; /* Concurrent eviction, try again */ - if (validate_invalid_user_pages(process_info)) goto unlock_out; } @@ -2527,10 +2586,17 @@ static void amdgpu_amdkfd_restore_userptr_worker(struct work_struct *work) * be a first eviction that calls quiesce_mm. The eviction * reference counting inside KFD will handle this case. */ - if (atomic_cmpxchg(&process_info->evicted_bos, evicted_bos, 0) != - evicted_bos) - goto unlock_out; - evicted_bos = 0; + mutex_lock(&process_info->notifier_lock); + if (process_info->evicted_bos != evicted_bos) + goto unlock_notifier_out; + + if (confirm_valid_user_pages_locked(process_info)) { + WARN(1, "User pages unexpectedly invalid"); + goto unlock_notifier_out; + } + + process_info->evicted_bos = evicted_bos = 0; + if (kgd2kfd_resume_mm(mm)) { pr_err("%s: Failed to resume KFD\n", __func__); /* No recovery from this failure. Probably the CP is @@ -2538,6 +2604,8 @@ static void amdgpu_amdkfd_restore_userptr_worker(struct work_struct *work) */ } +unlock_notifier_out: + mutex_unlock(&process_info->notifier_lock); unlock_out: mutex_unlock(&process_info->lock); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c index 65715cb395d8..2dadcfe43d03 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c @@ -105,17 +105,11 @@ static bool amdgpu_hmm_invalidate_hsa(struct mmu_interval_notifier *mni, unsigned long cur_seq) { struct amdgpu_bo *bo = container_of(mni, struct amdgpu_bo, notifier); - struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev); if (!mmu_notifier_range_blockable(range)) return false; - mutex_lock(&adev->notifier_lock); - - mmu_interval_set_seq(mni, cur_seq); - - amdgpu_amdkfd_evict_userptr(bo->kfd_bo, bo->notifier.mm); - mutex_unlock(&adev->notifier_lock); + amdgpu_amdkfd_evict_userptr(mni, cur_seq, bo->kfd_bo); return true; } @@ -244,9 +238,9 @@ out_free_range: return r; } -int amdgpu_hmm_range_get_pages_done(struct hmm_range *hmm_range) +bool amdgpu_hmm_range_get_pages_done(struct hmm_range *hmm_range) { - int r; + bool r; r = mmu_interval_read_retry(hmm_range->notifier, hmm_range->notifier_seq); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.h index 13ed94d3b01b..e2edcd010ccc 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.h @@ -29,12 +29,13 @@ #include #include #include +#include int amdgpu_hmm_range_get_pages(struct mmu_interval_notifier *notifier, uint64_t start, uint64_t npages, bool readonly, void *owner, struct page **pages, struct hmm_range **phmm_range); -int amdgpu_hmm_range_get_pages_done(struct hmm_range *hmm_range); +bool amdgpu_hmm_range_get_pages_done(struct hmm_range *hmm_range); #if defined(CONFIG_HMM_MIRROR) int amdgpu_hmm_register(struct amdgpu_bo *bo, unsigned long addr); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c index b4236572eae1..f0e4c7309438 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c @@ -695,8 +695,19 @@ out_unlock: return r; } +/* amdgpu_ttm_tt_discard_user_pages - Discard range and pfn array allocations + */ +void amdgpu_ttm_tt_discard_user_pages(struct ttm_tt *ttm, + struct hmm_range *range) +{ + struct amdgpu_ttm_tt *gtt = (void *)ttm; + + if (gtt && gtt->userptr && range) + amdgpu_hmm_range_get_pages_done(range); +} + /* - * amdgpu_ttm_tt_userptr_range_done - stop HMM track the CPU page table change + * amdgpu_ttm_tt_get_user_pages_done - stop HMM track the CPU page table change * Check if the pages backing this ttm range have been invalidated * * Returns: true if pages are still valid @@ -714,10 +725,6 @@ bool amdgpu_ttm_tt_get_user_pages_done(struct ttm_tt *ttm, WARN_ONCE(!range->hmm_pfns, "No user pages to check\n"); - /* - * FIXME: Must always hold notifier_lock for this, and must - * not ignore the return code. - */ return !amdgpu_hmm_range_get_pages_done(range); } #endif diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h index b4d8ba2789f3..e2cd5894afc9 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h @@ -159,6 +159,8 @@ uint64_t amdgpu_ttm_domain_start(struct amdgpu_device *adev, uint32_t type); #if IS_ENABLED(CONFIG_DRM_AMDGPU_USERPTR) int amdgpu_ttm_tt_get_user_pages(struct amdgpu_bo *bo, struct page **pages, struct hmm_range **range); +void amdgpu_ttm_tt_discard_user_pages(struct ttm_tt *ttm, + struct hmm_range *range); bool amdgpu_ttm_tt_get_user_pages_done(struct ttm_tt *ttm, struct hmm_range *range); #else @@ -168,6 +170,10 @@ static inline int amdgpu_ttm_tt_get_user_pages(struct amdgpu_bo *bo, { return -EPERM; } +static inline void amdgpu_ttm_tt_discard_user_pages(struct ttm_tt *ttm, + struct hmm_range *range) +{ +} static inline bool amdgpu_ttm_tt_get_user_pages_done(struct ttm_tt *ttm, struct hmm_range *range) { -- cgit v1.2.3 From 56b0989e2939811c11ed9c449ff84cf85878ffe3 Mon Sep 17 00:00:00 2001 From: Christian König Date: Fri, 25 Nov 2022 16:04:25 +0100 Subject: drm/amdgpu: fix GDS/GWS/OA switch handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bas pointed out that this isn't working as expected and could cause crashes. Fix the handling by storing the marker that a switch is needed inside the job instead. Reported-by: Bas Nieuwenhuizen Signed-off-by: Christian König Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c | 42 ++++++++++++++++++++----- drivers/gpu/drm/amd/amdgpu/amdgpu_job.h | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 54 ++++++++++++--------------------- 3 files changed, 54 insertions(+), 43 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c index 2a9a2593dc18..01878145a586 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c @@ -165,6 +165,26 @@ bool amdgpu_vmid_had_gpu_reset(struct amdgpu_device *adev, atomic_read(&adev->gpu_reset_counter); } +/* Check if we need to switch to another set of resources */ +static bool amdgpu_vmid_gds_switch_needed(struct amdgpu_vmid *id, + struct amdgpu_job *job) +{ + return id->gds_base != job->gds_base || + id->gds_size != job->gds_size || + id->gws_base != job->gws_base || + id->gws_size != job->gws_size || + id->oa_base != job->oa_base || + id->oa_size != job->oa_size; +} + +/* Check if the id is compatible with the job */ +static bool amdgpu_vmid_compatible(struct amdgpu_vmid *id, + struct amdgpu_job *job) +{ + return id->pd_gpu_addr == job->vm_pd_addr && + !amdgpu_vmid_gds_switch_needed(id, job); +} + /** * amdgpu_vmid_grab_idle - grab idle VMID * @@ -265,7 +285,7 @@ static int amdgpu_vmid_grab_reserved(struct amdgpu_vm *vm, *id = vm->reserved_vmid[vmhub]; if ((*id)->owner != vm->immediate.fence_context || - (*id)->pd_gpu_addr != job->vm_pd_addr || + !amdgpu_vmid_compatible(*id, job) || (*id)->flushed_updates < updates || !(*id)->last_flush || ((*id)->last_flush->context != fence_context && @@ -294,7 +314,6 @@ static int amdgpu_vmid_grab_reserved(struct amdgpu_vm *vm, if (r) return r; - (*id)->flushed_updates = updates; job->vm_needs_flush = needs_flush; return 0; } @@ -333,7 +352,7 @@ static int amdgpu_vmid_grab_used(struct amdgpu_vm *vm, if ((*id)->owner != vm->immediate.fence_context) continue; - if ((*id)->pd_gpu_addr != job->vm_pd_addr) + if (!amdgpu_vmid_compatible(*id, job)) continue; if (!(*id)->last_flush || @@ -355,7 +374,6 @@ static int amdgpu_vmid_grab_used(struct amdgpu_vm *vm, if (r) return r; - (*id)->flushed_updates = updates; job->vm_needs_flush |= needs_flush; return 0; } @@ -408,22 +426,30 @@ int amdgpu_vmid_grab(struct amdgpu_vm *vm, struct amdgpu_ring *ring, if (r) goto error; - id->flushed_updates = amdgpu_vm_tlb_seq(vm); job->vm_needs_flush = true; } list_move_tail(&id->list, &id_mgr->ids_lru); } - id->pd_gpu_addr = job->vm_pd_addr; - id->owner = vm->immediate.fence_context; - + job->gds_switch_needed = amdgpu_vmid_gds_switch_needed(id, job); if (job->vm_needs_flush) { + id->flushed_updates = amdgpu_vm_tlb_seq(vm); dma_fence_put(id->last_flush); id->last_flush = NULL; } job->vmid = id - id_mgr->ids; job->pasid = vm->pasid; + + id->gds_base = job->gds_base; + id->gds_size = job->gds_size; + id->gws_base = job->gws_base; + id->gws_size = job->gws_size; + id->oa_base = job->oa_base; + id->oa_size = job->oa_size; + id->pd_gpu_addr = job->vm_pd_addr; + id->owner = vm->immediate.fence_context; + trace_amdgpu_vm_grab_id(vm, ring, job); error: diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h index a372802ea4e0..02e85b040baf 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h @@ -53,6 +53,7 @@ struct amdgpu_job { uint32_t preamble_status; uint32_t preemption_status; bool vm_needs_flush; + bool gds_switch_needed; uint64_t vm_pd_addr; unsigned vmid; unsigned pasid; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index c05cff979004..245c66ea10e7 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -484,25 +484,20 @@ bool amdgpu_vm_need_pipeline_sync(struct amdgpu_ring *ring, struct amdgpu_device *adev = ring->adev; unsigned vmhub = ring->funcs->vmhub; struct amdgpu_vmid_mgr *id_mgr = &adev->vm_manager.id_mgr[vmhub]; - struct amdgpu_vmid *id; - bool gds_switch_needed; - bool vm_flush_needed = job->vm_needs_flush || ring->has_compute_vm_bug; if (job->vmid == 0) return false; - id = &id_mgr->ids[job->vmid]; - gds_switch_needed = ring->funcs->emit_gds_switch && ( - id->gds_base != job->gds_base || - id->gds_size != job->gds_size || - id->gws_base != job->gws_base || - id->gws_size != job->gws_size || - id->oa_base != job->oa_base || - id->oa_size != job->oa_size); - - if (amdgpu_vmid_had_gpu_reset(adev, id)) + + if (job->vm_needs_flush || ring->has_compute_vm_bug) + return true; + + if (ring->funcs->emit_gds_switch && job->gds_switch_needed) return true; - return vm_flush_needed || gds_switch_needed; + if (amdgpu_vmid_had_gpu_reset(adev, &id_mgr->ids[job->vmid])) + return true; + + return false; } /** @@ -524,13 +519,8 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job, unsigned vmhub = ring->funcs->vmhub; struct amdgpu_vmid_mgr *id_mgr = &adev->vm_manager.id_mgr[vmhub]; struct amdgpu_vmid *id = &id_mgr->ids[job->vmid]; - bool gds_switch_needed = ring->funcs->emit_gds_switch && ( - id->gds_base != job->gds_base || - id->gds_size != job->gds_size || - id->gws_base != job->gws_base || - id->gws_size != job->gws_size || - id->oa_base != job->oa_base || - id->oa_size != job->oa_size); + bool gds_switch_needed = ring->funcs->emit_gds_switch && + job->gds_switch_needed; bool vm_flush_needed = job->vm_needs_flush; struct dma_fence *fence = NULL; bool pasid_mapping_needed = false; @@ -577,6 +567,14 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job, if (pasid_mapping_needed) amdgpu_gmc_emit_pasid_mapping(ring, job->vmid, job->pasid); + if (!ring->is_mes_queue && ring->funcs->emit_gds_switch && + gds_switch_needed) { + amdgpu_ring_emit_gds_switch(ring, job->vmid, job->gds_base, + job->gds_size, job->gws_base, + job->gws_size, job->oa_base, + job->oa_size); + } + if (vm_flush_needed || pasid_mapping_needed) { r = amdgpu_fence_emit(ring, &fence, NULL, 0); if (r) @@ -601,20 +599,6 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job, } dma_fence_put(fence); - if (!ring->is_mes_queue && ring->funcs->emit_gds_switch && - gds_switch_needed) { - id->gds_base = job->gds_base; - id->gds_size = job->gds_size; - id->gws_base = job->gws_base; - id->gws_size = job->gws_size; - id->oa_base = job->oa_base; - id->oa_size = job->oa_size; - amdgpu_ring_emit_gds_switch(ring, job->vmid, job->gds_base, - job->gds_size, job->gws_base, - job->gws_size, job->oa_base, - job->oa_size); - } - if (ring->funcs->patch_cond_exec) amdgpu_ring_patch_cond_exec(ring, patch_offset); -- cgit v1.2.3 From 5f3c40e9e2460c42f5bf6c51b1e393d7159241c3 Mon Sep 17 00:00:00 2001 From: Christian König Date: Fri, 25 Nov 2022 16:42:45 +0100 Subject: drm/amdgpu: cleanup SPM support a bit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This should probably not access job->vm and also emit the SPM switch under the conditional execute. Signed-off-by: Christian König Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_job.h | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 9 +++++---- 3 files changed, 7 insertions(+), 4 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c index 01878145a586..6949dfec75d5 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c @@ -315,6 +315,7 @@ static int amdgpu_vmid_grab_reserved(struct amdgpu_vm *vm, return r; job->vm_needs_flush = needs_flush; + job->spm_update_needed = true; return 0; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h index 02e85b040baf..52f2e313ea17 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h @@ -54,6 +54,7 @@ struct amdgpu_job { uint32_t preemption_status; bool vm_needs_flush; bool gds_switch_needed; + bool spm_update_needed; uint64_t vm_pd_addr; unsigned vmid; unsigned pasid; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index 245c66ea10e7..a05cce3f3170 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -519,22 +519,20 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job, unsigned vmhub = ring->funcs->vmhub; struct amdgpu_vmid_mgr *id_mgr = &adev->vm_manager.id_mgr[vmhub]; struct amdgpu_vmid *id = &id_mgr->ids[job->vmid]; + bool spm_update_needed = job->spm_update_needed; bool gds_switch_needed = ring->funcs->emit_gds_switch && job->gds_switch_needed; bool vm_flush_needed = job->vm_needs_flush; struct dma_fence *fence = NULL; bool pasid_mapping_needed = false; unsigned patch_offset = 0; - bool update_spm_vmid_needed = (job->vm && (job->vm->reserved_vmid[vmhub] != NULL)); int r; - if (update_spm_vmid_needed && adev->gfx.rlc.funcs->update_spm_vmid) - adev->gfx.rlc.funcs->update_spm_vmid(adev, job->vmid); - if (amdgpu_vmid_had_gpu_reset(adev, id)) { gds_switch_needed = true; vm_flush_needed = true; pasid_mapping_needed = true; + spm_update_needed = true; } mutex_lock(&id_mgr->lock); @@ -567,6 +565,9 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job, if (pasid_mapping_needed) amdgpu_gmc_emit_pasid_mapping(ring, job->vmid, job->pasid); + if (spm_update_needed && adev->gfx.rlc.funcs->update_spm_vmid) + adev->gfx.rlc.funcs->update_spm_vmid(adev, job->vmid); + if (!ring->is_mes_queue && ring->funcs->emit_gds_switch && gds_switch_needed) { amdgpu_ring_emit_gds_switch(ring, job->vmid, job->gds_base, -- cgit v1.2.3 From 053499f7b45dc56758240615569b349fe9e2fc8d Mon Sep 17 00:00:00 2001 From: Christian König Date: Fri, 25 Nov 2022 16:45:09 +0100 Subject: drm/amdgpu: stop waiting for the VM during unreserve MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is completely pointless since the VMID always stays allocated until the VM is idle. Signed-off-by: Christian König Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 16 ---------------- 1 file changed, 16 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index a05cce3f3170..dc379dc22c77 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -2368,7 +2368,6 @@ int amdgpu_vm_ioctl(struct drm_device *dev, void *data, struct drm_file *filp) union drm_amdgpu_vm *args = data; struct amdgpu_device *adev = drm_to_adev(dev); struct amdgpu_fpriv *fpriv = filp->driver_priv; - long timeout = msecs_to_jiffies(2000); int r; switch (args->in.op) { @@ -2380,21 +2379,6 @@ int amdgpu_vm_ioctl(struct drm_device *dev, void *data, struct drm_file *filp) return r; break; case AMDGPU_VM_OP_UNRESERVE_VMID: - if (amdgpu_sriov_runtime(adev)) - timeout = 8 * timeout; - - /* Wait vm idle to make sure the vmid set in SPM_VMID is - * not referenced anymore. - */ - r = amdgpu_bo_reserve(fpriv->vm.root.bo, true); - if (r) - return r; - - r = amdgpu_vm_wait_idle(&fpriv->vm, timeout); - if (r < 0) - return r; - - amdgpu_bo_unreserve(fpriv->vm.root.bo); amdgpu_vmid_free_reserved(adev, &fpriv->vm, AMDGPU_GFXHUB_0); break; default: -- cgit v1.2.3 From e44a0fe630c58b0a87d8281f5c1077a3479e5fce Mon Sep 17 00:00:00 2001 From: Christian König Date: Fri, 25 Nov 2022 17:04:25 +0100 Subject: drm/amdgpu: rework reserved VMID handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Instead of reserving a VMID for a single process allow that many processes use the reserved ID. This allows for proper isolation between the processes. Signed-off-by: Christian König Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c | 45 +++++++++++++++------------------ drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h | 3 ++- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 6 +---- 3 files changed, 24 insertions(+), 30 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c index 6949dfec75d5..fcb711a11a5b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c @@ -278,12 +278,13 @@ static int amdgpu_vmid_grab_reserved(struct amdgpu_vm *vm, { struct amdgpu_device *adev = ring->adev; unsigned vmhub = ring->funcs->vmhub; + struct amdgpu_vmid_mgr *id_mgr = &adev->vm_manager.id_mgr[vmhub]; uint64_t fence_context = adev->fence_context + ring->idx; bool needs_flush = vm->use_cpu_for_update; uint64_t updates = amdgpu_vm_tlb_seq(vm); int r; - *id = vm->reserved_vmid[vmhub]; + *id = id_mgr->reserved; if ((*id)->owner != vm->immediate.fence_context || !amdgpu_vmid_compatible(*id, job) || (*id)->flushed_updates < updates || @@ -462,31 +463,27 @@ int amdgpu_vmid_alloc_reserved(struct amdgpu_device *adev, struct amdgpu_vm *vm, unsigned vmhub) { - struct amdgpu_vmid_mgr *id_mgr; - struct amdgpu_vmid *idle; - int r = 0; + struct amdgpu_vmid_mgr *id_mgr = &adev->vm_manager.id_mgr[vmhub]; - id_mgr = &adev->vm_manager.id_mgr[vmhub]; mutex_lock(&id_mgr->lock); if (vm->reserved_vmid[vmhub]) goto unlock; - if (atomic_inc_return(&id_mgr->reserved_vmid_num) > - AMDGPU_VM_MAX_RESERVED_VMID) { - DRM_ERROR("Over limitation of reserved vmid\n"); - atomic_dec(&id_mgr->reserved_vmid_num); - r = -EINVAL; - goto unlock; + + ++id_mgr->reserved_use_count; + if (!id_mgr->reserved) { + struct amdgpu_vmid *id; + + id = list_first_entry(&id_mgr->ids_lru, struct amdgpu_vmid, + list); + /* Remove from normal round robin handling */ + list_del_init(&id->list); + id_mgr->reserved = id; } - /* Select the first entry VMID */ - idle = list_first_entry(&id_mgr->ids_lru, struct amdgpu_vmid, list); - list_del_init(&idle->list); - vm->reserved_vmid[vmhub] = idle; - mutex_unlock(&id_mgr->lock); + vm->reserved_vmid[vmhub] = true; - return 0; unlock: mutex_unlock(&id_mgr->lock); - return r; + return 0; } void amdgpu_vmid_free_reserved(struct amdgpu_device *adev, @@ -496,12 +493,12 @@ void amdgpu_vmid_free_reserved(struct amdgpu_device *adev, struct amdgpu_vmid_mgr *id_mgr = &adev->vm_manager.id_mgr[vmhub]; mutex_lock(&id_mgr->lock); - if (vm->reserved_vmid[vmhub]) { - list_add(&vm->reserved_vmid[vmhub]->list, - &id_mgr->ids_lru); - vm->reserved_vmid[vmhub] = NULL; - atomic_dec(&id_mgr->reserved_vmid_num); + if (vm->reserved_vmid[vmhub] && + !--id_mgr->reserved_use_count) { + /* give the reserved ID back to normal round robin */ + list_add(&id_mgr->reserved->list, &id_mgr->ids_lru); } + vm->reserved_vmid[vmhub] = false; mutex_unlock(&id_mgr->lock); } @@ -568,7 +565,7 @@ void amdgpu_vmid_mgr_init(struct amdgpu_device *adev) mutex_init(&id_mgr->lock); INIT_LIST_HEAD(&id_mgr->ids_lru); - atomic_set(&id_mgr->reserved_vmid_num, 0); + id_mgr->reserved_use_count = 0; /* manage only VMIDs not used by KFD */ id_mgr->num_ids = adev->vm_manager.first_kfd_vmid; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h index 57efe61dceed..d1cc09b45da4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h @@ -67,7 +67,8 @@ struct amdgpu_vmid_mgr { unsigned num_ids; struct list_head ids_lru; struct amdgpu_vmid ids[AMDGPU_NUM_VMID]; - atomic_t reserved_vmid_num; + struct amdgpu_vmid *reserved; + unsigned int reserved_use_count; }; int amdgpu_pasid_alloc(unsigned int bits); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h index 6546e786bf00..094bb4807303 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h @@ -119,9 +119,6 @@ struct amdgpu_bo_vm; /* Reserve 2MB at top/bottom of address space for kernel use */ #define AMDGPU_VA_RESERVED_SIZE (2ULL << 20) -/* max vmids dedicated for process */ -#define AMDGPU_VM_MAX_RESERVED_VMID 1 - /* See vm_update_mode */ #define AMDGPU_VM_USE_CPU_FOR_GFX (1 << 0) #define AMDGPU_VM_USE_CPU_FOR_COMPUTE (1 << 1) @@ -298,8 +295,7 @@ struct amdgpu_vm { struct dma_fence *last_unlocked; unsigned int pasid; - /* dedicated to vm */ - struct amdgpu_vmid *reserved_vmid[AMDGPU_MAX_VMHUBS]; + bool reserved_vmid[AMDGPU_MAX_VMHUBS]; /* Flag to indicate if VM tables are updated by CPU or GPU (SDMA) */ bool use_cpu_for_update; -- cgit v1.2.3 From 28afcb0ad54c858d0f426b340e88e0277a375597 Mon Sep 17 00:00:00 2001 From: Luben Tuikov Date: Mon, 12 Dec 2022 12:04:42 -0500 Subject: drm/amdgpu: Check if fru_addr is not NULL (v2) Always check if fru_addr is not NULL. This commit also fixes a "smatch" warning. v2: Add a Fixes tag. Cc: Alex Deucher Cc: Dan Carpenter Cc: kernel test robot Cc: AMD Graphics Fixes: afbe5d1e4bd7c7 ("drm/amdgpu: Bug-fix: Reading I2C FRU data on newer ASICs") Signed-off-by: Luben Tuikov Reviewed-by: Kent Russell Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_fru_eeprom.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fru_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fru_eeprom.c index 2c38ac7bc643..4620c4712ce3 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fru_eeprom.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fru_eeprom.c @@ -64,7 +64,8 @@ static bool is_fru_eeprom_supported(struct amdgpu_device *adev, u32 *fru_addr) sizeof(atom_ctx->vbios_version)) || strnstr(atom_ctx->vbios_version, "D163", sizeof(atom_ctx->vbios_version))) { - *fru_addr = FRU_EEPROM_MADDR_6; + if (fru_addr) + *fru_addr = FRU_EEPROM_MADDR_6; return true; } else { return false; @@ -83,7 +84,8 @@ static bool is_fru_eeprom_supported(struct amdgpu_device *adev, u32 *fru_addr) sizeof(atom_ctx->vbios_version))) { return false; } else { - *fru_addr = FRU_EEPROM_MADDR_6; + if (fru_addr) + *fru_addr = FRU_EEPROM_MADDR_6; return true; } } else { -- cgit v1.2.3 From 7554886daa31eacc8e7fac9e15bbce67d10b8f1f Mon Sep 17 00:00:00 2001 From: Luben Tuikov Date: Sat, 10 Dec 2022 02:51:19 -0500 Subject: drm/amdgpu: Fix size validation for non-exclusive domains (v4) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix amdgpu_bo_validate_size() to check whether the TTM domain manager for the requested memory exists, else we get a kernel oops when dereferencing "man". v2: Make the patch standalone, i.e. not dependent on local patches. v3: Preserve old behaviour and just check that the manager pointer is not NULL. v4: Complain if GTT domain requested and it is uninitialized--most likely a bug. Cc: Alex Deucher Cc: Christian König Cc: AMD Graphics Signed-off-by: Luben Tuikov Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c index 3393c1a6a0ff..26c6e9b2d460 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c @@ -448,27 +448,24 @@ static bool amdgpu_bo_validate_size(struct amdgpu_device *adev, /* * If GTT is part of requested domains the check must succeed to - * allow fall back to GTT + * allow fall back to GTT. */ if (domain & AMDGPU_GEM_DOMAIN_GTT) { man = ttm_manager_type(&adev->mman.bdev, TTM_PL_TT); - if (size < man->size) + if (man && size < man->size) return true; - else - goto fail; - } - - if (domain & AMDGPU_GEM_DOMAIN_VRAM) { + else if (!man) + WARN_ON_ONCE("GTT domain requested but GTT mem manager uninitialized"); + goto fail; + } else if (domain & AMDGPU_GEM_DOMAIN_VRAM) { man = ttm_manager_type(&adev->mman.bdev, TTM_PL_VRAM); - if (size < man->size) + if (man && size < man->size) return true; - else - goto fail; + goto fail; } - /* TODO add more domains checks, such as AMDGPU_GEM_DOMAIN_CPU */ return true; -- cgit v1.2.3 From 3273f11675ef11959d25a56df3279f712bcd41b7 Mon Sep 17 00:00:00 2001 From: Luben Tuikov Date: Wed, 14 Dec 2022 03:56:03 -0500 Subject: drm/amdgpu: Remove unnecessary domain argument MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove the "domain" argument to amdgpu_bo_create_kernel_at() since this function takes an "offset" argument which is the offset off of VRAM, and as such allocation always takes place in VRAM. Thus, the "domain" argument is unnecessary. Cc: Alex Deucher Cc: Christian König Cc: AMD Graphics Signed-off-by: Luben Tuikov Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 10 +++++----- drivers/gpu/drm/amd/amdgpu/amdgpu_object.h | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 7 ------- drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 1 - 4 files changed, 6 insertions(+), 14 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c index 26c6e9b2d460..0f6385941480 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c @@ -346,17 +346,16 @@ int amdgpu_bo_create_kernel(struct amdgpu_device *adev, * @adev: amdgpu device object * @offset: offset of the BO * @size: size of the BO - * @domain: where to place it * @bo_ptr: used to initialize BOs in structures * @cpu_addr: optional CPU address mapping * - * Creates a kernel BO at a specific offset in the address space of the domain. + * Creates a kernel BO at a specific offset in VRAM. * * Returns: * 0 on success, negative error code otherwise. */ int amdgpu_bo_create_kernel_at(struct amdgpu_device *adev, - uint64_t offset, uint64_t size, uint32_t domain, + uint64_t offset, uint64_t size, struct amdgpu_bo **bo_ptr, void **cpu_addr) { struct ttm_operation_ctx ctx = { false, false }; @@ -366,8 +365,9 @@ int amdgpu_bo_create_kernel_at(struct amdgpu_device *adev, offset &= PAGE_MASK; size = ALIGN(size, PAGE_SIZE); - r = amdgpu_bo_create_reserved(adev, size, PAGE_SIZE, domain, bo_ptr, - NULL, cpu_addr); + r = amdgpu_bo_create_reserved(adev, size, PAGE_SIZE, + AMDGPU_GEM_DOMAIN_VRAM, bo_ptr, NULL, + cpu_addr); if (r) return r; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h index 147b79c10cbb..93207badf83f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h @@ -284,7 +284,7 @@ int amdgpu_bo_create_kernel(struct amdgpu_device *adev, u32 domain, struct amdgpu_bo **bo_ptr, u64 *gpu_addr, void **cpu_addr); int amdgpu_bo_create_kernel_at(struct amdgpu_device *adev, - uint64_t offset, uint64_t size, uint32_t domain, + uint64_t offset, uint64_t size, struct amdgpu_bo **bo_ptr, void **cpu_addr); int amdgpu_bo_create_user(struct amdgpu_device *adev, struct amdgpu_bo_param *bp, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c index f0e4c7309438..55e0284b2bdd 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c @@ -1576,7 +1576,6 @@ static int amdgpu_ttm_fw_reserve_vram_init(struct amdgpu_device *adev) return amdgpu_bo_create_kernel_at(adev, adev->mman.fw_vram_usage_start_offset, adev->mman.fw_vram_usage_size, - AMDGPU_GEM_DOMAIN_VRAM, &adev->mman.fw_vram_usage_reserved_bo, &adev->mman.fw_vram_usage_va); } @@ -1602,7 +1601,6 @@ static int amdgpu_ttm_drv_reserve_vram_init(struct amdgpu_device *adev) return amdgpu_bo_create_kernel_at(adev, adev->mman.drv_vram_usage_start_offset, adev->mman.drv_vram_usage_size, - AMDGPU_GEM_DOMAIN_VRAM, &adev->mman.drv_vram_usage_reserved_bo, &adev->mman.drv_vram_usage_va); } @@ -1683,7 +1681,6 @@ static int amdgpu_ttm_reserve_tmr(struct amdgpu_device *adev) ret = amdgpu_bo_create_kernel_at(adev, ctx->c2p_train_data_offset, ctx->train_data_size, - AMDGPU_GEM_DOMAIN_VRAM, &ctx->c2p_bo, NULL); if (ret) { @@ -1697,7 +1694,6 @@ static int amdgpu_ttm_reserve_tmr(struct amdgpu_device *adev) ret = amdgpu_bo_create_kernel_at(adev, adev->gmc.real_vram_size - adev->mman.discovery_tmr_size, adev->mman.discovery_tmr_size, - AMDGPU_GEM_DOMAIN_VRAM, &adev->mman.discovery_memory, NULL); if (ret) { @@ -1798,21 +1794,18 @@ int amdgpu_ttm_init(struct amdgpu_device *adev) * avoid display artifacts while transitioning between pre-OS * and driver. */ r = amdgpu_bo_create_kernel_at(adev, 0, adev->mman.stolen_vga_size, - AMDGPU_GEM_DOMAIN_VRAM, &adev->mman.stolen_vga_memory, NULL); if (r) return r; r = amdgpu_bo_create_kernel_at(adev, adev->mman.stolen_vga_size, adev->mman.stolen_extended_size, - AMDGPU_GEM_DOMAIN_VRAM, &adev->mman.stolen_extended_memory, NULL); if (r) return r; r = amdgpu_bo_create_kernel_at(adev, adev->mman.stolen_reserved_offset, adev->mman.stolen_reserved_size, - AMDGPU_GEM_DOMAIN_VRAM, &adev->mman.stolen_reserved_memory, NULL); if (r) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c index 15544f262ec1..2994b9db196f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c @@ -395,7 +395,6 @@ static void amdgpu_virt_ras_reserve_bps(struct amdgpu_device *adev) */ if (amdgpu_bo_create_kernel_at(adev, bp << AMDGPU_GPU_PAGE_SHIFT, AMDGPU_GPU_PAGE_SIZE, - AMDGPU_GEM_DOMAIN_VRAM, &bo, NULL)) DRM_DEBUG("RAS WARN: reserve vram for retired page %llx fail\n", bp); -- cgit v1.2.3 From 47722220660cfb935e27e62d385959ecc296cddb Mon Sep 17 00:00:00 2001 From: Christian König Date: Mon, 12 Dec 2022 17:31:57 +0100 Subject: drm/amdgpu: revert "generally allow over-commit during BO allocation" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit f9d00a4a8dc8fff951c97b3213f90d6bc7a72175. This causes problem for KFD because when we overcommit we accidentially bind the BO to GTT for moving it into VRAM. We also need to make sure that this is done only as fallback after trying to evict first. Signed-off-by: Christian König Reviewed-by: Felix Kuehling Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c | 16 +++++++++++++--- drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 6 +++++- 2 files changed, 18 insertions(+), 4 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c index 62e98f1ad770..a0780a4e3e61 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c @@ -113,7 +113,7 @@ int amdgpu_gem_object_create(struct amdgpu_device *adev, unsigned long size, bp.resv = resv; bp.preferred_domain = initial_domain; bp.flags = flags; - bp.domain = initial_domain | AMDGPU_GEM_DOMAIN_CPU; + bp.domain = initial_domain; bp.bo_ptr_size = sizeof(struct amdgpu_bo); r = amdgpu_bo_create_user(adev, &bp, &ubo); @@ -332,10 +332,20 @@ int amdgpu_gem_create_ioctl(struct drm_device *dev, void *data, } initial_domain = (u32)(0xffffffff & args->in.domains); +retry: r = amdgpu_gem_object_create(adev, size, args->in.alignment, - initial_domain, flags, ttm_bo_type_device, - resv, &gobj); + initial_domain, + flags, ttm_bo_type_device, resv, &gobj); if (r && r != -ERESTARTSYS) { + if (flags & AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED) { + flags &= ~AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED; + goto retry; + } + + if (initial_domain == AMDGPU_GEM_DOMAIN_VRAM) { + initial_domain |= AMDGPU_GEM_DOMAIN_GTT; + goto retry; + } DRM_DEBUG("Failed to allocate GEM object (%llu, %d, %llu, %d)\n", size, initial_domain, args->in.alignment, r); } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c index 0f6385941480..4e684c2afc70 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c @@ -580,7 +580,11 @@ int amdgpu_bo_create(struct amdgpu_device *adev, bo->flags |= AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE; bo->tbo.bdev = &adev->mman.bdev; - amdgpu_bo_placement_from_domain(bo, bp->domain); + if (bp->domain & (AMDGPU_GEM_DOMAIN_GWS | AMDGPU_GEM_DOMAIN_OA | + AMDGPU_GEM_DOMAIN_GDS)) + amdgpu_bo_placement_from_domain(bo, AMDGPU_GEM_DOMAIN_CPU); + else + amdgpu_bo_placement_from_domain(bo, bp->domain); if (bp->type == ttm_bo_type_kernel) bo->tbo.priority = 1; -- cgit v1.2.3 From 1a799c4c190ea9f0e81028e3eb3037ed0ab17ff5 Mon Sep 17 00:00:00 2001 From: Philip Yang Date: Tue, 13 Dec 2022 00:50:03 -0500 Subject: drm/amdkfd: Fix double release compute pasid If kfd_process_device_init_vm returns failure after vm is converted to compute vm and vm->pasid set to compute pasid, KFD will not take pdd->drm_file reference. As a result, drm close file handler maybe called to release the compute pasid before KFD process destroy worker to release the same pasid and set vm->pasid to zero, this generates below WARNING backtrace and NULL pointer access. Add helper amdgpu_amdkfd_gpuvm_set_vm_pasid and call it at the last step of kfd_process_device_init_vm, to ensure vm pasid is the original pasid if acquiring vm failed or is the compute pasid with pdd->drm_file reference taken to avoid double release same pasid. amdgpu: Failed to create process VM object ida_free called for id=32770 which is not allocated. WARNING: CPU: 57 PID: 72542 at ../lib/idr.c:522 ida_free+0x96/0x140 RIP: 0010:ida_free+0x96/0x140 Call Trace: amdgpu_pasid_free_delayed+0xe1/0x2a0 [amdgpu] amdgpu_driver_postclose_kms+0x2d8/0x340 [amdgpu] drm_file_free.part.13+0x216/0x270 [drm] drm_close_helper.isra.14+0x60/0x70 [drm] drm_release+0x6e/0xf0 [drm] __fput+0xcc/0x280 ____fput+0xe/0x20 task_work_run+0x96/0xc0 do_exit+0x3d0/0xc10 BUG: kernel NULL pointer dereference, address: 0000000000000000 RIP: 0010:ida_free+0x76/0x140 Call Trace: amdgpu_pasid_free_delayed+0xe1/0x2a0 [amdgpu] amdgpu_driver_postclose_kms+0x2d8/0x340 [amdgpu] drm_file_free.part.13+0x216/0x270 [drm] drm_close_helper.isra.14+0x60/0x70 [drm] drm_release+0x6e/0xf0 [drm] __fput+0xcc/0x280 ____fput+0xe/0x20 task_work_run+0x96/0xc0 do_exit+0x3d0/0xc10 Signed-off-by: Philip Yang Reviewed-by: Felix Kuehling Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 4 ++- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 39 +++++++++++++++++------- drivers/gpu/drm/amd/amdkfd/kfd_process.c | 12 ++++++-- 3 files changed, 40 insertions(+), 15 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h index 589939631ed4..0040deaf8a83 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h @@ -270,8 +270,10 @@ int amdgpu_amdkfd_get_pcie_bandwidth_mbytes(struct amdgpu_device *adev, bool is_ (&((struct amdgpu_fpriv *) \ ((struct drm_file *)(drm_priv))->driver_priv)->vm) +int amdgpu_amdkfd_gpuvm_set_vm_pasid(struct amdgpu_device *adev, + struct file *filp, u32 pasid); int amdgpu_amdkfd_gpuvm_acquire_process_vm(struct amdgpu_device *adev, - struct file *filp, u32 pasid, + struct file *filp, void **process_info, struct dma_fence **ef); void amdgpu_amdkfd_gpuvm_release_process_vm(struct amdgpu_device *adev, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c index 0a854bb8b47e..b15091d8310d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c @@ -1429,10 +1429,9 @@ static void amdgpu_amdkfd_gpuvm_unpin_bo(struct amdgpu_bo *bo) amdgpu_bo_unreserve(bo); } -int amdgpu_amdkfd_gpuvm_acquire_process_vm(struct amdgpu_device *adev, - struct file *filp, u32 pasid, - void **process_info, - struct dma_fence **ef) +int amdgpu_amdkfd_gpuvm_set_vm_pasid(struct amdgpu_device *adev, + struct file *filp, u32 pasid) + { struct amdgpu_fpriv *drv_priv; struct amdgpu_vm *avm; @@ -1443,10 +1442,6 @@ int amdgpu_amdkfd_gpuvm_acquire_process_vm(struct amdgpu_device *adev, return ret; avm = &drv_priv->vm; - /* Already a compute VM? */ - if (avm->process_info) - return -EINVAL; - /* Free the original amdgpu allocated pasid, * will be replaced with kfd allocated pasid. */ @@ -1455,14 +1450,36 @@ int amdgpu_amdkfd_gpuvm_acquire_process_vm(struct amdgpu_device *adev, amdgpu_vm_set_pasid(adev, avm, 0); } - /* Convert VM into a compute VM */ - ret = amdgpu_vm_make_compute(adev, avm); + ret = amdgpu_vm_set_pasid(adev, avm, pasid); if (ret) return ret; - ret = amdgpu_vm_set_pasid(adev, avm, pasid); + return 0; +} + +int amdgpu_amdkfd_gpuvm_acquire_process_vm(struct amdgpu_device *adev, + struct file *filp, + void **process_info, + struct dma_fence **ef) +{ + struct amdgpu_fpriv *drv_priv; + struct amdgpu_vm *avm; + int ret; + + ret = amdgpu_file_to_fpriv(filp, &drv_priv); if (ret) return ret; + avm = &drv_priv->vm; + + /* Already a compute VM? */ + if (avm->process_info) + return -EINVAL; + + /* Convert VM into a compute VM */ + ret = amdgpu_vm_make_compute(adev, avm); + if (ret) + return ret; + /* Initialize KFD part of the VM and process info */ ret = init_kfd_vm(avm, process_info, ef); if (ret) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c index 6caa9dd57ff1..51b1683ac5c1 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c @@ -1576,9 +1576,9 @@ int kfd_process_device_init_vm(struct kfd_process_device *pdd, p = pdd->process; dev = pdd->dev; - ret = amdgpu_amdkfd_gpuvm_acquire_process_vm( - dev->adev, drm_file, p->pasid, - &p->kgd_process_info, &p->ef); + ret = amdgpu_amdkfd_gpuvm_acquire_process_vm(dev->adev, drm_file, + &p->kgd_process_info, + &p->ef); if (ret) { pr_err("Failed to create process VM object\n"); return ret; @@ -1593,10 +1593,16 @@ int kfd_process_device_init_vm(struct kfd_process_device *pdd, if (ret) goto err_init_cwsr; + ret = amdgpu_amdkfd_gpuvm_set_vm_pasid(dev->adev, drm_file, p->pasid); + if (ret) + goto err_set_pasid; + pdd->drm_file = drm_file; return 0; +err_set_pasid: + kfd_process_device_destroy_cwsr_dgpu(pdd); err_init_cwsr: kfd_process_device_destroy_ib_mem(pdd); err_reserve_ib_mem: -- cgit v1.2.3 From afa6646b1c5d3affd541f76bd7476e4b835a9174 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Fri, 16 Dec 2022 11:42:20 -0500 Subject: drm/amdgpu: skip MES for S0ix as well since it's part of GFX It's also part of gfxoff. Cc: stable@vger.kernel.org # 6.0, 6.1 Reviewed-by: Mario Limonciello Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 64660a41d53c..afe6af9c0138 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -3016,14 +3016,15 @@ static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) continue; } - /* skip suspend of gfx and psp for S0ix + /* skip suspend of gfx/mes and psp for S0ix * gfx is in gfxoff state, so on resume it will exit gfxoff just * like at runtime. PSP is also part of the always on hardware * so no need to suspend it. */ if (adev->in_s0ix && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP || - adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX)) + adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX || + adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES)) continue; /* XXX handle errors */ -- cgit v1.2.3 From 8660495a9c5b9afeec4cc006b3b75178f0fb2f10 Mon Sep 17 00:00:00 2001 From: Tim Huang Date: Mon, 19 Dec 2022 18:32:32 +0800 Subject: drm/amdgpu: skip mes self test after s0i3 resume for MES IP v11.0 MES is part of gfxoff and MES suspend and resume are skipped for S0i3. But the mes_self_test call path is still in the amdgpu_device_ip_late_init. it's should also be skipped for s0ix as no hardware re-initialization happened. Besides, mes_self_test will free the BO that triggers a lot of warning messages while in the suspend state. [ 81.656085] WARNING: CPU: 2 PID: 1550 at drivers/gpu/drm/amd/amdgpu/amdgpu_object.c:425 amdgpu_bo_free_kernel+0xfc/0x110 [amdgpu] [ 81.679435] Call Trace: [ 81.679726] [ 81.679981] amdgpu_mes_remove_hw_queue+0x17a/0x230 [amdgpu] [ 81.680857] amdgpu_mes_self_test+0x390/0x430 [amdgpu] [ 81.681665] mes_v11_0_late_init+0x37/0x50 [amdgpu] [ 81.682423] amdgpu_device_ip_late_init+0x53/0x280 [amdgpu] [ 81.683257] amdgpu_device_resume+0xae/0x2a0 [amdgpu] [ 81.684043] amdgpu_pmops_resume+0x37/0x70 [amdgpu] [ 81.684818] pci_pm_resume+0x5c/0xa0 [ 81.685247] ? pci_pm_thaw+0x90/0x90 [ 81.685658] dpm_run_callback+0x4e/0x160 [ 81.686110] device_resume+0xad/0x210 [ 81.686529] async_resume+0x1e/0x40 [ 81.686931] async_run_entry_fn+0x33/0x120 [ 81.687405] process_one_work+0x21d/0x3f0 [ 81.687869] worker_thread+0x4a/0x3c0 [ 81.688293] ? process_one_work+0x3f0/0x3f0 [ 81.688777] kthread+0xff/0x130 [ 81.689157] ? kthread_complete_and_exit+0x20/0x20 [ 81.689707] ret_from_fork+0x22/0x30 [ 81.690118] [ 81.690380] ---[ end trace 0000000000000000 ]--- v2: make the comment clean and use adev->in_s0ix instead of adev->suspend Signed-off-by: Tim Huang Reviewed-by: Mario Limonciello Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org # 6.0, 6.1 --- drivers/gpu/drm/amd/amdgpu/mes_v11_0.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers/gpu/drm/amd/amdgpu') diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c index 5459366f49ff..970b066b37bb 100644 --- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c @@ -1342,7 +1342,8 @@ static int mes_v11_0_late_init(void *handle) { struct amdgpu_device *adev = (struct amdgpu_device *)handle; - if (!amdgpu_in_reset(adev) && + /* it's only intended for use in mes_self_test case, not for s0ix and reset */ + if (!amdgpu_in_reset(adev) && !adev->in_s0ix && (adev->ip_versions[GC_HWIP][0] != IP_VERSION(11, 0, 3))) amdgpu_mes_self_test(adev); -- cgit v1.2.3 From e1d900df63adcb748905131dd6258e570e11aed1 Mon Sep 17 00:00:00 2001 From: Saleemkhan Jamadar Date: Tue, 20 Dec 2022 13:21:44 +0530 Subject: drm/amdgpu: enable VCN DPG for GC IP v11.0.4 Enable VCN Dynamic Power Gating control for GC IP v11.0.4. Signed-off-by: Saleemkhan Jamadar Reviewed-by: Veerabadhran Gopalakrishnan Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org # 6.0, 6.1 --- drivers/gpu/drm/amd/amdgpu/soc21.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/gpu/drm/amd/amdgpu') diff --git a/drivers/gpu/drm/amd/amdgpu/soc21.c b/drivers/gpu/drm/amd/amdgpu/soc21.c index 7d5fdf450d0c..5562670b7b52 100644 --- a/drivers/gpu/drm/amd/amdgpu/soc21.c +++ b/drivers/gpu/drm/amd/amdgpu/soc21.c @@ -666,6 +666,7 @@ static int soc21_common_early_init(void *handle) AMD_CG_SUPPORT_VCN_MGCG | AMD_CG_SUPPORT_JPEG_MGCG; adev->pg_flags = AMD_PG_SUPPORT_VCN | + AMD_PG_SUPPORT_VCN_DPG | AMD_PG_SUPPORT_GFX_PG | AMD_PG_SUPPORT_JPEG; adev->external_rev_id = adev->rev_id + 0x1; -- cgit v1.2.3 From c1c4a8b217213c1924eabf4f28385bbee9cc50c1 Mon Sep 17 00:00:00 2001 From: Christian König Date: Mon, 19 Dec 2022 11:47:18 +0100 Subject: drm/amdgpu: grab extra fence reference for drm_sched_job_add_dependency MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit That function consumes the reference. Reviewed-by: Luben Tuikov Reported-by: Borislav Petkov (AMD) Tested-by: Borislav Petkov (AMD) Signed-off-by: Christian König Fixes: aab9cf7b6954 ("drm/amdgpu: use scheduler dependencies for VM updates") Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c index 59cf64216fbb..535cd6569bcc 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c @@ -238,8 +238,10 @@ static int amdgpu_vm_sdma_update(struct amdgpu_vm_update_params *p, /* Wait for PD/PT moves to be completed */ dma_resv_iter_begin(&cursor, bo->tbo.base.resv, DMA_RESV_USAGE_KERNEL); dma_resv_for_each_fence_unlocked(&cursor, fence) { + dma_fence_get(fence); r = drm_sched_job_add_dependency(&p->job->base, fence); if (r) { + dma_fence_put(fence); dma_resv_iter_end(&cursor); return r; } -- cgit v1.2.3