From 9527b9caf82b040e794fba2667f6d8a85a793bb7 Mon Sep 17 00:00:00 2001 From: Philip Yang Date: Fri, 11 Mar 2022 15:22:15 -0500 Subject: drm/amdkfd: evict svm bo worker handle error Migrate vram to ram may fail to find the vma if process is exiting and vma is removed, evict svm bo worker sets prange->svm_bo to NULL and warn svm_bo ref count != 1 only if migrating vram to ram successfully. Signed-off-by: Philip Yang Reviewed-by: Felix Kuehling Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdkfd/kfd_migrate.c | 29 +++++++++++++++++++++++------ drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 21 +++++++++++++-------- 2 files changed, 36 insertions(+), 14 deletions(-) (limited to 'drivers/gpu/drm/amd/amdkfd') diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c index 7f689094be5a..3a29d857640b 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c @@ -638,6 +638,22 @@ out_oom: return r; } +/** + * svm_migrate_vma_to_ram - migrate range inside one vma from device to system + * + * @adev: amdgpu device to migrate from + * @prange: svm range structure + * @vma: vm_area_struct that range [start, end] belongs to + * @start: range start virtual address in pages + * @end: range end virtual address in pages + * + * Context: Process context, caller hold mmap read lock, prange->migrate_mutex + * + * Return: + * 0 - success with all pages migrated + * negative values - indicate error + * positive values - partial migration, number of pages not migrated + */ static long svm_migrate_vma_to_ram(struct amdgpu_device *adev, struct svm_range *prange, struct vm_area_struct *vma, uint64_t start, uint64_t end) @@ -709,8 +725,6 @@ out: pdd = svm_range_get_pdd_by_adev(prange, adev); if (pdd) WRITE_ONCE(pdd->page_out, pdd->page_out + cpages); - - return upages; } return r ? r : upages; } @@ -720,7 +734,7 @@ out: * @prange: range structure * @mm: process mm, use current->mm if NULL * - * Context: Process context, caller hold mmap read lock, svms lock, prange lock + * Context: Process context, caller hold mmap read lock, prange->migrate_mutex * * Return: * 0 - OK, otherwise error code @@ -759,13 +773,16 @@ int svm_migrate_vram_to_ram(struct svm_range *prange, struct mm_struct *mm) unsigned long next; vma = find_vma(mm, addr); - if (!vma || addr < vma->vm_start) + if (!vma || addr < vma->vm_start) { + pr_debug("failed to find vma for prange %p\n", prange); + r = -EFAULT; break; + } next = min(vma->vm_end, end); r = svm_migrate_vma_to_ram(adev, prange, vma, addr, next); if (r < 0) { - pr_debug("failed %ld to migrate\n", r); + pr_debug("failed %ld to migrate prange %p\n", r, prange); break; } else { upages += r; @@ -773,7 +790,7 @@ int svm_migrate_vram_to_ram(struct svm_range *prange, struct mm_struct *mm) addr = next; } - if (!upages) { + if (r >= 0 && !upages) { svm_range_vram_node_free(prange); prange->actual_loc = 0; } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c index 509d915cbe69..3b8856b4cece 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c @@ -3155,6 +3155,7 @@ static void svm_range_evict_svm_bo_worker(struct work_struct *work) struct svm_range_bo *svm_bo; struct kfd_process *p; struct mm_struct *mm; + int r = 0; svm_bo = container_of(work, struct svm_range_bo, eviction_work); if (!svm_bo_ref_unless_zero(svm_bo)) @@ -3170,7 +3171,7 @@ static void svm_range_evict_svm_bo_worker(struct work_struct *work) mmap_read_lock(mm); spin_lock(&svm_bo->list_lock); - while (!list_empty(&svm_bo->range_list)) { + while (!list_empty(&svm_bo->range_list) && !r) { struct svm_range *prange = list_first_entry(&svm_bo->range_list, struct svm_range, svm_bo_list); @@ -3184,15 +3185,18 @@ static void svm_range_evict_svm_bo_worker(struct work_struct *work) mutex_lock(&prange->migrate_mutex); do { - svm_migrate_vram_to_ram(prange, + r = svm_migrate_vram_to_ram(prange, svm_bo->eviction_fence->mm); - } while (prange->actual_loc && --retries); - WARN(prange->actual_loc, "Migration failed during eviction"); + } while (!r && prange->actual_loc && --retries); - mutex_lock(&prange->lock); - prange->svm_bo = NULL; - mutex_unlock(&prange->lock); + if (!r && prange->actual_loc) + pr_info_once("Migration failed during eviction"); + if (!prange->actual_loc) { + mutex_lock(&prange->lock); + prange->svm_bo = NULL; + mutex_unlock(&prange->lock); + } mutex_unlock(&prange->migrate_mutex); spin_lock(&svm_bo->list_lock); @@ -3201,10 +3205,11 @@ static void svm_range_evict_svm_bo_worker(struct work_struct *work) mmap_read_unlock(mm); dma_fence_signal(&svm_bo->eviction_fence->base); + /* This is the last reference to svm_bo, after svm_range_vram_node_free * has been called in svm_migrate_vram_to_ram */ - WARN_ONCE(kref_read(&svm_bo->kref) != 1, "This was not the last reference\n"); + WARN_ONCE(!r && kref_read(&svm_bo->kref) != 1, "This was not the last reference\n"); svm_range_bo_unref(svm_bo); } -- cgit v1.2.3