From e709accc7670a07b2e5186449f0640c2416662ec Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 24 Jul 2019 08:52:58 +0200 Subject: mm/hmm: comment on VM_FAULT_RETRY semantics in handle_mm_fault The magic dropping of mmap_sem when handle_mm_fault returns VM_FAULT_RETRY is rather subtile. Add a comment explaining it. Link: https://lore.kernel.org/r/20190724065258.16603-8-hch@lst.de Tested-by: Ralph Campbell Signed-off-by: Jason Gunthorpe [hch: wrote a changelog] Signed-off-by: Christoph Hellwig --- mm/hmm.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/hmm.c b/mm/hmm.c index 16b6731a34db..54b3a4162ae9 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -301,8 +301,10 @@ static int hmm_vma_do_fault(struct mm_walk *walk, unsigned long addr, flags |= hmm_vma_walk->block ? 0 : FAULT_FLAG_ALLOW_RETRY; flags |= write_fault ? FAULT_FLAG_WRITE : 0; ret = handle_mm_fault(vma, addr, flags); - if (ret & VM_FAULT_RETRY) + if (ret & VM_FAULT_RETRY) { + /* Note, handle_mm_fault did up_read(&mm->mmap_sem)) */ return -EAGAIN; + } if (ret & VM_FAULT_ERROR) { *pfn = range->values[HMM_PFN_ERROR]; return -EFAULT; -- cgit v1.2.3 From 1f961807925032daa90267d8a23ff730e7ede07a Mon Sep 17 00:00:00 2001 From: Ralph Campbell Date: Thu, 25 Jul 2019 17:56:44 -0700 Subject: mm/hmm: replace hmm_update with mmu_notifier_range The hmm_mirror_ops callback function sync_cpu_device_pagetables() passes a struct hmm_update which is a simplified version of struct mmu_notifier_range. This is unnecessary so replace hmm_update with mmu_notifier_range directly. Link: https://lore.kernel.org/r/20190726005650.2566-2-rcampbell@nvidia.com Signed-off-by: Ralph Campbell Reviewed: Christoph Hellwig Reviewed-by: Jason Gunthorpe [jgg: white space tuning] Signed-off-by: Jason Gunthorpe --- drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c | 14 ++++++++------ drivers/gpu/drm/nouveau/nouveau_svm.c | 4 ++-- include/linux/hmm.h | 34 ++++++---------------------------- mm/hmm.c | 13 ++++--------- 4 files changed, 20 insertions(+), 45 deletions(-) (limited to 'mm') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c index 3971c201f320..b698b423b25d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c @@ -195,13 +195,14 @@ static void amdgpu_mn_invalidate_node(struct amdgpu_mn_node *node, * Block for operations on BOs to finish and mark pages as accessed and * potentially dirty. */ -static int amdgpu_mn_sync_pagetables_gfx(struct hmm_mirror *mirror, - const struct hmm_update *update) +static int +amdgpu_mn_sync_pagetables_gfx(struct hmm_mirror *mirror, + const struct mmu_notifier_range *update) { struct amdgpu_mn *amn = container_of(mirror, struct amdgpu_mn, mirror); unsigned long start = update->start; unsigned long end = update->end; - bool blockable = update->blockable; + bool blockable = mmu_notifier_range_blockable(update); struct interval_tree_node *it; /* notification is exclusive, but interval is inclusive */ @@ -243,13 +244,14 @@ static int amdgpu_mn_sync_pagetables_gfx(struct hmm_mirror *mirror, * necessitates evicting all user-mode queues of the process. The BOs * are restorted in amdgpu_mn_invalidate_range_end_hsa. */ -static int amdgpu_mn_sync_pagetables_hsa(struct hmm_mirror *mirror, - const struct hmm_update *update) +static int +amdgpu_mn_sync_pagetables_hsa(struct hmm_mirror *mirror, + const struct mmu_notifier_range *update) { struct amdgpu_mn *amn = container_of(mirror, struct amdgpu_mn, mirror); unsigned long start = update->start; unsigned long end = update->end; - bool blockable = update->blockable; + bool blockable = mmu_notifier_range_blockable(update); struct interval_tree_node *it; /* notification is exclusive, but interval is inclusive */ diff --git a/drivers/gpu/drm/nouveau/nouveau_svm.c b/drivers/gpu/drm/nouveau/nouveau_svm.c index 545100f7c594..79b29c918717 100644 --- a/drivers/gpu/drm/nouveau/nouveau_svm.c +++ b/drivers/gpu/drm/nouveau/nouveau_svm.c @@ -252,13 +252,13 @@ nouveau_svmm_invalidate(struct nouveau_svmm *svmm, u64 start, u64 limit) static int nouveau_svmm_sync_cpu_device_pagetables(struct hmm_mirror *mirror, - const struct hmm_update *update) + const struct mmu_notifier_range *update) { struct nouveau_svmm *svmm = container_of(mirror, typeof(*svmm), mirror); unsigned long start = update->start; unsigned long limit = update->end; - if (!update->blockable) + if (!mmu_notifier_range_blockable(update)) return -EAGAIN; SVMM_DBG(svmm, "invalidate %016lx-%016lx", start, limit); diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 9f32586684c9..4f870f7017cb 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -340,29 +340,6 @@ static inline uint64_t hmm_device_entry_from_pfn(const struct hmm_range *range, struct hmm_mirror; -/* - * enum hmm_update_event - type of update - * @HMM_UPDATE_INVALIDATE: invalidate range (no indication as to why) - */ -enum hmm_update_event { - HMM_UPDATE_INVALIDATE, -}; - -/* - * struct hmm_update - HMM update information for callback - * - * @start: virtual start address of the range to update - * @end: virtual end address of the range to update - * @event: event triggering the update (what is happening) - * @blockable: can the callback block/sleep ? - */ -struct hmm_update { - unsigned long start; - unsigned long end; - enum hmm_update_event event; - bool blockable; -}; - /* * struct hmm_mirror_ops - HMM mirror device operations callback * @@ -383,9 +360,9 @@ struct hmm_mirror_ops { /* sync_cpu_device_pagetables() - synchronize page tables * * @mirror: pointer to struct hmm_mirror - * @update: update information (see struct hmm_update) - * Return: -EAGAIN if update.blockable false and callback need to - * block, 0 otherwise. + * @update: update information (see struct mmu_notifier_range) + * Return: -EAGAIN if mmu_notifier_range_blockable(update) is false + * and callback needs to block, 0 otherwise. * * This callback ultimately originates from mmu_notifiers when the CPU * page table is updated. The device driver must update its page table @@ -396,8 +373,9 @@ struct hmm_mirror_ops { * page tables are completely updated (TLBs flushed, etc); this is a * synchronous call. */ - int (*sync_cpu_device_pagetables)(struct hmm_mirror *mirror, - const struct hmm_update *update); + int (*sync_cpu_device_pagetables)( + struct hmm_mirror *mirror, + const struct mmu_notifier_range *update); }; /* diff --git a/mm/hmm.c b/mm/hmm.c index 54b3a4162ae9..4040b4427635 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -165,7 +165,6 @@ static int hmm_invalidate_range_start(struct mmu_notifier *mn, { struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier); struct hmm_mirror *mirror; - struct hmm_update update; struct hmm_range *range; unsigned long flags; int ret = 0; @@ -173,15 +172,10 @@ static int hmm_invalidate_range_start(struct mmu_notifier *mn, if (!kref_get_unless_zero(&hmm->kref)) return 0; - update.start = nrange->start; - update.end = nrange->end; - update.event = HMM_UPDATE_INVALIDATE; - update.blockable = mmu_notifier_range_blockable(nrange); - spin_lock_irqsave(&hmm->ranges_lock, flags); hmm->notifiers++; list_for_each_entry(range, &hmm->ranges, list) { - if (update.end < range->start || update.start >= range->end) + if (nrange->end < range->start || nrange->start >= range->end) continue; range->valid = false; @@ -198,9 +192,10 @@ static int hmm_invalidate_range_start(struct mmu_notifier *mn, list_for_each_entry(mirror, &hmm->mirrors, list) { int rc; - rc = mirror->ops->sync_cpu_device_pagetables(mirror, &update); + rc = mirror->ops->sync_cpu_device_pagetables(mirror, nrange); if (rc) { - if (WARN_ON(update.blockable || rc != -EAGAIN)) + if (WARN_ON(mmu_notifier_range_blockable(nrange) || + rc != -EAGAIN)) continue; ret = -EAGAIN; break; -- cgit v1.2.3 From d2e8d551165ccb6669a0d0273be01ac99d61deba Mon Sep 17 00:00:00 2001 From: Ralph Campbell Date: Thu, 25 Jul 2019 17:56:45 -0700 Subject: mm/hmm: a few more C style and comment clean ups A few more comments and minor programming style clean ups. There should be no functional changes. Link: https://lore.kernel.org/r/20190726005650.2566-3-rcampbell@nvidia.com Signed-off-by: Ralph Campbell Reviewed-by: Christoph Hellwig Reviewed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe --- mm/hmm.c | 39 +++++++++++++++++---------------------- 1 file changed, 17 insertions(+), 22 deletions(-) (limited to 'mm') diff --git a/mm/hmm.c b/mm/hmm.c index 4040b4427635..362944b0fbca 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -32,7 +32,7 @@ static const struct mmu_notifier_ops hmm_mmu_notifier_ops; * hmm_get_or_create - register HMM against an mm (HMM internal) * * @mm: mm struct to attach to - * Returns: returns an HMM object, either by referencing the existing + * Return: an HMM object, either by referencing the existing * (per-process) object, or by creating a new one. * * This is not intended to be used directly by device drivers. If mm already @@ -325,8 +325,8 @@ static int hmm_pfns_bad(unsigned long addr, } /* - * hmm_vma_walk_hole() - handle a range lacking valid pmd or pte(s) - * @start: range virtual start address (inclusive) + * hmm_vma_walk_hole_() - handle a range lacking valid pmd or pte(s) + * @addr: range virtual start address (inclusive) * @end: range virtual end address (exclusive) * @fault: should we fault or not ? * @write_fault: write fault ? @@ -376,9 +376,9 @@ static inline void hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk, /* * So we not only consider the individual per page request we also * consider the default flags requested for the range. The API can - * be use in 2 fashions. The first one where the HMM user coalesce - * multiple page fault into one request and set flags per pfns for - * of those faults. The second one where the HMM user want to pre- + * be used 2 ways. The first one where the HMM user coalesces + * multiple page faults into one request and sets flags per pfn for + * those faults. The second one where the HMM user wants to pre- * fault a range with specific flags. For the latter one it is a * waste to have the user pre-fill the pfn arrays with a default * flags value. @@ -388,7 +388,7 @@ static inline void hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk, /* We aren't ask to do anything ... */ if (!(pfns & range->flags[HMM_PFN_VALID])) return; - /* If this is device memory than only fault if explicitly requested */ + /* If this is device memory then only fault if explicitly requested */ if ((cpu_flags & range->flags[HMM_PFN_DEVICE_PRIVATE])) { /* Do we fault on device memory ? */ if (pfns & range->flags[HMM_PFN_DEVICE_PRIVATE]) { @@ -502,7 +502,7 @@ static int hmm_vma_handle_pmd(struct mm_walk *walk, hmm_vma_walk->last = end; return 0; #else - /* If THP is not enabled then we should never reach that code ! */ + /* If THP is not enabled then we should never reach this code ! */ return -EINVAL; #endif } @@ -522,7 +522,6 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, { struct hmm_vma_walk *hmm_vma_walk = walk->private; struct hmm_range *range = hmm_vma_walk->range; - struct vm_area_struct *vma = walk->vma; bool fault, write_fault; uint64_t cpu_flags; pte_t pte = *ptep; @@ -571,8 +570,7 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, if (fault || write_fault) { pte_unmap(ptep); hmm_vma_walk->last = addr; - migration_entry_wait(vma->vm_mm, - pmdp, addr); + migration_entry_wait(walk->mm, pmdp, addr); return -EBUSY; } return 0; @@ -620,13 +618,11 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp, { struct hmm_vma_walk *hmm_vma_walk = walk->private; struct hmm_range *range = hmm_vma_walk->range; - struct vm_area_struct *vma = walk->vma; uint64_t *pfns = range->pfns; unsigned long addr = start, i; pte_t *ptep; pmd_t pmd; - again: pmd = READ_ONCE(*pmdp); if (pmd_none(pmd)) @@ -648,7 +644,7 @@ again: 0, &fault, &write_fault); if (fault || write_fault) { hmm_vma_walk->last = addr; - pmd_migration_entry_wait(vma->vm_mm, pmdp); + pmd_migration_entry_wait(walk->mm, pmdp); return -EBUSY; } return 0; @@ -657,11 +653,11 @@ again: if (pmd_devmap(pmd) || pmd_trans_huge(pmd)) { /* - * No need to take pmd_lock here, even if some other threads + * No need to take pmd_lock here, even if some other thread * is splitting the huge pmd we will get that event through * mmu_notifier callback. * - * So just read pmd value and check again its a transparent + * So just read pmd value and check again it's a transparent * huge or device mapping one and compute corresponding pfn * values. */ @@ -675,7 +671,7 @@ again: } /* - * We have handled all the valid case above ie either none, migration, + * We have handled all the valid cases above ie either none, migration, * huge or transparent huge. At this point either it is a valid pmd * entry pointing to pte directory or it is a bad pmd that will not * recover. @@ -795,10 +791,10 @@ static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask, pte_t entry; int ret = 0; - size = 1UL << huge_page_shift(h); + size = huge_page_size(h); mask = size - 1; if (range->page_shift != PAGE_SHIFT) { - /* Make sure we are looking at full page. */ + /* Make sure we are looking at a full page. */ if (start & mask) return -EINVAL; if (end < (start + size)) @@ -809,8 +805,7 @@ static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask, size = PAGE_SIZE; } - - ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte); + ptl = huge_pte_lock(hstate_vma(vma), walk->mm, pte); entry = huge_ptep_get(pte); i = (start - range->start) >> range->page_shift; @@ -859,7 +854,7 @@ static void hmm_pfns_clear(struct hmm_range *range, * @start: start virtual address (inclusive) * @end: end virtual address (exclusive) * @page_shift: expect page shift for the range - * Returns 0 on success, -EFAULT if the address space is no longer valid + * Return: 0 on success, -EFAULT if the address space is no longer valid * * Track updates to the CPU page table see include/linux/hmm.h */ -- cgit v1.2.3 From 9a4903e49e495bfd2650862dfae4178bebe4db9c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 25 Jul 2019 17:56:46 -0700 Subject: mm/hmm: replace the block argument to hmm_range_fault with a flags value This allows easier expansion to other flags, and also makes the callers a little easier to read. Link: https://lore.kernel.org/r/20190726005650.2566-4-rcampbell@nvidia.com Signed-off-by: Christoph Hellwig Signed-off-by: Ralph Campbell Reviewed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe --- drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 2 +- drivers/gpu/drm/nouveau/nouveau_svm.c | 2 +- include/linux/hmm.h | 11 ++++- mm/hmm.c | 74 ++++++++++++++++----------------- 4 files changed, 48 insertions(+), 41 deletions(-) (limited to 'mm') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c index e51b48ac48eb..12a59ac83f72 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c @@ -832,7 +832,7 @@ retry: down_read(&mm->mmap_sem); - r = hmm_range_fault(range, true); + r = hmm_range_fault(range, 0); if (unlikely(r < 0)) { if (likely(r == -EAGAIN)) { /* diff --git a/drivers/gpu/drm/nouveau/nouveau_svm.c b/drivers/gpu/drm/nouveau/nouveau_svm.c index 79b29c918717..49b520c60fc5 100644 --- a/drivers/gpu/drm/nouveau/nouveau_svm.c +++ b/drivers/gpu/drm/nouveau/nouveau_svm.c @@ -505,7 +505,7 @@ nouveau_range_fault(struct hmm_mirror *mirror, struct hmm_range *range) return -EBUSY; } - ret = hmm_range_fault(range, true); + ret = hmm_range_fault(range, 0); if (ret <= 0) { if (ret == 0) ret = -EBUSY; diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 4f870f7017cb..89a141060e5b 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -407,12 +407,19 @@ int hmm_range_register(struct hmm_range *range, unsigned long end, unsigned page_shift); void hmm_range_unregister(struct hmm_range *range); + +/* + * Retry fault if non-blocking, drop mmap_sem and return -EAGAIN in that case. + */ +#define HMM_FAULT_ALLOW_RETRY (1 << 0) + long hmm_range_snapshot(struct hmm_range *range); -long hmm_range_fault(struct hmm_range *range, bool block); +long hmm_range_fault(struct hmm_range *range, unsigned int flags); + long hmm_range_dma_map(struct hmm_range *range, struct device *device, dma_addr_t *daddrs, - bool block); + unsigned int flags); long hmm_range_dma_unmap(struct hmm_range *range, struct vm_area_struct *vma, struct device *device, diff --git a/mm/hmm.c b/mm/hmm.c index 362944b0fbca..84f2791d3510 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -281,7 +281,7 @@ struct hmm_vma_walk { struct dev_pagemap *pgmap; unsigned long last; bool fault; - bool block; + unsigned int flags; }; static int hmm_vma_do_fault(struct mm_walk *walk, unsigned long addr, @@ -293,8 +293,11 @@ static int hmm_vma_do_fault(struct mm_walk *walk, unsigned long addr, struct vm_area_struct *vma = walk->vma; vm_fault_t ret; - flags |= hmm_vma_walk->block ? 0 : FAULT_FLAG_ALLOW_RETRY; - flags |= write_fault ? FAULT_FLAG_WRITE : 0; + if (hmm_vma_walk->flags & HMM_FAULT_ALLOW_RETRY) + flags |= FAULT_FLAG_ALLOW_RETRY; + if (write_fault) + flags |= FAULT_FLAG_WRITE; + ret = handle_mm_fault(vma, addr, flags); if (ret & VM_FAULT_RETRY) { /* Note, handle_mm_fault did up_read(&mm->mmap_sem)) */ @@ -1012,26 +1015,26 @@ long hmm_range_snapshot(struct hmm_range *range) } EXPORT_SYMBOL(hmm_range_snapshot); -/* - * hmm_range_fault() - try to fault some address in a virtual address range - * @range: range being faulted - * @block: allow blocking on fault (if true it sleeps and do not drop mmap_sem) - * Return: number of valid pages in range->pfns[] (from range start - * address). This may be zero. If the return value is negative, - * then one of the following values may be returned: +/** + * hmm_range_fault - try to fault some address in a virtual address range + * @range: range being faulted + * @flags: HMM_FAULT_* flags * - * -EINVAL invalid arguments or mm or virtual address are in an - * invalid vma (for instance device file vma). - * -ENOMEM: Out of memory. - * -EPERM: Invalid permission (for instance asking for write and - * range is read only). - * -EAGAIN: If you need to retry and mmap_sem was drop. This can only - * happens if block argument is false. - * -EBUSY: If the the range is being invalidated and you should wait - * for invalidation to finish. - * -EFAULT: Invalid (ie either no valid vma or it is illegal to access - * that range), number of valid pages in range->pfns[] (from - * range start address). + * Return: the number of valid pages in range->pfns[] (from range start + * address), which may be zero. On error one of the following status codes + * can be returned: + * + * -EINVAL: Invalid arguments or mm or virtual address is in an invalid vma + * (e.g., device file vma). + * -ENOMEM: Out of memory. + * -EPERM: Invalid permission (e.g., asking for write and range is read + * only). + * -EAGAIN: A page fault needs to be retried and mmap_sem was dropped. + * -EBUSY: The range has been invalidated and the caller needs to wait for + * the invalidation to finish. + * -EFAULT: Invalid (i.e., either no valid vma or it is illegal to access + * that range) number of valid pages in range->pfns[] (from + * range start address). * * This is similar to a regular CPU page fault except that it will not trigger * any memory migration if the memory being faulted is not accessible by CPUs @@ -1040,7 +1043,7 @@ EXPORT_SYMBOL(hmm_range_snapshot); * On error, for one virtual address in the range, the function will mark the * corresponding HMM pfn entry with an error flag. */ -long hmm_range_fault(struct hmm_range *range, bool block) +long hmm_range_fault(struct hmm_range *range, unsigned int flags) { const unsigned long device_vma = VM_IO | VM_PFNMAP | VM_MIXEDMAP; unsigned long start = range->start, end; @@ -1086,7 +1089,7 @@ long hmm_range_fault(struct hmm_range *range, bool block) hmm_vma_walk.pgmap = NULL; hmm_vma_walk.last = start; hmm_vma_walk.fault = true; - hmm_vma_walk.block = block; + hmm_vma_walk.flags = flags; hmm_vma_walk.range = range; mm_walk.private = &hmm_vma_walk; end = min(range->end, vma->vm_end); @@ -1125,25 +1128,22 @@ long hmm_range_fault(struct hmm_range *range, bool block) EXPORT_SYMBOL(hmm_range_fault); /** - * hmm_range_dma_map() - hmm_range_fault() and dma map page all in one. - * @range: range being faulted - * @device: device against to dma map page to - * @daddrs: dma address of mapped pages - * @block: allow blocking on fault (if true it sleeps and do not drop mmap_sem) - * Return: number of pages mapped on success, -EAGAIN if mmap_sem have been - * drop and you need to try again, some other error value otherwise + * hmm_range_dma_map - hmm_range_fault() and dma map page all in one. + * @range: range being faulted + * @device: device to map page to + * @daddrs: array of dma addresses for the mapped pages + * @flags: HMM_FAULT_* * - * Note same usage pattern as hmm_range_fault(). + * Return: the number of pages mapped on success (including zero), or any + * status return from hmm_range_fault() otherwise. */ -long hmm_range_dma_map(struct hmm_range *range, - struct device *device, - dma_addr_t *daddrs, - bool block) +long hmm_range_dma_map(struct hmm_range *range, struct device *device, + dma_addr_t *daddrs, unsigned int flags) { unsigned long i, npages, mapped; long ret; - ret = hmm_range_fault(range, block); + ret = hmm_range_fault(range, flags); if (ret <= 0) return ret ? ret : -EBUSY; -- cgit v1.2.3 From d45d464b118f428229d91769c8a3cc1e2e0bb4d5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 25 Jul 2019 17:56:47 -0700 Subject: mm/hmm: merge hmm_range_snapshot into hmm_range_fault Add a HMM_FAULT_SNAPSHOT flag so that hmm_range_snapshot can be merged into the almost identical hmm_range_fault function. Link: https://lore.kernel.org/r/20190726005650.2566-5-rcampbell@nvidia.com Signed-off-by: Christoph Hellwig Signed-off-by: Ralph Campbell Reviewed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe --- Documentation/vm/hmm.rst | 17 +++++----- include/linux/hmm.h | 4 ++- mm/hmm.c | 85 ++---------------------------------------------- 3 files changed, 13 insertions(+), 93 deletions(-) (limited to 'mm') diff --git a/Documentation/vm/hmm.rst b/Documentation/vm/hmm.rst index 710ce1c701bf..ddcb5ca8b296 100644 --- a/Documentation/vm/hmm.rst +++ b/Documentation/vm/hmm.rst @@ -192,15 +192,14 @@ read only, or fully unmap, etc.). The device must complete the update before the driver callback returns. When the device driver wants to populate a range of virtual addresses, it can -use either:: +use:: - long hmm_range_snapshot(struct hmm_range *range); - long hmm_range_fault(struct hmm_range *range, bool block); + long hmm_range_fault(struct hmm_range *range, unsigned int flags); -The first one (hmm_range_snapshot()) will only fetch present CPU page table +With the HMM_RANGE_SNAPSHOT flag, it will only fetch present CPU page table entries and will not trigger a page fault on missing or non-present entries. -The second one does trigger a page fault on missing or read-only entries if -write access is requested (see below). Page faults use the generic mm page +Without that flag, it does trigger a page fault on missing or read-only entries +if write access is requested (see below). Page faults use the generic mm page fault code path just like a CPU page fault. Both functions copy CPU page table entries into their pfns array argument. Each @@ -227,20 +226,20 @@ The usage pattern is:: /* * Just wait for range to be valid, safe to ignore return value as we - * will use the return value of hmm_range_snapshot() below under the + * will use the return value of hmm_range_fault() below under the * mmap_sem to ascertain the validity of the range. */ hmm_range_wait_until_valid(&range, TIMEOUT_IN_MSEC); again: down_read(&mm->mmap_sem); - ret = hmm_range_snapshot(&range); + ret = hmm_range_fault(&range, HMM_RANGE_SNAPSHOT); if (ret) { up_read(&mm->mmap_sem); if (ret == -EBUSY) { /* * No need to check hmm_range_wait_until_valid() return value - * on retry we will get proper error with hmm_range_snapshot() + * on retry we will get proper error with hmm_range_fault() */ hmm_range_wait_until_valid(&range, TIMEOUT_IN_MSEC); goto again; diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 89a141060e5b..90dc5944b1bc 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -413,7 +413,9 @@ void hmm_range_unregister(struct hmm_range *range); */ #define HMM_FAULT_ALLOW_RETRY (1 << 0) -long hmm_range_snapshot(struct hmm_range *range); +/* Don't fault in missing PTEs, just snapshot the current state. */ +#define HMM_FAULT_SNAPSHOT (1 << 1) + long hmm_range_fault(struct hmm_range *range, unsigned int flags); long hmm_range_dma_map(struct hmm_range *range, diff --git a/mm/hmm.c b/mm/hmm.c index 84f2791d3510..1bc014cddd78 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -280,7 +280,6 @@ struct hmm_vma_walk { struct hmm_range *range; struct dev_pagemap *pgmap; unsigned long last; - bool fault; unsigned int flags; }; @@ -373,7 +372,7 @@ static inline void hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk, { struct hmm_range *range = hmm_vma_walk->range; - if (!hmm_vma_walk->fault) + if (hmm_vma_walk->flags & HMM_FAULT_SNAPSHOT) return; /* @@ -418,7 +417,7 @@ static void hmm_range_need_fault(const struct hmm_vma_walk *hmm_vma_walk, { unsigned long i; - if (!hmm_vma_walk->fault) { + if (hmm_vma_walk->flags & HMM_FAULT_SNAPSHOT) { *fault = *write_fault = false; return; } @@ -936,85 +935,6 @@ void hmm_range_unregister(struct hmm_range *range) } EXPORT_SYMBOL(hmm_range_unregister); -/* - * hmm_range_snapshot() - snapshot CPU page table for a range - * @range: range - * Return: -EINVAL if invalid argument, -ENOMEM out of memory, -EPERM invalid - * permission (for instance asking for write and range is read only), - * -EBUSY if you need to retry, -EFAULT invalid (ie either no valid - * vma or it is illegal to access that range), number of valid pages - * in range->pfns[] (from range start address). - * - * This snapshots the CPU page table for a range of virtual addresses. Snapshot - * validity is tracked by range struct. See in include/linux/hmm.h for example - * on how to use. - */ -long hmm_range_snapshot(struct hmm_range *range) -{ - const unsigned long device_vma = VM_IO | VM_PFNMAP | VM_MIXEDMAP; - unsigned long start = range->start, end; - struct hmm_vma_walk hmm_vma_walk; - struct hmm *hmm = range->hmm; - struct vm_area_struct *vma; - struct mm_walk mm_walk; - - lockdep_assert_held(&hmm->mm->mmap_sem); - do { - /* If range is no longer valid force retry. */ - if (!range->valid) - return -EBUSY; - - vma = find_vma(hmm->mm, start); - if (vma == NULL || (vma->vm_flags & device_vma)) - return -EFAULT; - - if (is_vm_hugetlb_page(vma)) { - if (huge_page_shift(hstate_vma(vma)) != - range->page_shift && - range->page_shift != PAGE_SHIFT) - return -EINVAL; - } else { - if (range->page_shift != PAGE_SHIFT) - return -EINVAL; - } - - if (!(vma->vm_flags & VM_READ)) { - /* - * If vma do not allow read access, then assume that it - * does not allow write access, either. HMM does not - * support architecture that allow write without read. - */ - hmm_pfns_clear(range, range->pfns, - range->start, range->end); - return -EPERM; - } - - range->vma = vma; - hmm_vma_walk.pgmap = NULL; - hmm_vma_walk.last = start; - hmm_vma_walk.fault = false; - hmm_vma_walk.range = range; - mm_walk.private = &hmm_vma_walk; - end = min(range->end, vma->vm_end); - - mm_walk.vma = vma; - mm_walk.mm = vma->vm_mm; - mm_walk.pte_entry = NULL; - mm_walk.test_walk = NULL; - mm_walk.hugetlb_entry = NULL; - mm_walk.pud_entry = hmm_vma_walk_pud; - mm_walk.pmd_entry = hmm_vma_walk_pmd; - mm_walk.pte_hole = hmm_vma_walk_hole; - mm_walk.hugetlb_entry = hmm_vma_walk_hugetlb_entry; - - walk_page_range(start, end, &mm_walk); - start = end; - } while (start < range->end); - - return (hmm_vma_walk.last - range->start) >> PAGE_SHIFT; -} -EXPORT_SYMBOL(hmm_range_snapshot); - /** * hmm_range_fault - try to fault some address in a virtual address range * @range: range being faulted @@ -1088,7 +1008,6 @@ long hmm_range_fault(struct hmm_range *range, unsigned int flags) range->vma = vma; hmm_vma_walk.pgmap = NULL; hmm_vma_walk.last = start; - hmm_vma_walk.fault = true; hmm_vma_walk.flags = flags; hmm_vma_walk.range = range; mm_walk.private = &hmm_vma_walk; -- cgit v1.2.3 From f527688d5d8a80d2d1b2c02779105747c2f4f705 Mon Sep 17 00:00:00 2001 From: Ralph Campbell Date: Thu, 25 Jul 2019 17:56:49 -0700 Subject: mm/hmm: remove hugetlbfs check in hmm_vma_walk_pmd walk_page_range() will only call hmm_vma_walk_hugetlb_entry() for hugetlbfs pages and doesn't call hmm_vma_walk_pmd() in this case. Therefore, it is safe to remove the check for vma->vm_flags & VM_HUGETLB in hmm_vma_walk_pmd(). Link: https://lore.kernel.org/r/20190726005650.2566-7-rcampbell@nvidia.com Signed-off-by: Ralph Campbell Reviewed-by: Christoph Hellwig Reviewed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe --- mm/hmm.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'mm') diff --git a/mm/hmm.c b/mm/hmm.c index 1bc014cddd78..6111c0a3c12d 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -630,9 +630,6 @@ again: if (pmd_none(pmd)) return hmm_vma_walk_hole(start, end, walk); - if (pmd_huge(pmd) && (range->vma->vm_flags & VM_HUGETLB)) - return hmm_pfns_bad(start, end, walk); - if (thp_migration_supported() && is_pmd_migration_entry(pmd)) { bool fault, write_fault; unsigned long npages; -- cgit v1.2.3 From cc374377a19d2a49d693997b62dc3a6f5fac6d61 Mon Sep 17 00:00:00 2001 From: Ralph Campbell Date: Thu, 25 Jul 2019 17:56:50 -0700 Subject: mm/hmm: remove hmm_range vma Since hmm_range_fault() doesn't use the struct hmm_range vma field, remove it. Link: https://lore.kernel.org/r/20190726005650.2566-8-rcampbell@nvidia.com Suggested-by: Jason Gunthorpe Signed-off-by: Ralph Campbell Reviewed-by: Christoph Hellwig Reviewed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe --- drivers/gpu/drm/nouveau/nouveau_svm.c | 7 +++---- include/linux/hmm.h | 1 - mm/hmm.c | 1 - 3 files changed, 3 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/drivers/gpu/drm/nouveau/nouveau_svm.c b/drivers/gpu/drm/nouveau/nouveau_svm.c index 49b520c60fc5..a74530b5a523 100644 --- a/drivers/gpu/drm/nouveau/nouveau_svm.c +++ b/drivers/gpu/drm/nouveau/nouveau_svm.c @@ -496,12 +496,12 @@ nouveau_range_fault(struct hmm_mirror *mirror, struct hmm_range *range) range->start, range->end, PAGE_SHIFT); if (ret) { - up_read(&range->vma->vm_mm->mmap_sem); + up_read(&range->hmm->mm->mmap_sem); return (int)ret; } if (!hmm_range_wait_until_valid(range, HMM_RANGE_DEFAULT_TIMEOUT)) { - up_read(&range->vma->vm_mm->mmap_sem); + up_read(&range->hmm->mm->mmap_sem); return -EBUSY; } @@ -509,7 +509,7 @@ nouveau_range_fault(struct hmm_mirror *mirror, struct hmm_range *range) if (ret <= 0) { if (ret == 0) ret = -EBUSY; - up_read(&range->vma->vm_mm->mmap_sem); + up_read(&range->hmm->mm->mmap_sem); hmm_range_unregister(range); return ret; } @@ -682,7 +682,6 @@ nouveau_svm_fault(struct nvif_notify *notify) args.i.p.addr + args.i.p.size, fn - fi); /* Have HMM fault pages within the fault window to the GPU. */ - range.vma = vma; range.start = args.i.p.addr; range.end = args.i.p.addr + args.i.p.size; range.pfns = args.phys; diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 90dc5944b1bc..82265118d94a 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -164,7 +164,6 @@ enum hmm_pfn_value_e { */ struct hmm_range { struct hmm *hmm; - struct vm_area_struct *vma; struct list_head list; unsigned long start; unsigned long end; diff --git a/mm/hmm.c b/mm/hmm.c index 6111c0a3c12d..9a908902e4cc 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -1002,7 +1002,6 @@ long hmm_range_fault(struct hmm_range *range, unsigned int flags) return -EPERM; } - range->vma = vma; hmm_vma_walk.pgmap = NULL; hmm_vma_walk.last = start; hmm_vma_walk.flags = flags; -- cgit v1.2.3 From 2cbeb41913e639890bf2c6485d7bdc6a25fdfd56 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 6 Aug 2019 19:05:43 +0300 Subject: mm/hmm: remove the unused vma argument to hmm_range_dma_unmap Link: https://lore.kernel.org/r/20190806160554.14046-6-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe --- include/linux/hmm.h | 1 - mm/hmm.c | 2 -- 2 files changed, 3 deletions(-) (limited to 'mm') diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 82265118d94a..59be0aa2476d 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -422,7 +422,6 @@ long hmm_range_dma_map(struct hmm_range *range, dma_addr_t *daddrs, unsigned int flags); long hmm_range_dma_unmap(struct hmm_range *range, - struct vm_area_struct *vma, struct device *device, dma_addr_t *daddrs, bool dirty); diff --git a/mm/hmm.c b/mm/hmm.c index 9a908902e4cc..160ecb47d04b 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -1129,7 +1129,6 @@ EXPORT_SYMBOL(hmm_range_dma_map); /** * hmm_range_dma_unmap() - unmap range of that was map with hmm_range_dma_map() * @range: range being unmapped - * @vma: the vma against which the range (optional) * @device: device against which dma map was done * @daddrs: dma address of mapped pages * @dirty: dirty page if it had the write flag set @@ -1141,7 +1140,6 @@ EXPORT_SYMBOL(hmm_range_dma_map); * concurrent mmu notifier or sync_cpu_device_pagetables() to make progress. */ long hmm_range_dma_unmap(struct hmm_range *range, - struct vm_area_struct *vma, struct device *device, dma_addr_t *daddrs, bool dirty) -- cgit v1.2.3 From fac555ac93d453a0d2265eef88bf4c249dd63e07 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 6 Aug 2019 19:05:44 +0300 Subject: mm/hmm: remove superfluous arguments from hmm_range_register The start, end and page_shift values are all saved in the range structure, so we might as well use that for argument passing. Link: https://lore.kernel.org/r/20190806160554.14046-7-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Jason Gunthorpe Reviewed-by: Felix Kuehling Signed-off-by: Jason Gunthorpe --- Documentation/vm/hmm.rst | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 7 +++++-- drivers/gpu/drm/nouveau/nouveau_svm.c | 5 ++--- include/linux/hmm.h | 6 +----- mm/hmm.c | 20 +++++--------------- 5 files changed, 14 insertions(+), 26 deletions(-) (limited to 'mm') diff --git a/Documentation/vm/hmm.rst b/Documentation/vm/hmm.rst index ddcb5ca8b296..e63c11f7e0e0 100644 --- a/Documentation/vm/hmm.rst +++ b/Documentation/vm/hmm.rst @@ -222,7 +222,7 @@ The usage pattern is:: range.flags = ...; range.values = ...; range.pfn_shift = ...; - hmm_range_register(&range); + hmm_range_register(&range, mirror); /* * Just wait for range to be valid, safe to ignore return value as we diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c index f0821638bbc6..71d6e7087b0b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c @@ -818,8 +818,11 @@ int amdgpu_ttm_tt_get_user_pages(struct amdgpu_bo *bo, struct page **pages) 0 : range->flags[HMM_PFN_WRITE]; range->pfn_flags_mask = 0; range->pfns = pfns; - hmm_range_register(range, mirror, start, - start + ttm->num_pages * PAGE_SIZE, PAGE_SHIFT); + range->page_shift = PAGE_SHIFT; + range->start = start; + range->end = start + ttm->num_pages * PAGE_SIZE; + + hmm_range_register(range, mirror); /* * Just wait for range to be valid, safe to ignore return value as we diff --git a/drivers/gpu/drm/nouveau/nouveau_svm.c b/drivers/gpu/drm/nouveau/nouveau_svm.c index 98072fd48cf7..41fad4719ac6 100644 --- a/drivers/gpu/drm/nouveau/nouveau_svm.c +++ b/drivers/gpu/drm/nouveau/nouveau_svm.c @@ -492,9 +492,7 @@ nouveau_range_fault(struct nouveau_svmm *svmm, struct hmm_range *range) range->default_flags = 0; range->pfn_flags_mask = -1UL; - ret = hmm_range_register(range, &svmm->mirror, - range->start, range->end, - PAGE_SHIFT); + ret = hmm_range_register(range, &svmm->mirror); if (ret) { up_read(&svmm->mm->mmap_sem); return (int)ret; @@ -682,6 +680,7 @@ nouveau_svm_fault(struct nvif_notify *notify) args.i.p.addr + args.i.p.size, fn - fi); /* Have HMM fault pages within the fault window to the GPU. */ + range.page_shift = PAGE_SHIFT; range.start = args.i.p.addr; range.end = args.i.p.addr + args.i.p.size; range.pfns = args.phys; diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 59be0aa2476d..c5b51376b453 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -400,11 +400,7 @@ void hmm_mirror_unregister(struct hmm_mirror *mirror); /* * Please see Documentation/vm/hmm.rst for how to use the range API. */ -int hmm_range_register(struct hmm_range *range, - struct hmm_mirror *mirror, - unsigned long start, - unsigned long end, - unsigned page_shift); +int hmm_range_register(struct hmm_range *range, struct hmm_mirror *mirror); void hmm_range_unregister(struct hmm_range *range); /* diff --git a/mm/hmm.c b/mm/hmm.c index 160ecb47d04b..ab22fdb655c8 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -850,35 +850,25 @@ static void hmm_pfns_clear(struct hmm_range *range, * hmm_range_register() - start tracking change to CPU page table over a range * @range: range * @mm: the mm struct for the range of virtual address - * @start: start virtual address (inclusive) - * @end: end virtual address (exclusive) - * @page_shift: expect page shift for the range + * * Return: 0 on success, -EFAULT if the address space is no longer valid * * Track updates to the CPU page table see include/linux/hmm.h */ -int hmm_range_register(struct hmm_range *range, - struct hmm_mirror *mirror, - unsigned long start, - unsigned long end, - unsigned page_shift) +int hmm_range_register(struct hmm_range *range, struct hmm_mirror *mirror) { - unsigned long mask = ((1UL << page_shift) - 1UL); + unsigned long mask = ((1UL << range->page_shift) - 1UL); struct hmm *hmm = mirror->hmm; unsigned long flags; range->valid = false; range->hmm = NULL; - if ((start & mask) || (end & mask)) + if ((range->start & mask) || (range->end & mask)) return -EINVAL; - if (start >= end) + if (range->start >= range->end) return -EINVAL; - range->page_shift = page_shift; - range->start = start; - range->end = end; - /* Prevent hmm_release() from running while the range is valid */ if (!mmget_not_zero(hmm->mm)) return -EFAULT; -- cgit v1.2.3 From 7f08263d9bc6627382da14f9e81d643d0329d5d1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 6 Aug 2019 19:05:45 +0300 Subject: mm/hmm: remove the page_shift member from struct hmm_range All users pass PAGE_SIZE here, and if we wanted to support single entries for huge pages we should really just add a HMM_FAULT_HUGEPAGE flag instead that uses the huge page size instead of having the caller calculate that size once, just for the hmm code to verify it. Link: https://lore.kernel.org/r/20190806160554.14046-8-hch@lst.de Signed-off-by: Christoph Hellwig Acked-by: Felix Kuehling Reviewed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe --- drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 1 - drivers/gpu/drm/nouveau/nouveau_svm.c | 1 - include/linux/hmm.h | 22 ----------------- mm/hmm.c | 42 +++++++-------------------------- 4 files changed, 9 insertions(+), 57 deletions(-) (limited to 'mm') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c index 71d6e7087b0b..8bf79288c4e2 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c @@ -818,7 +818,6 @@ int amdgpu_ttm_tt_get_user_pages(struct amdgpu_bo *bo, struct page **pages) 0 : range->flags[HMM_PFN_WRITE]; range->pfn_flags_mask = 0; range->pfns = pfns; - range->page_shift = PAGE_SHIFT; range->start = start; range->end = start + ttm->num_pages * PAGE_SIZE; diff --git a/drivers/gpu/drm/nouveau/nouveau_svm.c b/drivers/gpu/drm/nouveau/nouveau_svm.c index 41fad4719ac6..668d4bd0c118 100644 --- a/drivers/gpu/drm/nouveau/nouveau_svm.c +++ b/drivers/gpu/drm/nouveau/nouveau_svm.c @@ -680,7 +680,6 @@ nouveau_svm_fault(struct nvif_notify *notify) args.i.p.addr + args.i.p.size, fn - fi); /* Have HMM fault pages within the fault window to the GPU. */ - range.page_shift = PAGE_SHIFT; range.start = args.i.p.addr; range.end = args.i.p.addr + args.i.p.size; range.pfns = args.phys; diff --git a/include/linux/hmm.h b/include/linux/hmm.h index c5b51376b453..51e18fbb8953 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -158,7 +158,6 @@ enum hmm_pfn_value_e { * @values: pfn value for some special case (none, special, error, ...) * @default_flags: default flags for the range (write, read, ... see hmm doc) * @pfn_flags_mask: allows to mask pfn flags so that only default_flags matter - * @page_shift: device virtual address shift value (should be >= PAGE_SHIFT) * @pfn_shifts: pfn shift value (should be <= PAGE_SHIFT) * @valid: pfns array did not change since it has been fill by an HMM function */ @@ -172,31 +171,10 @@ struct hmm_range { const uint64_t *values; uint64_t default_flags; uint64_t pfn_flags_mask; - uint8_t page_shift; uint8_t pfn_shift; bool valid; }; -/* - * hmm_range_page_shift() - return the page shift for the range - * @range: range being queried - * Return: page shift (page size = 1 << page shift) for the range - */ -static inline unsigned hmm_range_page_shift(const struct hmm_range *range) -{ - return range->page_shift; -} - -/* - * hmm_range_page_size() - return the page size for the range - * @range: range being queried - * Return: page size for the range in bytes - */ -static inline unsigned long hmm_range_page_size(const struct hmm_range *range) -{ - return 1UL << hmm_range_page_shift(range); -} - /* * hmm_range_wait_until_valid() - wait for range to be valid * @range: range affected by invalidation to wait on diff --git a/mm/hmm.c b/mm/hmm.c index ab22fdb655c8..9e0052e037fe 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -345,13 +345,12 @@ static int hmm_vma_walk_hole_(unsigned long addr, unsigned long end, struct hmm_vma_walk *hmm_vma_walk = walk->private; struct hmm_range *range = hmm_vma_walk->range; uint64_t *pfns = range->pfns; - unsigned long i, page_size; + unsigned long i; hmm_vma_walk->last = addr; - page_size = hmm_range_page_size(range); - i = (addr - range->start) >> range->page_shift; + i = (addr - range->start) >> PAGE_SHIFT; - for (; addr < end; addr += page_size, i++) { + for (; addr < end; addr += PAGE_SIZE, i++) { pfns[i] = range->values[HMM_PFN_NONE]; if (fault || write_fault) { int ret; @@ -779,7 +778,7 @@ static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask, struct mm_walk *walk) { #ifdef CONFIG_HUGETLB_PAGE - unsigned long addr = start, i, pfn, mask, size, pfn_inc; + unsigned long addr = start, i, pfn, mask; struct hmm_vma_walk *hmm_vma_walk = walk->private; struct hmm_range *range = hmm_vma_walk->range; struct vm_area_struct *vma = walk->vma; @@ -790,24 +789,12 @@ static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask, pte_t entry; int ret = 0; - size = huge_page_size(h); - mask = size - 1; - if (range->page_shift != PAGE_SHIFT) { - /* Make sure we are looking at a full page. */ - if (start & mask) - return -EINVAL; - if (end < (start + size)) - return -EINVAL; - pfn_inc = size >> PAGE_SHIFT; - } else { - pfn_inc = 1; - size = PAGE_SIZE; - } + mask = huge_page_size(h) - 1; ptl = huge_pte_lock(hstate_vma(vma), walk->mm, pte); entry = huge_ptep_get(pte); - i = (start - range->start) >> range->page_shift; + i = (start - range->start) >> PAGE_SHIFT; orig_pfn = range->pfns[i]; range->pfns[i] = range->values[HMM_PFN_NONE]; cpu_flags = pte_to_hmm_pfn_flags(range, entry); @@ -819,8 +806,8 @@ static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask, goto unlock; } - pfn = pte_pfn(entry) + ((start & mask) >> range->page_shift); - for (; addr < end; addr += size, i++, pfn += pfn_inc) + pfn = pte_pfn(entry) + ((start & mask) >> PAGE_SHIFT); + for (; addr < end; addr += PAGE_SIZE, i++, pfn++) range->pfns[i] = hmm_device_entry_from_pfn(range, pfn) | cpu_flags; hmm_vma_walk->last = end; @@ -857,14 +844,13 @@ static void hmm_pfns_clear(struct hmm_range *range, */ int hmm_range_register(struct hmm_range *range, struct hmm_mirror *mirror) { - unsigned long mask = ((1UL << range->page_shift) - 1UL); struct hmm *hmm = mirror->hmm; unsigned long flags; range->valid = false; range->hmm = NULL; - if ((range->start & mask) || (range->end & mask)) + if ((range->start & (PAGE_SIZE - 1)) || (range->end & (PAGE_SIZE - 1))) return -EINVAL; if (range->start >= range->end) return -EINVAL; @@ -971,16 +957,6 @@ long hmm_range_fault(struct hmm_range *range, unsigned int flags) if (vma == NULL || (vma->vm_flags & device_vma)) return -EFAULT; - if (is_vm_hugetlb_page(vma)) { - if (huge_page_shift(hstate_vma(vma)) != - range->page_shift && - range->page_shift != PAGE_SHIFT) - return -EINVAL; - } else { - if (range->page_shift != PAGE_SHIFT) - return -EINVAL; - } - if (!(vma->vm_flags & VM_READ)) { /* * If vma do not allow read access, then assume that it -- cgit v1.2.3 From 05c23af4a1b34df5838ebab2da1d6f802cf5ece3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 6 Aug 2019 19:05:46 +0300 Subject: mm/hmm: remove the mask variable in hmm_vma_walk_hugetlb_entry The pagewalk code already passes the value as the hmask parameter. Link: https://lore.kernel.org/r/20190806160554.14046-9-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe --- mm/hmm.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/hmm.c b/mm/hmm.c index 9e0052e037fe..4807f4b16736 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -778,19 +778,16 @@ static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask, struct mm_walk *walk) { #ifdef CONFIG_HUGETLB_PAGE - unsigned long addr = start, i, pfn, mask; + unsigned long addr = start, i, pfn; struct hmm_vma_walk *hmm_vma_walk = walk->private; struct hmm_range *range = hmm_vma_walk->range; struct vm_area_struct *vma = walk->vma; - struct hstate *h = hstate_vma(vma); uint64_t orig_pfn, cpu_flags; bool fault, write_fault; spinlock_t *ptl; pte_t entry; int ret = 0; - mask = huge_page_size(h) - 1; - ptl = huge_pte_lock(hstate_vma(vma), walk->mm, pte); entry = huge_ptep_get(pte); @@ -806,7 +803,7 @@ static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask, goto unlock; } - pfn = pte_pfn(entry) + ((start & mask) >> PAGE_SHIFT); + pfn = pte_pfn(entry) + ((start & ~hmask) >> PAGE_SHIFT); for (; addr < end; addr += PAGE_SIZE, i++, pfn++) range->pfns[i] = hmm_device_entry_from_pfn(range, pfn) | cpu_flags; -- cgit v1.2.3 From 309f9a4f5e1a233d5df2101b9394ee689d9e463f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 6 Aug 2019 19:05:47 +0300 Subject: mm/hmm: don't abuse pte_index() in hmm_vma_handle_pmd pte_index is an internal arch helper in various architectures, without consistent semantics. Open code that calculation of a PMD index based on the virtual address instead. Link: https://lore.kernel.org/r/20190806160554.14046-10-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe --- mm/hmm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/hmm.c b/mm/hmm.c index 4807f4b16736..8d56a4342624 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -486,7 +486,7 @@ static int hmm_vma_handle_pmd(struct mm_walk *walk, if (pmd_protnone(pmd) || fault || write_fault) return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk); - pfn = pmd_pfn(pmd) + pte_index(addr); + pfn = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++) { if (pmd_devmap(pmd)) { hmm_vma_walk->pgmap = get_dev_pagemap(pfn, -- cgit v1.2.3 From f0b3c45c8931fd7448a638557752f2743f76f51a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 6 Aug 2019 19:05:48 +0300 Subject: mm/hmm: only define hmm_vma_walk_pud if needed We only need the special pud_entry walker if PUD-sized hugepages and pte mappings are supported, else the common pagewalk code will take care of the iteration. Not implementing this callback reduced the amount of code compiled for non-x86 platforms, and also fixes compile failures with other architectures when helpers like pud_pfn are not implemented. Link: https://lore.kernel.org/r/20190806160554.14046-11-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe --- mm/hmm.c | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) (limited to 'mm') diff --git a/mm/hmm.c b/mm/hmm.c index 8d56a4342624..fb1306409258 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -456,15 +456,6 @@ static inline uint64_t pmd_to_hmm_pfn_flags(struct hmm_range *range, pmd_t pmd) range->flags[HMM_PFN_VALID]; } -static inline uint64_t pud_to_hmm_pfn_flags(struct hmm_range *range, pud_t pud) -{ - if (!pud_present(pud)) - return 0; - return pud_write(pud) ? range->flags[HMM_PFN_VALID] | - range->flags[HMM_PFN_WRITE] : - range->flags[HMM_PFN_VALID]; -} - static int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr, unsigned long end, @@ -705,10 +696,19 @@ again: return 0; } -static int hmm_vma_walk_pud(pud_t *pudp, - unsigned long start, - unsigned long end, - struct mm_walk *walk) +#if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && \ + defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) +static inline uint64_t pud_to_hmm_pfn_flags(struct hmm_range *range, pud_t pud) +{ + if (!pud_present(pud)) + return 0; + return pud_write(pud) ? range->flags[HMM_PFN_VALID] | + range->flags[HMM_PFN_WRITE] : + range->flags[HMM_PFN_VALID]; +} + +static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end, + struct mm_walk *walk) { struct hmm_vma_walk *hmm_vma_walk = walk->private; struct hmm_range *range = hmm_vma_walk->range; @@ -772,6 +772,9 @@ again: return 0; } +#else +#define hmm_vma_walk_pud NULL +#endif static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask, unsigned long start, unsigned long end, -- cgit v1.2.3 From 9d3973d60f0abd3985229bd895f45e2b8974344c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 6 Aug 2019 19:05:49 +0300 Subject: mm/hmm: cleanup the hmm_vma_handle_pmd stub Stub out the whole function when CONFIG_TRANSPARENT_HUGEPAGE is not set to make the function easier to read. Link: https://lore.kernel.org/r/20190806160554.14046-12-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe --- mm/hmm.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/mm/hmm.c b/mm/hmm.c index fb1306409258..900cb4837315 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -456,13 +456,10 @@ static inline uint64_t pmd_to_hmm_pfn_flags(struct hmm_range *range, pmd_t pmd) range->flags[HMM_PFN_VALID]; } -static int hmm_vma_handle_pmd(struct mm_walk *walk, - unsigned long addr, - unsigned long end, - uint64_t *pfns, - pmd_t pmd) -{ #ifdef CONFIG_TRANSPARENT_HUGEPAGE +static int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr, + unsigned long end, uint64_t *pfns, pmd_t pmd) +{ struct hmm_vma_walk *hmm_vma_walk = walk->private; struct hmm_range *range = hmm_vma_walk->range; unsigned long pfn, npages, i; @@ -493,11 +490,12 @@ static int hmm_vma_handle_pmd(struct mm_walk *walk, } hmm_vma_walk->last = end; return 0; -#else - /* If THP is not enabled then we should never reach this code ! */ - return -EINVAL; -#endif } +#else /* CONFIG_TRANSPARENT_HUGEPAGE */ +/* stub to allow the code below to compile */ +int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr, + unsigned long end, uint64_t *pfns, pmd_t pmd); +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ static inline uint64_t pte_to_hmm_pfn_flags(struct hmm_range *range, pte_t pte) { -- cgit v1.2.3 From 251bbe59b7a6362f4c417aa8af281d44f9edfa0e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 6 Aug 2019 19:05:50 +0300 Subject: mm/hmm: cleanup the hmm_vma_walk_hugetlb_entry stub Stub out the whole function and assign NULL to the .hugetlb_entry method if CONFIG_HUGETLB_PAGE is not set, as the method won't ever be called in that case. Link: https://lore.kernel.org/r/20190806160554.14046-13-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe --- mm/hmm.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/hmm.c b/mm/hmm.c index 900cb4837315..ef553e664061 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -774,11 +774,11 @@ again: #define hmm_vma_walk_pud NULL #endif +#ifdef CONFIG_HUGETLB_PAGE static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask, unsigned long start, unsigned long end, struct mm_walk *walk) { -#ifdef CONFIG_HUGETLB_PAGE unsigned long addr = start, i, pfn; struct hmm_vma_walk *hmm_vma_walk = walk->private; struct hmm_range *range = hmm_vma_walk->range; @@ -817,10 +817,10 @@ unlock: return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk); return ret; -#else /* CONFIG_HUGETLB_PAGE */ - return -EINVAL; -#endif } +#else +#define hmm_vma_walk_hugetlb_entry NULL +#endif /* CONFIG_HUGETLB_PAGE */ static void hmm_pfns_clear(struct hmm_range *range, uint64_t *pfns, -- cgit v1.2.3 From f442c283efe4bda53c8e52b230159d020ea11c8e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 6 Aug 2019 19:05:51 +0300 Subject: mm/hmm: allow HMM_MIRROR on all architectures with MMU There isn't really any architecture specific code in this page table walk implementation, so drop the dependencies. Link: https://lore.kernel.org/r/20190806160554.14046-14-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe --- mm/Kconfig | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/Kconfig b/mm/Kconfig index 56cec636a1fc..b18782be969c 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -677,8 +677,7 @@ config DEV_PAGEMAP_OPS config HMM_MIRROR bool "HMM mirror CPU page table into a device page table" - depends on (X86_64 || PPC64) - depends on MMU && 64BIT + depends on MMU select MMU_NOTIFIER help Select HMM_MIRROR if you want to mirror range of the CPU page table of a -- cgit v1.2.3 From 9c240a7bb33787d4fed56508425ae0b2936b1674 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 6 Aug 2019 19:05:52 +0300 Subject: mm/hmm: make HMM_MIRROR an implicit option Make HMM_MIRROR an option that is selected by drivers wanting to use it instead of a user visible option as it is just a low-level implementation detail. Link: https://lore.kernel.org/r/20190806160554.14046-15-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe --- drivers/gpu/drm/amd/amdgpu/Kconfig | 4 +++- drivers/gpu/drm/nouveau/Kconfig | 4 +++- mm/Kconfig | 14 ++++++-------- 3 files changed, 12 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/drivers/gpu/drm/amd/amdgpu/Kconfig b/drivers/gpu/drm/amd/amdgpu/Kconfig index f6e5c0282fc1..2e98c016cb47 100644 --- a/drivers/gpu/drm/amd/amdgpu/Kconfig +++ b/drivers/gpu/drm/amd/amdgpu/Kconfig @@ -27,7 +27,9 @@ config DRM_AMDGPU_CIK config DRM_AMDGPU_USERPTR bool "Always enable userptr write support" depends on DRM_AMDGPU - depends on HMM_MIRROR + depends on MMU + select HMM_MIRROR + select MMU_NOTIFIER help This option selects CONFIG_HMM and CONFIG_HMM_MIRROR if it isn't already selected to enabled full userptr support. diff --git a/drivers/gpu/drm/nouveau/Kconfig b/drivers/gpu/drm/nouveau/Kconfig index 96b9814e6d06..df4352c279ba 100644 --- a/drivers/gpu/drm/nouveau/Kconfig +++ b/drivers/gpu/drm/nouveau/Kconfig @@ -86,9 +86,11 @@ config DRM_NOUVEAU_SVM bool "(EXPERIMENTAL) Enable SVM (Shared Virtual Memory) support" depends on DEVICE_PRIVATE depends on DRM_NOUVEAU - depends on HMM_MIRROR + depends on MMU depends on STAGING + select HMM_MIRROR select MIGRATE_VMA_HELPER + select MMU_NOTIFIER default n help Say Y here if you want to enable experimental support for diff --git a/mm/Kconfig b/mm/Kconfig index b18782be969c..563436dc1f24 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -675,16 +675,14 @@ config MIGRATE_VMA_HELPER config DEV_PAGEMAP_OPS bool +# +# Helpers to mirror range of the CPU page tables of a process into device page +# tables. +# config HMM_MIRROR - bool "HMM mirror CPU page table into a device page table" + bool depends on MMU - select MMU_NOTIFIER - help - Select HMM_MIRROR if you want to mirror range of the CPU page table of a - process into a device page table. Here, mirror means "keep synchronized". - Prerequisites: the device must provide the ability to write-protect its - page tables (at PAGE_SIZE granularity), and must be able to recover from - the resulting potential page faults. + depends on MMU_NOTIFIER config DEVICE_PRIVATE bool "Unaddressable device memory (GPU memory, ...)" -- cgit v1.2.3 From 56c57103db17db9ecdad0507a3f0e3eea747fabe Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 6 Aug 2019 20:15:38 -0300 Subject: mm/mmu_notifiers: hoist do_mmu_notifier_register down_write to the caller This simplifies the code to not have so many one line functions and extra logic. __mmu_notifier_register() simply becomes the entry point to register the notifier, and the other one calls it under lock. Also add a lockdep_assert to check that the callers are holding the lock as expected. Link: https://lore.kernel.org/r/20190806231548.25242-2-jgg@ziepe.ca Suggested-by: Christoph Hellwig Reviewed-by: Christoph Hellwig Reviewed-by: Ralph Campbell Tested-by: Ralph Campbell Signed-off-by: Jason Gunthorpe --- mm/mmu_notifier.c | 35 ++++++++++++++--------------------- 1 file changed, 14 insertions(+), 21 deletions(-) (limited to 'mm') diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index b5670620aea0..218a6f108bc2 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -236,22 +236,22 @@ void __mmu_notifier_invalidate_range(struct mm_struct *mm, } EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range); -static int do_mmu_notifier_register(struct mmu_notifier *mn, - struct mm_struct *mm, - int take_mmap_sem) +/* + * Same as mmu_notifier_register but here the caller must hold the + * mmap_sem in write mode. + */ +int __mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm) { struct mmu_notifier_mm *mmu_notifier_mm; int ret; + lockdep_assert_held_write(&mm->mmap_sem); BUG_ON(atomic_read(&mm->mm_users) <= 0); - ret = -ENOMEM; mmu_notifier_mm = kmalloc(sizeof(struct mmu_notifier_mm), GFP_KERNEL); if (unlikely(!mmu_notifier_mm)) - goto out; + return -ENOMEM; - if (take_mmap_sem) - down_write(&mm->mmap_sem); ret = mm_take_all_locks(mm); if (unlikely(ret)) goto out_clean; @@ -279,13 +279,11 @@ static int do_mmu_notifier_register(struct mmu_notifier *mn, mm_drop_all_locks(mm); out_clean: - if (take_mmap_sem) - up_write(&mm->mmap_sem); kfree(mmu_notifier_mm); -out: BUG_ON(atomic_read(&mm->mm_users) <= 0); return ret; } +EXPORT_SYMBOL_GPL(__mmu_notifier_register); /* * Must not hold mmap_sem nor any other VM related lock when calling @@ -302,19 +300,14 @@ out: */ int mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm) { - return do_mmu_notifier_register(mn, mm, 1); -} -EXPORT_SYMBOL_GPL(mmu_notifier_register); + int ret; -/* - * Same as mmu_notifier_register but here the caller must hold the - * mmap_sem in write mode. - */ -int __mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm) -{ - return do_mmu_notifier_register(mn, mm, 0); + down_write(&mm->mmap_sem); + ret = __mmu_notifier_register(mn, mm); + up_write(&mm->mmap_sem); + return ret; } -EXPORT_SYMBOL_GPL(__mmu_notifier_register); +EXPORT_SYMBOL_GPL(mmu_notifier_register); /* this is called after the last mmu_notifier_unregister() returned */ void __mmu_notifier_mm_destroy(struct mm_struct *mm) -- cgit v1.2.3 From 70df291bf81ffda47ff84e6e2da4fbe21f95a861 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 6 Aug 2019 20:15:39 -0300 Subject: mm/mmu_notifiers: do not speculatively allocate a mmu_notifier_mm A prior commit e0f3c3f78da2 ("mm/mmu_notifier: init notifier if necessary") made an attempt at doing this, but had to be reverted as calling the GFP_KERNEL allocator under the i_mmap_mutex causes deadlock, see commit 35cfa2b0b491 ("mm/mmu_notifier: allocate mmu_notifier in advance"). However, we can avoid that problem by doing the allocation only under the mmap_sem, which is already happening. Since all writers to mm->mmu_notifier_mm hold the write side of the mmap_sem reading it under that sem is deterministic and we can use that to decide if the allocation path is required, without speculation. The actual update to mmu_notifier_mm must still be done under the mm_take_all_locks() to ensure read-side coherency. Link: https://lore.kernel.org/r/20190806231548.25242-3-jgg@ziepe.ca Reviewed-by: Christoph Hellwig Reviewed-by: Ralph Campbell Tested-by: Ralph Campbell Signed-off-by: Jason Gunthorpe --- mm/mmu_notifier.c | 34 ++++++++++++++++++++++------------ 1 file changed, 22 insertions(+), 12 deletions(-) (limited to 'mm') diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 218a6f108bc2..696810f632ad 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -242,27 +242,32 @@ EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range); */ int __mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm) { - struct mmu_notifier_mm *mmu_notifier_mm; + struct mmu_notifier_mm *mmu_notifier_mm = NULL; int ret; lockdep_assert_held_write(&mm->mmap_sem); BUG_ON(atomic_read(&mm->mm_users) <= 0); - mmu_notifier_mm = kmalloc(sizeof(struct mmu_notifier_mm), GFP_KERNEL); - if (unlikely(!mmu_notifier_mm)) - return -ENOMEM; + if (!mm->mmu_notifier_mm) { + /* + * kmalloc cannot be called under mm_take_all_locks(), but we + * know that mm->mmu_notifier_mm can't change while we hold + * the write side of the mmap_sem. + */ + mmu_notifier_mm = + kmalloc(sizeof(struct mmu_notifier_mm), GFP_KERNEL); + if (!mmu_notifier_mm) + return -ENOMEM; + + INIT_HLIST_HEAD(&mmu_notifier_mm->list); + spin_lock_init(&mmu_notifier_mm->lock); + } ret = mm_take_all_locks(mm); if (unlikely(ret)) goto out_clean; - if (!mm_has_notifiers(mm)) { - INIT_HLIST_HEAD(&mmu_notifier_mm->list); - spin_lock_init(&mmu_notifier_mm->lock); - - mm->mmu_notifier_mm = mmu_notifier_mm; - mmu_notifier_mm = NULL; - } + /* Pairs with the mmdrop in mmu_notifier_unregister_* */ mmgrab(mm); /* @@ -273,14 +278,19 @@ int __mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm) * We can't race against any other mmu notifier method either * thanks to mm_take_all_locks(). */ + if (mmu_notifier_mm) + mm->mmu_notifier_mm = mmu_notifier_mm; + spin_lock(&mm->mmu_notifier_mm->lock); hlist_add_head_rcu(&mn->hlist, &mm->mmu_notifier_mm->list); spin_unlock(&mm->mmu_notifier_mm->lock); mm_drop_all_locks(mm); + BUG_ON(atomic_read(&mm->mm_users) <= 0); + return 0; + out_clean: kfree(mmu_notifier_mm); - BUG_ON(atomic_read(&mm->mm_users) <= 0); return ret; } EXPORT_SYMBOL_GPL(__mmu_notifier_register); -- cgit v1.2.3 From 2c7933f53f6bff7656e3324ca1a04e478bdc57c1 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 6 Aug 2019 20:15:40 -0300 Subject: mm/mmu_notifiers: add a get/put scheme for the registration Many places in the kernel have a flow where userspace will create some object and that object will need to connect to the subsystem's mmu_notifier subscription for the duration of its lifetime. In this case the subsystem is usually tracking multiple mm_structs and it is difficult to keep track of what struct mmu_notifier's have been allocated for what mm's. Since this has been open coded in a variety of exciting ways, provide core functionality to do this safely. This approach uses the struct mmu_notifier_ops * as a key to determine if the subsystem has a notifier registered on the mm or not. If there is a registration then the existing notifier struct is returned, otherwise the ops->alloc_notifiers() is used to create a new per-subsystem notifier for the mm. The destroy side incorporates an async call_srcu based destruction which will avoid bugs in the callers such as commit 6d7c3cde93c1 ("mm/hmm: fix use after free with struct hmm in the mmu notifiers"). Since we are inside the mmu notifier core locking is fairly simple, the allocation uses the same approach as for mmu_notifier_mm, the write side of the mmap_sem makes everything deterministic and we only need to do hlist_add_head_rcu() under the mm_take_all_locks(). The new users count and the discoverability in the hlist is fully serialized by the mmu_notifier_mm->lock. Link: https://lore.kernel.org/r/20190806231548.25242-4-jgg@ziepe.ca Co-developed-by: Christoph Hellwig Signed-off-by: Christoph Hellwig Reviewed-by: Ralph Campbell Tested-by: Ralph Campbell Signed-off-by: Jason Gunthorpe --- include/linux/mmu_notifier.h | 35 ++++++++++ mm/mmu_notifier.c | 156 +++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 185 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index b6c004bd9f6a..31aa971315a1 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -211,6 +211,19 @@ struct mmu_notifier_ops { */ void (*invalidate_range)(struct mmu_notifier *mn, struct mm_struct *mm, unsigned long start, unsigned long end); + + /* + * These callbacks are used with the get/put interface to manage the + * lifetime of the mmu_notifier memory. alloc_notifier() returns a new + * notifier for use with the mm. + * + * free_notifier() is only called after the mmu_notifier has been + * fully put, calls to any ops callback are prevented and no ops + * callbacks are currently running. It is called from a SRCU callback + * and cannot sleep. + */ + struct mmu_notifier *(*alloc_notifier)(struct mm_struct *mm); + void (*free_notifier)(struct mmu_notifier *mn); }; /* @@ -227,6 +240,9 @@ struct mmu_notifier_ops { struct mmu_notifier { struct hlist_node hlist; const struct mmu_notifier_ops *ops; + struct mm_struct *mm; + struct rcu_head rcu; + unsigned int users; }; static inline int mm_has_notifiers(struct mm_struct *mm) @@ -234,6 +250,21 @@ static inline int mm_has_notifiers(struct mm_struct *mm) return unlikely(mm->mmu_notifier_mm); } +struct mmu_notifier *mmu_notifier_get_locked(const struct mmu_notifier_ops *ops, + struct mm_struct *mm); +static inline struct mmu_notifier * +mmu_notifier_get(const struct mmu_notifier_ops *ops, struct mm_struct *mm) +{ + struct mmu_notifier *ret; + + down_write(&mm->mmap_sem); + ret = mmu_notifier_get_locked(ops, mm); + up_write(&mm->mmap_sem); + return ret; +} +void mmu_notifier_put(struct mmu_notifier *mn); +void mmu_notifier_synchronize(void); + extern int mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm); extern int __mmu_notifier_register(struct mmu_notifier *mn, @@ -581,6 +612,10 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm) #define pudp_huge_clear_flush_notify pudp_huge_clear_flush #define set_pte_at_notify set_pte_at +static inline void mmu_notifier_synchronize(void) +{ +} + #endif /* CONFIG_MMU_NOTIFIER */ #endif /* _LINUX_MMU_NOTIFIER_H */ diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 696810f632ad..9e92ec8006fc 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -248,6 +248,9 @@ int __mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm) lockdep_assert_held_write(&mm->mmap_sem); BUG_ON(atomic_read(&mm->mm_users) <= 0); + mn->mm = mm; + mn->users = 1; + if (!mm->mmu_notifier_mm) { /* * kmalloc cannot be called under mm_take_all_locks(), but we @@ -295,18 +298,24 @@ out_clean: } EXPORT_SYMBOL_GPL(__mmu_notifier_register); -/* +/** + * mmu_notifier_register - Register a notifier on a mm + * @mn: The notifier to attach + * @mm: The mm to attach the notifier to + * * Must not hold mmap_sem nor any other VM related lock when calling * this registration function. Must also ensure mm_users can't go down * to zero while this runs to avoid races with mmu_notifier_release, * so mm has to be current->mm or the mm should be pinned safely such * as with get_task_mm(). If the mm is not current->mm, the mm_users * pin should be released by calling mmput after mmu_notifier_register - * returns. mmu_notifier_unregister must be always called to - * unregister the notifier. mm_count is automatically pinned to allow - * mmu_notifier_unregister to safely run at any time later, before or - * after exit_mmap. ->release will always be called before exit_mmap - * frees the pages. + * returns. + * + * mmu_notifier_unregister() or mmu_notifier_put() must be always called to + * unregister the notifier. + * + * While the caller has a mmu_notifier get the mn->mm pointer will remain + * valid, and can be converted to an active mm pointer via mmget_not_zero(). */ int mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm) { @@ -319,6 +328,72 @@ int mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm) } EXPORT_SYMBOL_GPL(mmu_notifier_register); +static struct mmu_notifier * +find_get_mmu_notifier(struct mm_struct *mm, const struct mmu_notifier_ops *ops) +{ + struct mmu_notifier *mn; + + spin_lock(&mm->mmu_notifier_mm->lock); + hlist_for_each_entry_rcu (mn, &mm->mmu_notifier_mm->list, hlist) { + if (mn->ops != ops) + continue; + + if (likely(mn->users != UINT_MAX)) + mn->users++; + else + mn = ERR_PTR(-EOVERFLOW); + spin_unlock(&mm->mmu_notifier_mm->lock); + return mn; + } + spin_unlock(&mm->mmu_notifier_mm->lock); + return NULL; +} + +/** + * mmu_notifier_get_locked - Return the single struct mmu_notifier for + * the mm & ops + * @ops: The operations struct being subscribe with + * @mm : The mm to attach notifiers too + * + * This function either allocates a new mmu_notifier via + * ops->alloc_notifier(), or returns an already existing notifier on the + * list. The value of the ops pointer is used to determine when two notifiers + * are the same. + * + * Each call to mmu_notifier_get() must be paired with a call to + * mmu_notifier_put(). The caller must hold the write side of mm->mmap_sem. + * + * While the caller has a mmu_notifier get the mm pointer will remain valid, + * and can be converted to an active mm pointer via mmget_not_zero(). + */ +struct mmu_notifier *mmu_notifier_get_locked(const struct mmu_notifier_ops *ops, + struct mm_struct *mm) +{ + struct mmu_notifier *mn; + int ret; + + lockdep_assert_held_write(&mm->mmap_sem); + + if (mm->mmu_notifier_mm) { + mn = find_get_mmu_notifier(mm, ops); + if (mn) + return mn; + } + + mn = ops->alloc_notifier(mm); + if (IS_ERR(mn)) + return mn; + mn->ops = ops; + ret = __mmu_notifier_register(mn, mm); + if (ret) + goto out_free; + return mn; +out_free: + mn->ops->free_notifier(mn); + return ERR_PTR(ret); +} +EXPORT_SYMBOL_GPL(mmu_notifier_get_locked); + /* this is called after the last mmu_notifier_unregister() returned */ void __mmu_notifier_mm_destroy(struct mm_struct *mm) { @@ -397,6 +472,75 @@ void mmu_notifier_unregister_no_release(struct mmu_notifier *mn, } EXPORT_SYMBOL_GPL(mmu_notifier_unregister_no_release); +static void mmu_notifier_free_rcu(struct rcu_head *rcu) +{ + struct mmu_notifier *mn = container_of(rcu, struct mmu_notifier, rcu); + struct mm_struct *mm = mn->mm; + + mn->ops->free_notifier(mn); + /* Pairs with the get in __mmu_notifier_register() */ + mmdrop(mm); +} + +/** + * mmu_notifier_put - Release the reference on the notifier + * @mn: The notifier to act on + * + * This function must be paired with each mmu_notifier_get(), it releases the + * reference obtained by the get. If this is the last reference then process + * to free the notifier will be run asynchronously. + * + * Unlike mmu_notifier_unregister() the get/put flow only calls ops->release + * when the mm_struct is destroyed. Instead free_notifier is always called to + * release any resources held by the user. + * + * As ops->release is not guaranteed to be called, the user must ensure that + * all sptes are dropped, and no new sptes can be established before + * mmu_notifier_put() is called. + * + * This function can be called from the ops->release callback, however the + * caller must still ensure it is called pairwise with mmu_notifier_get(). + * + * Modules calling this function must call mmu_notifier_synchronize() in + * their __exit functions to ensure the async work is completed. + */ +void mmu_notifier_put(struct mmu_notifier *mn) +{ + struct mm_struct *mm = mn->mm; + + spin_lock(&mm->mmu_notifier_mm->lock); + if (WARN_ON(!mn->users) || --mn->users) + goto out_unlock; + hlist_del_init_rcu(&mn->hlist); + spin_unlock(&mm->mmu_notifier_mm->lock); + + call_srcu(&srcu, &mn->rcu, mmu_notifier_free_rcu); + return; + +out_unlock: + spin_unlock(&mm->mmu_notifier_mm->lock); +} +EXPORT_SYMBOL_GPL(mmu_notifier_put); + +/** + * mmu_notifier_synchronize - Ensure all mmu_notifiers are freed + * + * This function ensures that all outstanding async SRU work from + * mmu_notifier_put() is completed. After it returns any mmu_notifier_ops + * associated with an unused mmu_notifier will no longer be called. + * + * Before using the caller must ensure that all of its mmu_notifiers have been + * fully released via mmu_notifier_put(). + * + * Modules using the mmu_notifier_put() API should call this in their __exit + * function to avoid module unloading races. + */ +void mmu_notifier_synchronize(void) +{ + synchronize_srcu(&srcu); +} +EXPORT_SYMBOL_GPL(mmu_notifier_synchronize); + bool mmu_notifier_range_update_to_read_only(const struct mmu_notifier_range *range) { -- cgit v1.2.3 From c7d8b7824ff9de866a356e1892dbe9f191aa5d06 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 6 Aug 2019 20:15:42 -0300 Subject: hmm: use mmu_notifier_get/put for 'struct hmm' This is a significant simplification, it eliminates all the remaining 'hmm' stuff in mm_struct, eliminates krefing along the critical notifier paths, and takes away all the ugly locking and abuse of page_table_lock. mmu_notifier_get() provides the single struct hmm per struct mm which eliminates mm->hmm. It also directly guarantees that no mmu_notifier op callback is callable while concurrent free is possible, this eliminates all the krefs inside the mmu_notifier callbacks. The remaining krefs in the range code were overly cautious, drivers are already not permitted to free the mirror while a range exists. Link: https://lore.kernel.org/r/20190806231548.25242-6-jgg@ziepe.ca Reviewed-by: Christoph Hellwig Reviewed-by: Ralph Campbell Tested-by: Ralph Campbell Signed-off-by: Jason Gunthorpe --- drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 2 + drivers/gpu/drm/nouveau/nouveau_drm.c | 3 + include/linux/hmm.h | 12 +--- include/linux/mm_types.h | 6 -- kernel/fork.c | 1 - mm/hmm.c | 121 ++++++++------------------------ 6 files changed, 34 insertions(+), 111 deletions(-) (limited to 'mm') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index f2e8b4238efd..abc233088201 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -35,6 +35,7 @@ #include #include #include +#include #include "amdgpu.h" #include "amdgpu_irq.h" @@ -1464,6 +1465,7 @@ static void __exit amdgpu_exit(void) amdgpu_unregister_atpx_handler(); amdgpu_sync_fini(); amdgpu_fence_slab_fini(); + mmu_notifier_synchronize(); } module_init(amdgpu_init); diff --git a/drivers/gpu/drm/nouveau/nouveau_drm.c b/drivers/gpu/drm/nouveau/nouveau_drm.c index 7c2fcaba42d6..a0e48a482452 100644 --- a/drivers/gpu/drm/nouveau/nouveau_drm.c +++ b/drivers/gpu/drm/nouveau/nouveau_drm.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -1292,6 +1293,8 @@ nouveau_drm_exit(void) #ifdef CONFIG_NOUVEAU_PLATFORM_DRIVER platform_driver_unregister(&nouveau_platform_driver); #endif + if (IS_ENABLED(CONFIG_DRM_NOUVEAU_SVM)) + mmu_notifier_synchronize(); } module_init(nouveau_drm_init); diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 51e18fbb8953..3fec513b9c00 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -84,15 +84,12 @@ * @notifiers: count of active mmu notifiers */ struct hmm { - struct mm_struct *mm; - struct kref kref; + struct mmu_notifier mmu_notifier; spinlock_t ranges_lock; struct list_head ranges; struct list_head mirrors; - struct mmu_notifier mmu_notifier; struct rw_semaphore mirrors_sem; wait_queue_head_t wq; - struct rcu_head rcu; long notifiers; }; @@ -409,13 +406,6 @@ long hmm_range_dma_unmap(struct hmm_range *range, */ #define HMM_RANGE_DEFAULT_TIMEOUT 1000 -/* Below are for HMM internal use only! Not to be used by device driver! */ -static inline void hmm_mm_init(struct mm_struct *mm) -{ - mm->hmm = NULL; -} -#else /* IS_ENABLED(CONFIG_HMM_MIRROR) */ -static inline void hmm_mm_init(struct mm_struct *mm) {} #endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */ #endif /* LINUX_HMM_H */ diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 3a37a89eb7a7..525d25d93330 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -25,7 +25,6 @@ struct address_space; struct mem_cgroup; -struct hmm; /* * Each physical page in the system has a struct page associated with @@ -502,11 +501,6 @@ struct mm_struct { atomic_long_t hugetlb_usage; #endif struct work_struct async_put_work; - -#ifdef CONFIG_HMM_MIRROR - /* HMM needs to track a few things per mm */ - struct hmm *hmm; -#endif } __randomize_layout; /* diff --git a/kernel/fork.c b/kernel/fork.c index d8ae0f1b4148..bd4a0762f12f 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1007,7 +1007,6 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, mm_init_owner(mm, p); RCU_INIT_POINTER(mm->exe_file, NULL); mmu_notifier_mm_init(mm); - hmm_mm_init(mm); init_tlb_flush_pending(mm); #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS mm->pmd_huge_pte = NULL; diff --git a/mm/hmm.c b/mm/hmm.c index ef553e664061..49eace16f9f8 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -26,101 +26,37 @@ #include #include -static const struct mmu_notifier_ops hmm_mmu_notifier_ops; - -/** - * hmm_get_or_create - register HMM against an mm (HMM internal) - * - * @mm: mm struct to attach to - * Return: an HMM object, either by referencing the existing - * (per-process) object, or by creating a new one. - * - * This is not intended to be used directly by device drivers. If mm already - * has an HMM struct then it get a reference on it and returns it. Otherwise - * it allocates an HMM struct, initializes it, associate it with the mm and - * returns it. - */ -static struct hmm *hmm_get_or_create(struct mm_struct *mm) +static struct mmu_notifier *hmm_alloc_notifier(struct mm_struct *mm) { struct hmm *hmm; - lockdep_assert_held_write(&mm->mmap_sem); - - /* Abuse the page_table_lock to also protect mm->hmm. */ - spin_lock(&mm->page_table_lock); - hmm = mm->hmm; - if (mm->hmm && kref_get_unless_zero(&mm->hmm->kref)) - goto out_unlock; - spin_unlock(&mm->page_table_lock); - - hmm = kmalloc(sizeof(*hmm), GFP_KERNEL); + hmm = kzalloc(sizeof(*hmm), GFP_KERNEL); if (!hmm) - return NULL; + return ERR_PTR(-ENOMEM); + init_waitqueue_head(&hmm->wq); INIT_LIST_HEAD(&hmm->mirrors); init_rwsem(&hmm->mirrors_sem); - hmm->mmu_notifier.ops = NULL; INIT_LIST_HEAD(&hmm->ranges); spin_lock_init(&hmm->ranges_lock); - kref_init(&hmm->kref); hmm->notifiers = 0; - hmm->mm = mm; - - hmm->mmu_notifier.ops = &hmm_mmu_notifier_ops; - if (__mmu_notifier_register(&hmm->mmu_notifier, mm)) { - kfree(hmm); - return NULL; - } - - mmgrab(hmm->mm); - - /* - * We hold the exclusive mmap_sem here so we know that mm->hmm is - * still NULL or 0 kref, and is safe to update. - */ - spin_lock(&mm->page_table_lock); - mm->hmm = hmm; - -out_unlock: - spin_unlock(&mm->page_table_lock); - return hmm; + return &hmm->mmu_notifier; } -static void hmm_free_rcu(struct rcu_head *rcu) +static void hmm_free_notifier(struct mmu_notifier *mn) { - struct hmm *hmm = container_of(rcu, struct hmm, rcu); + struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier); - mmdrop(hmm->mm); + WARN_ON(!list_empty(&hmm->ranges)); + WARN_ON(!list_empty(&hmm->mirrors)); kfree(hmm); } -static void hmm_free(struct kref *kref) -{ - struct hmm *hmm = container_of(kref, struct hmm, kref); - - spin_lock(&hmm->mm->page_table_lock); - if (hmm->mm->hmm == hmm) - hmm->mm->hmm = NULL; - spin_unlock(&hmm->mm->page_table_lock); - - mmu_notifier_unregister_no_release(&hmm->mmu_notifier, hmm->mm); - mmu_notifier_call_srcu(&hmm->rcu, hmm_free_rcu); -} - -static inline void hmm_put(struct hmm *hmm) -{ - kref_put(&hmm->kref, hmm_free); -} - static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm) { struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier); struct hmm_mirror *mirror; - /* Bail out if hmm is in the process of being freed */ - if (!kref_get_unless_zero(&hmm->kref)) - return; - /* * Since hmm_range_register() holds the mmget() lock hmm_release() is * prevented as long as a range exists. @@ -137,8 +73,6 @@ static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm) mirror->ops->release(mirror); } up_read(&hmm->mirrors_sem); - - hmm_put(hmm); } static void notifiers_decrement(struct hmm *hmm) @@ -169,9 +103,6 @@ static int hmm_invalidate_range_start(struct mmu_notifier *mn, unsigned long flags; int ret = 0; - if (!kref_get_unless_zero(&hmm->kref)) - return 0; - spin_lock_irqsave(&hmm->ranges_lock, flags); hmm->notifiers++; list_for_each_entry(range, &hmm->ranges, list) { @@ -206,7 +137,6 @@ static int hmm_invalidate_range_start(struct mmu_notifier *mn, out: if (ret) notifiers_decrement(hmm); - hmm_put(hmm); return ret; } @@ -215,17 +145,15 @@ static void hmm_invalidate_range_end(struct mmu_notifier *mn, { struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier); - if (!kref_get_unless_zero(&hmm->kref)) - return; - notifiers_decrement(hmm); - hmm_put(hmm); } static const struct mmu_notifier_ops hmm_mmu_notifier_ops = { .release = hmm_release, .invalidate_range_start = hmm_invalidate_range_start, .invalidate_range_end = hmm_invalidate_range_end, + .alloc_notifier = hmm_alloc_notifier, + .free_notifier = hmm_free_notifier, }; /* @@ -237,18 +165,27 @@ static const struct mmu_notifier_ops hmm_mmu_notifier_ops = { * * To start mirroring a process address space, the device driver must register * an HMM mirror struct. + * + * The caller cannot unregister the hmm_mirror while any ranges are + * registered. + * + * Callers using this function must put a call to mmu_notifier_synchronize() + * in their module exit functions. */ int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm) { + struct mmu_notifier *mn; + lockdep_assert_held_write(&mm->mmap_sem); /* Sanity check */ if (!mm || !mirror || !mirror->ops) return -EINVAL; - mirror->hmm = hmm_get_or_create(mm); - if (!mirror->hmm) - return -ENOMEM; + mn = mmu_notifier_get_locked(&hmm_mmu_notifier_ops, mm); + if (IS_ERR(mn)) + return PTR_ERR(mn); + mirror->hmm = container_of(mn, struct hmm, mmu_notifier); down_write(&mirror->hmm->mirrors_sem); list_add(&mirror->list, &mirror->hmm->mirrors); @@ -272,7 +209,7 @@ void hmm_mirror_unregister(struct hmm_mirror *mirror) down_write(&hmm->mirrors_sem); list_del(&mirror->list); up_write(&hmm->mirrors_sem); - hmm_put(hmm); + mmu_notifier_put(&hmm->mmu_notifier); } EXPORT_SYMBOL(hmm_mirror_unregister); @@ -854,14 +791,13 @@ int hmm_range_register(struct hmm_range *range, struct hmm_mirror *mirror) return -EINVAL; /* Prevent hmm_release() from running while the range is valid */ - if (!mmget_not_zero(hmm->mm)) + if (!mmget_not_zero(hmm->mmu_notifier.mm)) return -EFAULT; /* Initialize range to track CPU page table updates. */ spin_lock_irqsave(&hmm->ranges_lock, flags); range->hmm = hmm; - kref_get(&hmm->kref); list_add(&range->list, &hmm->ranges); /* @@ -893,8 +829,7 @@ void hmm_range_unregister(struct hmm_range *range) spin_unlock_irqrestore(&hmm->ranges_lock, flags); /* Drop reference taken by hmm_range_register() */ - mmput(hmm->mm); - hmm_put(hmm); + mmput(hmm->mmu_notifier.mm); /* * The range is now invalid and the ref on the hmm is dropped, so @@ -944,14 +879,14 @@ long hmm_range_fault(struct hmm_range *range, unsigned int flags) struct mm_walk mm_walk; int ret; - lockdep_assert_held(&hmm->mm->mmap_sem); + lockdep_assert_held(&hmm->mmu_notifier.mm->mmap_sem); do { /* If range is no longer valid force retry. */ if (!range->valid) return -EBUSY; - vma = find_vma(hmm->mm, start); + vma = find_vma(hmm->mmu_notifier.mm, start); if (vma == NULL || (vma->vm_flags & device_vma)) return -EFAULT; -- cgit v1.2.3 From a7d1f22bb74f32cf3cd93f52776007e161f1a738 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 14 Aug 2019 09:59:19 +0200 Subject: mm: turn migrate_vma upside down There isn't any good reason to pass callbacks to migrate_vma. Instead we can just export the three steps done by this function to drivers and let them sequence the operation without callbacks. This removes a lot of boilerplate code as-is, and will allow the drivers to drastically improve code flow and error handling further on. Link: https://lore.kernel.org/r/20190814075928.23766-2-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Ralph Campbell Tested-by: Ralph Campbell Signed-off-by: Jason Gunthorpe --- Documentation/vm/hmm.rst | 54 +------- drivers/gpu/drm/nouveau/nouveau_dmem.c | 122 +++++++++-------- include/linux/migrate.h | 118 +++------------- mm/migrate.c | 244 +++++++++++++++------------------ 4 files changed, 194 insertions(+), 344 deletions(-) (limited to 'mm') diff --git a/Documentation/vm/hmm.rst b/Documentation/vm/hmm.rst index e63c11f7e0e0..0a5960beccf7 100644 --- a/Documentation/vm/hmm.rst +++ b/Documentation/vm/hmm.rst @@ -339,58 +339,8 @@ Migration to and from device memory =================================== Because the CPU cannot access device memory, migration must use the device DMA -engine to perform copy from and to device memory. For this we need a new -migration helper:: - - int migrate_vma(const struct migrate_vma_ops *ops, - struct vm_area_struct *vma, - unsigned long mentries, - unsigned long start, - unsigned long end, - unsigned long *src, - unsigned long *dst, - void *private); - -Unlike other migration functions it works on a range of virtual address, there -are two reasons for that. First, device DMA copy has a high setup overhead cost -and thus batching multiple pages is needed as otherwise the migration overhead -makes the whole exercise pointless. The second reason is because the -migration might be for a range of addresses the device is actively accessing. - -The migrate_vma_ops struct defines two callbacks. First one (alloc_and_copy()) -controls destination memory allocation and copy operation. Second one is there -to allow the device driver to perform cleanup operations after migration:: - - struct migrate_vma_ops { - void (*alloc_and_copy)(struct vm_area_struct *vma, - const unsigned long *src, - unsigned long *dst, - unsigned long start, - unsigned long end, - void *private); - void (*finalize_and_map)(struct vm_area_struct *vma, - const unsigned long *src, - const unsigned long *dst, - unsigned long start, - unsigned long end, - void *private); - }; - -It is important to stress that these migration helpers allow for holes in the -virtual address range. Some pages in the range might not be migrated for all -the usual reasons (page is pinned, page is locked, ...). This helper does not -fail but just skips over those pages. - -The alloc_and_copy() might decide to not migrate all pages in the -range (for reasons under the callback control). For those, the callback just -has to leave the corresponding dst entry empty. - -Finally, the migration of the struct page might fail (for file backed page) for -various reasons (failure to freeze reference, or update page cache, ...). If -that happens, then the finalize_and_map() can catch any pages that were not -migrated. Note those pages were still copied to a new page and thus we wasted -bandwidth but this is considered as a rare event and a price that we are -willing to pay to keep all the code simpler. +engine to perform copy from and to device memory. For this we need to use +migrate_vma_setup(), migrate_vma_pages(), and migrate_vma_finalize() helpers. Memory cgroup (memcg) and rss accounting diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c b/drivers/gpu/drm/nouveau/nouveau_dmem.c index 345c63cb752a..38416798abd4 100644 --- a/drivers/gpu/drm/nouveau/nouveau_dmem.c +++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c @@ -131,9 +131,8 @@ nouveau_dmem_fault_alloc_and_copy(struct vm_area_struct *vma, unsigned long *dst_pfns, unsigned long start, unsigned long end, - void *private) + struct nouveau_dmem_fault *fault) { - struct nouveau_dmem_fault *fault = private; struct nouveau_drm *drm = fault->drm; struct device *dev = drm->dev->dev; unsigned long addr, i, npages = 0; @@ -230,14 +229,9 @@ error: } } -void nouveau_dmem_fault_finalize_and_map(struct vm_area_struct *vma, - const unsigned long *src_pfns, - const unsigned long *dst_pfns, - unsigned long start, - unsigned long end, - void *private) +static void +nouveau_dmem_fault_finalize_and_map(struct nouveau_dmem_fault *fault) { - struct nouveau_dmem_fault *fault = private; struct nouveau_drm *drm = fault->drm; if (fault->fence) { @@ -257,29 +251,35 @@ void nouveau_dmem_fault_finalize_and_map(struct vm_area_struct *vma, kfree(fault->dma); } -static const struct migrate_vma_ops nouveau_dmem_fault_migrate_ops = { - .alloc_and_copy = nouveau_dmem_fault_alloc_and_copy, - .finalize_and_map = nouveau_dmem_fault_finalize_and_map, -}; - static vm_fault_t nouveau_dmem_migrate_to_ram(struct vm_fault *vmf) { struct nouveau_dmem *dmem = page_to_dmem(vmf->page); unsigned long src[1] = {0}, dst[1] = {0}; + struct migrate_vma args = { + .vma = vmf->vma, + .start = vmf->address, + .end = vmf->address + PAGE_SIZE, + .src = src, + .dst = dst, + }; struct nouveau_dmem_fault fault = { .drm = dmem->drm }; - int ret; /* * FIXME what we really want is to find some heuristic to migrate more * than just one page on CPU fault. When such fault happens it is very * likely that more surrounding page will CPU fault too. */ - ret = migrate_vma(&nouveau_dmem_fault_migrate_ops, vmf->vma, - vmf->address, vmf->address + PAGE_SIZE, - src, dst, &fault); - if (ret) + if (migrate_vma_setup(&args) < 0) return VM_FAULT_SIGBUS; + if (!args.cpages) + return 0; + + nouveau_dmem_fault_alloc_and_copy(args.vma, src, dst, args.start, + args.end, &fault); + migrate_vma_pages(&args); + nouveau_dmem_fault_finalize_and_map(&fault); + migrate_vma_finalize(&args); if (dst[0] == MIGRATE_PFN_ERROR) return VM_FAULT_SIGBUS; @@ -648,9 +648,8 @@ nouveau_dmem_migrate_alloc_and_copy(struct vm_area_struct *vma, unsigned long *dst_pfns, unsigned long start, unsigned long end, - void *private) + struct nouveau_migrate *migrate) { - struct nouveau_migrate *migrate = private; struct nouveau_drm *drm = migrate->drm; struct device *dev = drm->dev->dev; unsigned long addr, i, npages = 0; @@ -747,14 +746,9 @@ error: } } -void nouveau_dmem_migrate_finalize_and_map(struct vm_area_struct *vma, - const unsigned long *src_pfns, - const unsigned long *dst_pfns, - unsigned long start, - unsigned long end, - void *private) +static void +nouveau_dmem_migrate_finalize_and_map(struct nouveau_migrate *migrate) { - struct nouveau_migrate *migrate = private; struct nouveau_drm *drm = migrate->drm; if (migrate->fence) { @@ -779,10 +773,15 @@ void nouveau_dmem_migrate_finalize_and_map(struct vm_area_struct *vma, */ } -static const struct migrate_vma_ops nouveau_dmem_migrate_ops = { - .alloc_and_copy = nouveau_dmem_migrate_alloc_and_copy, - .finalize_and_map = nouveau_dmem_migrate_finalize_and_map, -}; +static void nouveau_dmem_migrate_chunk(struct migrate_vma *args, + struct nouveau_migrate *migrate) +{ + nouveau_dmem_migrate_alloc_and_copy(args->vma, args->src, args->dst, + args->start, args->end, migrate); + migrate_vma_pages(args); + nouveau_dmem_migrate_finalize_and_map(migrate); + migrate_vma_finalize(args); +} int nouveau_dmem_migrate_vma(struct nouveau_drm *drm, @@ -790,40 +789,45 @@ nouveau_dmem_migrate_vma(struct nouveau_drm *drm, unsigned long start, unsigned long end) { - unsigned long *src_pfns, *dst_pfns, npages; - struct nouveau_migrate migrate = {0}; - unsigned long i, c, max; - int ret = 0; - - npages = (end - start) >> PAGE_SHIFT; - max = min(SG_MAX_SINGLE_ALLOC, npages); - src_pfns = kzalloc(sizeof(long) * max, GFP_KERNEL); - if (src_pfns == NULL) - return -ENOMEM; - dst_pfns = kzalloc(sizeof(long) * max, GFP_KERNEL); - if (dst_pfns == NULL) { - kfree(src_pfns); - return -ENOMEM; - } + unsigned long npages = (end - start) >> PAGE_SHIFT; + unsigned long max = min(SG_MAX_SINGLE_ALLOC, npages); + struct migrate_vma args = { + .vma = vma, + .start = start, + }; + struct nouveau_migrate migrate = { + .drm = drm, + .vma = vma, + .npages = npages, + }; + unsigned long c, i; + int ret = -ENOMEM; + + args.src = kzalloc(sizeof(long) * max, GFP_KERNEL); + if (!args.src) + goto out; + args.dst = kzalloc(sizeof(long) * max, GFP_KERNEL); + if (!args.dst) + goto out_free_src; - migrate.drm = drm; - migrate.vma = vma; - migrate.npages = npages; for (i = 0; i < npages; i += c) { - unsigned long next; - c = min(SG_MAX_SINGLE_ALLOC, npages); - next = start + (c << PAGE_SHIFT); - ret = migrate_vma(&nouveau_dmem_migrate_ops, vma, start, - next, src_pfns, dst_pfns, &migrate); + args.end = start + (c << PAGE_SHIFT); + ret = migrate_vma_setup(&args); if (ret) - goto out; - start = next; + goto out_free_dst; + + if (args.cpages) + nouveau_dmem_migrate_chunk(&args, &migrate); + args.start = args.end; } + ret = 0; +out_free_dst: + kfree(args.dst); +out_free_src: + kfree(args.src); out: - kfree(dst_pfns); - kfree(src_pfns); return ret; } diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 7f04754c7f2b..18156d379ebf 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -182,107 +182,27 @@ static inline unsigned long migrate_pfn(unsigned long pfn) return (pfn << MIGRATE_PFN_SHIFT) | MIGRATE_PFN_VALID; } -/* - * struct migrate_vma_ops - migrate operation callback - * - * @alloc_and_copy: alloc destination memory and copy source memory to it - * @finalize_and_map: allow caller to map the successfully migrated pages - * - * - * The alloc_and_copy() callback happens once all source pages have been locked, - * unmapped and checked (checked whether pinned or not). All pages that can be - * migrated will have an entry in the src array set with the pfn value of the - * page and with the MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag set (other - * flags might be set but should be ignored by the callback). - * - * The alloc_and_copy() callback can then allocate destination memory and copy - * source memory to it for all those entries (ie with MIGRATE_PFN_VALID and - * MIGRATE_PFN_MIGRATE flag set). Once these are allocated and copied, the - * callback must update each corresponding entry in the dst array with the pfn - * value of the destination page and with the MIGRATE_PFN_VALID and - * MIGRATE_PFN_LOCKED flags set (destination pages must have their struct pages - * locked, via lock_page()). - * - * At this point the alloc_and_copy() callback is done and returns. - * - * Note that the callback does not have to migrate all the pages that are - * marked with MIGRATE_PFN_MIGRATE flag in src array unless this is a migration - * from device memory to system memory (ie the MIGRATE_PFN_DEVICE flag is also - * set in the src array entry). If the device driver cannot migrate a device - * page back to system memory, then it must set the corresponding dst array - * entry to MIGRATE_PFN_ERROR. This will trigger a SIGBUS if CPU tries to - * access any of the virtual addresses originally backed by this page. Because - * a SIGBUS is such a severe result for the userspace process, the device - * driver should avoid setting MIGRATE_PFN_ERROR unless it is really in an - * unrecoverable state. - * - * For empty entry inside CPU page table (pte_none() or pmd_none() is true) we - * do set MIGRATE_PFN_MIGRATE flag inside the corresponding source array thus - * allowing device driver to allocate device memory for those unback virtual - * address. For this the device driver simply have to allocate device memory - * and properly set the destination entry like for regular migration. Note that - * this can still fails and thus inside the device driver must check if the - * migration was successful for those entry inside the finalize_and_map() - * callback just like for regular migration. - * - * THE alloc_and_copy() CALLBACK MUST NOT CHANGE ANY OF THE SRC ARRAY ENTRIES - * OR BAD THINGS WILL HAPPEN ! - * - * - * The finalize_and_map() callback happens after struct page migration from - * source to destination (destination struct pages are the struct pages for the - * memory allocated by the alloc_and_copy() callback). Migration can fail, and - * thus the finalize_and_map() allows the driver to inspect which pages were - * successfully migrated, and which were not. Successfully migrated pages will - * have the MIGRATE_PFN_MIGRATE flag set for their src array entry. - * - * It is safe to update device page table from within the finalize_and_map() - * callback because both destination and source page are still locked, and the - * mmap_sem is held in read mode (hence no one can unmap the range being - * migrated). - * - * Once callback is done cleaning up things and updating its page table (if it - * chose to do so, this is not an obligation) then it returns. At this point, - * the HMM core will finish up the final steps, and the migration is complete. - * - * THE finalize_and_map() CALLBACK MUST NOT CHANGE ANY OF THE SRC OR DST ARRAY - * ENTRIES OR BAD THINGS WILL HAPPEN ! - */ -struct migrate_vma_ops { - void (*alloc_and_copy)(struct vm_area_struct *vma, - const unsigned long *src, - unsigned long *dst, - unsigned long start, - unsigned long end, - void *private); - void (*finalize_and_map)(struct vm_area_struct *vma, - const unsigned long *src, - const unsigned long *dst, - unsigned long start, - unsigned long end, - void *private); +struct migrate_vma { + struct vm_area_struct *vma; + /* + * Both src and dst array must be big enough for + * (end - start) >> PAGE_SHIFT entries. + * + * The src array must not be modified by the caller after + * migrate_vma_setup(), and must not change the dst array after + * migrate_vma_pages() returns. + */ + unsigned long *dst; + unsigned long *src; + unsigned long cpages; + unsigned long npages; + unsigned long start; + unsigned long end; }; -#if defined(CONFIG_MIGRATE_VMA_HELPER) -int migrate_vma(const struct migrate_vma_ops *ops, - struct vm_area_struct *vma, - unsigned long start, - unsigned long end, - unsigned long *src, - unsigned long *dst, - void *private); -#else -static inline int migrate_vma(const struct migrate_vma_ops *ops, - struct vm_area_struct *vma, - unsigned long start, - unsigned long end, - unsigned long *src, - unsigned long *dst, - void *private) -{ - return -EINVAL; -} -#endif /* IS_ENABLED(CONFIG_MIGRATE_VMA_HELPER) */ +int migrate_vma_setup(struct migrate_vma *args); +void migrate_vma_pages(struct migrate_vma *migrate); +void migrate_vma_finalize(struct migrate_vma *migrate); #endif /* CONFIG_MIGRATION */ diff --git a/mm/migrate.c b/mm/migrate.c index 8992741f10aa..8111e031fa2b 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -2118,16 +2118,6 @@ out_unlock: #endif /* CONFIG_NUMA */ #if defined(CONFIG_MIGRATE_VMA_HELPER) -struct migrate_vma { - struct vm_area_struct *vma; - unsigned long *dst; - unsigned long *src; - unsigned long cpages; - unsigned long npages; - unsigned long start; - unsigned long end; -}; - static int migrate_vma_collect_hole(unsigned long start, unsigned long end, struct mm_walk *walk) @@ -2578,6 +2568,110 @@ restore: } } +/** + * migrate_vma_setup() - prepare to migrate a range of memory + * @args: contains the vma, start, and and pfns arrays for the migration + * + * Returns: negative errno on failures, 0 when 0 or more pages were migrated + * without an error. + * + * Prepare to migrate a range of memory virtual address range by collecting all + * the pages backing each virtual address in the range, saving them inside the + * src array. Then lock those pages and unmap them. Once the pages are locked + * and unmapped, check whether each page is pinned or not. Pages that aren't + * pinned have the MIGRATE_PFN_MIGRATE flag set (by this function) in the + * corresponding src array entry. Then restores any pages that are pinned, by + * remapping and unlocking those pages. + * + * The caller should then allocate destination memory and copy source memory to + * it for all those entries (ie with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE + * flag set). Once these are allocated and copied, the caller must update each + * corresponding entry in the dst array with the pfn value of the destination + * page and with the MIGRATE_PFN_VALID and MIGRATE_PFN_LOCKED flags set + * (destination pages must have their struct pages locked, via lock_page()). + * + * Note that the caller does not have to migrate all the pages that are marked + * with MIGRATE_PFN_MIGRATE flag in src array unless this is a migration from + * device memory to system memory. If the caller cannot migrate a device page + * back to system memory, then it must return VM_FAULT_SIGBUS, which has severe + * consequences for the userspace process, so it must be avoided if at all + * possible. + * + * For empty entries inside CPU page table (pte_none() or pmd_none() is true) we + * do set MIGRATE_PFN_MIGRATE flag inside the corresponding source array thus + * allowing the caller to allocate device memory for those unback virtual + * address. For this the caller simply has to allocate device memory and + * properly set the destination entry like for regular migration. Note that + * this can still fails and thus inside the device driver must check if the + * migration was successful for those entries after calling migrate_vma_pages() + * just like for regular migration. + * + * After that, the callers must call migrate_vma_pages() to go over each entry + * in the src array that has the MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag + * set. If the corresponding entry in dst array has MIGRATE_PFN_VALID flag set, + * then migrate_vma_pages() to migrate struct page information from the source + * struct page to the destination struct page. If it fails to migrate the + * struct page information, then it clears the MIGRATE_PFN_MIGRATE flag in the + * src array. + * + * At this point all successfully migrated pages have an entry in the src + * array with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag set and the dst + * array entry with MIGRATE_PFN_VALID flag set. + * + * Once migrate_vma_pages() returns the caller may inspect which pages were + * successfully migrated, and which were not. Successfully migrated pages will + * have the MIGRATE_PFN_MIGRATE flag set for their src array entry. + * + * It is safe to update device page table after migrate_vma_pages() because + * both destination and source page are still locked, and the mmap_sem is held + * in read mode (hence no one can unmap the range being migrated). + * + * Once the caller is done cleaning up things and updating its page table (if it + * chose to do so, this is not an obligation) it finally calls + * migrate_vma_finalize() to update the CPU page table to point to new pages + * for successfully migrated pages or otherwise restore the CPU page table to + * point to the original source pages. + */ +int migrate_vma_setup(struct migrate_vma *args) +{ + long nr_pages = (args->end - args->start) >> PAGE_SHIFT; + + args->start &= PAGE_MASK; + args->end &= PAGE_MASK; + if (!args->vma || is_vm_hugetlb_page(args->vma) || + (args->vma->vm_flags & VM_SPECIAL) || vma_is_dax(args->vma)) + return -EINVAL; + if (nr_pages <= 0) + return -EINVAL; + if (args->start < args->vma->vm_start || + args->start >= args->vma->vm_end) + return -EINVAL; + if (args->end <= args->vma->vm_start || args->end > args->vma->vm_end) + return -EINVAL; + if (!args->src || !args->dst) + return -EINVAL; + + memset(args->src, 0, sizeof(*args->src) * nr_pages); + args->cpages = 0; + args->npages = 0; + + migrate_vma_collect(args); + + if (args->cpages) + migrate_vma_prepare(args); + if (args->cpages) + migrate_vma_unmap(args); + + /* + * At this point pages are locked and unmapped, and thus they have + * stable content and can safely be copied to destination memory that + * is allocated by the drivers. + */ + return 0; + +} +EXPORT_SYMBOL(migrate_vma_setup); + static void migrate_vma_insert_page(struct migrate_vma *migrate, unsigned long addr, struct page *page, @@ -2709,7 +2803,7 @@ abort: *src &= ~MIGRATE_PFN_MIGRATE; } -/* +/** * migrate_vma_pages() - migrate meta-data from src page to dst page * @migrate: migrate struct containing all migration information * @@ -2717,7 +2811,7 @@ abort: * struct page. This effectively finishes the migration from source page to the * destination page. */ -static void migrate_vma_pages(struct migrate_vma *migrate) +void migrate_vma_pages(struct migrate_vma *migrate) { const unsigned long npages = migrate->npages; const unsigned long start = migrate->start; @@ -2791,8 +2885,9 @@ static void migrate_vma_pages(struct migrate_vma *migrate) if (notified) mmu_notifier_invalidate_range_only_end(&range); } +EXPORT_SYMBOL(migrate_vma_pages); -/* +/** * migrate_vma_finalize() - restore CPU page table entry * @migrate: migrate struct containing all migration information * @@ -2803,7 +2898,7 @@ static void migrate_vma_pages(struct migrate_vma *migrate) * This also unlocks the pages and puts them back on the lru, or drops the extra * refcount, for device pages. */ -static void migrate_vma_finalize(struct migrate_vma *migrate) +void migrate_vma_finalize(struct migrate_vma *migrate) { const unsigned long npages = migrate->npages; unsigned long i; @@ -2846,124 +2941,5 @@ static void migrate_vma_finalize(struct migrate_vma *migrate) } } } - -/* - * migrate_vma() - migrate a range of memory inside vma - * - * @ops: migration callback for allocating destination memory and copying - * @vma: virtual memory area containing the range to be migrated - * @start: start address of the range to migrate (inclusive) - * @end: end address of the range to migrate (exclusive) - * @src: array of hmm_pfn_t containing source pfns - * @dst: array of hmm_pfn_t containing destination pfns - * @private: pointer passed back to each of the callback - * Returns: 0 on success, error code otherwise - * - * This function tries to migrate a range of memory virtual address range, using - * callbacks to allocate and copy memory from source to destination. First it - * collects all the pages backing each virtual address in the range, saving this - * inside the src array. Then it locks those pages and unmaps them. Once the pages - * are locked and unmapped, it checks whether each page is pinned or not. Pages - * that aren't pinned have the MIGRATE_PFN_MIGRATE flag set (by this function) - * in the corresponding src array entry. It then restores any pages that are - * pinned, by remapping and unlocking those pages. - * - * At this point it calls the alloc_and_copy() callback. For documentation on - * what is expected from that callback, see struct migrate_vma_ops comments in - * include/linux/migrate.h - * - * After the alloc_and_copy() callback, this function goes over each entry in - * the src array that has the MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag - * set. If the corresponding entry in dst array has MIGRATE_PFN_VALID flag set, - * then the function tries to migrate struct page information from the source - * struct page to the destination struct page. If it fails to migrate the struct - * page information, then it clears the MIGRATE_PFN_MIGRATE flag in the src - * array. - * - * At this point all successfully migrated pages have an entry in the src - * array with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag set and the dst - * array entry with MIGRATE_PFN_VALID flag set. - * - * It then calls the finalize_and_map() callback. See comments for "struct - * migrate_vma_ops", in include/linux/migrate.h for details about - * finalize_and_map() behavior. - * - * After the finalize_and_map() callback, for successfully migrated pages, this - * function updates the CPU page table to point to new pages, otherwise it - * restores the CPU page table to point to the original source pages. - * - * Function returns 0 after the above steps, even if no pages were migrated - * (The function only returns an error if any of the arguments are invalid.) - * - * Both src and dst array must be big enough for (end - start) >> PAGE_SHIFT - * unsigned long entries. - */ -int migrate_vma(const struct migrate_vma_ops *ops, - struct vm_area_struct *vma, - unsigned long start, - unsigned long end, - unsigned long *src, - unsigned long *dst, - void *private) -{ - struct migrate_vma migrate; - - /* Sanity check the arguments */ - start &= PAGE_MASK; - end &= PAGE_MASK; - if (!vma || is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL) || - vma_is_dax(vma)) - return -EINVAL; - if (start < vma->vm_start || start >= vma->vm_end) - return -EINVAL; - if (end <= vma->vm_start || end > vma->vm_end) - return -EINVAL; - if (!ops || !src || !dst || start >= end) - return -EINVAL; - - memset(src, 0, sizeof(*src) * ((end - start) >> PAGE_SHIFT)); - migrate.src = src; - migrate.dst = dst; - migrate.start = start; - migrate.npages = 0; - migrate.cpages = 0; - migrate.end = end; - migrate.vma = vma; - - /* Collect, and try to unmap source pages */ - migrate_vma_collect(&migrate); - if (!migrate.cpages) - return 0; - - /* Lock and isolate page */ - migrate_vma_prepare(&migrate); - if (!migrate.cpages) - return 0; - - /* Unmap pages */ - migrate_vma_unmap(&migrate); - if (!migrate.cpages) - return 0; - - /* - * At this point pages are locked and unmapped, and thus they have - * stable content and can safely be copied to destination memory that - * is allocated by the callback. - * - * Note that migration can fail in migrate_vma_struct_page() for each - * individual page. - */ - ops->alloc_and_copy(vma, src, dst, start, end, private); - - /* This does the real migration of struct page */ - migrate_vma_pages(&migrate); - - ops->finalize_and_map(vma, src, dst, start, end, private); - - /* Unlock and remap pages */ - migrate_vma_finalize(&migrate); - - return 0; -} -EXPORT_SYMBOL(migrate_vma); +EXPORT_SYMBOL(migrate_vma_finalize); #endif /* defined(MIGRATE_VMA_HELPER) */ -- cgit v1.2.3 From 06d462beb470d361ffa8bd7b3d865509a8606987 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 14 Aug 2019 09:59:27 +0200 Subject: mm: remove the unused MIGRATE_PFN_DEVICE flag No one ever checks this flag, and we could easily get that information from the page if needed. Link: https://lore.kernel.org/r/20190814075928.23766-10-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Ralph Campbell Reviewed-by: Jason Gunthorpe Tested-by: Ralph Campbell Signed-off-by: Jason Gunthorpe --- drivers/gpu/drm/nouveau/nouveau_dmem.c | 3 +-- include/linux/migrate.h | 1 - mm/migrate.c | 4 ++-- 3 files changed, 3 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c b/drivers/gpu/drm/nouveau/nouveau_dmem.c index d96b987b9982..fa1439941596 100644 --- a/drivers/gpu/drm/nouveau/nouveau_dmem.c +++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c @@ -580,8 +580,7 @@ static unsigned long nouveau_dmem_migrate_copy_one(struct nouveau_drm *drm, *dma_addr)) goto out_dma_unmap; - return migrate_pfn(page_to_pfn(dpage)) | - MIGRATE_PFN_LOCKED | MIGRATE_PFN_DEVICE; + return migrate_pfn(page_to_pfn(dpage)) | MIGRATE_PFN_LOCKED; out_dma_unmap: dma_unmap_page(dev, *dma_addr, PAGE_SIZE, DMA_BIDIRECTIONAL); diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 1e67dcfd318f..72120061b7d4 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -166,7 +166,6 @@ static inline int migrate_misplaced_transhuge_page(struct mm_struct *mm, #define MIGRATE_PFN_MIGRATE (1UL << 1) #define MIGRATE_PFN_LOCKED (1UL << 2) #define MIGRATE_PFN_WRITE (1UL << 3) -#define MIGRATE_PFN_DEVICE (1UL << 4) #define MIGRATE_PFN_SHIFT 6 static inline struct page *migrate_pfn_to_page(unsigned long mpfn) diff --git a/mm/migrate.c b/mm/migrate.c index 8111e031fa2b..f4f5ae5ae44f 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -2237,8 +2237,8 @@ again: goto next; page = device_private_entry_to_page(entry); - mpfn = migrate_pfn(page_to_pfn(page))| - MIGRATE_PFN_DEVICE | MIGRATE_PFN_MIGRATE; + mpfn = migrate_pfn(page_to_pfn(page)) | + MIGRATE_PFN_MIGRATE; if (is_write_device_private_entry(entry)) mpfn |= MIGRATE_PFN_WRITE; } else { -- cgit v1.2.3 From 9b2ed9cb975c63f5534daaebeb225ab52b589372 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 14 Aug 2019 09:59:28 +0200 Subject: mm: remove CONFIG_MIGRATE_VMA_HELPER CONFIG_MIGRATE_VMA_HELPER guards helpers that are required for proper devic private memory support. Remove the option and just check for CONFIG_DEVICE_PRIVATE instead. Link: https://lore.kernel.org/r/20190814075928.23766-11-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Jason Gunthorpe Tested-by: Ralph Campbell Signed-off-by: Jason Gunthorpe --- drivers/gpu/drm/nouveau/Kconfig | 1 - mm/Kconfig | 3 --- mm/migrate.c | 4 ++-- 3 files changed, 2 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/drivers/gpu/drm/nouveau/Kconfig b/drivers/gpu/drm/nouveau/Kconfig index df4352c279ba..3558df043592 100644 --- a/drivers/gpu/drm/nouveau/Kconfig +++ b/drivers/gpu/drm/nouveau/Kconfig @@ -89,7 +89,6 @@ config DRM_NOUVEAU_SVM depends on MMU depends on STAGING select HMM_MIRROR - select MIGRATE_VMA_HELPER select MMU_NOTIFIER default n help diff --git a/mm/Kconfig b/mm/Kconfig index 563436dc1f24..2fe4902ad755 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -669,9 +669,6 @@ config ZONE_DEVICE If FS_DAX is enabled, then say Y. -config MIGRATE_VMA_HELPER - bool - config DEV_PAGEMAP_OPS bool diff --git a/mm/migrate.c b/mm/migrate.c index f4f5ae5ae44f..2cff57e7116e 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -2117,7 +2117,7 @@ out_unlock: #endif /* CONFIG_NUMA */ -#if defined(CONFIG_MIGRATE_VMA_HELPER) +#ifdef CONFIG_DEVICE_PRIVATE static int migrate_vma_collect_hole(unsigned long start, unsigned long end, struct mm_walk *walk) @@ -2942,4 +2942,4 @@ void migrate_vma_finalize(struct migrate_vma *migrate) } } EXPORT_SYMBOL(migrate_vma_finalize); -#endif /* defined(MIGRATE_VMA_HELPER) */ +#endif /* CONFIG_DEVICE_PRIVATE */ -- cgit v1.2.3 From 8402ce61bec28a1b73ec467b4c146f1e3f587044 Mon Sep 17 00:00:00 2001 From: Daniel Vetter Date: Wed, 14 Aug 2019 22:20:23 +0200 Subject: mm/mmu_notifiers: check if mmu notifier callbacks are allowed to fail Just a bit of paranoia, since if we start pushing this deep into callchains it's hard to spot all places where an mmu notifier implementation might fail when it's not allowed to. Inspired by some confusion we had discussing i915 mmu notifiers and whether we could use the newly-introduced return value to handle some corner cases. Until we realized that these are only for when a task has been killed by the oom reaper. An alternative approach would be to split the callback into two versions, one with the int return value, and the other with void return value like in older kernels. But that's a lot more churn for fairly little gain I think. Summary from the m-l discussion on why we want something at warning level: This allows automated tooling in CI to catch bugs without humans having to look at everything. If we just upgrade the existing pr_info to a pr_warn, then we'll have false positives. And as-is, no one will ever spot the problem since it's lost in the massive amounts of overall dmesg noise. Link: https://lore.kernel.org/r/20190814202027.18735-2-daniel.vetter@ffwll.ch Signed-off-by: Daniel Vetter Reviewed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe --- mm/mmu_notifier.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm') diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 9e92ec8006fc..d76ea27e2bbb 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -179,6 +179,8 @@ int __mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range) pr_info("%pS callback failed with %d in %sblockable context.\n", mn->ops->invalidate_range_start, _ret, !mmu_notifier_range_blockable(range) ? "non-" : ""); + WARN_ON(mmu_notifier_range_blockable(range) || + ret != -EAGAIN); ret = _ret; } } -- cgit v1.2.3 From fdc029b19dfd190840d23e5c70bd231140b0e4a4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 18 Aug 2019 11:05:55 +0200 Subject: memremap: remove the dev field in struct dev_pagemap The dev field in struct dev_pagemap is only used to print dev_name in two places, which are at best nice to have. Just remove the field and thus the name in those two messages. Link: https://lore.kernel.org/r/20190818090557.17853-3-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Ira Weiny Reviewed-by: Dan Williams Tested-by: Bharata B Rao Reviewed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe --- include/linux/memremap.h | 1 - kernel/memremap.c | 6 +----- mm/page_alloc.c | 2 +- tools/testing/nvdimm/test/iomap.c | 1 - 4 files changed, 2 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/include/linux/memremap.h b/include/linux/memremap.h index f8a5b2a19945..8f0013e18e14 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -109,7 +109,6 @@ struct dev_pagemap { struct percpu_ref *ref; struct percpu_ref internal_ref; struct completion done; - struct device *dev; enum memory_type type; unsigned int flags; u64 pci_p2pdma_bus_offset; diff --git a/kernel/memremap.c b/kernel/memremap.c index 6ee03a816d67..600a14cbe663 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -96,7 +96,6 @@ static void dev_pagemap_cleanup(struct dev_pagemap *pgmap) static void devm_memremap_pages_release(void *data) { struct dev_pagemap *pgmap = data; - struct device *dev = pgmap->dev; struct resource *res = &pgmap->res; unsigned long pfn; int nid; @@ -123,8 +122,7 @@ static void devm_memremap_pages_release(void *data) untrack_pfn(NULL, PHYS_PFN(res->start), resource_size(res)); pgmap_array_delete(res); - dev_WARN_ONCE(dev, pgmap->altmap.alloc, - "%s: failed to free all reserved pages\n", __func__); + WARN_ONCE(pgmap->altmap.alloc, "failed to free all reserved pages\n"); } static void dev_pagemap_percpu_release(struct percpu_ref *ref) @@ -245,8 +243,6 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) goto err_array; } - pgmap->dev = dev; - error = xa_err(xa_store_range(&pgmap_array, PHYS_PFN(res->start), PHYS_PFN(res->end), pgmap, GFP_KERNEL)); if (error) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 272c6de1bf4e..b39baa2b1faf 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5982,7 +5982,7 @@ void __ref memmap_init_zone_device(struct zone *zone, } } - pr_info("%s initialised, %lu pages in %ums\n", dev_name(pgmap->dev), + pr_info("%s initialised %lu pages in %ums\n", __func__, size, jiffies_to_msecs(jiffies - start)); } diff --git a/tools/testing/nvdimm/test/iomap.c b/tools/testing/nvdimm/test/iomap.c index cd040b5abffe..3f55f2f99112 100644 --- a/tools/testing/nvdimm/test/iomap.c +++ b/tools/testing/nvdimm/test/iomap.c @@ -132,7 +132,6 @@ void *__wrap_devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) if (!nfit_res) return devm_memremap_pages(dev, pgmap); - pgmap->dev = dev; if (!pgmap->ref) { if (pgmap->ops && (pgmap->ops->kill || pgmap->ops->cleanup)) return ERR_PTR(-EINVAL); -- cgit v1.2.3 From c96245148c1ec7af086da322481bf4119d1141d3 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 6 Aug 2019 20:15:48 -0300 Subject: mm/mmu_notifiers: remove unregister_no_release mmu_notifier_unregister_no_release() and mmu_notifier_call_srcu() no longer have any users, they have all been converted to use mmu_notifier_put(). So delete this difficult to use interface. Link: https://lore.kernel.org/r/20190806231548.25242-12-jgg@ziepe.ca Reviewed-by: Christoph Hellwig Reviewed-by: Ralph Campbell Tested-by: Ralph Campbell Signed-off-by: Jason Gunthorpe --- include/linux/mmu_notifier.h | 5 ----- mm/mmu_notifier.c | 31 ------------------------------- 2 files changed, 36 deletions(-) (limited to 'mm') diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index 31aa971315a1..52929e5ef708 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -271,8 +271,6 @@ extern int __mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm); extern void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm); -extern void mmu_notifier_unregister_no_release(struct mmu_notifier *mn, - struct mm_struct *mm); extern void __mmu_notifier_mm_destroy(struct mm_struct *mm); extern void __mmu_notifier_release(struct mm_struct *mm); extern int __mmu_notifier_clear_flush_young(struct mm_struct *mm, @@ -513,9 +511,6 @@ static inline void mmu_notifier_range_init(struct mmu_notifier_range *range, set_pte_at(___mm, ___address, __ptep, ___pte); \ }) -extern void mmu_notifier_call_srcu(struct rcu_head *rcu, - void (*func)(struct rcu_head *rcu)); - #else /* CONFIG_MMU_NOTIFIER */ struct mmu_notifier_range { diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index d76ea27e2bbb..3ebdc748b168 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -21,18 +21,6 @@ /* global SRCU for all MMs */ DEFINE_STATIC_SRCU(srcu); -/* - * This function allows mmu_notifier::release callback to delay a call to - * a function that will free appropriate resources. The function must be - * quick and must not block. - */ -void mmu_notifier_call_srcu(struct rcu_head *rcu, - void (*func)(struct rcu_head *rcu)) -{ - call_srcu(&srcu, rcu, func); -} -EXPORT_SYMBOL_GPL(mmu_notifier_call_srcu); - /* * This function can't run concurrently against mmu_notifier_register * because mm->mm_users > 0 during mmu_notifier_register and exit_mmap @@ -455,25 +443,6 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm) } EXPORT_SYMBOL_GPL(mmu_notifier_unregister); -/* - * Same as mmu_notifier_unregister but no callback and no srcu synchronization. - */ -void mmu_notifier_unregister_no_release(struct mmu_notifier *mn, - struct mm_struct *mm) -{ - spin_lock(&mm->mmu_notifier_mm->lock); - /* - * Can not use list_del_rcu() since __mmu_notifier_release - * can delete it before we hold the lock. - */ - hlist_del_init_rcu(&mn->hlist); - spin_unlock(&mm->mmu_notifier_mm->lock); - - BUG_ON(atomic_read(&mm->mm_count) <= 0); - mmdrop(mm); -} -EXPORT_SYMBOL_GPL(mmu_notifier_unregister_no_release); - static void mmu_notifier_free_rcu(struct rcu_head *rcu) { struct mmu_notifier *mn = container_of(rcu, struct mmu_notifier, rcu); -- cgit v1.2.3 From e3fe8e555dd05cf74168d18555c44320ed50a0e1 Mon Sep 17 00:00:00 2001 From: "Yang, Philip" Date: Thu, 15 Aug 2019 20:52:56 +0000 Subject: mm/hmm: fix hmm_range_fault()'s handling of swapped out pages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit hmm_range_fault() may return NULL pages because some of the pfns are equal to HMM_PFN_NONE. This happens randomly under memory pressure. The reason is during the swapped out page pte path, hmm_vma_handle_pte() doesn't update the fault variable from cpu_flags, so it failed to call hmm_vam_do_fault() to swap the page in. The fix is to call hmm_pte_need_fault() to update fault variable. Fixes: 74eee180b935 ("mm/hmm/mirror: device page fault handler") Link: https://lore.kernel.org/r/20190815205227.7949-1-Philip.Yang@amd.com Signed-off-by: Philip Yang Reviewed-by: "Jérôme Glisse" Signed-off-by: Jason Gunthorpe --- mm/hmm.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'mm') diff --git a/mm/hmm.c b/mm/hmm.c index 49eace16f9f8..fc05c8fe78b4 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -469,6 +469,9 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, swp_entry_t entry = pte_to_swp_entry(pte); if (!non_swap_entry(entry)) { + cpu_flags = pte_to_hmm_pfn_flags(range, pte); + hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags, + &fault, &write_fault); if (fault || write_fault) goto fault; return 0; -- cgit v1.2.3 From 6c64f2bbe79cf3b770ac60ae79442322bd76d55e Mon Sep 17 00:00:00 2001 From: Ralph Campbell Date: Fri, 23 Aug 2019 15:17:52 -0700 Subject: mm/hmm: hmm_range_fault() NULL pointer bug Although hmm_range_fault() calls find_vma() to make sure that a vma exists before calling walk_page_range(), hmm_vma_walk_hole() can still be called with walk->vma == NULL if the start and end address are not contained within the vma range. hmm_range_fault() /* calls find_vma() but no range check */ walk_page_range() /* calls find_vma(), sets walk->vma = NULL */ __walk_page_range() walk_pgd_range() walk_p4d_range() walk_pud_range() hmm_vma_walk_hole() hmm_vma_walk_hole_() hmm_vma_do_fault() handle_mm_fault(vma=0) Link: https://lore.kernel.org/r/20190823221753.2514-2-rcampbell@nvidia.com Signed-off-by: Ralph Campbell Reviewed-by: Christoph Hellwig Signed-off-by: Jason Gunthorpe --- mm/hmm.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/hmm.c b/mm/hmm.c index fc05c8fe78b4..29371485fe94 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -229,6 +229,9 @@ static int hmm_vma_do_fault(struct mm_walk *walk, unsigned long addr, struct vm_area_struct *vma = walk->vma; vm_fault_t ret; + if (!vma) + goto err; + if (hmm_vma_walk->flags & HMM_FAULT_ALLOW_RETRY) flags |= FAULT_FLAG_ALLOW_RETRY; if (write_fault) @@ -239,12 +242,14 @@ static int hmm_vma_do_fault(struct mm_walk *walk, unsigned long addr, /* Note, handle_mm_fault did up_read(&mm->mmap_sem)) */ return -EAGAIN; } - if (ret & VM_FAULT_ERROR) { - *pfn = range->values[HMM_PFN_ERROR]; - return -EFAULT; - } + if (ret & VM_FAULT_ERROR) + goto err; return -EBUSY; + +err: + *pfn = range->values[HMM_PFN_ERROR]; + return -EFAULT; } static int hmm_pfns_bad(unsigned long addr, -- cgit v1.2.3 From c18ce674d548c00faa6b7e760bacbaf1f39315f3 Mon Sep 17 00:00:00 2001 From: Ralph Campbell Date: Fri, 23 Aug 2019 15:17:53 -0700 Subject: mm/hmm: hmm_range_fault() infinite loop Normally, callers to handle_mm_fault() are supposed to check the vma->vm_flags first. hmm_range_fault() checks for VM_READ but doesn't check for VM_WRITE if the caller requests a page to be faulted in with write permission (via the hmm_range.pfns[] value). If the vma is write protected, this can result in an infinite loop: hmm_range_fault() walk_page_range() ... hmm_vma_walk_hole() hmm_vma_walk_hole_() hmm_vma_do_fault() handle_mm_fault(FAULT_FLAG_WRITE) /* returns VM_FAULT_WRITE */ /* returns -EBUSY */ /* returns -EBUSY */ /* returns -EBUSY */ /* loops on -EBUSY and range->valid */ Prevent this by checking for vma->vm_flags & VM_WRITE before calling handle_mm_fault(). Link: https://lore.kernel.org/r/20190823221753.2514-3-rcampbell@nvidia.com Signed-off-by: Ralph Campbell Reviewed-by: Christoph Hellwig Reviewed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe --- mm/hmm.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'mm') diff --git a/mm/hmm.c b/mm/hmm.c index 29371485fe94..4882b83aeccb 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -292,6 +292,9 @@ static int hmm_vma_walk_hole_(unsigned long addr, unsigned long end, hmm_vma_walk->last = addr; i = (addr - range->start) >> PAGE_SHIFT; + if (write_fault && walk->vma && !(walk->vma->vm_flags & VM_WRITE)) + return -EPERM; + for (; addr < end; addr += PAGE_SIZE, i++) { pfns[i] = range->values[HMM_PFN_NONE]; if (fault || write_fault) { -- cgit v1.2.3 From f0ade90a8aa1ea523eb366d1d1e8bd3463d9cf8a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 28 Aug 2019 16:21:09 +0200 Subject: mm/mmu_notifiers: remove the __mmu_notifier_invalidate_range_start/end exports No modular code uses these, which makes a lot of sense given the wrappers around them are only called by core mm code. Link: https://lore.kernel.org/r/20190828142109.29012-1-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe --- mm/mmu_notifier.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'mm') diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 3ebdc748b168..9e2125ae10a5 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -177,7 +177,6 @@ int __mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range) return ret; } -EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range_start); void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range, bool only_end) @@ -209,7 +208,6 @@ void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range, } srcu_read_unlock(&srcu, id); } -EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range_end); void __mmu_notifier_invalidate_range(struct mm_struct *mm, unsigned long start, unsigned long end) @@ -224,7 +222,6 @@ void __mmu_notifier_invalidate_range(struct mm_struct *mm, } srcu_read_unlock(&srcu, id); } -EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range); /* * Same as mmu_notifier_register but here the caller must hold the -- cgit v1.2.3 From 23b68395c7c78a764e8963fc15a7cfd318bf187f Mon Sep 17 00:00:00 2001 From: Daniel Vetter Date: Mon, 26 Aug 2019 22:14:21 +0200 Subject: mm/mmu_notifiers: add a lockdep map for invalidate_range_start/end This is a similar idea to the fs_reclaim fake lockdep lock. It's fairly easy to provoke a specific notifier to be run on a specific range: Just prep it, and then munmap() it. A bit harder, but still doable, is to provoke the mmu notifiers for all the various callchains that might lead to them. But both at the same time is really hard to reliably hit, especially when you want to exercise paths like direct reclaim or compaction, where it's not easy to control what exactly will be unmapped. By introducing a lockdep map to tie them all together we allow lockdep to see a lot more dependencies, without having to actually hit them in a single challchain while testing. On Jason's suggestion this is is rolled out for both invalidate_range_start and invalidate_range_end. They both have the same calling context, hence we can share the same lockdep map. Note that the annotation for invalidate_ranage_start is outside of the mm_has_notifiers(), to make sure lockdep is informed about all paths leading to this context irrespective of whether mmu notifiers are present for a given context. We don't do that on the invalidate_range_end side to avoid paying the overhead twice, there the lockdep annotation is pushed down behind the mm_has_notifiers() check. Link: https://lore.kernel.org/r/20190826201425.17547-2-daniel.vetter@ffwll.ch Reviewed-by: Jason Gunthorpe Signed-off-by: Daniel Vetter Signed-off-by: Jason Gunthorpe --- include/linux/mmu_notifier.h | 14 ++++++++++++-- mm/mmu_notifier.c | 8 ++++++++ 2 files changed, 20 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index 52929e5ef708..4dfe996dafd2 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -42,6 +42,10 @@ enum mmu_notifier_event { #ifdef CONFIG_MMU_NOTIFIER +#ifdef CONFIG_LOCKDEP +extern struct lockdep_map __mmu_notifier_invalidate_range_start_map; +#endif + /* * The mmu notifier_mm structure is allocated and installed in * mm->mmu_notifier_mm inside the mm_take_all_locks() protected @@ -339,20 +343,26 @@ static inline void mmu_notifier_change_pte(struct mm_struct *mm, static inline void mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range) { + lock_map_acquire(&__mmu_notifier_invalidate_range_start_map); if (mm_has_notifiers(range->mm)) { range->flags |= MMU_NOTIFIER_RANGE_BLOCKABLE; __mmu_notifier_invalidate_range_start(range); } + lock_map_release(&__mmu_notifier_invalidate_range_start_map); } static inline int mmu_notifier_invalidate_range_start_nonblock(struct mmu_notifier_range *range) { + int ret = 0; + + lock_map_acquire(&__mmu_notifier_invalidate_range_start_map); if (mm_has_notifiers(range->mm)) { range->flags &= ~MMU_NOTIFIER_RANGE_BLOCKABLE; - return __mmu_notifier_invalidate_range_start(range); + ret = __mmu_notifier_invalidate_range_start(range); } - return 0; + lock_map_release(&__mmu_notifier_invalidate_range_start_map); + return ret; } static inline void diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 9e2125ae10a5..05d98167da7b 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -21,6 +21,12 @@ /* global SRCU for all MMs */ DEFINE_STATIC_SRCU(srcu); +#ifdef CONFIG_LOCKDEP +struct lockdep_map __mmu_notifier_invalidate_range_start_map = { + .name = "mmu_notifier_invalidate_range_start" +}; +#endif + /* * This function can't run concurrently against mmu_notifier_register * because mm->mm_users > 0 during mmu_notifier_register and exit_mmap @@ -184,6 +190,7 @@ void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range, struct mmu_notifier *mn; int id; + lock_map_acquire(&__mmu_notifier_invalidate_range_start_map); id = srcu_read_lock(&srcu); hlist_for_each_entry_rcu(mn, &range->mm->mmu_notifier_mm->list, hlist) { /* @@ -207,6 +214,7 @@ void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range, mn->ops->invalidate_range_end(mn, range); } srcu_read_unlock(&srcu, id); + lock_map_release(&__mmu_notifier_invalidate_range_start_map); } void __mmu_notifier_invalidate_range(struct mm_struct *mm, -- cgit v1.2.3 From 66204f1d2d1b42962033dfa867442f3dfd898d5f Mon Sep 17 00:00:00 2001 From: Daniel Vetter Date: Mon, 26 Aug 2019 22:14:22 +0200 Subject: mm/mmu_notifiers: prime lockdep We want to teach lockdep that mmu notifiers can be called from direct reclaim paths, since on many CI systems load might never reach that level (e.g. when just running fuzzer or small functional tests). I've put the annotation into mmu_notifier_register since only when we have mmu notifiers registered is there any point in teaching lockdep about them. Also, we already have a kmalloc(, GFP_KERNEL), so this is safe. Link: https://lore.kernel.org/r/20190826201425.17547-3-daniel.vetter@ffwll.ch Suggested-by: Jason Gunthorpe Reviewed-by: Jason Gunthorpe Signed-off-by: Daniel Vetter Signed-off-by: Jason Gunthorpe --- mm/mmu_notifier.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'mm') diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 05d98167da7b..3f39fb1402db 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -243,6 +243,13 @@ int __mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm) lockdep_assert_held_write(&mm->mmap_sem); BUG_ON(atomic_read(&mm->mm_users) <= 0); + if (IS_ENABLED(CONFIG_LOCKDEP)) { + fs_reclaim_acquire(GFP_KERNEL); + lock_map_acquire(&__mmu_notifier_invalidate_range_start_map); + lock_map_release(&__mmu_notifier_invalidate_range_start_map); + fs_reclaim_release(GFP_KERNEL); + } + mn->mm = mm; mn->users = 1; -- cgit v1.2.3 From a520110e4a15ceb385304d9cab22bb51438f6080 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 28 Aug 2019 16:19:53 +0200 Subject: mm: split out a new pagewalk.h header from mm.h Add a new header for the two handful of users of the walk_page_range / walk_page_vma interface instead of polluting all users of mm.h with it. Link: https://lore.kernel.org/r/20190828141955.22210-2-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Thomas Hellstrom Reviewed-by: Steven Price Reviewed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe --- arch/openrisc/kernel/dma.c | 1 + arch/powerpc/mm/book3s64/subpage_prot.c | 2 +- arch/s390/mm/gmap.c | 2 +- fs/proc/task_mmu.c | 2 +- include/linux/mm.h | 46 ---------------------------- include/linux/pagewalk.h | 54 +++++++++++++++++++++++++++++++++ mm/hmm.c | 2 +- mm/madvise.c | 1 + mm/memcontrol.c | 2 +- mm/mempolicy.c | 2 +- mm/migrate.c | 1 + mm/mincore.c | 2 +- mm/mprotect.c | 2 +- mm/pagewalk.c | 2 +- 14 files changed, 66 insertions(+), 55 deletions(-) create mode 100644 include/linux/pagewalk.h (limited to 'mm') diff --git a/arch/openrisc/kernel/dma.c b/arch/openrisc/kernel/dma.c index b41a79fcdbd9..c7812e6effa2 100644 --- a/arch/openrisc/kernel/dma.c +++ b/arch/openrisc/kernel/dma.c @@ -16,6 +16,7 @@ */ #include +#include #include #include diff --git a/arch/powerpc/mm/book3s64/subpage_prot.c b/arch/powerpc/mm/book3s64/subpage_prot.c index 9ba07e55c489..236f0a861ecc 100644 --- a/arch/powerpc/mm/book3s64/subpage_prot.c +++ b/arch/powerpc/mm/book3s64/subpage_prot.c @@ -7,7 +7,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index 39c3a6e3d262..cf80feae970d 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -9,7 +9,7 @@ */ #include -#include +#include #include #include #include diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 731642e0f5a0..8857da830b86 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -#include +#include #include #include #include diff --git a/include/linux/mm.h b/include/linux/mm.h index 0334ca97c584..7cf955feb823 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1430,54 +1430,8 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long address, void unmap_vmas(struct mmu_gather *tlb, struct vm_area_struct *start_vma, unsigned long start, unsigned long end); -/** - * mm_walk - callbacks for walk_page_range - * @pud_entry: if set, called for each non-empty PUD (2nd-level) entry - * this handler should only handle pud_trans_huge() puds. - * the pmd_entry or pte_entry callbacks will be used for - * regular PUDs. - * @pmd_entry: if set, called for each non-empty PMD (3rd-level) entry - * this handler is required to be able to handle - * pmd_trans_huge() pmds. They may simply choose to - * split_huge_page() instead of handling it explicitly. - * @pte_entry: if set, called for each non-empty PTE (4th-level) entry - * @pte_hole: if set, called for each hole at all levels - * @hugetlb_entry: if set, called for each hugetlb entry - * @test_walk: caller specific callback function to determine whether - * we walk over the current vma or not. Returning 0 - * value means "do page table walk over the current vma," - * and a negative one means "abort current page table walk - * right now." 1 means "skip the current vma." - * @mm: mm_struct representing the target process of page table walk - * @vma: vma currently walked (NULL if walking outside vmas) - * @private: private data for callbacks' usage - * - * (see the comment on walk_page_range() for more details) - */ -struct mm_walk { - int (*pud_entry)(pud_t *pud, unsigned long addr, - unsigned long next, struct mm_walk *walk); - int (*pmd_entry)(pmd_t *pmd, unsigned long addr, - unsigned long next, struct mm_walk *walk); - int (*pte_entry)(pte_t *pte, unsigned long addr, - unsigned long next, struct mm_walk *walk); - int (*pte_hole)(unsigned long addr, unsigned long next, - struct mm_walk *walk); - int (*hugetlb_entry)(pte_t *pte, unsigned long hmask, - unsigned long addr, unsigned long next, - struct mm_walk *walk); - int (*test_walk)(unsigned long addr, unsigned long next, - struct mm_walk *walk); - struct mm_struct *mm; - struct vm_area_struct *vma; - void *private; -}; - struct mmu_notifier_range; -int walk_page_range(unsigned long addr, unsigned long end, - struct mm_walk *walk); -int walk_page_vma(struct vm_area_struct *vma, struct mm_walk *walk); void free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling); int copy_page_range(struct mm_struct *dst, struct mm_struct *src, diff --git a/include/linux/pagewalk.h b/include/linux/pagewalk.h new file mode 100644 index 000000000000..df278a94086d --- /dev/null +++ b/include/linux/pagewalk.h @@ -0,0 +1,54 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_PAGEWALK_H +#define _LINUX_PAGEWALK_H + +#include + +/** + * mm_walk - callbacks for walk_page_range + * @pud_entry: if set, called for each non-empty PUD (2nd-level) entry + * this handler should only handle pud_trans_huge() puds. + * the pmd_entry or pte_entry callbacks will be used for + * regular PUDs. + * @pmd_entry: if set, called for each non-empty PMD (3rd-level) entry + * this handler is required to be able to handle + * pmd_trans_huge() pmds. They may simply choose to + * split_huge_page() instead of handling it explicitly. + * @pte_entry: if set, called for each non-empty PTE (4th-level) entry + * @pte_hole: if set, called for each hole at all levels + * @hugetlb_entry: if set, called for each hugetlb entry + * @test_walk: caller specific callback function to determine whether + * we walk over the current vma or not. Returning 0 + * value means "do page table walk over the current vma," + * and a negative one means "abort current page table walk + * right now." 1 means "skip the current vma." + * @mm: mm_struct representing the target process of page table walk + * @vma: vma currently walked (NULL if walking outside vmas) + * @private: private data for callbacks' usage + * + * (see the comment on walk_page_range() for more details) + */ +struct mm_walk { + int (*pud_entry)(pud_t *pud, unsigned long addr, + unsigned long next, struct mm_walk *walk); + int (*pmd_entry)(pmd_t *pmd, unsigned long addr, + unsigned long next, struct mm_walk *walk); + int (*pte_entry)(pte_t *pte, unsigned long addr, + unsigned long next, struct mm_walk *walk); + int (*pte_hole)(unsigned long addr, unsigned long next, + struct mm_walk *walk); + int (*hugetlb_entry)(pte_t *pte, unsigned long hmask, + unsigned long addr, unsigned long next, + struct mm_walk *walk); + int (*test_walk)(unsigned long addr, unsigned long next, + struct mm_walk *walk); + struct mm_struct *mm; + struct vm_area_struct *vma; + void *private; +}; + +int walk_page_range(unsigned long addr, unsigned long end, + struct mm_walk *walk); +int walk_page_vma(struct vm_area_struct *vma, struct mm_walk *walk); + +#endif /* _LINUX_PAGEWALK_H */ diff --git a/mm/hmm.c b/mm/hmm.c index 4882b83aeccb..26916ff6c8df 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -8,7 +8,7 @@ * Refer to include/linux/hmm.h for information about heterogeneous memory * management or HMM for short. */ -#include +#include #include #include #include diff --git a/mm/madvise.c b/mm/madvise.c index 968df3aa069f..80a78bb16782 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6f5c0c517c49..4c3af5d71ab1 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -25,7 +25,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 65e0874fce17..3a96def1e796 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -68,7 +68,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include -#include +#include #include #include #include diff --git a/mm/migrate.c b/mm/migrate.c index 962cb62c621f..c9c73a35aca7 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -38,6 +38,7 @@ #include #include #include +#include #include #include #include diff --git a/mm/mincore.c b/mm/mincore.c index 4fe91d497436..3b051b6ab3fe 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -10,7 +10,7 @@ */ #include #include -#include +#include #include #include #include diff --git a/mm/mprotect.c b/mm/mprotect.c index bf38dfbbb4b4..cc73318dbc25 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -9,7 +9,7 @@ * (C) Copyright 2002 Red Hat Inc, All Rights Reserved */ -#include +#include #include #include #include diff --git a/mm/pagewalk.c b/mm/pagewalk.c index c3084ff2569d..8a92a961a2ee 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -#include +#include #include #include #include -- cgit v1.2.3 From 7b86ac3371b70c3fd8fd95501719beb1faab719f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 28 Aug 2019 16:19:54 +0200 Subject: pagewalk: separate function pointers from iterator data The mm_walk structure currently mixed data and code. Split out the operations vectors into a new mm_walk_ops structure, and while we are changing the API also declare the mm_walk structure inside the walk_page_range and walk_page_vma functions. Based on patch from Linus Torvalds. Link: https://lore.kernel.org/r/20190828141955.22210-3-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Thomas Hellstrom Reviewed-by: Steven Price Reviewed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe --- arch/openrisc/kernel/dma.c | 22 +++--- arch/powerpc/mm/book3s64/subpage_prot.c | 10 +-- arch/s390/mm/gmap.c | 33 ++++----- fs/proc/task_mmu.c | 78 ++++++++++---------- include/linux/pagewalk.h | 64 ++++++++++------- mm/hmm.c | 23 +++--- mm/madvise.c | 41 ++++------- mm/memcontrol.c | 23 +++--- mm/mempolicy.c | 15 ++-- mm/migrate.c | 23 +++--- mm/mincore.c | 15 ++-- mm/mprotect.c | 24 +++---- mm/pagewalk.c | 124 ++++++++++++++++++-------------- 13 files changed, 251 insertions(+), 244 deletions(-) (limited to 'mm') diff --git a/arch/openrisc/kernel/dma.c b/arch/openrisc/kernel/dma.c index c7812e6effa2..4d5b8bd1d795 100644 --- a/arch/openrisc/kernel/dma.c +++ b/arch/openrisc/kernel/dma.c @@ -44,6 +44,10 @@ page_set_nocache(pte_t *pte, unsigned long addr, return 0; } +static const struct mm_walk_ops set_nocache_walk_ops = { + .pte_entry = page_set_nocache, +}; + static int page_clear_nocache(pte_t *pte, unsigned long addr, unsigned long next, struct mm_walk *walk) @@ -59,6 +63,10 @@ page_clear_nocache(pte_t *pte, unsigned long addr, return 0; } +static const struct mm_walk_ops clear_nocache_walk_ops = { + .pte_entry = page_clear_nocache, +}; + /* * Alloc "coherent" memory, which for OpenRISC means simply uncached. * @@ -81,10 +89,6 @@ arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, { unsigned long va; void *page; - struct mm_walk walk = { - .pte_entry = page_set_nocache, - .mm = &init_mm - }; page = alloc_pages_exact(size, gfp | __GFP_ZERO); if (!page) @@ -99,7 +103,8 @@ arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, * We need to iterate through the pages, clearing the dcache for * them and setting the cache-inhibit bit. */ - if (walk_page_range(va, va + size, &walk)) { + if (walk_page_range(&init_mm, va, va + size, &set_nocache_walk_ops, + NULL)) { free_pages_exact(page, size); return NULL; } @@ -112,13 +117,10 @@ arch_dma_free(struct device *dev, size_t size, void *vaddr, dma_addr_t dma_handle, unsigned long attrs) { unsigned long va = (unsigned long)vaddr; - struct mm_walk walk = { - .pte_entry = page_clear_nocache, - .mm = &init_mm - }; /* walk_page_range shouldn't be able to fail here */ - WARN_ON(walk_page_range(va, va + size, &walk)); + WARN_ON(walk_page_range(&init_mm, va, va + size, + &clear_nocache_walk_ops, NULL)); free_pages_exact(vaddr, size); } diff --git a/arch/powerpc/mm/book3s64/subpage_prot.c b/arch/powerpc/mm/book3s64/subpage_prot.c index 236f0a861ecc..2ef24a53f4c9 100644 --- a/arch/powerpc/mm/book3s64/subpage_prot.c +++ b/arch/powerpc/mm/book3s64/subpage_prot.c @@ -139,14 +139,14 @@ static int subpage_walk_pmd_entry(pmd_t *pmd, unsigned long addr, return 0; } +static const struct mm_walk_ops subpage_walk_ops = { + .pmd_entry = subpage_walk_pmd_entry, +}; + static void subpage_mark_vma_nohuge(struct mm_struct *mm, unsigned long addr, unsigned long len) { struct vm_area_struct *vma; - struct mm_walk subpage_proto_walk = { - .mm = mm, - .pmd_entry = subpage_walk_pmd_entry, - }; /* * We don't try too hard, we just mark all the vma in that range @@ -163,7 +163,7 @@ static void subpage_mark_vma_nohuge(struct mm_struct *mm, unsigned long addr, if (vma->vm_start >= (addr + len)) break; vma->vm_flags |= VM_NOHUGEPAGE; - walk_page_vma(vma, &subpage_proto_walk); + walk_page_vma(vma, &subpage_walk_ops, NULL); vma = vma->vm_next; } } diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index cf80feae970d..bd78d504fdad 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -2521,13 +2521,9 @@ static int __zap_zero_pages(pmd_t *pmd, unsigned long start, return 0; } -static inline void zap_zero_pages(struct mm_struct *mm) -{ - struct mm_walk walk = { .pmd_entry = __zap_zero_pages }; - - walk.mm = mm; - walk_page_range(0, TASK_SIZE, &walk); -} +static const struct mm_walk_ops zap_zero_walk_ops = { + .pmd_entry = __zap_zero_pages, +}; /* * switch on pgstes for its userspace process (for kvm) @@ -2546,7 +2542,7 @@ int s390_enable_sie(void) mm->context.has_pgste = 1; /* split thp mappings and disable thp for future mappings */ thp_split_mm(mm); - zap_zero_pages(mm); + walk_page_range(mm, 0, TASK_SIZE, &zap_zero_walk_ops, NULL); up_write(&mm->mmap_sem); return 0; } @@ -2589,12 +2585,13 @@ static int __s390_enable_skey_hugetlb(pte_t *pte, unsigned long addr, return 0; } +static const struct mm_walk_ops enable_skey_walk_ops = { + .hugetlb_entry = __s390_enable_skey_hugetlb, + .pte_entry = __s390_enable_skey_pte, +}; + int s390_enable_skey(void) { - struct mm_walk walk = { - .hugetlb_entry = __s390_enable_skey_hugetlb, - .pte_entry = __s390_enable_skey_pte, - }; struct mm_struct *mm = current->mm; struct vm_area_struct *vma; int rc = 0; @@ -2614,8 +2611,7 @@ int s390_enable_skey(void) } mm->def_flags &= ~VM_MERGEABLE; - walk.mm = mm; - walk_page_range(0, TASK_SIZE, &walk); + walk_page_range(mm, 0, TASK_SIZE, &enable_skey_walk_ops, NULL); out_up: up_write(&mm->mmap_sem); @@ -2633,13 +2629,14 @@ static int __s390_reset_cmma(pte_t *pte, unsigned long addr, return 0; } +static const struct mm_walk_ops reset_cmma_walk_ops = { + .pte_entry = __s390_reset_cmma, +}; + void s390_reset_cmma(struct mm_struct *mm) { - struct mm_walk walk = { .pte_entry = __s390_reset_cmma }; - down_write(&mm->mmap_sem); - walk.mm = mm; - walk_page_range(0, TASK_SIZE, &walk); + walk_page_range(mm, 0, TASK_SIZE, &reset_cmma_walk_ops, NULL); up_write(&mm->mmap_sem); } EXPORT_SYMBOL_GPL(s390_reset_cmma); diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 8857da830b86..bf43d1d60059 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -513,7 +513,9 @@ static int smaps_pte_hole(unsigned long addr, unsigned long end, return 0; } -#endif +#else +#define smaps_pte_hole NULL +#endif /* CONFIG_SHMEM */ static void smaps_pte_entry(pte_t *pte, unsigned long addr, struct mm_walk *walk) @@ -729,21 +731,24 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask, } return 0; } +#else +#define smaps_hugetlb_range NULL #endif /* HUGETLB_PAGE */ +static const struct mm_walk_ops smaps_walk_ops = { + .pmd_entry = smaps_pte_range, + .hugetlb_entry = smaps_hugetlb_range, +}; + +static const struct mm_walk_ops smaps_shmem_walk_ops = { + .pmd_entry = smaps_pte_range, + .hugetlb_entry = smaps_hugetlb_range, + .pte_hole = smaps_pte_hole, +}; + static void smap_gather_stats(struct vm_area_struct *vma, struct mem_size_stats *mss) { - struct mm_walk smaps_walk = { - .pmd_entry = smaps_pte_range, -#ifdef CONFIG_HUGETLB_PAGE - .hugetlb_entry = smaps_hugetlb_range, -#endif - .mm = vma->vm_mm, - }; - - smaps_walk.private = mss; - #ifdef CONFIG_SHMEM /* In case of smaps_rollup, reset the value from previous vma */ mss->check_shmem_swap = false; @@ -765,12 +770,13 @@ static void smap_gather_stats(struct vm_area_struct *vma, mss->swap += shmem_swapped; } else { mss->check_shmem_swap = true; - smaps_walk.pte_hole = smaps_pte_hole; + walk_page_vma(vma, &smaps_shmem_walk_ops, mss); + return; } } #endif /* mmap_sem is held in m_start */ - walk_page_vma(vma, &smaps_walk); + walk_page_vma(vma, &smaps_walk_ops, mss); } #define SEQ_PUT_DEC(str, val) \ @@ -1118,6 +1124,11 @@ static int clear_refs_test_walk(unsigned long start, unsigned long end, return 0; } +static const struct mm_walk_ops clear_refs_walk_ops = { + .pmd_entry = clear_refs_pte_range, + .test_walk = clear_refs_test_walk, +}; + static ssize_t clear_refs_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { @@ -1151,12 +1162,6 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, struct clear_refs_private cp = { .type = type, }; - struct mm_walk clear_refs_walk = { - .pmd_entry = clear_refs_pte_range, - .test_walk = clear_refs_test_walk, - .mm = mm, - .private = &cp, - }; if (type == CLEAR_REFS_MM_HIWATER_RSS) { if (down_write_killable(&mm->mmap_sem)) { @@ -1217,7 +1222,8 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, 0, NULL, mm, 0, -1UL); mmu_notifier_invalidate_range_start(&range); } - walk_page_range(0, mm->highest_vm_end, &clear_refs_walk); + walk_page_range(mm, 0, mm->highest_vm_end, &clear_refs_walk_ops, + &cp); if (type == CLEAR_REFS_SOFT_DIRTY) mmu_notifier_invalidate_range_end(&range); tlb_finish_mmu(&tlb, 0, -1); @@ -1489,8 +1495,16 @@ static int pagemap_hugetlb_range(pte_t *ptep, unsigned long hmask, return err; } +#else +#define pagemap_hugetlb_range NULL #endif /* HUGETLB_PAGE */ +static const struct mm_walk_ops pagemap_ops = { + .pmd_entry = pagemap_pmd_range, + .pte_hole = pagemap_pte_hole, + .hugetlb_entry = pagemap_hugetlb_range, +}; + /* * /proc/pid/pagemap - an array mapping virtual pages to pfns * @@ -1522,7 +1536,6 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, { struct mm_struct *mm = file->private_data; struct pagemapread pm; - struct mm_walk pagemap_walk = {}; unsigned long src; unsigned long svpfn; unsigned long start_vaddr; @@ -1550,14 +1563,6 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, if (!pm.buffer) goto out_mm; - pagemap_walk.pmd_entry = pagemap_pmd_range; - pagemap_walk.pte_hole = pagemap_pte_hole; -#ifdef CONFIG_HUGETLB_PAGE - pagemap_walk.hugetlb_entry = pagemap_hugetlb_range; -#endif - pagemap_walk.mm = mm; - pagemap_walk.private = ± - src = *ppos; svpfn = src / PM_ENTRY_BYTES; start_vaddr = svpfn << PAGE_SHIFT; @@ -1586,7 +1591,7 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, ret = down_read_killable(&mm->mmap_sem); if (ret) goto out_free; - ret = walk_page_range(start_vaddr, end, &pagemap_walk); + ret = walk_page_range(mm, start_vaddr, end, &pagemap_ops, &pm); up_read(&mm->mmap_sem); start_vaddr = end; @@ -1798,6 +1803,11 @@ static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask, } #endif +static const struct mm_walk_ops show_numa_ops = { + .hugetlb_entry = gather_hugetlb_stats, + .pmd_entry = gather_pte_stats, +}; + /* * Display pages allocated per node and memory policy via /proc. */ @@ -1809,12 +1819,6 @@ static int show_numa_map(struct seq_file *m, void *v) struct numa_maps *md = &numa_priv->md; struct file *file = vma->vm_file; struct mm_struct *mm = vma->vm_mm; - struct mm_walk walk = { - .hugetlb_entry = gather_hugetlb_stats, - .pmd_entry = gather_pte_stats, - .private = md, - .mm = mm, - }; struct mempolicy *pol; char buffer[64]; int nid; @@ -1848,7 +1852,7 @@ static int show_numa_map(struct seq_file *m, void *v) seq_puts(m, " huge"); /* mmap_sem is held by m_start */ - walk_page_vma(vma, &walk); + walk_page_vma(vma, &show_numa_ops, md); if (!md->pages) goto out; diff --git a/include/linux/pagewalk.h b/include/linux/pagewalk.h index df278a94086d..bddd9759bab9 100644 --- a/include/linux/pagewalk.h +++ b/include/linux/pagewalk.h @@ -4,31 +4,28 @@ #include +struct mm_walk; + /** - * mm_walk - callbacks for walk_page_range - * @pud_entry: if set, called for each non-empty PUD (2nd-level) entry - * this handler should only handle pud_trans_huge() puds. - * the pmd_entry or pte_entry callbacks will be used for - * regular PUDs. - * @pmd_entry: if set, called for each non-empty PMD (3rd-level) entry - * this handler is required to be able to handle - * pmd_trans_huge() pmds. They may simply choose to - * split_huge_page() instead of handling it explicitly. - * @pte_entry: if set, called for each non-empty PTE (4th-level) entry - * @pte_hole: if set, called for each hole at all levels - * @hugetlb_entry: if set, called for each hugetlb entry - * @test_walk: caller specific callback function to determine whether - * we walk over the current vma or not. Returning 0 - * value means "do page table walk over the current vma," - * and a negative one means "abort current page table walk - * right now." 1 means "skip the current vma." - * @mm: mm_struct representing the target process of page table walk - * @vma: vma currently walked (NULL if walking outside vmas) - * @private: private data for callbacks' usage - * - * (see the comment on walk_page_range() for more details) + * mm_walk_ops - callbacks for walk_page_range + * @pud_entry: if set, called for each non-empty PUD (2nd-level) entry + * this handler should only handle pud_trans_huge() puds. + * the pmd_entry or pte_entry callbacks will be used for + * regular PUDs. + * @pmd_entry: if set, called for each non-empty PMD (3rd-level) entry + * this handler is required to be able to handle + * pmd_trans_huge() pmds. They may simply choose to + * split_huge_page() instead of handling it explicitly. + * @pte_entry: if set, called for each non-empty PTE (4th-level) entry + * @pte_hole: if set, called for each hole at all levels + * @hugetlb_entry: if set, called for each hugetlb entry + * @test_walk: caller specific callback function to determine whether + * we walk over the current vma or not. Returning 0 means + * "do page table walk over the current vma", returning + * a negative value means "abort current page table walk + * right now" and returning 1 means "skip the current vma" */ -struct mm_walk { +struct mm_walk_ops { int (*pud_entry)(pud_t *pud, unsigned long addr, unsigned long next, struct mm_walk *walk); int (*pmd_entry)(pmd_t *pmd, unsigned long addr, @@ -42,13 +39,28 @@ struct mm_walk { struct mm_walk *walk); int (*test_walk)(unsigned long addr, unsigned long next, struct mm_walk *walk); +}; + +/** + * mm_walk - walk_page_range data + * @ops: operation to call during the walk + * @mm: mm_struct representing the target process of page table walk + * @vma: vma currently walked (NULL if walking outside vmas) + * @private: private data for callbacks' usage + * + * (see the comment on walk_page_range() for more details) + */ +struct mm_walk { + const struct mm_walk_ops *ops; struct mm_struct *mm; struct vm_area_struct *vma; void *private; }; -int walk_page_range(unsigned long addr, unsigned long end, - struct mm_walk *walk); -int walk_page_vma(struct vm_area_struct *vma, struct mm_walk *walk); +int walk_page_range(struct mm_struct *mm, unsigned long start, + unsigned long end, const struct mm_walk_ops *ops, + void *private); +int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops, + void *private); #endif /* _LINUX_PAGEWALK_H */ diff --git a/mm/hmm.c b/mm/hmm.c index 26916ff6c8df..902f5fa6bf93 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -852,6 +852,13 @@ void hmm_range_unregister(struct hmm_range *range) } EXPORT_SYMBOL(hmm_range_unregister); +static const struct mm_walk_ops hmm_walk_ops = { + .pud_entry = hmm_vma_walk_pud, + .pmd_entry = hmm_vma_walk_pmd, + .pte_hole = hmm_vma_walk_hole, + .hugetlb_entry = hmm_vma_walk_hugetlb_entry, +}; + /** * hmm_range_fault - try to fault some address in a virtual address range * @range: range being faulted @@ -887,7 +894,6 @@ long hmm_range_fault(struct hmm_range *range, unsigned int flags) struct hmm_vma_walk hmm_vma_walk; struct hmm *hmm = range->hmm; struct vm_area_struct *vma; - struct mm_walk mm_walk; int ret; lockdep_assert_held(&hmm->mmu_notifier.mm->mmap_sem); @@ -916,21 +922,14 @@ long hmm_range_fault(struct hmm_range *range, unsigned int flags) hmm_vma_walk.last = start; hmm_vma_walk.flags = flags; hmm_vma_walk.range = range; - mm_walk.private = &hmm_vma_walk; end = min(range->end, vma->vm_end); - mm_walk.vma = vma; - mm_walk.mm = vma->vm_mm; - mm_walk.pte_entry = NULL; - mm_walk.test_walk = NULL; - mm_walk.hugetlb_entry = NULL; - mm_walk.pud_entry = hmm_vma_walk_pud; - mm_walk.pmd_entry = hmm_vma_walk_pmd; - mm_walk.pte_hole = hmm_vma_walk_hole; - mm_walk.hugetlb_entry = hmm_vma_walk_hugetlb_entry; + walk_page_range(vma->vm_mm, start, end, &hmm_walk_ops, + &hmm_vma_walk); do { - ret = walk_page_range(start, end, &mm_walk); + ret = walk_page_range(vma->vm_mm, start, end, + &hmm_walk_ops, &hmm_vma_walk); start = hmm_vma_walk.last; /* Keep trying while the range is valid. */ diff --git a/mm/madvise.c b/mm/madvise.c index 80a78bb16782..afe2b015ea58 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -226,19 +226,9 @@ static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start, return 0; } -static void force_swapin_readahead(struct vm_area_struct *vma, - unsigned long start, unsigned long end) -{ - struct mm_walk walk = { - .mm = vma->vm_mm, - .pmd_entry = swapin_walk_pmd_entry, - .private = vma, - }; - - walk_page_range(start, end, &walk); - - lru_add_drain(); /* Push any new pages onto the LRU now */ -} +static const struct mm_walk_ops swapin_walk_ops = { + .pmd_entry = swapin_walk_pmd_entry, +}; static void force_shm_swapin_readahead(struct vm_area_struct *vma, unsigned long start, unsigned long end, @@ -280,7 +270,8 @@ static long madvise_willneed(struct vm_area_struct *vma, *prev = vma; #ifdef CONFIG_SWAP if (!file) { - force_swapin_readahead(vma, start, end); + walk_page_range(vma->vm_mm, start, end, &swapin_walk_ops, vma); + lru_add_drain(); /* Push any new pages onto the LRU now */ return 0; } @@ -441,20 +432,9 @@ next: return 0; } -static void madvise_free_page_range(struct mmu_gather *tlb, - struct vm_area_struct *vma, - unsigned long addr, unsigned long end) -{ - struct mm_walk free_walk = { - .pmd_entry = madvise_free_pte_range, - .mm = vma->vm_mm, - .private = tlb, - }; - - tlb_start_vma(tlb, vma); - walk_page_range(addr, end, &free_walk); - tlb_end_vma(tlb, vma); -} +static const struct mm_walk_ops madvise_free_walk_ops = { + .pmd_entry = madvise_free_pte_range, +}; static int madvise_free_single_vma(struct vm_area_struct *vma, unsigned long start_addr, unsigned long end_addr) @@ -481,7 +461,10 @@ static int madvise_free_single_vma(struct vm_area_struct *vma, update_hiwater_rss(mm); mmu_notifier_invalidate_range_start(&range); - madvise_free_page_range(&tlb, vma, range.start, range.end); + tlb_start_vma(&tlb, vma); + walk_page_range(vma->vm_mm, range.start, range.end, + &madvise_free_walk_ops, &tlb); + tlb_end_vma(&tlb, vma); mmu_notifier_invalidate_range_end(&range); tlb_finish_mmu(&tlb, range.start, range.end); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 4c3af5d71ab1..9b2516a76be2 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5283,17 +5283,16 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, return 0; } +static const struct mm_walk_ops precharge_walk_ops = { + .pmd_entry = mem_cgroup_count_precharge_pte_range, +}; + static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) { unsigned long precharge; - struct mm_walk mem_cgroup_count_precharge_walk = { - .pmd_entry = mem_cgroup_count_precharge_pte_range, - .mm = mm, - }; down_read(&mm->mmap_sem); - walk_page_range(0, mm->highest_vm_end, - &mem_cgroup_count_precharge_walk); + walk_page_range(mm, 0, mm->highest_vm_end, &precharge_walk_ops, NULL); up_read(&mm->mmap_sem); precharge = mc.precharge; @@ -5562,13 +5561,12 @@ put: /* get_mctgt_type() gets the page */ return ret; } +static const struct mm_walk_ops charge_walk_ops = { + .pmd_entry = mem_cgroup_move_charge_pte_range, +}; + static void mem_cgroup_move_charge(void) { - struct mm_walk mem_cgroup_move_charge_walk = { - .pmd_entry = mem_cgroup_move_charge_pte_range, - .mm = mc.mm, - }; - lru_add_drain_all(); /* * Signal lock_page_memcg() to take the memcg's move_lock @@ -5594,7 +5592,8 @@ retry: * When we have consumed all precharges and failed in doing * additional charge, the page walk just aborts. */ - walk_page_range(0, mc.mm->highest_vm_end, &mem_cgroup_move_charge_walk); + walk_page_range(mc.mm, 0, mc.mm->highest_vm_end, &charge_walk_ops, + NULL); up_read(&mc.mm->mmap_sem); atomic_dec(&mc.from->moving_account); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 3a96def1e796..f000771558d8 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -655,6 +655,12 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end, return 1; } +static const struct mm_walk_ops queue_pages_walk_ops = { + .hugetlb_entry = queue_pages_hugetlb, + .pmd_entry = queue_pages_pte_range, + .test_walk = queue_pages_test_walk, +}; + /* * Walk through page tables and collect pages to be migrated. * @@ -679,15 +685,8 @@ queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end, .nmask = nodes, .prev = NULL, }; - struct mm_walk queue_pages_walk = { - .hugetlb_entry = queue_pages_hugetlb, - .pmd_entry = queue_pages_pte_range, - .test_walk = queue_pages_test_walk, - .mm = mm, - .private = &qp, - }; - return walk_page_range(start, end, &queue_pages_walk); + return walk_page_range(mm, start, end, &queue_pages_walk_ops, &qp); } /* diff --git a/mm/migrate.c b/mm/migrate.c index c9c73a35aca7..9f4ed4e985c1 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -2320,6 +2320,11 @@ next: return 0; } +static const struct mm_walk_ops migrate_vma_walk_ops = { + .pmd_entry = migrate_vma_collect_pmd, + .pte_hole = migrate_vma_collect_hole, +}; + /* * migrate_vma_collect() - collect pages over a range of virtual addresses * @migrate: migrate struct containing all migration information @@ -2331,21 +2336,15 @@ next: static void migrate_vma_collect(struct migrate_vma *migrate) { struct mmu_notifier_range range; - struct mm_walk mm_walk = { - .pmd_entry = migrate_vma_collect_pmd, - .pte_hole = migrate_vma_collect_hole, - .vma = migrate->vma, - .mm = migrate->vma->vm_mm, - .private = migrate, - }; - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, NULL, mm_walk.mm, - migrate->start, - migrate->end); + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, NULL, + migrate->vma->vm_mm, migrate->start, migrate->end); mmu_notifier_invalidate_range_start(&range); - walk_page_range(migrate->start, migrate->end, &mm_walk); - mmu_notifier_invalidate_range_end(&range); + walk_page_range(migrate->vma->vm_mm, migrate->start, migrate->end, + &migrate_vma_walk_ops, migrate); + + mmu_notifier_invalidate_range_end(&range); migrate->end = migrate->start + (migrate->npages << PAGE_SHIFT); } diff --git a/mm/mincore.c b/mm/mincore.c index 3b051b6ab3fe..f9a9dbe8cd33 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -193,6 +193,12 @@ static inline bool can_do_mincore(struct vm_area_struct *vma) inode_permission(file_inode(vma->vm_file), MAY_WRITE) == 0; } +static const struct mm_walk_ops mincore_walk_ops = { + .pmd_entry = mincore_pte_range, + .pte_hole = mincore_unmapped_range, + .hugetlb_entry = mincore_hugetlb, +}; + /* * Do a chunk of "sys_mincore()". We've already checked * all the arguments, we hold the mmap semaphore: we should @@ -203,12 +209,6 @@ static long do_mincore(unsigned long addr, unsigned long pages, unsigned char *v struct vm_area_struct *vma; unsigned long end; int err; - struct mm_walk mincore_walk = { - .pmd_entry = mincore_pte_range, - .pte_hole = mincore_unmapped_range, - .hugetlb_entry = mincore_hugetlb, - .private = vec, - }; vma = find_vma(current->mm, addr); if (!vma || addr < vma->vm_start) @@ -219,8 +219,7 @@ static long do_mincore(unsigned long addr, unsigned long pages, unsigned char *v memset(vec, 1, pages); return pages; } - mincore_walk.mm = vma->vm_mm; - err = walk_page_range(addr, end, &mincore_walk); + err = walk_page_range(vma->vm_mm, addr, end, &mincore_walk_ops, vec); if (err < 0) return err; return (end - addr) >> PAGE_SHIFT; diff --git a/mm/mprotect.c b/mm/mprotect.c index cc73318dbc25..675e5d34a507 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -329,20 +329,11 @@ static int prot_none_test(unsigned long addr, unsigned long next, return 0; } -static int prot_none_walk(struct vm_area_struct *vma, unsigned long start, - unsigned long end, unsigned long newflags) -{ - pgprot_t new_pgprot = vm_get_page_prot(newflags); - struct mm_walk prot_none_walk = { - .pte_entry = prot_none_pte_entry, - .hugetlb_entry = prot_none_hugetlb_entry, - .test_walk = prot_none_test, - .mm = current->mm, - .private = &new_pgprot, - }; - - return walk_page_range(start, end, &prot_none_walk); -} +static const struct mm_walk_ops prot_none_walk_ops = { + .pte_entry = prot_none_pte_entry, + .hugetlb_entry = prot_none_hugetlb_entry, + .test_walk = prot_none_test, +}; int mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, @@ -369,7 +360,10 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, if (arch_has_pfn_modify_check() && (vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) && (newflags & (VM_READ|VM_WRITE|VM_EXEC)) == 0) { - error = prot_none_walk(vma, start, end, newflags); + pgprot_t new_pgprot = vm_get_page_prot(newflags); + + error = walk_page_range(current->mm, start, end, + &prot_none_walk_ops, &new_pgprot); if (error) return error; } diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 8a92a961a2ee..b8762b673a3d 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -9,10 +9,11 @@ static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, { pte_t *pte; int err = 0; + const struct mm_walk_ops *ops = walk->ops; pte = pte_offset_map(pmd, addr); for (;;) { - err = walk->pte_entry(pte, addr, addr + PAGE_SIZE, walk); + err = ops->pte_entry(pte, addr, addr + PAGE_SIZE, walk); if (err) break; addr += PAGE_SIZE; @@ -30,6 +31,7 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, { pmd_t *pmd; unsigned long next; + const struct mm_walk_ops *ops = walk->ops; int err = 0; pmd = pmd_offset(pud, addr); @@ -37,8 +39,8 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, again: next = pmd_addr_end(addr, end); if (pmd_none(*pmd) || !walk->vma) { - if (walk->pte_hole) - err = walk->pte_hole(addr, next, walk); + if (ops->pte_hole) + err = ops->pte_hole(addr, next, walk); if (err) break; continue; @@ -47,8 +49,8 @@ again: * This implies that each ->pmd_entry() handler * needs to know about pmd_trans_huge() pmds */ - if (walk->pmd_entry) - err = walk->pmd_entry(pmd, addr, next, walk); + if (ops->pmd_entry) + err = ops->pmd_entry(pmd, addr, next, walk); if (err) break; @@ -56,7 +58,7 @@ again: * Check this here so we only break down trans_huge * pages when we _need_ to */ - if (!walk->pte_entry) + if (!ops->pte_entry) continue; split_huge_pmd(walk->vma, pmd, addr); @@ -75,6 +77,7 @@ static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end, { pud_t *pud; unsigned long next; + const struct mm_walk_ops *ops = walk->ops; int err = 0; pud = pud_offset(p4d, addr); @@ -82,18 +85,18 @@ static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end, again: next = pud_addr_end(addr, end); if (pud_none(*pud) || !walk->vma) { - if (walk->pte_hole) - err = walk->pte_hole(addr, next, walk); + if (ops->pte_hole) + err = ops->pte_hole(addr, next, walk); if (err) break; continue; } - if (walk->pud_entry) { + if (ops->pud_entry) { spinlock_t *ptl = pud_trans_huge_lock(pud, walk->vma); if (ptl) { - err = walk->pud_entry(pud, addr, next, walk); + err = ops->pud_entry(pud, addr, next, walk); spin_unlock(ptl); if (err) break; @@ -105,7 +108,7 @@ static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end, if (pud_none(*pud)) goto again; - if (walk->pmd_entry || walk->pte_entry) + if (ops->pmd_entry || ops->pte_entry) err = walk_pmd_range(pud, addr, next, walk); if (err) break; @@ -119,19 +122,20 @@ static int walk_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end, { p4d_t *p4d; unsigned long next; + const struct mm_walk_ops *ops = walk->ops; int err = 0; p4d = p4d_offset(pgd, addr); do { next = p4d_addr_end(addr, end); if (p4d_none_or_clear_bad(p4d)) { - if (walk->pte_hole) - err = walk->pte_hole(addr, next, walk); + if (ops->pte_hole) + err = ops->pte_hole(addr, next, walk); if (err) break; continue; } - if (walk->pmd_entry || walk->pte_entry) + if (ops->pmd_entry || ops->pte_entry) err = walk_pud_range(p4d, addr, next, walk); if (err) break; @@ -145,19 +149,20 @@ static int walk_pgd_range(unsigned long addr, unsigned long end, { pgd_t *pgd; unsigned long next; + const struct mm_walk_ops *ops = walk->ops; int err = 0; pgd = pgd_offset(walk->mm, addr); do { next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) { - if (walk->pte_hole) - err = walk->pte_hole(addr, next, walk); + if (ops->pte_hole) + err = ops->pte_hole(addr, next, walk); if (err) break; continue; } - if (walk->pmd_entry || walk->pte_entry) + if (ops->pmd_entry || ops->pte_entry) err = walk_p4d_range(pgd, addr, next, walk); if (err) break; @@ -183,6 +188,7 @@ static int walk_hugetlb_range(unsigned long addr, unsigned long end, unsigned long hmask = huge_page_mask(h); unsigned long sz = huge_page_size(h); pte_t *pte; + const struct mm_walk_ops *ops = walk->ops; int err = 0; do { @@ -190,9 +196,9 @@ static int walk_hugetlb_range(unsigned long addr, unsigned long end, pte = huge_pte_offset(walk->mm, addr & hmask, sz); if (pte) - err = walk->hugetlb_entry(pte, hmask, addr, next, walk); - else if (walk->pte_hole) - err = walk->pte_hole(addr, next, walk); + err = ops->hugetlb_entry(pte, hmask, addr, next, walk); + else if (ops->pte_hole) + err = ops->pte_hole(addr, next, walk); if (err) break; @@ -220,9 +226,10 @@ static int walk_page_test(unsigned long start, unsigned long end, struct mm_walk *walk) { struct vm_area_struct *vma = walk->vma; + const struct mm_walk_ops *ops = walk->ops; - if (walk->test_walk) - return walk->test_walk(start, end, walk); + if (ops->test_walk) + return ops->test_walk(start, end, walk); /* * vma(VM_PFNMAP) doesn't have any valid struct pages behind VM_PFNMAP @@ -234,8 +241,8 @@ static int walk_page_test(unsigned long start, unsigned long end, */ if (vma->vm_flags & VM_PFNMAP) { int err = 1; - if (walk->pte_hole) - err = walk->pte_hole(start, end, walk); + if (ops->pte_hole) + err = ops->pte_hole(start, end, walk); return err ? err : 1; } return 0; @@ -248,7 +255,7 @@ static int __walk_page_range(unsigned long start, unsigned long end, struct vm_area_struct *vma = walk->vma; if (vma && is_vm_hugetlb_page(vma)) { - if (walk->hugetlb_entry) + if (walk->ops->hugetlb_entry) err = walk_hugetlb_range(start, end, walk); } else err = walk_pgd_range(start, end, walk); @@ -258,11 +265,13 @@ static int __walk_page_range(unsigned long start, unsigned long end, /** * walk_page_range - walk page table with caller specific callbacks - * @start: start address of the virtual address range - * @end: end address of the virtual address range - * @walk: mm_walk structure defining the callbacks and the target address space + * @mm: mm_struct representing the target process of page table walk + * @start: start address of the virtual address range + * @end: end address of the virtual address range + * @ops: operation to call during the walk + * @private: private data for callbacks' usage * - * Recursively walk the page table tree of the process represented by @walk->mm + * Recursively walk the page table tree of the process represented by @mm * within the virtual address range [@start, @end). During walking, we can do * some caller-specific works for each entry, by setting up pmd_entry(), * pte_entry(), and/or hugetlb_entry(). If you don't set up for some of these @@ -278,47 +287,52 @@ static int __walk_page_range(unsigned long start, unsigned long end, * * Before starting to walk page table, some callers want to check whether * they really want to walk over the current vma, typically by checking - * its vm_flags. walk_page_test() and @walk->test_walk() are used for this + * its vm_flags. walk_page_test() and @ops->test_walk() are used for this * purpose. * * struct mm_walk keeps current values of some common data like vma and pmd, * which are useful for the access from callbacks. If you want to pass some - * caller-specific data to callbacks, @walk->private should be helpful. + * caller-specific data to callbacks, @private should be helpful. * * Locking: - * Callers of walk_page_range() and walk_page_vma() should hold - * @walk->mm->mmap_sem, because these function traverse vma list and/or - * access to vma's data. + * Callers of walk_page_range() and walk_page_vma() should hold @mm->mmap_sem, + * because these function traverse vma list and/or access to vma's data. */ -int walk_page_range(unsigned long start, unsigned long end, - struct mm_walk *walk) +int walk_page_range(struct mm_struct *mm, unsigned long start, + unsigned long end, const struct mm_walk_ops *ops, + void *private) { int err = 0; unsigned long next; struct vm_area_struct *vma; + struct mm_walk walk = { + .ops = ops, + .mm = mm, + .private = private, + }; if (start >= end) return -EINVAL; - if (!walk->mm) + if (!walk.mm) return -EINVAL; - VM_BUG_ON_MM(!rwsem_is_locked(&walk->mm->mmap_sem), walk->mm); + VM_BUG_ON_MM(!rwsem_is_locked(&walk.mm->mmap_sem), walk.mm); - vma = find_vma(walk->mm, start); + vma = find_vma(walk.mm, start); do { if (!vma) { /* after the last vma */ - walk->vma = NULL; + walk.vma = NULL; next = end; } else if (start < vma->vm_start) { /* outside vma */ - walk->vma = NULL; + walk.vma = NULL; next = min(end, vma->vm_start); } else { /* inside vma */ - walk->vma = vma; + walk.vma = vma; next = min(end, vma->vm_end); vma = vma->vm_next; - err = walk_page_test(start, next, walk); + err = walk_page_test(start, next, &walk); if (err > 0) { /* * positive return values are purely for @@ -331,28 +345,34 @@ int walk_page_range(unsigned long start, unsigned long end, if (err < 0) break; } - if (walk->vma || walk->pte_hole) - err = __walk_page_range(start, next, walk); + if (walk.vma || walk.ops->pte_hole) + err = __walk_page_range(start, next, &walk); if (err) break; } while (start = next, start < end); return err; } -int walk_page_vma(struct vm_area_struct *vma, struct mm_walk *walk) +int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops, + void *private) { + struct mm_walk walk = { + .ops = ops, + .mm = vma->vm_mm, + .vma = vma, + .private = private, + }; int err; - if (!walk->mm) + if (!walk.mm) return -EINVAL; - VM_BUG_ON(!rwsem_is_locked(&walk->mm->mmap_sem)); - VM_BUG_ON(!vma); - walk->vma = vma; - err = walk_page_test(vma->vm_start, vma->vm_end, walk); + VM_BUG_ON(!rwsem_is_locked(&vma->vm_mm->mmap_sem)); + + err = walk_page_test(vma->vm_start, vma->vm_end, &walk); if (err > 0) return 0; if (err < 0) return err; - return __walk_page_range(vma->vm_start, vma->vm_end, walk); + return __walk_page_range(vma->vm_start, vma->vm_end, &walk); } -- cgit v1.2.3 From b4bc7817b2bcc2432db6de99188c85ef16314c93 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 28 Aug 2019 16:19:55 +0200 Subject: pagewalk: use lockdep_assert_held for locking validation Use lockdep to check for held locks instead of using home grown asserts. Link: https://lore.kernel.org/r/20190828141955.22210-4-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Thomas Hellstrom Reviewed-by: Steven Price Reviewed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe --- mm/pagewalk.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/pagewalk.c b/mm/pagewalk.c index b8762b673a3d..d48c2a986ea3 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -317,7 +317,7 @@ int walk_page_range(struct mm_struct *mm, unsigned long start, if (!walk.mm) return -EINVAL; - VM_BUG_ON_MM(!rwsem_is_locked(&walk.mm->mmap_sem), walk.mm); + lockdep_assert_held(&walk.mm->mmap_sem); vma = find_vma(walk.mm, start); do { @@ -367,7 +367,7 @@ int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops, if (!walk.mm) return -EINVAL; - VM_BUG_ON(!rwsem_is_locked(&vma->vm_mm->mmap_sem)); + lockdep_assert_held(&walk.mm->mmap_sem); err = walk_page_test(vma->vm_start, vma->vm_end, &walk); if (err > 0) -- cgit v1.2.3 From ba170f76b69d1d45a60eaa9ec920c8fddd4c16f3 Mon Sep 17 00:00:00 2001 From: Daniel Vetter Date: Mon, 26 Aug 2019 22:14:24 +0200 Subject: mm, notifier: Catch sleeping/blocking for !blockable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We need to make sure implementations don't cheat and don't have a possible schedule/blocking point deeply burried where review can't catch it. I'm not sure whether this is the best way to make sure all the might_sleep() callsites trigger, and it's a bit ugly in the code flow. But it gets the job done. Inspired by an i915 patch series which did exactly that, because the rules haven't been entirely clear to us. Link: https://lore.kernel.org/r/20190826201425.17547-5-daniel.vetter@ffwll.ch Reviewed-by: Christian König (v1) Reviewed-by: Jérôme Glisse (v4) Signed-off-by: Daniel Vetter Reviewed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe --- mm/mmu_notifier.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 3f39fb1402db..7fde88695f35 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -168,7 +168,13 @@ int __mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range) id = srcu_read_lock(&srcu); hlist_for_each_entry_rcu(mn, &range->mm->mmu_notifier_mm->list, hlist) { if (mn->ops->invalidate_range_start) { - int _ret = mn->ops->invalidate_range_start(mn, range); + int _ret; + + if (!mmu_notifier_range_blockable(range)) + non_block_start(); + _ret = mn->ops->invalidate_range_start(mn, range); + if (!mmu_notifier_range_blockable(range)) + non_block_end(); if (_ret) { pr_info("%pS callback failed with %d in %sblockable context.\n", mn->ops->invalidate_range_start, _ret, @@ -210,8 +216,13 @@ void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range, mn->ops->invalidate_range(mn, range->mm, range->start, range->end); - if (mn->ops->invalidate_range_end) + if (mn->ops->invalidate_range_end) { + if (!mmu_notifier_range_blockable(range)) + non_block_start(); mn->ops->invalidate_range_end(mn, range); + if (!mmu_notifier_range_blockable(range)) + non_block_end(); + } } srcu_read_unlock(&srcu, id); lock_map_release(&__mmu_notifier_invalidate_range_start_map); -- cgit v1.2.3