diff options
Diffstat (limited to 'mm/gup.c')
-rw-r--r-- | mm/gup.c | 393 |
1 files changed, 287 insertions, 106 deletions
@@ -28,6 +28,111 @@ struct follow_page_context { unsigned int page_mask; }; +typedef int (*set_dirty_func_t)(struct page *page); + +static void __put_user_pages_dirty(struct page **pages, + unsigned long npages, + set_dirty_func_t sdf) +{ + unsigned long index; + + for (index = 0; index < npages; index++) { + struct page *page = compound_head(pages[index]); + + /* + * Checking PageDirty at this point may race with + * clear_page_dirty_for_io(), but that's OK. Two key cases: + * + * 1) This code sees the page as already dirty, so it skips + * the call to sdf(). That could happen because + * clear_page_dirty_for_io() called page_mkclean(), + * followed by set_page_dirty(). However, now the page is + * going to get written back, which meets the original + * intention of setting it dirty, so all is well: + * clear_page_dirty_for_io() goes on to call + * TestClearPageDirty(), and write the page back. + * + * 2) This code sees the page as clean, so it calls sdf(). + * The page stays dirty, despite being written back, so it + * gets written back again in the next writeback cycle. + * This is harmless. + */ + if (!PageDirty(page)) + sdf(page); + + put_user_page(page); + } +} + +/** + * put_user_pages_dirty() - release and dirty an array of gup-pinned pages + * @pages: array of pages to be marked dirty and released. + * @npages: number of pages in the @pages array. + * + * "gup-pinned page" refers to a page that has had one of the get_user_pages() + * variants called on that page. + * + * For each page in the @pages array, make that page (or its head page, if a + * compound page) dirty, if it was previously listed as clean. Then, release + * the page using put_user_page(). + * + * Please see the put_user_page() documentation for details. + * + * set_page_dirty(), which does not lock the page, is used here. + * Therefore, it is the caller's responsibility to ensure that this is + * safe. If not, then put_user_pages_dirty_lock() should be called instead. + * + */ +void put_user_pages_dirty(struct page **pages, unsigned long npages) +{ + __put_user_pages_dirty(pages, npages, set_page_dirty); +} +EXPORT_SYMBOL(put_user_pages_dirty); + +/** + * put_user_pages_dirty_lock() - release and dirty an array of gup-pinned pages + * @pages: array of pages to be marked dirty and released. + * @npages: number of pages in the @pages array. + * + * For each page in the @pages array, make that page (or its head page, if a + * compound page) dirty, if it was previously listed as clean. Then, release + * the page using put_user_page(). + * + * Please see the put_user_page() documentation for details. + * + * This is just like put_user_pages_dirty(), except that it invokes + * set_page_dirty_lock(), instead of set_page_dirty(). + * + */ +void put_user_pages_dirty_lock(struct page **pages, unsigned long npages) +{ + __put_user_pages_dirty(pages, npages, set_page_dirty_lock); +} +EXPORT_SYMBOL(put_user_pages_dirty_lock); + +/** + * put_user_pages() - release an array of gup-pinned pages. + * @pages: array of pages to be marked dirty and released. + * @npages: number of pages in the @pages array. + * + * For each page in the @pages array, release the page using put_user_page(). + * + * Please see the put_user_page() documentation for details. + */ +void put_user_pages(struct page **pages, unsigned long npages) +{ + unsigned long index; + + /* + * TODO: this can be optimized for huge pages: if a series of pages is + * physically contiguous and part of the same compound page, then a + * single operation to the head page should suffice. + */ + for (index = 0; index < npages; index++) + put_user_page(pages[index]); +} +EXPORT_SYMBOL(put_user_pages); + static struct page *no_page_table(struct vm_area_struct *vma, unsigned int flags) { @@ -1018,6 +1123,15 @@ long get_user_pages_locked(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, int *locked) { + /* + * FIXME: Current FOLL_LONGTERM behavior is incompatible with + * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on + * vmas. As there are no users of this flag in this call we simply + * disallow this option for now. + */ + if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM)) + return -EINVAL; + return __get_user_pages_locked(current, current->mm, start, nr_pages, pages, NULL, locked, gup_flags | FOLL_TOUCH); @@ -1046,6 +1160,15 @@ long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, int locked = 1; long ret; + /* + * FIXME: Current FOLL_LONGTERM behavior is incompatible with + * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on + * vmas. As there are no users of this flag in this call we simply + * disallow this option for now. + */ + if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM)) + return -EINVAL; + down_read(&mm->mmap_sem); ret = __get_user_pages_locked(current, mm, start, nr_pages, pages, NULL, &locked, gup_flags | FOLL_TOUCH); @@ -1116,32 +1239,22 @@ long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, unsigned int gup_flags, struct page **pages, struct vm_area_struct **vmas, int *locked) { + /* + * FIXME: Current FOLL_LONGTERM behavior is incompatible with + * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on + * vmas. As there are no users of this flag in this call we simply + * disallow this option for now. + */ + if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM)) + return -EINVAL; + return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas, locked, gup_flags | FOLL_TOUCH | FOLL_REMOTE); } EXPORT_SYMBOL(get_user_pages_remote); -/* - * This is the same as get_user_pages_remote(), just with a - * less-flexible calling convention where we assume that the task - * and mm being operated on are the current task's and don't allow - * passing of a locked parameter. We also obviously don't pass - * FOLL_REMOTE in here. - */ -long get_user_pages(unsigned long start, unsigned long nr_pages, - unsigned int gup_flags, struct page **pages, - struct vm_area_struct **vmas) -{ - return __get_user_pages_locked(current, current->mm, start, nr_pages, - pages, vmas, NULL, - gup_flags | FOLL_TOUCH); -} -EXPORT_SYMBOL(get_user_pages); - #if defined(CONFIG_FS_DAX) || defined (CONFIG_CMA) - -#ifdef CONFIG_FS_DAX static bool check_dax_vmas(struct vm_area_struct **vmas, long nr_pages) { long i; @@ -1160,12 +1273,6 @@ static bool check_dax_vmas(struct vm_area_struct **vmas, long nr_pages) } return false; } -#else -static inline bool check_dax_vmas(struct vm_area_struct **vmas, long nr_pages) -{ - return false; -} -#endif #ifdef CONFIG_CMA static struct page *new_non_cma_page(struct page *page, unsigned long private) @@ -1219,10 +1326,13 @@ static struct page *new_non_cma_page(struct page *page, unsigned long private) return __alloc_pages_node(nid, gfp_mask, 0); } -static long check_and_migrate_cma_pages(unsigned long start, long nr_pages, - unsigned int gup_flags, +static long check_and_migrate_cma_pages(struct task_struct *tsk, + struct mm_struct *mm, + unsigned long start, + unsigned long nr_pages, struct page **pages, - struct vm_area_struct **vmas) + struct vm_area_struct **vmas, + unsigned int gup_flags) { long i; bool drain_allow = true; @@ -1278,10 +1388,14 @@ check_again: putback_movable_pages(&cma_page_list); } /* - * We did migrate all the pages, Try to get the page references again - * migrating any new CMA pages which we failed to isolate earlier. + * We did migrate all the pages, Try to get the page references + * again migrating any new CMA pages which we failed to isolate + * earlier. */ - nr_pages = get_user_pages(start, nr_pages, gup_flags, pages, vmas); + nr_pages = __get_user_pages_locked(tsk, mm, start, nr_pages, + pages, vmas, NULL, + gup_flags); + if ((nr_pages > 0) && migrate_allow) { drain_allow = true; goto check_again; @@ -1291,66 +1405,101 @@ check_again: return nr_pages; } #else -static inline long check_and_migrate_cma_pages(unsigned long start, long nr_pages, - unsigned int gup_flags, - struct page **pages, - struct vm_area_struct **vmas) +static long check_and_migrate_cma_pages(struct task_struct *tsk, + struct mm_struct *mm, + unsigned long start, + unsigned long nr_pages, + struct page **pages, + struct vm_area_struct **vmas, + unsigned int gup_flags) { return nr_pages; } #endif /* - * This is the same as get_user_pages() in that it assumes we are - * operating on the current task's mm, but it goes further to validate - * that the vmas associated with the address range are suitable for - * longterm elevated page reference counts. For example, filesystem-dax - * mappings are subject to the lifetime enforced by the filesystem and - * we need guarantees that longterm users like RDMA and V4L2 only - * establish mappings that have a kernel enforced revocation mechanism. - * - * "longterm" == userspace controlled elevated page count lifetime. - * Contrast this to iov_iter_get_pages() usages which are transient. + * __gup_longterm_locked() is a wrapper for __get_user_pages_locked which + * allows us to process the FOLL_LONGTERM flag. */ -long get_user_pages_longterm(unsigned long start, unsigned long nr_pages, - unsigned int gup_flags, struct page **pages, - struct vm_area_struct **vmas_arg) +static long __gup_longterm_locked(struct task_struct *tsk, + struct mm_struct *mm, + unsigned long start, + unsigned long nr_pages, + struct page **pages, + struct vm_area_struct **vmas, + unsigned int gup_flags) { - struct vm_area_struct **vmas = vmas_arg; - unsigned long flags; + struct vm_area_struct **vmas_tmp = vmas; + unsigned long flags = 0; long rc, i; - if (!pages) - return -EINVAL; - - if (!vmas) { - vmas = kcalloc(nr_pages, sizeof(struct vm_area_struct *), - GFP_KERNEL); - if (!vmas) - return -ENOMEM; + if (gup_flags & FOLL_LONGTERM) { + if (!pages) + return -EINVAL; + + if (!vmas_tmp) { + vmas_tmp = kcalloc(nr_pages, + sizeof(struct vm_area_struct *), + GFP_KERNEL); + if (!vmas_tmp) + return -ENOMEM; + } + flags = memalloc_nocma_save(); } - flags = memalloc_nocma_save(); - rc = get_user_pages(start, nr_pages, gup_flags, pages, vmas); - memalloc_nocma_restore(flags); - if (rc < 0) - goto out; + rc = __get_user_pages_locked(tsk, mm, start, nr_pages, pages, + vmas_tmp, NULL, gup_flags); - if (check_dax_vmas(vmas, rc)) { - for (i = 0; i < rc; i++) - put_page(pages[i]); - rc = -EOPNOTSUPP; - goto out; + if (gup_flags & FOLL_LONGTERM) { + memalloc_nocma_restore(flags); + if (rc < 0) + goto out; + + if (check_dax_vmas(vmas_tmp, rc)) { + for (i = 0; i < rc; i++) + put_page(pages[i]); + rc = -EOPNOTSUPP; + goto out; + } + + rc = check_and_migrate_cma_pages(tsk, mm, start, rc, pages, + vmas_tmp, gup_flags); } - rc = check_and_migrate_cma_pages(start, rc, gup_flags, pages, vmas); out: - if (vmas != vmas_arg) - kfree(vmas); + if (vmas_tmp != vmas) + kfree(vmas_tmp); return rc; } -EXPORT_SYMBOL(get_user_pages_longterm); -#endif /* CONFIG_FS_DAX */ +#else /* !CONFIG_FS_DAX && !CONFIG_CMA */ +static __always_inline long __gup_longterm_locked(struct task_struct *tsk, + struct mm_struct *mm, + unsigned long start, + unsigned long nr_pages, + struct page **pages, + struct vm_area_struct **vmas, + unsigned int flags) +{ + return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas, + NULL, flags); +} +#endif /* CONFIG_FS_DAX || CONFIG_CMA */ + +/* + * This is the same as get_user_pages_remote(), just with a + * less-flexible calling convention where we assume that the task + * and mm being operated on are the current task's and don't allow + * passing of a locked parameter. We also obviously don't pass + * FOLL_REMOTE in here. + */ +long get_user_pages(unsigned long start, unsigned long nr_pages, + unsigned int gup_flags, struct page **pages, + struct vm_area_struct **vmas) +{ + return __gup_longterm_locked(current, current->mm, start, nr_pages, + pages, vmas, gup_flags | FOLL_TOUCH); +} +EXPORT_SYMBOL(get_user_pages); /** * populate_vma_page_range() - populate a range of pages in the vma. @@ -1571,7 +1720,7 @@ static inline struct page *try_get_compound_head(struct page *page, int refs) #ifdef CONFIG_ARCH_HAS_PTE_SPECIAL static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, - int write, struct page **pages, int *nr) + unsigned int flags, struct page **pages, int *nr) { struct dev_pagemap *pgmap = NULL; int nr_start = *nr, ret = 0; @@ -1589,10 +1738,13 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, if (pte_protnone(pte)) goto pte_unmap; - if (!pte_access_permitted(pte, write)) + if (!pte_access_permitted(pte, flags & FOLL_WRITE)) goto pte_unmap; if (pte_devmap(pte)) { + if (unlikely(flags & FOLL_LONGTERM)) + goto pte_unmap; + pgmap = get_dev_pagemap(pte_pfn(pte), pgmap); if (unlikely(!pgmap)) { undo_dev_pagemap(nr, nr_start, pages); @@ -1641,7 +1793,7 @@ pte_unmap: * useful to have gup_huge_pmd even if we can't operate on ptes. */ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, - int write, struct page **pages, int *nr) + unsigned int flags, struct page **pages, int *nr) { return 0; } @@ -1724,16 +1876,19 @@ static int __gup_device_huge_pud(pud_t pud, pud_t *pudp, unsigned long addr, #endif static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, - unsigned long end, int write, struct page **pages, int *nr) + unsigned long end, unsigned int flags, struct page **pages, int *nr) { struct page *head, *page; int refs; - if (!pmd_access_permitted(orig, write)) + if (!pmd_access_permitted(orig, flags & FOLL_WRITE)) return 0; - if (pmd_devmap(orig)) + if (pmd_devmap(orig)) { + if (unlikely(flags & FOLL_LONGTERM)) + return 0; return __gup_device_huge_pmd(orig, pmdp, addr, end, pages, nr); + } refs = 0; page = pmd_page(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); @@ -1762,16 +1917,19 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, } static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr, - unsigned long end, int write, struct page **pages, int *nr) + unsigned long end, unsigned int flags, struct page **pages, int *nr) { struct page *head, *page; int refs; - if (!pud_access_permitted(orig, write)) + if (!pud_access_permitted(orig, flags & FOLL_WRITE)) return 0; - if (pud_devmap(orig)) + if (pud_devmap(orig)) { + if (unlikely(flags & FOLL_LONGTERM)) + return 0; return __gup_device_huge_pud(orig, pudp, addr, end, pages, nr); + } refs = 0; page = pud_page(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); @@ -1800,13 +1958,13 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr, } static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr, - unsigned long end, int write, + unsigned long end, unsigned int flags, struct page **pages, int *nr) { int refs; struct page *head, *page; - if (!pgd_access_permitted(orig, write)) + if (!pgd_access_permitted(orig, flags & FOLL_WRITE)) return 0; BUILD_BUG_ON(pgd_devmap(orig)); @@ -1837,7 +1995,7 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr, } static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, - int write, struct page **pages, int *nr) + unsigned int flags, struct page **pages, int *nr) { unsigned long next; pmd_t *pmdp; @@ -1860,7 +2018,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, if (pmd_protnone(pmd)) return 0; - if (!gup_huge_pmd(pmd, pmdp, addr, next, write, + if (!gup_huge_pmd(pmd, pmdp, addr, next, flags, pages, nr)) return 0; @@ -1870,9 +2028,9 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, * pmd format and THP pmd format */ if (!gup_huge_pd(__hugepd(pmd_val(pmd)), addr, - PMD_SHIFT, next, write, pages, nr)) + PMD_SHIFT, next, flags, pages, nr)) return 0; - } else if (!gup_pte_range(pmd, addr, next, write, pages, nr)) + } else if (!gup_pte_range(pmd, addr, next, flags, pages, nr)) return 0; } while (pmdp++, addr = next, addr != end); @@ -1880,7 +2038,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, } static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end, - int write, struct page **pages, int *nr) + unsigned int flags, struct page **pages, int *nr) { unsigned long next; pud_t *pudp; @@ -1893,14 +2051,14 @@ static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end, if (pud_none(pud)) return 0; if (unlikely(pud_huge(pud))) { - if (!gup_huge_pud(pud, pudp, addr, next, write, + if (!gup_huge_pud(pud, pudp, addr, next, flags, pages, nr)) return 0; } else if (unlikely(is_hugepd(__hugepd(pud_val(pud))))) { if (!gup_huge_pd(__hugepd(pud_val(pud)), addr, - PUD_SHIFT, next, write, pages, nr)) + PUD_SHIFT, next, flags, pages, nr)) return 0; - } else if (!gup_pmd_range(pud, addr, next, write, pages, nr)) + } else if (!gup_pmd_range(pud, addr, next, flags, pages, nr)) return 0; } while (pudp++, addr = next, addr != end); @@ -1908,7 +2066,7 @@ static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end, } static int gup_p4d_range(pgd_t pgd, unsigned long addr, unsigned long end, - int write, struct page **pages, int *nr) + unsigned int flags, struct page **pages, int *nr) { unsigned long next; p4d_t *p4dp; @@ -1923,9 +2081,9 @@ static int gup_p4d_range(pgd_t pgd, unsigned long addr, unsigned long end, BUILD_BUG_ON(p4d_huge(p4d)); if (unlikely(is_hugepd(__hugepd(p4d_val(p4d))))) { if (!gup_huge_pd(__hugepd(p4d_val(p4d)), addr, - P4D_SHIFT, next, write, pages, nr)) + P4D_SHIFT, next, flags, pages, nr)) return 0; - } else if (!gup_pud_range(p4d, addr, next, write, pages, nr)) + } else if (!gup_pud_range(p4d, addr, next, flags, pages, nr)) return 0; } while (p4dp++, addr = next, addr != end); @@ -1933,7 +2091,7 @@ static int gup_p4d_range(pgd_t pgd, unsigned long addr, unsigned long end, } static void gup_pgd_range(unsigned long addr, unsigned long end, - int write, struct page **pages, int *nr) + unsigned int flags, struct page **pages, int *nr) { unsigned long next; pgd_t *pgdp; @@ -1946,14 +2104,14 @@ static void gup_pgd_range(unsigned long addr, unsigned long end, if (pgd_none(pgd)) return; if (unlikely(pgd_huge(pgd))) { - if (!gup_huge_pgd(pgd, pgdp, addr, next, write, + if (!gup_huge_pgd(pgd, pgdp, addr, next, flags, pages, nr)) return; } else if (unlikely(is_hugepd(__hugepd(pgd_val(pgd))))) { if (!gup_huge_pd(__hugepd(pgd_val(pgd)), addr, - PGDIR_SHIFT, next, write, pages, nr)) + PGDIR_SHIFT, next, flags, pages, nr)) return; - } else if (!gup_p4d_range(pgd, addr, next, write, pages, nr)) + } else if (!gup_p4d_range(pgd, addr, next, flags, pages, nr)) return; } while (pgdp++, addr = next, addr != end); } @@ -2007,18 +2165,41 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, if (gup_fast_permitted(start, nr_pages)) { local_irq_save(flags); - gup_pgd_range(start, end, write, pages, &nr); + gup_pgd_range(start, end, write ? FOLL_WRITE : 0, pages, &nr); local_irq_restore(flags); } return nr; } +static int __gup_longterm_unlocked(unsigned long start, int nr_pages, + unsigned int gup_flags, struct page **pages) +{ + int ret; + + /* + * FIXME: FOLL_LONGTERM does not work with + * get_user_pages_unlocked() (see comments in that function) + */ + if (gup_flags & FOLL_LONGTERM) { + down_read(¤t->mm->mmap_sem); + ret = __gup_longterm_locked(current, current->mm, + start, nr_pages, + pages, NULL, gup_flags); + up_read(¤t->mm->mmap_sem); + } else { + ret = get_user_pages_unlocked(start, nr_pages, + pages, gup_flags); + } + + return ret; +} + /** * get_user_pages_fast() - pin user pages in memory * @start: starting user address * @nr_pages: number of pages from start to pin - * @write: whether pages will be written to + * @gup_flags: flags modifying pin behaviour * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. * @@ -2030,8 +2211,8 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, * requested. If nr_pages is 0 or negative, returns 0. If no pages * were pinned, returns -errno. */ -int get_user_pages_fast(unsigned long start, int nr_pages, int write, - struct page **pages) +int get_user_pages_fast(unsigned long start, int nr_pages, + unsigned int gup_flags, struct page **pages) { unsigned long addr, len, end; int nr = 0, ret = 0; @@ -2049,7 +2230,7 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write, if (gup_fast_permitted(start, nr_pages)) { local_irq_disable(); - gup_pgd_range(addr, end, write, pages, &nr); + gup_pgd_range(addr, end, gup_flags, pages, &nr); local_irq_enable(); ret = nr; } @@ -2059,8 +2240,8 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write, start += nr << PAGE_SHIFT; pages += nr; - ret = get_user_pages_unlocked(start, nr_pages - nr, pages, - write ? FOLL_WRITE : 0); + ret = __gup_longterm_unlocked(start, nr_pages - nr, + gup_flags, pages); /* Have to be a bit careful with return values */ if (nr > 0) { |