Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig.debug  21
-rw-r--r--  mm/Makefile  2
-rw-r--r--  mm/backing-dev.c  1
-rw-r--r--  mm/debug.c  20
-rw-r--r--  mm/early_ioremap.c  8
-rw-r--r--  mm/filemap.c  34
-rw-r--r--  mm/gup.c  503
-rw-r--r--  mm/gup_benchmark.c  9
-rw-r--r--  mm/highmem.c  2
-rw-r--r--  mm/hmm.c  62
-rw-r--r--  mm/huge_memory.c  93
-rw-r--r--  mm/hugetlb_cgroup.c  198
-rw-r--r--  mm/kasan/common.c  2
-rw-r--r--  mm/kasan/report.c  40
-rw-r--r--  mm/kmemleak.c  112
-rw-r--r--  mm/madvise.c  7
-rw-r--r--  mm/memblock.c  22
-rw-r--r--  mm/memcontrol.c  62
-rw-r--r--  mm/memory.c  46
-rw-r--r--  mm/memory_hotplug.c  128
-rw-r--r--  mm/mempolicy.c  16
-rw-r--r--  mm/memremap.c  77
-rw-r--r--  mm/migrate.c  82
-rw-r--r--  mm/mincore.c  1
-rw-r--r--  mm/mmap.c  44
-rw-r--r--  mm/mmu_gather.c  134
-rw-r--r--  mm/mmu_notifier.c  585
-rw-r--r--  mm/oom_kill.c  2
-rw-r--r--  mm/page-writeback.c  10
-rw-r--r--  mm/page_alloc.c  187
-rw-r--r--  mm/page_isolation.c  53
-rw-r--r--  mm/page_vma_mapped.c  12
-rw-r--r--  mm/pagewalk.c  163
-rw-r--r--  mm/percpu.c  61
-rw-r--r--  mm/process_vm_access.c  28
-rw-r--r--  mm/ptdump.c  139
-rw-r--r--  mm/shmem.c  7
-rw-r--r--  mm/slab.c  4
-rw-r--r--  mm/slab_common.c  40
-rw-r--r--  mm/slub.c  104
-rw-r--r--  mm/sparse.c  21
-rw-r--r--  mm/swap.c  27
-rw-r--r--  mm/swapfile.c  16
-rw-r--r--  mm/vmalloc.c  12
-rw-r--r--  mm/vmscan.c  24
-rw-r--r--  mm/zswap.c  86
46 files changed, 1992 insertions, 1315 deletions
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index 327b3ebf23bf..0271b22e063f 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -117,3 +117,24 @@ config DEBUG_RODATA_TEST
depends on STRICT_KERNEL_RWX
---help---
This option enables a testcase for the setting rodata read-only.
+
+config GENERIC_PTDUMP
+ bool
+
+config PTDUMP_CORE
+ bool
+
+config PTDUMP_DEBUGFS
+ bool "Export kernel pagetable layout to userspace via debugfs"
+ depends on DEBUG_KERNEL
+ depends on DEBUG_FS
+ depends on GENERIC_PTDUMP
+ select PTDUMP_CORE
+ help
+ Say Y here if you want to show the kernel pagetable layout in a
+ debugfs file. This information is only useful for kernel developers
+ who are working in architecture specific areas of the kernel.
+ It is probably not a good idea to enable this feature in a production
+ kernel.
+
+ If in doubt, say N.
diff --git a/mm/Makefile b/mm/Makefile
index 1937cc251883..272e66039e70 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -20,6 +20,7 @@ KCOV_INSTRUMENT_kmemleak.o := n
KCOV_INSTRUMENT_memcontrol.o := n
KCOV_INSTRUMENT_mmzone.o := n
KCOV_INSTRUMENT_vmstat.o := n
+KCOV_INSTRUMENT_failslab.o := n
CFLAGS_init-mm.o += $(call cc-disable-warning, override-init)
CFLAGS_init-mm.o += $(call cc-disable-warning, initializer-overrides)
@@ -108,3 +109,4 @@ obj-$(CONFIG_ZONE_DEVICE) += memremap.o
obj-$(CONFIG_HMM_MIRROR) += hmm.o
obj-$(CONFIG_MEMFD_CREATE) += memfd.o
obj-$(CONFIG_MAPPING_DIRTY_HELPERS) += mapping_dirty_helpers.o
+obj-$(CONFIG_PTDUMP_CORE) += ptdump.o
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index c360f6a6c844..62f05f605fb5 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -21,6 +21,7 @@ struct backing_dev_info noop_backing_dev_info = {
EXPORT_SYMBOL_GPL(noop_backing_dev_info);
static struct class *bdi_class;
+const char *bdi_unknown_name = "(unknown)";
/*
* bdi_lock protects bdi_tree and updates to bdi_list. bdi_list has RCU
diff --git a/mm/debug.c b/mm/debug.c
index 0461df1207cb..ecccd9f17801 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -46,7 +46,15 @@ void __dump_page(struct page *page, const char *reason)
{
struct address_space *mapping;
bool page_poisoned = PagePoisoned(page);
+ /*
+ * Accessing the pageblock without the zone lock. It could change to
+ * "isolate" again in the meantime, but since we are just dumping the
+ * state for debugging, it should be fine to accept a bit of
+ * inaccuracy here due to racing.
+ */
+ bool page_cma = is_migrate_cma_page(page);
int mapcount;
+ char *type = "";
/*
* If struct page is poisoned don't access Page*() functions as that
@@ -78,9 +86,9 @@ void __dump_page(struct page *page, const char *reason)
page, page_ref_count(page), mapcount,
page->mapping, page_to_pgoff(page));
if (PageKsm(page))
- pr_warn("ksm flags: %#lx(%pGp)\n", page->flags, &page->flags);
+ type = "ksm ";
else if (PageAnon(page))
- pr_warn("anon flags: %#lx(%pGp)\n", page->flags, &page->flags);
+ type = "anon ";
else if (mapping) {
if (mapping->host && mapping->host->i_dentry.first) {
struct dentry *dentry;
@@ -88,10 +96,12 @@ void __dump_page(struct page *page, const char *reason)
pr_warn("%ps name:\"%pd\"\n", mapping->a_ops, dentry);
} else
pr_warn("%ps\n", mapping->a_ops);
- pr_warn("flags: %#lx(%pGp)\n", page->flags, &page->flags);
}
BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS + 1);
+ pr_warn("%sflags: %#lx(%pGp)%s\n", type, page->flags, &page->flags,
+ page_cma ? " CMA" : "");
+
hex_only:
print_hex_dump(KERN_WARNING, "raw: ", DUMP_PREFIX_NONE, 32,
sizeof(unsigned long), page,
@@ -153,7 +163,7 @@ void dump_mm(const struct mm_struct *mm)
#endif
"exe_file %px\n"
#ifdef CONFIG_MMU_NOTIFIER
- "mmu_notifier_mm %px\n"
+ "notifier_subscriptions %px\n"
#endif
#ifdef CONFIG_NUMA_BALANCING
"numa_next_scan %lu numa_scan_offset %lu numa_scan_seq %d\n"
@@ -185,7 +195,7 @@ void dump_mm(const struct mm_struct *mm)
#endif
mm->exe_file,
#ifdef CONFIG_MMU_NOTIFIER
- mm->mmu_notifier_mm,
+ mm->notifier_subscriptions,
#endif
#ifdef CONFIG_NUMA_BALANCING
mm->numa_next_scan, mm->numa_scan_offset, mm->numa_scan_seq,
diff --git a/mm/early_ioremap.c b/mm/early_ioremap.c
index 1826f191e72c..a0018ad1a1f6 100644
--- a/mm/early_ioremap.c
+++ b/mm/early_ioremap.c
@@ -121,8 +121,8 @@ __early_ioremap(resource_size_t phys_addr, unsigned long size, pgprot_t prot)
}
}
- if (WARN(slot < 0, "%s(%08llx, %08lx) not found slot\n",
- __func__, (u64)phys_addr, size))
+ if (WARN(slot < 0, "%s(%pa, %08lx) not found slot\n",
+ __func__, &phys_addr, size))
return NULL;
/* Don't allow wraparound or zero size */
@@ -158,8 +158,8 @@ __early_ioremap(resource_size_t phys_addr, unsigned long size, pgprot_t prot)
--idx;
--nrpages;
}
- WARN(early_ioremap_debug, "%s(%08llx, %08lx) [%d] => %08lx + %08lx\n",
- __func__, (u64)phys_addr, size, slot, offset, slot_virt[slot]);
+ WARN(early_ioremap_debug, "%s(%pa, %08lx) [%d] => %08lx + %08lx\n",
+ __func__, &phys_addr, size, slot, offset, slot_virt[slot]);
prev_map[slot] = (void __iomem *)(offset + slot_virt[slot]);
return prev_map[slot];
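
For reference, the %pa specifier used in the hunk above takes a pointer to a phys_addr_t and prints it at the type's native width, which is why the explicit (u64) casts can go away. A minimal sketch, with an illustrative variable and value:

	phys_addr_t phys_addr = 0x1000;

	/* On a 64-bit kernel this prints something like "mapping 0x0000000000001000". */
	pr_info("mapping %pa\n", &phys_addr);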
diff --git a/mm/filemap.c b/mm/filemap.c
index bf6aa30be58d..1784478270e1 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -632,33 +632,6 @@ static bool mapping_needs_writeback(struct address_space *mapping)
return mapping->nrpages;
}
-int filemap_write_and_wait(struct address_space *mapping)
-{
- int err = 0;
-
- if (mapping_needs_writeback(mapping)) {
- err = filemap_fdatawrite(mapping);
- /*
- * Even if the above returned error, the pages may be
- * written partially (e.g. -ENOSPC), so we wait for it.
- * But the -EIO is special case, it may indicate the worst
- * thing (e.g. bug) happened, so we avoid waiting for it.
- */
- if (err != -EIO) {
- int err2 = filemap_fdatawait(mapping);
- if (!err)
- err = err2;
- } else {
- /* Clear any previously stored errors */
- filemap_check_errors(mapping);
- }
- } else {
- err = filemap_check_errors(mapping);
- }
- return err;
-}
-EXPORT_SYMBOL(filemap_write_and_wait);
-
/**
* filemap_write_and_wait_range - write out & wait on a file range
* @mapping: the address_space for the pages
@@ -680,7 +653,12 @@ int filemap_write_and_wait_range(struct address_space *mapping,
if (mapping_needs_writeback(mapping)) {
err = __filemap_fdatawrite_range(mapping, lstart, lend,
WB_SYNC_ALL);
- /* See comment of filemap_write_and_wait() */
+ /*
+ * Even if the above returned error, the pages may be
+ * written partially (e.g. -ENOSPC), so we wait for it.
+ * But the -EIO is special case, it may indicate the worst
+ * thing (e.g. bug) happened, so we avoid waiting for it.
+ */
if (err != -EIO) {
int err2 = filemap_fdatawait_range(mapping,
lstart, lend);
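
The relocated comment documents the caller-visible semantics of the range helper: errors other than -EIO still wait for writeback to finish, and the first error encountered is the one returned. A minimal sketch of the usual caller pattern, e.g. from a filesystem's ->fsync() (the function name is illustrative):

static int my_fsync(struct file *file, loff_t start, loff_t end, int datasync)
{
	struct inode *inode = file_inode(file);
	int err;

	/* Write back dirty pagecache in [start, end] and pick up any error. */
	err = filemap_write_and_wait_range(inode->i_mapping, start, end);
	if (err)
		return err;

	/* ... flush the filesystem's metadata for @inode here ... */
	return 0;
}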
diff --git a/mm/gup.c b/mm/gup.c
index 7646bf993b25..1b521e0ac1de 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -29,8 +29,23 @@ struct follow_page_context {
unsigned int page_mask;
};
+/*
+ * Return the compound head page with ref appropriately incremented,
+ * or NULL if that failed.
+ */
+static inline struct page *try_get_compound_head(struct page *page, int refs)
+{
+ struct page *head = compound_head(page);
+
+ if (WARN_ON_ONCE(page_ref_count(head) < 0))
+ return NULL;
+ if (unlikely(!page_cache_add_speculative(head, refs)))
+ return NULL;
+ return head;
+}
+
/**
- * put_user_pages_dirty_lock() - release and optionally dirty gup-pinned pages
+ * unpin_user_pages_dirty_lock() - release and optionally dirty gup-pinned pages
* @pages: array of pages to be maybe marked dirty, and definitely released.
* @npages: number of pages in the @pages array.
* @make_dirty: whether to mark the pages dirty
@@ -40,19 +55,19 @@ struct follow_page_context {
*
* For each page in the @pages array, make that page (or its head page, if a
* compound page) dirty, if @make_dirty is true, and if the page was previously
- * listed as clean. In any case, releases all pages using put_user_page(),
- * possibly via put_user_pages(), for the non-dirty case.
+ * listed as clean. In any case, releases all pages using unpin_user_page(),
+ * possibly via unpin_user_pages(), for the non-dirty case.
*
- * Please see the put_user_page() documentation for details.
+ * Please see the unpin_user_page() documentation for details.
*
* set_page_dirty_lock() is used internally. If instead, set_page_dirty() is
* required, then the caller should a) verify that this is really correct,
* because _lock() is usually required, and b) hand code it:
- * set_page_dirty_lock(), put_user_page().
+ * set_page_dirty_lock(), unpin_user_page().
*
*/
-void put_user_pages_dirty_lock(struct page **pages, unsigned long npages,
- bool make_dirty)
+void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages,
+ bool make_dirty)
{
unsigned long index;
@@ -63,7 +78,7 @@ void put_user_pages_dirty_lock(struct page **pages, unsigned long npages,
*/
if (!make_dirty) {
- put_user_pages(pages, npages);
+ unpin_user_pages(pages, npages);
return;
}
@@ -91,21 +106,21 @@ void put_user_pages_dirty_lock(struct page **pages, unsigned long npages,
*/
if (!PageDirty(page))
set_page_dirty_lock(page);
- put_user_page(page);
+ unpin_user_page(page);
}
}
-EXPORT_SYMBOL(put_user_pages_dirty_lock);
+EXPORT_SYMBOL(unpin_user_pages_dirty_lock);
/**
- * put_user_pages() - release an array of gup-pinned pages.
+ * unpin_user_pages() - release an array of gup-pinned pages.
* @pages: array of pages to be marked dirty and released.
* @npages: number of pages in the @pages array.
*
- * For each page in the @pages array, release the page using put_user_page().
+ * For each page in the @pages array, release the page using unpin_user_page().
*
- * Please see the put_user_page() documentation for details.
+ * Please see the unpin_user_page() documentation for details.
*/
-void put_user_pages(struct page **pages, unsigned long npages)
+void unpin_user_pages(struct page **pages, unsigned long npages)
{
unsigned long index;
@@ -115,9 +130,9 @@ void put_user_pages(struct page **pages, unsigned long npages)
* single operation to the head page should suffice.
*/
for (index = 0; index < npages; index++)
- put_user_page(pages[index]);
+ unpin_user_page(pages[index]);
}
-EXPORT_SYMBOL(put_user_pages);
+EXPORT_SYMBOL(unpin_user_pages);
#ifdef CONFIG_MMU
static struct page *no_page_table(struct vm_area_struct *vma,
@@ -179,6 +194,10 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
spinlock_t *ptl;
pte_t *ptep, pte;
+ /* FOLL_GET and FOLL_PIN are mutually exclusive. */
+ if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
+ (FOLL_PIN | FOLL_GET)))
+ return ERR_PTR(-EINVAL);
retry:
if (unlikely(pmd_bad(*pmd)))
return no_page_table(vma, flags);
@@ -323,7 +342,7 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
pmdval = READ_ONCE(*pmd);
if (pmd_none(pmdval))
return no_page_table(vma, flags);
- if (pmd_huge(pmdval) && vma->vm_flags & VM_HUGETLB) {
+ if (pmd_huge(pmdval) && is_vm_hugetlb_page(vma)) {
page = follow_huge_pmd(mm, address, pmd, flags);
if (page)
return page;
@@ -433,7 +452,7 @@ static struct page *follow_pud_mask(struct vm_area_struct *vma,
pud = pud_offset(p4dp, address);
if (pud_none(*pud))
return no_page_table(vma, flags);
- if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) {
+ if (pud_huge(*pud) && is_vm_hugetlb_page(vma)) {
page = follow_huge_pud(mm, address, pud, flags);
if (page)
return page;
@@ -796,7 +815,7 @@ static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
start = untagged_addr(start);
- VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET));
+ VM_BUG_ON(!!pages != !!(gup_flags & (FOLL_GET | FOLL_PIN)));
/*
* If FOLL_FORCE is set then do not force a full fault as the hinting
@@ -1020,7 +1039,16 @@ static __always_inline long __get_user_pages_locked(struct task_struct *tsk,
BUG_ON(*locked != 1);
}
- if (pages)
+ /*
+ * FOLL_PIN and FOLL_GET are mutually exclusive. Traditional behavior
+ * is to set FOLL_GET if the caller wants pages[] filled in (but has
+ * carelessly failed to specify FOLL_GET), so keep doing that, but only
+ * for FOLL_GET, not for the newer FOLL_PIN.
+ *
+ * FOLL_PIN always expects pages to be non-null, but no need to assert
+ * that here, as any failures will be obvious enough.
+ */
+ if (pages && !(flags & FOLL_PIN))
flags |= FOLL_GET;
pages_done = 0;
@@ -1096,88 +1124,6 @@ static __always_inline long __get_user_pages_locked(struct task_struct *tsk,
return pages_done;
}
-/*
- * get_user_pages_remote() - pin user pages in memory
- * @tsk: the task_struct to use for page fault accounting, or
- * NULL if faults are not to be recorded.
- * @mm: mm_struct of target mm
- * @start: starting user address
- * @nr_pages: number of pages from start to pin
- * @gup_flags: flags modifying lookup behaviour
- * @pages: array that receives pointers to the pages pinned.
- * Should be at least nr_pages long. Or NULL, if caller
- * only intends to ensure the pages are faulted in.
- * @vmas: array of pointers to vmas corresponding to each page.
- * Or NULL if the caller does not require them.
- * @locked: pointer to lock flag indicating whether lock is held and
- * subsequently whether VM_FAULT_RETRY functionality can be
- * utilised. Lock must initially be held.
- *
- * Returns either number of pages pinned (which may be less than the
- * number requested), or an error. Details about the return value:
- *
- * -- If nr_pages is 0, returns 0.
- * -- If nr_pages is >0, but no pages were pinned, returns -errno.
- * -- If nr_pages is >0, and some pages were pinned, returns the number of
- * pages pinned. Again, this may be less than nr_pages.
- *
- * The caller is responsible for releasing returned @pages, via put_page().
- *
- * @vmas are valid only as long as mmap_sem is held.
- *
- * Must be called with mmap_sem held for read or write.
- *
- * get_user_pages walks a process's page tables and takes a reference to
- * each struct page that each user address corresponds to at a given
- * instant. That is, it takes the page that would be accessed if a user
- * thread accesses the given user virtual address at that instant.
- *
- * This does not guarantee that the page exists in the user mappings when
- * get_user_pages returns, and there may even be a completely different
- * page there in some cases (eg. if mmapped pagecache has been invalidated
- * and subsequently re faulted). However it does guarantee that the page
- * won't be freed completely. And mostly callers simply care that the page
- * contains data that was valid *at some point in time*. Typically, an IO
- * or similar operation cannot guarantee anything stronger anyway because
- * locks can't be held over the syscall boundary.
- *
- * If gup_flags & FOLL_WRITE == 0, the page must not be written to. If the page
- * is written to, set_page_dirty (or set_page_dirty_lock, as appropriate) must
- * be called after the page is finished with, and before put_page is called.
- *
- * get_user_pages is typically used for fewer-copy IO operations, to get a
- * handle on the memory by some means other than accesses via the user virtual
- * addresses. The pages may be submitted for DMA to devices or accessed via
- * their kernel linear mapping (via the kmap APIs). Care should be taken to
- * use the correct cache flushing APIs.
- *
- * See also get_user_pages_fast, for performance critical applications.
- *
- * get_user_pages should be phased out in favor of
- * get_user_pages_locked|unlocked or get_user_pages_fast. Nothing
- * should use get_user_pages because it cannot pass
- * FAULT_FLAG_ALLOW_RETRY to handle_mm_fault.
- */
-long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
- unsigned long start, unsigned long nr_pages,
- unsigned int gup_flags, struct page **pages,
- struct vm_area_struct **vmas, int *locked)
-{
- /*
- * FIXME: Current FOLL_LONGTERM behavior is incompatible with
- * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on
- * vmas. As there are no users of this flag in this call we simply
- * disallow this option for now.
- */
- if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM))
- return -EINVAL;
-
- return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas,
- locked,
- gup_flags | FOLL_TOUCH | FOLL_REMOTE);
-}
-EXPORT_SYMBOL(get_user_pages_remote);
-
/**
* populate_vma_page_range() - populate a range of pages in the vma.
* @vma: target vma
@@ -1612,6 +1558,116 @@ static __always_inline long __gup_longterm_locked(struct task_struct *tsk,
#endif /* CONFIG_FS_DAX || CONFIG_CMA */
/*
+ * get_user_pages_remote() - pin user pages in memory
+ * @tsk: the task_struct to use for page fault accounting, or
+ * NULL if faults are not to be recorded.
+ * @mm: mm_struct of target mm
+ * @start: starting user address
+ * @nr_pages: number of pages from start to pin
+ * @gup_flags: flags modifying lookup behaviour
+ * @pages: array that receives pointers to the pages pinned.
+ * Should be at least nr_pages long. Or NULL, if caller
+ * only intends to ensure the pages are faulted in.
+ * @vmas: array of pointers to vmas corresponding to each page.
+ * Or NULL if the caller does not require them.
+ * @locked: pointer to lock flag indicating whether lock is held and
+ * subsequently whether VM_FAULT_RETRY functionality can be
+ * utilised. Lock must initially be held.
+ *
+ * Returns either number of pages pinned (which may be less than the
+ * number requested), or an error. Details about the return value:
+ *
+ * -- If nr_pages is 0, returns 0.
+ * -- If nr_pages is >0, but no pages were pinned, returns -errno.
+ * -- If nr_pages is >0, and some pages were pinned, returns the number of
+ * pages pinned. Again, this may be less than nr_pages.
+ *
+ * The caller is responsible for releasing returned @pages, via put_page().
+ *
+ * @vmas are valid only as long as mmap_sem is held.
+ *
+ * Must be called with mmap_sem held for read or write.
+ *
+ * get_user_pages walks a process's page tables and takes a reference to
+ * each struct page that each user address corresponds to at a given
+ * instant. That is, it takes the page that would be accessed if a user
+ * thread accesses the given user virtual address at that instant.
+ *
+ * This does not guarantee that the page exists in the user mappings when
+ * get_user_pages returns, and there may even be a completely different
+ * page there in some cases (eg. if mmapped pagecache has been invalidated
+ * and subsequently re faulted). However it does guarantee that the page
+ * won't be freed completely. And mostly callers simply care that the page
+ * contains data that was valid *at some point in time*. Typically, an IO
+ * or similar operation cannot guarantee anything stronger anyway because
+ * locks can't be held over the syscall boundary.
+ *
+ * If gup_flags & FOLL_WRITE == 0, the page must not be written to. If the page
+ * is written to, set_page_dirty (or set_page_dirty_lock, as appropriate) must
+ * be called after the page is finished with, and before put_page is called.
+ *
+ * get_user_pages is typically used for fewer-copy IO operations, to get a
+ * handle on the memory by some means other than accesses via the user virtual
+ * addresses. The pages may be submitted for DMA to devices or accessed via
+ * their kernel linear mapping (via the kmap APIs). Care should be taken to
+ * use the correct cache flushing APIs.
+ *
+ * See also get_user_pages_fast, for performance critical applications.
+ *
+ * get_user_pages should be phased out in favor of
+ * get_user_pages_locked|unlocked or get_user_pages_fast. Nothing
+ * should use get_user_pages because it cannot pass
+ * FAULT_FLAG_ALLOW_RETRY to handle_mm_fault.
+ */
+#ifdef CONFIG_MMU
+long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
+ unsigned long start, unsigned long nr_pages,
+ unsigned int gup_flags, struct page **pages,
+ struct vm_area_struct **vmas, int *locked)
+{
+ /*
+ * FOLL_PIN must only be set internally by the pin_user_pages*() APIs,
+ * never directly by the caller, so enforce that with an assertion:
+ */
+ if (WARN_ON_ONCE(gup_flags & FOLL_PIN))
+ return -EINVAL;
+
+ /*
+ * Parts of FOLL_LONGTERM behavior are incompatible with
+ * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on
+ * vmas. However, this only comes up if locked is set, and there are
+ * callers that do request FOLL_LONGTERM, but do not set locked. So,
+ * allow what we can.
+ */
+ if (gup_flags & FOLL_LONGTERM) {
+ if (WARN_ON_ONCE(locked))
+ return -EINVAL;
+ /*
+ * This will check the vmas (even if our vmas arg is NULL)
+ * and return -ENOTSUPP if DAX isn't allowed in this case:
+ */
+ return __gup_longterm_locked(tsk, mm, start, nr_pages, pages,
+ vmas, gup_flags | FOLL_TOUCH |
+ FOLL_REMOTE);
+ }
+
+ return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas,
+ locked,
+ gup_flags | FOLL_TOUCH | FOLL_REMOTE);
+}
+EXPORT_SYMBOL(get_user_pages_remote);
+
+#else /* CONFIG_MMU */
+long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
+ unsigned long start, unsigned long nr_pages,
+ unsigned int gup_flags, struct page **pages,
+ struct vm_area_struct **vmas, int *locked)
+{
+ return 0;
+}
+#endif /* !CONFIG_MMU */
+
+/*
* This is the same as get_user_pages_remote(), just with a
* less-flexible calling convention where we assume that the task
* and mm being operated on are the current task's and don't allow
@@ -1622,6 +1678,13 @@ long get_user_pages(unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages,
struct vm_area_struct **vmas)
{
+ /*
+ * FOLL_PIN must only be set internally by the pin_user_pages*() APIs,
+ * never directly by the caller, so enforce that with an assertion:
+ */
+ if (WARN_ON_ONCE(gup_flags & FOLL_PIN))
+ return -EINVAL;
+
return __gup_longterm_locked(current, current->mm, start, nr_pages,
pages, vmas, gup_flags | FOLL_TOUCH);
}
@@ -1729,7 +1792,7 @@ EXPORT_SYMBOL(get_user_pages_unlocked);
* Before activating this code, please be aware that the following assumptions
* are currently made:
*
- * *) Either HAVE_RCU_TABLE_FREE is enabled, and tlb_remove_table() is used to
+ * *) Either MMU_GATHER_RCU_TABLE_FREE is enabled, and tlb_remove_table() is used to
* free pages containing page tables or TLB flushing requires IPI broadcast.
*
* *) ptes can be read atomically by the architecture.
@@ -1807,20 +1870,6 @@ static void __maybe_unused undo_dev_pagemap(int *nr, int nr_start,
}
}
-/*
- * Return the compund head page with ref appropriately incremented,
- * or NULL if that failed.
- */
-static inline struct page *try_get_compound_head(struct page *page, int refs)
-{
- struct page *head = compound_head(page);
- if (WARN_ON_ONCE(page_ref_count(head) < 0))
- return NULL;
- if (unlikely(!page_cache_add_speculative(head, refs)))
- return NULL;
- return head;
-}
-
#ifdef CONFIG_ARCH_HAS_PTE_SPECIAL
static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
unsigned int flags, struct page **pages, int *nr)
@@ -1978,6 +2027,29 @@ static int __gup_device_huge_pud(pud_t pud, pud_t *pudp, unsigned long addr,
}
#endif
+static int record_subpages(struct page *page, unsigned long addr,
+ unsigned long end, struct page **pages)
+{
+ int nr;
+
+ for (nr = 0; addr != end; addr += PAGE_SIZE)
+ pages[nr++] = page++;
+
+ return nr;
+}
+
+static void put_compound_head(struct page *page, int refs)
+{
+ VM_BUG_ON_PAGE(page_ref_count(page) < refs, page);
+ /*
+ * Calling put_page() for each ref is unnecessarily slow. Only the last
+ * ref needs a put_page().
+ */
+ if (refs > 1)
+ page_ref_sub(page, refs - 1);
+ put_page(page);
+}
+
#ifdef CONFIG_ARCH_HAS_HUGEPD
static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end,
unsigned long sz)
@@ -2007,32 +2079,20 @@ static int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
/* hugepages are never "special" */
VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
- refs = 0;
head = pte_page(pte);
-
page = head + ((addr & (sz-1)) >> PAGE_SHIFT);
- do {
- VM_BUG_ON(compound_head(page) != head);
- pages[*nr] = page;
- (*nr)++;
- page++;
- refs++;
- } while (addr += PAGE_SIZE, addr != end);
+ refs = record_subpages(page, addr, end, pages + *nr);
head = try_get_compound_head(head, refs);
- if (!head) {
- *nr -= refs;
+ if (!head)
return 0;
- }
if (unlikely(pte_val(pte) != pte_val(*ptep))) {
- /* Could be optimized better */
- *nr -= refs;
- while (refs--)
- put_page(head);
+ put_compound_head(head, refs);
return 0;
}
+ *nr += refs;
SetPageReferenced(head);
return 1;
}
@@ -2079,28 +2139,19 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
return __gup_device_huge_pmd(orig, pmdp, addr, end, pages, nr);
}
- refs = 0;
page = pmd_page(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
- do {
- pages[*nr] = page;
- (*nr)++;
- page++;
- refs++;
- } while (addr += PAGE_SIZE, addr != end);
+ refs = record_subpages(page, addr, end, pages + *nr);
head = try_get_compound_head(pmd_page(orig), refs);
- if (!head) {
- *nr -= refs;
+ if (!head)
return 0;
- }
if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) {
- *nr -= refs;
- while (refs--)
- put_page(head);
+ put_compound_head(head, refs);
return 0;
}
+ *nr += refs;
SetPageReferenced(head);
return 1;
}
@@ -2120,28 +2171,19 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
return __gup_device_huge_pud(orig, pudp, addr, end, pages, nr);
}
- refs = 0;
page = pud_page(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
- do {
- pages[*nr] = page;
- (*nr)++;
- page++;
- refs++;
- } while (addr += PAGE_SIZE, addr != end);
+ refs = record_subpages(page, addr, end, pages + *nr);
head = try_get_compound_head(pud_page(orig), refs);
- if (!head) {
- *nr -= refs;
+ if (!head)
return 0;
- }
if (unlikely(pud_val(orig) != pud_val(*pudp))) {
- *nr -= refs;
- while (refs--)
- put_page(head);
+ put_compound_head(head, refs);
return 0;
}
+ *nr += refs;
SetPageReferenced(head);
return 1;
}
@@ -2157,28 +2199,20 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
return 0;
BUILD_BUG_ON(pgd_devmap(orig));
- refs = 0;
+
page = pgd_page(orig) + ((addr & ~PGDIR_MASK) >> PAGE_SHIFT);
- do {
- pages[*nr] = page;
- (*nr)++;
- page++;
- refs++;
- } while (addr += PAGE_SIZE, addr != end);
+ refs = record_subpages(page, addr, end, pages + *nr);
head = try_get_compound_head(pgd_page(orig), refs);
- if (!head) {
- *nr -= refs;
+ if (!head)
return 0;
- }
if (unlikely(pgd_val(orig) != pgd_val(*pgdp))) {
- *nr -= refs;
- while (refs--)
- put_page(head);
+ put_compound_head(head, refs);
return 0;
}
+ *nr += refs;
SetPageReferenced(head);
return 1;
}
@@ -2237,7 +2271,7 @@ static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end,
pud_t pud = READ_ONCE(*pudp);
next = pud_addr_end(addr, end);
- if (pud_none(pud))
+ if (unlikely(!pud_present(pud)))
return 0;
if (unlikely(pud_huge(pud))) {
if (!gup_huge_pud(pud, pudp, addr, next, flags,
@@ -2393,29 +2427,15 @@ static int __gup_longterm_unlocked(unsigned long start, int nr_pages,
return ret;
}
-/**
- * get_user_pages_fast() - pin user pages in memory
- * @start: starting user address
- * @nr_pages: number of pages from start to pin
- * @gup_flags: flags modifying pin behaviour
- * @pages: array that receives pointers to the pages pinned.
- * Should be at least nr_pages long.
- *
- * Attempt to pin user pages in memory without taking mm->mmap_sem.
- * If not successful, it will fall back to taking the lock and
- * calling get_user_pages().
- *
- * Returns number of pages pinned. This may be fewer than the number
- * requested. If nr_pages is 0 or negative, returns 0. If no pages
- * were pinned, returns -errno.
- */
-int get_user_pages_fast(unsigned long start, int nr_pages,
- unsigned int gup_flags, struct page **pages)
+static int internal_get_user_pages_fast(unsigned long start, int nr_pages,
+ unsigned int gup_flags,
+ struct page **pages)
{
unsigned long addr, len, end;
int nr = 0, ret = 0;
- if (WARN_ON_ONCE(gup_flags & ~(FOLL_WRITE | FOLL_LONGTERM)))
+ if (WARN_ON_ONCE(gup_flags & ~(FOLL_WRITE | FOLL_LONGTERM |
+ FOLL_FORCE | FOLL_PIN)))
return -EINVAL;
start = untagged_addr(start) & PAGE_MASK;
@@ -2455,4 +2475,103 @@ int get_user_pages_fast(unsigned long start, int nr_pages,
return ret;
}
+
+/**
+ * get_user_pages_fast() - pin user pages in memory
+ * @start: starting user address
+ * @nr_pages: number of pages from start to pin
+ * @gup_flags: flags modifying pin behaviour
+ * @pages: array that receives pointers to the pages pinned.
+ * Should be at least nr_pages long.
+ *
+ * Attempt to pin user pages in memory without taking mm->mmap_sem.
+ * If not successful, it will fall back to taking the lock and
+ * calling get_user_pages().
+ *
+ * Returns number of pages pinned. This may be fewer than the number requested.
+ * If nr_pages is 0 or negative, returns 0. If no pages were pinned, returns
+ * -errno.
+ */
+int get_user_pages_fast(unsigned long start, int nr_pages,
+ unsigned int gup_flags, struct page **pages)
+{
+ /*
+ * FOLL_PIN must only be set internally by the pin_user_pages*() APIs,
+ * never directly by the caller, so enforce that:
+ */
+ if (WARN_ON_ONCE(gup_flags & FOLL_PIN))
+ return -EINVAL;
+
+ return internal_get_user_pages_fast(start, nr_pages, gup_flags, pages);
+}
EXPORT_SYMBOL_GPL(get_user_pages_fast);
+
+/**
+ * pin_user_pages_fast() - pin user pages in memory without taking locks
+ *
+ * For now, this is a placeholder function, until various call sites are
+ * converted to use the correct get_user_pages*() or pin_user_pages*() API. So,
+ * this is identical to get_user_pages_fast().
+ *
+ * This is intended for Case 1 (DIO) in Documentation/vm/pin_user_pages.rst. It
+ * is NOT intended for Case 2 (RDMA: long-term pins).
+ */
+int pin_user_pages_fast(unsigned long start, int nr_pages,
+ unsigned int gup_flags, struct page **pages)
+{
+ /*
+ * This is a placeholder, until the pin functionality is activated.
+ * Until then, just behave like the corresponding get_user_pages*()
+ * routine.
+ */
+ return get_user_pages_fast(start, nr_pages, gup_flags, pages);
+}
+EXPORT_SYMBOL_GPL(pin_user_pages_fast);
+
+/**
+ * pin_user_pages_remote() - pin pages of a remote process (task != current)
+ *
+ * For now, this is a placeholder function, until various call sites are
+ * converted to use the correct get_user_pages*() or pin_user_pages*() API. So,
+ * this is identical to get_user_pages_remote().
+ *
+ * This is intended for Case 1 (DIO) in Documentation/vm/pin_user_pages.rst. It
+ * is NOT intended for Case 2 (RDMA: long-term pins).
+ */
+long pin_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
+ unsigned long start, unsigned long nr_pages,
+ unsigned int gup_flags, struct page **pages,
+ struct vm_area_struct **vmas, int *locked)
+{
+ /*
+ * This is a placeholder, until the pin functionality is activated.
+ * Until then, just behave like the corresponding get_user_pages*()
+ * routine.
+ */
+ return get_user_pages_remote(tsk, mm, start, nr_pages, gup_flags, pages,
+ vmas, locked);
+}
+EXPORT_SYMBOL(pin_user_pages_remote);
+
+/**
+ * pin_user_pages() - pin user pages in memory for use by other devices
+ *
+ * For now, this is a placeholder function, until various call sites are
+ * converted to use the correct get_user_pages*() or pin_user_pages*() API. So,
+ * this is identical to get_user_pages().
+ *
+ * This is intended for Case 1 (DIO) in Documentation/vm/pin_user_pages.rst. It
+ * is NOT intended for Case 2 (RDMA: long-term pins).
+ */
+long pin_user_pages(unsigned long start, unsigned long nr_pages,
+ unsigned int gup_flags, struct page **pages,
+ struct vm_area_struct **vmas)
+{
+ /*
+ * This is a placeholder, until the pin functionality is activated.
+ * Until then, just behave like the corresponding get_user_pages*()
+ * routine.
+ */
+ return get_user_pages(start, nr_pages, gup_flags, pages, vmas);
+}
+EXPORT_SYMBOL(pin_user_pages);
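
Taken together, the renames and the new wrappers give "Case 1" (DIO-style) callers a symmetric pin/unpin pair, per Documentation/vm/pin_user_pages.rst. A minimal sketch of how such a caller might look once the call sites are converted; the function and variable names are illustrative, not part of this patch:

static int my_dio_rw(unsigned long user_addr, int npages, struct page **pages)
{
	int pinned;

	pinned = pin_user_pages_fast(user_addr, npages, FOLL_WRITE, pages);
	if (pinned <= 0)
		return pinned ? pinned : -EFAULT;

	/* ... hand the pinned pages to the device and perform the I/O ... */

	/* Release the pins, dirtying each (head) page, instead of put_page(). */
	unpin_user_pages_dirty_lock(pages, pinned, true);
	return 0;
}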
diff --git a/mm/gup_benchmark.c b/mm/gup_benchmark.c
index ad9d5b1c4473..8dba38e79a9f 100644
--- a/mm/gup_benchmark.c
+++ b/mm/gup_benchmark.c
@@ -49,18 +49,21 @@ static int __gup_benchmark_ioctl(unsigned int cmd,
nr = (next - addr) / PAGE_SIZE;
}
+ /* Filter out most gup flags: only allow a tiny subset here: */
+ gup->flags &= FOLL_WRITE;
+
switch (cmd) {
case GUP_FAST_BENCHMARK:
- nr = get_user_pages_fast(addr, nr, gup->flags & 1,
+ nr = get_user_pages_fast(addr, nr, gup->flags,
pages + i);
break;
case GUP_LONGTERM_BENCHMARK:
nr = get_user_pages(addr, nr,
- (gup->flags & 1) | FOLL_LONGTERM,
+ gup->flags | FOLL_LONGTERM,
pages + i, NULL);
break;
case GUP_BENCHMARK:
- nr = get_user_pages(addr, nr, gup->flags & 1, pages + i,
+ nr = get_user_pages(addr, nr, gup->flags, pages + i,
NULL);
break;
default:
diff --git a/mm/highmem.c b/mm/highmem.c
index 107b10f9878e..64d8dea47dd1 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -29,7 +29,7 @@
#include <linux/highmem.h>
#include <linux/kgdb.h>
#include <asm/tlbflush.h>
-
+#include <linux/vmalloc.h>
#if defined(CONFIG_HIGHMEM) || defined(CONFIG_X86_32)
DEFINE_PER_CPU(int, __kmap_atomic_idx);
diff --git a/mm/hmm.c b/mm/hmm.c
index d379cb6496ae..72e5a6d9a417 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -186,7 +186,7 @@ static void hmm_range_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
}
static int hmm_vma_walk_hole(unsigned long addr, unsigned long end,
- struct mm_walk *walk)
+ __always_unused int depth, struct mm_walk *walk)
{
struct hmm_vma_walk *hmm_vma_walk = walk->private;
struct hmm_range *range = hmm_vma_walk->range;
@@ -380,7 +380,7 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
again:
pmd = READ_ONCE(*pmdp);
if (pmd_none(pmd))
- return hmm_vma_walk_hole(start, end, walk);
+ return hmm_vma_walk_hole(start, end, -1, walk);
if (thp_migration_supported() && is_pmd_migration_entry(pmd)) {
bool fault, write_fault;
@@ -474,23 +474,32 @@ static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end,
{
struct hmm_vma_walk *hmm_vma_walk = walk->private;
struct hmm_range *range = hmm_vma_walk->range;
- unsigned long addr = start, next;
- pmd_t *pmdp;
+ unsigned long addr = start;
pud_t pud;
- int ret;
+ int ret = 0;
+ spinlock_t *ptl = pud_trans_huge_lock(pudp, walk->vma);
+
+ if (!ptl)
+ return 0;
+
+ /* Normally we don't want to split the huge page */
+ walk->action = ACTION_CONTINUE;
-again:
pud = READ_ONCE(*pudp);
- if (pud_none(pud))
- return hmm_vma_walk_hole(start, end, walk);
+ if (pud_none(pud)) {
+ ret = hmm_vma_walk_hole(start, end, -1, walk);
+ goto out_unlock;
+ }
if (pud_huge(pud) && pud_devmap(pud)) {
unsigned long i, npages, pfn;
uint64_t *pfns, cpu_flags;
bool fault, write_fault;
- if (!pud_present(pud))
- return hmm_vma_walk_hole(start, end, walk);
+ if (!pud_present(pud)) {
+ ret = hmm_vma_walk_hole(start, end, -1, walk);
+ goto out_unlock;
+ }
i = (addr - range->start) >> PAGE_SHIFT;
npages = (end - addr) >> PAGE_SHIFT;
@@ -499,16 +508,20 @@ again:
cpu_flags = pud_to_hmm_pfn_flags(range, pud);
hmm_range_need_fault(hmm_vma_walk, pfns, npages,
cpu_flags, &fault, &write_fault);
- if (fault || write_fault)
- return hmm_vma_walk_hole_(addr, end, fault,
- write_fault, walk);
+ if (fault || write_fault) {
+ ret = hmm_vma_walk_hole_(addr, end, fault,
+ write_fault, walk);
+ goto out_unlock;
+ }
pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
for (i = 0; i < npages; ++i, ++pfn) {
hmm_vma_walk->pgmap = get_dev_pagemap(pfn,
hmm_vma_walk->pgmap);
- if (unlikely(!hmm_vma_walk->pgmap))
- return -EBUSY;
+ if (unlikely(!hmm_vma_walk->pgmap)) {
+ ret = -EBUSY;
+ goto out_unlock;
+ }
pfns[i] = hmm_device_entry_from_pfn(range, pfn) |
cpu_flags;
}
@@ -517,22 +530,15 @@ again:
hmm_vma_walk->pgmap = NULL;
}
hmm_vma_walk->last = end;
- return 0;
+ goto out_unlock;
}
- split_huge_pud(walk->vma, pudp, addr);
- if (pud_none(*pudp))
- goto again;
+ /* Ask for the PUD to be split */
+ walk->action = ACTION_SUBTREE;
- pmdp = pmd_offset(pudp, addr);
- do {
- next = pmd_addr_end(addr, end);
- ret = hmm_vma_walk_pmd(pmdp, addr, next, walk);
- if (ret)
- return ret;
- } while (pmdp++, addr = next, addr != end);
-
- return 0;
+out_unlock:
+ spin_unlock(ptl);
+ return ret;
}
#else
#define hmm_vma_walk_pud NULL
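
The rewrite above leans on the pagewalk core's walk->action protocol instead of splitting the PUD by hand: ACTION_CONTINUE skips the subtree, ACTION_SUBTREE asks the walker to split and descend. A minimal sketch of that protocol in a generic pud_entry callback, assuming the <linux/pagewalk.h> API; the callback name and the devmap handling are illustrative:

#include <linux/pagewalk.h>

static int my_pud_entry(pud_t *pudp, unsigned long start, unsigned long end,
			struct mm_walk *walk)
{
	spinlock_t *ptl = pud_trans_huge_lock(pudp, walk->vma);
	pud_t pud;

	if (!ptl)
		return 0;	/* not a huge PUD: the walker descends normally */

	/* Handle the huge PUD here and skip the subtree by default. */
	walk->action = ACTION_CONTINUE;

	pud = READ_ONCE(*pudp);
	if (pud_huge(pud) && pud_devmap(pud)) {
		/* ... record the pfns covered by this huge PUD ... */
	} else {
		/* Ask the core walker to split it and call the pmd/pte hooks. */
		walk->action = ACTION_SUBTREE;
	}

	spin_unlock(ptl);
	return 0;
}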
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 41a0fbddc96b..b08b199f9a11 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -177,16 +177,13 @@ static ssize_t enabled_store(struct kobject *kobj,
{
ssize_t ret = count;
- if (!memcmp("always", buf,
- min(sizeof("always")-1, count))) {
+ if (sysfs_streq(buf, "always")) {
clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
set_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
- } else if (!memcmp("madvise", buf,
- min(sizeof("madvise")-1, count))) {
+ } else if (sysfs_streq(buf, "madvise")) {
clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
- } else if (!memcmp("never", buf,
- min(sizeof("never")-1, count))) {
+ } else if (sysfs_streq(buf, "never")) {
clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
} else
@@ -250,32 +247,27 @@ static ssize_t defrag_store(struct kobject *kobj,
struct kobj_attribute *attr,
const char *buf, size_t count)
{
- if (!memcmp("always", buf,
- min(sizeof("always")-1, count))) {
+ if (sysfs_streq(buf, "always")) {
clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
- } else if (!memcmp("defer+madvise", buf,
- min(sizeof("defer+madvise")-1, count))) {
+ } else if (sysfs_streq(buf, "defer+madvise")) {
clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
- } else if (!memcmp("defer", buf,
- min(sizeof("defer")-1, count))) {
+ } else if (sysfs_streq(buf, "defer")) {
clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
- } else if (!memcmp("madvise", buf,
- min(sizeof("madvise")-1, count))) {
+ } else if (sysfs_streq(buf, "madvise")) {
clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
- } else if (!memcmp("never", buf,
- min(sizeof("never")-1, count))) {
+ } else if (sysfs_streq(buf, "never")) {
clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
@@ -527,13 +519,24 @@ void prep_transhuge_page(struct page *page)
set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR);
}
-static unsigned long __thp_get_unmapped_area(struct file *filp, unsigned long len,
+bool is_transparent_hugepage(struct page *page)
+{
+ if (!PageCompound(page))
+ return 0;
+
+ page = compound_head(page);
+ return is_huge_zero_page(page) ||
+ page[1].compound_dtor == TRANSHUGE_PAGE_DTOR;
+}
+EXPORT_SYMBOL_GPL(is_transparent_hugepage);
+
+static unsigned long __thp_get_unmapped_area(struct file *filp,
+ unsigned long addr, unsigned long len,
loff_t off, unsigned long flags, unsigned long size)
{
- unsigned long addr;
loff_t off_end = off + len;
loff_t off_align = round_up(off, size);
- unsigned long len_pad;
+ unsigned long len_pad, ret;
if (off_end <= off_align || (off_end - off_align) < size)
return 0;
@@ -542,30 +545,40 @@ static unsigned long __thp_get_unmapped_area(struct file *filp, unsigned long le
if (len_pad < len || (off + len_pad) < off)
return 0;
- addr = current->mm->get_unmapped_area(filp, 0, len_pad,
+ ret = current->mm->get_unmapped_area(filp, addr, len_pad,
off >> PAGE_SHIFT, flags);
- if (IS_ERR_VALUE(addr))
+
+ /*
+ * The failure might be due to length padding. The caller will retry
+ * without the padding.
+ */
+ if (IS_ERR_VALUE(ret))
return 0;
- addr += (off - addr) & (size - 1);
- return addr;
+ /*
+ * Do not try to align to THP boundary if allocation at the address
+ * hint succeeds.
+ */
+ if (ret == addr)
+ return addr;
+
+ ret += (off - ret) & (size - 1);
+ return ret;
}
unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
unsigned long len, unsigned long pgoff, unsigned long flags)
{
+ unsigned long ret;
loff_t off = (loff_t)pgoff << PAGE_SHIFT;
- if (addr)
- goto out;
if (!IS_DAX(filp->f_mapping->host) || !IS_ENABLED(CONFIG_FS_DAX_PMD))
goto out;
- addr = __thp_get_unmapped_area(filp, len, off, flags, PMD_SIZE);
- if (addr)
- return addr;
-
- out:
+ ret = __thp_get_unmapped_area(filp, addr, len, off, flags, PMD_SIZE);
+ if (ret)
+ return ret;
+out:
return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags);
}
EXPORT_SYMBOL_GPL(thp_get_unmapped_area);
@@ -2694,7 +2707,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
{
struct page *head = compound_head(page);
struct pglist_data *pgdata = NODE_DATA(page_to_nid(head));
- struct deferred_split *ds_queue = get_deferred_split_queue(page);
+ struct deferred_split *ds_queue = get_deferred_split_queue(head);
struct anon_vma *anon_vma = NULL;
struct address_space *mapping = NULL;
int count, mapcount, extra_pins, ret;
@@ -2702,11 +2715,11 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
unsigned long flags;
pgoff_t end;
- VM_BUG_ON_PAGE(is_huge_zero_page(page), page);
- VM_BUG_ON_PAGE(!PageLocked(page), page);
- VM_BUG_ON_PAGE(!PageCompound(page), page);
+ VM_BUG_ON_PAGE(is_huge_zero_page(head), head);
+ VM_BUG_ON_PAGE(!PageLocked(head), head);
+ VM_BUG_ON_PAGE(!PageCompound(head), head);
- if (PageWriteback(page))
+ if (PageWriteback(head))
return -EBUSY;
if (PageAnon(head)) {
@@ -2757,7 +2770,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
goto out_unlock;
}
- mlocked = PageMlocked(page);
+ mlocked = PageMlocked(head);
unmap_page(head);
VM_BUG_ON_PAGE(compound_mapcount(head), head);
@@ -2789,14 +2802,14 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
ds_queue->split_queue_len--;
list_del(page_deferred_list(head));
}
+ spin_unlock(&ds_queue->split_queue_lock);
if (mapping) {
- if (PageSwapBacked(page))
- __dec_node_page_state(page, NR_SHMEM_THPS);
+ if (PageSwapBacked(head))
+ __dec_node_page_state(head, NR_SHMEM_THPS);
else
- __dec_node_page_state(page, NR_FILE_THPS);
+ __dec_node_page_state(head, NR_FILE_THPS);
}
- spin_unlock(&ds_queue->split_queue_lock);
__split_huge_page(page, list, end, flags);
if (PageSwapCache(head)) {
swp_entry_t entry = { .val = page_private(head) };
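
The sysfs_streq() conversion above matches the whole token, with or without the trailing newline that "echo always > enabled" writes, whereas the old memcmp()/min() tests could also match on incomplete or over-long input. A minimal sketch of the same pattern in a generic kobj_attribute store handler; the name and the enable flag are illustrative:

static ssize_t example_store(struct kobject *kobj, struct kobj_attribute *attr,
			     const char *buf, size_t count)
{
	bool enable;

	if (sysfs_streq(buf, "always"))
		enable = true;
	else if (sysfs_streq(buf, "never"))
		enable = false;
	else
		return -EINVAL;

	/* ... apply @enable ... */
	return count;
}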
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
index 2ac38bdc18a1..e434b05416c6 100644
--- a/mm/hugetlb_cgroup.c
+++ b/mm/hugetlb_cgroup.c
@@ -3,6 +3,10 @@
* Copyright IBM Corporation, 2012
* Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
*
+ * Cgroup v2
+ * Copyright (C) 2019 Red Hat, Inc.
+ * Author: Giuseppe Scrivano <gscrivan@redhat.com>
+ *
* This program is free software; you can redistribute it and/or modify it
* under the terms of version 2.1 of the GNU Lesser General Public License
* as published by the Free Software Foundation.
@@ -19,18 +23,36 @@
#include <linux/hugetlb.h>
#include <linux/hugetlb_cgroup.h>
+enum hugetlb_memory_event {
+ HUGETLB_MAX,
+ HUGETLB_NR_MEMORY_EVENTS,
+};
+
struct hugetlb_cgroup {
struct cgroup_subsys_state css;
+
/*
* the counter to account for hugepages from hugetlb.
*/
struct page_counter hugepage[HUGE_MAX_HSTATE];
+
+ atomic_long_t events[HUGE_MAX_HSTATE][HUGETLB_NR_MEMORY_EVENTS];
+ atomic_long_t events_local[HUGE_MAX_HSTATE][HUGETLB_NR_MEMORY_EVENTS];
+
+ /* Handle for "hugetlb.events" */
+ struct cgroup_file events_file[HUGE_MAX_HSTATE];
+
+ /* Handle for "hugetlb.events.local" */
+ struct cgroup_file events_local_file[HUGE_MAX_HSTATE];
};
#define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val))
#define MEMFILE_IDX(val) (((val) >> 16) & 0xffff)
#define MEMFILE_ATTR(val) ((val) & 0xffff)
+#define hugetlb_cgroup_from_counter(counter, idx) \
+ container_of(counter, struct hugetlb_cgroup, hugepage[idx])
+
static struct hugetlb_cgroup *root_h_cgroup __read_mostly;
static inline
@@ -178,6 +200,19 @@ static void hugetlb_cgroup_css_offline(struct cgroup_subsys_state *css)
} while (hugetlb_cgroup_have_usage(h_cg));
}
+static inline void hugetlb_event(struct hugetlb_cgroup *hugetlb, int idx,
+ enum hugetlb_memory_event event)
+{
+ atomic_long_inc(&hugetlb->events_local[idx][event]);
+ cgroup_file_notify(&hugetlb->events_local_file[idx]);
+
+ do {
+ atomic_long_inc(&hugetlb->events[idx][event]);
+ cgroup_file_notify(&hugetlb->events_file[idx]);
+ } while ((hugetlb = parent_hugetlb_cgroup(hugetlb)) &&
+ !hugetlb_cgroup_is_root(hugetlb));
+}
+
int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
struct hugetlb_cgroup **ptr)
{
@@ -202,8 +237,12 @@ again:
}
rcu_read_unlock();
- if (!page_counter_try_charge(&h_cg->hugepage[idx], nr_pages, &counter))
+ if (!page_counter_try_charge(&h_cg->hugepage[idx], nr_pages,
+ &counter)) {
ret = -ENOMEM;
+ hugetlb_event(hugetlb_cgroup_from_counter(counter, idx), idx,
+ HUGETLB_MAX);
+ }
css_put(&h_cg->css);
done:
*ptr = h_cg;
@@ -283,10 +322,45 @@ static u64 hugetlb_cgroup_read_u64(struct cgroup_subsys_state *css,
}
}
+static int hugetlb_cgroup_read_u64_max(struct seq_file *seq, void *v)
+{
+ int idx;
+ u64 val;
+ struct cftype *cft = seq_cft(seq);
+ unsigned long limit;
+ struct page_counter *counter;
+ struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(seq_css(seq));
+
+ idx = MEMFILE_IDX(cft->private);
+ counter = &h_cg->hugepage[idx];
+
+ limit = round_down(PAGE_COUNTER_MAX,
+ 1 << huge_page_order(&hstates[idx]));
+
+ switch (MEMFILE_ATTR(cft->private)) {
+ case RES_USAGE:
+ val = (u64)page_counter_read(counter);
+ seq_printf(seq, "%llu\n", val * PAGE_SIZE);
+ break;
+ case RES_LIMIT:
+ val = (u64)counter->max;
+ if (val == limit)
+ seq_puts(seq, "max\n");
+ else
+ seq_printf(seq, "%llu\n", val * PAGE_SIZE);
+ break;
+ default:
+ BUG();
+ }
+
+ return 0;
+}
+
static DEFINE_MUTEX(hugetlb_limit_mutex);
static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of,
- char *buf, size_t nbytes, loff_t off)
+ char *buf, size_t nbytes, loff_t off,
+ const char *max)
{
int ret, idx;
unsigned long nr_pages;
@@ -296,7 +370,7 @@ static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of,
return -EINVAL;
buf = strstrip(buf);
- ret = page_counter_memparse(buf, "-1", &nr_pages);
+ ret = page_counter_memparse(buf, max, &nr_pages);
if (ret)
return ret;
@@ -316,6 +390,18 @@ static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of,
return ret ?: nbytes;
}
+static ssize_t hugetlb_cgroup_write_legacy(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ return hugetlb_cgroup_write(of, buf, nbytes, off, "-1");
+}
+
+static ssize_t hugetlb_cgroup_write_dfl(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ return hugetlb_cgroup_write(of, buf, nbytes, off, "max");
+}
+
static ssize_t hugetlb_cgroup_reset(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off)
{
@@ -350,7 +436,36 @@ static char *mem_fmt(char *buf, int size, unsigned long hsize)
return buf;
}
-static void __init __hugetlb_cgroup_file_init(int idx)
+static int __hugetlb_events_show(struct seq_file *seq, bool local)
+{
+ int idx;
+ long max;
+ struct cftype *cft = seq_cft(seq);
+ struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(seq_css(seq));
+
+ idx = MEMFILE_IDX(cft->private);
+
+ if (local)
+ max = atomic_long_read(&h_cg->events_local[idx][HUGETLB_MAX]);
+ else
+ max = atomic_long_read(&h_cg->events[idx][HUGETLB_MAX]);
+
+ seq_printf(seq, "max %lu\n", max);
+
+ return 0;
+}
+
+static int hugetlb_events_show(struct seq_file *seq, void *v)
+{
+ return __hugetlb_events_show(seq, false);
+}
+
+static int hugetlb_events_local_show(struct seq_file *seq, void *v)
+{
+ return __hugetlb_events_show(seq, true);
+}
+
+static void __init __hugetlb_cgroup_file_dfl_init(int idx)
{
char buf[32];
struct cftype *cft;
@@ -360,38 +475,93 @@ static void __init __hugetlb_cgroup_file_init(int idx)
mem_fmt(buf, 32, huge_page_size(h));
/* Add the limit file */
- cft = &h->cgroup_files[0];
+ cft = &h->cgroup_files_dfl[0];
+ snprintf(cft->name, MAX_CFTYPE_NAME, "%s.max", buf);
+ cft->private = MEMFILE_PRIVATE(idx, RES_LIMIT);
+ cft->seq_show = hugetlb_cgroup_read_u64_max;
+ cft->write = hugetlb_cgroup_write_dfl;
+ cft->flags = CFTYPE_NOT_ON_ROOT;
+
+ /* Add the current usage file */
+ cft = &h->cgroup_files_dfl[1];
+ snprintf(cft->name, MAX_CFTYPE_NAME, "%s.current", buf);
+ cft->private = MEMFILE_PRIVATE(idx, RES_USAGE);
+ cft->seq_show = hugetlb_cgroup_read_u64_max;
+ cft->flags = CFTYPE_NOT_ON_ROOT;
+
+ /* Add the events file */
+ cft = &h->cgroup_files_dfl[2];
+ snprintf(cft->name, MAX_CFTYPE_NAME, "%s.events", buf);
+ cft->private = MEMFILE_PRIVATE(idx, 0);
+ cft->seq_show = hugetlb_events_show;
+ cft->file_offset = offsetof(struct hugetlb_cgroup, events_file[idx]),
+ cft->flags = CFTYPE_NOT_ON_ROOT;
+
+ /* Add the events.local file */
+ cft = &h->cgroup_files_dfl[3];
+ snprintf(cft->name, MAX_CFTYPE_NAME, "%s.events.local", buf);
+ cft->private = MEMFILE_PRIVATE(idx, 0);
+ cft->seq_show = hugetlb_events_local_show;
+ cft->file_offset = offsetof(struct hugetlb_cgroup,
+ events_local_file[idx]),
+ cft->flags = CFTYPE_NOT_ON_ROOT;
+
+ /* NULL terminate the last cft */
+ cft = &h->cgroup_files_dfl[4];
+ memset(cft, 0, sizeof(*cft));
+
+ WARN_ON(cgroup_add_dfl_cftypes(&hugetlb_cgrp_subsys,
+ h->cgroup_files_dfl));
+}
+
+static void __init __hugetlb_cgroup_file_legacy_init(int idx)
+{
+ char buf[32];
+ struct cftype *cft;
+ struct hstate *h = &hstates[idx];
+
+ /* format the size */
+ mem_fmt(buf, 32, huge_page_size(h));
+
+ /* Add the limit file */
+ cft = &h->cgroup_files_legacy[0];
snprintf(cft->name, MAX_CFTYPE_NAME, "%s.limit_in_bytes", buf);
cft->private = MEMFILE_PRIVATE(idx, RES_LIMIT);
cft->read_u64 = hugetlb_cgroup_read_u64;
- cft->write = hugetlb_cgroup_write;
+ cft->write = hugetlb_cgroup_write_legacy;
/* Add the usage file */
- cft = &h->cgroup_files[1];
+ cft = &h->cgroup_files_legacy[1];
snprintf(cft->name, MAX_CFTYPE_NAME, "%s.usage_in_bytes", buf);
cft->private = MEMFILE_PRIVATE(idx, RES_USAGE);
cft->read_u64 = hugetlb_cgroup_read_u64;
/* Add the MAX usage file */
- cft = &h->cgroup_files[2];
+ cft = &h->cgroup_files_legacy[2];
snprintf(cft->name, MAX_CFTYPE_NAME, "%s.max_usage_in_bytes", buf);
cft->private = MEMFILE_PRIVATE(idx, RES_MAX_USAGE);
cft->write = hugetlb_cgroup_reset;
cft->read_u64 = hugetlb_cgroup_read_u64;
/* Add the failcntfile */
- cft = &h->cgroup_files[3];
+ cft = &h->cgroup_files_legacy[3];
snprintf(cft->name, MAX_CFTYPE_NAME, "%s.failcnt", buf);
cft->private = MEMFILE_PRIVATE(idx, RES_FAILCNT);
cft->write = hugetlb_cgroup_reset;
cft->read_u64 = hugetlb_cgroup_read_u64;
/* NULL terminate the last cft */
- cft = &h->cgroup_files[4];
+ cft = &h->cgroup_files_legacy[4];
memset(cft, 0, sizeof(*cft));
WARN_ON(cgroup_add_legacy_cftypes(&hugetlb_cgrp_subsys,
- h->cgroup_files));
+ h->cgroup_files_legacy));
+}
+
+static void __init __hugetlb_cgroup_file_init(int idx)
+{
+ __hugetlb_cgroup_file_dfl_init(idx);
+ __hugetlb_cgroup_file_legacy_init(idx);
}
void __init hugetlb_cgroup_file_init(void)
@@ -433,8 +603,14 @@ void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage)
return;
}
+static struct cftype hugetlb_files[] = {
+ {} /* terminate */
+};
+
struct cgroup_subsys hugetlb_cgrp_subsys = {
.css_alloc = hugetlb_cgroup_css_alloc,
.css_offline = hugetlb_cgroup_css_offline,
.css_free = hugetlb_cgroup_css_free,
+ .dfl_cftypes = hugetlb_files,
+ .legacy_cftypes = hugetlb_files,
};
diff --git a/mm/kasan/common.c b/mm/kasan/common.c
index c15d8ae68c96..6aa51723b92b 100644
--- a/mm/kasan/common.c
+++ b/mm/kasan/common.c
@@ -110,6 +110,7 @@ void *memset(void *addr, int c, size_t len)
return __memset(addr, c, len);
}
+#ifdef __HAVE_ARCH_MEMMOVE
#undef memmove
void *memmove(void *dest, const void *src, size_t len)
{
@@ -118,6 +119,7 @@ void *memmove(void *dest, const void *src, size_t len)
return __memmove(dest, src, len);
}
+#endif
#undef memcpy
void *memcpy(void *dest, const void *src, size_t len)
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
index 621782100eaa..5ef9f24f566b 100644
--- a/mm/kasan/report.c
+++ b/mm/kasan/report.c
@@ -512,3 +512,43 @@ void __kasan_report(unsigned long addr, size_t size, bool is_write, unsigned lon
end_report(&flags);
}
+
+#ifdef CONFIG_KASAN_INLINE
+/*
+ * With CONFIG_KASAN_INLINE, accesses to bogus pointers (outside the high
+ * canonical half of the address space) cause out-of-bounds shadow memory reads
+ * before the actual access. For addresses in the low canonical half of the
+ * address space, as well as most non-canonical addresses, that out-of-bounds
+ * shadow memory access lands in the non-canonical part of the address space.
+ * Help the user figure out what the original bogus pointer was.
+ */
+void kasan_non_canonical_hook(unsigned long addr)
+{
+ unsigned long orig_addr;
+ const char *bug_type;
+
+ if (addr < KASAN_SHADOW_OFFSET)
+ return;
+
+ orig_addr = (addr - KASAN_SHADOW_OFFSET) << KASAN_SHADOW_SCALE_SHIFT;
+ /*
+ * For faults near the shadow address for NULL, we can be fairly certain
+ * that this is a KASAN shadow memory access.
+ * For faults that correspond to shadow for low canonical addresses, we
+ * can still be pretty sure - that shadow region is a fairly narrow
+ * chunk of the non-canonical address space.
+ * But faults that look like shadow for non-canonical addresses are a
+ * really large chunk of the address space. In that case, we still
+ * print the decoded address, but make it clear that this is not
+ * necessarily what's actually going on.
+ */
+ if (orig_addr < PAGE_SIZE)
+ bug_type = "null-ptr-deref";
+ else if (orig_addr < TASK_SIZE)
+ bug_type = "probably user-memory-access";
+ else
+ bug_type = "maybe wild-memory-access";
+ pr_alert("KASAN: %s in range [0x%016lx-0x%016lx]\n", bug_type,
+ orig_addr, orig_addr + KASAN_SHADOW_MASK);
+}
+#endif
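
As a worked example of the decoding above: on x86-64 with generic KASAN, KASAN_SHADOW_SCALE_SHIFT is 3 and KASAN_SHADOW_OFFSET is 0xdffffc0000000000 (architecture configuration, not part of this patch), so a fault at shadow address 0xdffffc0000000010 decodes as

	orig_addr = (0xdffffc0000000010 - 0xdffffc0000000000) << 3 = 0x80

which is below PAGE_SIZE, and the hook reports "null-ptr-deref" for the range [0x0000000000000080-0x0000000000000087].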
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 244607663363..3a4259eeb5a0 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -13,7 +13,7 @@
*
* The following locks and mutexes are used by kmemleak:
*
- * - kmemleak_lock (rwlock): protects the object_list modifications and
+ * - kmemleak_lock (raw_spinlock_t): protects the object_list modifications and
* accesses to the object_tree_root. The object_list is the main list
* holding the metadata (struct kmemleak_object) for the allocated memory
* blocks. The object_tree_root is a red black tree used to look-up
@@ -22,13 +22,13 @@
* object_tree_root in the create_object() function called from the
* kmemleak_alloc() callback and removed in delete_object() called from the
* kmemleak_free() callback
- * - kmemleak_object.lock (spinlock): protects a kmemleak_object. Accesses to
- * the metadata (e.g. count) are protected by this lock. Note that some
- * members of this structure may be protected by other means (atomic or
- * kmemleak_lock). This lock is also held when scanning the corresponding
- * memory block to avoid the kernel freeing it via the kmemleak_free()
- * callback. This is less heavyweight than holding a global lock like
- * kmemleak_lock during scanning
+ * - kmemleak_object.lock (raw_spinlock_t): protects a kmemleak_object.
+ * Accesses to the metadata (e.g. count) are protected by this lock. Note
+ * that some members of this structure may be protected by other means
+ * (atomic or kmemleak_lock). This lock is also held when scanning the
+ * corresponding memory block to avoid the kernel freeing it via the
+ * kmemleak_free() callback. This is less heavyweight than holding a global
+ * lock like kmemleak_lock during scanning.
* - scan_mutex (mutex): ensures that only one thread may scan the memory for
* unreferenced objects at a time. The gray_list contains the objects which
* are already referenced or marked as false positives and need to be
@@ -135,7 +135,7 @@ struct kmemleak_scan_area {
* (use_count) and freed using the RCU mechanism.
*/
struct kmemleak_object {
- spinlock_t lock;
+ raw_spinlock_t lock;
unsigned int flags; /* object status flags */
struct list_head object_list;
struct list_head gray_list;
@@ -191,8 +191,8 @@ static int mem_pool_free_count = ARRAY_SIZE(mem_pool);
static LIST_HEAD(mem_pool_free_list);
/* search tree for object boundaries */
static struct rb_root object_tree_root = RB_ROOT;
-/* rw_lock protecting the access to object_list and object_tree_root */
-static DEFINE_RWLOCK(kmemleak_lock);
+/* protecting the access to object_list and object_tree_root */
+static DEFINE_RAW_SPINLOCK(kmemleak_lock);
/* allocation caches for kmemleak internal data */
static struct kmem_cache *object_cache;
@@ -426,7 +426,7 @@ static struct kmemleak_object *mem_pool_alloc(gfp_t gfp)
}
/* slab allocation failed, try the memory pool */
- write_lock_irqsave(&kmemleak_lock, flags);
+ raw_spin_lock_irqsave(&kmemleak_lock, flags);
object = list_first_entry_or_null(&mem_pool_free_list,
typeof(*object), object_list);
if (object)
@@ -435,7 +435,7 @@ static struct kmemleak_object *mem_pool_alloc(gfp_t gfp)
object = &mem_pool[--mem_pool_free_count];
else
pr_warn_once("Memory pool empty, consider increasing CONFIG_DEBUG_KMEMLEAK_MEM_POOL_SIZE\n");
- write_unlock_irqrestore(&kmemleak_lock, flags);
+ raw_spin_unlock_irqrestore(&kmemleak_lock, flags);
return object;
}
@@ -453,9 +453,9 @@ static void mem_pool_free(struct kmemleak_object *object)
}
/* add the object to the memory pool free list */
- write_lock_irqsave(&kmemleak_lock, flags);
+ raw_spin_lock_irqsave(&kmemleak_lock, flags);
list_add(&object->object_list, &mem_pool_free_list);
- write_unlock_irqrestore(&kmemleak_lock, flags);
+ raw_spin_unlock_irqrestore(&kmemleak_lock, flags);
}
/*
@@ -514,9 +514,9 @@ static struct kmemleak_object *find_and_get_object(unsigned long ptr, int alias)
struct kmemleak_object *object;
rcu_read_lock();
- read_lock_irqsave(&kmemleak_lock, flags);
+ raw_spin_lock_irqsave(&kmemleak_lock, flags);
object = lookup_object(ptr, alias);
- read_unlock_irqrestore(&kmemleak_lock, flags);
+ raw_spin_unlock_irqrestore(&kmemleak_lock, flags);
/* check whether the object is still available */
if (object && !get_object(object))
@@ -546,11 +546,11 @@ static struct kmemleak_object *find_and_remove_object(unsigned long ptr, int ali
unsigned long flags;
struct kmemleak_object *object;
- write_lock_irqsave(&kmemleak_lock, flags);
+ raw_spin_lock_irqsave(&kmemleak_lock, flags);
object = lookup_object(ptr, alias);
if (object)
__remove_object(object);
- write_unlock_irqrestore(&kmemleak_lock, flags);
+ raw_spin_unlock_irqrestore(&kmemleak_lock, flags);
return object;
}
@@ -585,7 +585,7 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
INIT_LIST_HEAD(&object->object_list);
INIT_LIST_HEAD(&object->gray_list);
INIT_HLIST_HEAD(&object->area_list);
- spin_lock_init(&object->lock);
+ raw_spin_lock_init(&object->lock);
atomic_set(&object->use_count, 1);
object->flags = OBJECT_ALLOCATED;
object->pointer = ptr;
@@ -617,7 +617,7 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
/* kernel backtrace */
object->trace_len = __save_stack_trace(object->trace);
- write_lock_irqsave(&kmemleak_lock, flags);
+ raw_spin_lock_irqsave(&kmemleak_lock, flags);
untagged_ptr = (unsigned long)kasan_reset_tag((void *)ptr);
min_addr = min(min_addr, untagged_ptr);
@@ -649,7 +649,7 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
list_add_tail_rcu(&object->object_list, &object_list);
out:
- write_unlock_irqrestore(&kmemleak_lock, flags);
+ raw_spin_unlock_irqrestore(&kmemleak_lock, flags);
return object;
}
@@ -667,9 +667,9 @@ static void __delete_object(struct kmemleak_object *object)
* Locking here also ensures that the corresponding memory block
* cannot be freed when it is being scanned.
*/
- spin_lock_irqsave(&object->lock, flags);
+ raw_spin_lock_irqsave(&object->lock, flags);
object->flags &= ~OBJECT_ALLOCATED;
- spin_unlock_irqrestore(&object->lock, flags);
+ raw_spin_unlock_irqrestore(&object->lock, flags);
put_object(object);
}
@@ -739,9 +739,9 @@ static void paint_it(struct kmemleak_object *object, int color)
{
unsigned long flags;
- spin_lock_irqsave(&object->lock, flags);
+ raw_spin_lock_irqsave(&object->lock, flags);
__paint_it(object, color);
- spin_unlock_irqrestore(&object->lock, flags);
+ raw_spin_unlock_irqrestore(&object->lock, flags);
}
static void paint_ptr(unsigned long ptr, int color)
@@ -798,7 +798,7 @@ static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp)
if (scan_area_cache)
area = kmem_cache_alloc(scan_area_cache, gfp_kmemleak_mask(gfp));
- spin_lock_irqsave(&object->lock, flags);
+ raw_spin_lock_irqsave(&object->lock, flags);
if (!area) {
pr_warn_once("Cannot allocate a scan area, scanning the full object\n");
/* mark the object for full scan to avoid false positives */
@@ -820,7 +820,7 @@ static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp)
hlist_add_head(&area->node, &object->area_list);
out_unlock:
- spin_unlock_irqrestore(&object->lock, flags);
+ raw_spin_unlock_irqrestore(&object->lock, flags);
put_object(object);
}
@@ -842,9 +842,9 @@ static void object_set_excess_ref(unsigned long ptr, unsigned long excess_ref)
return;
}
- spin_lock_irqsave(&object->lock, flags);
+ raw_spin_lock_irqsave(&object->lock, flags);
object->excess_ref = excess_ref;
- spin_unlock_irqrestore(&object->lock, flags);
+ raw_spin_unlock_irqrestore(&object->lock, flags);
put_object(object);
}
@@ -864,9 +864,9 @@ static void object_no_scan(unsigned long ptr)
return;
}
- spin_lock_irqsave(&object->lock, flags);
+ raw_spin_lock_irqsave(&object->lock, flags);
object->flags |= OBJECT_NO_SCAN;
- spin_unlock_irqrestore(&object->lock, flags);
+ raw_spin_unlock_irqrestore(&object->lock, flags);
put_object(object);
}
@@ -1026,9 +1026,9 @@ void __ref kmemleak_update_trace(const void *ptr)
return;
}
- spin_lock_irqsave(&object->lock, flags);
+ raw_spin_lock_irqsave(&object->lock, flags);
object->trace_len = __save_stack_trace(object->trace);
- spin_unlock_irqrestore(&object->lock, flags);
+ raw_spin_unlock_irqrestore(&object->lock, flags);
put_object(object);
}
@@ -1233,7 +1233,7 @@ static void scan_block(void *_start, void *_end,
unsigned long flags;
unsigned long untagged_ptr;
- read_lock_irqsave(&kmemleak_lock, flags);
+ raw_spin_lock_irqsave(&kmemleak_lock, flags);
for (ptr = start; ptr < end; ptr++) {
struct kmemleak_object *object;
unsigned long pointer;
@@ -1268,7 +1268,7 @@ static void scan_block(void *_start, void *_end,
* previously acquired in scan_object(). These locks are
* enclosed by scan_mutex.
*/
- spin_lock_nested(&object->lock, SINGLE_DEPTH_NESTING);
+ raw_spin_lock_nested(&object->lock, SINGLE_DEPTH_NESTING);
/* only pass surplus references (object already gray) */
if (color_gray(object)) {
excess_ref = object->excess_ref;
@@ -1277,7 +1277,7 @@ static void scan_block(void *_start, void *_end,
excess_ref = 0;
update_refs(object);
}
- spin_unlock(&object->lock);
+ raw_spin_unlock(&object->lock);
if (excess_ref) {
object = lookup_object(excess_ref, 0);
@@ -1286,12 +1286,12 @@ static void scan_block(void *_start, void *_end,
if (object == scanned)
/* circular reference, ignore */
continue;
- spin_lock_nested(&object->lock, SINGLE_DEPTH_NESTING);
+ raw_spin_lock_nested(&object->lock, SINGLE_DEPTH_NESTING);
update_refs(object);
- spin_unlock(&object->lock);
+ raw_spin_unlock(&object->lock);
}
}
- read_unlock_irqrestore(&kmemleak_lock, flags);
+ raw_spin_unlock_irqrestore(&kmemleak_lock, flags);
}
/*
@@ -1324,7 +1324,7 @@ static void scan_object(struct kmemleak_object *object)
* Once the object->lock is acquired, the corresponding memory block
* cannot be freed (the same lock is acquired in delete_object).
*/
- spin_lock_irqsave(&object->lock, flags);
+ raw_spin_lock_irqsave(&object->lock, flags);
if (object->flags & OBJECT_NO_SCAN)
goto out;
if (!(object->flags & OBJECT_ALLOCATED))
@@ -1344,9 +1344,9 @@ static void scan_object(struct kmemleak_object *object)
if (start >= end)
break;
- spin_unlock_irqrestore(&object->lock, flags);
+ raw_spin_unlock_irqrestore(&object->lock, flags);
cond_resched();
- spin_lock_irqsave(&object->lock, flags);
+ raw_spin_lock_irqsave(&object->lock, flags);
} while (object->flags & OBJECT_ALLOCATED);
} else
hlist_for_each_entry(area, &object->area_list, node)
@@ -1354,7 +1354,7 @@ static void scan_object(struct kmemleak_object *object)
(void *)(area->start + area->size),
object);
out:
- spin_unlock_irqrestore(&object->lock, flags);
+ raw_spin_unlock_irqrestore(&object->lock, flags);
}
/*
@@ -1407,7 +1407,7 @@ static void kmemleak_scan(void)
/* prepare the kmemleak_object's */
rcu_read_lock();
list_for_each_entry_rcu(object, &object_list, object_list) {
- spin_lock_irqsave(&object->lock, flags);
+ raw_spin_lock_irqsave(&object->lock, flags);
#ifdef DEBUG
/*
* With a few exceptions there should be a maximum of
@@ -1424,7 +1424,7 @@ static void kmemleak_scan(void)
if (color_gray(object) && get_object(object))
list_add_tail(&object->gray_list, &gray_list);
- spin_unlock_irqrestore(&object->lock, flags);
+ raw_spin_unlock_irqrestore(&object->lock, flags);
}
rcu_read_unlock();
@@ -1492,14 +1492,14 @@ static void kmemleak_scan(void)
*/
rcu_read_lock();
list_for_each_entry_rcu(object, &object_list, object_list) {
- spin_lock_irqsave(&object->lock, flags);
+ raw_spin_lock_irqsave(&object->lock, flags);
if (color_white(object) && (object->flags & OBJECT_ALLOCATED)
&& update_checksum(object) && get_object(object)) {
/* color it gray temporarily */
object->count = object->min_count;
list_add_tail(&object->gray_list, &gray_list);
}
- spin_unlock_irqrestore(&object->lock, flags);
+ raw_spin_unlock_irqrestore(&object->lock, flags);
}
rcu_read_unlock();
@@ -1519,7 +1519,7 @@ static void kmemleak_scan(void)
*/
rcu_read_lock();
list_for_each_entry_rcu(object, &object_list, object_list) {
- spin_lock_irqsave(&object->lock, flags);
+ raw_spin_lock_irqsave(&object->lock, flags);
if (unreferenced_object(object) &&
!(object->flags & OBJECT_REPORTED)) {
object->flags |= OBJECT_REPORTED;
@@ -1529,7 +1529,7 @@ static void kmemleak_scan(void)
new_leaks++;
}
- spin_unlock_irqrestore(&object->lock, flags);
+ raw_spin_unlock_irqrestore(&object->lock, flags);
}
rcu_read_unlock();
@@ -1681,10 +1681,10 @@ static int kmemleak_seq_show(struct seq_file *seq, void *v)
struct kmemleak_object *object = v;
unsigned long flags;
- spin_lock_irqsave(&object->lock, flags);
+ raw_spin_lock_irqsave(&object->lock, flags);
if ((object->flags & OBJECT_REPORTED) && unreferenced_object(object))
print_unreferenced(seq, object);
- spin_unlock_irqrestore(&object->lock, flags);
+ raw_spin_unlock_irqrestore(&object->lock, flags);
return 0;
}
@@ -1714,9 +1714,9 @@ static int dump_str_object_info(const char *str)
return -EINVAL;
}
- spin_lock_irqsave(&object->lock, flags);
+ raw_spin_lock_irqsave(&object->lock, flags);
dump_object_info(object);
- spin_unlock_irqrestore(&object->lock, flags);
+ raw_spin_unlock_irqrestore(&object->lock, flags);
put_object(object);
return 0;
@@ -1735,11 +1735,11 @@ static void kmemleak_clear(void)
rcu_read_lock();
list_for_each_entry_rcu(object, &object_list, object_list) {
- spin_lock_irqsave(&object->lock, flags);
+ raw_spin_lock_irqsave(&object->lock, flags);
if ((object->flags & OBJECT_REPORTED) &&
unreferenced_object(object))
__paint_it(object, KMEMLEAK_GREY);
- spin_unlock_irqrestore(&object->lock, flags);
+ raw_spin_unlock_irqrestore(&object->lock, flags);
}
rcu_read_unlock();
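The conversion is mechanical: every read_lock/write_lock pair on kmemleak_lock and every spin_lock on object->lock becomes the matching raw_spin_lock call, and raw spinlocks are not turned into sleeping locks on PREEMPT_RT, which is the usual motivation for such a change. A minimal sketch of the resulting idiom (names hypothetical):

#include <linux/spinlock.h>
#include <linux/list.h>

static DEFINE_RAW_SPINLOCK(demo_lock);		/* was: DEFINE_RWLOCK(demo_lock) */

static void demo_insert(struct list_head *obj, struct list_head *list)
{
	unsigned long flags;

	/* was: write_lock_irqsave(&demo_lock, flags) */
	raw_spin_lock_irqsave(&demo_lock, flags);
	list_add(obj, list);
	raw_spin_unlock_irqrestore(&demo_lock, flags);
}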
diff --git a/mm/madvise.c b/mm/madvise.c
index bcdb6a042787..43b47d3fae02 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -1044,7 +1044,7 @@ madvise_behavior_valid(int behavior)
* -EBADF - map exists, but area maps something that isn't a file.
* -EAGAIN - a kernel resource was temporarily unavailable.
*/
-SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
+int do_madvise(unsigned long start, size_t len_in, int behavior)
{
unsigned long end, tmp;
struct vm_area_struct *vma, *prev;
@@ -1141,3 +1141,8 @@ out:
return error;
}
+
+SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
+{
+ return do_madvise(start, len_in, behavior);
+}
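Splitting the body out into do_madvise() lets other kernel code apply madvise semantics to the current process without going through the syscall entry path. A hedged sketch of such an in-kernel caller (the caller itself is hypothetical):

#include <linux/mm.h>
#include <linux/mman.h>

static int demo_drop_range(unsigned long start, size_t len)
{
	/* Same semantics as madvise(2) on current->mm, minus the syscall plumbing. */
	return do_madvise(start, len, MADV_DONTNEED);
}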
diff --git a/mm/memblock.c b/mm/memblock.c
index 4bc2c7d8bf42..eba94ee3de0b 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -575,7 +575,7 @@ static void __init_memblock memblock_insert_region(struct memblock_type *type,
* Return:
* 0 on success, -errno on failure.
*/
-int __init_memblock memblock_add_range(struct memblock_type *type,
+static int __init_memblock memblock_add_range(struct memblock_type *type,
phys_addr_t base, phys_addr_t size,
int nid, enum memblock_flags flags)
{
@@ -694,7 +694,7 @@ int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size)
{
phys_addr_t end = base + size - 1;
- memblock_dbg("memblock_add: [%pa-%pa] %pS\n",
+ memblock_dbg("%s: [%pa-%pa] %pS\n", __func__,
&base, &end, (void *)_RET_IP_);
return memblock_add_range(&memblock.memory, base, size, MAX_NUMNODES, 0);
@@ -795,7 +795,7 @@ int __init_memblock memblock_remove(phys_addr_t base, phys_addr_t size)
{
phys_addr_t end = base + size - 1;
- memblock_dbg("memblock_remove: [%pa-%pa] %pS\n",
+ memblock_dbg("%s: [%pa-%pa] %pS\n", __func__,
&base, &end, (void *)_RET_IP_);
return memblock_remove_range(&memblock.memory, base, size);
@@ -813,7 +813,7 @@ int __init_memblock memblock_free(phys_addr_t base, phys_addr_t size)
{
phys_addr_t end = base + size - 1;
- memblock_dbg(" memblock_free: [%pa-%pa] %pS\n",
+ memblock_dbg("%s: [%pa-%pa] %pS\n", __func__,
&base, &end, (void *)_RET_IP_);
kmemleak_free_part_phys(base, size);
@@ -824,12 +824,24 @@ int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size)
{
phys_addr_t end = base + size - 1;
- memblock_dbg("memblock_reserve: [%pa-%pa] %pS\n",
+ memblock_dbg("%s: [%pa-%pa] %pS\n", __func__,
&base, &end, (void *)_RET_IP_);
return memblock_add_range(&memblock.reserved, base, size, MAX_NUMNODES, 0);
}
+#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
+int __init_memblock memblock_physmem_add(phys_addr_t base, phys_addr_t size)
+{
+ phys_addr_t end = base + size - 1;
+
+ memblock_dbg("%s: [%pa-%pa] %pS\n", __func__,
+ &base, &end, (void *)_RET_IP_);
+
+ return memblock_add_range(&memblock.physmem, base, size, MAX_NUMNODES, 0);
+}
+#endif
+
/**
* memblock_setclr_flag - set or clear flag for a memory region
* @base: base address of the region
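memblock_physmem_add() mirrors memblock_add() but records the range in the physmem list, and is only built when CONFIG_HAVE_MEMBLOCK_PHYS_MAP is set. A small sketch of early detection code using both; the caller is illustrative:

#include <linux/init.h>
#include <linux/memblock.h>

static void __init demo_register_detected_range(phys_addr_t base, phys_addr_t size)
{
	memblock_add(base, size);		/* usable memory, as before */
#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
	memblock_physmem_add(base, size);	/* also record the raw physical map */
#endif
}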
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index c5b5f74cfd4d..6f6dc8712e39 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3287,49 +3287,34 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
}
}
-static void memcg_flush_percpu_vmstats(struct mem_cgroup *memcg, bool slab_only)
+static void memcg_flush_percpu_vmstats(struct mem_cgroup *memcg)
{
- unsigned long stat[MEMCG_NR_STAT];
+ unsigned long stat[MEMCG_NR_STAT] = {0};
struct mem_cgroup *mi;
int node, cpu, i;
- int min_idx, max_idx;
-
- if (slab_only) {
- min_idx = NR_SLAB_RECLAIMABLE;
- max_idx = NR_SLAB_UNRECLAIMABLE;
- } else {
- min_idx = 0;
- max_idx = MEMCG_NR_STAT;
- }
-
- for (i = min_idx; i < max_idx; i++)
- stat[i] = 0;
for_each_online_cpu(cpu)
- for (i = min_idx; i < max_idx; i++)
+ for (i = 0; i < MEMCG_NR_STAT; i++)
stat[i] += per_cpu(memcg->vmstats_percpu->stat[i], cpu);
for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
- for (i = min_idx; i < max_idx; i++)
+ for (i = 0; i < MEMCG_NR_STAT; i++)
atomic_long_add(stat[i], &mi->vmstats[i]);
- if (!slab_only)
- max_idx = NR_VM_NODE_STAT_ITEMS;
-
for_each_node(node) {
struct mem_cgroup_per_node *pn = memcg->nodeinfo[node];
struct mem_cgroup_per_node *pi;
- for (i = min_idx; i < max_idx; i++)
+ for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
stat[i] = 0;
for_each_online_cpu(cpu)
- for (i = min_idx; i < max_idx; i++)
+ for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
stat[i] += per_cpu(
pn->lruvec_stat_cpu->count[i], cpu);
for (pi = pn; pi; pi = parent_nodeinfo(pi, node))
- for (i = min_idx; i < max_idx; i++)
+ for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
atomic_long_add(stat[i], &pi->lruvec_stat[i]);
}
}
@@ -3403,13 +3388,9 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg)
parent = root_mem_cgroup;
/*
- * Deactivate and reparent kmem_caches. Then flush percpu
- * slab statistics to have precise values at the parent and
- * all ancestor levels. It's required to keep slab stats
- * accurate after the reparenting of kmem_caches.
+ * Deactivate and reparent kmem_caches.
*/
memcg_deactivate_kmem_caches(memcg, parent);
- memcg_flush_percpu_vmstats(memcg, true);
kmemcg_id = memcg->kmemcg_id;
BUG_ON(kmemcg_id < 0);
@@ -4913,7 +4894,7 @@ static void mem_cgroup_free(struct mem_cgroup *memcg)
* Flush percpu vmstats and vmevents to guarantee the value correctness
* on parent's and all ancestor levels.
*/
- memcg_flush_percpu_vmstats(memcg, false);
+ memcg_flush_percpu_vmstats(memcg);
memcg_flush_percpu_vmevents(memcg);
__mem_cgroup_free(memcg);
}
@@ -5359,14 +5340,6 @@ static int mem_cgroup_move_account(struct page *page,
__mod_lruvec_state(to_vec, NR_WRITEBACK, nr_pages);
}
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- if (compound && !list_empty(page_deferred_list(page))) {
- spin_lock(&from->deferred_split_queue.split_queue_lock);
- list_del_init(page_deferred_list(page));
- from->deferred_split_queue.split_queue_len--;
- spin_unlock(&from->deferred_split_queue.split_queue_lock);
- }
-#endif
/*
* It is safe to change page->mem_cgroup here because the page
* is referenced, charged, and isolated - we can't race with
@@ -5376,16 +5349,6 @@ static int mem_cgroup_move_account(struct page *page,
/* caller should have done css_get */
page->mem_cgroup = to;
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- if (compound && list_empty(page_deferred_list(page))) {
- spin_lock(&to->deferred_split_queue.split_queue_lock);
- list_add_tail(page_deferred_list(page),
- &to->deferred_split_queue.split_queue);
- to->deferred_split_queue.split_queue_len++;
- spin_unlock(&to->deferred_split_queue.split_queue_lock);
- }
-#endif
-
spin_unlock_irqrestore(&from->move_lock, flags);
ret = 0;
@@ -6670,7 +6633,6 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage)
{
struct mem_cgroup *memcg;
unsigned int nr_pages;
- bool compound;
unsigned long flags;
VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
@@ -6692,8 +6654,7 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage)
return;
/* Force-charge the new page. The old one will be freed soon */
- compound = PageTransHuge(newpage);
- nr_pages = compound ? hpage_nr_pages(newpage) : 1;
+ nr_pages = hpage_nr_pages(newpage);
page_counter_charge(&memcg->memory, nr_pages);
if (do_memsw_account())
@@ -6703,7 +6664,8 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage)
commit_charge(newpage, memcg, false);
local_irq_save(flags);
- mem_cgroup_charge_statistics(memcg, newpage, compound, nr_pages);
+ mem_cgroup_charge_statistics(memcg, newpage, PageTransHuge(newpage),
+ nr_pages);
memcg_check_events(memcg, newpage);
local_irq_restore(flags);
}
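The simplified flush sums every per-CPU counter once and then adds the same totals at each ancestor level. A reduced model of that propagation, with the structures cut down to the essentials:

#include <stddef.h>

#define NR_STATS	4
#define NR_CPUS		8

struct demo_memcg {
	struct demo_memcg *parent;
	long vmstats[NR_STATS];			/* hierarchical totals */
	long percpu[NR_CPUS][NR_STATS];		/* per-CPU deltas */
};

static void demo_flush(struct demo_memcg *memcg)
{
	long stat[NR_STATS] = { 0 };
	struct demo_memcg *mi;
	int cpu, i;

	for (cpu = 0; cpu < NR_CPUS; cpu++)
		for (i = 0; i < NR_STATS; i++)
			stat[i] += memcg->percpu[cpu][i];

	/* Add the totals to this memcg and to every ancestor. */
	for (mi = memcg; mi; mi = mi->parent)
		for (i = 0; i < NR_STATS; i++)
			mi->vmstats[i] += stat[i];
}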
diff --git a/mm/memory.c b/mm/memory.c
index 45442d9a4f52..0bccc622e482 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1664,6 +1664,9 @@ out_unlock:
* vmf_insert_pfn_prot should only be used if using multiple VMAs is
* impractical.
*
+ * See vmf_insert_mixed_prot() for a discussion of the implication of using
+ * a value of @pgprot different from that of @vma->vm_page_prot.
+ *
* Context: Process context. May allocate using %GFP_KERNEL.
* Return: vm_fault_t value.
*/
@@ -1737,9 +1740,9 @@ static bool vm_mixed_ok(struct vm_area_struct *vma, pfn_t pfn)
}
static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma,
- unsigned long addr, pfn_t pfn, bool mkwrite)
+ unsigned long addr, pfn_t pfn, pgprot_t pgprot,
+ bool mkwrite)
{
- pgprot_t pgprot = vma->vm_page_prot;
int err;
BUG_ON(!vm_mixed_ok(vma, pfn));
@@ -1782,10 +1785,43 @@ static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma,
return VM_FAULT_NOPAGE;
}
+/**
+ * vmf_insert_mixed_prot - insert single pfn into user vma with specified pgprot
+ * @vma: user vma to map to
+ * @addr: target user address of this page
+ * @pfn: source kernel pfn
+ * @pgprot: pgprot flags for the inserted page
+ *
+ * This is exactly like vmf_insert_mixed(), except that it allows drivers
+ * to override pgprot on a per-page basis.
+ *
+ * Typically this function should be used by drivers to set caching- and
+ * encryption bits different than those of @vma->vm_page_prot, because
+ * the caching- or encryption mode may not be known at mmap() time.
+ * This is ok as long as @vma->vm_page_prot is not used by the core vm
+ * to set caching and encryption bits for those vmas (except for COW pages).
+ * This is ensured by core vm only modifying these page table entries using
+ * functions that don't touch caching- or encryption bits, using pte_modify()
+ * if needed. (See for example mprotect()).
+ * Also when new page-table entries are created, this is only done using the
+ * fault() callback, and never using the value of vma->vm_page_prot,
+ * except for page-table entries that point to anonymous pages as the result
+ * of COW.
+ *
+ * Context: Process context. May allocate using %GFP_KERNEL.
+ * Return: vm_fault_t value.
+ */
+vm_fault_t vmf_insert_mixed_prot(struct vm_area_struct *vma, unsigned long addr,
+ pfn_t pfn, pgprot_t pgprot)
+{
+ return __vm_insert_mixed(vma, addr, pfn, pgprot, false);
+}
+EXPORT_SYMBOL(vmf_insert_mixed_prot);
+
vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
pfn_t pfn)
{
- return __vm_insert_mixed(vma, addr, pfn, false);
+ return __vm_insert_mixed(vma, addr, pfn, vma->vm_page_prot, false);
}
EXPORT_SYMBOL(vmf_insert_mixed);
@@ -1797,7 +1833,7 @@ EXPORT_SYMBOL(vmf_insert_mixed);
vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma,
unsigned long addr, pfn_t pfn)
{
- return __vm_insert_mixed(vma, addr, pfn, true);
+ return __vm_insert_mixed(vma, addr, pfn, vma->vm_page_prot, true);
}
EXPORT_SYMBOL(vmf_insert_mixed_mkwrite);
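The intended user of vmf_insert_mixed_prot() is a fault handler that only learns the caching or encryption mode at fault time. A hedged sketch of such a handler; the driver, its pfn lookup and the write-combine choice are hypothetical:

#include <linux/mm.h>
#include <linux/pfn_t.h>

static vm_fault_t demo_fault(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	unsigned long pfn = 0;			/* would come from the object backing this vma */
	pgprot_t prot = vma->vm_page_prot;

	/* The caching mode is only known here, not when the vma was set up. */
	prot = pgprot_writecombine(prot);

	return vmf_insert_mixed_prot(vma, vmf->address,
				     __pfn_to_pfn_t(pfn, PFN_DEV), prot);
}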
@@ -2203,7 +2239,7 @@ static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
pte_t *page_table, pte_t orig_pte)
{
int same = 1;
-#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
+#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPTION)
if (sizeof(pte_t) > sizeof(unsigned long)) {
spinlock_t *ptl = pte_lockptr(mm, pmd);
spin_lock(ptl);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index a91a072f2b2c..0a54ffac8c68 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -355,7 +355,7 @@ static unsigned long find_smallest_section_pfn(int nid, struct zone *zone,
if (unlikely(pfn_to_nid(start_pfn) != nid))
continue;
- if (zone && zone != page_zone(pfn_to_page(start_pfn)))
+ if (zone != page_zone(pfn_to_page(start_pfn)))
continue;
return start_pfn;
@@ -380,7 +380,7 @@ static unsigned long find_biggest_section_pfn(int nid, struct zone *zone,
if (unlikely(pfn_to_nid(pfn) != nid))
continue;
- if (zone && zone != page_zone(pfn_to_page(pfn)))
+ if (zone != page_zone(pfn_to_page(pfn)))
continue;
return pfn;
@@ -392,14 +392,11 @@ static unsigned long find_biggest_section_pfn(int nid, struct zone *zone,
static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
unsigned long end_pfn)
{
- unsigned long zone_start_pfn = zone->zone_start_pfn;
- unsigned long z = zone_end_pfn(zone); /* zone_end_pfn namespace clash */
- unsigned long zone_end_pfn = z;
unsigned long pfn;
int nid = zone_to_nid(zone);
zone_span_writelock(zone);
- if (zone_start_pfn == start_pfn) {
+ if (zone->zone_start_pfn == start_pfn) {
/*
* If the section is smallest section in the zone, it need
* shrink zone->zone_start_pfn and zone->zone_spanned_pages.
@@ -407,50 +404,30 @@ static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
* for shrinking zone.
*/
pfn = find_smallest_section_pfn(nid, zone, end_pfn,
- zone_end_pfn);
+ zone_end_pfn(zone));
if (pfn) {
+ zone->spanned_pages = zone_end_pfn(zone) - pfn;
zone->zone_start_pfn = pfn;
- zone->spanned_pages = zone_end_pfn - pfn;
+ } else {
+ zone->zone_start_pfn = 0;
+ zone->spanned_pages = 0;
}
- } else if (zone_end_pfn == end_pfn) {
+ } else if (zone_end_pfn(zone) == end_pfn) {
/*
* If the section is biggest section in the zone, it need
* shrink zone->spanned_pages.
* In this case, we find second biggest valid mem_section for
* shrinking zone.
*/
- pfn = find_biggest_section_pfn(nid, zone, zone_start_pfn,
+ pfn = find_biggest_section_pfn(nid, zone, zone->zone_start_pfn,
start_pfn);
if (pfn)
- zone->spanned_pages = pfn - zone_start_pfn + 1;
- }
-
- /*
- * The section is not biggest or smallest mem_section in the zone, it
- * only creates a hole in the zone. So in this case, we need not
- * change the zone. But perhaps, the zone has only hole data. Thus
- * it check the zone has only hole or not.
- */
- pfn = zone_start_pfn;
- for (; pfn < zone_end_pfn; pfn += PAGES_PER_SUBSECTION) {
- if (unlikely(!pfn_to_online_page(pfn)))
- continue;
-
- if (page_zone(pfn_to_page(pfn)) != zone)
- continue;
-
- /* Skip range to be removed */
- if (pfn >= start_pfn && pfn < end_pfn)
- continue;
-
- /* If we find valid section, we have nothing to do */
- zone_span_writeunlock(zone);
- return;
+ zone->spanned_pages = pfn - zone->zone_start_pfn + 1;
+ else {
+ zone->zone_start_pfn = 0;
+ zone->spanned_pages = 0;
+ }
}
-
- /* The zone has no valid section */
- zone->zone_start_pfn = 0;
- zone->spanned_pages = 0;
zone_span_writeunlock(zone);
}
@@ -490,6 +467,9 @@ void __ref remove_pfn_range_from_zone(struct zone *zone,
struct pglist_data *pgdat = zone->zone_pgdat;
unsigned long flags;
+ /* Poison struct pages because they are now uninitialized again. */
+ page_init_poison(pfn_to_page(start_pfn), sizeof(struct page) * nr_pages);
+
#ifdef CONFIG_ZONE_DEVICE
/*
* Zone shrinking code cannot properly deal with ZONE_DEVICE. So
@@ -536,25 +516,20 @@ static void __remove_section(unsigned long pfn, unsigned long nr_pages,
void __remove_pages(unsigned long pfn, unsigned long nr_pages,
struct vmem_altmap *altmap)
{
+ const unsigned long end_pfn = pfn + nr_pages;
+ unsigned long cur_nr_pages;
unsigned long map_offset = 0;
- unsigned long nr, start_sec, end_sec;
map_offset = vmem_altmap_offset(altmap);
if (check_pfn_span(pfn, nr_pages, "remove"))
return;
- start_sec = pfn_to_section_nr(pfn);
- end_sec = pfn_to_section_nr(pfn + nr_pages - 1);
- for (nr = start_sec; nr <= end_sec; nr++) {
- unsigned long pfns;
-
+ for (; pfn < end_pfn; pfn += cur_nr_pages) {
cond_resched();
- pfns = min(nr_pages, PAGES_PER_SECTION
- - (pfn & ~PAGE_SECTION_MASK));
- __remove_section(pfn, pfns, map_offset, altmap);
- pfn += pfns;
- nr_pages -= pfns;
+ /* Select all remaining pages up to the next section boundary */
+ cur_nr_pages = min(end_pfn - pfn, -(pfn | PAGE_SECTION_MASK));
+ __remove_section(pfn, cur_nr_pages, map_offset, altmap);
map_offset = 0;
}
}
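The new loop step relies on -(pfn | PAGE_SECTION_MASK) evaluating to the number of pages left before the next section boundary. A standalone check of that identity; the section size is illustrative:

#include <stdio.h>

#define PAGES_PER_SECTION	32768UL
#define PAGE_SECTION_MASK	(~(PAGES_PER_SECTION - 1))

int main(void)
{
	unsigned long pfn = 32768UL + 100;	/* somewhere inside the second section */
	unsigned long to_boundary = -(pfn | PAGE_SECTION_MASK);

	/* 32668 pages remain until pfn 65536, the next section boundary. */
	printf("pfn %lu: %lu pages to the next section boundary\n", pfn, to_boundary);
	return 0;
}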
@@ -783,27 +758,18 @@ struct zone * zone_for_pfn_range(int online_type, int nid, unsigned start_pfn,
return default_zone_for_pfn(nid, start_pfn, nr_pages);
}
-int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type)
+int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
+ int online_type, int nid)
{
unsigned long flags;
unsigned long onlined_pages = 0;
struct zone *zone;
int need_zonelists_rebuild = 0;
- int nid;
int ret;
struct memory_notify arg;
- struct memory_block *mem;
mem_hotplug_begin();
- /*
- * We can't use pfn_to_nid() because nid might be stored in struct page
- * which is not yet initialized. Instead, we find nid from memory block.
- */
- mem = find_memory_block(__pfn_to_section(pfn));
- nid = mem->nid;
- put_device(&mem->dev);
-
/* associate pfn range with the zone */
zone = zone_for_pfn_range(online_type, nid, pfn, nr_pages);
move_pfn_range_to_zone(zone, pfn, nr_pages, NULL);
@@ -1182,7 +1148,7 @@ static bool is_pageblock_removable_nolock(unsigned long pfn)
if (!zone_spans_pfn(zone, pfn))
return false;
- return !has_unmovable_pages(zone, page, 0, MIGRATE_MOVABLE,
+ return !has_unmovable_pages(zone, page, MIGRATE_MOVABLE,
MEMORY_OFFLINE);
}
@@ -1206,14 +1172,13 @@ bool is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages)
}
/*
- * Confirm all pages in a range [start, end) belong to the same zone.
- * When true, return its valid [start, end).
+ * Confirm all pages in a range [start, end) belong to the same zone (skipping
+ * memory holes). When true, return the zone.
*/
-int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn,
- unsigned long *valid_start, unsigned long *valid_end)
+struct zone *test_pages_in_a_zone(unsigned long start_pfn,
+ unsigned long end_pfn)
{
unsigned long pfn, sec_end_pfn;
- unsigned long start, end;
struct zone *zone = NULL;
struct page *page;
int i;
@@ -1234,24 +1199,15 @@ int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn,
continue;
/* Check if we got outside of the zone */
if (zone && !zone_spans_pfn(zone, pfn + i))
- return 0;
+ return NULL;
page = pfn_to_page(pfn + i);
if (zone && page_zone(page) != zone)
- return 0;
- if (!zone)
- start = pfn + i;
+ return NULL;
zone = page_zone(page);
- end = pfn + MAX_ORDER_NR_PAGES;
}
}
- if (zone) {
- *valid_start = start;
- *valid_end = min(end, end_pfn);
- return 1;
- } else {
- return 0;
- }
+ return zone;
}
/*
@@ -1496,7 +1452,6 @@ static int __ref __offline_pages(unsigned long start_pfn,
unsigned long offlined_pages = 0;
int ret, node, nr_isolate_pageblock;
unsigned long flags;
- unsigned long valid_start, valid_end;
struct zone *zone;
struct memory_notify arg;
char *reason;
@@ -1521,14 +1476,12 @@ static int __ref __offline_pages(unsigned long start_pfn,
/* This makes hotplug much easier...and readable.
we assume this for now. .*/
- if (!test_pages_in_a_zone(start_pfn, end_pfn, &valid_start,
- &valid_end)) {
+ zone = test_pages_in_a_zone(start_pfn, end_pfn);
+ if (!zone) {
ret = -EINVAL;
reason = "multizone range";
goto failed_removal;
}
-
- zone = page_zone(pfn_to_page(valid_start));
node = zone_to_nid(zone);
/* set above range as isolated */
@@ -1764,8 +1717,6 @@ static int __ref try_remove_memory(int nid, u64 start, u64 size)
BUG_ON(check_hotplug_memory_range(start, size));
- mem_hotplug_begin();
-
/*
* All memory blocks must be offlined before removing memory. Check
* whether all memory blocks in question are offline and return error
@@ -1778,9 +1729,14 @@ static int __ref try_remove_memory(int nid, u64 start, u64 size)
/* remove memmap entry */
firmware_map_remove(start, start + size, "System RAM");
- /* remove memory block devices before removing memory */
+ /*
+ * Memory block device removal under the device_hotplug_lock is
+ * a barrier against racing online attempts.
+ */
remove_memory_block_devices(start, size);
+ mem_hotplug_begin();
+
arch_remove_memory(nid, start, size, NULL);
memblock_free(start, size);
memblock_remove(start, size);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 067cf7d3daf5..977c641f78cf 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2148,18 +2148,22 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
nmask = policy_nodemask(gfp, pol);
if (!nmask || node_isset(hpage_node, *nmask)) {
mpol_cond_put(pol);
+ /*
+ * First, try to allocate THP only on local node, but
+ * don't reclaim unnecessarily, just compact.
+ */
page = __alloc_pages_node(hpage_node,
- gfp | __GFP_THISNODE, order);
+ gfp | __GFP_THISNODE | __GFP_NORETRY, order);
/*
* If hugepage allocations are configured to always
* synchronous compact or the vma has been madvised
* to prefer hugepage backing, retry allowing remote
- * memory as well.
+ * memory with both reclaim and compact as well.
*/
if (!page && (gfp & __GFP_DIRECT_RECLAIM))
page = __alloc_pages_node(hpage_node,
- gfp | __GFP_NORETRY, order);
+ gfp, order);
goto out;
}
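The allocation policy is now two-staged: local node with compaction but no retries, then any node with the full gfp mask if direct reclaim is allowed. A condensed sketch of that fallback, with the surrounding function simplified away:

#include <linux/gfp.h>

static struct page *demo_alloc_thp(int hpage_node, gfp_t gfp, int order)
{
	struct page *page;

	/* Stage 1: local node only, compact but do not retry or reclaim hard. */
	page = __alloc_pages_node(hpage_node,
				  gfp | __GFP_THISNODE | __GFP_NORETRY, order);

	/* Stage 2: if reclaim is allowed, retry with the full mask and remote nodes. */
	if (!page && (gfp & __GFP_DIRECT_RECLAIM))
		page = __alloc_pages_node(hpage_node, gfp, order);

	return page;
}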
@@ -2817,6 +2821,9 @@ int mpol_parse_str(char *str, struct mempolicy **mpol)
char *flags = strchr(str, '=');
int err = 1, mode;
+ if (flags)
+ *flags++ = '\0'; /* terminate mode string */
+
if (nodelist) {
/* NUL-terminate mode or flags string */
*nodelist++ = '\0';
@@ -2827,9 +2834,6 @@ int mpol_parse_str(char *str, struct mempolicy **mpol)
} else
nodes_clear(nodes);
- if (flags)
- *flags++ = '\0'; /* terminate mode string */
-
mode = match_string(policy_modes, MPOL_MAX, str);
if (mode < 0)
goto out;
diff --git a/mm/memremap.c b/mm/memremap.c
index c51c6bd2fe34..09b5b7adc773 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -27,7 +27,8 @@ static void devmap_managed_enable_put(void)
static int devmap_managed_enable_get(struct dev_pagemap *pgmap)
{
- if (!pgmap->ops || !pgmap->ops->page_free) {
+ if (pgmap->type == MEMORY_DEVICE_PRIVATE &&
+ (!pgmap->ops || !pgmap->ops->page_free)) {
WARN(1, "Missing page_free method\n");
return -EINVAL;
}
@@ -119,6 +120,8 @@ void memunmap_pages(struct dev_pagemap *pgmap)
nid = page_to_nid(first_page);
mem_hotplug_begin();
+ remove_pfn_range_from_zone(page_zone(first_page), PHYS_PFN(res->start),
+ PHYS_PFN(resource_size(res)));
if (pgmap->type == MEMORY_DEVICE_PRIVATE) {
__remove_pages(PHYS_PFN(res->start),
PHYS_PFN(resource_size(res)), NULL);
@@ -410,48 +413,42 @@ struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
EXPORT_SYMBOL_GPL(get_dev_pagemap);
#ifdef CONFIG_DEV_PAGEMAP_OPS
-void __put_devmap_managed_page(struct page *page)
+void free_devmap_managed_page(struct page *page)
{
- int count = page_ref_dec_return(page);
-
- /*
- * If refcount is 1 then page is freed and refcount is stable as nobody
- * holds a reference on the page.
- */
- if (count == 1) {
- /* Clear Active bit in case of parallel mark_page_accessed */
- __ClearPageActive(page);
- __ClearPageWaiters(page);
+ /* notify page idle for dax */
+ if (!is_device_private_page(page)) {
+ wake_up_var(&page->_refcount);
+ return;
+ }
- mem_cgroup_uncharge(page);
+ /* Clear Active bit in case of parallel mark_page_accessed */
+ __ClearPageActive(page);
+ __ClearPageWaiters(page);
- /*
- * When a device_private page is freed, the page->mapping field
- * may still contain a (stale) mapping value. For example, the
- * lower bits of page->mapping may still identify the page as
- * an anonymous page. Ultimately, this entire field is just
- * stale and wrong, and it will cause errors if not cleared.
- * One example is:
- *
- * migrate_vma_pages()
- * migrate_vma_insert_page()
- * page_add_new_anon_rmap()
- * __page_set_anon_rmap()
- * ...checks page->mapping, via PageAnon(page) call,
- * and incorrectly concludes that the page is an
- * anonymous page. Therefore, it incorrectly,
- * silently fails to set up the new anon rmap.
- *
- * For other types of ZONE_DEVICE pages, migration is either
- * handled differently or not done at all, so there is no need
- * to clear page->mapping.
- */
- if (is_device_private_page(page))
- page->mapping = NULL;
+ mem_cgroup_uncharge(page);
- page->pgmap->ops->page_free(page);
- } else if (!count)
- __put_page(page);
+ /*
+ * When a device_private page is freed, the page->mapping field
+ * may still contain a (stale) mapping value. For example, the
+ * lower bits of page->mapping may still identify the page as an
+ * anonymous page. Ultimately, this entire field is just stale
+ * and wrong, and it will cause errors if not cleared. One
+ * example is:
+ *
+ * migrate_vma_pages()
+ * migrate_vma_insert_page()
+ * page_add_new_anon_rmap()
+ * __page_set_anon_rmap()
+ * ...checks page->mapping, via PageAnon(page) call,
+ * and incorrectly concludes that the page is an
+ * anonymous page. Therefore, it incorrectly,
+ * silently fails to set up the new anon rmap.
+ *
+ * For other types of ZONE_DEVICE pages, migration is either
+ * handled differently or not done at all, so there is no need
+ * to clear page->mapping.
+ */
+ page->mapping = NULL;
+ page->pgmap->ops->page_free(page);
}
-EXPORT_SYMBOL(__put_devmap_managed_page);
#endif /* CONFIG_DEV_PAGEMAP_OPS */
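After this change only MEMORY_DEVICE_PRIVATE pagemaps must supply a page_free callback. A minimal sketch of such a pagemap; the driver names are hypothetical and the resource/range setup is omitted:

#include <linux/memremap.h>

static void demo_page_free(struct page *page)
{
	/* return the backing device memory to the driver's own allocator */
}

static const struct dev_pagemap_ops demo_pgmap_ops = {
	.page_free	= demo_page_free,
};

static struct dev_pagemap demo_pgmap = {
	.type	= MEMORY_DEVICE_PRIVATE,
	.ops	= &demo_pgmap_ops,
};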
diff --git a/mm/migrate.c b/mm/migrate.c
index 86873b6f38a7..b1092876e537 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -48,6 +48,7 @@
#include <linux/page_owner.h>
#include <linux/sched/mm.h>
#include <linux/ptrace.h>
+#include <linux/oom.h>
#include <asm/tlbflush.h>
@@ -986,7 +987,7 @@ static int move_to_new_page(struct page *newpage, struct page *page,
}
/*
- * Anonymous and movable page->mapping will be cleard by
+ * Anonymous and movable page->mapping will be cleared by
* free_pages_prepare so don't reset it here for keeping
* the type to work PageAnon, for example.
*/
@@ -1199,8 +1200,7 @@ out:
/*
* A page that has been migrated has all references
* removed and will be freed. A page that has not been
- * migrated will have kepts its references and be
- * restored.
+ * migrated will have kept its references and be restored.
*/
list_del(&page->lru);
@@ -1627,8 +1627,19 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
start = i;
} else if (node != current_node) {
err = do_move_pages_to_node(mm, &pagelist, current_node);
- if (err)
+ if (err) {
+ /*
+ * Positive err means the number of pages that
+ * failed to migrate. Since we are going to
+ * abort and return the number of non-migrated
+ * pages, we need to include the rest of the
+ * nr_pages that have not been attempted as
+ * well.
+ */
+ if (err > 0)
+ err += nr_pages - i - 1;
goto out;
+ }
err = store_status(status, start, current_node, i - start);
if (err)
goto out;
@@ -1659,8 +1670,11 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
goto out_flush;
err = do_move_pages_to_node(mm, &pagelist, current_node);
- if (err)
+ if (err) {
+ if (err > 0)
+ err += nr_pages - i - 1;
goto out;
+ }
if (i > start) {
err = store_status(status, start, current_node, i - start);
if (err)
@@ -1674,9 +1688,16 @@ out_flush:
/* Make sure we do not overwrite the existing error */
err1 = do_move_pages_to_node(mm, &pagelist, current_node);
+ /*
+ * We don't have to report non-attempted pages here since:
+ * - If the above loop finished gracefully, all pages have been
+ *   attempted.
+ * - If the above loop was aborted, a fatal error happened and we
+ *   should return that error.
+ */
if (!err1)
err1 = store_status(status, start, current_node, i - start);
- if (!err)
+ if (err >= 0)
err = err1;
out:
return err;
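With the accounting fix a positive return from do_pages_move() now also counts every page that was never attempted. A tiny worked example of the arithmetic:

#include <stdio.h>

int main(void)
{
	unsigned long nr_pages = 10;	/* pages passed to move_pages(2) */
	unsigned long i = 3;		/* index being processed when the batch move failed */
	long err = 2;			/* pages in the submitted batch that failed */

	/* Pages i+1 .. nr_pages-1 were never attempted; count them as failed too. */
	err += nr_pages - i - 1;

	printf("reported as not migrated: %ld of %lu\n", err, nr_pages);	/* 8 of 10 */
	return 0;
}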
@@ -2130,12 +2151,13 @@ out_unlock:
#ifdef CONFIG_DEVICE_PRIVATE
static int migrate_vma_collect_hole(unsigned long start,
unsigned long end,
+ __always_unused int depth,
struct mm_walk *walk)
{
struct migrate_vma *migrate = walk->private;
unsigned long addr;
- for (addr = start & PAGE_MASK; addr < end; addr += PAGE_SIZE) {
+ for (addr = start; addr < end; addr += PAGE_SIZE) {
migrate->src[migrate->npages] = MIGRATE_PFN_MIGRATE;
migrate->dst[migrate->npages] = 0;
migrate->npages++;
@@ -2152,7 +2174,7 @@ static int migrate_vma_collect_skip(unsigned long start,
struct migrate_vma *migrate = walk->private;
unsigned long addr;
- for (addr = start & PAGE_MASK; addr < end; addr += PAGE_SIZE) {
+ for (addr = start; addr < end; addr += PAGE_SIZE) {
migrate->dst[migrate->npages] = 0;
migrate->src[migrate->npages++] = 0;
}
@@ -2174,7 +2196,7 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
again:
if (pmd_none(*pmdp))
- return migrate_vma_collect_hole(start, end, walk);
+ return migrate_vma_collect_hole(start, end, -1, walk);
if (pmd_trans_huge(*pmdp)) {
struct page *page;
@@ -2207,7 +2229,7 @@ again:
return migrate_vma_collect_skip(start, end,
walk);
if (pmd_none(*pmdp))
- return migrate_vma_collect_hole(start, end,
+ return migrate_vma_collect_hole(start, end, -1,
walk);
}
}
@@ -2675,6 +2697,14 @@ int migrate_vma_setup(struct migrate_vma *args)
}
EXPORT_SYMBOL(migrate_vma_setup);
+/*
+ * This code closely matches the code in:
+ * __handle_mm_fault()
+ * handle_pte_fault()
+ * do_anonymous_page()
+ * to map in an anonymous zero page but the struct page will be a ZONE_DEVICE
+ * private page.
+ */
static void migrate_vma_insert_page(struct migrate_vma *migrate,
unsigned long addr,
struct page *page,
@@ -2755,30 +2785,24 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate,
ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
+ if (check_stable_address_space(mm))
+ goto unlock_abort;
+
if (pte_present(*ptep)) {
unsigned long pfn = pte_pfn(*ptep);
- if (!is_zero_pfn(pfn)) {
- pte_unmap_unlock(ptep, ptl);
- mem_cgroup_cancel_charge(page, memcg, false);
- goto abort;
- }
+ if (!is_zero_pfn(pfn))
+ goto unlock_abort;
flush = true;
- } else if (!pte_none(*ptep)) {
- pte_unmap_unlock(ptep, ptl);
- mem_cgroup_cancel_charge(page, memcg, false);
- goto abort;
- }
+ } else if (!pte_none(*ptep))
+ goto unlock_abort;
/*
- * Check for usefaultfd but do not deliver the fault. Instead,
+ * Check for userfaultfd but do not deliver the fault. Instead,
* just back off.
*/
- if (userfaultfd_missing(vma)) {
- pte_unmap_unlock(ptep, ptl);
- mem_cgroup_cancel_charge(page, memcg, false);
- goto abort;
- }
+ if (userfaultfd_missing(vma))
+ goto unlock_abort;
inc_mm_counter(mm, MM_ANONPAGES);
page_add_new_anon_rmap(page, vma, addr, false);
@@ -2802,6 +2826,9 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate,
*src = MIGRATE_PFN_MIGRATE;
return;
+unlock_abort:
+ pte_unmap_unlock(ptep, ptl);
+ mem_cgroup_cancel_charge(page, memcg, false);
abort:
*src &= ~MIGRATE_PFN_MIGRATE;
}
@@ -2834,9 +2861,8 @@ void migrate_vma_pages(struct migrate_vma *migrate)
}
if (!page) {
- if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE)) {
+ if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE))
continue;
- }
if (!notified) {
notified = true;
diff --git a/mm/mincore.c b/mm/mincore.c
index 49b6fa2f6aa1..0e6dd9948f1a 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -112,6 +112,7 @@ static int __mincore_unmapped_range(unsigned long addr, unsigned long end,
}
static int mincore_unmapped_range(unsigned long addr, unsigned long end,
+ __always_unused int depth,
struct mm_walk *walk)
{
walk->private += __mincore_unmapped_range(addr, end,
diff --git a/mm/mmap.c b/mm/mmap.c
index 9c648524e4dc..6756b8bb0033 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -90,12 +90,6 @@ static void unmap_region(struct mm_struct *mm,
* MAP_PRIVATE r: (no) no r: (yes) yes r: (no) yes r: (no) yes
* w: (no) no w: (no) no w: (copy) copy w: (no) no
* x: (no) no x: (no) yes x: (no) yes x: (yes) yes
- *
- * On arm64, PROT_EXEC has the following behaviour for both MAP_SHARED and
- * MAP_PRIVATE:
- * r: (no) no
- * w: (no) no
- * x: (yes) yes
*/
pgprot_t protection_map[16] __ro_after_init = {
__P000, __P001, __P010, __P011, __P100, __P101, __P110, __P111,
@@ -1276,26 +1270,22 @@ static struct anon_vma *reusable_anon_vma(struct vm_area_struct *old, struct vm_
*/
struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma)
{
- struct anon_vma *anon_vma;
- struct vm_area_struct *near;
-
- near = vma->vm_next;
- if (!near)
- goto try_prev;
-
- anon_vma = reusable_anon_vma(near, vma, near);
- if (anon_vma)
- return anon_vma;
-try_prev:
- near = vma->vm_prev;
- if (!near)
- goto none;
-
- anon_vma = reusable_anon_vma(near, near, vma);
- if (anon_vma)
- return anon_vma;
-none:
+ struct anon_vma *anon_vma = NULL;
+
+ /* Try next first. */
+ if (vma->vm_next) {
+ anon_vma = reusable_anon_vma(vma->vm_next, vma, vma->vm_next);
+ if (anon_vma)
+ return anon_vma;
+ }
+
+ /* Then try prev. */
+ if (vma->vm_prev)
+ anon_vma = reusable_anon_vma(vma->vm_prev, vma->vm_prev, vma);
+
/*
+ * We might reach here with anon_vma == NULL if we can't find
+ * any reusable anon_vma.
* There's no absolute need to look only at touching neighbours:
* we could search further afield for "compatible" anon_vmas.
* But it would probably just be a waste of time searching,
@@ -1303,7 +1293,7 @@ none:
* We're trying to allow mprotect remerging later on,
* not trying to minimize memory used for anon_vmas.
*/
- return NULL;
+ return anon_vma;
}
/*
@@ -3342,6 +3332,8 @@ static const struct vm_operations_struct special_mapping_vmops = {
.fault = special_mapping_fault,
.mremap = special_mapping_mremap,
.name = special_mapping_name,
+ /* vDSO code relies that VVAR can't be accessed remotely */
+ .access = NULL,
};
static const struct vm_operations_struct legacy_special_mapping_vmops = {
diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
index 7d70e5c78f97..a3538cb2bcbe 100644
--- a/mm/mmu_gather.c
+++ b/mm/mmu_gather.c
@@ -11,7 +11,7 @@
#include <asm/pgalloc.h>
#include <asm/tlb.h>
-#ifndef CONFIG_HAVE_MMU_GATHER_NO_GATHER
+#ifndef CONFIG_MMU_GATHER_NO_GATHER
static bool tlb_next_batch(struct mmu_gather *tlb)
{
@@ -69,7 +69,7 @@ bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_
VM_BUG_ON(!tlb->end);
-#ifdef CONFIG_HAVE_MMU_GATHER_PAGE_SIZE
+#ifdef CONFIG_MMU_GATHER_PAGE_SIZE
VM_WARN_ON(tlb->page_size != page_size);
#endif
@@ -89,58 +89,108 @@ bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_
return false;
}
-#endif /* HAVE_MMU_GATHER_NO_GATHER */
+#endif /* MMU_GATHER_NO_GATHER */
-#ifdef CONFIG_HAVE_RCU_TABLE_FREE
+#ifdef CONFIG_MMU_GATHER_TABLE_FREE
-/*
- * See the comment near struct mmu_table_batch.
- */
+static void __tlb_remove_table_free(struct mmu_table_batch *batch)
+{
+ int i;
+
+ for (i = 0; i < batch->nr; i++)
+ __tlb_remove_table(batch->tables[i]);
+
+ free_page((unsigned long)batch);
+}
+
+#ifdef CONFIG_MMU_GATHER_RCU_TABLE_FREE
/*
- * If we want tlb_remove_table() to imply TLB invalidates.
+ * Semi RCU freeing of the page directories.
+ *
+ * This is needed by some architectures to implement software pagetable walkers.
+ *
+ * gup_fast() and other software pagetable walkers do a lockless page-table
+ * walk and therefore need some synchronization with the freeing of the page
+ * directories. The chosen means to accomplish that is by disabling IRQs over
+ * the walk.
+ *
+ * Architectures that use IPIs to flush TLBs will then automagically DTRT,
+ * since we unlink the page, flush TLBs, free the page. Since the disabling of
+ * IRQs delays the completion of the TLB flush we can never observe an already
+ * freed page.
+ *
+ * Architectures that do not have this (PPC) need to delay the freeing by some
+ * other means, this is that means.
+ *
+ * What we do is batch the freed directory pages (tables) and RCU free them.
+ * We use the sched RCU variant, as that guarantees that IRQ/preempt disabling
+ * holds off grace periods.
+ *
+ * However, in order to batch these pages we need to allocate storage, this
+ * allocation is deep inside the MM code and can thus easily fail on memory
+ * pressure. To guarantee progress we fall back to single table freeing, see
+ * the implementation of tlb_remove_table_one().
+ *
*/
-static inline void tlb_table_invalidate(struct mmu_gather *tlb)
-{
-#ifndef CONFIG_HAVE_RCU_TABLE_NO_INVALIDATE
- /*
- * Invalidate page-table caches used by hardware walkers. Then we still
- * need to RCU-sched wait while freeing the pages because software
- * walkers can still be in-flight.
- */
- tlb_flush_mmu_tlbonly(tlb);
-#endif
-}
static void tlb_remove_table_smp_sync(void *arg)
{
/* Simply deliver the interrupt */
}
-static void tlb_remove_table_one(void *table)
+static void tlb_remove_table_sync_one(void)
{
/*
* This isn't an RCU grace period and hence the page-tables cannot be
* assumed to be actually RCU-freed.
*
* It is however sufficient for software page-table walkers that rely on
- * IRQ disabling. See the comment near struct mmu_table_batch.
+ * IRQ disabling.
*/
smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
- __tlb_remove_table(table);
}
static void tlb_remove_table_rcu(struct rcu_head *head)
{
- struct mmu_table_batch *batch;
- int i;
+ __tlb_remove_table_free(container_of(head, struct mmu_table_batch, rcu));
+}
+
+static void tlb_remove_table_free(struct mmu_table_batch *batch)
+{
+ call_rcu(&batch->rcu, tlb_remove_table_rcu);
+}
- batch = container_of(head, struct mmu_table_batch, rcu);
+#else /* !CONFIG_MMU_GATHER_RCU_TABLE_FREE */
- for (i = 0; i < batch->nr; i++)
- __tlb_remove_table(batch->tables[i]);
+static void tlb_remove_table_sync_one(void) { }
- free_page((unsigned long)batch);
+static void tlb_remove_table_free(struct mmu_table_batch *batch)
+{
+ __tlb_remove_table_free(batch);
+}
+
+#endif /* CONFIG_MMU_GATHER_RCU_TABLE_FREE */
+
+/*
+ * If we want tlb_remove_table() to imply TLB invalidates.
+ */
+static inline void tlb_table_invalidate(struct mmu_gather *tlb)
+{
+ if (tlb_needs_table_invalidate()) {
+ /*
+ * Invalidate page-table caches used by hardware walkers. Then
+ * we still need to RCU-sched wait while freeing the pages
+ * because software walkers can still be in-flight.
+ */
+ tlb_flush_mmu_tlbonly(tlb);
+ }
+}
+
+static void tlb_remove_table_one(void *table)
+{
+ tlb_remove_table_sync_one();
+ __tlb_remove_table(table);
}
static void tlb_table_flush(struct mmu_gather *tlb)
@@ -149,7 +199,7 @@ static void tlb_table_flush(struct mmu_gather *tlb)
if (*batch) {
tlb_table_invalidate(tlb);
- call_rcu(&(*batch)->rcu, tlb_remove_table_rcu);
+ tlb_remove_table_free(*batch);
*batch = NULL;
}
}
@@ -173,14 +223,22 @@ void tlb_remove_table(struct mmu_gather *tlb, void *table)
tlb_table_flush(tlb);
}
-#endif /* CONFIG_HAVE_RCU_TABLE_FREE */
+static inline void tlb_table_init(struct mmu_gather *tlb)
+{
+ tlb->batch = NULL;
+}
+
+#else /* !CONFIG_MMU_GATHER_TABLE_FREE */
+
+static inline void tlb_table_flush(struct mmu_gather *tlb) { }
+static inline void tlb_table_init(struct mmu_gather *tlb) { }
+
+#endif /* CONFIG_MMU_GATHER_TABLE_FREE */
static void tlb_flush_mmu_free(struct mmu_gather *tlb)
{
-#ifdef CONFIG_HAVE_RCU_TABLE_FREE
tlb_table_flush(tlb);
-#endif
-#ifndef CONFIG_HAVE_MMU_GATHER_NO_GATHER
+#ifndef CONFIG_MMU_GATHER_NO_GATHER
tlb_batch_pages_flush(tlb);
#endif
}
@@ -211,7 +269,7 @@ void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
/* Is it from 0 to ~0? */
tlb->fullmm = !(start | (end+1));
-#ifndef CONFIG_HAVE_MMU_GATHER_NO_GATHER
+#ifndef CONFIG_MMU_GATHER_NO_GATHER
tlb->need_flush_all = 0;
tlb->local.next = NULL;
tlb->local.nr = 0;
@@ -220,10 +278,8 @@ void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
tlb->batch_count = 0;
#endif
-#ifdef CONFIG_HAVE_RCU_TABLE_FREE
- tlb->batch = NULL;
-#endif
-#ifdef CONFIG_HAVE_MMU_GATHER_PAGE_SIZE
+ tlb_table_init(tlb);
+#ifdef CONFIG_MMU_GATHER_PAGE_SIZE
tlb->page_size = 0;
#endif
@@ -271,7 +327,7 @@ void tlb_finish_mmu(struct mmu_gather *tlb,
tlb_flush_mmu(tlb);
-#ifndef CONFIG_HAVE_MMU_GATHER_NO_GATHER
+#ifndef CONFIG_MMU_GATHER_NO_GATHER
tlb_batch_list_free(tlb);
#endif
dec_tlb_flush_pending(tlb->mm);
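The scheme described in the comment above reduces to: collect freed directory pages into a batch and free the whole batch from an RCU callback, so that IRQ-disabled lockless walkers are guaranteed to have finished. A condensed sketch of that shape, with types and helpers as simplified stand-ins for the mmu_gather ones:

#include <linux/kernel.h>
#include <linux/rcupdate.h>
#include <linux/gfp.h>

struct demo_table_batch {
	struct rcu_head rcu;
	unsigned int nr;
	void *tables[8];	/* the real batch is sized to fill one page */
};

static void demo_free_table(void *table)
{
	/* stands in for __tlb_remove_table(): actually frees the directory page */
}

static void demo_batch_rcu_free(struct rcu_head *head)
{
	struct demo_table_batch *batch =
		container_of(head, struct demo_table_batch, rcu);
	unsigned int i;

	for (i = 0; i < batch->nr; i++)
		demo_free_table(batch->tables[i]);
	free_page((unsigned long)batch);	/* the batch itself came from __get_free_page() */
}

static void demo_flush_tables(struct demo_table_batch *batch)
{
	/* Lockless walkers may still hold pointers into these tables;
	 * RCU guarantees they have finished before the callback runs. */
	call_rcu(&batch->rcu, demo_batch_rcu_free);
}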
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index f76ea05b1cb0..ef3973a5d34a 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -29,12 +29,12 @@ struct lockdep_map __mmu_notifier_invalidate_range_start_map = {
#endif
/*
- * The mmu notifier_mm structure is allocated and installed in
- * mm->mmu_notifier_mm inside the mm_take_all_locks() protected
+ * The mmu_notifier_subscriptions structure is allocated and installed in
+ * mm->notifier_subscriptions inside the mm_take_all_locks() protected
* critical section and it's released only when mm_count reaches zero
* in mmdrop().
*/
-struct mmu_notifier_mm {
+struct mmu_notifier_subscriptions {
/* all mmu notifiers registered in this mm are queued in this list */
struct hlist_head list;
bool has_itree;
@@ -65,80 +65,81 @@ struct mmu_notifier_mm {
*
* The write side has two states, fully excluded:
* - mm->active_invalidate_ranges != 0
- * - mnn->invalidate_seq & 1 == True (odd)
+ * - subscriptions->invalidate_seq & 1 == True (odd)
* - some range on the mm_struct is being invalidated
* - the itree is not allowed to change
*
* And partially excluded:
* - mm->active_invalidate_ranges != 0
- * - mnn->invalidate_seq & 1 == False (even)
+ * - subscriptions->invalidate_seq & 1 == False (even)
* - some range on the mm_struct is being invalidated
* - the itree is allowed to change
*
- * Operations on mmu_notifier_mm->invalidate_seq (under spinlock):
+ * Operations on notifier_subscriptions->invalidate_seq (under spinlock):
* seq |= 1 # Begin writing
* seq++ # Release the writing state
* seq & 1 # True if a writer exists
*
* The later state avoids some expensive work on inv_end in the common case of
- * no mni monitoring the VA.
+ * no mmu_interval_notifier monitoring the VA.
*/
-static bool mn_itree_is_invalidating(struct mmu_notifier_mm *mmn_mm)
+static bool
+mn_itree_is_invalidating(struct mmu_notifier_subscriptions *subscriptions)
{
- lockdep_assert_held(&mmn_mm->lock);
- return mmn_mm->invalidate_seq & 1;
+ lockdep_assert_held(&subscriptions->lock);
+ return subscriptions->invalidate_seq & 1;
}
static struct mmu_interval_notifier *
-mn_itree_inv_start_range(struct mmu_notifier_mm *mmn_mm,
+mn_itree_inv_start_range(struct mmu_notifier_subscriptions *subscriptions,
const struct mmu_notifier_range *range,
unsigned long *seq)
{
struct interval_tree_node *node;
struct mmu_interval_notifier *res = NULL;
- spin_lock(&mmn_mm->lock);
- mmn_mm->active_invalidate_ranges++;
- node = interval_tree_iter_first(&mmn_mm->itree, range->start,
+ spin_lock(&subscriptions->lock);
+ subscriptions->active_invalidate_ranges++;
+ node = interval_tree_iter_first(&subscriptions->itree, range->start,
range->end - 1);
if (node) {
- mmn_mm->invalidate_seq |= 1;
+ subscriptions->invalidate_seq |= 1;
res = container_of(node, struct mmu_interval_notifier,
interval_tree);
}
- *seq = mmn_mm->invalidate_seq;
- spin_unlock(&mmn_mm->lock);
+ *seq = subscriptions->invalidate_seq;
+ spin_unlock(&subscriptions->lock);
return res;
}
static struct mmu_interval_notifier *
-mn_itree_inv_next(struct mmu_interval_notifier *mni,
+mn_itree_inv_next(struct mmu_interval_notifier *interval_sub,
const struct mmu_notifier_range *range)
{
struct interval_tree_node *node;
- node = interval_tree_iter_next(&mni->interval_tree, range->start,
- range->end - 1);
+ node = interval_tree_iter_next(&interval_sub->interval_tree,
+ range->start, range->end - 1);
if (!node)
return NULL;
return container_of(node, struct mmu_interval_notifier, interval_tree);
}
-static void mn_itree_inv_end(struct mmu_notifier_mm *mmn_mm)
+static void mn_itree_inv_end(struct mmu_notifier_subscriptions *subscriptions)
{
- struct mmu_interval_notifier *mni;
+ struct mmu_interval_notifier *interval_sub;
struct hlist_node *next;
- spin_lock(&mmn_mm->lock);
- if (--mmn_mm->active_invalidate_ranges ||
- !mn_itree_is_invalidating(mmn_mm)) {
- spin_unlock(&mmn_mm->lock);
+ spin_lock(&subscriptions->lock);
+ if (--subscriptions->active_invalidate_ranges ||
+ !mn_itree_is_invalidating(subscriptions)) {
+ spin_unlock(&subscriptions->lock);
return;
}
/* Make invalidate_seq even */
- mmn_mm->invalidate_seq++;
+ subscriptions->invalidate_seq++;
/*
* The inv_end incorporates a deferred mechanism like rtnl_unlock().
@@ -146,30 +147,31 @@ static void mn_itree_inv_end(struct mmu_notifier_mm *mmn_mm)
* they are progressed. This arrangement for tree updates is used to
* avoid using a blocking lock during invalidate_range_start.
*/
- hlist_for_each_entry_safe(mni, next, &mmn_mm->deferred_list,
+ hlist_for_each_entry_safe(interval_sub, next,
+ &subscriptions->deferred_list,
deferred_item) {
- if (RB_EMPTY_NODE(&mni->interval_tree.rb))
- interval_tree_insert(&mni->interval_tree,
- &mmn_mm->itree);
+ if (RB_EMPTY_NODE(&interval_sub->interval_tree.rb))
+ interval_tree_insert(&interval_sub->interval_tree,
+ &subscriptions->itree);
else
- interval_tree_remove(&mni->interval_tree,
- &mmn_mm->itree);
- hlist_del(&mni->deferred_item);
+ interval_tree_remove(&interval_sub->interval_tree,
+ &subscriptions->itree);
+ hlist_del(&interval_sub->deferred_item);
}
- spin_unlock(&mmn_mm->lock);
+ spin_unlock(&subscriptions->lock);
- wake_up_all(&mmn_mm->wq);
+ wake_up_all(&subscriptions->wq);
}
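The even/odd invalidate_seq gives consumers a seqcount-like retry loop; the kernel-doc below spells out the locking. A hedged sketch of the driver-side pattern, where the driver lock, the snapshot step and the commit step are all hypothetical:

#include <linux/mmu_notifier.h>
#include <linux/mutex.h>

static void demo_mirror_range(struct mmu_interval_notifier *interval_sub,
			      struct mutex *driver_lock)
{
	unsigned long seq;

	for (;;) {
		seq = mmu_interval_read_begin(interval_sub);

		/* ... snapshot the CPU PTEs for the range (may block) ... */

		mutex_lock(driver_lock);
		if (mmu_interval_read_retry(interval_sub, seq)) {
			/* collided with an invalidation: drop the snapshot and retry */
			mutex_unlock(driver_lock);
			continue;
		}
		/* ... commit the snapshot to the device page tables ... */
		mutex_unlock(driver_lock);
		break;
	}
}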
/**
* mmu_interval_read_begin - Begin a read side critical section against a VA
* range
- * mni: The range to use
+ * interval_sub: The interval subscription
*
 * mmu_interval_read_begin()/mmu_interval_read_retry() implement a
- * collision-retry scheme similar to seqcount for the VA range under mni. If
- * the mm invokes invalidation during the critical section then
- * mmu_interval_read_retry() will return true.
+ * collision-retry scheme similar to seqcount for the VA range under
+ * subscription. If the mm invokes invalidation during the critical section
+ * then mmu_interval_read_retry() will return true.
*
* This is useful to obtain shadow PTEs where teardown or setup of the SPTEs
* require a blocking context. The critical region formed by this can sleep,
@@ -180,68 +182,71 @@ static void mn_itree_inv_end(struct mmu_notifier_mm *mmn_mm)
*
* The return value should be passed to mmu_interval_read_retry().
*/
-unsigned long mmu_interval_read_begin(struct mmu_interval_notifier *mni)
+unsigned long
+mmu_interval_read_begin(struct mmu_interval_notifier *interval_sub)
{
- struct mmu_notifier_mm *mmn_mm = mni->mm->mmu_notifier_mm;
+ struct mmu_notifier_subscriptions *subscriptions =
+ interval_sub->mm->notifier_subscriptions;
unsigned long seq;
bool is_invalidating;
/*
- * If the mni has a different seq value under the user_lock than we
- * started with then it has collided.
+ * If the subscription has a different seq value under the user_lock
+ * than we started with then it has collided.
*
- * If the mni currently has the same seq value as the mmn_mm seq, then
- * it is currently between invalidate_start/end and is colliding.
+ * If the subscription currently has the same seq value as the
+ * subscriptions seq, then it is currently between
+ * invalidate_start/end and is colliding.
*
* The locking looks broadly like this:
* mn_tree_invalidate_start(): mmu_interval_read_begin():
* spin_lock
- * seq = READ_ONCE(mni->invalidate_seq);
- * seq == mmn_mm->invalidate_seq
+ * seq = READ_ONCE(interval_sub->invalidate_seq);
+ * seq == subs->invalidate_seq
* spin_unlock
* spin_lock
- * seq = ++mmn_mm->invalidate_seq
+ * seq = ++subscriptions->invalidate_seq
* spin_unlock
* op->invalidate_range():
* user_lock
* mmu_interval_set_seq()
- * mni->invalidate_seq = seq
+ * interval_sub->invalidate_seq = seq
* user_unlock
*
* [Required: mmu_interval_read_retry() == true]
*
* mn_itree_inv_end():
* spin_lock
- * seq = ++mmn_mm->invalidate_seq
+ * seq = ++subscriptions->invalidate_seq
* spin_unlock
*
* user_lock
* mmu_interval_read_retry():
- * mni->invalidate_seq != seq
+ * interval_sub->invalidate_seq != seq
* user_unlock
*
* Barriers are not needed here as any races here are closed by an
* eventual mmu_interval_read_retry(), which provides a barrier via the
* user_lock.
*/
- spin_lock(&mmn_mm->lock);
+ spin_lock(&subscriptions->lock);
/* Pairs with the WRITE_ONCE in mmu_interval_set_seq() */
- seq = READ_ONCE(mni->invalidate_seq);
- is_invalidating = seq == mmn_mm->invalidate_seq;
- spin_unlock(&mmn_mm->lock);
+ seq = READ_ONCE(interval_sub->invalidate_seq);
+ is_invalidating = seq == subscriptions->invalidate_seq;
+ spin_unlock(&subscriptions->lock);
/*
- * mni->invalidate_seq must always be set to an odd value via
+ * interval_sub->invalidate_seq must always be set to an odd value via
* mmu_interval_set_seq() using the provided cur_seq from
* mn_itree_inv_start_range(). This ensures that if seq does wrap we
* will always clear the below sleep in some reasonable time as
- * mmn_mm->invalidate_seq is even in the idle state.
+ * subscriptions->invalidate_seq is even in the idle state.
*/
lock_map_acquire(&__mmu_notifier_invalidate_range_start_map);
lock_map_release(&__mmu_notifier_invalidate_range_start_map);
if (is_invalidating)
- wait_event(mmn_mm->wq,
- READ_ONCE(mmn_mm->invalidate_seq) != seq);
+ wait_event(subscriptions->wq,
+ READ_ONCE(subscriptions->invalidate_seq) != seq);
/*
* Notice that mmu_interval_read_retry() can already be true at this
@@ -253,7 +258,7 @@ unsigned long mmu_interval_read_begin(struct mmu_interval_notifier *mni)
}
EXPORT_SYMBOL_GPL(mmu_interval_read_begin);
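For context, the collision-retry scheme documented above is what a driver's read side looks like in practice. The sketch below is illustrative only and is not part of this patch; struct mirror, its spinlock (playing the role of the "user_lock" in the diagram) and program_sptes() are hypothetical names:

	#include <linux/mmu_notifier.h>
	#include <linux/spinlock.h>

	struct mirror {
		struct mmu_interval_notifier notifier;
		spinlock_t lock;	/* the "user_lock" from the diagram */
	};

	/* hypothetical: install shadow PTEs for @addr while m->lock is held */
	static void program_sptes(struct mirror *m, unsigned long addr);

	static void mirror_establish(struct mirror *m, unsigned long addr)
	{
		unsigned long seq;
		bool retry;

		do {
			seq = mmu_interval_read_begin(&m->notifier);

			/* blocking work (faulting pages, allocations) is allowed here */

			spin_lock(&m->lock);
			retry = mmu_interval_read_retry(&m->notifier, seq);
			if (!retry)
				program_sptes(m, addr);
			spin_unlock(&m->lock);
		} while (retry);
	}

If an invalidation runs anywhere between read_begin() and the locked read_retry() check, retry is true and the loop starts over, exactly as the sequence diagram above describes.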
-static void mn_itree_release(struct mmu_notifier_mm *mmn_mm,
+static void mn_itree_release(struct mmu_notifier_subscriptions *subscriptions,
struct mm_struct *mm)
{
struct mmu_notifier_range range = {
@@ -263,17 +268,20 @@ static void mn_itree_release(struct mmu_notifier_mm *mmn_mm,
.start = 0,
.end = ULONG_MAX,
};
- struct mmu_interval_notifier *mni;
+ struct mmu_interval_notifier *interval_sub;
unsigned long cur_seq;
bool ret;
- for (mni = mn_itree_inv_start_range(mmn_mm, &range, &cur_seq); mni;
- mni = mn_itree_inv_next(mni, &range)) {
- ret = mni->ops->invalidate(mni, &range, cur_seq);
+ for (interval_sub =
+ mn_itree_inv_start_range(subscriptions, &range, &cur_seq);
+ interval_sub;
+ interval_sub = mn_itree_inv_next(interval_sub, &range)) {
+ ret = interval_sub->ops->invalidate(interval_sub, &range,
+ cur_seq);
WARN_ON(!ret);
}
- mn_itree_inv_end(mmn_mm);
+ mn_itree_inv_end(subscriptions);
}
/*
@@ -283,15 +291,15 @@ static void mn_itree_release(struct mmu_notifier_mm *mmn_mm,
* in parallel despite there being no task using this mm any more,
* through the vmas outside of the exit_mmap context, such as with
* vmtruncate. This serializes against mmu_notifier_unregister with
- * the mmu_notifier_mm->lock in addition to SRCU and it serializes
- * against the other mmu notifiers with SRCU. struct mmu_notifier_mm
+ * the notifier_subscriptions->lock in addition to SRCU and it serializes
+ * against the other mmu notifiers with SRCU. struct mmu_notifier_subscriptions
* can't go away from under us as exit_mmap holds an mm_count pin
* itself.
*/
-static void mn_hlist_release(struct mmu_notifier_mm *mmn_mm,
+static void mn_hlist_release(struct mmu_notifier_subscriptions *subscriptions,
struct mm_struct *mm)
{
- struct mmu_notifier *mn;
+ struct mmu_notifier *subscription;
int id;
/*
@@ -299,29 +307,29 @@ static void mn_hlist_release(struct mmu_notifier_mm *mmn_mm,
* ->release returns.
*/
id = srcu_read_lock(&srcu);
- hlist_for_each_entry_rcu(mn, &mmn_mm->list, hlist)
+ hlist_for_each_entry_rcu(subscription, &subscriptions->list, hlist)
/*
* If ->release runs before mmu_notifier_unregister it must be
* handled, as it's the only way for the driver to flush all
* existing sptes and stop the driver from establishing any more
* sptes before all the pages in the mm are freed.
*/
- if (mn->ops->release)
- mn->ops->release(mn, mm);
+ if (subscription->ops->release)
+ subscription->ops->release(subscription, mm);
- spin_lock(&mmn_mm->lock);
- while (unlikely(!hlist_empty(&mmn_mm->list))) {
- mn = hlist_entry(mmn_mm->list.first, struct mmu_notifier,
- hlist);
+ spin_lock(&subscriptions->lock);
+ while (unlikely(!hlist_empty(&subscriptions->list))) {
+ subscription = hlist_entry(subscriptions->list.first,
+ struct mmu_notifier, hlist);
/*
* We arrived before mmu_notifier_unregister so
* mmu_notifier_unregister will do nothing other than to wait
* for ->release to finish and for mmu_notifier_unregister to
* return.
*/
- hlist_del_init_rcu(&mn->hlist);
+ hlist_del_init_rcu(&subscription->hlist);
}
- spin_unlock(&mmn_mm->lock);
+ spin_unlock(&subscriptions->lock);
srcu_read_unlock(&srcu, id);
/*
@@ -330,21 +338,22 @@ static void mn_hlist_release(struct mmu_notifier_mm *mmn_mm,
* until the ->release method returns, if it was invoked by
* mmu_notifier_unregister.
*
- * The mmu_notifier_mm can't go away from under us because one mm_count
- * is held by exit_mmap.
+ * The notifier_subscriptions can't go away from under us because
+ * one mm_count is held by exit_mmap.
*/
synchronize_srcu(&srcu);
}
void __mmu_notifier_release(struct mm_struct *mm)
{
- struct mmu_notifier_mm *mmn_mm = mm->mmu_notifier_mm;
+ struct mmu_notifier_subscriptions *subscriptions =
+ mm->notifier_subscriptions;
- if (mmn_mm->has_itree)
- mn_itree_release(mmn_mm, mm);
+ if (subscriptions->has_itree)
+ mn_itree_release(subscriptions, mm);
- if (!hlist_empty(&mmn_mm->list))
- mn_hlist_release(mmn_mm, mm);
+ if (!hlist_empty(&subscriptions->list))
+ mn_hlist_release(subscriptions, mm);
}
/*
@@ -356,13 +365,15 @@ int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
unsigned long start,
unsigned long end)
{
- struct mmu_notifier *mn;
+ struct mmu_notifier *subscription;
int young = 0, id;
id = srcu_read_lock(&srcu);
- hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
- if (mn->ops->clear_flush_young)
- young |= mn->ops->clear_flush_young(mn, mm, start, end);
+ hlist_for_each_entry_rcu(subscription,
+ &mm->notifier_subscriptions->list, hlist) {
+ if (subscription->ops->clear_flush_young)
+ young |= subscription->ops->clear_flush_young(
+ subscription, mm, start, end);
}
srcu_read_unlock(&srcu, id);
@@ -373,13 +384,15 @@ int __mmu_notifier_clear_young(struct mm_struct *mm,
unsigned long start,
unsigned long end)
{
- struct mmu_notifier *mn;
+ struct mmu_notifier *subscription;
int young = 0, id;
id = srcu_read_lock(&srcu);
- hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
- if (mn->ops->clear_young)
- young |= mn->ops->clear_young(mn, mm, start, end);
+ hlist_for_each_entry_rcu(subscription,
+ &mm->notifier_subscriptions->list, hlist) {
+ if (subscription->ops->clear_young)
+ young |= subscription->ops->clear_young(subscription,
+ mm, start, end);
}
srcu_read_unlock(&srcu, id);
@@ -389,13 +402,15 @@ int __mmu_notifier_clear_young(struct mm_struct *mm,
int __mmu_notifier_test_young(struct mm_struct *mm,
unsigned long address)
{
- struct mmu_notifier *mn;
+ struct mmu_notifier *subscription;
int young = 0, id;
id = srcu_read_lock(&srcu);
- hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
- if (mn->ops->test_young) {
- young = mn->ops->test_young(mn, mm, address);
+ hlist_for_each_entry_rcu(subscription,
+ &mm->notifier_subscriptions->list, hlist) {
+ if (subscription->ops->test_young) {
+ young = subscription->ops->test_young(subscription, mm,
+ address);
if (young)
break;
}
@@ -408,28 +423,33 @@ int __mmu_notifier_test_young(struct mm_struct *mm,
void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address,
pte_t pte)
{
- struct mmu_notifier *mn;
+ struct mmu_notifier *subscription;
int id;
id = srcu_read_lock(&srcu);
- hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
- if (mn->ops->change_pte)
- mn->ops->change_pte(mn, mm, address, pte);
+ hlist_for_each_entry_rcu(subscription,
+ &mm->notifier_subscriptions->list, hlist) {
+ if (subscription->ops->change_pte)
+ subscription->ops->change_pte(subscription, mm, address,
+ pte);
}
srcu_read_unlock(&srcu, id);
}
-static int mn_itree_invalidate(struct mmu_notifier_mm *mmn_mm,
+static int mn_itree_invalidate(struct mmu_notifier_subscriptions *subscriptions,
const struct mmu_notifier_range *range)
{
- struct mmu_interval_notifier *mni;
+ struct mmu_interval_notifier *interval_sub;
unsigned long cur_seq;
- for (mni = mn_itree_inv_start_range(mmn_mm, range, &cur_seq); mni;
- mni = mn_itree_inv_next(mni, range)) {
+ for (interval_sub =
+ mn_itree_inv_start_range(subscriptions, range, &cur_seq);
+ interval_sub;
+ interval_sub = mn_itree_inv_next(interval_sub, range)) {
bool ret;
- ret = mni->ops->invalidate(mni, range, cur_seq);
+ ret = interval_sub->ops->invalidate(interval_sub, range,
+ cur_seq);
if (!ret) {
if (WARN_ON(mmu_notifier_range_blockable(range)))
continue;
@@ -443,31 +463,36 @@ out_would_block:
* On -EAGAIN the non-blocking caller is not allowed to call
* invalidate_range_end()
*/
- mn_itree_inv_end(mmn_mm);
+ mn_itree_inv_end(subscriptions);
return -EAGAIN;
}
-static int mn_hlist_invalidate_range_start(struct mmu_notifier_mm *mmn_mm,
- struct mmu_notifier_range *range)
+static int mn_hlist_invalidate_range_start(
+ struct mmu_notifier_subscriptions *subscriptions,
+ struct mmu_notifier_range *range)
{
- struct mmu_notifier *mn;
+ struct mmu_notifier *subscription;
int ret = 0;
int id;
id = srcu_read_lock(&srcu);
- hlist_for_each_entry_rcu(mn, &mmn_mm->list, hlist) {
- if (mn->ops->invalidate_range_start) {
+ hlist_for_each_entry_rcu(subscription, &subscriptions->list, hlist) {
+ const struct mmu_notifier_ops *ops = subscription->ops;
+
+ if (ops->invalidate_range_start) {
int _ret;
if (!mmu_notifier_range_blockable(range))
non_block_start();
- _ret = mn->ops->invalidate_range_start(mn, range);
+ _ret = ops->invalidate_range_start(subscription, range);
if (!mmu_notifier_range_blockable(range))
non_block_end();
if (_ret) {
pr_info("%pS callback failed with %d in %sblockable context.\n",
- mn->ops->invalidate_range_start, _ret,
- !mmu_notifier_range_blockable(range) ? "non-" : "");
+ ops->invalidate_range_start, _ret,
+ !mmu_notifier_range_blockable(range) ?
+ "non-" :
+ "");
WARN_ON(mmu_notifier_range_blockable(range) ||
_ret != -EAGAIN);
ret = _ret;
@@ -481,28 +506,29 @@ static int mn_hlist_invalidate_range_start(struct mmu_notifier_mm *mmn_mm,
int __mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range)
{
- struct mmu_notifier_mm *mmn_mm = range->mm->mmu_notifier_mm;
+ struct mmu_notifier_subscriptions *subscriptions =
+ range->mm->notifier_subscriptions;
int ret;
- if (mmn_mm->has_itree) {
- ret = mn_itree_invalidate(mmn_mm, range);
+ if (subscriptions->has_itree) {
+ ret = mn_itree_invalidate(subscriptions, range);
if (ret)
return ret;
}
- if (!hlist_empty(&mmn_mm->list))
- return mn_hlist_invalidate_range_start(mmn_mm, range);
+ if (!hlist_empty(&subscriptions->list))
+ return mn_hlist_invalidate_range_start(subscriptions, range);
return 0;
}
-static void mn_hlist_invalidate_end(struct mmu_notifier_mm *mmn_mm,
- struct mmu_notifier_range *range,
- bool only_end)
+static void
+mn_hlist_invalidate_end(struct mmu_notifier_subscriptions *subscriptions,
+ struct mmu_notifier_range *range, bool only_end)
{
- struct mmu_notifier *mn;
+ struct mmu_notifier *subscription;
int id;
id = srcu_read_lock(&srcu);
- hlist_for_each_entry_rcu(mn, &mmn_mm->list, hlist) {
+ hlist_for_each_entry_rcu(subscription, &subscriptions->list, hlist) {
/*
* Call invalidate_range here too to avoid the need for the
* subsystem of having to register an invalidate_range_end
@@ -516,14 +542,16 @@ static void mn_hlist_invalidate_end(struct mmu_notifier_mm *mmn_mm,
* is safe to do when we know that a call to invalidate_range()
* already happen under page table lock.
*/
- if (!only_end && mn->ops->invalidate_range)
- mn->ops->invalidate_range(mn, range->mm,
- range->start,
- range->end);
- if (mn->ops->invalidate_range_end) {
+ if (!only_end && subscription->ops->invalidate_range)
+ subscription->ops->invalidate_range(subscription,
+ range->mm,
+ range->start,
+ range->end);
+ if (subscription->ops->invalidate_range_end) {
if (!mmu_notifier_range_blockable(range))
non_block_start();
- mn->ops->invalidate_range_end(mn, range);
+ subscription->ops->invalidate_range_end(subscription,
+ range);
if (!mmu_notifier_range_blockable(range))
non_block_end();
}
@@ -534,27 +562,30 @@ static void mn_hlist_invalidate_end(struct mmu_notifier_mm *mmn_mm,
void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range,
bool only_end)
{
- struct mmu_notifier_mm *mmn_mm = range->mm->mmu_notifier_mm;
+ struct mmu_notifier_subscriptions *subscriptions =
+ range->mm->notifier_subscriptions;
lock_map_acquire(&__mmu_notifier_invalidate_range_start_map);
- if (mmn_mm->has_itree)
- mn_itree_inv_end(mmn_mm);
+ if (subscriptions->has_itree)
+ mn_itree_inv_end(subscriptions);
- if (!hlist_empty(&mmn_mm->list))
- mn_hlist_invalidate_end(mmn_mm, range, only_end);
+ if (!hlist_empty(&subscriptions->list))
+ mn_hlist_invalidate_end(subscriptions, range, only_end);
lock_map_release(&__mmu_notifier_invalidate_range_start_map);
}
void __mmu_notifier_invalidate_range(struct mm_struct *mm,
unsigned long start, unsigned long end)
{
- struct mmu_notifier *mn;
+ struct mmu_notifier *subscription;
int id;
id = srcu_read_lock(&srcu);
- hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
- if (mn->ops->invalidate_range)
- mn->ops->invalidate_range(mn, mm, start, end);
+ hlist_for_each_entry_rcu(subscription,
+ &mm->notifier_subscriptions->list, hlist) {
+ if (subscription->ops->invalidate_range)
+ subscription->ops->invalidate_range(subscription, mm,
+ start, end);
}
srcu_read_unlock(&srcu, id);
}
@@ -564,9 +595,10 @@ void __mmu_notifier_invalidate_range(struct mm_struct *mm,
* write mode. A NULL mn signals the notifier is being registered for itree
* mode.
*/
-int __mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm)
+int __mmu_notifier_register(struct mmu_notifier *subscription,
+ struct mm_struct *mm)
{
- struct mmu_notifier_mm *mmu_notifier_mm = NULL;
+ struct mmu_notifier_subscriptions *subscriptions = NULL;
int ret;
lockdep_assert_held_write(&mm->mmap_sem);
@@ -579,23 +611,23 @@ int __mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm)
fs_reclaim_release(GFP_KERNEL);
}
- if (!mm->mmu_notifier_mm) {
+ if (!mm->notifier_subscriptions) {
/*
* kmalloc cannot be called under mm_take_all_locks(), but we
- * know that mm->mmu_notifier_mm can't change while we hold
- * the write side of the mmap_sem.
+ * know that mm->notifier_subscriptions can't change while we
+ * hold the write side of the mmap_sem.
*/
- mmu_notifier_mm =
- kzalloc(sizeof(struct mmu_notifier_mm), GFP_KERNEL);
- if (!mmu_notifier_mm)
+ subscriptions = kzalloc(
+ sizeof(struct mmu_notifier_subscriptions), GFP_KERNEL);
+ if (!subscriptions)
return -ENOMEM;
- INIT_HLIST_HEAD(&mmu_notifier_mm->list);
- spin_lock_init(&mmu_notifier_mm->lock);
- mmu_notifier_mm->invalidate_seq = 2;
- mmu_notifier_mm->itree = RB_ROOT_CACHED;
- init_waitqueue_head(&mmu_notifier_mm->wq);
- INIT_HLIST_HEAD(&mmu_notifier_mm->deferred_list);
+ INIT_HLIST_HEAD(&subscriptions->list);
+ spin_lock_init(&subscriptions->lock);
+ subscriptions->invalidate_seq = 2;
+ subscriptions->itree = RB_ROOT_CACHED;
+ init_waitqueue_head(&subscriptions->wq);
+ INIT_HLIST_HEAD(&subscriptions->deferred_list);
}
ret = mm_take_all_locks(mm);
@@ -610,34 +642,36 @@ int __mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm)
* We can't race against any other mmu notifier method either
* thanks to mm_take_all_locks().
*
- * release semantics on the initialization of the mmu_notifier_mm's
- * contents are provided for unlocked readers. acquire can only be
- * used while holding the mmgrab or mmget, and is safe because once
- * created the mmu_notififer_mm is not freed until the mm is
- * destroyed. As above, users holding the mmap_sem or one of the
+ * release semantics on the initialization of the
+ * mmu_notifier_subscriptions's contents are provided for unlocked
+ * readers. acquire can only be used while holding the mmgrab or
+ * mmget, and is safe because once created the
+ * mmu_notifier_subscriptions is not freed until the mm is destroyed.
+ * As above, users holding the mmap_sem or one of the
* mm_take_all_locks() do not need to use acquire semantics.
*/
- if (mmu_notifier_mm)
- smp_store_release(&mm->mmu_notifier_mm, mmu_notifier_mm);
+ if (subscriptions)
+ smp_store_release(&mm->notifier_subscriptions, subscriptions);
- if (mn) {
+ if (subscription) {
/* Pairs with the mmdrop in mmu_notifier_unregister_* */
mmgrab(mm);
- mn->mm = mm;
- mn->users = 1;
+ subscription->mm = mm;
+ subscription->users = 1;
- spin_lock(&mm->mmu_notifier_mm->lock);
- hlist_add_head_rcu(&mn->hlist, &mm->mmu_notifier_mm->list);
- spin_unlock(&mm->mmu_notifier_mm->lock);
+ spin_lock(&mm->notifier_subscriptions->lock);
+ hlist_add_head_rcu(&subscription->hlist,
+ &mm->notifier_subscriptions->list);
+ spin_unlock(&mm->notifier_subscriptions->lock);
} else
- mm->mmu_notifier_mm->has_itree = true;
+ mm->notifier_subscriptions->has_itree = true;
mm_drop_all_locks(mm);
BUG_ON(atomic_read(&mm->mm_users) <= 0);
return 0;
out_clean:
- kfree(mmu_notifier_mm);
+ kfree(subscriptions);
return ret;
}
EXPORT_SYMBOL_GPL(__mmu_notifier_register);
@@ -658,15 +692,16 @@ EXPORT_SYMBOL_GPL(__mmu_notifier_register);
* mmu_notifier_unregister() or mmu_notifier_put() must be always called to
* unregister the notifier.
*
- * While the caller has a mmu_notifier get the mn->mm pointer will remain
+ * While the caller has a mmu_notifier get, the subscription->mm pointer will remain
* valid, and can be converted to an active mm pointer via mmget_not_zero().
*/
-int mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm)
+int mmu_notifier_register(struct mmu_notifier *subscription,
+ struct mm_struct *mm)
{
int ret;
down_write(&mm->mmap_sem);
- ret = __mmu_notifier_register(mn, mm);
+ ret = __mmu_notifier_register(subscription, mm);
up_write(&mm->mmap_sem);
return ret;
}
@@ -675,21 +710,22 @@ EXPORT_SYMBOL_GPL(mmu_notifier_register);
static struct mmu_notifier *
find_get_mmu_notifier(struct mm_struct *mm, const struct mmu_notifier_ops *ops)
{
- struct mmu_notifier *mn;
+ struct mmu_notifier *subscription;
- spin_lock(&mm->mmu_notifier_mm->lock);
- hlist_for_each_entry_rcu (mn, &mm->mmu_notifier_mm->list, hlist) {
- if (mn->ops != ops)
+ spin_lock(&mm->notifier_subscriptions->lock);
+ hlist_for_each_entry_rcu(subscription,
+ &mm->notifier_subscriptions->list, hlist) {
+ if (subscription->ops != ops)
continue;
- if (likely(mn->users != UINT_MAX))
- mn->users++;
+ if (likely(subscription->users != UINT_MAX))
+ subscription->users++;
else
- mn = ERR_PTR(-EOVERFLOW);
- spin_unlock(&mm->mmu_notifier_mm->lock);
- return mn;
+ subscription = ERR_PTR(-EOVERFLOW);
+ spin_unlock(&mm->notifier_subscriptions->lock);
+ return subscription;
}
- spin_unlock(&mm->mmu_notifier_mm->lock);
+ spin_unlock(&mm->notifier_subscriptions->lock);
return NULL;
}
@@ -713,37 +749,37 @@ find_get_mmu_notifier(struct mm_struct *mm, const struct mmu_notifier_ops *ops)
struct mmu_notifier *mmu_notifier_get_locked(const struct mmu_notifier_ops *ops,
struct mm_struct *mm)
{
- struct mmu_notifier *mn;
+ struct mmu_notifier *subscription;
int ret;
lockdep_assert_held_write(&mm->mmap_sem);
- if (mm->mmu_notifier_mm) {
- mn = find_get_mmu_notifier(mm, ops);
- if (mn)
- return mn;
+ if (mm->notifier_subscriptions) {
+ subscription = find_get_mmu_notifier(mm, ops);
+ if (subscription)
+ return subscription;
}
- mn = ops->alloc_notifier(mm);
- if (IS_ERR(mn))
- return mn;
- mn->ops = ops;
- ret = __mmu_notifier_register(mn, mm);
+ subscription = ops->alloc_notifier(mm);
+ if (IS_ERR(subscription))
+ return subscription;
+ subscription->ops = ops;
+ ret = __mmu_notifier_register(subscription, mm);
if (ret)
goto out_free;
- return mn;
+ return subscription;
out_free:
- mn->ops->free_notifier(mn);
+ subscription->ops->free_notifier(subscription);
return ERR_PTR(ret);
}
EXPORT_SYMBOL_GPL(mmu_notifier_get_locked);
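As an illustration of how a driver consumes this API (my_ctx, my_alloc_notifier, my_free_notifier and my_ops are hypothetical names, not part of this patch):

	#include <linux/mmu_notifier.h>
	#include <linux/slab.h>
	#include <linux/err.h>

	struct my_ctx {
		struct mmu_notifier notifier;
		/* driver-private state */
	};

	static struct mmu_notifier *my_alloc_notifier(struct mm_struct *mm)
	{
		struct my_ctx *ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);

		if (!ctx)
			return ERR_PTR(-ENOMEM);
		return &ctx->notifier;
	}

	static void my_free_notifier(struct mmu_notifier *subscription)
	{
		kfree(container_of(subscription, struct my_ctx, notifier));
	}

	static const struct mmu_notifier_ops my_ops = {
		.alloc_notifier	= my_alloc_notifier,
		.free_notifier	= my_free_notifier,
		/* .release, .invalidate_range_start, ... as required */
	};

With mmap_sem held for write, mmu_notifier_get_locked(&my_ops, mm) either bumps the users count of an existing subscription registered with these ops or allocates a fresh one via ->alloc_notifier(). Each successful get is balanced with mmu_notifier_put(), and a module using this API calls mmu_notifier_synchronize() from its __exit handler so the SRCU-deferred ->free_notifier() calls have finished before the module is unloaded.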
/* this is called after the last mmu_notifier_unregister() returned */
-void __mmu_notifier_mm_destroy(struct mm_struct *mm)
+void __mmu_notifier_subscriptions_destroy(struct mm_struct *mm)
{
- BUG_ON(!hlist_empty(&mm->mmu_notifier_mm->list));
- kfree(mm->mmu_notifier_mm);
- mm->mmu_notifier_mm = LIST_POISON1; /* debug */
+ BUG_ON(!hlist_empty(&mm->notifier_subscriptions->list));
+ kfree(mm->notifier_subscriptions);
+ mm->notifier_subscriptions = LIST_POISON1; /* debug */
}
/*
@@ -756,11 +792,12 @@ void __mmu_notifier_mm_destroy(struct mm_struct *mm)
* and only after mmu_notifier_unregister returned we're guaranteed
* that ->release or any other method can't run anymore.
*/
-void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
+void mmu_notifier_unregister(struct mmu_notifier *subscription,
+ struct mm_struct *mm)
{
BUG_ON(atomic_read(&mm->mm_count) <= 0);
- if (!hlist_unhashed(&mn->hlist)) {
+ if (!hlist_unhashed(&subscription->hlist)) {
/*
* SRCU here will force exit_mmap to wait for ->release to
* finish before freeing the pages.
@@ -772,17 +809,17 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
* exit_mmap will block in mmu_notifier_release to guarantee
* that ->release is called before freeing the pages.
*/
- if (mn->ops->release)
- mn->ops->release(mn, mm);
+ if (subscription->ops->release)
+ subscription->ops->release(subscription, mm);
srcu_read_unlock(&srcu, id);
- spin_lock(&mm->mmu_notifier_mm->lock);
+ spin_lock(&mm->notifier_subscriptions->lock);
/*
* Can not use list_del_rcu() since __mmu_notifier_release
* can delete it before we hold the lock.
*/
- hlist_del_init_rcu(&mn->hlist);
- spin_unlock(&mm->mmu_notifier_mm->lock);
+ hlist_del_init_rcu(&subscription->hlist);
+ spin_unlock(&mm->notifier_subscriptions->lock);
}
/*
@@ -799,10 +836,11 @@ EXPORT_SYMBOL_GPL(mmu_notifier_unregister);
static void mmu_notifier_free_rcu(struct rcu_head *rcu)
{
- struct mmu_notifier *mn = container_of(rcu, struct mmu_notifier, rcu);
- struct mm_struct *mm = mn->mm;
+ struct mmu_notifier *subscription =
+ container_of(rcu, struct mmu_notifier, rcu);
+ struct mm_struct *mm = subscription->mm;
- mn->ops->free_notifier(mn);
+ subscription->ops->free_notifier(subscription);
/* Pairs with the get in __mmu_notifier_register() */
mmdrop(mm);
}
@@ -829,39 +867,40 @@ static void mmu_notifier_free_rcu(struct rcu_head *rcu)
* Modules calling this function must call mmu_notifier_synchronize() in
* their __exit functions to ensure the async work is completed.
*/
-void mmu_notifier_put(struct mmu_notifier *mn)
+void mmu_notifier_put(struct mmu_notifier *subscription)
{
- struct mm_struct *mm = mn->mm;
+ struct mm_struct *mm = subscription->mm;
- spin_lock(&mm->mmu_notifier_mm->lock);
- if (WARN_ON(!mn->users) || --mn->users)
+ spin_lock(&mm->notifier_subscriptions->lock);
+ if (WARN_ON(!subscription->users) || --subscription->users)
goto out_unlock;
- hlist_del_init_rcu(&mn->hlist);
- spin_unlock(&mm->mmu_notifier_mm->lock);
+ hlist_del_init_rcu(&subscription->hlist);
+ spin_unlock(&mm->notifier_subscriptions->lock);
- call_srcu(&srcu, &mn->rcu, mmu_notifier_free_rcu);
+ call_srcu(&srcu, &subscription->rcu, mmu_notifier_free_rcu);
return;
out_unlock:
- spin_unlock(&mm->mmu_notifier_mm->lock);
+ spin_unlock(&mm->notifier_subscriptions->lock);
}
EXPORT_SYMBOL_GPL(mmu_notifier_put);
static int __mmu_interval_notifier_insert(
- struct mmu_interval_notifier *mni, struct mm_struct *mm,
- struct mmu_notifier_mm *mmn_mm, unsigned long start,
+ struct mmu_interval_notifier *interval_sub, struct mm_struct *mm,
+ struct mmu_notifier_subscriptions *subscriptions, unsigned long start,
unsigned long length, const struct mmu_interval_notifier_ops *ops)
{
- mni->mm = mm;
- mni->ops = ops;
- RB_CLEAR_NODE(&mni->interval_tree.rb);
- mni->interval_tree.start = start;
+ interval_sub->mm = mm;
+ interval_sub->ops = ops;
+ RB_CLEAR_NODE(&interval_sub->interval_tree.rb);
+ interval_sub->interval_tree.start = start;
/*
* Note that the representation of the intervals in the interval tree
* considers the ending point as contained in the interval.
*/
if (length == 0 ||
- check_add_overflow(start, length - 1, &mni->interval_tree.last))
+ check_add_overflow(start, length - 1,
+ &interval_sub->interval_tree.last))
return -EOVERFLOW;
/* Must call with a mmget() held */
@@ -881,38 +920,40 @@ static int __mmu_interval_notifier_insert(
* possibility for live lock, instead defer the add to
* mn_itree_inv_end() so this algorithm is deterministic.
*
- * In all cases the value for the mni->invalidate_seq should be
+ * In all cases the value for the interval_sub->invalidate_seq should be
* odd, see mmu_interval_read_begin()
*/
- spin_lock(&mmn_mm->lock);
- if (mmn_mm->active_invalidate_ranges) {
- if (mn_itree_is_invalidating(mmn_mm))
- hlist_add_head(&mni->deferred_item,
- &mmn_mm->deferred_list);
+ spin_lock(&subscriptions->lock);
+ if (subscriptions->active_invalidate_ranges) {
+ if (mn_itree_is_invalidating(subscriptions))
+ hlist_add_head(&interval_sub->deferred_item,
+ &subscriptions->deferred_list);
else {
- mmn_mm->invalidate_seq |= 1;
- interval_tree_insert(&mni->interval_tree,
- &mmn_mm->itree);
+ subscriptions->invalidate_seq |= 1;
+ interval_tree_insert(&interval_sub->interval_tree,
+ &subscriptions->itree);
}
- mni->invalidate_seq = mmn_mm->invalidate_seq;
+ interval_sub->invalidate_seq = subscriptions->invalidate_seq;
} else {
- WARN_ON(mn_itree_is_invalidating(mmn_mm));
+ WARN_ON(mn_itree_is_invalidating(subscriptions));
/*
- * The starting seq for a mni not under invalidation should be
- * odd, not equal to the current invalidate_seq and
+ * The starting seq for a subscription not under invalidation
+ * should be odd, not equal to the current invalidate_seq and
* invalidate_seq should not 'wrap' to the new seq any time
* soon.
*/
- mni->invalidate_seq = mmn_mm->invalidate_seq - 1;
- interval_tree_insert(&mni->interval_tree, &mmn_mm->itree);
+ interval_sub->invalidate_seq =
+ subscriptions->invalidate_seq - 1;
+ interval_tree_insert(&interval_sub->interval_tree,
+ &subscriptions->itree);
}
- spin_unlock(&mmn_mm->lock);
+ spin_unlock(&subscriptions->lock);
return 0;
}
/**
* mmu_interval_notifier_insert - Insert an interval notifier
- * @mni: Interval notifier to register
+ * @interval_sub: Interval subscription to register
* @start: Starting virtual address to monitor
* @length: Length of the range to monitor
* @mm : mm_struct to attach to
@@ -925,53 +966,53 @@ static int __mmu_interval_notifier_insert(
* The caller must use the normal interval notifier read flow via
* mmu_interval_read_begin() to establish SPTEs for this range.
*/
-int mmu_interval_notifier_insert(struct mmu_interval_notifier *mni,
+int mmu_interval_notifier_insert(struct mmu_interval_notifier *interval_sub,
struct mm_struct *mm, unsigned long start,
unsigned long length,
const struct mmu_interval_notifier_ops *ops)
{
- struct mmu_notifier_mm *mmn_mm;
+ struct mmu_notifier_subscriptions *subscriptions;
int ret;
might_lock(&mm->mmap_sem);
- mmn_mm = smp_load_acquire(&mm->mmu_notifier_mm);
- if (!mmn_mm || !mmn_mm->has_itree) {
+ subscriptions = smp_load_acquire(&mm->notifier_subscriptions);
+ if (!subscriptions || !subscriptions->has_itree) {
ret = mmu_notifier_register(NULL, mm);
if (ret)
return ret;
- mmn_mm = mm->mmu_notifier_mm;
+ subscriptions = mm->notifier_subscriptions;
}
- return __mmu_interval_notifier_insert(mni, mm, mmn_mm, start, length,
- ops);
+ return __mmu_interval_notifier_insert(interval_sub, mm, subscriptions,
+ start, length, ops);
}
EXPORT_SYMBOL_GPL(mmu_interval_notifier_insert);
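To complete the picture started after mmu_interval_read_begin() above, the same hypothetical mirror driver registers an interval subscription and resolves collisions from its ->invalidate() callback with mmu_interval_set_seq(). Again this is only an illustrative sketch, not part of the patch; mirror_invalidate, mirror_ops and mirror_setup are made-up names:

	static bool mirror_invalidate(struct mmu_interval_notifier *interval_sub,
				      const struct mmu_notifier_range *range,
				      unsigned long cur_seq)
	{
		struct mirror *m = container_of(interval_sub, struct mirror,
						notifier);

		spin_lock(&m->lock);		/* the same "user_lock" */
		mmu_interval_set_seq(interval_sub, cur_seq);
		/* tear down SPTEs covering range->start .. range->end here */
		spin_unlock(&m->lock);
		return true;
	}

	static const struct mmu_interval_notifier_ops mirror_ops = {
		.invalidate = mirror_invalidate,
	};

	static int mirror_setup(struct mirror *m, unsigned long start,
				unsigned long length)
	{
		spin_lock_init(&m->lock);
		/* caller must hold a mmget() on current->mm, as noted above */
		return mmu_interval_notifier_insert(&m->notifier, current->mm,
						    start, length, &mirror_ops);
	}

Because the callback serializes with a spinlock it never sleeps, so it can return true for both blockable and non-blockable ranges; a driver serializing with a mutex would have to honour mmu_notifier_range_blockable() and return false when it cannot acquire its lock.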
int mmu_interval_notifier_insert_locked(
- struct mmu_interval_notifier *mni, struct mm_struct *mm,
+ struct mmu_interval_notifier *interval_sub, struct mm_struct *mm,
unsigned long start, unsigned long length,
const struct mmu_interval_notifier_ops *ops)
{
- struct mmu_notifier_mm *mmn_mm;
+ struct mmu_notifier_subscriptions *subscriptions =
+ mm->notifier_subscriptions;
int ret;
lockdep_assert_held_write(&mm->mmap_sem);
- mmn_mm = mm->mmu_notifier_mm;
- if (!mmn_mm || !mmn_mm->has_itree) {
+ if (!subscriptions || !subscriptions->has_itree) {
ret = __mmu_notifier_register(NULL, mm);
if (ret)
return ret;
- mmn_mm = mm->mmu_notifier_mm;
+ subscriptions = mm->notifier_subscriptions;
}
- return __mmu_interval_notifier_insert(mni, mm, mmn_mm, start, length,
- ops);
+ return __mmu_interval_notifier_insert(interval_sub, mm, subscriptions,
+ start, length, ops);
}
EXPORT_SYMBOL_GPL(mmu_interval_notifier_insert_locked);
/**
 * mmu_interval_notifier_remove - Remove an interval notifier
- * @mni: Interval notifier to unregister
+ * @interval_sub: Interval subscription to unregister
*
* This function must be paired with mmu_interval_notifier_insert(). It cannot
* be called from any ops callback.
@@ -979,32 +1020,34 @@ EXPORT_SYMBOL_GPL(mmu_interval_notifier_insert_locked);
* Once this returns ops callbacks are no longer running on other CPUs and
* will not be called in future.
*/
-void mmu_interval_notifier_remove(struct mmu_interval_notifier *mni)
+void mmu_interval_notifier_remove(struct mmu_interval_notifier *interval_sub)
{
- struct mm_struct *mm = mni->mm;
- struct mmu_notifier_mm *mmn_mm = mm->mmu_notifier_mm;
+ struct mm_struct *mm = interval_sub->mm;
+ struct mmu_notifier_subscriptions *subscriptions =
+ mm->notifier_subscriptions;
unsigned long seq = 0;
might_sleep();
- spin_lock(&mmn_mm->lock);
- if (mn_itree_is_invalidating(mmn_mm)) {
+ spin_lock(&subscriptions->lock);
+ if (mn_itree_is_invalidating(subscriptions)) {
/*
* remove is being called after insert put this on the
* deferred list, but before the deferred list was processed.
*/
- if (RB_EMPTY_NODE(&mni->interval_tree.rb)) {
- hlist_del(&mni->deferred_item);
+ if (RB_EMPTY_NODE(&interval_sub->interval_tree.rb)) {
+ hlist_del(&interval_sub->deferred_item);
} else {
- hlist_add_head(&mni->deferred_item,
- &mmn_mm->deferred_list);
- seq = mmn_mm->invalidate_seq;
+ hlist_add_head(&interval_sub->deferred_item,
+ &subscriptions->deferred_list);
+ seq = subscriptions->invalidate_seq;
}
} else {
- WARN_ON(RB_EMPTY_NODE(&mni->interval_tree.rb));
- interval_tree_remove(&mni->interval_tree, &mmn_mm->itree);
+ WARN_ON(RB_EMPTY_NODE(&interval_sub->interval_tree.rb));
+ interval_tree_remove(&interval_sub->interval_tree,
+ &subscriptions->itree);
}
- spin_unlock(&mmn_mm->lock);
+ spin_unlock(&subscriptions->lock);
/*
* The possible sleep on progress in the invalidation requires the
@@ -1013,8 +1056,8 @@ void mmu_interval_notifier_remove(struct mmu_interval_notifier *mni)
lock_map_acquire(&__mmu_notifier_invalidate_range_start_map);
lock_map_release(&__mmu_notifier_invalidate_range_start_map);
if (seq)
- wait_event(mmn_mm->wq,
- READ_ONCE(mmn_mm->invalidate_seq) != seq);
+ wait_event(subscriptions->wq,
+ READ_ONCE(subscriptions->invalidate_seq) != seq);
/* pairs with mmgrab in mmu_interval_notifier_insert() */
mmdrop(mm);
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index d58c481b3df8..dfc357614e56 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -26,6 +26,7 @@
#include <linux/sched/mm.h>
#include <linux/sched/coredump.h>
#include <linux/sched/task.h>
+#include <linux/sched/debug.h>
#include <linux/swap.h>
#include <linux/timex.h>
#include <linux/jiffies.h>
@@ -620,6 +621,7 @@ static void oom_reap_task(struct task_struct *tsk)
pr_info("oom_reaper: unable to reap pid:%d (%s)\n",
task_pid_nr(tsk), tsk->comm);
+ sched_show_task(tsk);
debug_show_all_locks();
done:
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 50055d2e4ea8..2caf780a42e7 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -201,11 +201,11 @@ static void wb_min_max_ratio(struct bdi_writeback *wb,
if (this_bw < tot_bw) {
if (min) {
min *= this_bw;
- do_div(min, tot_bw);
+ min = div64_ul(min, tot_bw);
}
if (max < 100) {
max *= this_bw;
- do_div(max, tot_bw);
+ max = div64_ul(max, tot_bw);
}
}
@@ -766,7 +766,7 @@ static unsigned long __wb_calc_thresh(struct dirty_throttle_control *dtc)
struct wb_domain *dom = dtc_dom(dtc);
unsigned long thresh = dtc->thresh;
u64 wb_thresh;
- long numerator, denominator;
+ unsigned long numerator, denominator;
unsigned long wb_min_ratio, wb_max_ratio;
/*
@@ -777,7 +777,7 @@ static unsigned long __wb_calc_thresh(struct dirty_throttle_control *dtc)
wb_thresh = (thresh * (100 - bdi_min_ratio)) / 100;
wb_thresh *= numerator;
- do_div(wb_thresh, denominator);
+ wb_thresh = div64_ul(wb_thresh, denominator);
wb_min_max_ratio(dtc->wb, &wb_min_ratio, &wb_max_ratio);
@@ -1102,7 +1102,7 @@ static void wb_update_write_bandwidth(struct bdi_writeback *wb,
bw = written - min(written, wb->written_stamp);
bw *= HZ;
if (unlikely(elapsed > period)) {
- do_div(bw, elapsed);
+ bw = div64_ul(bw, elapsed);
avg = bw;
goto out;
}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 4785a8a2040e..3c4eb750a199 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -694,34 +694,27 @@ void prep_compound_page(struct page *page, unsigned int order)
#ifdef CONFIG_DEBUG_PAGEALLOC
unsigned int _debug_guardpage_minorder;
-#ifdef CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT
-DEFINE_STATIC_KEY_TRUE(_debug_pagealloc_enabled);
-#else
+bool _debug_pagealloc_enabled_early __read_mostly
+ = IS_ENABLED(CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT);
+EXPORT_SYMBOL(_debug_pagealloc_enabled_early);
DEFINE_STATIC_KEY_FALSE(_debug_pagealloc_enabled);
-#endif
EXPORT_SYMBOL(_debug_pagealloc_enabled);
DEFINE_STATIC_KEY_FALSE(_debug_guardpage_enabled);
static int __init early_debug_pagealloc(char *buf)
{
- bool enable = false;
-
- if (kstrtobool(buf, &enable))
- return -EINVAL;
-
- if (enable)
- static_branch_enable(&_debug_pagealloc_enabled);
-
- return 0;
+ return kstrtobool(buf, &_debug_pagealloc_enabled_early);
}
early_param("debug_pagealloc", early_debug_pagealloc);
-static void init_debug_guardpage(void)
+void init_debug_pagealloc(void)
{
if (!debug_pagealloc_enabled())
return;
+ static_branch_enable(&_debug_pagealloc_enabled);
+
if (!debug_guardpage_minorder())
return;
@@ -1186,7 +1179,7 @@ static __always_inline bool free_pages_prepare(struct page *page,
*/
arch_free_page(page, order);
- if (debug_pagealloc_enabled())
+ if (debug_pagealloc_enabled_static())
kernel_map_pages(page, 1 << order, 0);
kasan_free_nondeferred_pages(page, order);
@@ -1207,7 +1200,7 @@ static bool free_pcp_prepare(struct page *page)
static bool bulkfree_pcp_prepare(struct page *page)
{
- if (debug_pagealloc_enabled())
+ if (debug_pagealloc_enabled_static())
return free_pages_check(page);
else
return false;
@@ -1221,7 +1214,7 @@ static bool bulkfree_pcp_prepare(struct page *page)
*/
static bool free_pcp_prepare(struct page *page)
{
- if (debug_pagealloc_enabled())
+ if (debug_pagealloc_enabled_static())
return free_pages_prepare(page, 0, true);
else
return free_pages_prepare(page, 0, false);
@@ -1973,10 +1966,6 @@ void __init page_alloc_init_late(void)
for_each_populated_zone(zone)
set_zone_contiguous(zone);
-
-#ifdef CONFIG_DEBUG_PAGEALLOC
- init_debug_guardpage();
-#endif
}
#ifdef CONFIG_CMA
@@ -2106,7 +2095,7 @@ static inline bool free_pages_prezeroed(void)
*/
static inline bool check_pcp_refill(struct page *page)
{
- if (debug_pagealloc_enabled())
+ if (debug_pagealloc_enabled_static())
return check_new_page(page);
else
return false;
@@ -2128,7 +2117,7 @@ static inline bool check_pcp_refill(struct page *page)
}
static inline bool check_new_pcp(struct page *page)
{
- if (debug_pagealloc_enabled())
+ if (debug_pagealloc_enabled_static())
return check_new_page(page);
else
return false;
@@ -2155,7 +2144,7 @@ inline void post_alloc_hook(struct page *page, unsigned int order,
set_page_refcounted(page);
arch_alloc_page(page, order);
- if (debug_pagealloc_enabled())
+ if (debug_pagealloc_enabled_static())
kernel_map_pages(page, 1 << order, 1);
kasan_alloc_pages(page, order);
kernel_poison_pages(page, 1 << order, 1);
@@ -4476,8 +4465,11 @@ retry_cpuset:
if (page)
goto got_pg;
- if (order >= pageblock_order && (gfp_mask & __GFP_IO) &&
- !(gfp_mask & __GFP_RETRY_MAYFAIL)) {
+ /*
+ * Checks for costly allocations with __GFP_NORETRY, which
+ * includes some THP page fault allocations
+ */
+ if (costly_order && (gfp_mask & __GFP_NORETRY)) {
/*
* If allocating entire pageblock(s) and compaction
* failed because all zones are below low watermarks
@@ -4498,23 +4490,6 @@ retry_cpuset:
if (compact_result == COMPACT_SKIPPED ||
compact_result == COMPACT_DEFERRED)
goto nopage;
- }
-
- /*
- * Checks for costly allocations with __GFP_NORETRY, which
- * includes THP page fault allocations
- */
- if (costly_order && (gfp_mask & __GFP_NORETRY)) {
- /*
- * If compaction is deferred for high-order allocations,
- * it is because sync compaction recently failed. If
- * this is the case and the caller requested a THP
- * allocation, we do not want to heavily disrupt the
- * system, so we fail the allocation instead of entering
- * direct reclaim.
- */
- if (compact_result == COMPACT_DEFERRED)
- goto nopage;
/*
* Looks like reclaim/compaction is worth trying, but
@@ -5873,6 +5848,23 @@ overlap_memmap_init(unsigned long zone, unsigned long *pfn)
return false;
}
+#ifdef CONFIG_SPARSEMEM
+/* Skip PFNs that belong to non-present sections */
+static inline __meminit unsigned long next_pfn(unsigned long pfn)
+{
+ const unsigned long section_nr = pfn_to_section_nr(++pfn);
+
+ if (present_section_nr(section_nr))
+ return pfn;
+ return section_nr_to_pfn(next_present_section_nr(section_nr));
+}
+#else
+static inline __meminit unsigned long next_pfn(unsigned long pfn)
+{
+ return pfn + 1;
+}
+#endif
+
/*
* Initially all pages are reserved - free ones are freed
* up by memblock_free_all() once the early boot process is
@@ -5906,16 +5898,20 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
}
#endif
- for (pfn = start_pfn; pfn < end_pfn; pfn++) {
+ for (pfn = start_pfn; pfn < end_pfn; ) {
/*
* There can be holes in boot-time mem_map[]s handed to this
* function. They do not exist on hotplugged memory.
*/
if (context == MEMMAP_EARLY) {
- if (!early_pfn_valid(pfn))
+ if (!early_pfn_valid(pfn)) {
+ pfn = next_pfn(pfn);
continue;
- if (!early_pfn_in_nid(pfn, nid))
+ }
+ if (!early_pfn_in_nid(pfn, nid)) {
+ pfn++;
continue;
+ }
if (overlap_memmap_init(zone, &pfn))
continue;
if (defer_init(nid, pfn, end_pfn))
@@ -5943,16 +5939,17 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
set_pageblock_migratetype(page, MIGRATE_MOVABLE);
cond_resched();
}
+ pfn++;
}
}
#ifdef CONFIG_ZONE_DEVICE
void __ref memmap_init_zone_device(struct zone *zone,
unsigned long start_pfn,
- unsigned long size,
+ unsigned long nr_pages,
struct dev_pagemap *pgmap)
{
- unsigned long pfn, end_pfn = start_pfn + size;
+ unsigned long pfn, end_pfn = start_pfn + nr_pages;
struct pglist_data *pgdat = zone->zone_pgdat;
struct vmem_altmap *altmap = pgmap_altmap(pgmap);
unsigned long zone_idx = zone_idx(zone);
@@ -5969,7 +5966,7 @@ void __ref memmap_init_zone_device(struct zone *zone,
*/
if (altmap) {
start_pfn = altmap->base_pfn + vmem_altmap_offset(altmap);
- size = end_pfn - start_pfn;
+ nr_pages = end_pfn - start_pfn;
}
for (pfn = start_pfn; pfn < end_pfn; pfn++) {
@@ -6016,7 +6013,7 @@ void __ref memmap_init_zone_device(struct zone *zone,
}
pr_info("%s initialised %lu pages in %ums\n", __func__,
- size, jiffies_to_msecs(jiffies - start));
+ nr_pages, jiffies_to_msecs(jiffies - start));
}
#endif
@@ -6915,10 +6912,10 @@ void __init free_area_init_node(int nid, unsigned long *zones_size,
#if !defined(CONFIG_FLAT_NODE_MEM_MAP)
/*
- * Zero all valid struct pages in range [spfn, epfn), return number of struct
- * pages zeroed
+ * Initialize all valid struct pages in the range [spfn, epfn) and mark them
+ * PageReserved(). Return the number of struct pages that were initialized.
*/
-static u64 zero_pfn_range(unsigned long spfn, unsigned long epfn)
+static u64 __init init_unavailable_range(unsigned long spfn, unsigned long epfn)
{
unsigned long pfn;
u64 pgcnt = 0;
@@ -6929,7 +6926,13 @@ static u64 zero_pfn_range(unsigned long spfn, unsigned long epfn)
+ pageblock_nr_pages - 1;
continue;
}
- mm_zero_struct_page(pfn_to_page(pfn));
+ /*
+ * Use a fake node/zone (0) for now. Some of these pages
+ * (in memblock.reserved but not in memblock.memory) will
+ * get re-initialized via reserve_bootmem_region() later.
+ */
+ __init_single_page(pfn_to_page(pfn), pfn, 0, 0);
+ __SetPageReserved(pfn_to_page(pfn));
pgcnt++;
}
@@ -6941,14 +6944,15 @@ static u64 zero_pfn_range(unsigned long spfn, unsigned long epfn)
* initialized by going through __init_single_page(). But, there are some
* struct pages which are reserved in memblock allocator and their fields
* may be accessed (for example page_to_pfn() on some configuration accesses
- * flags). We must explicitly zero those struct pages.
+ * flags). We must explicitly initialize those struct pages.
*
* This function also addresses a similar issue where struct pages are left
* uninitialized because the physical address range is not covered by
* memblock.memory or memblock.reserved. That could happen when memblock
- * layout is manually configured via memmap=.
+ * layout is manually configured via memmap=, or when the highest physical
+ * address (max_pfn) does not end on a section boundary.
*/
-void __init zero_resv_unavail(void)
+static void __init init_unavailable_mem(void)
{
phys_addr_t start, end;
u64 i, pgcnt;
@@ -6961,10 +6965,20 @@ void __init zero_resv_unavail(void)
for_each_mem_range(i, &memblock.memory, NULL,
NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end, NULL) {
if (next < start)
- pgcnt += zero_pfn_range(PFN_DOWN(next), PFN_UP(start));
+ pgcnt += init_unavailable_range(PFN_DOWN(next),
+ PFN_UP(start));
next = end;
}
- pgcnt += zero_pfn_range(PFN_DOWN(next), max_pfn);
+
+ /*
+ * Early sections always have a fully populated memmap for the whole
+ * section - see pfn_valid(). If the last section has holes at the
+ * end and that section is marked "online", the memmap will be
+ * considered initialized. Make sure that memmap has a well defined
+ * state.
+ */
+ pgcnt += init_unavailable_range(PFN_DOWN(next),
+ round_up(max_pfn, PAGES_PER_SECTION));
/*
* Struct pages that do not have backing memory. This could be because
@@ -6973,6 +6987,10 @@ void __init zero_resv_unavail(void)
if (pgcnt)
pr_info("Zeroed struct page in unavailable ranges: %lld pages", pgcnt);
}
+#else
+static inline void __init init_unavailable_mem(void)
+{
+}
#endif /* !CONFIG_FLAT_NODE_MEM_MAP */
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
@@ -7402,7 +7420,7 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
/* Initialise every node */
mminit_verify_pageflags_layout();
setup_nr_node_ids();
- zero_resv_unavail();
+ init_unavailable_mem();
for_each_online_node(nid) {
pg_data_t *pgdat = NODE_DATA(nid);
free_area_init_node(nid, NULL,
@@ -7597,7 +7615,7 @@ void __init set_dma_reserve(unsigned long new_dma_reserve)
void __init free_area_init(unsigned long *zones_size)
{
- zero_resv_unavail();
+ init_unavailable_mem();
free_area_init_node(0, zones_size,
__pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
}
@@ -8179,20 +8197,22 @@ void *__init alloc_large_system_hash(const char *tablename,
/*
* This function checks whether pageblock includes unmovable pages or not.
- * If @count is not zero, it is okay to include less @count unmovable pages
*
* PageLRU check without isolation or lru_lock could race so that
* MIGRATE_MOVABLE block might include unmovable pages. And __PageMovable
* check without lock_page also may miss some movable non-lru pages at
* race condition. So you can't expect this function should be exact.
+ *
+ * Returns a page without holding a reference. If the caller wants to
+ * dereference that page (e.g., dumping), it has to make sure that it
+ * cannot get removed (e.g., via memory unplug) concurrently.
+ *
*/
-bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
- int migratetype, int flags)
+struct page *has_unmovable_pages(struct zone *zone, struct page *page,
+ int migratetype, int flags)
{
- unsigned long found;
unsigned long iter = 0;
unsigned long pfn = page_to_pfn(page);
- const char *reason = "unmovable page";
/*
* TODO we could make this much more efficient by not checking every
@@ -8209,22 +8229,19 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
* so consider them movable here.
*/
if (is_migrate_cma(migratetype))
- return false;
+ return NULL;
- reason = "CMA page";
- goto unmovable;
+ return page;
}
- for (found = 0; iter < pageblock_nr_pages; iter++) {
- unsigned long check = pfn + iter;
-
- if (!pfn_valid_within(check))
+ for (; iter < pageblock_nr_pages; iter++) {
+ if (!pfn_valid_within(pfn + iter))
continue;
- page = pfn_to_page(check);
+ page = pfn_to_page(pfn + iter);
if (PageReserved(page))
- goto unmovable;
+ return page;
/*
* If the zone is movable and we have ruled out all reserved
@@ -8244,7 +8261,7 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
unsigned int skip_pages;
if (!hugepage_migration_supported(page_hstate(head)))
- goto unmovable;
+ return page;
skip_pages = compound_nr(head) - (page - head);
iter += skip_pages - 1;
@@ -8270,11 +8287,9 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
if ((flags & MEMORY_OFFLINE) && PageHWPoison(page))
continue;
- if (__PageMovable(page))
+ if (__PageMovable(page) || PageLRU(page))
continue;
- if (!PageLRU(page))
- found++;
/*
* If there are RECLAIMABLE pages, we need to check
* it. But now, memory offline itself doesn't call
@@ -8288,15 +8303,9 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
* is set to both of a memory hole page and a _used_ kernel
* page at boot.
*/
- if (found > count)
- goto unmovable;
+ return page;
}
- return false;
-unmovable:
- WARN_ON_ONCE(zone_idx(zone) == ZONE_MOVABLE);
- if (flags & REPORT_FAILURE)
- dump_page(pfn_to_page(pfn + iter), reason);
- return true;
+ return NULL;
}
#ifdef CONFIG_CONTIG_ALLOC
@@ -8700,10 +8709,6 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
BUG_ON(!PageBuddy(page));
order = page_order(page);
offlined_pages += 1 << order;
-#ifdef CONFIG_DEBUG_VM
- pr_info("remove from free list %lx %d %lx\n",
- pfn, 1 << order, end_pfn);
-#endif
del_page_from_free_area(page, &zone->free_area[order]);
pfn += (1 << order);
}
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 04ee1663cdbe..a9fd7c740c23 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -17,10 +17,9 @@
static int set_migratetype_isolate(struct page *page, int migratetype, int isol_flags)
{
+ struct page *unmovable = NULL;
struct zone *zone;
- unsigned long flags, pfn;
- struct memory_isolate_notify arg;
- int notifier_ret;
+ unsigned long flags;
int ret = -EBUSY;
zone = page_zone(page);
@@ -35,41 +34,12 @@ static int set_migratetype_isolate(struct page *page, int migratetype, int isol_
if (is_migrate_isolate_page(page))
goto out;
- pfn = page_to_pfn(page);
- arg.start_pfn = pfn;
- arg.nr_pages = pageblock_nr_pages;
- arg.pages_found = 0;
-
- /*
- * It may be possible to isolate a pageblock even if the
- * migratetype is not MIGRATE_MOVABLE. The memory isolation
- * notifier chain is used by balloon drivers to return the
- * number of pages in a range that are held by the balloon
- * driver to shrink memory. If all the pages are accounted for
- * by balloons, are free, or on the LRU, isolation can continue.
- * Later, for example, when memory hotplug notifier runs, these
- * pages reported as "can be isolated" should be isolated(freed)
- * by the balloon driver through the memory notifier chain.
- */
- notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg);
- notifier_ret = notifier_to_errno(notifier_ret);
- if (notifier_ret)
- goto out;
/*
* FIXME: Now, memory hotplug doesn't call shrink_slab() by itself.
* We just check MOVABLE pages.
*/
- if (!has_unmovable_pages(zone, page, arg.pages_found, migratetype,
- isol_flags))
- ret = 0;
-
- /*
- * immobile means "not-on-lru" pages. If immobile is larger than
- * removable-by-driver pages reported by notifier, we'll fail.
- */
-
-out:
- if (!ret) {
+ unmovable = has_unmovable_pages(zone, page, migratetype, isol_flags);
+ if (!unmovable) {
unsigned long nr_pages;
int mt = get_pageblock_migratetype(page);
@@ -79,11 +49,24 @@ out:
NULL);
__mod_zone_freepage_state(zone, -nr_pages, mt);
+ ret = 0;
}
+out:
spin_unlock_irqrestore(&zone->lock, flags);
- if (!ret)
+ if (!ret) {
drain_all_pages(zone);
+ } else {
+ WARN_ON_ONCE(zone_idx(zone) == ZONE_MOVABLE);
+
+ if ((isol_flags & REPORT_FAILURE) && unmovable)
+ /*
+ * printk() with zone->lock held will likely trigger a
+ * lockdep splat, so defer it here.
+ */
+ dump_page(unmovable, "unmovable page");
+ }
+
return ret;
}
diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
index eff4b4520c8d..719c35246cfa 100644
--- a/mm/page_vma_mapped.c
+++ b/mm/page_vma_mapped.c
@@ -52,12 +52,16 @@ static bool map_pte(struct page_vma_mapped_walk *pvmw)
return true;
}
-static inline bool pfn_in_hpage(struct page *hpage, unsigned long pfn)
+static inline bool pfn_is_match(struct page *page, unsigned long pfn)
{
- unsigned long hpage_pfn = page_to_pfn(hpage);
+ unsigned long page_pfn = page_to_pfn(page);
+
+ /* normal page and hugetlbfs page */
+ if (!PageTransCompound(page) || PageHuge(page))
+ return page_pfn == pfn;
/* THP can be referenced by any subpage */
- return pfn >= hpage_pfn && pfn - hpage_pfn < hpage_nr_pages(hpage);
+ return pfn >= page_pfn && pfn - page_pfn < hpage_nr_pages(page);
}
/**
@@ -108,7 +112,7 @@ static bool check_pte(struct page_vma_mapped_walk *pvmw)
pfn = pte_pfn(*pvmw->pte);
}
- return pfn_in_hpage(pvmw->page, pfn);
+ return pfn_is_match(pvmw->page, pfn);
}
/**
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index ea0b9e606ad1..928df1638c30 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -4,26 +4,57 @@
#include <linux/sched.h>
#include <linux/hugetlb.h>
-static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
- struct mm_walk *walk)
+/*
+ * We want to know the real level where a entry is located ignoring any
+ * folding of levels which may be happening. For example if p4d is folded then
+ * a missing entry found at level 1 (p4d) is actually at level 0 (pgd).
+ */
+static int real_depth(int depth)
+{
+ if (depth == 3 && PTRS_PER_PMD == 1)
+ depth = 2;
+ if (depth == 2 && PTRS_PER_PUD == 1)
+ depth = 1;
+ if (depth == 1 && PTRS_PER_P4D == 1)
+ depth = 0;
+ return depth;
+}
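The depth computed here is handed to the ->pte_hole() callback (see the signature changes below), so a walker can report at which page-table level a hole was found. A hypothetical callback consuming it might look like this (show_hole is not part of this patch):

	static int show_hole(unsigned long addr, unsigned long next, int depth,
			     struct mm_walk *walk)
	{
		/* depth: 0 = pgd, 1 = p4d, 2 = pud, 3 = pmd, -1 = unknown */
		pr_debug("hole %#lx-%#lx at level %d\n", addr, next, depth);
		return 0;
	}

with .pte_hole = show_hole in the walker's mm_walk_ops.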
+
+static int walk_pte_range_inner(pte_t *pte, unsigned long addr,
+ unsigned long end, struct mm_walk *walk)
{
- pte_t *pte;
- int err = 0;
const struct mm_walk_ops *ops = walk->ops;
- spinlock_t *ptl;
+ int err = 0;
- pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
for (;;) {
err = ops->pte_entry(pte, addr, addr + PAGE_SIZE, walk);
if (err)
break;
- addr += PAGE_SIZE;
- if (addr == end)
+ if (addr >= end - PAGE_SIZE)
break;
+ addr += PAGE_SIZE;
pte++;
}
+ return err;
+}
+
+static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
+ struct mm_walk *walk)
+{
+ pte_t *pte;
+ int err = 0;
+ spinlock_t *ptl;
+
+ if (walk->no_vma) {
+ pte = pte_offset_map(pmd, addr);
+ err = walk_pte_range_inner(pte, addr, end, walk);
+ pte_unmap(pte);
+ } else {
+ pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
+ err = walk_pte_range_inner(pte, addr, end, walk);
+ pte_unmap_unlock(pte, ptl);
+ }
- pte_unmap_unlock(pte, ptl);
return err;
}
@@ -34,18 +65,22 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
unsigned long next;
const struct mm_walk_ops *ops = walk->ops;
int err = 0;
+ int depth = real_depth(3);
pmd = pmd_offset(pud, addr);
do {
again:
next = pmd_addr_end(addr, end);
- if (pmd_none(*pmd) || !walk->vma) {
+ if (pmd_none(*pmd) || (!walk->vma && !walk->no_vma)) {
if (ops->pte_hole)
- err = ops->pte_hole(addr, next, walk);
+ err = ops->pte_hole(addr, next, depth, walk);
if (err)
break;
continue;
}
+
+ walk->action = ACTION_SUBTREE;
+
/*
* This implies that each ->pmd_entry() handler
* needs to know about pmd_trans_huge() pmds
@@ -55,16 +90,24 @@ again:
if (err)
break;
+ if (walk->action == ACTION_AGAIN)
+ goto again;
+
/*
* Check this here so we only break down trans_huge
* pages when we _need_ to
*/
- if (!ops->pte_entry)
+ if ((!walk->vma && (pmd_leaf(*pmd) || !pmd_present(*pmd))) ||
+ walk->action == ACTION_CONTINUE ||
+ !(ops->pte_entry))
continue;
- split_huge_pmd(walk->vma, pmd, addr);
- if (pmd_trans_unstable(pmd))
- goto again;
+ if (walk->vma) {
+ split_huge_pmd(walk->vma, pmd, addr);
+ if (pmd_trans_unstable(pmd))
+ goto again;
+ }
+
err = walk_pte_range(pmd, addr, next, walk);
if (err)
break;
@@ -80,37 +123,41 @@ static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
unsigned long next;
const struct mm_walk_ops *ops = walk->ops;
int err = 0;
+ int depth = real_depth(2);
pud = pud_offset(p4d, addr);
do {
again:
next = pud_addr_end(addr, end);
- if (pud_none(*pud) || !walk->vma) {
+ if (pud_none(*pud) || (!walk->vma && !walk->no_vma)) {
if (ops->pte_hole)
- err = ops->pte_hole(addr, next, walk);
+ err = ops->pte_hole(addr, next, depth, walk);
if (err)
break;
continue;
}
- if (ops->pud_entry) {
- spinlock_t *ptl = pud_trans_huge_lock(pud, walk->vma);
+ walk->action = ACTION_SUBTREE;
- if (ptl) {
- err = ops->pud_entry(pud, addr, next, walk);
- spin_unlock(ptl);
- if (err)
- break;
- continue;
- }
- }
+ if (ops->pud_entry)
+ err = ops->pud_entry(pud, addr, next, walk);
+ if (err)
+ break;
+
+ if (walk->action == ACTION_AGAIN)
+ goto again;
- split_huge_pud(walk->vma, pud, addr);
+ if ((!walk->vma && (pud_leaf(*pud) || !pud_present(*pud))) ||
+ walk->action == ACTION_CONTINUE ||
+ !(ops->pmd_entry || ops->pte_entry))
+ continue;
+
+ if (walk->vma)
+ split_huge_pud(walk->vma, pud, addr);
if (pud_none(*pud))
goto again;
- if (ops->pmd_entry || ops->pte_entry)
- err = walk_pmd_range(pud, addr, next, walk);
+ err = walk_pmd_range(pud, addr, next, walk);
if (err)
break;
} while (pud++, addr = next, addr != end);
@@ -125,18 +172,24 @@ static int walk_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
unsigned long next;
const struct mm_walk_ops *ops = walk->ops;
int err = 0;
+ int depth = real_depth(1);
p4d = p4d_offset(pgd, addr);
do {
next = p4d_addr_end(addr, end);
if (p4d_none_or_clear_bad(p4d)) {
if (ops->pte_hole)
- err = ops->pte_hole(addr, next, walk);
+ err = ops->pte_hole(addr, next, depth, walk);
if (err)
break;
continue;
}
- if (ops->pmd_entry || ops->pte_entry)
+ if (ops->p4d_entry) {
+ err = ops->p4d_entry(p4d, addr, next, walk);
+ if (err)
+ break;
+ }
+ if (ops->pud_entry || ops->pmd_entry || ops->pte_entry)
err = walk_pud_range(p4d, addr, next, walk);
if (err)
break;
@@ -153,17 +206,26 @@ static int walk_pgd_range(unsigned long addr, unsigned long end,
const struct mm_walk_ops *ops = walk->ops;
int err = 0;
- pgd = pgd_offset(walk->mm, addr);
+ if (walk->pgd)
+ pgd = walk->pgd + pgd_index(addr);
+ else
+ pgd = pgd_offset(walk->mm, addr);
do {
next = pgd_addr_end(addr, end);
if (pgd_none_or_clear_bad(pgd)) {
if (ops->pte_hole)
- err = ops->pte_hole(addr, next, walk);
+ err = ops->pte_hole(addr, next, 0, walk);
if (err)
break;
continue;
}
- if (ops->pmd_entry || ops->pte_entry)
+ if (ops->pgd_entry) {
+ err = ops->pgd_entry(pgd, addr, next, walk);
+ if (err)
+ break;
+ }
+ if (ops->p4d_entry || ops->pud_entry || ops->pmd_entry ||
+ ops->pte_entry)
err = walk_p4d_range(pgd, addr, next, walk);
if (err)
break;
@@ -199,7 +261,7 @@ static int walk_hugetlb_range(unsigned long addr, unsigned long end,
if (pte)
err = ops->hugetlb_entry(pte, hmask, addr, next, walk);
else if (ops->pte_hole)
- err = ops->pte_hole(addr, next, walk);
+ err = ops->pte_hole(addr, next, -1, walk);
if (err)
break;
@@ -243,7 +305,7 @@ static int walk_page_test(unsigned long start, unsigned long end,
if (vma->vm_flags & VM_PFNMAP) {
int err = 1;
if (ops->pte_hole)
- err = ops->pte_hole(start, end, walk);
+ err = ops->pte_hole(start, end, -1, walk);
return err ? err : 1;
}
return 0;
@@ -369,6 +431,33 @@ int walk_page_range(struct mm_struct *mm, unsigned long start,
return err;
}
+/*
+ * Similar to walk_page_range() but can walk any page tables even if they are
+ * not backed by VMAs. Because 'unusual' entries may be walked this function
+ * will also not lock the PTEs for the pte_entry() callback. This is useful for
+ * walking the kernel page tables or page tables for firmware.
+ */
+int walk_page_range_novma(struct mm_struct *mm, unsigned long start,
+ unsigned long end, const struct mm_walk_ops *ops,
+ pgd_t *pgd,
+ void *private)
+{
+ struct mm_walk walk = {
+ .ops = ops,
+ .mm = mm,
+ .pgd = pgd,
+ .private = private,
+ .no_vma = true
+ };
+
+ if (start >= end || !walk.mm)
+ return -EINVAL;
+
+ lockdep_assert_held(&walk.mm->mmap_sem);
+
+ return __walk_page_range(start, end, &walk);
+}
+
int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops,
void *private)
{
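A note on the new entry point: the no-VMA walk is driven purely by the callbacks in
mm_walk_ops, the pte_hole() callback now receives the depth at which the hole was
found (0 for a pgd-level hole, deeper levels as appropriate, -1 when the level is not
known, e.g. for VM_PFNMAP or hugetlb ranges), and the caller must hold mmap_sem for
read. A minimal hedged sketch of a hypothetical user (count_present(), count_pte()
and count_hole() are illustrative names, not part of this patch):

	/* illustrative only */
	static int count_pte(pte_t *pte, unsigned long addr,
			     unsigned long next, struct mm_walk *walk)
	{
		unsigned long *present = walk->private;

		/* PTEs are not locked in the no-VMA case */
		if (pte_present(READ_ONCE(*pte)))
			(*present)++;
		return 0;
	}

	static int count_hole(unsigned long addr, unsigned long next,
			      int depth, struct mm_walk *walk)
	{
		return 0;	/* nothing mapped in [addr, next) at this depth */
	}

	static const struct mm_walk_ops count_ops = {
		.pte_entry = count_pte,
		.pte_hole  = count_hole,
	};

	static unsigned long count_present(struct mm_struct *mm,
					   unsigned long start, unsigned long end)
	{
		unsigned long present = 0;

		down_read(&mm->mmap_sem);
		walk_page_range_novma(mm, start, end, &count_ops,
				      NULL /* walk mm->pgd */, &present);
		up_read(&mm->mmap_sem);
		return present;
	}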
diff --git a/mm/percpu.c b/mm/percpu.c
index 7e06a1e58720..e9844086b236 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -270,33 +270,6 @@ static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk,
pcpu_unit_page_offset(cpu, page_idx);
}
-static void pcpu_next_unpop(unsigned long *bitmap, int *rs, int *re, int end)
-{
- *rs = find_next_zero_bit(bitmap, end, *rs);
- *re = find_next_bit(bitmap, end, *rs + 1);
-}
-
-static void pcpu_next_pop(unsigned long *bitmap, int *rs, int *re, int end)
-{
- *rs = find_next_bit(bitmap, end, *rs);
- *re = find_next_zero_bit(bitmap, end, *rs + 1);
-}
-
-/*
- * Bitmap region iterators. Iterates over the bitmap between
- * [@start, @end) in @chunk. @rs and @re should be integer variables
- * and will be set to start and end index of the current free region.
- */
-#define pcpu_for_each_unpop_region(bitmap, rs, re, start, end) \
- for ((rs) = (start), pcpu_next_unpop((bitmap), &(rs), &(re), (end)); \
- (rs) < (re); \
- (rs) = (re) + 1, pcpu_next_unpop((bitmap), &(rs), &(re), (end)))
-
-#define pcpu_for_each_pop_region(bitmap, rs, re, start, end) \
- for ((rs) = (start), pcpu_next_pop((bitmap), &(rs), &(re), (end)); \
- (rs) < (re); \
- (rs) = (re) + 1, pcpu_next_pop((bitmap), &(rs), &(re), (end)))
-
/*
* The following are helper functions to help access bitmaps and convert
* between bitmap offsets to address offsets.
@@ -732,9 +705,8 @@ static void pcpu_chunk_refresh_hint(struct pcpu_chunk *chunk, bool full_scan)
}
bits = 0;
- pcpu_for_each_md_free_region(chunk, bit_off, bits) {
+ pcpu_for_each_md_free_region(chunk, bit_off, bits)
pcpu_block_update(chunk_md, bit_off, bit_off + bits);
- }
}
/**
@@ -749,7 +721,7 @@ static void pcpu_block_refresh_hint(struct pcpu_chunk *chunk, int index)
{
struct pcpu_block_md *block = chunk->md_blocks + index;
unsigned long *alloc_map = pcpu_index_alloc_map(chunk, index);
- int rs, re, start; /* region start, region end */
+ unsigned int rs, re, start; /* region start, region end */
/* promote scan_hint to contig_hint */
if (block->scan_hint) {
@@ -765,10 +737,9 @@ static void pcpu_block_refresh_hint(struct pcpu_chunk *chunk, int index)
block->right_free = 0;
/* iterate over free areas and update the contig hints */
- pcpu_for_each_unpop_region(alloc_map, rs, re, start,
- PCPU_BITMAP_BLOCK_BITS) {
+ bitmap_for_each_clear_region(alloc_map, rs, re, start,
+ PCPU_BITMAP_BLOCK_BITS)
pcpu_block_update(block, rs, re);
- }
}
/**
@@ -1041,13 +1012,13 @@ static void pcpu_block_update_hint_free(struct pcpu_chunk *chunk, int bit_off,
static bool pcpu_is_populated(struct pcpu_chunk *chunk, int bit_off, int bits,
int *next_off)
{
- int page_start, page_end, rs, re;
+ unsigned int page_start, page_end, rs, re;
page_start = PFN_DOWN(bit_off * PCPU_MIN_ALLOC_SIZE);
page_end = PFN_UP((bit_off + bits) * PCPU_MIN_ALLOC_SIZE);
rs = page_start;
- pcpu_next_unpop(chunk->populated, &rs, &re, page_end);
+ bitmap_next_clear_region(chunk->populated, &rs, &re, page_end);
if (rs >= page_end)
return true;
@@ -1702,13 +1673,13 @@ area_found:
/* populate if not all pages are already there */
if (!is_atomic) {
- int page_start, page_end, rs, re;
+ unsigned int page_start, page_end, rs, re;
page_start = PFN_DOWN(off);
page_end = PFN_UP(off + size);
- pcpu_for_each_unpop_region(chunk->populated, rs, re,
- page_start, page_end) {
+ bitmap_for_each_clear_region(chunk->populated, rs, re,
+ page_start, page_end) {
WARN_ON(chunk->immutable);
ret = pcpu_populate_chunk(chunk, rs, re, pcpu_gfp);
@@ -1858,10 +1829,10 @@ static void pcpu_balance_workfn(struct work_struct *work)
spin_unlock_irq(&pcpu_lock);
list_for_each_entry_safe(chunk, next, &to_free, list) {
- int rs, re;
+ unsigned int rs, re;
- pcpu_for_each_pop_region(chunk->populated, rs, re, 0,
- chunk->nr_pages) {
+ bitmap_for_each_set_region(chunk->populated, rs, re, 0,
+ chunk->nr_pages) {
pcpu_depopulate_chunk(chunk, rs, re);
spin_lock_irq(&pcpu_lock);
pcpu_chunk_depopulated(chunk, rs, re);
@@ -1893,7 +1864,7 @@ retry_pop:
}
for (slot = pcpu_size_to_slot(PAGE_SIZE); slot < pcpu_nr_slots; slot++) {
- int nr_unpop = 0, rs, re;
+ unsigned int nr_unpop = 0, rs, re;
if (!nr_to_pop)
break;
@@ -1910,9 +1881,9 @@ retry_pop:
continue;
/* @chunk can't go away while pcpu_alloc_mutex is held */
- pcpu_for_each_unpop_region(chunk->populated, rs, re, 0,
- chunk->nr_pages) {
- int nr = min(re - rs, nr_to_pop);
+ bitmap_for_each_clear_region(chunk->populated, rs, re, 0,
+ chunk->nr_pages) {
+ int nr = min_t(int, re - rs, nr_to_pop);
ret = pcpu_populate_chunk(chunk, rs, rs + nr, gfp);
if (!ret) {
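The open-coded pcpu_next_*()/pcpu_for_each_*_region() helpers removed above are
replaced by the generic region iterators from <linux/bitmap.h>, with the region
bounds held in unsigned ints. A small illustrative sketch (not part of the patch,
assuming a 16-bit map with bits 4-7 set) of how the clear-region iterator reports
[rs, re) pairs:

	/* illustrative only */
	DECLARE_BITMAP(populated, 16);
	unsigned int rs, re;

	bitmap_zero(populated, 16);
	bitmap_set(populated, 4, 4);		/* bits 4..7 set */

	bitmap_for_each_clear_region(populated, rs, re, 0, 16)
		pr_info("clear region [%u, %u)\n", rs, re);
	/* expected: [0, 4) then [8, 16) */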
diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c
index 357aa7bef6c0..de41e830cdac 100644
--- a/mm/process_vm_access.c
+++ b/mm/process_vm_access.c
@@ -42,12 +42,11 @@ static int process_vm_rw_pages(struct page **pages,
if (copy > len)
copy = len;
- if (vm_write) {
+ if (vm_write)
copied = copy_page_from_iter(page, offset, copy, iter);
- set_page_dirty_lock(page);
- } else {
+ else
copied = copy_page_to_iter(page, offset, copy, iter);
- }
+
len -= copied;
if (copied < copy && iov_iter_count(iter))
return -EFAULT;
@@ -96,7 +95,7 @@ static int process_vm_rw_single_vec(unsigned long addr,
flags |= FOLL_WRITE;
while (!rc && nr_pages && iov_iter_count(iter)) {
- int pages = min(nr_pages, max_pages_per_loop);
+ int pinned_pages = min(nr_pages, max_pages_per_loop);
int locked = 1;
size_t bytes;
@@ -106,14 +105,15 @@ static int process_vm_rw_single_vec(unsigned long addr,
* current/current->mm
*/
down_read(&mm->mmap_sem);
- pages = get_user_pages_remote(task, mm, pa, pages, flags,
- process_pages, NULL, &locked);
+ pinned_pages = pin_user_pages_remote(task, mm, pa, pinned_pages,
+ flags, process_pages,
+ NULL, &locked);
if (locked)
up_read(&mm->mmap_sem);
- if (pages <= 0)
+ if (pinned_pages <= 0)
return -EFAULT;
- bytes = pages * PAGE_SIZE - start_offset;
+ bytes = pinned_pages * PAGE_SIZE - start_offset;
if (bytes > len)
bytes = len;
@@ -122,10 +122,12 @@ static int process_vm_rw_single_vec(unsigned long addr,
vm_write);
len -= bytes;
start_offset = 0;
- nr_pages -= pages;
- pa += pages * PAGE_SIZE;
- while (pages)
- put_page(process_pages[--pages]);
+ nr_pages -= pinned_pages;
+ pa += pinned_pages * PAGE_SIZE;
+
+ /* If vm_write is set, the pages need to be made dirty: */
+ unpin_user_pages_dirty_lock(process_pages, pinned_pages,
+ vm_write);
}
return rc;
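The conversion above follows the FOLL_PIN rules: pages obtained with
pin_user_pages*() must be released with unpin_user_page*() rather than put_page(),
and unpin_user_pages_dirty_lock() folds the former set_page_dirty_lock() step into
the release. A hedged sketch of the pattern (pin_and_fill() is an illustrative
name, not part of this patch):

	/* illustrative only */
	static long pin_and_fill(unsigned long uaddr, unsigned long nr,
				 struct page **pages)
	{
		long pinned;

		down_read(&current->mm->mmap_sem);
		pinned = pin_user_pages(uaddr, nr, FOLL_WRITE, pages, NULL);
		up_read(&current->mm->mmap_sem);
		if (pinned <= 0)
			return pinned ? pinned : -EFAULT;

		/* ... write into the pinned pages ... */

		/* mark dirty and unpin in one call; pass false for read-only use */
		unpin_user_pages_dirty_lock(pages, pinned, true);
		return pinned;
	}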
diff --git a/mm/ptdump.c b/mm/ptdump.c
new file mode 100644
index 000000000000..26208d0d03b7
--- /dev/null
+++ b/mm/ptdump.c
@@ -0,0 +1,139 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/pagewalk.h>
+#include <linux/ptdump.h>
+#include <linux/kasan.h>
+
+#ifdef CONFIG_KASAN
+/*
+ * This is an optimization for KASAN=y case. Since all kasan page tables
+ * eventually point to the kasan_early_shadow_page we could call note_page()
+ * right away without walking through lower level page tables. This saves
+ * us dozens of seconds (minutes for 5-level config) while checking for
+ * W+X mapping or reading kernel_page_tables debugfs file.
+ */
+static inline int note_kasan_page_table(struct mm_walk *walk,
+ unsigned long addr)
+{
+ struct ptdump_state *st = walk->private;
+
+ st->note_page(st, addr, 4, pte_val(kasan_early_shadow_pte[0]));
+
+ walk->action = ACTION_CONTINUE;
+
+ return 0;
+}
+#endif
+
+static int ptdump_pgd_entry(pgd_t *pgd, unsigned long addr,
+ unsigned long next, struct mm_walk *walk)
+{
+ struct ptdump_state *st = walk->private;
+ pgd_t val = READ_ONCE(*pgd);
+
+#if CONFIG_PGTABLE_LEVELS > 4 && defined(CONFIG_KASAN)
+ if (pgd_page(val) == virt_to_page(lm_alias(kasan_early_shadow_p4d)))
+ return note_kasan_page_table(walk, addr);
+#endif
+
+ if (pgd_leaf(val))
+ st->note_page(st, addr, 0, pgd_val(val));
+
+ return 0;
+}
+
+static int ptdump_p4d_entry(p4d_t *p4d, unsigned long addr,
+ unsigned long next, struct mm_walk *walk)
+{
+ struct ptdump_state *st = walk->private;
+ p4d_t val = READ_ONCE(*p4d);
+
+#if CONFIG_PGTABLE_LEVELS > 3 && defined(CONFIG_KASAN)
+ if (p4d_page(val) == virt_to_page(lm_alias(kasan_early_shadow_pud)))
+ return note_kasan_page_table(walk, addr);
+#endif
+
+ if (p4d_leaf(val))
+ st->note_page(st, addr, 1, p4d_val(val));
+
+ return 0;
+}
+
+static int ptdump_pud_entry(pud_t *pud, unsigned long addr,
+ unsigned long next, struct mm_walk *walk)
+{
+ struct ptdump_state *st = walk->private;
+ pud_t val = READ_ONCE(*pud);
+
+#if CONFIG_PGTABLE_LEVELS > 2 && defined(CONFIG_KASAN)
+ if (pud_page(val) == virt_to_page(lm_alias(kasan_early_shadow_pmd)))
+ return note_kasan_page_table(walk, addr);
+#endif
+
+ if (pud_leaf(val))
+ st->note_page(st, addr, 2, pud_val(val));
+
+ return 0;
+}
+
+static int ptdump_pmd_entry(pmd_t *pmd, unsigned long addr,
+ unsigned long next, struct mm_walk *walk)
+{
+ struct ptdump_state *st = walk->private;
+ pmd_t val = READ_ONCE(*pmd);
+
+#if defined(CONFIG_KASAN)
+ if (pmd_page(val) == virt_to_page(lm_alias(kasan_early_shadow_pte)))
+ return note_kasan_page_table(walk, addr);
+#endif
+
+ if (pmd_leaf(val))
+ st->note_page(st, addr, 3, pmd_val(val));
+
+ return 0;
+}
+
+static int ptdump_pte_entry(pte_t *pte, unsigned long addr,
+ unsigned long next, struct mm_walk *walk)
+{
+ struct ptdump_state *st = walk->private;
+
+ st->note_page(st, addr, 4, pte_val(READ_ONCE(*pte)));
+
+ return 0;
+}
+
+static int ptdump_hole(unsigned long addr, unsigned long next,
+ int depth, struct mm_walk *walk)
+{
+ struct ptdump_state *st = walk->private;
+
+ st->note_page(st, addr, depth, 0);
+
+ return 0;
+}
+
+static const struct mm_walk_ops ptdump_ops = {
+ .pgd_entry = ptdump_pgd_entry,
+ .p4d_entry = ptdump_p4d_entry,
+ .pud_entry = ptdump_pud_entry,
+ .pmd_entry = ptdump_pmd_entry,
+ .pte_entry = ptdump_pte_entry,
+ .pte_hole = ptdump_hole,
+};
+
+void ptdump_walk_pgd(struct ptdump_state *st, struct mm_struct *mm, pgd_t *pgd)
+{
+ const struct ptdump_range *range = st->range;
+
+ down_read(&mm->mmap_sem);
+ while (range->start != range->end) {
+ walk_page_range_novma(mm, range->start, range->end,
+ &ptdump_ops, pgd, st);
+ range++;
+ }
+ up_read(&mm->mmap_sem);
+
+ /* Flush out the last page */
+ st->note_page(st, 0, -1, 0);
+}
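The generic dumper is driven entirely by the arch-provided ptdump_state (its
definition lives in <linux/ptdump.h>): note_page() is called with level 0 (pgd)
through 4 (pte), holes report the depth they were found at, and a final call with
level -1 flushes the last page. A hedged sketch of an arch-side caller
(my_dump_state, my_note_page() and kernel_range are illustrative names, and the
callback signature is assumed from the calls above):

	/* illustrative only */
	struct my_dump_state {
		struct ptdump_state ptdump;
		unsigned long last_val;
	};

	static void my_note_page(struct ptdump_state *pt_st, unsigned long addr,
				 int level, unsigned long val)
	{
		struct my_dump_state *st =
			container_of(pt_st, struct my_dump_state, ptdump);

		if (val != st->last_val)
			pr_info("0x%lx: level %d val 0x%lx\n", addr, level, val);
		st->last_val = val;
	}

	static const struct ptdump_range kernel_range[] = {
		{ .start = PAGE_OFFSET, .end = ~0UL },
		{ .start = 0, .end = 0 }	/* start == end terminates the loop */
	};

	static void my_dump_kernel_pgtables(void)
	{
		struct my_dump_state st = {
			.ptdump = {
				.note_page = my_note_page,
				.range = kernel_range,
			},
		};

		ptdump_walk_pgd(&st.ptdump, &init_mm, NULL);
	}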
diff --git a/mm/shmem.c b/mm/shmem.c
index 165fa6332993..8793e8cc1a48 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2107,9 +2107,10 @@ unsigned long shmem_get_unmapped_area(struct file *file,
/*
* Our priority is to support MAP_SHARED mapped hugely;
* and support MAP_PRIVATE mapped hugely too, until it is COWed.
- * But if caller specified an address hint, respect that as before.
+ * But if caller specified an address hint and we allocated area there
+ * successfully, respect that as before.
*/
- if (uaddr)
+ if (uaddr == addr)
return addr;
if (shmem_huge != SHMEM_HUGE_FORCE) {
@@ -2143,7 +2144,7 @@ unsigned long shmem_get_unmapped_area(struct file *file,
if (inflated_len < len)
return addr;
- inflated_addr = get_area(NULL, 0, inflated_len, 0, flags);
+ inflated_addr = get_area(NULL, uaddr, inflated_len, 0, flags);
if (IS_ERR_VALUE(inflated_addr))
return addr;
if (inflated_addr & ~PAGE_MASK)
diff --git a/mm/slab.c b/mm/slab.c
index f1e1840af533..a89633603b2d 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1416,7 +1416,7 @@ static void kmem_rcu_free(struct rcu_head *head)
#if DEBUG
static bool is_debug_pagealloc_cache(struct kmem_cache *cachep)
{
- if (debug_pagealloc_enabled() && OFF_SLAB(cachep) &&
+ if (debug_pagealloc_enabled_static() && OFF_SLAB(cachep) &&
(cachep->size % PAGE_SIZE) == 0)
return true;
@@ -2008,7 +2008,7 @@ int __kmem_cache_create(struct kmem_cache *cachep, slab_flags_t flags)
* to check size >= 256. It guarantees that all necessary small
* sized slab is initialized in current slab initialization sequence.
*/
- if (debug_pagealloc_enabled() && (flags & SLAB_POISON) &&
+ if (debug_pagealloc_enabled_static() && (flags & SLAB_POISON) &&
size >= 256 && cachep->object_size > cache_line_size()) {
if (size < PAGE_SIZE || size % PAGE_SIZE == 0) {
size_t tmp_size = ALIGN(size, PAGE_SIZE);
diff --git a/mm/slab_common.c b/mm/slab_common.c
index f0ab6d4ceb4c..1907cb2903c7 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -903,7 +903,8 @@ static void flush_memcg_workqueue(struct kmem_cache *s)
* deactivates the memcg kmem_caches through workqueue. Make sure all
* previous workitems on workqueue are processed.
*/
- flush_workqueue(memcg_kmem_cache_wq);
+ if (likely(memcg_kmem_cache_wq))
+ flush_workqueue(memcg_kmem_cache_wq);
/*
* If we're racing with children kmem_cache deactivation, it might
@@ -1579,18 +1580,17 @@ static int slabinfo_open(struct inode *inode, struct file *file)
return seq_open(file, &slabinfo_op);
}
-static const struct file_operations proc_slabinfo_operations = {
- .open = slabinfo_open,
- .read = seq_read,
- .write = slabinfo_write,
- .llseek = seq_lseek,
- .release = seq_release,
+static const struct proc_ops slabinfo_proc_ops = {
+ .proc_open = slabinfo_open,
+ .proc_read = seq_read,
+ .proc_write = slabinfo_write,
+ .proc_lseek = seq_lseek,
+ .proc_release = seq_release,
};
static int __init slab_proc_init(void)
{
- proc_create("slabinfo", SLABINFO_RIGHTS, NULL,
- &proc_slabinfo_operations);
+ proc_create("slabinfo", SLABINFO_RIGHTS, NULL, &slabinfo_proc_ops);
return 0;
}
module_init(slab_proc_init);
@@ -1676,28 +1676,6 @@ static __always_inline void *__do_krealloc(const void *p, size_t new_size,
}
/**
- * __krealloc - like krealloc() but don't free @p.
- * @p: object to reallocate memory for.
- * @new_size: how many bytes of memory are required.
- * @flags: the type of memory to allocate.
- *
- * This function is like krealloc() except it never frees the originally
- * allocated buffer. Use this if you don't want to free the buffer immediately
- * like, for example, with RCU.
- *
- * Return: pointer to the allocated memory or %NULL in case of error
- */
-void *__krealloc(const void *p, size_t new_size, gfp_t flags)
-{
- if (unlikely(!new_size))
- return ZERO_SIZE_PTR;
-
- return __do_krealloc(p, new_size, flags);
-
-}
-EXPORT_SYMBOL(__krealloc);
-
-/**
* krealloc - reallocate memory. The contents will remain unchanged.
* @p: object to reallocate memory for.
* @new_size: how many bytes of memory are required.
diff --git a/mm/slub.c b/mm/slub.c
index d11389710b12..17dc00e33115 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -288,7 +288,7 @@ static inline void *get_freepointer_safe(struct kmem_cache *s, void *object)
unsigned long freepointer_addr;
void *p;
- if (!debug_pagealloc_enabled())
+ if (!debug_pagealloc_enabled_static())
return get_freepointer(s, object);
freepointer_addr = (unsigned long)object + s->offset;
@@ -439,19 +439,38 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
}
#ifdef CONFIG_SLUB_DEBUG
+static unsigned long object_map[BITS_TO_LONGS(MAX_OBJS_PER_PAGE)];
+static DEFINE_SPINLOCK(object_map_lock);
+
/*
* Determine a map of object in use on a page.
*
* Node listlock must be held to guarantee that the page does
* not vanish from under us.
*/
-static void get_map(struct kmem_cache *s, struct page *page, unsigned long *map)
+static unsigned long *get_map(struct kmem_cache *s, struct page *page)
{
void *p;
void *addr = page_address(page);
+ VM_BUG_ON(!irqs_disabled());
+
+ spin_lock(&object_map_lock);
+
+ bitmap_zero(object_map, page->objects);
+
for (p = page->freelist; p; p = get_freepointer(s, p))
- set_bit(slab_index(p, s, addr), map);
+ set_bit(slab_index(p, s, addr), object_map);
+
+ return object_map;
+}
+
+static void put_map(unsigned long *map)
+{
+ VM_BUG_ON(map != object_map);
+ lockdep_assert_held(&object_map_lock);
+
+ spin_unlock(&object_map_lock);
}
static inline unsigned int size_from_object(struct kmem_cache *s)
@@ -1964,7 +1983,7 @@ static void *get_partial(struct kmem_cache *s, gfp_t flags, int node,
return get_any_partial(s, flags, c);
}
-#ifdef CONFIG_PREEMPT
+#ifdef CONFIG_PREEMPTION
/*
* Calculate the next globally unique transaction for disambiguation
* during cmpxchg. The transactions start with the cpu number and are then
@@ -2009,7 +2028,7 @@ static inline void note_cmpxchg_failure(const char *n,
pr_info("%s %s: cmpxchg redo ", n, s->name);
-#ifdef CONFIG_PREEMPT
+#ifdef CONFIG_PREEMPTION
if (tid_to_cpu(tid) != tid_to_cpu(actual_tid))
pr_warn("due to cpu change %d -> %d\n",
tid_to_cpu(tid), tid_to_cpu(actual_tid));
@@ -2341,7 +2360,7 @@ static bool has_cpu_slab(int cpu, void *info)
static void flush_all(struct kmem_cache *s)
{
- on_each_cpu_cond(has_cpu_slab, flush_cpu_slab, s, 1, GFP_ATOMIC);
+ on_each_cpu_cond(has_cpu_slab, flush_cpu_slab, s, 1);
}
/*
@@ -2637,7 +2656,7 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
unsigned long flags;
local_irq_save(flags);
-#ifdef CONFIG_PREEMPT
+#ifdef CONFIG_PREEMPTION
/*
* We may have been preempted and rescheduled on a different
* cpu before disabling interrupts. Need to reload cpu area
@@ -2691,13 +2710,13 @@ redo:
* as we end up on the original cpu again when doing the cmpxchg.
*
* We should guarantee that tid and kmem_cache are retrieved on
- * the same cpu. It could be different if CONFIG_PREEMPT so we need
+ * the same cpu. It could be different if CONFIG_PREEMPTION so we need
* to check if it is matched or not.
*/
do {
tid = this_cpu_read(s->cpu_slab->tid);
c = raw_cpu_ptr(s->cpu_slab);
- } while (IS_ENABLED(CONFIG_PREEMPT) &&
+ } while (IS_ENABLED(CONFIG_PREEMPTION) &&
unlikely(tid != READ_ONCE(c->tid)));
/*
@@ -2971,7 +2990,7 @@ redo:
do {
tid = this_cpu_read(s->cpu_slab->tid);
c = raw_cpu_ptr(s->cpu_slab);
- } while (IS_ENABLED(CONFIG_PREEMPT) &&
+ } while (IS_ENABLED(CONFIG_PREEMPTION) &&
unlikely(tid != READ_ONCE(c->tid)));
/* Same with comment on barrier() in slab_alloc_node() */
@@ -3675,13 +3694,12 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page,
#ifdef CONFIG_SLUB_DEBUG
void *addr = page_address(page);
void *p;
- unsigned long *map = bitmap_zalloc(page->objects, GFP_ATOMIC);
- if (!map)
- return;
+ unsigned long *map;
+
slab_err(s, page, text, s->name);
slab_lock(page);
- get_map(s, page, map);
+ map = get_map(s, page);
for_each_object(p, s, addr, page->objects) {
if (!test_bit(slab_index(p, s, addr), map)) {
@@ -3689,8 +3707,9 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page,
print_tracking(s, p);
}
}
+ put_map(map);
+
slab_unlock(page);
- bitmap_free(map);
#endif
}
@@ -4384,19 +4403,19 @@ static int count_total(struct page *page)
#endif
#ifdef CONFIG_SLUB_DEBUG
-static void validate_slab(struct kmem_cache *s, struct page *page,
- unsigned long *map)
+static void validate_slab(struct kmem_cache *s, struct page *page)
{
void *p;
void *addr = page_address(page);
+ unsigned long *map;
+
+ slab_lock(page);
if (!check_slab(s, page) || !on_freelist(s, page, NULL))
- return;
+ goto unlock;
/* Now we know that a valid freelist exists */
- bitmap_zero(map, page->objects);
-
- get_map(s, page, map);
+ map = get_map(s, page);
for_each_object(p, s, addr, page->objects) {
u8 val = test_bit(slab_index(p, s, addr), map) ?
SLUB_RED_INACTIVE : SLUB_RED_ACTIVE;
@@ -4404,18 +4423,13 @@ static void validate_slab(struct kmem_cache *s, struct page *page,
if (!check_object(s, page, p, val))
break;
}
-}
-
-static void validate_slab_slab(struct kmem_cache *s, struct page *page,
- unsigned long *map)
-{
- slab_lock(page);
- validate_slab(s, page, map);
+ put_map(map);
+unlock:
slab_unlock(page);
}
static int validate_slab_node(struct kmem_cache *s,
- struct kmem_cache_node *n, unsigned long *map)
+ struct kmem_cache_node *n)
{
unsigned long count = 0;
struct page *page;
@@ -4424,7 +4438,7 @@ static int validate_slab_node(struct kmem_cache *s,
spin_lock_irqsave(&n->list_lock, flags);
list_for_each_entry(page, &n->partial, slab_list) {
- validate_slab_slab(s, page, map);
+ validate_slab(s, page);
count++;
}
if (count != n->nr_partial)
@@ -4435,7 +4449,7 @@ static int validate_slab_node(struct kmem_cache *s,
goto out;
list_for_each_entry(page, &n->full, slab_list) {
- validate_slab_slab(s, page, map);
+ validate_slab(s, page);
count++;
}
if (count != atomic_long_read(&n->nr_slabs))
@@ -4452,15 +4466,11 @@ static long validate_slab_cache(struct kmem_cache *s)
int node;
unsigned long count = 0;
struct kmem_cache_node *n;
- unsigned long *map = bitmap_alloc(oo_objects(s->max), GFP_KERNEL);
-
- if (!map)
- return -ENOMEM;
flush_all(s);
for_each_kmem_cache_node(s, node, n)
- count += validate_slab_node(s, n, map);
- bitmap_free(map);
+ count += validate_slab_node(s, n);
+
return count;
}
/*
@@ -4590,18 +4600,17 @@ static int add_location(struct loc_track *t, struct kmem_cache *s,
}
static void process_slab(struct loc_track *t, struct kmem_cache *s,
- struct page *page, enum track_item alloc,
- unsigned long *map)
+ struct page *page, enum track_item alloc)
{
void *addr = page_address(page);
void *p;
+ unsigned long *map;
- bitmap_zero(map, page->objects);
- get_map(s, page, map);
-
+ map = get_map(s, page);
for_each_object(p, s, addr, page->objects)
if (!test_bit(slab_index(p, s, addr), map))
add_location(t, s, get_track(s, p, alloc));
+ put_map(map);
}
static int list_locations(struct kmem_cache *s, char *buf,
@@ -4612,11 +4621,9 @@ static int list_locations(struct kmem_cache *s, char *buf,
struct loc_track t = { 0, 0, NULL };
int node;
struct kmem_cache_node *n;
- unsigned long *map = bitmap_alloc(oo_objects(s->max), GFP_KERNEL);
- if (!map || !alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location),
- GFP_KERNEL)) {
- bitmap_free(map);
+ if (!alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location),
+ GFP_KERNEL)) {
return sprintf(buf, "Out of memory\n");
}
/* Push back cpu slabs */
@@ -4631,9 +4638,9 @@ static int list_locations(struct kmem_cache *s, char *buf,
spin_lock_irqsave(&n->list_lock, flags);
list_for_each_entry(page, &n->partial, slab_list)
- process_slab(&t, s, page, alloc, map);
+ process_slab(&t, s, page, alloc);
list_for_each_entry(page, &n->full, slab_list)
- process_slab(&t, s, page, alloc, map);
+ process_slab(&t, s, page, alloc);
spin_unlock_irqrestore(&n->list_lock, flags);
}
@@ -4682,7 +4689,6 @@ static int list_locations(struct kmem_cache *s, char *buf,
}
free_loc_track(&t);
- bitmap_free(map);
if (!t.count)
len += sprintf(buf, "No data\n");
return len;
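With the single static object_map above, every SLUB debug iterator follows the same
bracketed pattern. A hedged distillation (mirroring validate_slab()/process_slab(),
not a literal hunk; callers already run with interrupts disabled because they hold
the node list_lock):

	/* illustrative only */
	void *addr = page_address(page);
	unsigned long *map;
	void *p;

	slab_lock(page);
	map = get_map(s, page);			/* takes object_map_lock */
	for_each_object(p, s, addr, page->objects) {
		if (test_bit(slab_index(p, s, addr), map))
			continue;		/* object is on the freelist */
		/* ... inspect the allocated object p ... */
	}
	put_map(map);				/* releases object_map_lock */
	slab_unlock(page);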
diff --git a/mm/sparse.c b/mm/sparse.c
index b20ab7cdac86..c184b69460b7 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -198,16 +198,6 @@ static void section_mark_present(struct mem_section *ms)
ms->section_mem_map |= SECTION_MARKED_PRESENT;
}
-static inline unsigned long next_present_section_nr(unsigned long section_nr)
-{
- do {
- section_nr++;
- if (present_section_nr(section_nr))
- return section_nr;
- } while ((section_nr <= __highest_present_section_nr));
-
- return -1;
-}
#define for_each_present_section_nr(start, section_nr) \
for (section_nr = next_present_section_nr(start-1); \
((section_nr != -1) && \
@@ -777,12 +767,19 @@ static void section_deactivate(unsigned long pfn, unsigned long nr_pages,
if (bitmap_empty(subsection_map, SUBSECTIONS_PER_SECTION)) {
unsigned long section_nr = pfn_to_section_nr(pfn);
- if (!section_is_early) {
+ /*
+ * When removing an early section, the usage map is kept (as the
+ * usage maps of other sections fall into the same page). It
+ * will be re-used when re-adding the section - which is then no
+ * longer an early section. If the usage map is PageReserved, it
+ * was allocated during boot.
+ */
+ if (!PageReserved(virt_to_page(ms->usage))) {
kfree(ms->usage);
ms->usage = NULL;
}
memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);
- ms->section_mem_map = sparse_encode_mem_map(NULL, section_nr);
+ ms->section_mem_map = (unsigned long)NULL;
}
if (section_is_early && memmap)
diff --git a/mm/swap.c b/mm/swap.c
index 5341ae93861f..cf39d24ada2a 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -813,8 +813,10 @@ void release_pages(struct page **pages, int nr)
* processing, and instead, expect a call to
* put_page_testzero().
*/
- if (put_devmap_managed_page(page))
+ if (page_is_devmap_managed(page)) {
+ put_devmap_managed_page(page);
continue;
+ }
}
page = compound_head(page);
@@ -1102,3 +1104,26 @@ void __init swap_setup(void)
* _really_ don't want to cluster much more
*/
}
+
+#ifdef CONFIG_DEV_PAGEMAP_OPS
+void put_devmap_managed_page(struct page *page)
+{
+ int count;
+
+ if (WARN_ON_ONCE(!page_is_devmap_managed(page)))
+ return;
+
+ count = page_ref_dec_return(page);
+
+ /*
+ * devmap page refcounts are 1-based, rather than 0-based: if
+ * refcount is 1, then the page is free and the refcount is
+ * stable because nobody holds a reference on the page.
+ */
+ if (count == 1)
+ free_devmap_managed_page(page);
+ else if (!count)
+ __put_page(page);
+}
+EXPORT_SYMBOL(put_devmap_managed_page);
+#endif
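With put_devmap_managed_page() now out of line, the release side checks the page
type first, mirroring the release_pages() hunk above. A hedged sketch (my_put_page()
is an illustrative name, not part of this patch):

	/* illustrative only */
	static void my_put_page(struct page *page)
	{
		if (page_is_devmap_managed(page)) {
			/* 1-based refcount: dropping to 1 hands the page back to the driver */
			put_devmap_managed_page(page);
			return;
		}
		put_page(page);
	}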
diff --git a/mm/swapfile.c b/mm/swapfile.c
index bb3261d45b6a..2c33ff456ed5 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -2737,10 +2737,10 @@ static void *swap_next(struct seq_file *swap, void *v, loff_t *pos)
else
type = si->type + 1;
+ ++(*pos);
for (; (si = swap_type_to_swap_info(type)); type++) {
if (!(si->flags & SWP_USED) || !si->swap_map)
continue;
- ++*pos;
return si;
}
@@ -2796,17 +2796,17 @@ static int swaps_open(struct inode *inode, struct file *file)
return 0;
}
-static const struct file_operations proc_swaps_operations = {
- .open = swaps_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
- .poll = swaps_poll,
+static const struct proc_ops swaps_proc_ops = {
+ .proc_open = swaps_open,
+ .proc_read = seq_read,
+ .proc_lseek = seq_lseek,
+ .proc_release = seq_release,
+ .proc_poll = swaps_poll,
};
static int __init procswaps_init(void)
{
- proc_create("swaps", 0, NULL, &proc_swaps_operations);
+ proc_create("swaps", 0, NULL, &swaps_proc_ops);
return 0;
}
__initcall(procswaps_init);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index e9681dc4aa75..1f46c3b86f9f 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -41,6 +41,14 @@
#include "internal.h"
+bool is_vmalloc_addr(const void *x)
+{
+ unsigned long addr = (unsigned long)x;
+
+ return addr >= VMALLOC_START && addr < VMALLOC_END;
+}
+EXPORT_SYMBOL(is_vmalloc_addr);
+
struct vfree_deferred {
struct llist_head list;
struct work_struct wq;
@@ -1383,7 +1391,7 @@ static void free_unmap_vmap_area(struct vmap_area *va)
{
flush_cache_vunmap(va->va_start, va->va_end);
unmap_vmap_area(va);
- if (debug_pagealloc_enabled())
+ if (debug_pagealloc_enabled_static())
flush_tlb_kernel_range(va->va_start, va->va_end);
free_vmap_area_noflush(va);
@@ -1681,7 +1689,7 @@ static void vb_free(const void *addr, unsigned long size)
vunmap_page_range((unsigned long)addr, (unsigned long)addr + size);
- if (debug_pagealloc_enabled())
+ if (debug_pagealloc_enabled_static())
flush_tlb_kernel_range((unsigned long)addr,
(unsigned long)addr + size);
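The out-of-line helper keeps the usual consumer pattern working, as in kvfree(). A
minimal sketch (my_kvfree() is an illustrative name, not part of this patch):

	/* illustrative only */
	static void my_kvfree(const void *p)
	{
		if (is_vmalloc_addr(p))
			vfree(p);
		else
			kfree(p);
	}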
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 572fb17c6273..c05eb9efec07 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -146,20 +146,6 @@ struct scan_control {
struct reclaim_state reclaim_state;
};
-#ifdef ARCH_HAS_PREFETCH
-#define prefetch_prev_lru_page(_page, _base, _field) \
- do { \
- if ((_page)->lru.prev != _base) { \
- struct page *prev; \
- \
- prev = lru_to_page(&(_page->lru)); \
- prefetch(&prev->_field); \
- } \
- } while (0)
-#else
-#define prefetch_prev_lru_page(_page, _base, _field) do { } while (0)
-#endif
-
#ifdef ARCH_HAS_PREFETCHW
#define prefetchw_prev_lru_page(_page, _base, _field) \
do { \
@@ -2695,7 +2681,7 @@ static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
} while ((memcg = mem_cgroup_iter(target_memcg, memcg, NULL)));
}
-static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
+static void shrink_node(pg_data_t *pgdat, struct scan_control *sc)
{
struct reclaim_state *reclaim_state = current->reclaim_state;
unsigned long nr_reclaimed, nr_scanned;
@@ -2874,8 +2860,6 @@ again:
*/
if (reclaimable)
pgdat->kswapd_failures = 0;
-
- return reclaimable;
}
/*
@@ -4126,10 +4110,8 @@ module_init(kswapd_init)
*/
int node_reclaim_mode __read_mostly;
-#define RECLAIM_OFF 0
-#define RECLAIM_ZONE (1<<0) /* Run shrink_inactive_list on the zone */
-#define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */
-#define RECLAIM_UNMAP (1<<2) /* Unmap pages during reclaim */
+#define RECLAIM_WRITE (1<<0) /* Writeout pages during reclaim */
+#define RECLAIM_UNMAP (1<<1) /* Unmap pages during reclaim */
/*
* Priority for NODE_RECLAIM. This determines the fraction of pages
diff --git a/mm/zswap.c b/mm/zswap.c
index 46a322316e52..55094e63b72d 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -32,6 +32,7 @@
#include <linux/swapops.h>
#include <linux/writeback.h>
#include <linux/pagemap.h>
+#include <linux/workqueue.h>
/*********************************
* statistics
@@ -65,6 +66,11 @@ static u64 zswap_reject_kmemcache_fail;
/* Duplicate store was encountered (rare) */
static u64 zswap_duplicate_entry;
+/* Shrinker work queue */
+static struct workqueue_struct *shrink_wq;
+/* Pool limit was hit, we need to calm down */
+static bool zswap_pool_reached_full;
+
/*********************************
* tunables
**********************************/
@@ -109,6 +115,11 @@ module_param_cb(zpool, &zswap_zpool_param_ops, &zswap_zpool_type, 0644);
static unsigned int zswap_max_pool_percent = 20;
module_param_named(max_pool_percent, zswap_max_pool_percent, uint, 0644);
+/* The threshold for accepting new pages after the max_pool_percent was hit */
+static unsigned int zswap_accept_thr_percent = 90; /* of max pool size */
+module_param_named(accept_threshold_percent, zswap_accept_thr_percent,
+ uint, 0644);
+
/* Enable/disable handling same-value filled pages (enabled by default) */
static bool zswap_same_filled_pages_enabled = true;
module_param_named(same_filled_pages_enabled, zswap_same_filled_pages_enabled,
@@ -123,7 +134,8 @@ struct zswap_pool {
struct crypto_comp * __percpu *tfm;
struct kref kref;
struct list_head list;
- struct work_struct work;
+ struct work_struct release_work;
+ struct work_struct shrink_work;
struct hlist_node node;
char tfm_name[CRYPTO_MAX_ALG_NAME];
};
@@ -214,6 +226,13 @@ static bool zswap_is_full(void)
DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE);
}
+static bool zswap_can_accept(void)
+{
+ return totalram_pages() * zswap_accept_thr_percent / 100 *
+ zswap_max_pool_percent / 100 >
+ DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE);
+}
+
static void zswap_update_total_size(void)
{
struct zswap_pool *pool;
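Worked example of the two thresholds (illustrative numbers, not from the patch): on
a 16 GiB machine with the defaults max_pool_percent=20 and
accept_threshold_percent=90, zswap_is_full() trips once the pool reaches roughly
3.2 GiB, and zswap_can_accept() only lets stores resume after the shrinker has
pushed the pool back below roughly 2.88 GiB (90% of the 20% cap), so stores do not
flip-flop right at the limit.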
@@ -501,6 +520,16 @@ static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor)
return NULL;
}
+static void shrink_worker(struct work_struct *w)
+{
+ struct zswap_pool *pool = container_of(w, typeof(*pool),
+ shrink_work);
+
+ if (zpool_shrink(pool->zpool, 1, NULL))
+ zswap_reject_reclaim_fail++;
+ zswap_pool_put(pool);
+}
+
static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
{
struct zswap_pool *pool;
@@ -551,6 +580,7 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
*/
kref_init(&pool->kref);
INIT_LIST_HEAD(&pool->list);
+ INIT_WORK(&pool->shrink_work, shrink_worker);
zswap_pool_debug("created", pool);
@@ -624,7 +654,8 @@ static int __must_check zswap_pool_get(struct zswap_pool *pool)
static void __zswap_pool_release(struct work_struct *work)
{
- struct zswap_pool *pool = container_of(work, typeof(*pool), work);
+ struct zswap_pool *pool = container_of(work, typeof(*pool),
+ release_work);
synchronize_rcu();
@@ -647,8 +678,8 @@ static void __zswap_pool_empty(struct kref *kref)
list_del_rcu(&pool->list);
- INIT_WORK(&pool->work, __zswap_pool_release);
- schedule_work(&pool->work);
+ INIT_WORK(&pool->release_work, __zswap_pool_release);
+ schedule_work(&pool->release_work);
spin_unlock(&zswap_pools_lock);
}
@@ -942,22 +973,6 @@ end:
return ret;
}
-static int zswap_shrink(void)
-{
- struct zswap_pool *pool;
- int ret;
-
- pool = zswap_pool_last_get();
- if (!pool)
- return -ENOENT;
-
- ret = zpool_shrink(pool->zpool, 1, NULL);
-
- zswap_pool_put(pool);
-
- return ret;
-}
-
static int zswap_is_page_same_filled(void *ptr, unsigned long *value)
{
unsigned int pos;
@@ -1011,21 +1026,23 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
/* reclaim space if needed */
if (zswap_is_full()) {
+ struct zswap_pool *pool;
+
zswap_pool_limit_hit++;
- if (zswap_shrink()) {
- zswap_reject_reclaim_fail++;
- ret = -ENOMEM;
- goto reject;
- }
+ zswap_pool_reached_full = true;
+ pool = zswap_pool_last_get();
+ if (pool)
+ queue_work(shrink_wq, &pool->shrink_work);
+ ret = -ENOMEM;
+ goto reject;
+ }
- /* A second zswap_is_full() check after
- * zswap_shrink() to make sure it's now
- * under the max_pool_percent
- */
- if (zswap_is_full()) {
+ if (zswap_pool_reached_full) {
+ if (!zswap_can_accept()) {
ret = -ENOMEM;
goto reject;
- }
+ } else
+ zswap_pool_reached_full = false;
}
/* allocate entry */
@@ -1332,11 +1349,18 @@ static int __init init_zswap(void)
zswap_enabled = false;
}
+ shrink_wq = create_workqueue("zswap-shrink");
+ if (!shrink_wq)
+ goto fallback_fail;
+
frontswap_register_ops(&zswap_frontswap_ops);
if (zswap_debugfs_init())
pr_warn("debugfs initialization failed\n");
return 0;
+fallback_fail:
+ if (pool)
+ zswap_pool_destroy(pool);
hp_fail:
cpuhp_remove_state(CPUHP_MM_ZSWP_MEM_PREPARE);
dstmem_fail: