summaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig16
-rw-r--r--mm/Makefile2
-rw-r--r--mm/bootmem_info.c127
-rw-r--r--mm/compaction.c20
-rw-r--r--mm/debug_vm_pgtable.c109
-rw-r--r--mm/gup.c58
-rw-r--r--mm/hmm.c12
-rw-r--r--mm/huge_memory.c265
-rw-r--r--mm/hugetlb.c361
-rw-r--r--mm/hugetlb_vmemmap.c298
-rw-r--r--mm/hugetlb_vmemmap.h45
-rw-r--r--mm/internal.h29
-rw-r--r--mm/kfence/core.c4
-rw-r--r--mm/khugepaged.c20
-rw-r--r--mm/madvise.c66
-rw-r--r--mm/mapping_dirty_helpers.c2
-rw-r--r--mm/memblock.c28
-rw-r--r--mm/memcontrol.c4
-rw-r--r--mm/memory-failure.c38
-rw-r--r--mm/memory.c235
-rw-r--r--mm/memory_hotplug.c159
-rw-r--r--mm/mempolicy.c303
-rw-r--r--mm/migrate.c268
-rw-r--r--mm/mlock.c12
-rw-r--r--mm/mmap_lock.c59
-rw-r--r--mm/mprotect.c18
-rw-r--r--mm/nommu.c5
-rw-r--r--mm/oom_kill.c2
-rw-r--r--mm/page_alloc.c5
-rw-r--r--mm/page_vma_mapped.c15
-rw-r--r--mm/rmap.c628
-rw-r--r--mm/shmem.c123
-rw-r--r--mm/sparse-vmemmap.c354
-rw-r--r--mm/sparse.c1
-rw-r--r--mm/swap.c2
-rw-r--r--mm/swapfile.c2
-rw-r--r--mm/userfaultfd.c225
-rw-r--r--mm/util.c40
-rw-r--r--mm/vmalloc.c37
-rw-r--r--mm/vmscan.c20
-rw-r--r--mm/workingset.c10
-rw-r--r--mm/z3fold.c39
-rw-r--r--mm/zbud.c235
-rw-r--r--mm/zsmalloc.c3
-rw-r--r--mm/zswap.c26
45 files changed, 2919 insertions, 1411 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index ded98fb859ab..a02498c0e13d 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -96,6 +96,9 @@ config HAVE_FAST_GUP
depends on MMU
bool
+config HOLES_IN_ZONE
+ bool
+
# Don't discard allocated memory used to track "memory" and "reserved" memblocks
# after early boot, so it can still be used to test for validity of memory.
# Also, memblocks are updated with memory hot(un)plug.
@@ -671,6 +674,7 @@ config ZPOOL
config ZBUD
tristate "Low (Up to 2x) density storage for compressed pages"
+ depends on ZPOOL
help
A special purpose allocator for storing compressed pages.
It is designed to store up to two compressed pages per physical
@@ -757,6 +761,18 @@ config ARCH_HAS_CACHE_LINE_SIZE
config ARCH_HAS_PTE_DEVMAP
bool
+config ARCH_HAS_ZONE_DMA_SET
+ bool
+
+config ZONE_DMA
+ bool "Support DMA zone" if ARCH_HAS_ZONE_DMA_SET
+ default y if ARM64 || X86
+
+config ZONE_DMA32
+ bool "Support DMA32 zone" if ARCH_HAS_ZONE_DMA_SET
+ depends on !X86_32
+ default y if ARM64
+
config ZONE_DEVICE
bool "Device memory (pmem, HMM, etc...) hotplug support"
depends on MEMORY_HOTPLUG
diff --git a/mm/Makefile b/mm/Makefile
index bf71e295e9f6..74b47c354682 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -75,6 +75,7 @@ obj-$(CONFIG_FRONTSWAP) += frontswap.o
obj-$(CONFIG_ZSWAP) += zswap.o
obj-$(CONFIG_HAS_DMA) += dmapool.o
obj-$(CONFIG_HUGETLBFS) += hugetlb.o
+obj-$(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP) += hugetlb_vmemmap.o
obj-$(CONFIG_NUMA) += mempolicy.o
obj-$(CONFIG_SPARSEMEM) += sparse.o
obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o
@@ -125,3 +126,4 @@ obj-$(CONFIG_MAPPING_DIRTY_HELPERS) += mapping_dirty_helpers.o
obj-$(CONFIG_PTDUMP_CORE) += ptdump.o
obj-$(CONFIG_PAGE_REPORTING) += page_reporting.o
obj-$(CONFIG_IO_MAPPING) += io-mapping.o
+obj-$(CONFIG_HAVE_BOOTMEM_INFO_NODE) += bootmem_info.o
diff --git a/mm/bootmem_info.c b/mm/bootmem_info.c
new file mode 100644
index 000000000000..5b152dba7344
--- /dev/null
+++ b/mm/bootmem_info.c
@@ -0,0 +1,127 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Bootmem core functions.
+ *
+ * Copyright (c) 2020, Bytedance.
+ *
+ * Author: Muchun Song <songmuchun@bytedance.com>
+ *
+ */
+#include <linux/mm.h>
+#include <linux/compiler.h>
+#include <linux/memblock.h>
+#include <linux/bootmem_info.h>
+#include <linux/memory_hotplug.h>
+
+void get_page_bootmem(unsigned long info, struct page *page, unsigned long type)
+{
+ page->freelist = (void *)type;
+ SetPagePrivate(page);
+ set_page_private(page, info);
+ page_ref_inc(page);
+}
+
+void put_page_bootmem(struct page *page)
+{
+ unsigned long type;
+
+ type = (unsigned long) page->freelist;
+ BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE ||
+ type > MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE);
+
+ if (page_ref_dec_return(page) == 1) {
+ page->freelist = NULL;
+ ClearPagePrivate(page);
+ set_page_private(page, 0);
+ INIT_LIST_HEAD(&page->lru);
+ free_reserved_page(page);
+ }
+}
+
+#ifndef CONFIG_SPARSEMEM_VMEMMAP
+static void register_page_bootmem_info_section(unsigned long start_pfn)
+{
+ unsigned long mapsize, section_nr, i;
+ struct mem_section *ms;
+ struct page *page, *memmap;
+ struct mem_section_usage *usage;
+
+ section_nr = pfn_to_section_nr(start_pfn);
+ ms = __nr_to_section(section_nr);
+
+ /* Get section's memmap address */
+ memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);
+
+ /*
+ * Get page for the memmap's phys address
+ * XXX: need more consideration for sparse_vmemmap...
+ */
+ page = virt_to_page(memmap);
+ mapsize = sizeof(struct page) * PAGES_PER_SECTION;
+ mapsize = PAGE_ALIGN(mapsize) >> PAGE_SHIFT;
+
+ /* remember memmap's page */
+ for (i = 0; i < mapsize; i++, page++)
+ get_page_bootmem(section_nr, page, SECTION_INFO);
+
+ usage = ms->usage;
+ page = virt_to_page(usage);
+
+ mapsize = PAGE_ALIGN(mem_section_usage_size()) >> PAGE_SHIFT;
+
+ for (i = 0; i < mapsize; i++, page++)
+ get_page_bootmem(section_nr, page, MIX_SECTION_INFO);
+
+}
+#else /* CONFIG_SPARSEMEM_VMEMMAP */
+static void register_page_bootmem_info_section(unsigned long start_pfn)
+{
+ unsigned long mapsize, section_nr, i;
+ struct mem_section *ms;
+ struct page *page, *memmap;
+ struct mem_section_usage *usage;
+
+ section_nr = pfn_to_section_nr(start_pfn);
+ ms = __nr_to_section(section_nr);
+
+ memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);
+
+ register_page_bootmem_memmap(section_nr, memmap, PAGES_PER_SECTION);
+
+ usage = ms->usage;
+ page = virt_to_page(usage);
+
+ mapsize = PAGE_ALIGN(mem_section_usage_size()) >> PAGE_SHIFT;
+
+ for (i = 0; i < mapsize; i++, page++)
+ get_page_bootmem(section_nr, page, MIX_SECTION_INFO);
+}
+#endif /* !CONFIG_SPARSEMEM_VMEMMAP */
+
+void __init register_page_bootmem_info_node(struct pglist_data *pgdat)
+{
+ unsigned long i, pfn, end_pfn, nr_pages;
+ int node = pgdat->node_id;
+ struct page *page;
+
+ nr_pages = PAGE_ALIGN(sizeof(struct pglist_data)) >> PAGE_SHIFT;
+ page = virt_to_page(pgdat);
+
+ for (i = 0; i < nr_pages; i++, page++)
+ get_page_bootmem(node, page, NODE_INFO);
+
+ pfn = pgdat->node_start_pfn;
+ end_pfn = pgdat_end_pfn(pgdat);
+
+ /* register section info */
+ for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
+ /*
+ * Some platforms can assign the same pfn to multiple nodes - on
+ * node0 as well as nodeN. To avoid registering a pfn against
+ * multiple nodes we check that this pfn does not already
+ * reside in some other nodes.
+ */
+ if (pfn_valid(pfn) && (early_pfn_to_nid(pfn) == node))
+ register_page_bootmem_info_section(pfn);
+ }
+}
diff --git a/mm/compaction.c b/mm/compaction.c
index 3a509fbf2bea..621508e0ecd5 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1297,8 +1297,7 @@ move_freelist_head(struct list_head *freelist, struct page *freepage)
if (!list_is_last(freelist, &freepage->lru)) {
list_cut_before(&sublist, freelist, &freepage->lru);
- if (!list_empty(&sublist))
- list_splice_tail(&sublist, freelist);
+ list_splice_tail(&sublist, freelist);
}
}
@@ -1315,8 +1314,7 @@ move_freelist_tail(struct list_head *freelist, struct page *freepage)
if (!list_is_first(freelist, &freepage->lru)) {
list_cut_position(&sublist, freelist, &freepage->lru);
- if (!list_empty(&sublist))
- list_splice_tail(&sublist, freelist);
+ list_splice_tail(&sublist, freelist);
}
}
@@ -1380,7 +1378,7 @@ static int next_search_order(struct compact_control *cc, int order)
static unsigned long
fast_isolate_freepages(struct compact_control *cc)
{
- unsigned int limit = min(1U, freelist_scan_limit(cc) >> 1);
+ unsigned int limit = max(1U, freelist_scan_limit(cc) >> 1);
unsigned int nr_scanned = 0;
unsigned long low_pfn, min_pfn, highest = 0;
unsigned long nr_isolated = 0;
@@ -1492,11 +1490,11 @@ fast_isolate_freepages(struct compact_control *cc)
spin_unlock_irqrestore(&cc->zone->lock, flags);
/*
- * Smaller scan on next order so the total scan ig related
+ * Smaller scan on next order so the total scan is related
* to freelist_scan_limit.
*/
if (order_scanned >= limit)
- limit = min(1U, limit >> 1);
+ limit = max(1U, limit >> 1);
}
if (!page) {
@@ -2722,9 +2720,9 @@ int sysctl_compaction_handler(struct ctl_table *table, int write,
}
#if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA)
-static ssize_t sysfs_compact_node(struct device *dev,
- struct device_attribute *attr,
- const char *buf, size_t count)
+static ssize_t compact_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
{
int nid = dev->id;
@@ -2737,7 +2735,7 @@ static ssize_t sysfs_compact_node(struct device *dev,
return count;
}
-static DEVICE_ATTR(compact, 0200, NULL, sysfs_compact_node);
+static DEVICE_ATTR_WO(compact);
int compaction_register_node(struct node *node)
{
diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index 92bfc37300df..1c922691aa61 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -91,7 +91,7 @@ static void __init pte_advanced_tests(struct mm_struct *mm,
unsigned long pfn, unsigned long vaddr,
pgprot_t prot)
{
- pte_t pte = pfn_pte(pfn, prot);
+ pte_t pte;
/*
* Architectures optimize set_pte_at by avoiding TLB flush.
@@ -248,29 +248,6 @@ static void __init pmd_leaf_tests(unsigned long pfn, pgprot_t prot)
WARN_ON(!pmd_leaf(pmd));
}
-#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
-static void __init pmd_huge_tests(pmd_t *pmdp, unsigned long pfn, pgprot_t prot)
-{
- pmd_t pmd;
-
- if (!arch_vmap_pmd_supported(prot))
- return;
-
- pr_debug("Validating PMD huge\n");
- /*
- * X86 defined pmd_set_huge() verifies that the given
- * PMD is not a populated non-leaf entry.
- */
- WRITE_ONCE(*pmdp, __pmd(0));
- WARN_ON(!pmd_set_huge(pmdp, __pfn_to_phys(pfn), prot));
- WARN_ON(!pmd_clear_huge(pmdp));
- pmd = READ_ONCE(*pmdp);
- WARN_ON(!pmd_none(pmd));
-}
-#else /* CONFIG_HAVE_ARCH_HUGE_VMAP */
-static void __init pmd_huge_tests(pmd_t *pmdp, unsigned long pfn, pgprot_t prot) { }
-#endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */
-
static void __init pmd_savedwrite_tests(unsigned long pfn, pgprot_t prot)
{
pmd_t pmd;
@@ -395,30 +372,6 @@ static void __init pud_leaf_tests(unsigned long pfn, pgprot_t prot)
pud = pud_mkhuge(pud);
WARN_ON(!pud_leaf(pud));
}
-
-#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
-static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t prot)
-{
- pud_t pud;
-
- if (!arch_vmap_pud_supported(prot))
- return;
-
- pr_debug("Validating PUD huge\n");
- /*
- * X86 defined pud_set_huge() verifies that the given
- * PUD is not a populated non-leaf entry.
- */
- WRITE_ONCE(*pudp, __pud(0));
- WARN_ON(!pud_set_huge(pudp, __pfn_to_phys(pfn), prot));
- WARN_ON(!pud_clear_huge(pudp));
- pud = READ_ONCE(*pudp);
- WARN_ON(!pud_none(pud));
-}
-#else /* !CONFIG_HAVE_ARCH_HUGE_VMAP */
-static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t prot) { }
-#endif /* !CONFIG_HAVE_ARCH_HUGE_VMAP */
-
#else /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
static void __init pud_basic_tests(struct mm_struct *mm, unsigned long pfn, int idx) { }
static void __init pud_advanced_tests(struct mm_struct *mm,
@@ -428,9 +381,6 @@ static void __init pud_advanced_tests(struct mm_struct *mm,
{
}
static void __init pud_leaf_tests(unsigned long pfn, pgprot_t prot) { }
-static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t prot)
-{
-}
#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
#else /* !CONFIG_TRANSPARENT_HUGEPAGE */
static void __init pmd_basic_tests(unsigned long pfn, int idx) { }
@@ -449,14 +399,51 @@ static void __init pud_advanced_tests(struct mm_struct *mm,
}
static void __init pmd_leaf_tests(unsigned long pfn, pgprot_t prot) { }
static void __init pud_leaf_tests(unsigned long pfn, pgprot_t prot) { }
+static void __init pmd_savedwrite_tests(unsigned long pfn, pgprot_t prot) { }
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
+#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
static void __init pmd_huge_tests(pmd_t *pmdp, unsigned long pfn, pgprot_t prot)
{
+ pmd_t pmd;
+
+ if (!arch_vmap_pmd_supported(prot))
+ return;
+
+ pr_debug("Validating PMD huge\n");
+ /*
+ * X86 defined pmd_set_huge() verifies that the given
+ * PMD is not a populated non-leaf entry.
+ */
+ WRITE_ONCE(*pmdp, __pmd(0));
+ WARN_ON(!pmd_set_huge(pmdp, __pfn_to_phys(pfn), prot));
+ WARN_ON(!pmd_clear_huge(pmdp));
+ pmd = READ_ONCE(*pmdp);
+ WARN_ON(!pmd_none(pmd));
}
+
static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t prot)
{
+ pud_t pud;
+
+ if (!arch_vmap_pud_supported(prot))
+ return;
+
+ pr_debug("Validating PUD huge\n");
+ /*
+ * X86 defined pud_set_huge() verifies that the given
+ * PUD is not a populated non-leaf entry.
+ */
+ WRITE_ONCE(*pudp, __pud(0));
+ WARN_ON(!pud_set_huge(pudp, __pfn_to_phys(pfn), prot));
+ WARN_ON(!pud_clear_huge(pudp));
+ pud = READ_ONCE(*pudp);
+ WARN_ON(!pud_none(pud));
}
-static void __init pmd_savedwrite_tests(unsigned long pfn, pgprot_t prot) { }
-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+#else /* !CONFIG_HAVE_ARCH_HUGE_VMAP */
+static void __init pmd_huge_tests(pmd_t *pmdp, unsigned long pfn, pgprot_t prot) { }
+static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t prot) { }
+#endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */
static void __init p4d_basic_tests(unsigned long pfn, pgprot_t prot)
{
@@ -791,12 +778,12 @@ static void __init pmd_swap_soft_dirty_tests(unsigned long pfn, pgprot_t prot)
WARN_ON(!pmd_swp_soft_dirty(pmd_swp_mksoft_dirty(pmd)));
WARN_ON(pmd_swp_soft_dirty(pmd_swp_clear_soft_dirty(pmd)));
}
-#else /* !CONFIG_ARCH_HAS_PTE_DEVMAP */
+#else /* !CONFIG_TRANSPARENT_HUGEPAGE */
static void __init pmd_soft_dirty_tests(unsigned long pfn, pgprot_t prot) { }
static void __init pmd_swap_soft_dirty_tests(unsigned long pfn, pgprot_t prot)
{
}
-#endif /* CONFIG_ARCH_HAS_PTE_DEVMAP */
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
static void __init pte_swap_tests(unsigned long pfn, pgprot_t prot)
{
@@ -856,17 +843,17 @@ static void __init swap_migration_tests(void)
* locked, otherwise it stumbles upon a BUG_ON().
*/
__SetPageLocked(page);
- swp = make_migration_entry(page, 1);
+ swp = make_writable_migration_entry(page_to_pfn(page));
WARN_ON(!is_migration_entry(swp));
- WARN_ON(!is_write_migration_entry(swp));
+ WARN_ON(!is_writable_migration_entry(swp));
- make_migration_entry_read(&swp);
+ swp = make_readable_migration_entry(swp_offset(swp));
WARN_ON(!is_migration_entry(swp));
- WARN_ON(is_write_migration_entry(swp));
+ WARN_ON(is_writable_migration_entry(swp));
- swp = make_migration_entry(page, 0);
+ swp = make_readable_migration_entry(page_to_pfn(page));
WARN_ON(!is_migration_entry(swp));
- WARN_ON(is_write_migration_entry(swp));
+ WARN_ON(is_writable_migration_entry(swp));
__ClearPageLocked(page);
__free_page(page);
}
diff --git a/mm/gup.c b/mm/gup.c
index 8651309f8ec3..728d996767cb 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1501,6 +1501,64 @@ long populate_vma_page_range(struct vm_area_struct *vma,
}
/*
+ * faultin_vma_page_range() - populate (prefault) page tables inside the
+ * given VMA range readable/writable
+ *
+ * This takes care of mlocking the pages, too, if VM_LOCKED is set.
+ *
+ * @vma: target vma
+ * @start: start address
+ * @end: end address
+ * @write: whether to prefault readable or writable
+ * @locked: whether the mmap_lock is still held
+ *
+ * Returns either number of processed pages in the vma, or a negative error
+ * code on error (see __get_user_pages()).
+ *
+ * vma->vm_mm->mmap_lock must be held. The range must be page-aligned and
+ * covered by the VMA.
+ *
+ * If @locked is NULL, it may be held for read or write and will be unperturbed.
+ *
+ * If @locked is non-NULL, it must held for read only and may be released. If
+ * it's released, *@locked will be set to 0.
+ */
+long faultin_vma_page_range(struct vm_area_struct *vma, unsigned long start,
+ unsigned long end, bool write, int *locked)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned long nr_pages = (end - start) / PAGE_SIZE;
+ int gup_flags;
+
+ VM_BUG_ON(!PAGE_ALIGNED(start));
+ VM_BUG_ON(!PAGE_ALIGNED(end));
+ VM_BUG_ON_VMA(start < vma->vm_start, vma);
+ VM_BUG_ON_VMA(end > vma->vm_end, vma);
+ mmap_assert_locked(mm);
+
+ /*
+ * FOLL_TOUCH: Mark page accessed and thereby young; will also mark
+ * the page dirty with FOLL_WRITE -- which doesn't make a
+ * difference with !FOLL_FORCE, because the page is writable
+ * in the page table.
+ * FOLL_HWPOISON: Return -EHWPOISON instead of -EFAULT when we hit
+ * a poisoned page.
+ * FOLL_POPULATE: Always populate memory with VM_LOCKONFAULT.
+ * !FOLL_FORCE: Require proper access permissions.
+ */
+ gup_flags = FOLL_TOUCH | FOLL_POPULATE | FOLL_MLOCK | FOLL_HWPOISON;
+ if (write)
+ gup_flags |= FOLL_WRITE;
+
+ /*
+ * See check_vma_flags(): Will return -EFAULT on incompatible mappings
+ * or with insufficient permissions.
+ */
+ return __get_user_pages(mm, start, nr_pages, gup_flags,
+ NULL, NULL, locked);
+}
+
+/*
* __mm_populate - populate and/or mlock pages within a range of address space.
*
* This is used to implement mlock() and the MAP_POPULATE / MAP_LOCKED mmap
diff --git a/mm/hmm.c b/mm/hmm.c
index 943cb2ba4442..fad6be2bf072 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -26,6 +26,8 @@
#include <linux/mmu_notifier.h>
#include <linux/memory_hotplug.h>
+#include "internal.h"
+
struct hmm_vma_walk {
struct hmm_range *range;
unsigned long last;
@@ -214,7 +216,7 @@ static inline bool hmm_is_device_private_entry(struct hmm_range *range,
swp_entry_t entry)
{
return is_device_private_entry(entry) &&
- device_private_entry_to_page(entry)->pgmap->owner ==
+ pfn_swap_entry_to_page(entry)->pgmap->owner ==
range->dev_private_owner;
}
@@ -255,10 +257,9 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
*/
if (hmm_is_device_private_entry(range, entry)) {
cpu_flags = HMM_PFN_VALID;
- if (is_write_device_private_entry(entry))
+ if (is_writable_device_private_entry(entry))
cpu_flags |= HMM_PFN_WRITE;
- *hmm_pfn = device_private_entry_to_pfn(entry) |
- cpu_flags;
+ *hmm_pfn = swp_offset(entry) | cpu_flags;
return 0;
}
@@ -272,6 +273,9 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
if (!non_swap_entry(entry))
goto fault;
+ if (is_device_exclusive_entry(entry))
+ goto fault;
+
if (is_migration_entry(entry)) {
pte_unmap(ptep);
hmm_vma_walk->last = addr;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 6d2a0119fc58..8b731d53e9f4 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -64,7 +64,14 @@ static atomic_t huge_zero_refcount;
struct page *huge_zero_page __read_mostly;
unsigned long huge_zero_pfn __read_mostly = ~0UL;
-bool transparent_hugepage_enabled(struct vm_area_struct *vma)
+static inline bool file_thp_enabled(struct vm_area_struct *vma)
+{
+ return transhuge_vma_enabled(vma, vma->vm_flags) && vma->vm_file &&
+ !inode_is_open_for_write(vma->vm_file->f_inode) &&
+ (vma->vm_flags & VM_EXEC);
+}
+
+bool transparent_hugepage_active(struct vm_area_struct *vma)
{
/* The addr is used to check if the vma size fits */
unsigned long addr = (vma->vm_end & HPAGE_PMD_MASK) - HPAGE_PMD_SIZE;
@@ -75,6 +82,8 @@ bool transparent_hugepage_enabled(struct vm_area_struct *vma)
return __transparent_hugepage_enabled(vma);
if (vma_is_shmem(vma))
return shmem_huge_enabled(vma);
+ if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS))
+ return file_thp_enabled(vma);
return false;
}
@@ -1017,7 +1026,7 @@ struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
- struct vm_area_struct *vma)
+ struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
{
spinlock_t *dst_ptl, *src_ptl;
struct page *src_page;
@@ -1026,7 +1035,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
int ret = -ENOMEM;
/* Skip if can be re-fill on fault */
- if (!vma_is_anonymous(vma))
+ if (!vma_is_anonymous(dst_vma))
return 0;
pgtable = pte_alloc_one(dst_mm);
@@ -1040,29 +1049,26 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
ret = -EAGAIN;
pmd = *src_pmd;
- /*
- * Make sure the _PAGE_UFFD_WP bit is cleared if the new VMA
- * does not have the VM_UFFD_WP, which means that the uffd
- * fork event is not enabled.
- */
- if (!(vma->vm_flags & VM_UFFD_WP))
- pmd = pmd_clear_uffd_wp(pmd);
-
#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
if (unlikely(is_swap_pmd(pmd))) {
swp_entry_t entry = pmd_to_swp_entry(pmd);
VM_BUG_ON(!is_pmd_migration_entry(pmd));
- if (is_write_migration_entry(entry)) {
- make_migration_entry_read(&entry);
+ if (is_writable_migration_entry(entry)) {
+ entry = make_readable_migration_entry(
+ swp_offset(entry));
pmd = swp_entry_to_pmd(entry);
if (pmd_swp_soft_dirty(*src_pmd))
pmd = pmd_swp_mksoft_dirty(pmd);
+ if (pmd_swp_uffd_wp(*src_pmd))
+ pmd = pmd_swp_mkuffd_wp(pmd);
set_pmd_at(src_mm, addr, src_pmd, pmd);
}
add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
mm_inc_nr_ptes(dst_mm);
pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
+ if (!userfaultfd_wp(dst_vma))
+ pmd = pmd_swp_clear_uffd_wp(pmd);
set_pmd_at(dst_mm, addr, dst_pmd, pmd);
ret = 0;
goto out_unlock;
@@ -1079,17 +1085,13 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
* a page table.
*/
if (is_huge_zero_pmd(pmd)) {
- struct page *zero_page;
/*
* get_huge_zero_page() will never allocate a new page here,
* since we already have a zero page to copy. It just takes a
* reference.
*/
- zero_page = mm_get_huge_zero_page(dst_mm);
- set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd,
- zero_page);
- ret = 0;
- goto out_unlock;
+ mm_get_huge_zero_page(dst_mm);
+ goto out_zero_page;
}
src_page = pmd_page(pmd);
@@ -1102,21 +1104,23 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
* best effort that the pinned pages won't be replaced by another
* random page during the coming copy-on-write.
*/
- if (unlikely(page_needs_cow_for_dma(vma, src_page))) {
+ if (unlikely(page_needs_cow_for_dma(src_vma, src_page))) {
pte_free(dst_mm, pgtable);
spin_unlock(src_ptl);
spin_unlock(dst_ptl);
- __split_huge_pmd(vma, src_pmd, addr, false, NULL);
+ __split_huge_pmd(src_vma, src_pmd, addr, false, NULL);
return -EAGAIN;
}
get_page(src_page);
page_dup_rmap(src_page, true);
add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
+out_zero_page:
mm_inc_nr_ptes(dst_mm);
pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
-
pmdp_set_wrprotect(src_mm, addr, src_pmd);
+ if (!userfaultfd_wp(dst_vma))
+ pmd = pmd_clear_uffd_wp(pmd);
pmd = pmd_mkold(pmd_wrprotect(pmd));
set_pmd_at(dst_mm, addr, dst_pmd, pmd);
@@ -1254,11 +1258,12 @@ unlock:
}
#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
-void huge_pmd_set_accessed(struct vm_fault *vmf, pmd_t orig_pmd)
+void huge_pmd_set_accessed(struct vm_fault *vmf)
{
pmd_t entry;
unsigned long haddr;
bool write = vmf->flags & FAULT_FLAG_WRITE;
+ pmd_t orig_pmd = vmf->orig_pmd;
vmf->ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
if (unlikely(!pmd_same(*vmf->pmd, orig_pmd)))
@@ -1275,11 +1280,12 @@ unlock:
spin_unlock(vmf->ptl);
}
-vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
+vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
struct page *page;
unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
+ pmd_t orig_pmd = vmf->orig_pmd;
vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd);
VM_BUG_ON_VMA(!vma->anon_vma, vma);
@@ -1415,96 +1421,25 @@ out:
}
/* NUMA hinting page fault entry point for trans huge pmds */
-vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd)
+vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
- struct anon_vma *anon_vma = NULL;
+ pmd_t oldpmd = vmf->orig_pmd;
+ pmd_t pmd;
struct page *page;
unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
- int page_nid = NUMA_NO_NODE, this_nid = numa_node_id();
+ int page_nid = NUMA_NO_NODE;
int target_nid, last_cpupid = -1;
- bool page_locked;
bool migrated = false;
- bool was_writable;
+ bool was_writable = pmd_savedwrite(oldpmd);
int flags = 0;
vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
- if (unlikely(!pmd_same(pmd, *vmf->pmd)))
- goto out_unlock;
-
- /*
- * If there are potential migrations, wait for completion and retry
- * without disrupting NUMA hinting information. Do not relock and
- * check_same as the page may no longer be mapped.
- */
- if (unlikely(pmd_trans_migrating(*vmf->pmd))) {
- page = pmd_page(*vmf->pmd);
- if (!get_page_unless_zero(page))
- goto out_unlock;
+ if (unlikely(!pmd_same(oldpmd, *vmf->pmd))) {
spin_unlock(vmf->ptl);
- put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE);
goto out;
}
- page = pmd_page(pmd);
- BUG_ON(is_huge_zero_page(page));
- page_nid = page_to_nid(page);
- last_cpupid = page_cpupid_last(page);
- count_vm_numa_event(NUMA_HINT_FAULTS);
- if (page_nid == this_nid) {
- count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
- flags |= TNF_FAULT_LOCAL;
- }
-
- /* See similar comment in do_numa_page for explanation */
- if (!pmd_savedwrite(pmd))
- flags |= TNF_NO_GROUP;
-
- /*
- * Acquire the page lock to serialise THP migrations but avoid dropping
- * page_table_lock if at all possible
- */
- page_locked = trylock_page(page);
- target_nid = mpol_misplaced(page, vma, haddr);
- /* Migration could have started since the pmd_trans_migrating check */
- if (!page_locked) {
- page_nid = NUMA_NO_NODE;
- if (!get_page_unless_zero(page))
- goto out_unlock;
- spin_unlock(vmf->ptl);
- put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE);
- goto out;
- } else if (target_nid == NUMA_NO_NODE) {
- /* There are no parallel migrations and page is in the right
- * node. Clear the numa hinting info in this pmd.
- */
- goto clear_pmdnuma;
- }
-
- /*
- * Page is misplaced. Page lock serialises migrations. Acquire anon_vma
- * to serialises splits
- */
- get_page(page);
- spin_unlock(vmf->ptl);
- anon_vma = page_lock_anon_vma_read(page);
-
- /* Confirm the PMD did not change while page_table_lock was released */
- spin_lock(vmf->ptl);
- if (unlikely(!pmd_same(pmd, *vmf->pmd))) {
- unlock_page(page);
- put_page(page);
- page_nid = NUMA_NO_NODE;
- goto out_unlock;
- }
-
- /* Bail if we fail to protect against THP splits for any reason */
- if (unlikely(!anon_vma)) {
- put_page(page);
- page_nid = NUMA_NO_NODE;
- goto clear_pmdnuma;
- }
-
/*
* Since we took the NUMA fault, we must have observed the !accessible
* bit. Make sure all other CPUs agree with that, to avoid them
@@ -1531,43 +1466,58 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd)
haddr + HPAGE_PMD_SIZE);
}
- /*
- * Migrate the THP to the requested node, returns with page unlocked
- * and access rights restored.
- */
+ pmd = pmd_modify(oldpmd, vma->vm_page_prot);
+ page = vm_normal_page_pmd(vma, haddr, pmd);
+ if (!page)
+ goto out_map;
+
+ /* See similar comment in do_numa_page for explanation */
+ if (!was_writable)
+ flags |= TNF_NO_GROUP;
+
+ page_nid = page_to_nid(page);
+ last_cpupid = page_cpupid_last(page);
+ target_nid = numa_migrate_prep(page, vma, haddr, page_nid,
+ &flags);
+
+ if (target_nid == NUMA_NO_NODE) {
+ put_page(page);
+ goto out_map;
+ }
+
spin_unlock(vmf->ptl);
- migrated = migrate_misplaced_transhuge_page(vma->vm_mm, vma,
- vmf->pmd, pmd, vmf->address, page, target_nid);
+ migrated = migrate_misplaced_page(page, vma, target_nid);
if (migrated) {
flags |= TNF_MIGRATED;
page_nid = target_nid;
- } else
+ } else {
flags |= TNF_MIGRATE_FAIL;
-
- goto out;
-clear_pmdnuma:
- BUG_ON(!PageLocked(page));
- was_writable = pmd_savedwrite(pmd);
- pmd = pmd_modify(pmd, vma->vm_page_prot);
- pmd = pmd_mkyoung(pmd);
- if (was_writable)
- pmd = pmd_mkwrite(pmd);
- set_pmd_at(vma->vm_mm, haddr, vmf->pmd, pmd);
- update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
- unlock_page(page);
-out_unlock:
- spin_unlock(vmf->ptl);
+ vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
+ if (unlikely(!pmd_same(oldpmd, *vmf->pmd))) {
+ spin_unlock(vmf->ptl);
+ goto out;
+ }
+ goto out_map;
+ }
out:
- if (anon_vma)
- page_unlock_anon_vma_read(anon_vma);
-
if (page_nid != NUMA_NO_NODE)
task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR,
flags);
return 0;
+
+out_map:
+ /* Restore the PMD */
+ pmd = pmd_modify(oldpmd, vma->vm_page_prot);
+ pmd = pmd_mkyoung(pmd);
+ if (was_writable)
+ pmd = pmd_mkwrite(pmd);
+ set_pmd_at(vma->vm_mm, haddr, vmf->pmd, pmd);
+ update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
+ spin_unlock(vmf->ptl);
+ goto out;
}
/*
@@ -1604,7 +1554,7 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
* If other processes are mapping this page, we couldn't discard
* the page unless they all do MADV_FREE so let's skip the page.
*/
- if (page_mapcount(page) != 1)
+ if (total_mapcount(page) != 1)
goto out;
if (!trylock_page(page))
@@ -1677,12 +1627,9 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
if (arch_needs_pgtable_deposit())
zap_deposited_table(tlb->mm, pmd);
spin_unlock(ptl);
- if (is_huge_zero_pmd(orig_pmd))
- tlb_remove_page_size(tlb, pmd_page(orig_pmd), HPAGE_PMD_SIZE);
} else if (is_huge_zero_pmd(orig_pmd)) {
zap_deposited_table(tlb->mm, pmd);
spin_unlock(ptl);
- tlb_remove_page_size(tlb, pmd_page(orig_pmd), HPAGE_PMD_SIZE);
} else {
struct page *page = NULL;
int flush_needed = 1;
@@ -1697,7 +1644,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
VM_BUG_ON(!is_pmd_migration_entry(orig_pmd));
entry = pmd_to_swp_entry(orig_pmd);
- page = migration_entry_to_page(entry);
+ page = pfn_swap_entry_to_page(entry);
flush_needed = 0;
} else
WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!");
@@ -1796,6 +1743,7 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
* Returns
* - 0 if PMD could not be locked
* - 1 if PMD was locked but protections unchanged and TLB flush unnecessary
+ * or if prot_numa but THP migration is not supported
* - HPAGE_PMD_NR if protections changed and TLB flush necessary
*/
int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
@@ -1810,6 +1758,9 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
+ if (prot_numa && !thp_migration_supported())
+ return 1;
+
ptl = __pmd_trans_huge_lock(pmd, vma);
if (!ptl)
return 0;
@@ -1822,16 +1773,19 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
swp_entry_t entry = pmd_to_swp_entry(*pmd);
VM_BUG_ON(!is_pmd_migration_entry(*pmd));
- if (is_write_migration_entry(entry)) {
+ if (is_writable_migration_entry(entry)) {
pmd_t newpmd;
/*
* A protection check is difficult so
* just be safe and disable write
*/
- make_migration_entry_read(&entry);
+ entry = make_readable_migration_entry(
+ swp_offset(entry));
newpmd = swp_entry_to_pmd(entry);
if (pmd_swp_soft_dirty(*pmd))
newpmd = pmd_swp_mksoft_dirty(newpmd);
+ if (pmd_swp_uffd_wp(*pmd))
+ newpmd = pmd_swp_mkuffd_wp(newpmd);
set_pmd_at(mm, addr, pmd, newpmd);
}
goto unlock;
@@ -2060,7 +2014,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
swp_entry_t entry;
entry = pmd_to_swp_entry(old_pmd);
- page = migration_entry_to_page(entry);
+ page = pfn_swap_entry_to_page(entry);
} else {
page = pmd_page(old_pmd);
if (!PageDirty(page) && pmd_dirty(old_pmd))
@@ -2114,8 +2068,8 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
swp_entry_t entry;
entry = pmd_to_swp_entry(old_pmd);
- page = migration_entry_to_page(entry);
- write = is_write_migration_entry(entry);
+ page = pfn_swap_entry_to_page(entry);
+ write = is_writable_migration_entry(entry);
young = false;
soft_dirty = pmd_swp_soft_dirty(old_pmd);
uffd_wp = pmd_swp_uffd_wp(old_pmd);
@@ -2147,7 +2101,12 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
*/
if (freeze || pmd_migration) {
swp_entry_t swp_entry;
- swp_entry = make_migration_entry(page + i, write);
+ if (write)
+ swp_entry = make_writable_migration_entry(
+ page_to_pfn(page + i));
+ else
+ swp_entry = make_readable_migration_entry(
+ page_to_pfn(page + i));
entry = swp_entry_to_pte(swp_entry);
if (soft_dirty)
entry = pte_swp_mksoft_dirty(entry);
@@ -2350,15 +2309,20 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma,
static void unmap_page(struct page *page)
{
- enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK | TTU_SYNC |
- TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD;
+ enum ttu_flags ttu_flags = TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD |
+ TTU_SYNC;
VM_BUG_ON_PAGE(!PageHead(page), page);
+ /*
+ * Anon pages need migration entries to preserve them, but file
+ * pages can simply be left unmapped, then faulted back on demand.
+ * If that is ever changed (perhaps for mlock), update remap_page().
+ */
if (PageAnon(page))
- ttu_flags |= TTU_SPLIT_FREEZE;
-
- try_to_unmap(page, ttu_flags);
+ try_to_migrate(page, ttu_flags);
+ else
+ try_to_unmap(page, ttu_flags | TTU_IGNORE_MLOCK);
VM_WARN_ON_ONCE_PAGE(page_mapped(page), page);
}
@@ -2366,6 +2330,10 @@ static void unmap_page(struct page *page)
static void remap_page(struct page *page, unsigned int nr)
{
int i;
+
+ /* If TTU_SPLIT_FREEZE is ever extended to file, remove this check */
+ if (!PageAnon(page))
+ return;
if (PageTransHuge(page)) {
remove_migration_ptes(page, page, true);
} else {
@@ -2870,7 +2838,7 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
/* Take pin on all head pages to avoid freeing them under us */
list_for_each_safe(pos, next, &ds_queue->split_queue) {
- page = list_entry((void *)pos, struct page, mapping);
+ page = list_entry((void *)pos, struct page, deferred_list);
page = compound_head(page);
if (get_page_unless_zero(page)) {
list_move(page_deferred_list(page), &list);
@@ -2885,7 +2853,7 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
list_for_each_safe(pos, next, &list) {
- page = list_entry((void *)pos, struct page, mapping);
+ page = list_entry((void *)pos, struct page, deferred_list);
if (!trylock_page(page))
goto next;
/* split_huge_page() removes page from list on success */
@@ -3144,7 +3112,7 @@ static ssize_t split_huge_pages_write(struct file *file, const char __user *buf,
tok = strsep(&buf, ",");
if (tok) {
- strncpy(file_path, tok, MAX_INPUT_BUF_SZ);
+ strcpy(file_path, tok);
} else {
ret = -EINVAL;
goto out;
@@ -3214,7 +3182,10 @@ void set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
pmdval = pmdp_invalidate(vma, address, pvmw->pmd);
if (pmd_dirty(pmdval))
set_page_dirty(page);
- entry = make_migration_entry(page, pmd_write(pmdval));
+ if (pmd_write(pmdval))
+ entry = make_writable_migration_entry(page_to_pfn(page));
+ else
+ entry = make_readable_migration_entry(page_to_pfn(page));
pmdswp = swp_entry_to_pmd(entry);
if (pmd_soft_dirty(pmdval))
pmdswp = pmd_swp_mksoft_dirty(pmdswp);
@@ -3240,8 +3211,10 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)
pmde = pmd_mkold(mk_huge_pmd(new, vma->vm_page_prot));
if (pmd_swp_soft_dirty(*pvmw->pmd))
pmde = pmd_mksoft_dirty(pmde);
- if (is_write_migration_entry(entry))
+ if (is_writable_migration_entry(entry))
pmde = maybe_pmd_mkwrite(pmde, vma);
+ if (pmd_swp_uffd_wp(*pvmw->pmd))
+ pmde = pmd_wrprotect(pmd_mkuffd_wp(pmde));
flush_cache_range(vma, mmun_start, mmun_start + HPAGE_PMD_SIZE);
if (PageAnon(new))
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 103f1187043f..924553aa8f78 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -30,6 +30,7 @@
#include <linux/numa.h>
#include <linux/llist.h>
#include <linux/cma.h>
+#include <linux/migrate.h>
#include <asm/page.h>
#include <asm/pgalloc.h>
@@ -41,6 +42,7 @@
#include <linux/node.h>
#include <linux/page_owner.h>
#include "internal.h"
+#include "hugetlb_vmemmap.h"
int hugetlb_max_hstate __read_mostly;
unsigned int default_hstate_idx;
@@ -1318,8 +1320,6 @@ static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
return alloc_contig_pages(nr_pages, gfp_mask, nid, nodemask);
}
-static void prep_new_huge_page(struct hstate *h, struct page *page, int nid);
-static void prep_compound_gigantic_page(struct page *page, unsigned int order);
#else /* !CONFIG_CONTIG_ALLOC */
static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
int nid, nodemask_t *nodemask)
@@ -1375,7 +1375,40 @@ static void remove_hugetlb_page(struct hstate *h, struct page *page,
h->nr_huge_pages_node[nid]--;
}
-static void update_and_free_page(struct hstate *h, struct page *page)
+static void add_hugetlb_page(struct hstate *h, struct page *page,
+ bool adjust_surplus)
+{
+ int zeroed;
+ int nid = page_to_nid(page);
+
+ VM_BUG_ON_PAGE(!HPageVmemmapOptimized(page), page);
+
+ lockdep_assert_held(&hugetlb_lock);
+
+ INIT_LIST_HEAD(&page->lru);
+ h->nr_huge_pages++;
+ h->nr_huge_pages_node[nid]++;
+
+ if (adjust_surplus) {
+ h->surplus_huge_pages++;
+ h->surplus_huge_pages_node[nid]++;
+ }
+
+ set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
+ set_page_private(page, 0);
+ SetHPageVmemmapOptimized(page);
+
+ /*
+ * This page is now managed by the hugetlb allocator and has
+ * no users -- drop the last reference.
+ */
+ zeroed = put_page_testzero(page);
+ VM_BUG_ON_PAGE(!zeroed, page);
+ arch_clear_hugepage_flags(page);
+ enqueue_huge_page(h, page);
+}
+
+static void __update_and_free_page(struct hstate *h, struct page *page)
{
int i;
struct page *subpage = page;
@@ -1383,6 +1416,18 @@ static void update_and_free_page(struct hstate *h, struct page *page)
if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
return;
+ if (alloc_huge_page_vmemmap(h, page)) {
+ spin_lock_irq(&hugetlb_lock);
+ /*
+ * If we cannot allocate vmemmap pages, just refuse to free the
+ * page and put the page back on the hugetlb free list and treat
+ * as a surplus page.
+ */
+ add_hugetlb_page(h, page, true);
+ spin_unlock_irq(&hugetlb_lock);
+ return;
+ }
+
for (i = 0; i < pages_per_huge_page(h);
i++, subpage = mem_map_next(subpage, page, i)) {
subpage->flags &= ~(1 << PG_locked | 1 << PG_error |
@@ -1398,12 +1443,79 @@ static void update_and_free_page(struct hstate *h, struct page *page)
}
}
+/*
+ * As update_and_free_page() can be called under any context, so we cannot
+ * use GFP_KERNEL to allocate vmemmap pages. However, we can defer the
+ * actual freeing in a workqueue to prevent from using GFP_ATOMIC to allocate
+ * the vmemmap pages.
+ *
+ * free_hpage_workfn() locklessly retrieves the linked list of pages to be
+ * freed and frees them one-by-one. As the page->mapping pointer is going
+ * to be cleared in free_hpage_workfn() anyway, it is reused as the llist_node
+ * structure of a lockless linked list of huge pages to be freed.
+ */
+static LLIST_HEAD(hpage_freelist);
+
+static void free_hpage_workfn(struct work_struct *work)
+{
+ struct llist_node *node;
+
+ node = llist_del_all(&hpage_freelist);
+
+ while (node) {
+ struct page *page;
+ struct hstate *h;
+
+ page = container_of((struct address_space **)node,
+ struct page, mapping);
+ node = node->next;
+ page->mapping = NULL;
+ /*
+ * The VM_BUG_ON_PAGE(!PageHuge(page), page) in page_hstate()
+ * is going to trigger because a previous call to
+ * remove_hugetlb_page() will set_compound_page_dtor(page,
+ * NULL_COMPOUND_DTOR), so do not use page_hstate() directly.
+ */
+ h = size_to_hstate(page_size(page));
+
+ __update_and_free_page(h, page);
+
+ cond_resched();
+ }
+}
+static DECLARE_WORK(free_hpage_work, free_hpage_workfn);
+
+static inline void flush_free_hpage_work(struct hstate *h)
+{
+ if (free_vmemmap_pages_per_hpage(h))
+ flush_work(&free_hpage_work);
+}
+
+static void update_and_free_page(struct hstate *h, struct page *page,
+ bool atomic)
+{
+ if (!HPageVmemmapOptimized(page) || !atomic) {
+ __update_and_free_page(h, page);
+ return;
+ }
+
+ /*
+ * Defer freeing to avoid using GFP_ATOMIC to allocate vmemmap pages.
+ *
+ * Only call schedule_work() if hpage_freelist is previously
+ * empty. Otherwise, schedule_work() had been called but the workfn
+ * hasn't retrieved the list yet.
+ */
+ if (llist_add((struct llist_node *)&page->mapping, &hpage_freelist))
+ schedule_work(&free_hpage_work);
+}
+
static void update_and_free_pages_bulk(struct hstate *h, struct list_head *list)
{
struct page *page, *t_page;
list_for_each_entry_safe(page, t_page, list, lru) {
- update_and_free_page(h, page);
+ update_and_free_page(h, page, false);
cond_resched();
}
}
@@ -1470,12 +1582,12 @@ void free_huge_page(struct page *page)
if (HPageTemporary(page)) {
remove_hugetlb_page(h, page, false);
spin_unlock_irqrestore(&hugetlb_lock, flags);
- update_and_free_page(h, page);
+ update_and_free_page(h, page, true);
} else if (h->surplus_huge_pages_node[nid]) {
/* remove the page from active list */
remove_hugetlb_page(h, page, true);
spin_unlock_irqrestore(&hugetlb_lock, flags);
- update_and_free_page(h, page);
+ update_and_free_page(h, page, true);
} else {
arch_clear_hugepage_flags(page);
enqueue_huge_page(h, page);
@@ -1493,8 +1605,9 @@ static void __prep_account_new_huge_page(struct hstate *h, int nid)
h->nr_huge_pages_node[nid]++;
}
-static void __prep_new_huge_page(struct page *page)
+static void __prep_new_huge_page(struct hstate *h, struct page *page)
{
+ free_huge_page_vmemmap(h, page);
INIT_LIST_HEAD(&page->lru);
set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
hugetlb_set_page_subpool(page, NULL);
@@ -1504,15 +1617,15 @@ static void __prep_new_huge_page(struct page *page)
static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
{
- __prep_new_huge_page(page);
+ __prep_new_huge_page(h, page);
spin_lock_irq(&hugetlb_lock);
__prep_account_new_huge_page(h, nid);
spin_unlock_irq(&hugetlb_lock);
}
-static void prep_compound_gigantic_page(struct page *page, unsigned int order)
+static bool prep_compound_gigantic_page(struct page *page, unsigned int order)
{
- int i;
+ int i, j;
int nr_pages = 1 << order;
struct page *p = page + 1;
@@ -1534,11 +1647,48 @@ static void prep_compound_gigantic_page(struct page *page, unsigned int order)
* after get_user_pages().
*/
__ClearPageReserved(p);
+ /*
+ * Subtle and very unlikely
+ *
+ * Gigantic 'page allocators' such as memblock or cma will
+ * return a set of pages with each page ref counted. We need
+ * to turn this set of pages into a compound page with tail
+ * page ref counts set to zero. Code such as speculative page
+ * cache adding could take a ref on a 'to be' tail page.
+ * We need to respect any increased ref count, and only set
+ * the ref count to zero if count is currently 1. If count
+ * is not 1, we call synchronize_rcu in the hope that a rcu
+ * grace period will cause ref count to drop and then retry.
+ * If count is still inflated on retry we return an error and
+ * must discard the pages.
+ */
+ if (!page_ref_freeze(p, 1)) {
+ pr_info("HugeTLB unexpected inflated ref count on freshly allocated page\n");
+ synchronize_rcu();
+ if (!page_ref_freeze(p, 1))
+ goto out_error;
+ }
set_page_count(p, 0);
set_compound_head(p, page);
}
atomic_set(compound_mapcount_ptr(page), -1);
atomic_set(compound_pincount_ptr(page), 0);
+ return true;
+
+out_error:
+ /* undo tail page modifications made above */
+ p = page + 1;
+ for (j = 1; j < i; j++, p = mem_map_next(p, page, j)) {
+ clear_compound_head(p);
+ set_page_refcounted(p);
+ }
+ /* need to clear PG_reserved on remaining tail pages */
+ for (; j < nr_pages; j++, p = mem_map_next(p, page, j))
+ __ClearPageReserved(p);
+ set_compound_order(page, 0);
+ page[1].compound_nr = 0;
+ __ClearPageHead(page);
+ return false;
}
/*
@@ -1658,7 +1808,9 @@ static struct page *alloc_fresh_huge_page(struct hstate *h,
nodemask_t *node_alloc_noretry)
{
struct page *page;
+ bool retry = false;
+retry:
if (hstate_is_gigantic(h))
page = alloc_gigantic_page(h, gfp_mask, nid, nmask);
else
@@ -1667,8 +1819,21 @@ static struct page *alloc_fresh_huge_page(struct hstate *h,
if (!page)
return NULL;
- if (hstate_is_gigantic(h))
- prep_compound_gigantic_page(page, huge_page_order(h));
+ if (hstate_is_gigantic(h)) {
+ if (!prep_compound_gigantic_page(page, huge_page_order(h))) {
+ /*
+ * Rare failure to convert pages to compound page.
+ * Free pages and try again - ONCE!
+ */
+ free_gigantic_page(page, huge_page_order(h));
+ if (!retry) {
+ retry = true;
+ goto retry;
+ }
+ pr_warn("HugeTLB page can not be used due to unexpected inflated ref count\n");
+ return NULL;
+ }
+ }
prep_new_huge_page(h, page, page_to_nid(page));
return page;
@@ -1737,10 +1902,14 @@ static struct page *remove_pool_huge_page(struct hstate *h,
* nothing for in-use hugepages and non-hugepages.
* This function returns values like below:
*
- * -EBUSY: failed to dissolved free hugepages or the hugepage is in-use
- * (allocated or reserved.)
- * 0: successfully dissolved free hugepages or the page is not a
- * hugepage (considered as already dissolved)
+ * -ENOMEM: failed to allocate vmemmap pages to free the freed hugepages
+ * when the system is under memory pressure and the feature of
+ * freeing unused vmemmap pages associated with each hugetlb page
+ * is enabled.
+ * -EBUSY: failed to dissolved free hugepages or the hugepage is in-use
+ * (allocated or reserved.)
+ * 0: successfully dissolved free hugepages or the page is not a
+ * hugepage (considered as already dissolved)
*/
int dissolve_free_huge_page(struct page *page)
{
@@ -1782,19 +1951,38 @@ retry:
goto retry;
}
- /*
- * Move PageHWPoison flag from head page to the raw error page,
- * which makes any subpages rather than the error page reusable.
- */
- if (PageHWPoison(head) && page != head) {
- SetPageHWPoison(page);
- ClearPageHWPoison(head);
- }
remove_hugetlb_page(h, head, false);
h->max_huge_pages--;
spin_unlock_irq(&hugetlb_lock);
- update_and_free_page(h, head);
- return 0;
+
+ /*
+ * Normally update_and_free_page will allocate required vmemmmap
+ * before freeing the page. update_and_free_page will fail to
+ * free the page if it can not allocate required vmemmap. We
+ * need to adjust max_huge_pages if the page is not freed.
+ * Attempt to allocate vmemmmap here so that we can take
+ * appropriate action on failure.
+ */
+ rc = alloc_huge_page_vmemmap(h, head);
+ if (!rc) {
+ /*
+ * Move PageHWPoison flag from head page to the raw
+ * error page, which makes any subpages rather than
+ * the error page reusable.
+ */
+ if (PageHWPoison(head) && page != head) {
+ SetPageHWPoison(page);
+ ClearPageHWPoison(head);
+ }
+ update_and_free_page(h, head, false);
+ } else {
+ spin_lock_irq(&hugetlb_lock);
+ add_hugetlb_page(h, head, false);
+ h->max_huge_pages++;
+ spin_unlock_irq(&hugetlb_lock);
+ }
+
+ return rc;
}
out:
spin_unlock_irq(&hugetlb_lock);
@@ -2351,14 +2539,15 @@ static int alloc_and_dissolve_huge_page(struct hstate *h, struct page *old_page,
/*
* Before dissolving the page, we need to allocate a new one for the
- * pool to remain stable. Using alloc_buddy_huge_page() allows us to
- * not having to deal with prep_new_huge_page() and avoids dealing of any
- * counters. This simplifies and let us do the whole thing under the
- * lock.
+ * pool to remain stable. Here, we allocate the page and 'prep' it
+ * by doing everything but actually updating counters and adding to
+ * the pool. This simplifies and let us do most of the processing
+ * under the lock.
*/
new_page = alloc_buddy_huge_page(h, gfp_mask, nid, NULL, NULL);
if (!new_page)
return -ENOMEM;
+ __prep_new_huge_page(h, new_page);
retry:
spin_lock_irq(&hugetlb_lock);
@@ -2397,14 +2586,9 @@ retry:
remove_hugetlb_page(h, old_page, false);
/*
- * new_page needs to be initialized with the standard hugetlb
- * state. This is normally done by prep_new_huge_page() but
- * that takes hugetlb_lock which is already held so we need to
- * open code it here.
* Reference count trick is needed because allocator gives us
* referenced page but the pool requires pages with 0 refcount.
*/
- __prep_new_huge_page(new_page);
__prep_account_new_huge_page(h, nid);
page_ref_dec(new_page);
enqueue_huge_page(h, new_page);
@@ -2413,14 +2597,14 @@ retry:
* Pages have been replaced, we can safely free the old one.
*/
spin_unlock_irq(&hugetlb_lock);
- update_and_free_page(h, old_page);
+ update_and_free_page(h, old_page, false);
}
return ret;
free_new:
spin_unlock_irq(&hugetlb_lock);
- __free_pages(new_page, huge_page_order(h));
+ update_and_free_page(h, new_page, false);
return ret;
}
@@ -2625,16 +2809,10 @@ found:
return 1;
}
-static void __init prep_compound_huge_page(struct page *page,
- unsigned int order)
-{
- if (unlikely(order > (MAX_ORDER - 1)))
- prep_compound_gigantic_page(page, order);
- else
- prep_compound_page(page, order);
-}
-
-/* Put bootmem huge pages into the standard lists after mem_map is up */
+/*
+ * Put bootmem huge pages into the standard lists after mem_map is up.
+ * Note: This only applies to gigantic (order > MAX_ORDER) pages.
+ */
static void __init gather_bootmem_prealloc(void)
{
struct huge_bootmem_page *m;
@@ -2643,20 +2821,23 @@ static void __init gather_bootmem_prealloc(void)
struct page *page = virt_to_page(m);
struct hstate *h = m->hstate;
+ VM_BUG_ON(!hstate_is_gigantic(h));
WARN_ON(page_count(page) != 1);
- prep_compound_huge_page(page, huge_page_order(h));
- WARN_ON(PageReserved(page));
- prep_new_huge_page(h, page, page_to_nid(page));
- put_page(page); /* free it into the hugepage allocator */
+ if (prep_compound_gigantic_page(page, huge_page_order(h))) {
+ WARN_ON(PageReserved(page));
+ prep_new_huge_page(h, page, page_to_nid(page));
+ put_page(page); /* add to the hugepage allocator */
+ } else {
+ free_gigantic_page(page, huge_page_order(h));
+ pr_warn("HugeTLB page can not be used due to unexpected inflated ref count\n");
+ }
/*
- * If we had gigantic hugepages allocated at boot time, we need
- * to restore the 'stolen' pages to totalram_pages in order to
- * fix confusing memory reports from free(1) and another
- * side-effects, like CommitLimit going negative.
+ * We need to restore the 'stolen' pages to totalram_pages
+ * in order to fix confusing memory reports from free(1) and
+ * other side-effects, like CommitLimit going negative.
*/
- if (hstate_is_gigantic(h))
- adjust_managed_page_count(page, pages_per_huge_page(h));
+ adjust_managed_page_count(page, pages_per_huge_page(h));
cond_resched();
}
}
@@ -2834,6 +3015,7 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
* pages in hstate via the proc/sysfs interfaces.
*/
mutex_lock(&h->resize_lock);
+ flush_free_hpage_work(h);
spin_lock_irq(&hugetlb_lock);
/*
@@ -2943,6 +3125,7 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
/* free the pages after dropping lock */
spin_unlock_irq(&hugetlb_lock);
update_and_free_pages_bulk(h, &page_list);
+ flush_free_hpage_work(h);
spin_lock_irq(&hugetlb_lock);
while (count < persistent_huge_pages(h)) {
@@ -3450,6 +3633,7 @@ void __init hugetlb_add_hstate(unsigned int order)
h->next_nid_to_free = first_memory_node;
snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
huge_page_size(h)/1024);
+ hugetlb_vmemmap_init(h);
parsed_hstate = h;
}
@@ -3924,6 +4108,7 @@ static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
int writable)
{
pte_t entry;
+ unsigned int shift = huge_page_shift(hstate_vma(vma));
if (writable) {
entry = huge_pte_mkwrite(huge_pte_mkdirty(mk_huge_pte(page,
@@ -3934,7 +4119,7 @@ static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
}
entry = pte_mkyoung(entry);
entry = pte_mkhuge(entry);
- entry = arch_make_huge_pte(entry, vma, page, writable);
+ entry = arch_make_huge_pte(entry, shift, vma->vm_flags);
return entry;
}
@@ -4057,12 +4242,13 @@ again:
is_hugetlb_entry_hwpoisoned(entry))) {
swp_entry_t swp_entry = pte_to_swp_entry(entry);
- if (is_write_migration_entry(swp_entry) && cow) {
+ if (is_writable_migration_entry(swp_entry) && cow) {
/*
* COW mappings require pages in both
* parent and child to be set to read.
*/
- make_migration_entry_read(&swp_entry);
+ swp_entry = make_readable_migration_entry(
+ swp_offset(swp_entry));
entry = swp_entry_to_pte(swp_entry);
set_huge_swap_pte_at(src, addr, src_pte,
entry, sz);
@@ -4939,20 +5125,17 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
struct page **pagep)
{
bool is_continue = (mode == MCOPY_ATOMIC_CONTINUE);
- struct address_space *mapping;
- pgoff_t idx;
+ struct hstate *h = hstate_vma(dst_vma);
+ struct address_space *mapping = dst_vma->vm_file->f_mapping;
+ pgoff_t idx = vma_hugecache_offset(h, dst_vma, dst_addr);
unsigned long size;
int vm_shared = dst_vma->vm_flags & VM_SHARED;
- struct hstate *h = hstate_vma(dst_vma);
pte_t _dst_pte;
spinlock_t *ptl;
- int ret;
+ int ret = -ENOMEM;
struct page *page;
int writable;
- mapping = dst_vma->vm_file->f_mapping;
- idx = vma_hugecache_offset(h, dst_vma, dst_addr);
-
if (is_continue) {
ret = -EFAULT;
page = find_lock_page(mapping, idx);
@@ -4981,12 +5164,44 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
/* fallback to copy_from_user outside mmap_lock */
if (unlikely(ret)) {
ret = -ENOENT;
+ /* Free the allocated page which may have
+ * consumed a reservation.
+ */
+ restore_reserve_on_error(h, dst_vma, dst_addr, page);
+ put_page(page);
+
+ /* Allocate a temporary page to hold the copied
+ * contents.
+ */
+ page = alloc_huge_page_vma(h, dst_vma, dst_addr);
+ if (!page) {
+ ret = -ENOMEM;
+ goto out;
+ }
*pagep = page;
- /* don't free the page */
+ /* Set the outparam pagep and return to the caller to
+ * copy the contents outside the lock. Don't free the
+ * page.
+ */
goto out;
}
} else {
- page = *pagep;
+ if (vm_shared &&
+ hugetlbfs_pagecache_present(h, dst_vma, dst_addr)) {
+ put_page(*pagep);
+ ret = -EEXIST;
+ *pagep = NULL;
+ goto out;
+ }
+
+ page = alloc_huge_page(dst_vma, dst_addr, 0);
+ if (IS_ERR(page)) {
+ ret = -ENOMEM;
+ *pagep = NULL;
+ goto out;
+ }
+ copy_huge_page(page, *pagep);
+ put_page(*pagep);
*pagep = NULL;
}
@@ -5318,10 +5533,11 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
if (unlikely(is_hugetlb_entry_migration(pte))) {
swp_entry_t entry = pte_to_swp_entry(pte);
- if (is_write_migration_entry(entry)) {
+ if (is_writable_migration_entry(entry)) {
pte_t newpte;
- make_migration_entry_read(&entry);
+ entry = make_readable_migration_entry(
+ swp_offset(entry));
newpte = swp_entry_to_pte(entry);
set_huge_swap_pte_at(mm, address, ptep,
newpte, huge_page_size(h));
@@ -5332,10 +5548,11 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
}
if (!huge_pte_none(pte)) {
pte_t old_pte;
+ unsigned int shift = huge_page_shift(hstate_vma(vma));
old_pte = huge_ptep_modify_prot_start(vma, address, ptep);
pte = pte_mkhuge(huge_pte_modify(old_pte, newprot));
- pte = arch_make_huge_pte(pte, vma, NULL, 0);
+ pte = arch_make_huge_pte(pte, shift, vma->vm_flags);
huge_ptep_modify_prot_commit(vma, address, ptep, old_pte, pte);
pages++;
}
diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
new file mode 100644
index 000000000000..c540c21e26f5
--- /dev/null
+++ b/mm/hugetlb_vmemmap.c
@@ -0,0 +1,298 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Free some vmemmap pages of HugeTLB
+ *
+ * Copyright (c) 2020, Bytedance. All rights reserved.
+ *
+ * Author: Muchun Song <songmuchun@bytedance.com>
+ *
+ * The struct page structures (page structs) are used to describe a physical
+ * page frame. By default, there is a one-to-one mapping from a page frame to
+ * it's corresponding page struct.
+ *
+ * HugeTLB pages consist of multiple base page size pages and is supported by
+ * many architectures. See hugetlbpage.rst in the Documentation directory for
+ * more details. On the x86-64 architecture, HugeTLB pages of size 2MB and 1GB
+ * are currently supported. Since the base page size on x86 is 4KB, a 2MB
+ * HugeTLB page consists of 512 base pages and a 1GB HugeTLB page consists of
+ * 4096 base pages. For each base page, there is a corresponding page struct.
+ *
+ * Within the HugeTLB subsystem, only the first 4 page structs are used to
+ * contain unique information about a HugeTLB page. __NR_USED_SUBPAGE provides
+ * this upper limit. The only 'useful' information in the remaining page structs
+ * is the compound_head field, and this field is the same for all tail pages.
+ *
+ * By removing redundant page structs for HugeTLB pages, memory can be returned
+ * to the buddy allocator for other uses.
+ *
+ * Different architectures support different HugeTLB pages. For example, the
+ * following table is the HugeTLB page size supported by x86 and arm64
+ * architectures. Because arm64 supports 4k, 16k, and 64k base pages and
+ * supports contiguous entries, so it supports many kinds of sizes of HugeTLB
+ * page.
+ *
+ * +--------------+-----------+-----------------------------------------------+
+ * | Architecture | Page Size | HugeTLB Page Size |
+ * +--------------+-----------+-----------+-----------+-----------+-----------+
+ * | x86-64 | 4KB | 2MB | 1GB | | |
+ * +--------------+-----------+-----------+-----------+-----------+-----------+
+ * | | 4KB | 64KB | 2MB | 32MB | 1GB |
+ * | +-----------+-----------+-----------+-----------+-----------+
+ * | arm64 | 16KB | 2MB | 32MB | 1GB | |
+ * | +-----------+-----------+-----------+-----------+-----------+
+ * | | 64KB | 2MB | 512MB | 16GB | |
+ * +--------------+-----------+-----------+-----------+-----------+-----------+
+ *
+ * When the system boot up, every HugeTLB page has more than one struct page
+ * structs which size is (unit: pages):
+ *
+ * struct_size = HugeTLB_Size / PAGE_SIZE * sizeof(struct page) / PAGE_SIZE
+ *
+ * Where HugeTLB_Size is the size of the HugeTLB page. We know that the size
+ * of the HugeTLB page is always n times PAGE_SIZE. So we can get the following
+ * relationship.
+ *
+ * HugeTLB_Size = n * PAGE_SIZE
+ *
+ * Then,
+ *
+ * struct_size = n * PAGE_SIZE / PAGE_SIZE * sizeof(struct page) / PAGE_SIZE
+ * = n * sizeof(struct page) / PAGE_SIZE
+ *
+ * We can use huge mapping at the pud/pmd level for the HugeTLB page.
+ *
+ * For the HugeTLB page of the pmd level mapping, then
+ *
+ * struct_size = n * sizeof(struct page) / PAGE_SIZE
+ * = PAGE_SIZE / sizeof(pte_t) * sizeof(struct page) / PAGE_SIZE
+ * = sizeof(struct page) / sizeof(pte_t)
+ * = 64 / 8
+ * = 8 (pages)
+ *
+ * Where n is how many pte entries which one page can contains. So the value of
+ * n is (PAGE_SIZE / sizeof(pte_t)).
+ *
+ * This optimization only supports 64-bit system, so the value of sizeof(pte_t)
+ * is 8. And this optimization also applicable only when the size of struct page
+ * is a power of two. In most cases, the size of struct page is 64 bytes (e.g.
+ * x86-64 and arm64). So if we use pmd level mapping for a HugeTLB page, the
+ * size of struct page structs of it is 8 page frames which size depends on the
+ * size of the base page.
+ *
+ * For the HugeTLB page of the pud level mapping, then
+ *
+ * struct_size = PAGE_SIZE / sizeof(pmd_t) * struct_size(pmd)
+ * = PAGE_SIZE / 8 * 8 (pages)
+ * = PAGE_SIZE (pages)
+ *
+ * Where the struct_size(pmd) is the size of the struct page structs of a
+ * HugeTLB page of the pmd level mapping.
+ *
+ * E.g.: A 2MB HugeTLB page on x86_64 consists in 8 page frames while 1GB
+ * HugeTLB page consists in 4096.
+ *
+ * Next, we take the pmd level mapping of the HugeTLB page as an example to
+ * show the internal implementation of this optimization. There are 8 pages
+ * struct page structs associated with a HugeTLB page which is pmd mapped.
+ *
+ * Here is how things look before optimization.
+ *
+ * HugeTLB struct pages(8 pages) page frame(8 pages)
+ * +-----------+ ---virt_to_page---> +-----------+ mapping to +-----------+
+ * | | | 0 | -------------> | 0 |
+ * | | +-----------+ +-----------+
+ * | | | 1 | -------------> | 1 |
+ * | | +-----------+ +-----------+
+ * | | | 2 | -------------> | 2 |
+ * | | +-----------+ +-----------+
+ * | | | 3 | -------------> | 3 |
+ * | | +-----------+ +-----------+
+ * | | | 4 | -------------> | 4 |
+ * | PMD | +-----------+ +-----------+
+ * | level | | 5 | -------------> | 5 |
+ * | mapping | +-----------+ +-----------+
+ * | | | 6 | -------------> | 6 |
+ * | | +-----------+ +-----------+
+ * | | | 7 | -------------> | 7 |
+ * | | +-----------+ +-----------+
+ * | |
+ * | |
+ * | |
+ * +-----------+
+ *
+ * The value of page->compound_head is the same for all tail pages. The first
+ * page of page structs (page 0) associated with the HugeTLB page contains the 4
+ * page structs necessary to describe the HugeTLB. The only use of the remaining
+ * pages of page structs (page 1 to page 7) is to point to page->compound_head.
+ * Therefore, we can remap pages 2 to 7 to page 1. Only 2 pages of page structs
+ * will be used for each HugeTLB page. This will allow us to free the remaining
+ * 6 pages to the buddy allocator.
+ *
+ * Here is how things look after remapping.
+ *
+ * HugeTLB struct pages(8 pages) page frame(8 pages)
+ * +-----------+ ---virt_to_page---> +-----------+ mapping to +-----------+
+ * | | | 0 | -------------> | 0 |
+ * | | +-----------+ +-----------+
+ * | | | 1 | -------------> | 1 |
+ * | | +-----------+ +-----------+
+ * | | | 2 | ----------------^ ^ ^ ^ ^ ^
+ * | | +-----------+ | | | | |
+ * | | | 3 | ------------------+ | | | |
+ * | | +-----------+ | | | |
+ * | | | 4 | --------------------+ | | |
+ * | PMD | +-----------+ | | |
+ * | level | | 5 | ----------------------+ | |
+ * | mapping | +-----------+ | |
+ * | | | 6 | ------------------------+ |
+ * | | +-----------+ |
+ * | | | 7 | --------------------------+
+ * | | +-----------+
+ * | |
+ * | |
+ * | |
+ * +-----------+
+ *
+ * When a HugeTLB is freed to the buddy system, we should allocate 6 pages for
+ * vmemmap pages and restore the previous mapping relationship.
+ *
+ * For the HugeTLB page of the pud level mapping. It is similar to the former.
+ * We also can use this approach to free (PAGE_SIZE - 2) vmemmap pages.
+ *
+ * Apart from the HugeTLB page of the pmd/pud level mapping, some architectures
+ * (e.g. aarch64) provides a contiguous bit in the translation table entries
+ * that hints to the MMU to indicate that it is one of a contiguous set of
+ * entries that can be cached in a single TLB entry.
+ *
+ * The contiguous bit is used to increase the mapping size at the pmd and pte
+ * (last) level. So this type of HugeTLB page can be optimized only when its
+ * size of the struct page structs is greater than 2 pages.
+ */
+#define pr_fmt(fmt) "HugeTLB: " fmt
+
+#include "hugetlb_vmemmap.h"
+
+/*
+ * There are a lot of struct page structures associated with each HugeTLB page.
+ * For tail pages, the value of compound_head is the same. So we can reuse first
+ * page of tail page structures. We map the virtual addresses of the remaining
+ * pages of tail page structures to the first tail page struct, and then free
+ * these page frames. Therefore, we need to reserve two pages as vmemmap areas.
+ */
+#define RESERVE_VMEMMAP_NR 2U
+#define RESERVE_VMEMMAP_SIZE (RESERVE_VMEMMAP_NR << PAGE_SHIFT)
+
+bool hugetlb_free_vmemmap_enabled = IS_ENABLED(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON);
+
+static int __init early_hugetlb_free_vmemmap_param(char *buf)
+{
+ /* We cannot optimize if a "struct page" crosses page boundaries. */
+ if ((!is_power_of_2(sizeof(struct page)))) {
+ pr_warn("cannot free vmemmap pages because \"struct page\" crosses page boundaries\n");
+ return 0;
+ }
+
+ if (!buf)
+ return -EINVAL;
+
+ if (!strcmp(buf, "on"))
+ hugetlb_free_vmemmap_enabled = true;
+ else if (!strcmp(buf, "off"))
+ hugetlb_free_vmemmap_enabled = false;
+ else
+ return -EINVAL;
+
+ return 0;
+}
+early_param("hugetlb_free_vmemmap", early_hugetlb_free_vmemmap_param);
+
+static inline unsigned long free_vmemmap_pages_size_per_hpage(struct hstate *h)
+{
+ return (unsigned long)free_vmemmap_pages_per_hpage(h) << PAGE_SHIFT;
+}
+
+/*
+ * Previously discarded vmemmap pages will be allocated and remapping
+ * after this function returns zero.
+ */
+int alloc_huge_page_vmemmap(struct hstate *h, struct page *head)
+{
+ int ret;
+ unsigned long vmemmap_addr = (unsigned long)head;
+ unsigned long vmemmap_end, vmemmap_reuse;
+
+ if (!HPageVmemmapOptimized(head))
+ return 0;
+
+ vmemmap_addr += RESERVE_VMEMMAP_SIZE;
+ vmemmap_end = vmemmap_addr + free_vmemmap_pages_size_per_hpage(h);
+ vmemmap_reuse = vmemmap_addr - PAGE_SIZE;
+ /*
+ * The pages which the vmemmap virtual address range [@vmemmap_addr,
+ * @vmemmap_end) are mapped to are freed to the buddy allocator, and
+ * the range is mapped to the page which @vmemmap_reuse is mapped to.
+ * When a HugeTLB page is freed to the buddy allocator, previously
+ * discarded vmemmap pages must be allocated and remapping.
+ */
+ ret = vmemmap_remap_alloc(vmemmap_addr, vmemmap_end, vmemmap_reuse,
+ GFP_KERNEL | __GFP_NORETRY | __GFP_THISNODE);
+
+ if (!ret)
+ ClearHPageVmemmapOptimized(head);
+
+ return ret;
+}
+
+void free_huge_page_vmemmap(struct hstate *h, struct page *head)
+{
+ unsigned long vmemmap_addr = (unsigned long)head;
+ unsigned long vmemmap_end, vmemmap_reuse;
+
+ if (!free_vmemmap_pages_per_hpage(h))
+ return;
+
+ vmemmap_addr += RESERVE_VMEMMAP_SIZE;
+ vmemmap_end = vmemmap_addr + free_vmemmap_pages_size_per_hpage(h);
+ vmemmap_reuse = vmemmap_addr - PAGE_SIZE;
+
+ /*
+ * Remap the vmemmap virtual address range [@vmemmap_addr, @vmemmap_end)
+ * to the page which @vmemmap_reuse is mapped to, then free the pages
+ * which the range [@vmemmap_addr, @vmemmap_end] is mapped to.
+ */
+ if (!vmemmap_remap_free(vmemmap_addr, vmemmap_end, vmemmap_reuse))
+ SetHPageVmemmapOptimized(head);
+}
+
+void __init hugetlb_vmemmap_init(struct hstate *h)
+{
+ unsigned int nr_pages = pages_per_huge_page(h);
+ unsigned int vmemmap_pages;
+
+ /*
+ * There are only (RESERVE_VMEMMAP_SIZE / sizeof(struct page)) struct
+ * page structs that can be used when CONFIG_HUGETLB_PAGE_FREE_VMEMMAP,
+ * so add a BUILD_BUG_ON to catch invalid usage of the tail struct page.
+ */
+ BUILD_BUG_ON(__NR_USED_SUBPAGE >=
+ RESERVE_VMEMMAP_SIZE / sizeof(struct page));
+
+ if (!hugetlb_free_vmemmap_enabled)
+ return;
+
+ vmemmap_pages = (nr_pages * sizeof(struct page)) >> PAGE_SHIFT;
+ /*
+ * The head page and the first tail page are not to be freed to buddy
+ * allocator, the other pages will map to the first tail page, so they
+ * can be freed.
+ *
+ * Could RESERVE_VMEMMAP_NR be greater than @vmemmap_pages? It is true
+ * on some architectures (e.g. aarch64). See Documentation/arm64/
+ * hugetlbpage.rst for more details.
+ */
+ if (likely(vmemmap_pages > RESERVE_VMEMMAP_NR))
+ h->nr_free_vmemmap_pages = vmemmap_pages - RESERVE_VMEMMAP_NR;
+
+ pr_info("can free %d vmemmap pages for %s\n", h->nr_free_vmemmap_pages,
+ h->name);
+}
diff --git a/mm/hugetlb_vmemmap.h b/mm/hugetlb_vmemmap.h
new file mode 100644
index 000000000000..cb2bef8f9e73
--- /dev/null
+++ b/mm/hugetlb_vmemmap.h
@@ -0,0 +1,45 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Free some vmemmap pages of HugeTLB
+ *
+ * Copyright (c) 2020, Bytedance. All rights reserved.
+ *
+ * Author: Muchun Song <songmuchun@bytedance.com>
+ */
+#ifndef _LINUX_HUGETLB_VMEMMAP_H
+#define _LINUX_HUGETLB_VMEMMAP_H
+#include <linux/hugetlb.h>
+
+#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP
+int alloc_huge_page_vmemmap(struct hstate *h, struct page *head);
+void free_huge_page_vmemmap(struct hstate *h, struct page *head);
+void hugetlb_vmemmap_init(struct hstate *h);
+
+/*
+ * How many vmemmap pages associated with a HugeTLB page that can be freed
+ * to the buddy allocator.
+ */
+static inline unsigned int free_vmemmap_pages_per_hpage(struct hstate *h)
+{
+ return h->nr_free_vmemmap_pages;
+}
+#else
+static inline int alloc_huge_page_vmemmap(struct hstate *h, struct page *head)
+{
+ return 0;
+}
+
+static inline void free_huge_page_vmemmap(struct hstate *h, struct page *head)
+{
+}
+
+static inline void hugetlb_vmemmap_init(struct hstate *h)
+{
+}
+
+static inline unsigned int free_vmemmap_pages_per_hpage(struct hstate *h)
+{
+ return 0;
+}
+#endif /* CONFIG_HUGETLB_PAGE_FREE_VMEMMAP */
+#endif /* _LINUX_HUGETLB_VMEMMAP_H */
diff --git a/mm/internal.h b/mm/internal.h
index 6ec2cea9926b..2d7c9a2e0118 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -274,11 +274,10 @@ isolate_freepages_range(struct compact_control *cc,
int
isolate_migratepages_range(struct compact_control *cc,
unsigned long low_pfn, unsigned long end_pfn);
+#endif
int find_suitable_fallback(struct free_area *area, unsigned int order,
int migratetype, bool only_stealable, bool *can_steal);
-#endif
-
/*
* This function returns the order of a free page in the buddy system. In
* general, page_zone(page)->lock must be held by the caller to prevent the
@@ -344,7 +343,10 @@ void __vma_unlink_list(struct mm_struct *mm, struct vm_area_struct *vma);
#ifdef CONFIG_MMU
extern long populate_vma_page_range(struct vm_area_struct *vma,
- unsigned long start, unsigned long end, int *nonblocking);
+ unsigned long start, unsigned long end, int *locked);
+extern long faultin_vma_page_range(struct vm_area_struct *vma,
+ unsigned long start, unsigned long end,
+ bool write, int *locked);
extern void munlock_vma_pages_range(struct vm_area_struct *vma,
unsigned long start, unsigned long end);
static inline void munlock_vma_pages_all(struct vm_area_struct *vma)
@@ -369,23 +371,6 @@ extern unsigned int munlock_vma_page(struct page *page);
*/
extern void clear_page_mlock(struct page *page);
-/*
- * mlock_migrate_page - called only from migrate_misplaced_transhuge_page()
- * (because that does not go through the full procedure of migration ptes):
- * to migrate the Mlocked page flag; update statistics.
- */
-static inline void mlock_migrate_page(struct page *newpage, struct page *page)
-{
- if (TestClearPageMlocked(page)) {
- int nr_pages = thp_nr_pages(page);
-
- /* Holding pmd lock, no change in irq context: __mod is safe */
- __mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
- SetPageMlocked(newpage);
- __mod_zone_page_state(page_zone(newpage), NR_MLOCK, nr_pages);
- }
-}
-
extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma);
/*
@@ -461,7 +446,6 @@ static inline struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf,
#else /* !CONFIG_MMU */
static inline void clear_page_mlock(struct page *page) { }
static inline void mlock_vma_page(struct page *page) { }
-static inline void mlock_migrate_page(struct page *new, struct page *old) { }
static inline void vunmap_range_noflush(unsigned long start, unsigned long end)
{
}
@@ -672,4 +656,7 @@ int vmap_pages_range_noflush(unsigned long addr, unsigned long end,
void vunmap_range_noflush(unsigned long start, unsigned long end);
+int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
+ unsigned long addr, int page_nid, int *flags);
+
#endif /* __MM_INTERNAL_H */
diff --git a/mm/kfence/core.c b/mm/kfence/core.c
index 4d21ac44d5d3..d7666ace9d2e 100644
--- a/mm/kfence/core.c
+++ b/mm/kfence/core.c
@@ -636,7 +636,7 @@ static void toggle_allocation_gate(struct work_struct *work)
/* Disable static key and reset timer. */
static_branch_disable(&kfence_allocation_key);
#endif
- queue_delayed_work(system_power_efficient_wq, &kfence_timer,
+ queue_delayed_work(system_unbound_wq, &kfence_timer,
msecs_to_jiffies(kfence_sample_interval));
}
static DECLARE_DELAYED_WORK(kfence_timer, toggle_allocation_gate);
@@ -666,7 +666,7 @@ void __init kfence_init(void)
}
WRITE_ONCE(kfence_enabled, true);
- queue_delayed_work(system_power_efficient_wq, &kfence_timer, 0);
+ queue_delayed_work(system_unbound_wq, &kfence_timer, 0);
pr_info("initialized - using %lu bytes for %d objects at 0x%p-0x%p\n", KFENCE_POOL_SIZE,
CONFIG_KFENCE_NUM_OBJECTS, (void *)__kfence_pool,
(void *)(__kfence_pool + KFENCE_POOL_SIZE));
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 6c0185fdd815..b0412be08fa2 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -442,9 +442,7 @@ static inline int khugepaged_test_exit(struct mm_struct *mm)
static bool hugepage_vma_check(struct vm_area_struct *vma,
unsigned long vm_flags)
{
- /* Explicitly disabled through madvise. */
- if ((vm_flags & VM_NOHUGEPAGE) ||
- test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
+ if (!transhuge_vma_enabled(vma, vm_flags))
return false;
/* Enabled via shmem mount options or sysfs settings. */
@@ -459,7 +457,8 @@ static bool hugepage_vma_check(struct vm_area_struct *vma,
/* Read-only file mappings need to be aligned for THP to work. */
if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && vma->vm_file &&
- (vm_flags & VM_DENYWRITE)) {
+ !inode_is_open_for_write(vma->vm_file->f_inode) &&
+ (vm_flags & VM_EXEC)) {
return IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff,
HPAGE_PMD_NR);
}
@@ -1864,6 +1863,19 @@ out_unlock:
else {
__mod_lruvec_page_state(new_page, NR_FILE_THPS, nr);
filemap_nr_thps_inc(mapping);
+ /*
+ * Paired with smp_mb() in do_dentry_open() to ensure
+ * i_writecount is up to date and the update to nr_thps is
+ * visible. Ensures the page cache will be truncated if the
+ * file is opened writable.
+ */
+ smp_mb();
+ if (inode_is_open_for_write(mapping->host)) {
+ result = SCAN_FAIL;
+ __mod_lruvec_page_state(new_page, NR_FILE_THPS, -nr);
+ filemap_nr_thps_dec(mapping);
+ goto xa_locked;
+ }
}
if (nr_none) {
diff --git a/mm/madvise.c b/mm/madvise.c
index 63e489e5bfdb..6d3d348b17f4 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -53,6 +53,8 @@ static int madvise_need_mmap_write(int behavior)
case MADV_COLD:
case MADV_PAGEOUT:
case MADV_FREE:
+ case MADV_POPULATE_READ:
+ case MADV_POPULATE_WRITE:
return 0;
default:
/* be safe, default to 1. list exceptions explicitly */
@@ -822,6 +824,61 @@ static long madvise_dontneed_free(struct vm_area_struct *vma,
return -EINVAL;
}
+static long madvise_populate(struct vm_area_struct *vma,
+ struct vm_area_struct **prev,
+ unsigned long start, unsigned long end,
+ int behavior)
+{
+ const bool write = behavior == MADV_POPULATE_WRITE;
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned long tmp_end;
+ int locked = 1;
+ long pages;
+
+ *prev = vma;
+
+ while (start < end) {
+ /*
+ * We might have temporarily dropped the lock. For example,
+ * our VMA might have been split.
+ */
+ if (!vma || start >= vma->vm_end) {
+ vma = find_vma(mm, start);
+ if (!vma || start < vma->vm_start)
+ return -ENOMEM;
+ }
+
+ tmp_end = min_t(unsigned long, end, vma->vm_end);
+ /* Populate (prefault) page tables readable/writable. */
+ pages = faultin_vma_page_range(vma, start, tmp_end, write,
+ &locked);
+ if (!locked) {
+ mmap_read_lock(mm);
+ locked = 1;
+ *prev = NULL;
+ vma = NULL;
+ }
+ if (pages < 0) {
+ switch (pages) {
+ case -EINTR:
+ return -EINTR;
+ case -EFAULT: /* Incompatible mappings / permissions. */
+ return -EINVAL;
+ case -EHWPOISON:
+ return -EHWPOISON;
+ default:
+ pr_warn_once("%s: unhandled return value: %ld\n",
+ __func__, pages);
+ fallthrough;
+ case -ENOMEM:
+ return -ENOMEM;
+ }
+ }
+ start += pages * PAGE_SIZE;
+ }
+ return 0;
+}
+
/*
* Application wants to free up the pages and associated backing store.
* This is effectively punching a hole into the middle of a file.
@@ -935,6 +992,9 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
case MADV_FREE:
case MADV_DONTNEED:
return madvise_dontneed_free(vma, prev, start, end, behavior);
+ case MADV_POPULATE_READ:
+ case MADV_POPULATE_WRITE:
+ return madvise_populate(vma, prev, start, end, behavior);
default:
return madvise_behavior(vma, prev, start, end, behavior);
}
@@ -955,6 +1015,8 @@ madvise_behavior_valid(int behavior)
case MADV_FREE:
case MADV_COLD:
case MADV_PAGEOUT:
+ case MADV_POPULATE_READ:
+ case MADV_POPULATE_WRITE:
#ifdef CONFIG_KSM
case MADV_MERGEABLE:
case MADV_UNMERGEABLE:
@@ -1042,6 +1104,10 @@ process_madvise_behavior_valid(int behavior)
* easily if memory pressure happens.
* MADV_PAGEOUT - the application is not expected to use this memory soon,
* page out the pages in this range immediately.
+ * MADV_POPULATE_READ - populate (prefault) page tables readable by
+ * triggering read faults if required
+ * MADV_POPULATE_WRITE - populate (prefault) page tables writable by
+ * triggering write faults if required
*
* return values:
* zero - success
diff --git a/mm/mapping_dirty_helpers.c b/mm/mapping_dirty_helpers.c
index b890854ec761..ea734f248fce 100644
--- a/mm/mapping_dirty_helpers.c
+++ b/mm/mapping_dirty_helpers.c
@@ -317,7 +317,7 @@ EXPORT_SYMBOL_GPL(wp_shared_mapping_range);
* pfn_mkwrite(). And then after a TLB flush following the write-protection
* pick up all dirty bits.
*
- * Note: This function currently skips transhuge page-table entries, since
+ * This function currently skips transhuge page-table entries, since
* it's intended for dirty-tracking on the PTE level. It will warn on
* encountering transhuge dirty entries, though, and can easily be extended
* to handle them as well.
diff --git a/mm/memblock.c b/mm/memblock.c
index 123feef5259d..3e4acbf03ab7 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -906,6 +906,11 @@ int __init_memblock memblock_mark_mirror(phys_addr_t base, phys_addr_t size)
* @base: the base phys addr of the region
* @size: the size of the region
*
+ * The memory regions marked with %MEMBLOCK_NOMAP will not be added to the
+ * direct mapping of the physical memory. These regions will still be
+ * covered by the memory map. The struct page representing NOMAP memory
+ * frames in the memory map will be PageReserved()
+ *
* Return: 0 on success, -errno on failure.
*/
int __init_memblock memblock_mark_nomap(phys_addr_t base, phys_addr_t size)
@@ -2002,6 +2007,26 @@ static unsigned long __init __free_memory_core(phys_addr_t start,
return end_pfn - start_pfn;
}
+static void __init memmap_init_reserved_pages(void)
+{
+ struct memblock_region *region;
+ phys_addr_t start, end;
+ u64 i;
+
+ /* initialize struct pages for the reserved regions */
+ for_each_reserved_mem_range(i, &start, &end)
+ reserve_bootmem_region(start, end);
+
+ /* and also treat struct pages for the NOMAP regions as PageReserved */
+ for_each_mem_region(region) {
+ if (memblock_is_nomap(region)) {
+ start = region->base;
+ end = start + region->size;
+ reserve_bootmem_region(start, end);
+ }
+ }
+}
+
static unsigned long __init free_low_memory_core_early(void)
{
unsigned long count = 0;
@@ -2010,8 +2035,7 @@ static unsigned long __init free_low_memory_core_early(void)
memblock_clear_hotplug(0, -1);
- for_each_reserved_mem_range(i, &start, &end)
- reserve_bootmem_region(start, end);
+ memmap_init_reserved_pages();
/*
* We need to use NUMA_NO_NODE instead of NODE_DATA(0)->node_id
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index b80aae448a49..ae1f5d0cb581 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5537,7 +5537,7 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
* as special swap entry in the CPU page table.
*/
if (is_device_private_entry(ent)) {
- page = device_private_entry_to_page(ent);
+ page = pfn_swap_entry_to_page(ent);
/*
* MEMORY_DEVICE_PRIVATE means ZONE_DEVICE page and which have
* a refcount of 1 when free (unlike normal page)
@@ -6644,7 +6644,7 @@ static unsigned long effective_protection(unsigned long usage,
}
/**
- * mem_cgroup_protected - check if memory consumption is in the normal range
+ * mem_cgroup_calculate_protection - check if memory consumption is in the normal range
* @root: the top ancestor of the sub-tree being checked
* @memcg: the memory cgroup to check
*
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index e5a1531f7f4e..eefd823deb67 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -66,6 +66,19 @@ int sysctl_memory_failure_recovery __read_mostly = 1;
atomic_long_t num_poisoned_pages __read_mostly = ATOMIC_LONG_INIT(0);
+static bool __page_handle_poison(struct page *page)
+{
+ bool ret;
+
+ zone_pcp_disable(page_zone(page));
+ ret = dissolve_free_huge_page(page);
+ if (!ret)
+ ret = take_page_off_buddy(page);
+ zone_pcp_enable(page_zone(page));
+
+ return ret;
+}
+
static bool page_handle_poison(struct page *page, bool hugepage_or_freepage, bool release)
{
if (hugepage_or_freepage) {
@@ -73,7 +86,7 @@ static bool page_handle_poison(struct page *page, bool hugepage_or_freepage, boo
* Doing this check for free pages is also fine since dissolve_free_huge_page
* returns 0 for non-hugetlb pages as well.
*/
- if (dissolve_free_huge_page(page) || !take_page_off_buddy(page))
+ if (!__page_handle_poison(page))
/*
* We could fail to take off the target page from buddy
* for example due to racy page allocation, but that's
@@ -985,7 +998,7 @@ static int me_huge_page(struct page *p, unsigned long pfn)
*/
if (PageAnon(hpage))
put_page(hpage);
- if (!dissolve_free_huge_page(p) && take_page_off_buddy(p)) {
+ if (__page_handle_poison(p)) {
page_ref_inc(p);
res = MF_RECOVERED;
}
@@ -1253,10 +1266,10 @@ static int get_hwpoison_page(struct page *p, unsigned long flags)
static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
int flags, struct page **hpagep)
{
- enum ttu_flags ttu = TTU_IGNORE_MLOCK;
+ enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_SYNC;
struct address_space *mapping;
LIST_HEAD(tokill);
- bool unmap_success = true;
+ bool unmap_success;
int kill = 1, forcekill;
struct page *hpage = *hpagep;
bool mlocked = PageMlocked(hpage);
@@ -1319,7 +1332,7 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED);
if (!PageHuge(hpage)) {
- unmap_success = try_to_unmap(hpage, ttu);
+ try_to_unmap(hpage, ttu);
} else {
if (!PageAnon(hpage)) {
/*
@@ -1327,21 +1340,20 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
* could potentially call huge_pmd_unshare. Because of
* this, take semaphore in write mode here and set
* TTU_RMAP_LOCKED to indicate we have taken the lock
- * at this higer level.
+ * at this higher level.
*/
mapping = hugetlb_page_mapping_lock_write(hpage);
if (mapping) {
- unmap_success = try_to_unmap(hpage,
- ttu|TTU_RMAP_LOCKED);
+ try_to_unmap(hpage, ttu|TTU_RMAP_LOCKED);
i_mmap_unlock_write(mapping);
- } else {
+ } else
pr_info("Memory failure: %#lx: could not lock mapping for mapped huge page\n", pfn);
- unmap_success = false;
- }
} else {
- unmap_success = try_to_unmap(hpage, ttu);
+ try_to_unmap(hpage, ttu);
}
}
+
+ unmap_success = !page_mapped(hpage);
if (!unmap_success)
pr_err("Memory failure: %#lx: failed to unmap page (mapcount=%d)\n",
pfn, page_mapcount(hpage));
@@ -1446,7 +1458,7 @@ static int memory_failure_hugetlb(unsigned long pfn, int flags)
}
unlock_page(head);
res = MF_FAILED;
- if (!dissolve_free_huge_page(p) && take_page_off_buddy(p)) {
+ if (__page_handle_poison(p)) {
page_ref_inc(p);
res = MF_RECOVERED;
}
diff --git a/mm/memory.c b/mm/memory.c
index 48c4576df898..747a01d495f2 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -699,6 +699,68 @@ out:
}
#endif
+static void restore_exclusive_pte(struct vm_area_struct *vma,
+ struct page *page, unsigned long address,
+ pte_t *ptep)
+{
+ pte_t pte;
+ swp_entry_t entry;
+
+ pte = pte_mkold(mk_pte(page, READ_ONCE(vma->vm_page_prot)));
+ if (pte_swp_soft_dirty(*ptep))
+ pte = pte_mksoft_dirty(pte);
+
+ entry = pte_to_swp_entry(*ptep);
+ if (pte_swp_uffd_wp(*ptep))
+ pte = pte_mkuffd_wp(pte);
+ else if (is_writable_device_exclusive_entry(entry))
+ pte = maybe_mkwrite(pte_mkdirty(pte), vma);
+
+ set_pte_at(vma->vm_mm, address, ptep, pte);
+
+ /*
+ * No need to take a page reference as one was already
+ * created when the swap entry was made.
+ */
+ if (PageAnon(page))
+ page_add_anon_rmap(page, vma, address, false);
+ else
+ /*
+ * Currently device exclusive access only supports anonymous
+ * memory so the entry shouldn't point to a filebacked page.
+ */
+ WARN_ON_ONCE(!PageAnon(page));
+
+ if (vma->vm_flags & VM_LOCKED)
+ mlock_vma_page(page);
+
+ /*
+ * No need to invalidate - it was non-present before. However
+ * secondary CPUs may have mappings that need invalidating.
+ */
+ update_mmu_cache(vma, address, ptep);
+}
+
+/*
+ * Tries to restore an exclusive pte if the page lock can be acquired without
+ * sleeping.
+ */
+static int
+try_restore_exclusive_pte(pte_t *src_pte, struct vm_area_struct *vma,
+ unsigned long addr)
+{
+ swp_entry_t entry = pte_to_swp_entry(*src_pte);
+ struct page *page = pfn_swap_entry_to_page(entry);
+
+ if (trylock_page(page)) {
+ restore_exclusive_pte(vma, page, addr, src_pte);
+ unlock_page(page);
+ return 0;
+ }
+
+ return -EBUSY;
+}
+
/*
* copy one vm_area from one task to the other. Assumes the page tables
* already present in the new task to be cleared in the whole range
@@ -707,17 +769,17 @@ out:
static unsigned long
copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
- pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
- unsigned long addr, int *rss)
+ pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *dst_vma,
+ struct vm_area_struct *src_vma, unsigned long addr, int *rss)
{
- unsigned long vm_flags = vma->vm_flags;
+ unsigned long vm_flags = dst_vma->vm_flags;
pte_t pte = *src_pte;
struct page *page;
swp_entry_t entry = pte_to_swp_entry(pte);
if (likely(!non_swap_entry(entry))) {
if (swap_duplicate(entry) < 0)
- return entry.val;
+ return -EIO;
/* make sure dst_mm is on swapoff's mmlist. */
if (unlikely(list_empty(&dst_mm->mmlist))) {
@@ -729,17 +791,18 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
}
rss[MM_SWAPENTS]++;
} else if (is_migration_entry(entry)) {
- page = migration_entry_to_page(entry);
+ page = pfn_swap_entry_to_page(entry);
rss[mm_counter(page)]++;
- if (is_write_migration_entry(entry) &&
+ if (is_writable_migration_entry(entry) &&
is_cow_mapping(vm_flags)) {
/*
* COW mappings require pages in both
* parent and child to be set to read.
*/
- make_migration_entry_read(&entry);
+ entry = make_readable_migration_entry(
+ swp_offset(entry));
pte = swp_entry_to_pte(entry);
if (pte_swp_soft_dirty(*src_pte))
pte = pte_swp_mksoft_dirty(pte);
@@ -748,7 +811,7 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
set_pte_at(src_mm, addr, src_pte, pte);
}
} else if (is_device_private_entry(entry)) {
- page = device_private_entry_to_page(entry);
+ page = pfn_swap_entry_to_page(entry);
/*
* Update rss count even for unaddressable pages, as
@@ -770,15 +833,29 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
* when a device driver is involved (you cannot easily
* save and restore device driver state).
*/
- if (is_write_device_private_entry(entry) &&
+ if (is_writable_device_private_entry(entry) &&
is_cow_mapping(vm_flags)) {
- make_device_private_entry_read(&entry);
+ entry = make_readable_device_private_entry(
+ swp_offset(entry));
pte = swp_entry_to_pte(entry);
if (pte_swp_uffd_wp(*src_pte))
pte = pte_swp_mkuffd_wp(pte);
set_pte_at(src_mm, addr, src_pte, pte);
}
+ } else if (is_device_exclusive_entry(entry)) {
+ /*
+ * Make device exclusive entries present by restoring the
+ * original entry then copying as for a present pte. Device
+ * exclusive entries currently only support private writable
+ * (ie. COW) mappings.
+ */
+ VM_BUG_ON(!is_cow_mapping(src_vma->vm_flags));
+ if (try_restore_exclusive_pte(src_pte, src_vma, addr))
+ return -EBUSY;
+ return -ENOENT;
}
+ if (!userfaultfd_wp(dst_vma))
+ pte = pte_swp_clear_uffd_wp(pte);
set_pte_at(dst_mm, addr, dst_pte, pte);
return 0;
}
@@ -844,6 +921,9 @@ copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma
/* All done, just insert the new page copy in the child */
pte = mk_pte(new_page, dst_vma->vm_page_prot);
pte = maybe_mkwrite(pte_mkdirty(pte), dst_vma);
+ if (userfaultfd_pte_wp(dst_vma, *src_pte))
+ /* Uffd-wp needs to be delivered to dest pte as well */
+ pte = pte_wrprotect(pte_mkuffd_wp(pte));
set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte);
return 0;
}
@@ -893,12 +973,7 @@ copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
pte = pte_mkclean(pte);
pte = pte_mkold(pte);
- /*
- * Make sure the _PAGE_UFFD_WP bit is cleared if the new VMA
- * does not have the VM_UFFD_WP, which means that the uffd
- * fork event is not enabled.
- */
- if (!(vm_flags & VM_UFFD_WP))
+ if (!userfaultfd_wp(dst_vma))
pte = pte_clear_uffd_wp(pte);
set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte);
@@ -971,13 +1046,25 @@ again:
continue;
}
if (unlikely(!pte_present(*src_pte))) {
- entry.val = copy_nonpresent_pte(dst_mm, src_mm,
- dst_pte, src_pte,
- src_vma, addr, rss);
- if (entry.val)
+ ret = copy_nonpresent_pte(dst_mm, src_mm,
+ dst_pte, src_pte,
+ dst_vma, src_vma,
+ addr, rss);
+ if (ret == -EIO) {
+ entry = pte_to_swp_entry(*src_pte);
break;
- progress += 8;
- continue;
+ } else if (ret == -EBUSY) {
+ break;
+ } else if (!ret) {
+ progress += 8;
+ continue;
+ }
+
+ /*
+ * Device exclusive entry restored, continue by copying
+ * the now present pte.
+ */
+ WARN_ON_ONCE(ret != -ENOENT);
}
/* copy_present_pte() will clear `*prealloc' if consumed */
ret = copy_present_pte(dst_vma, src_vma, dst_pte, src_pte,
@@ -1008,20 +1095,26 @@ again:
pte_unmap_unlock(orig_dst_pte, dst_ptl);
cond_resched();
- if (entry.val) {
+ if (ret == -EIO) {
+ VM_WARN_ON_ONCE(!entry.val);
if (add_swap_count_continuation(entry, GFP_KERNEL) < 0) {
ret = -ENOMEM;
goto out;
}
entry.val = 0;
- } else if (ret) {
- WARN_ON_ONCE(ret != -EAGAIN);
+ } else if (ret == -EBUSY) {
+ goto out;
+ } else if (ret == -EAGAIN) {
prealloc = page_copy_prealloc(src_mm, src_vma, addr);
if (!prealloc)
return -ENOMEM;
- /* We've captured and resolved the error. Reset, try again. */
- ret = 0;
+ } else if (ret) {
+ VM_WARN_ON_ONCE(1);
}
+
+ /* We've captured and resolved the error. Reset, try again. */
+ ret = 0;
+
if (addr != end)
goto again;
out:
@@ -1050,8 +1143,8 @@ copy_pmd_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
|| pmd_devmap(*src_pmd)) {
int err;
VM_BUG_ON_VMA(next-addr != HPAGE_PMD_SIZE, src_vma);
- err = copy_huge_pmd(dst_mm, src_mm,
- dst_pmd, src_pmd, addr, src_vma);
+ err = copy_huge_pmd(dst_mm, src_mm, dst_pmd, src_pmd,
+ addr, dst_vma, src_vma);
if (err == -ENOMEM)
return -ENOMEM;
if (!err)
@@ -1278,8 +1371,9 @@ again:
}
entry = pte_to_swp_entry(ptent);
- if (is_device_private_entry(entry)) {
- struct page *page = device_private_entry_to_page(entry);
+ if (is_device_private_entry(entry) ||
+ is_device_exclusive_entry(entry)) {
+ struct page *page = pfn_swap_entry_to_page(entry);
if (unlikely(details && details->check_mapping)) {
/*
@@ -1294,7 +1388,10 @@ again:
pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
rss[mm_counter(page)]--;
- page_remove_rmap(page, false);
+
+ if (is_device_private_entry(entry))
+ page_remove_rmap(page, false);
+
put_page(page);
continue;
}
@@ -1308,7 +1405,7 @@ again:
else if (is_migration_entry(entry)) {
struct page *page;
- page = migration_entry_to_page(entry);
+ page = pfn_swap_entry_to_page(entry);
rss[mm_counter(page)]--;
}
if (unlikely(!free_swap_and_cache(entry)))
@@ -3343,6 +3440,34 @@ void unmap_mapping_range(struct address_space *mapping,
EXPORT_SYMBOL(unmap_mapping_range);
/*
+ * Restore a potential device exclusive pte to a working pte entry
+ */
+static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf)
+{
+ struct page *page = vmf->page;
+ struct vm_area_struct *vma = vmf->vma;
+ struct mmu_notifier_range range;
+
+ if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags))
+ return VM_FAULT_RETRY;
+ mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0, vma,
+ vma->vm_mm, vmf->address & PAGE_MASK,
+ (vmf->address & PAGE_MASK) + PAGE_SIZE, NULL);
+ mmu_notifier_invalidate_range_start(&range);
+
+ vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
+ &vmf->ptl);
+ if (likely(pte_same(*vmf->pte, vmf->orig_pte)))
+ restore_exclusive_pte(vma, page, vmf->address, vmf->pte);
+
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ unlock_page(page);
+
+ mmu_notifier_invalidate_range_end(&range);
+ return 0;
+}
+
+/*
* We enter with non-exclusive mmap_lock (to exclude vma changes,
* but allow concurrent faults), and pte mapped but not yet locked.
* We return with pte unmapped and unlocked.
@@ -3370,8 +3495,11 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
if (is_migration_entry(entry)) {
migration_entry_wait(vma->vm_mm, vmf->pmd,
vmf->address);
+ } else if (is_device_exclusive_entry(entry)) {
+ vmf->page = pfn_swap_entry_to_page(entry);
+ ret = remove_device_exclusive_entry(vmf);
} else if (is_device_private_entry(entry)) {
- vmf->page = device_private_entry_to_page(entry);
+ vmf->page = pfn_swap_entry_to_page(entry);
ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
} else if (is_hwpoison_entry(entry)) {
ret = VM_FAULT_HWPOISON;
@@ -4025,9 +4153,11 @@ static vm_fault_t do_read_fault(struct vm_fault *vmf)
* something).
*/
if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) {
- ret = do_fault_around(vmf);
- if (ret)
- return ret;
+ if (likely(!userfaultfd_minor(vmf->vma))) {
+ ret = do_fault_around(vmf);
+ if (ret)
+ return ret;
+ }
}
ret = __do_fault(vmf);
@@ -4172,9 +4302,8 @@ static vm_fault_t do_fault(struct vm_fault *vmf)
return ret;
}
-static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
- unsigned long addr, int page_nid,
- int *flags)
+int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
+ unsigned long addr, int page_nid, int *flags)
{
get_page(page);
@@ -4295,12 +4424,12 @@ static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf)
}
/* `inline' is required to avoid gcc 4.1.2 build error */
-static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd)
+static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf)
{
if (vma_is_anonymous(vmf->vma)) {
- if (userfaultfd_huge_pmd_wp(vmf->vma, orig_pmd))
+ if (userfaultfd_huge_pmd_wp(vmf->vma, vmf->orig_pmd))
return handle_userfault(vmf, VM_UFFD_WP);
- return do_huge_pmd_wp_page(vmf, orig_pmd);
+ return do_huge_pmd_wp_page(vmf);
}
if (vmf->vma->vm_ops->huge_fault) {
vm_fault_t ret = vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD);
@@ -4527,26 +4656,26 @@ retry_pud:
if (!(ret & VM_FAULT_FALLBACK))
return ret;
} else {
- pmd_t orig_pmd = *vmf.pmd;
+ vmf.orig_pmd = *vmf.pmd;
barrier();
- if (unlikely(is_swap_pmd(orig_pmd))) {
+ if (unlikely(is_swap_pmd(vmf.orig_pmd))) {
VM_BUG_ON(thp_migration_supported() &&
- !is_pmd_migration_entry(orig_pmd));
- if (is_pmd_migration_entry(orig_pmd))
+ !is_pmd_migration_entry(vmf.orig_pmd));
+ if (is_pmd_migration_entry(vmf.orig_pmd))
pmd_migration_entry_wait(mm, vmf.pmd);
return 0;
}
- if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) {
- if (pmd_protnone(orig_pmd) && vma_is_accessible(vma))
- return do_huge_pmd_numa_page(&vmf, orig_pmd);
+ if (pmd_trans_huge(vmf.orig_pmd) || pmd_devmap(vmf.orig_pmd)) {
+ if (pmd_protnone(vmf.orig_pmd) && vma_is_accessible(vma))
+ return do_huge_pmd_numa_page(&vmf);
- if (dirty && !pmd_write(orig_pmd)) {
- ret = wp_huge_pmd(&vmf, orig_pmd);
+ if (dirty && !pmd_write(vmf.orig_pmd)) {
+ ret = wp_huge_pmd(&vmf);
if (!(ret & VM_FAULT_FALLBACK))
return ret;
} else {
- huge_pmd_set_accessed(&vmf, orig_pmd);
+ huge_pmd_set_accessed(&vmf);
return 0;
}
}
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 974a565797d8..8cb75b26ea4f 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -154,122 +154,6 @@ static void release_memory_resource(struct resource *res)
}
#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
-void get_page_bootmem(unsigned long info, struct page *page,
- unsigned long type)
-{
- page->freelist = (void *)type;
- SetPagePrivate(page);
- set_page_private(page, info);
- page_ref_inc(page);
-}
-
-void put_page_bootmem(struct page *page)
-{
- unsigned long type;
-
- type = (unsigned long) page->freelist;
- BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE ||
- type > MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE);
-
- if (page_ref_dec_return(page) == 1) {
- page->freelist = NULL;
- ClearPagePrivate(page);
- set_page_private(page, 0);
- INIT_LIST_HEAD(&page->lru);
- free_reserved_page(page);
- }
-}
-
-#ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE
-#ifndef CONFIG_SPARSEMEM_VMEMMAP
-static void register_page_bootmem_info_section(unsigned long start_pfn)
-{
- unsigned long mapsize, section_nr, i;
- struct mem_section *ms;
- struct page *page, *memmap;
- struct mem_section_usage *usage;
-
- section_nr = pfn_to_section_nr(start_pfn);
- ms = __nr_to_section(section_nr);
-
- /* Get section's memmap address */
- memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);
-
- /*
- * Get page for the memmap's phys address
- * XXX: need more consideration for sparse_vmemmap...
- */
- page = virt_to_page(memmap);
- mapsize = sizeof(struct page) * PAGES_PER_SECTION;
- mapsize = PAGE_ALIGN(mapsize) >> PAGE_SHIFT;
-
- /* remember memmap's page */
- for (i = 0; i < mapsize; i++, page++)
- get_page_bootmem(section_nr, page, SECTION_INFO);
-
- usage = ms->usage;
- page = virt_to_page(usage);
-
- mapsize = PAGE_ALIGN(mem_section_usage_size()) >> PAGE_SHIFT;
-
- for (i = 0; i < mapsize; i++, page++)
- get_page_bootmem(section_nr, page, MIX_SECTION_INFO);
-
-}
-#else /* CONFIG_SPARSEMEM_VMEMMAP */
-static void register_page_bootmem_info_section(unsigned long start_pfn)
-{
- unsigned long mapsize, section_nr, i;
- struct mem_section *ms;
- struct page *page, *memmap;
- struct mem_section_usage *usage;
-
- section_nr = pfn_to_section_nr(start_pfn);
- ms = __nr_to_section(section_nr);
-
- memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);
-
- register_page_bootmem_memmap(section_nr, memmap, PAGES_PER_SECTION);
-
- usage = ms->usage;
- page = virt_to_page(usage);
-
- mapsize = PAGE_ALIGN(mem_section_usage_size()) >> PAGE_SHIFT;
-
- for (i = 0; i < mapsize; i++, page++)
- get_page_bootmem(section_nr, page, MIX_SECTION_INFO);
-}
-#endif /* !CONFIG_SPARSEMEM_VMEMMAP */
-
-void __init register_page_bootmem_info_node(struct pglist_data *pgdat)
-{
- unsigned long i, pfn, end_pfn, nr_pages;
- int node = pgdat->node_id;
- struct page *page;
-
- nr_pages = PAGE_ALIGN(sizeof(struct pglist_data)) >> PAGE_SHIFT;
- page = virt_to_page(pgdat);
-
- for (i = 0; i < nr_pages; i++, page++)
- get_page_bootmem(node, page, NODE_INFO);
-
- pfn = pgdat->node_start_pfn;
- end_pfn = pgdat_end_pfn(pgdat);
-
- /* register section info */
- for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
- /*
- * Some platforms can assign the same pfn to multiple nodes - on
- * node0 as well as nodeN. To avoid registering a pfn against
- * multiple nodes we check that this pfn does not already
- * reside in some other nodes.
- */
- if (pfn_valid(pfn) && (early_pfn_to_nid(pfn) == node))
- register_page_bootmem_info_section(pfn);
- }
-}
-#endif /* CONFIG_HAVE_BOOTMEM_INFO_NODE */
-
static int check_pfn_span(unsigned long pfn, unsigned long nr_pages,
const char *reason)
{
@@ -445,7 +329,6 @@ static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
unsigned long pfn;
int nid = zone_to_nid(zone);
- zone_span_writelock(zone);
if (zone->zone_start_pfn == start_pfn) {
/*
* If the section is smallest section in the zone, it need
@@ -478,7 +361,6 @@ static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
zone->spanned_pages = 0;
}
}
- zone_span_writeunlock(zone);
}
static void update_pgdat_span(struct pglist_data *pgdat)
@@ -515,7 +397,7 @@ void __ref remove_pfn_range_from_zone(struct zone *zone,
{
const unsigned long end_pfn = start_pfn + nr_pages;
struct pglist_data *pgdat = zone->zone_pgdat;
- unsigned long pfn, cur_nr_pages, flags;
+ unsigned long pfn, cur_nr_pages;
/* Poison struct pages because they are now uninitialized again. */
for (pfn = start_pfn; pfn < end_pfn; pfn += cur_nr_pages) {
@@ -540,10 +422,8 @@ void __ref remove_pfn_range_from_zone(struct zone *zone,
clear_zone_contiguous(zone);
- pgdat_resize_lock(zone->zone_pgdat, &flags);
shrink_zone_span(zone, start_pfn, start_pfn + nr_pages);
update_pgdat_span(pgdat);
- pgdat_resize_unlock(zone->zone_pgdat, &flags);
set_zone_contiguous(zone);
}
@@ -750,19 +630,13 @@ void __ref move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
{
struct pglist_data *pgdat = zone->zone_pgdat;
int nid = pgdat->node_id;
- unsigned long flags;
clear_zone_contiguous(zone);
- /* TODO Huh pgdat is irqsave while zone is not. It used to be like that before */
- pgdat_resize_lock(pgdat, &flags);
- zone_span_writelock(zone);
if (zone_is_empty(zone))
init_currently_empty_zone(zone, start_pfn, nr_pages);
resize_zone_range(zone, start_pfn, nr_pages);
- zone_span_writeunlock(zone);
resize_pgdat_range(pgdat, start_pfn, nr_pages);
- pgdat_resize_unlock(pgdat, &flags);
/*
* Subsection population requires care in pfn_to_online_page().
@@ -852,12 +726,8 @@ struct zone *zone_for_pfn_range(int online_type, int nid, unsigned start_pfn,
*/
void adjust_present_page_count(struct zone *zone, long nr_pages)
{
- unsigned long flags;
-
zone->present_pages += nr_pages;
- pgdat_resize_lock(zone->zone_pgdat, &flags);
zone->zone_pgdat->node_present_pages += nr_pages;
- pgdat_resize_unlock(zone->zone_pgdat, &flags);
}
int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages,
@@ -913,7 +783,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, struct zone *z
/*
* {on,off}lining is constrained to full memory sections (or more
- * precisly to memory blocks from the user space POV).
+ * precisely to memory blocks from the user space POV).
* memmap_on_memory is an exception because it reserves initial part
* of the physical memory space for vmemmaps. That space is pageblock
* aligned.
@@ -1072,8 +942,8 @@ static void rollback_node_hotadd(int nid)
}
-/**
- * try_online_node - online a node if offlined
+/*
+ * __try_online_node - online a node if offlined
* @nid: the node ID
* @set_node_online: Whether we want to online the node
* called by cpu_up() to online a node without onlined memory.
@@ -1172,6 +1042,7 @@ bool mhp_supports_memmap_on_memory(unsigned long size)
* populate a single PMD.
*/
return memmap_on_memory &&
+ !hugetlb_free_vmemmap_enabled &&
IS_ENABLED(CONFIG_MHP_MEMMAP_ON_MEMORY) &&
size == memory_block_size_bytes() &&
IS_ALIGNED(vmemmap_size, PMD_SIZE) &&
@@ -1521,6 +1392,8 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
struct page *page, *head;
int ret = 0;
LIST_HEAD(source);
+ static DEFINE_RATELIMIT_STATE(migrate_rs, DEFAULT_RATELIMIT_INTERVAL,
+ DEFAULT_RATELIMIT_BURST);
for (pfn = start_pfn; pfn < end_pfn; pfn++) {
if (!pfn_valid(pfn))
@@ -1567,8 +1440,10 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
page_is_file_lru(page));
} else {
- pr_warn("failed to isolate pfn %lx\n", pfn);
- dump_page(page, "isolation failed");
+ if (__ratelimit(&migrate_rs)) {
+ pr_warn("failed to isolate pfn %lx\n", pfn);
+ dump_page(page, "isolation failed");
+ }
}
put_page(page);
}
@@ -1597,9 +1472,11 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
(unsigned long)&mtc, MIGRATE_SYNC, MR_MEMORY_HOTPLUG);
if (ret) {
list_for_each_entry(page, &source, lru) {
- pr_warn("migrating pfn %lx failed ret:%d ",
- page_to_pfn(page), ret);
- dump_page(page, "migration failure");
+ if (__ratelimit(&migrate_rs)) {
+ pr_warn("migrating pfn %lx failed ret:%d\n",
+ page_to_pfn(page), ret);
+ dump_page(page, "migration failure");
+ }
}
putback_movable_pages(&source);
}
@@ -1703,7 +1580,7 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages)
/*
* {on,off}lining is constrained to full memory sections (or more
- * precisly to memory blocks from the user space POV).
+ * precisely to memory blocks from the user space POV).
* memmap_on_memory is an exception because it reserves initial part
* of the physical memory space for vmemmaps. That space is pageblock
* aligned.
@@ -2031,7 +1908,7 @@ static int __ref try_remove_memory(int nid, u64 start, u64 size)
}
/**
- * remove_memory
+ * __remove_memory - Remove memory if every memory block is offline
* @nid: the node ID
* @start: physical address of the region to remove
* @size: size of the region to remove
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index b5d95bf1025d..e32360e90274 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -121,8 +121,7 @@ enum zone_type policy_zone = 0;
*/
static struct mempolicy default_policy = {
.refcnt = ATOMIC_INIT(1), /* never free it */
- .mode = MPOL_PREFERRED,
- .flags = MPOL_F_LOCAL,
+ .mode = MPOL_LOCAL,
};
static struct mempolicy preferred_node_policy[MAX_NUMNODES];
@@ -194,18 +193,17 @@ static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
{
if (nodes_empty(*nodes))
return -EINVAL;
- pol->v.nodes = *nodes;
+ pol->nodes = *nodes;
return 0;
}
static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
{
- if (!nodes)
- pol->flags |= MPOL_F_LOCAL; /* local allocation */
- else if (nodes_empty(*nodes))
- return -EINVAL; /* no allowed nodes */
- else
- pol->v.preferred_node = first_node(*nodes);
+ if (nodes_empty(*nodes))
+ return -EINVAL;
+
+ nodes_clear(pol->nodes);
+ node_set(first_node(*nodes), pol->nodes);
return 0;
}
@@ -213,15 +211,14 @@ static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
{
if (nodes_empty(*nodes))
return -EINVAL;
- pol->v.nodes = *nodes;
+ pol->nodes = *nodes;
return 0;
}
/*
* mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
* any, for the new policy. mpol_new() has already validated the nodes
- * parameter with respect to the policy mode and flags. But, we need to
- * handle an empty nodemask with MPOL_PREFERRED here.
+ * parameter with respect to the policy mode and flags.
*
* Must be called holding task's alloc_lock to protect task's mems_allowed
* and mempolicy. May also be called holding the mmap_lock for write.
@@ -231,33 +228,31 @@ static int mpol_set_nodemask(struct mempolicy *pol,
{
int ret;
- /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
- if (pol == NULL)
+ /*
+ * Default (pol==NULL) resp. local memory policies are not a
+ * subject of any remapping. They also do not need any special
+ * constructor.
+ */
+ if (!pol || pol->mode == MPOL_LOCAL)
return 0;
+
/* Check N_MEMORY */
nodes_and(nsc->mask1,
cpuset_current_mems_allowed, node_states[N_MEMORY]);
VM_BUG_ON(!nodes);
- if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
- nodes = NULL; /* explicit local allocation */
- else {
- if (pol->flags & MPOL_F_RELATIVE_NODES)
- mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
- else
- nodes_and(nsc->mask2, *nodes, nsc->mask1);
- if (mpol_store_user_nodemask(pol))
- pol->w.user_nodemask = *nodes;
- else
- pol->w.cpuset_mems_allowed =
- cpuset_current_mems_allowed;
- }
+ if (pol->flags & MPOL_F_RELATIVE_NODES)
+ mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
+ else
+ nodes_and(nsc->mask2, *nodes, nsc->mask1);
- if (nodes)
- ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
+ if (mpol_store_user_nodemask(pol))
+ pol->w.user_nodemask = *nodes;
else
- ret = mpol_ops[pol->mode].create(pol, NULL);
+ pol->w.cpuset_mems_allowed = cpuset_current_mems_allowed;
+
+ ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
return ret;
}
@@ -290,13 +285,14 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
if (((flags & MPOL_F_STATIC_NODES) ||
(flags & MPOL_F_RELATIVE_NODES)))
return ERR_PTR(-EINVAL);
+
+ mode = MPOL_LOCAL;
}
} else if (mode == MPOL_LOCAL) {
if (!nodes_empty(*nodes) ||
(flags & MPOL_F_STATIC_NODES) ||
(flags & MPOL_F_RELATIVE_NODES))
return ERR_PTR(-EINVAL);
- mode = MPOL_PREFERRED;
} else if (nodes_empty(*nodes))
return ERR_PTR(-EINVAL);
policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
@@ -330,7 +326,7 @@ static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
else if (pol->flags & MPOL_F_RELATIVE_NODES)
mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
else {
- nodes_remap(tmp, pol->v.nodes, pol->w.cpuset_mems_allowed,
+ nodes_remap(tmp, pol->nodes, pol->w.cpuset_mems_allowed,
*nodes);
pol->w.cpuset_mems_allowed = *nodes;
}
@@ -338,31 +334,13 @@ static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
if (nodes_empty(tmp))
tmp = *nodes;
- pol->v.nodes = tmp;
+ pol->nodes = tmp;
}
static void mpol_rebind_preferred(struct mempolicy *pol,
const nodemask_t *nodes)
{
- nodemask_t tmp;
-
- if (pol->flags & MPOL_F_STATIC_NODES) {
- int node = first_node(pol->w.user_nodemask);
-
- if (node_isset(node, *nodes)) {
- pol->v.preferred_node = node;
- pol->flags &= ~MPOL_F_LOCAL;
- } else
- pol->flags |= MPOL_F_LOCAL;
- } else if (pol->flags & MPOL_F_RELATIVE_NODES) {
- mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
- pol->v.preferred_node = first_node(tmp);
- } else if (!(pol->flags & MPOL_F_LOCAL)) {
- pol->v.preferred_node = node_remap(pol->v.preferred_node,
- pol->w.cpuset_mems_allowed,
- *nodes);
- pol->w.cpuset_mems_allowed = *nodes;
- }
+ pol->w.cpuset_mems_allowed = *nodes;
}
/*
@@ -376,7 +354,7 @@ static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
{
if (!pol)
return;
- if (!mpol_store_user_nodemask(pol) && !(pol->flags & MPOL_F_LOCAL) &&
+ if (!mpol_store_user_nodemask(pol) &&
nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
return;
@@ -427,6 +405,9 @@ static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
.create = mpol_new_bind,
.rebind = mpol_rebind_nodemask,
},
+ [MPOL_LOCAL] = {
+ .rebind = mpol_rebind_default,
+ },
};
static int migrate_page_add(struct page *page, struct list_head *pagelist,
@@ -458,7 +439,8 @@ static inline bool queue_pages_required(struct page *page,
/*
* queue_pages_pmd() has four possible return values:
- * 0 - pages are placed on the right node or queued successfully.
+ * 0 - pages are placed on the right node or queued successfully, or
+ * special page is met, i.e. huge zero page.
* 1 - there is unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
* specified.
* 2 - THP was split.
@@ -482,8 +464,7 @@ static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr,
page = pmd_page(*pmd);
if (is_huge_zero_page(page)) {
spin_unlock(ptl);
- __split_huge_pmd(walk->vma, pmd, addr, false, NULL);
- ret = 2;
+ walk->action = ACTION_CONTINUE;
goto out;
}
if (!queue_pages_required(page, qp))
@@ -510,7 +491,8 @@ out:
* and move them to the pagelist if they do.
*
* queue_pages_pte_range() has three possible return values:
- * 0 - pages are placed on the right node or queued successfully.
+ * 0 - pages are placed on the right node or queued successfully, or
+ * special page is met, i.e. zero page.
* 1 - there is unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
* specified.
* -EIO - only MPOL_MF_STRICT was specified and an existing page was already
@@ -917,12 +899,11 @@ static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
switch (p->mode) {
case MPOL_BIND:
case MPOL_INTERLEAVE:
- *nodes = p->v.nodes;
- break;
case MPOL_PREFERRED:
- if (!(p->flags & MPOL_F_LOCAL))
- node_set(p->v.preferred_node, *nodes);
- /* else return empty node mask for local allocation */
+ *nodes = p->nodes;
+ break;
+ case MPOL_LOCAL:
+ /* return empty node mask for local allocation */
break;
default:
BUG();
@@ -1007,7 +988,7 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
*policy = err;
} else if (pol == current->mempolicy &&
pol->mode == MPOL_INTERLEAVE) {
- *policy = next_node_in(current->il_prev, pol->v.nodes);
+ *policy = next_node_in(current->il_prev, pol->nodes);
} else {
err = -EINVAL;
goto out;
@@ -1460,26 +1441,38 @@ static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
}
+/* Basic parameter sanity check used by both mbind() and set_mempolicy() */
+static inline int sanitize_mpol_flags(int *mode, unsigned short *flags)
+{
+ *flags = *mode & MPOL_MODE_FLAGS;
+ *mode &= ~MPOL_MODE_FLAGS;
+ if ((unsigned int)(*mode) >= MPOL_MAX)
+ return -EINVAL;
+ if ((*flags & MPOL_F_STATIC_NODES) && (*flags & MPOL_F_RELATIVE_NODES))
+ return -EINVAL;
+
+ return 0;
+}
+
static long kernel_mbind(unsigned long start, unsigned long len,
unsigned long mode, const unsigned long __user *nmask,
unsigned long maxnode, unsigned int flags)
{
+ unsigned short mode_flags;
nodemask_t nodes;
+ int lmode = mode;
int err;
- unsigned short mode_flags;
start = untagged_addr(start);
- mode_flags = mode & MPOL_MODE_FLAGS;
- mode &= ~MPOL_MODE_FLAGS;
- if (mode >= MPOL_MAX)
- return -EINVAL;
- if ((mode_flags & MPOL_F_STATIC_NODES) &&
- (mode_flags & MPOL_F_RELATIVE_NODES))
- return -EINVAL;
+ err = sanitize_mpol_flags(&lmode, &mode_flags);
+ if (err)
+ return err;
+
err = get_nodes(&nodes, nmask, maxnode);
if (err)
return err;
- return do_mbind(start, len, mode, mode_flags, &nodes, flags);
+
+ return do_mbind(start, len, lmode, mode_flags, &nodes, flags);
}
SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
@@ -1493,20 +1486,20 @@ SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
static long kernel_set_mempolicy(int mode, const unsigned long __user *nmask,
unsigned long maxnode)
{
- int err;
+ unsigned short mode_flags;
nodemask_t nodes;
- unsigned short flags;
+ int lmode = mode;
+ int err;
+
+ err = sanitize_mpol_flags(&lmode, &mode_flags);
+ if (err)
+ return err;
- flags = mode & MPOL_MODE_FLAGS;
- mode &= ~MPOL_MODE_FLAGS;
- if ((unsigned int)mode >= MPOL_MAX)
- return -EINVAL;
- if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
- return -EINVAL;
err = get_nodes(&nodes, nmask, maxnode);
if (err)
return err;
- return do_set_mempolicy(mode, flags, &nodes);
+
+ return do_set_mempolicy(lmode, mode_flags, &nodes);
}
SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
@@ -1863,14 +1856,14 @@ static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
/*
- * if policy->v.nodes has movable memory only,
+ * if policy->nodes has movable memory only,
* we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only.
*
- * policy->v.nodes is intersect with node_states[N_MEMORY].
+ * policy->nodes is intersect with node_states[N_MEMORY].
* so if the following test fails, it implies
- * policy->v.nodes has movable memory only.
+ * policy->nodes has movable memory only.
*/
- if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY]))
+ if (!nodes_intersects(policy->nodes, node_states[N_HIGH_MEMORY]))
dynamic_policy_zone = ZONE_MOVABLE;
return zone >= dynamic_policy_zone;
@@ -1885,8 +1878,8 @@ nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
/* Lower zones don't get a nodemask applied for MPOL_BIND */
if (unlikely(policy->mode == MPOL_BIND) &&
apply_policy_zone(policy, gfp_zone(gfp)) &&
- cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
- return &policy->v.nodes;
+ cpuset_nodemask_valid_mems_allowed(&policy->nodes))
+ return &policy->nodes;
return NULL;
}
@@ -1894,9 +1887,9 @@ nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
/* Return the node id preferred by the given mempolicy, or the given id */
static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd)
{
- if (policy->mode == MPOL_PREFERRED && !(policy->flags & MPOL_F_LOCAL))
- nd = policy->v.preferred_node;
- else {
+ if (policy->mode == MPOL_PREFERRED) {
+ nd = first_node(policy->nodes);
+ } else {
/*
* __GFP_THISNODE shouldn't even be used with the bind policy
* because we might easily break the expectation to stay on the
@@ -1914,7 +1907,7 @@ static unsigned interleave_nodes(struct mempolicy *policy)
unsigned next;
struct task_struct *me = current;
- next = next_node_in(me->il_prev, policy->v.nodes);
+ next = next_node_in(me->il_prev, policy->nodes);
if (next < MAX_NUMNODES)
me->il_prev = next;
return next;
@@ -1933,15 +1926,12 @@ unsigned int mempolicy_slab_node(void)
return node;
policy = current->mempolicy;
- if (!policy || policy->flags & MPOL_F_LOCAL)
+ if (!policy)
return node;
switch (policy->mode) {
case MPOL_PREFERRED:
- /*
- * handled MPOL_F_LOCAL above
- */
- return policy->v.preferred_node;
+ return first_node(policy->nodes);
case MPOL_INTERLEAVE:
return interleave_nodes(policy);
@@ -1957,9 +1947,11 @@ unsigned int mempolicy_slab_node(void)
enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK];
z = first_zones_zonelist(zonelist, highest_zoneidx,
- &policy->v.nodes);
+ &policy->nodes);
return z->zone ? zone_to_nid(z->zone) : node;
}
+ case MPOL_LOCAL:
+ return node;
default:
BUG();
@@ -1968,12 +1960,12 @@ unsigned int mempolicy_slab_node(void)
/*
* Do static interleaving for a VMA with known offset @n. Returns the n'th
- * node in pol->v.nodes (starting from n=0), wrapping around if n exceeds the
+ * node in pol->nodes (starting from n=0), wrapping around if n exceeds the
* number of present nodes.
*/
static unsigned offset_il_node(struct mempolicy *pol, unsigned long n)
{
- unsigned nnodes = nodes_weight(pol->v.nodes);
+ unsigned nnodes = nodes_weight(pol->nodes);
unsigned target;
int i;
int nid;
@@ -1981,9 +1973,9 @@ static unsigned offset_il_node(struct mempolicy *pol, unsigned long n)
if (!nnodes)
return numa_node_id();
target = (unsigned int)n % nnodes;
- nid = first_node(pol->v.nodes);
+ nid = first_node(pol->nodes);
for (i = 0; i < target; i++)
- nid = next_node(nid, pol->v.nodes);
+ nid = next_node(nid, pol->nodes);
return nid;
}
@@ -2039,7 +2031,7 @@ int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
} else {
nid = policy_node(gfp_flags, *mpol, numa_node_id());
if ((*mpol)->mode == MPOL_BIND)
- *nodemask = &(*mpol)->v.nodes;
+ *nodemask = &(*mpol)->nodes;
}
return nid;
}
@@ -2063,7 +2055,6 @@ int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
bool init_nodemask_of_mempolicy(nodemask_t *mask)
{
struct mempolicy *mempolicy;
- int nid;
if (!(mask && current->mempolicy))
return false;
@@ -2072,16 +2063,13 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask)
mempolicy = current->mempolicy;
switch (mempolicy->mode) {
case MPOL_PREFERRED:
- if (mempolicy->flags & MPOL_F_LOCAL)
- nid = numa_node_id();
- else
- nid = mempolicy->v.preferred_node;
- init_nodemask_of_node(mask, nid);
- break;
-
case MPOL_BIND:
case MPOL_INTERLEAVE:
- *mask = mempolicy->v.nodes;
+ *mask = mempolicy->nodes;
+ break;
+
+ case MPOL_LOCAL:
+ init_nodemask_of_node(mask, numa_node_id());
break;
default:
@@ -2094,16 +2082,16 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask)
#endif
/*
- * mempolicy_nodemask_intersects
+ * mempolicy_in_oom_domain
*
- * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
- * policy. Otherwise, check for intersection between mask and the policy
- * nodemask for 'bind' or 'interleave' policy. For 'preferred' or 'local'
- * policy, always return true since it may allocate elsewhere on fallback.
+ * If tsk's mempolicy is "bind", check for intersection between mask and
+ * the policy nodemask. Otherwise, return true for all other policies
+ * including "interleave", as a tsk with "interleave" policy may have
+ * memory allocated from all nodes in system.
*
* Takes task_lock(tsk) to prevent freeing of its mempolicy.
*/
-bool mempolicy_nodemask_intersects(struct task_struct *tsk,
+bool mempolicy_in_oom_domain(struct task_struct *tsk,
const nodemask_t *mask)
{
struct mempolicy *mempolicy;
@@ -2111,29 +2099,13 @@ bool mempolicy_nodemask_intersects(struct task_struct *tsk,
if (!mask)
return ret;
+
task_lock(tsk);
mempolicy = tsk->mempolicy;
- if (!mempolicy)
- goto out;
-
- switch (mempolicy->mode) {
- case MPOL_PREFERRED:
- /*
- * MPOL_PREFERRED and MPOL_F_LOCAL are only preferred nodes to
- * allocate from, they may fallback to other nodes when oom.
- * Thus, it's possible for tsk to have allocated memory from
- * nodes in mask.
- */
- break;
- case MPOL_BIND:
- case MPOL_INTERLEAVE:
- ret = nodes_intersects(mempolicy->v.nodes, *mask);
- break;
- default:
- BUG();
- }
-out:
+ if (mempolicy && mempolicy->mode == MPOL_BIND)
+ ret = nodes_intersects(mempolicy->nodes, *mask);
task_unlock(tsk);
+
return ret;
}
@@ -2204,8 +2176,8 @@ struct page *alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
* If the policy is interleave, or does not allow the current
* node in its nodemask, we allocate the standard way.
*/
- if (pol->mode == MPOL_PREFERRED && !(pol->flags & MPOL_F_LOCAL))
- hpage_node = pol->v.preferred_node;
+ if (pol->mode == MPOL_PREFERRED)
+ hpage_node = first_node(pol->nodes);
nmask = policy_nodemask(gfp, pol);
if (!nmask || node_isset(hpage_node, *nmask)) {
@@ -2338,12 +2310,10 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
switch (a->mode) {
case MPOL_BIND:
case MPOL_INTERLEAVE:
- return !!nodes_equal(a->v.nodes, b->v.nodes);
case MPOL_PREFERRED:
- /* a's ->flags is the same as b's */
- if (a->flags & MPOL_F_LOCAL)
- return true;
- return a->v.preferred_node == b->v.preferred_node;
+ return !!nodes_equal(a->nodes, b->nodes);
+ case MPOL_LOCAL:
+ return true;
default:
BUG();
return false;
@@ -2481,16 +2451,17 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
break;
case MPOL_PREFERRED:
- if (pol->flags & MPOL_F_LOCAL)
- polnid = numa_node_id();
- else
- polnid = pol->v.preferred_node;
+ polnid = first_node(pol->nodes);
+ break;
+
+ case MPOL_LOCAL:
+ polnid = numa_node_id();
break;
case MPOL_BIND:
/* Optimize placement among multiple nodes via NUMA balancing */
if (pol->flags & MPOL_F_MORON) {
- if (node_isset(thisnid, pol->v.nodes))
+ if (node_isset(thisnid, pol->nodes))
break;
goto out;
}
@@ -2501,12 +2472,12 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
* else select nearest allowed node, if any.
* If no allowed nodes, use current [!misplaced].
*/
- if (node_isset(curnid, pol->v.nodes))
+ if (node_isset(curnid, pol->nodes))
goto out;
z = first_zones_zonelist(
node_zonelist(numa_node_id(), GFP_HIGHUSER),
gfp_zone(GFP_HIGHUSER),
- &pol->v.nodes);
+ &pol->nodes);
polnid = zone_to_nid(z->zone);
break;
@@ -2709,7 +2680,7 @@ int mpol_set_shared_policy(struct shared_policy *info,
vma->vm_pgoff,
sz, npol ? npol->mode : -1,
npol ? npol->flags : -1,
- npol ? nodes_addr(npol->v.nodes)[0] : NUMA_NO_NODE);
+ npol ? nodes_addr(npol->nodes)[0] : NUMA_NO_NODE);
if (npol) {
new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
@@ -2807,7 +2778,7 @@ void __init numa_policy_init(void)
.refcnt = ATOMIC_INIT(1),
.mode = MPOL_PREFERRED,
.flags = MPOL_F_MOF | MPOL_F_MORON,
- .v = { .preferred_node = nid, },
+ .nodes = nodemask_of_node(nid),
};
}
@@ -2851,9 +2822,6 @@ void numa_default_policy(void)
* Parse and format mempolicy from/to strings
*/
-/*
- * "local" is implemented internally by MPOL_PREFERRED with MPOL_F_LOCAL flag.
- */
static const char * const policy_modes[] =
{
[MPOL_DEFAULT] = "default",
@@ -2931,7 +2899,6 @@ int mpol_parse_str(char *str, struct mempolicy **mpol)
*/
if (nodelist)
goto out;
- mode = MPOL_PREFERRED;
break;
case MPOL_DEFAULT:
/*
@@ -2970,12 +2937,14 @@ int mpol_parse_str(char *str, struct mempolicy **mpol)
* Save nodes for mpol_to_str() to show the tmpfs mount options
* for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
*/
- if (mode != MPOL_PREFERRED)
- new->v.nodes = nodes;
- else if (nodelist)
- new->v.preferred_node = first_node(nodes);
- else
- new->flags |= MPOL_F_LOCAL;
+ if (mode != MPOL_PREFERRED) {
+ new->nodes = nodes;
+ } else if (nodelist) {
+ nodes_clear(new->nodes);
+ node_set(first_node(nodes), new->nodes);
+ } else {
+ new->mode = MPOL_LOCAL;
+ }
/*
* Save nodes for contextualization: this will be used to "clone"
@@ -3021,16 +2990,12 @@ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
switch (mode) {
case MPOL_DEFAULT:
+ case MPOL_LOCAL:
break;
case MPOL_PREFERRED:
- if (flags & MPOL_F_LOCAL)
- mode = MPOL_LOCAL;
- else
- node_set(pol->v.preferred_node, nodes);
- break;
case MPOL_BIND:
case MPOL_INTERLEAVE:
- nodes = pol->v.nodes;
+ nodes = pol->nodes;
break;
default:
WARN_ON_ONCE(1);
diff --git a/mm/migrate.c b/mm/migrate.c
index 380ca57b9031..23cbd9de030b 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -210,13 +210,18 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma,
* Recheck VMA as permissions can change since migration started
*/
entry = pte_to_swp_entry(*pvmw.pte);
- if (is_write_migration_entry(entry))
+ if (is_writable_migration_entry(entry))
pte = maybe_mkwrite(pte, vma);
else if (pte_swp_uffd_wp(*pvmw.pte))
pte = pte_mkuffd_wp(pte);
if (unlikely(is_device_private_page(new))) {
- entry = make_device_private_entry(new, pte_write(pte));
+ if (pte_write(pte))
+ entry = make_writable_device_private_entry(
+ page_to_pfn(new));
+ else
+ entry = make_readable_device_private_entry(
+ page_to_pfn(new));
pte = swp_entry_to_pte(entry);
if (pte_swp_soft_dirty(*pvmw.pte))
pte = pte_swp_mksoft_dirty(pte);
@@ -226,8 +231,10 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma,
#ifdef CONFIG_HUGETLB_PAGE
if (PageHuge(new)) {
+ unsigned int shift = huge_page_shift(hstate_vma(vma));
+
pte = pte_mkhuge(pte);
- pte = arch_make_huge_pte(pte, vma, new, 0);
+ pte = arch_make_huge_pte(pte, shift, vma->vm_flags);
set_huge_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte);
if (PageAnon(new))
hugepage_add_anon_rmap(new, vma, pvmw.address);
@@ -294,7 +301,7 @@ void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
if (!is_migration_entry(entry))
goto out;
- page = migration_entry_to_page(entry);
+ page = pfn_swap_entry_to_page(entry);
page = compound_head(page);
/*
@@ -335,7 +342,7 @@ void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd)
ptl = pmd_lock(mm, pmd);
if (!is_pmd_migration_entry(*pmd))
goto unlock;
- page = migration_entry_to_page(pmd_to_swp_entry(*pmd));
+ page = pfn_swap_entry_to_page(pmd_to_swp_entry(*pmd));
if (!get_page_unless_zero(page))
goto unlock;
spin_unlock(ptl);
@@ -551,7 +558,7 @@ static void __copy_gigantic_page(struct page *dst, struct page *src,
}
}
-static void copy_huge_page(struct page *dst, struct page *src)
+void copy_huge_page(struct page *dst, struct page *src)
{
int i;
int nr_pages;
@@ -626,7 +633,10 @@ void migrate_page_states(struct page *newpage, struct page *page)
if (PageSwapCache(page))
ClearPageSwapCache(page);
ClearPagePrivate(page);
- set_page_private(page, 0);
+
+ /* page->private contains hugetlb specific flags */
+ if (!PageHuge(page))
+ set_page_private(page, 0);
/*
* If any waiters have accumulated on the new page then
@@ -1099,7 +1109,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
/* Establish migration ptes */
VM_BUG_ON_PAGE(PageAnon(page) && !PageKsm(page) && !anon_vma,
page);
- try_to_unmap(page, TTU_MIGRATION|TTU_IGNORE_MLOCK);
+ try_to_migrate(page, 0);
page_was_mapped = 1;
}
@@ -1288,7 +1298,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
* page_mapping() set, hugetlbfs specific move page routine will not
* be called and we could leak usage counts for subpools.
*/
- if (page_private(hpage) && !page_mapping(hpage)) {
+ if (hugetlb_page_subpool(hpage) && !page_mapping(hpage)) {
rc = -EBUSY;
goto out_unlock;
}
@@ -1301,7 +1311,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
if (page_mapped(hpage)) {
bool mapping_locked = false;
- enum ttu_flags ttu = TTU_MIGRATION|TTU_IGNORE_MLOCK;
+ enum ttu_flags ttu = 0;
if (!PageAnon(hpage)) {
/*
@@ -1318,7 +1328,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
ttu |= TTU_RMAP_LOCKED;
}
- try_to_unmap(hpage, ttu);
+ try_to_migrate(hpage, ttu);
page_was_mapped = 1;
if (mapping_locked)
@@ -1418,6 +1428,7 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
int swapwrite = current->flags & PF_SWAPWRITE;
int rc, nr_subpages;
LIST_HEAD(ret_pages);
+ bool nosplit = (reason == MR_NUMA_MISPLACED);
trace_mm_migrate_pages_start(mode, reason);
@@ -1489,8 +1500,9 @@ retry:
/*
* When memory is low, don't bother to try to migrate
* other pages, just exit.
+ * THP NUMA faulting doesn't split THP to retry.
*/
- if (is_thp) {
+ if (is_thp && !nosplit) {
if (!try_split_thp(page, &page2, from)) {
nr_thp_split++;
goto retry;
@@ -2043,12 +2055,33 @@ static struct page *alloc_misplaced_dst_page(struct page *page,
return newpage;
}
+static struct page *alloc_misplaced_dst_page_thp(struct page *page,
+ unsigned long data)
+{
+ int nid = (int) data;
+ struct page *newpage;
+
+ newpage = alloc_pages_node(nid, (GFP_TRANSHUGE_LIGHT | __GFP_THISNODE),
+ HPAGE_PMD_ORDER);
+ if (!newpage)
+ goto out;
+
+ prep_transhuge_page(newpage);
+
+out:
+ return newpage;
+}
+
static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
{
int page_lru;
VM_BUG_ON_PAGE(compound_order(page) && !PageTransHuge(page), page);
+ /* Do not migrate THP mapped by multiple processes */
+ if (PageTransHuge(page) && total_mapcount(page) > 1)
+ return 0;
+
/* Avoid migrating to a node that is nearly full */
if (!migrate_balanced_pgdat(pgdat, compound_nr(page)))
return 0;
@@ -2056,18 +2089,6 @@ static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
if (isolate_lru_page(page))
return 0;
- /*
- * migrate_misplaced_transhuge_page() skips page migration's usual
- * check on page_count(), so we must do it here, now that the page
- * has been isolated: a GUP pin, or any other pin, prevents migration.
- * The expected page count is 3: 1 for page's mapcount and 1 for the
- * caller's pin and 1 for the reference taken by isolate_lru_page().
- */
- if (PageTransHuge(page) && page_count(page) != 3) {
- putback_lru_page(page);
- return 0;
- }
-
page_lru = page_is_file_lru(page);
mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + page_lru,
thp_nr_pages(page));
@@ -2081,12 +2102,6 @@ static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
return 1;
}
-bool pmd_trans_migrating(pmd_t pmd)
-{
- struct page *page = pmd_page(pmd);
- return PageLocked(page);
-}
-
/*
* Attempt to migrate a misplaced page to the specified destination
* node. Caller is expected to have an elevated reference count on
@@ -2099,6 +2114,21 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
int isolated;
int nr_remaining;
LIST_HEAD(migratepages);
+ new_page_t *new;
+ bool compound;
+ unsigned int nr_pages = thp_nr_pages(page);
+
+ /*
+ * PTE mapped THP or HugeTLB page can't reach here so the page could
+ * be either base page or THP. And it must be head page if it is
+ * THP.
+ */
+ compound = PageTransHuge(page);
+
+ if (compound)
+ new = alloc_misplaced_dst_page_thp;
+ else
+ new = alloc_misplaced_dst_page;
/*
* Don't migrate file pages that are mapped in multiple processes
@@ -2120,19 +2150,18 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
goto out;
list_add(&page->lru, &migratepages);
- nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page,
- NULL, node, MIGRATE_ASYNC,
- MR_NUMA_MISPLACED);
+ nr_remaining = migrate_pages(&migratepages, *new, NULL, node,
+ MIGRATE_ASYNC, MR_NUMA_MISPLACED);
if (nr_remaining) {
if (!list_empty(&migratepages)) {
list_del(&page->lru);
- dec_node_page_state(page, NR_ISOLATED_ANON +
- page_is_file_lru(page));
+ mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
+ page_is_file_lru(page), -nr_pages);
putback_lru_page(page);
}
isolated = 0;
} else
- count_vm_numa_event(NUMA_PAGE_MIGRATE);
+ count_vm_numa_events(NUMA_PAGE_MIGRATE, nr_pages);
BUG_ON(!list_empty(&migratepages));
return isolated;
@@ -2141,141 +2170,6 @@ out:
return 0;
}
#endif /* CONFIG_NUMA_BALANCING */
-
-#if defined(CONFIG_NUMA_BALANCING) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
-/*
- * Migrates a THP to a given target node. page must be locked and is unlocked
- * before returning.
- */
-int migrate_misplaced_transhuge_page(struct mm_struct *mm,
- struct vm_area_struct *vma,
- pmd_t *pmd, pmd_t entry,
- unsigned long address,
- struct page *page, int node)
-{
- spinlock_t *ptl;
- pg_data_t *pgdat = NODE_DATA(node);
- int isolated = 0;
- struct page *new_page = NULL;
- int page_lru = page_is_file_lru(page);
- unsigned long start = address & HPAGE_PMD_MASK;
-
- new_page = alloc_pages_node(node,
- (GFP_TRANSHUGE_LIGHT | __GFP_THISNODE),
- HPAGE_PMD_ORDER);
- if (!new_page)
- goto out_fail;
- prep_transhuge_page(new_page);
-
- isolated = numamigrate_isolate_page(pgdat, page);
- if (!isolated) {
- put_page(new_page);
- goto out_fail;
- }
-
- /* Prepare a page as a migration target */
- __SetPageLocked(new_page);
- if (PageSwapBacked(page))
- __SetPageSwapBacked(new_page);
-
- /* anon mapping, we can simply copy page->mapping to the new page: */
- new_page->mapping = page->mapping;
- new_page->index = page->index;
- /* flush the cache before copying using the kernel virtual address */
- flush_cache_range(vma, start, start + HPAGE_PMD_SIZE);
- migrate_page_copy(new_page, page);
- WARN_ON(PageLRU(new_page));
-
- /* Recheck the target PMD */
- ptl = pmd_lock(mm, pmd);
- if (unlikely(!pmd_same(*pmd, entry) || !page_ref_freeze(page, 2))) {
- spin_unlock(ptl);
-
- /* Reverse changes made by migrate_page_copy() */
- if (TestClearPageActive(new_page))
- SetPageActive(page);
- if (TestClearPageUnevictable(new_page))
- SetPageUnevictable(page);
-
- unlock_page(new_page);
- put_page(new_page); /* Free it */
-
- /* Retake the callers reference and putback on LRU */
- get_page(page);
- putback_lru_page(page);
- mod_node_page_state(page_pgdat(page),
- NR_ISOLATED_ANON + page_lru, -HPAGE_PMD_NR);
-
- goto out_unlock;
- }
-
- entry = mk_huge_pmd(new_page, vma->vm_page_prot);
- entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
-
- /*
- * Overwrite the old entry under pagetable lock and establish
- * the new PTE. Any parallel GUP will either observe the old
- * page blocking on the page lock, block on the page table
- * lock or observe the new page. The SetPageUptodate on the
- * new page and page_add_new_anon_rmap guarantee the copy is
- * visible before the pagetable update.
- */
- page_add_anon_rmap(new_page, vma, start, true);
- /*
- * At this point the pmd is numa/protnone (i.e. non present) and the TLB
- * has already been flushed globally. So no TLB can be currently
- * caching this non present pmd mapping. There's no need to clear the
- * pmd before doing set_pmd_at(), nor to flush the TLB after
- * set_pmd_at(). Clearing the pmd here would introduce a race
- * condition against MADV_DONTNEED, because MADV_DONTNEED only holds the
- * mmap_lock for reading. If the pmd is set to NULL at any given time,
- * MADV_DONTNEED won't wait on the pmd lock and it'll skip clearing this
- * pmd.
- */
- set_pmd_at(mm, start, pmd, entry);
- update_mmu_cache_pmd(vma, address, &entry);
-
- page_ref_unfreeze(page, 2);
- mlock_migrate_page(new_page, page);
- page_remove_rmap(page, true);
- set_page_owner_migrate_reason(new_page, MR_NUMA_MISPLACED);
-
- spin_unlock(ptl);
-
- /* Take an "isolate" reference and put new page on the LRU. */
- get_page(new_page);
- putback_lru_page(new_page);
-
- unlock_page(new_page);
- unlock_page(page);
- put_page(page); /* Drop the rmap reference */
- put_page(page); /* Drop the LRU isolation reference */
-
- count_vm_events(PGMIGRATE_SUCCESS, HPAGE_PMD_NR);
- count_vm_numa_events(NUMA_PAGE_MIGRATE, HPAGE_PMD_NR);
-
- mod_node_page_state(page_pgdat(page),
- NR_ISOLATED_ANON + page_lru,
- -HPAGE_PMD_NR);
- return isolated;
-
-out_fail:
- count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
- ptl = pmd_lock(mm, pmd);
- if (pmd_same(*pmd, entry)) {
- entry = pmd_modify(entry, vma->vm_page_prot);
- set_pmd_at(mm, start, pmd, entry);
- update_mmu_cache_pmd(vma, address, &entry);
- }
- spin_unlock(ptl);
-
-out_unlock:
- unlock_page(page);
- put_page(page);
- return 0;
-}
-#endif /* CONFIG_NUMA_BALANCING */
-
#endif /* CONFIG_NUMA */
#ifdef CONFIG_DEVICE_PRIVATE
@@ -2400,7 +2294,7 @@ again:
if (!is_device_private_entry(entry))
goto next;
- page = device_private_entry_to_page(entry);
+ page = pfn_swap_entry_to_page(entry);
if (!(migrate->flags &
MIGRATE_VMA_SELECT_DEVICE_PRIVATE) ||
page->pgmap->owner != migrate->pgmap_owner)
@@ -2408,7 +2302,7 @@ again:
mpfn = migrate_pfn(page_to_pfn(page)) |
MIGRATE_PFN_MIGRATE;
- if (is_write_device_private_entry(entry))
+ if (is_writable_device_private_entry(entry))
mpfn |= MIGRATE_PFN_WRITE;
} else {
if (!(migrate->flags & MIGRATE_VMA_SELECT_SYSTEM))
@@ -2454,8 +2348,12 @@ again:
ptep_get_and_clear(mm, addr, ptep);
/* Setup special migration page table entry */
- entry = make_migration_entry(page, mpfn &
- MIGRATE_PFN_WRITE);
+ if (mpfn & MIGRATE_PFN_WRITE)
+ entry = make_writable_migration_entry(
+ page_to_pfn(page));
+ else
+ entry = make_readable_migration_entry(
+ page_to_pfn(page));
swp_pte = swp_entry_to_pte(entry);
if (pte_present(pte)) {
if (pte_soft_dirty(pte))
@@ -2518,8 +2416,8 @@ static void migrate_vma_collect(struct migrate_vma *migrate)
* that the registered device driver can skip invalidating device
* private page mappings that won't be migrated.
*/
- mmu_notifier_range_init_migrate(&range, 0, migrate->vma,
- migrate->vma->vm_mm, migrate->start, migrate->end,
+ mmu_notifier_range_init_owner(&range, MMU_NOTIFY_MIGRATE, 0,
+ migrate->vma, migrate->vma->vm_mm, migrate->start, migrate->end,
migrate->pgmap_owner);
mmu_notifier_invalidate_range_start(&range);
@@ -2704,7 +2602,6 @@ static void migrate_vma_prepare(struct migrate_vma *migrate)
*/
static void migrate_vma_unmap(struct migrate_vma *migrate)
{
- int flags = TTU_MIGRATION | TTU_IGNORE_MLOCK;
const unsigned long npages = migrate->npages;
const unsigned long start = migrate->start;
unsigned long addr, i, restore = 0;
@@ -2716,7 +2613,7 @@ static void migrate_vma_unmap(struct migrate_vma *migrate)
continue;
if (page_mapped(page)) {
- try_to_unmap(page, flags);
+ try_to_migrate(page, 0);
if (page_mapped(page))
goto restore;
}
@@ -2928,7 +2825,12 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate,
if (is_device_private_page(page)) {
swp_entry_t swp_entry;
- swp_entry = make_device_private_entry(page, vma->vm_flags & VM_WRITE);
+ if (vma->vm_flags & VM_WRITE)
+ swp_entry = make_writable_device_private_entry(
+ page_to_pfn(page));
+ else
+ swp_entry = make_readable_device_private_entry(
+ page_to_pfn(page));
entry = swp_entry_to_pte(swp_entry);
} else {
/*
@@ -3025,9 +2927,9 @@ void migrate_vma_pages(struct migrate_vma *migrate)
if (!notified) {
notified = true;
- mmu_notifier_range_init_migrate(&range, 0,
- migrate->vma, migrate->vma->vm_mm,
- addr, migrate->end,
+ mmu_notifier_range_init_owner(&range,
+ MMU_NOTIFY_MIGRATE, 0, migrate->vma,
+ migrate->vma->vm_mm, addr, migrate->end,
migrate->pgmap_owner);
mmu_notifier_invalidate_range_start(&range);
}
diff --git a/mm/mlock.c b/mm/mlock.c
index e338ebc4ad29..0d639bf48794 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -108,7 +108,7 @@ void mlock_vma_page(struct page *page)
/*
* Finish munlock after successful page isolation
*
- * Page must be locked. This is a wrapper for try_to_munlock()
+ * Page must be locked. This is a wrapper for page_mlock()
* and putback_lru_page() with munlock accounting.
*/
static void __munlock_isolated_page(struct page *page)
@@ -118,7 +118,7 @@ static void __munlock_isolated_page(struct page *page)
* and we don't need to check all the other vmas.
*/
if (page_mapcount(page) > 1)
- try_to_munlock(page);
+ page_mlock(page);
/* Did try_to_unlock() succeed or punt? */
if (!PageMlocked(page))
@@ -158,7 +158,7 @@ static void __munlock_isolation_failed(struct page *page)
* munlock()ed or munmap()ed, we want to check whether other vmas hold the
* page locked so that we can leave it on the unevictable lru list and not
* bother vmscan with it. However, to walk the page's rmap list in
- * try_to_munlock() we must isolate the page from the LRU. If some other
+ * page_mlock() we must isolate the page from the LRU. If some other
* task has removed the page from the LRU, we won't be able to do that.
* So we clear the PageMlocked as we might not get another chance. If we
* can't isolate the page, we leave it for putback_lru_page() and vmscan
@@ -168,7 +168,7 @@ unsigned int munlock_vma_page(struct page *page)
{
int nr_pages;
- /* For try_to_munlock() and to serialize with page migration */
+ /* For page_mlock() and to serialize with page migration */
BUG_ON(!PageLocked(page));
VM_BUG_ON_PAGE(PageTail(page), page);
@@ -205,7 +205,7 @@ static int __mlock_posix_error_return(long retval)
*
* The fast path is available only for evictable pages with single mapping.
* Then we can bypass the per-cpu pvec and get better performance.
- * when mapcount > 1 we need try_to_munlock() which can fail.
+ * when mapcount > 1 we need page_mlock() which can fail.
* when !page_evictable(), we need the full redo logic of putback_lru_page to
* avoid leaving evictable page in unevictable list.
*
@@ -414,7 +414,7 @@ static unsigned long __munlock_pagevec_fill(struct pagevec *pvec,
*
* We don't save and restore VM_LOCKED here because pages are
* still on lru. In unmap path, pages might be scanned by reclaim
- * and re-mlocked by try_to_{munlock|unmap} before we unmap and
+ * and re-mlocked by page_mlock/try_to_unmap before we unmap and
* free them. This will result in freeing mlocked pages.
*/
void munlock_vma_pages_range(struct vm_area_struct *vma,
diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c
index 2ae3f33b85b1..f5852a058ce0 100644
--- a/mm/mmap_lock.c
+++ b/mm/mmap_lock.c
@@ -153,6 +153,37 @@ static inline void put_memcg_path_buf(void)
rcu_read_unlock();
}
+#define TRACE_MMAP_LOCK_EVENT(type, mm, ...) \
+ do { \
+ const char *memcg_path; \
+ preempt_disable(); \
+ memcg_path = get_mm_memcg_path(mm); \
+ trace_mmap_lock_##type(mm, \
+ memcg_path != NULL ? memcg_path : "", \
+ ##__VA_ARGS__); \
+ if (likely(memcg_path != NULL)) \
+ put_memcg_path_buf(); \
+ preempt_enable(); \
+ } while (0)
+
+#else /* !CONFIG_MEMCG */
+
+int trace_mmap_lock_reg(void)
+{
+ return 0;
+}
+
+void trace_mmap_lock_unreg(void)
+{
+}
+
+#define TRACE_MMAP_LOCK_EVENT(type, mm, ...) \
+ trace_mmap_lock_##type(mm, "", ##__VA_ARGS__)
+
+#endif /* CONFIG_MEMCG */
+
+#ifdef CONFIG_TRACING
+#ifdef CONFIG_MEMCG
/*
* Write the given mm_struct's memcg path to a percpu buffer, and return a
* pointer to it. If the path cannot be determined, or no buffer was available
@@ -187,33 +218,6 @@ out:
return buf;
}
-#define TRACE_MMAP_LOCK_EVENT(type, mm, ...) \
- do { \
- const char *memcg_path; \
- local_lock(&memcg_paths.lock); \
- memcg_path = get_mm_memcg_path(mm); \
- trace_mmap_lock_##type(mm, \
- memcg_path != NULL ? memcg_path : "", \
- ##__VA_ARGS__); \
- if (likely(memcg_path != NULL)) \
- put_memcg_path_buf(); \
- local_unlock(&memcg_paths.lock); \
- } while (0)
-
-#else /* !CONFIG_MEMCG */
-
-int trace_mmap_lock_reg(void)
-{
- return 0;
-}
-
-void trace_mmap_lock_unreg(void)
-{
-}
-
-#define TRACE_MMAP_LOCK_EVENT(type, mm, ...) \
- trace_mmap_lock_##type(mm, "", ##__VA_ARGS__)
-
#endif /* CONFIG_MEMCG */
/*
@@ -239,3 +243,4 @@ void __mmap_lock_do_trace_released(struct mm_struct *mm, bool write)
TRACE_MMAP_LOCK_EVENT(released, mm, write);
}
EXPORT_SYMBOL(__mmap_lock_do_trace_released);
+#endif /* CONFIG_TRACING */
diff --git a/mm/mprotect.c b/mm/mprotect.c
index e7a443157988..883e2cc85cad 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -143,26 +143,36 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
swp_entry_t entry = pte_to_swp_entry(oldpte);
pte_t newpte;
- if (is_write_migration_entry(entry)) {
+ if (is_writable_migration_entry(entry)) {
/*
* A protection check is difficult so
* just be safe and disable write
*/
- make_migration_entry_read(&entry);
+ entry = make_readable_migration_entry(
+ swp_offset(entry));
newpte = swp_entry_to_pte(entry);
if (pte_swp_soft_dirty(oldpte))
newpte = pte_swp_mksoft_dirty(newpte);
if (pte_swp_uffd_wp(oldpte))
newpte = pte_swp_mkuffd_wp(newpte);
- } else if (is_write_device_private_entry(entry)) {
+ } else if (is_writable_device_private_entry(entry)) {
/*
* We do not preserve soft-dirtiness. See
* copy_one_pte() for explanation.
*/
- make_device_private_entry_read(&entry);
+ entry = make_readable_device_private_entry(
+ swp_offset(entry));
newpte = swp_entry_to_pte(entry);
if (pte_swp_uffd_wp(oldpte))
newpte = pte_swp_mkuffd_wp(newpte);
+ } else if (is_writable_device_exclusive_entry(entry)) {
+ entry = make_readable_device_exclusive_entry(
+ swp_offset(entry));
+ newpte = swp_entry_to_pte(entry);
+ if (pte_swp_soft_dirty(oldpte))
+ newpte = pte_swp_mksoft_dirty(newpte);
+ if (pte_swp_uffd_wp(oldpte))
+ newpte = pte_swp_mkuffd_wp(newpte);
} else {
newpte = oldpte;
}
diff --git a/mm/nommu.c b/mm/nommu.c
index affda71641ca..3a93d4054810 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -223,7 +223,7 @@ long vread(char *buf, char *addr, unsigned long count)
*/
void *vmalloc(unsigned long size)
{
- return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM);
+ return __vmalloc(size, GFP_KERNEL);
}
EXPORT_SYMBOL(vmalloc);
@@ -241,7 +241,7 @@ EXPORT_SYMBOL(vmalloc);
*/
void *vzalloc(unsigned long size)
{
- return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO);
+ return __vmalloc(size, GFP_KERNEL | __GFP_ZERO);
}
EXPORT_SYMBOL(vzalloc);
@@ -1501,7 +1501,6 @@ erase_whole_vma:
delete_vma(mm, vma);
return 0;
}
-EXPORT_SYMBOL(do_munmap);
int vm_munmap(unsigned long addr, size_t len)
{
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index eefd3f5fde46..fcc29e9a3064 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -104,7 +104,7 @@ static bool oom_cpuset_eligible(struct task_struct *start,
* mempolicy intersects current, otherwise it may be
* needlessly killed.
*/
- ret = mempolicy_nodemask_intersects(tsk, mask);
+ ret = mempolicy_in_oom_domain(tsk, mask);
} else {
/*
* This is not a mempolicy constrained oom, so only
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 0817d88383d5..d6e94cc8066c 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -749,7 +749,6 @@ void prep_compound_page(struct page *page, unsigned int order)
__SetPageHead(page);
for (i = 1; i < nr_pages; i++) {
struct page *p = page + i;
- set_page_count(p, 0);
p->mapping = TAIL_MAPPING;
set_compound_head(p, page);
}
@@ -3193,7 +3192,7 @@ static void __drain_all_pages(struct zone *zone, bool force_all_cpus)
int cpu;
/*
- * Allocate in the BSS so we wont require allocation in
+ * Allocate in the BSS so we won't require allocation in
* direct reclaim path for CONFIG_CPUMASK_OFFSTACK=y
*/
static cpumask_t cpus_with_pcps;
@@ -3832,7 +3831,7 @@ static inline bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
#endif /* CONFIG_FAIL_PAGE_ALLOC */
-noinline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
+static noinline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
{
return __should_fail_alloc_page(gfp_mask, order);
}
diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
index a4435311754b..f7b331081791 100644
--- a/mm/page_vma_mapped.c
+++ b/mm/page_vma_mapped.c
@@ -41,7 +41,8 @@ static bool map_pte(struct page_vma_mapped_walk *pvmw)
/* Handle un-addressable ZONE_DEVICE memory */
entry = pte_to_swp_entry(*pvmw->pte);
- if (!is_device_private_entry(entry))
+ if (!is_device_private_entry(entry) &&
+ !is_device_exclusive_entry(entry))
return false;
} else if (!pte_present(*pvmw->pte))
return false;
@@ -93,19 +94,21 @@ static bool check_pte(struct page_vma_mapped_walk *pvmw)
return false;
entry = pte_to_swp_entry(*pvmw->pte);
- if (!is_migration_entry(entry))
+ if (!is_migration_entry(entry) &&
+ !is_device_exclusive_entry(entry))
return false;
- pfn = migration_entry_to_pfn(entry);
+ pfn = swp_offset(entry);
} else if (is_swap_pte(*pvmw->pte)) {
swp_entry_t entry;
/* Handle un-addressable ZONE_DEVICE memory */
entry = pte_to_swp_entry(*pvmw->pte);
- if (!is_device_private_entry(entry))
+ if (!is_device_private_entry(entry) &&
+ !is_device_exclusive_entry(entry))
return false;
- pfn = device_private_entry_to_pfn(entry);
+ pfn = swp_offset(entry);
} else {
if (!pte_present(*pvmw->pte))
return false;
@@ -233,7 +236,7 @@ restart:
return not_found(pvmw);
entry = pmd_to_swp_entry(pmde);
if (!is_migration_entry(entry) ||
- migration_entry_to_page(entry) != page)
+ pfn_swap_entry_to_page(entry) != page)
return not_found(pvmw);
return true;
}
diff --git a/mm/rmap.c b/mm/rmap.c
index e05c300048e6..37c24672125c 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1405,24 +1405,14 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
/*
* When racing against e.g. zap_pte_range() on another cpu,
* in between its ptep_get_and_clear_full() and page_remove_rmap(),
- * try_to_unmap() may return false when it is about to become true,
+ * try_to_unmap() may return before page_mapped() has become false,
* if page table locking is skipped: use TTU_SYNC to wait for that.
*/
if (flags & TTU_SYNC)
pvmw.flags = PVMW_SYNC;
- /* munlock has nothing to gain from examining un-locked vmas */
- if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED))
- return true;
-
- if (IS_ENABLED(CONFIG_MIGRATION) && (flags & TTU_MIGRATION) &&
- is_zone_device_page(page) && !is_device_private_page(page))
- return true;
-
- if (flags & TTU_SPLIT_HUGE_PMD) {
- split_huge_pmd_address(vma, address,
- flags & TTU_SPLIT_FREEZE, page);
- }
+ if (flags & TTU_SPLIT_HUGE_PMD)
+ split_huge_pmd_address(vma, address, false, page);
/*
* For THP, we have to assume the worse case ie pmd for invalidation.
@@ -1447,16 +1437,6 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
mmu_notifier_invalidate_range_start(&range);
while (page_vma_mapped_walk(&pvmw)) {
-#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
- /* PMD-mapped THP migration entry */
- if (!pvmw.pte && (flags & TTU_MIGRATION)) {
- VM_BUG_ON_PAGE(PageHuge(page) || !PageTransCompound(page), page);
-
- set_pmd_migration_entry(&pvmw, page);
- continue;
- }
-#endif
-
/*
* If the page is mlock()d, we cannot swap it out.
* If it's recently referenced (perhaps page_referenced
@@ -1476,8 +1456,6 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
page_vma_mapped_walk_done(&pvmw);
break;
}
- if (flags & TTU_MUNLOCK)
- continue;
}
/* Unexpected PMD-mapped THP? */
@@ -1520,46 +1498,6 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
}
}
- if (IS_ENABLED(CONFIG_MIGRATION) &&
- (flags & TTU_MIGRATION) &&
- is_zone_device_page(page)) {
- swp_entry_t entry;
- pte_t swp_pte;
-
- pteval = ptep_get_and_clear(mm, pvmw.address, pvmw.pte);
-
- /*
- * Store the pfn of the page in a special migration
- * pte. do_swap_page() will wait until the migration
- * pte is removed and then restart fault handling.
- */
- entry = make_migration_entry(page, 0);
- swp_pte = swp_entry_to_pte(entry);
-
- /*
- * pteval maps a zone device page and is therefore
- * a swap pte.
- */
- if (pte_swp_soft_dirty(pteval))
- swp_pte = pte_swp_mksoft_dirty(swp_pte);
- if (pte_swp_uffd_wp(pteval))
- swp_pte = pte_swp_mkuffd_wp(swp_pte);
- set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte);
- /*
- * No need to invalidate here it will synchronize on
- * against the special swap migration pte.
- *
- * The assignment to subpage above was computed from a
- * swap PTE which results in an invalid pointer.
- * Since only PAGE_SIZE pages can currently be
- * migrated, just set it to page. This will need to be
- * changed when hugepage migrations to device private
- * memory are supported.
- */
- subpage = page;
- goto discard;
- }
-
/* Nuke the page table entry. */
flush_cache_page(vma, address, pte_pfn(*pvmw.pte));
if (should_defer_flush(mm, flags)) {
@@ -1612,35 +1550,6 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
/* We have to invalidate as we cleared the pte */
mmu_notifier_invalidate_range(mm, address,
address + PAGE_SIZE);
- } else if (IS_ENABLED(CONFIG_MIGRATION) &&
- (flags & (TTU_MIGRATION|TTU_SPLIT_FREEZE))) {
- swp_entry_t entry;
- pte_t swp_pte;
-
- if (arch_unmap_one(mm, vma, address, pteval) < 0) {
- set_pte_at(mm, address, pvmw.pte, pteval);
- ret = false;
- page_vma_mapped_walk_done(&pvmw);
- break;
- }
-
- /*
- * Store the pfn of the page in a special migration
- * pte. do_swap_page() will wait until the migration
- * pte is removed and then restart fault handling.
- */
- entry = make_migration_entry(subpage,
- pte_write(pteval));
- swp_pte = swp_entry_to_pte(entry);
- if (pte_soft_dirty(pteval))
- swp_pte = pte_swp_mksoft_dirty(swp_pte);
- if (pte_uffd_wp(pteval))
- swp_pte = pte_swp_mkuffd_wp(swp_pte);
- set_pte_at(mm, address, pvmw.pte, swp_pte);
- /*
- * No need to invalidate here it will synchronize on
- * against the special swap migration pte.
- */
} else if (PageAnon(page)) {
swp_entry_t entry = { .val = page_private(subpage) };
pte_t swp_pte;
@@ -1756,9 +1665,10 @@ static int page_not_mapped(struct page *page)
* Tries to remove all the page table entries which are mapping this
* page, used in the pageout path. Caller must hold the page lock.
*
- * If unmap is successful, return true. Otherwise, false.
+ * It is the caller's responsibility to check if the page is still
+ * mapped when needed (use TTU_SYNC to prevent accounting races).
*/
-bool try_to_unmap(struct page *page, enum ttu_flags flags)
+void try_to_unmap(struct page *page, enum ttu_flags flags)
{
struct rmap_walk_control rwc = {
.rmap_one = try_to_unmap_one,
@@ -1767,6 +1677,277 @@ bool try_to_unmap(struct page *page, enum ttu_flags flags)
.anon_lock = page_lock_anon_vma_read,
};
+ if (flags & TTU_RMAP_LOCKED)
+ rmap_walk_locked(page, &rwc);
+ else
+ rmap_walk(page, &rwc);
+}
+
+/*
+ * @arg: enum ttu_flags will be passed to this argument.
+ *
+ * If TTU_SPLIT_HUGE_PMD is specified any PMD mappings will be split into PTEs
+ * containing migration entries. This and TTU_RMAP_LOCKED are the only supported
+ * flags.
+ */
+static bool try_to_migrate_one(struct page *page, struct vm_area_struct *vma,
+ unsigned long address, void *arg)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ struct page_vma_mapped_walk pvmw = {
+ .page = page,
+ .vma = vma,
+ .address = address,
+ };
+ pte_t pteval;
+ struct page *subpage;
+ bool ret = true;
+ struct mmu_notifier_range range;
+ enum ttu_flags flags = (enum ttu_flags)(long)arg;
+
+ if (is_zone_device_page(page) && !is_device_private_page(page))
+ return true;
+
+ /*
+ * When racing against e.g. zap_pte_range() on another cpu,
+ * in between its ptep_get_and_clear_full() and page_remove_rmap(),
+ * try_to_migrate() may return before page_mapped() has become false,
+ * if page table locking is skipped: use TTU_SYNC to wait for that.
+ */
+ if (flags & TTU_SYNC)
+ pvmw.flags = PVMW_SYNC;
+
+ /*
+ * unmap_page() in mm/huge_memory.c is the only user of migration with
+ * TTU_SPLIT_HUGE_PMD and it wants to freeze.
+ */
+ if (flags & TTU_SPLIT_HUGE_PMD)
+ split_huge_pmd_address(vma, address, true, page);
+
+ /*
+ * For THP, we have to assume the worse case ie pmd for invalidation.
+ * For hugetlb, it could be much worse if we need to do pud
+ * invalidation in the case of pmd sharing.
+ *
+ * Note that the page can not be free in this function as call of
+ * try_to_unmap() must hold a reference on the page.
+ */
+ range.end = PageKsm(page) ?
+ address + PAGE_SIZE : vma_address_end(page, vma);
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
+ address, range.end);
+ if (PageHuge(page)) {
+ /*
+ * If sharing is possible, start and end will be adjusted
+ * accordingly.
+ */
+ adjust_range_if_pmd_sharing_possible(vma, &range.start,
+ &range.end);
+ }
+ mmu_notifier_invalidate_range_start(&range);
+
+ while (page_vma_mapped_walk(&pvmw)) {
+#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
+ /* PMD-mapped THP migration entry */
+ if (!pvmw.pte) {
+ VM_BUG_ON_PAGE(PageHuge(page) ||
+ !PageTransCompound(page), page);
+
+ set_pmd_migration_entry(&pvmw, page);
+ continue;
+ }
+#endif
+
+ /* Unexpected PMD-mapped THP? */
+ VM_BUG_ON_PAGE(!pvmw.pte, page);
+
+ subpage = page - page_to_pfn(page) + pte_pfn(*pvmw.pte);
+ address = pvmw.address;
+
+ if (PageHuge(page) && !PageAnon(page)) {
+ /*
+ * To call huge_pmd_unshare, i_mmap_rwsem must be
+ * held in write mode. Caller needs to explicitly
+ * do this outside rmap routines.
+ */
+ VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
+ if (huge_pmd_unshare(mm, vma, &address, pvmw.pte)) {
+ /*
+ * huge_pmd_unshare unmapped an entire PMD
+ * page. There is no way of knowing exactly
+ * which PMDs may be cached for this mm, so
+ * we must flush them all. start/end were
+ * already adjusted above to cover this range.
+ */
+ flush_cache_range(vma, range.start, range.end);
+ flush_tlb_range(vma, range.start, range.end);
+ mmu_notifier_invalidate_range(mm, range.start,
+ range.end);
+
+ /*
+ * The ref count of the PMD page was dropped
+ * which is part of the way map counting
+ * is done for shared PMDs. Return 'true'
+ * here. When there is no other sharing,
+ * huge_pmd_unshare returns false and we will
+ * unmap the actual page and drop map count
+ * to zero.
+ */
+ page_vma_mapped_walk_done(&pvmw);
+ break;
+ }
+ }
+
+ /* Nuke the page table entry. */
+ flush_cache_page(vma, address, pte_pfn(*pvmw.pte));
+ pteval = ptep_clear_flush(vma, address, pvmw.pte);
+
+ /* Move the dirty bit to the page. Now the pte is gone. */
+ if (pte_dirty(pteval))
+ set_page_dirty(page);
+
+ /* Update high watermark before we lower rss */
+ update_hiwater_rss(mm);
+
+ if (is_zone_device_page(page)) {
+ swp_entry_t entry;
+ pte_t swp_pte;
+
+ /*
+ * Store the pfn of the page in a special migration
+ * pte. do_swap_page() will wait until the migration
+ * pte is removed and then restart fault handling.
+ */
+ entry = make_readable_migration_entry(
+ page_to_pfn(page));
+ swp_pte = swp_entry_to_pte(entry);
+
+ /*
+ * pteval maps a zone device page and is therefore
+ * a swap pte.
+ */
+ if (pte_swp_soft_dirty(pteval))
+ swp_pte = pte_swp_mksoft_dirty(swp_pte);
+ if (pte_swp_uffd_wp(pteval))
+ swp_pte = pte_swp_mkuffd_wp(swp_pte);
+ set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte);
+ /*
+ * No need to invalidate here it will synchronize on
+ * against the special swap migration pte.
+ *
+ * The assignment to subpage above was computed from a
+ * swap PTE which results in an invalid pointer.
+ * Since only PAGE_SIZE pages can currently be
+ * migrated, just set it to page. This will need to be
+ * changed when hugepage migrations to device private
+ * memory are supported.
+ */
+ subpage = page;
+ } else if (PageHWPoison(page)) {
+ pteval = swp_entry_to_pte(make_hwpoison_entry(subpage));
+ if (PageHuge(page)) {
+ hugetlb_count_sub(compound_nr(page), mm);
+ set_huge_swap_pte_at(mm, address,
+ pvmw.pte, pteval,
+ vma_mmu_pagesize(vma));
+ } else {
+ dec_mm_counter(mm, mm_counter(page));
+ set_pte_at(mm, address, pvmw.pte, pteval);
+ }
+
+ } else if (pte_unused(pteval) && !userfaultfd_armed(vma)) {
+ /*
+ * The guest indicated that the page content is of no
+ * interest anymore. Simply discard the pte, vmscan
+ * will take care of the rest.
+ * A future reference will then fault in a new zero
+ * page. When userfaultfd is active, we must not drop
+ * this page though, as its main user (postcopy
+ * migration) will not expect userfaults on already
+ * copied pages.
+ */
+ dec_mm_counter(mm, mm_counter(page));
+ /* We have to invalidate as we cleared the pte */
+ mmu_notifier_invalidate_range(mm, address,
+ address + PAGE_SIZE);
+ } else {
+ swp_entry_t entry;
+ pte_t swp_pte;
+
+ if (arch_unmap_one(mm, vma, address, pteval) < 0) {
+ set_pte_at(mm, address, pvmw.pte, pteval);
+ ret = false;
+ page_vma_mapped_walk_done(&pvmw);
+ break;
+ }
+
+ /*
+ * Store the pfn of the page in a special migration
+ * pte. do_swap_page() will wait until the migration
+ * pte is removed and then restart fault handling.
+ */
+ if (pte_write(pteval))
+ entry = make_writable_migration_entry(
+ page_to_pfn(subpage));
+ else
+ entry = make_readable_migration_entry(
+ page_to_pfn(subpage));
+
+ swp_pte = swp_entry_to_pte(entry);
+ if (pte_soft_dirty(pteval))
+ swp_pte = pte_swp_mksoft_dirty(swp_pte);
+ if (pte_uffd_wp(pteval))
+ swp_pte = pte_swp_mkuffd_wp(swp_pte);
+ set_pte_at(mm, address, pvmw.pte, swp_pte);
+ /*
+ * No need to invalidate here it will synchronize on
+ * against the special swap migration pte.
+ */
+ }
+
+ /*
+ * No need to call mmu_notifier_invalidate_range() it has be
+ * done above for all cases requiring it to happen under page
+ * table lock before mmu_notifier_invalidate_range_end()
+ *
+ * See Documentation/vm/mmu_notifier.rst
+ */
+ page_remove_rmap(subpage, PageHuge(page));
+ put_page(page);
+ }
+
+ mmu_notifier_invalidate_range_end(&range);
+
+ return ret;
+}
+
+/**
+ * try_to_migrate - try to replace all page table mappings with swap entries
+ * @page: the page to replace page table entries for
+ * @flags: action and flags
+ *
+ * Tries to remove all the page table entries which are mapping this page and
+ * replace them with special swap entries. Caller must hold the page lock.
+ *
+ * If is successful, return true. Otherwise, false.
+ */
+void try_to_migrate(struct page *page, enum ttu_flags flags)
+{
+ struct rmap_walk_control rwc = {
+ .rmap_one = try_to_migrate_one,
+ .arg = (void *)flags,
+ .done = page_not_mapped,
+ .anon_lock = page_lock_anon_vma_read,
+ };
+
+ /*
+ * Migration always ignores mlock and only supports TTU_RMAP_LOCKED and
+ * TTU_SPLIT_HUGE_PMD and TTU_SYNC flags.
+ */
+ if (WARN_ON_ONCE(flags & ~(TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD |
+ TTU_SYNC)))
+ return;
+
/*
* During exec, a temporary VMA is setup and later moved.
* The VMA is moved under the anon_vma lock but not the
@@ -1775,38 +1956,67 @@ bool try_to_unmap(struct page *page, enum ttu_flags flags)
* locking requirements of exec(), migration skips
* temporary VMAs until after exec() completes.
*/
- if ((flags & (TTU_MIGRATION|TTU_SPLIT_FREEZE))
- && !PageKsm(page) && PageAnon(page))
+ if (!PageKsm(page) && PageAnon(page))
rwc.invalid_vma = invalid_migration_vma;
if (flags & TTU_RMAP_LOCKED)
rmap_walk_locked(page, &rwc);
else
rmap_walk(page, &rwc);
+}
- /*
- * When racing against e.g. zap_pte_range() on another cpu,
- * in between its ptep_get_and_clear_full() and page_remove_rmap(),
- * try_to_unmap() may return false when it is about to become true,
- * if page table locking is skipped: use TTU_SYNC to wait for that.
- */
- return !page_mapcount(page);
+/*
+ * Walks the vma's mapping a page and mlocks the page if any locked vma's are
+ * found. Once one is found the page is locked and the scan can be terminated.
+ */
+static bool page_mlock_one(struct page *page, struct vm_area_struct *vma,
+ unsigned long address, void *unused)
+{
+ struct page_vma_mapped_walk pvmw = {
+ .page = page,
+ .vma = vma,
+ .address = address,
+ };
+
+ /* An un-locked vma doesn't have any pages to lock, continue the scan */
+ if (!(vma->vm_flags & VM_LOCKED))
+ return true;
+
+ while (page_vma_mapped_walk(&pvmw)) {
+ /*
+ * Need to recheck under the ptl to serialise with
+ * __munlock_pagevec_fill() after VM_LOCKED is cleared in
+ * munlock_vma_pages_range().
+ */
+ if (vma->vm_flags & VM_LOCKED) {
+ /* PTE-mapped THP are never mlocked */
+ if (!PageTransCompound(page))
+ mlock_vma_page(page);
+ page_vma_mapped_walk_done(&pvmw);
+ }
+
+ /*
+ * no need to continue scanning other vma's if the page has
+ * been locked.
+ */
+ return false;
+ }
+
+ return true;
}
/**
- * try_to_munlock - try to munlock a page
- * @page: the page to be munlocked
+ * page_mlock - try to mlock a page
+ * @page: the page to be mlocked
*
- * Called from munlock code. Checks all of the VMAs mapping the page
- * to make sure nobody else has this page mlocked. The page will be
- * returned with PG_mlocked cleared if no other vmas have it mlocked.
+ * Called from munlock code. Checks all of the VMAs mapping the page and mlocks
+ * the page if any are found. The page will be returned with PG_mlocked cleared
+ * if it is not mapped by any locked vmas.
*/
-
-void try_to_munlock(struct page *page)
+void page_mlock(struct page *page)
{
struct rmap_walk_control rwc = {
- .rmap_one = try_to_unmap_one,
- .arg = (void *)TTU_MUNLOCK,
+ .rmap_one = page_mlock_one,
.done = page_not_mapped,
.anon_lock = page_lock_anon_vma_read,
@@ -1818,6 +2028,192 @@ void try_to_munlock(struct page *page)
rmap_walk(page, &rwc);
}
+#ifdef CONFIG_DEVICE_PRIVATE
+struct make_exclusive_args {
+ struct mm_struct *mm;
+ unsigned long address;
+ void *owner;
+ bool valid;
+};
+
+static bool page_make_device_exclusive_one(struct page *page,
+ struct vm_area_struct *vma, unsigned long address, void *priv)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ struct page_vma_mapped_walk pvmw = {
+ .page = page,
+ .vma = vma,
+ .address = address,
+ };
+ struct make_exclusive_args *args = priv;
+ pte_t pteval;
+ struct page *subpage;
+ bool ret = true;
+ struct mmu_notifier_range range;
+ swp_entry_t entry;
+ pte_t swp_pte;
+
+ mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0, vma,
+ vma->vm_mm, address, min(vma->vm_end,
+ address + page_size(page)), args->owner);
+ mmu_notifier_invalidate_range_start(&range);
+
+ while (page_vma_mapped_walk(&pvmw)) {
+ /* Unexpected PMD-mapped THP? */
+ VM_BUG_ON_PAGE(!pvmw.pte, page);
+
+ if (!pte_present(*pvmw.pte)) {
+ ret = false;
+ page_vma_mapped_walk_done(&pvmw);
+ break;
+ }
+
+ subpage = page - page_to_pfn(page) + pte_pfn(*pvmw.pte);
+ address = pvmw.address;
+
+ /* Nuke the page table entry. */
+ flush_cache_page(vma, address, pte_pfn(*pvmw.pte));
+ pteval = ptep_clear_flush(vma, address, pvmw.pte);
+
+ /* Move the dirty bit to the page. Now the pte is gone. */
+ if (pte_dirty(pteval))
+ set_page_dirty(page);
+
+ /*
+ * Check that our target page is still mapped at the expected
+ * address.
+ */
+ if (args->mm == mm && args->address == address &&
+ pte_write(pteval))
+ args->valid = true;
+
+ /*
+ * Store the pfn of the page in a special migration
+ * pte. do_swap_page() will wait until the migration
+ * pte is removed and then restart fault handling.
+ */
+ if (pte_write(pteval))
+ entry = make_writable_device_exclusive_entry(
+ page_to_pfn(subpage));
+ else
+ entry = make_readable_device_exclusive_entry(
+ page_to_pfn(subpage));
+ swp_pte = swp_entry_to_pte(entry);
+ if (pte_soft_dirty(pteval))
+ swp_pte = pte_swp_mksoft_dirty(swp_pte);
+ if (pte_uffd_wp(pteval))
+ swp_pte = pte_swp_mkuffd_wp(swp_pte);
+
+ set_pte_at(mm, address, pvmw.pte, swp_pte);
+
+ /*
+ * There is a reference on the page for the swap entry which has
+ * been removed, so shouldn't take another.
+ */
+ page_remove_rmap(subpage, false);
+ }
+
+ mmu_notifier_invalidate_range_end(&range);
+
+ return ret;
+}
+
+/**
+ * page_make_device_exclusive - mark the page exclusively owned by a device
+ * @page: the page to replace page table entries for
+ * @mm: the mm_struct where the page is expected to be mapped
+ * @address: address where the page is expected to be mapped
+ * @owner: passed to MMU_NOTIFY_EXCLUSIVE range notifier callbacks
+ *
+ * Tries to remove all the page table entries which are mapping this page and
+ * replace them with special device exclusive swap entries to grant a device
+ * exclusive access to the page. Caller must hold the page lock.
+ *
+ * Returns false if the page is still mapped, or if it could not be unmapped
+ * from the expected address. Otherwise returns true (success).
+ */
+static bool page_make_device_exclusive(struct page *page, struct mm_struct *mm,
+ unsigned long address, void *owner)
+{
+ struct make_exclusive_args args = {
+ .mm = mm,
+ .address = address,
+ .owner = owner,
+ .valid = false,
+ };
+ struct rmap_walk_control rwc = {
+ .rmap_one = page_make_device_exclusive_one,
+ .done = page_not_mapped,
+ .anon_lock = page_lock_anon_vma_read,
+ .arg = &args,
+ };
+
+ /*
+ * Restrict to anonymous pages for now to avoid potential writeback
+ * issues. Also tail pages shouldn't be passed to rmap_walk so skip
+ * those.
+ */
+ if (!PageAnon(page) || PageTail(page))
+ return false;
+
+ rmap_walk(page, &rwc);
+
+ return args.valid && !page_mapcount(page);
+}
+
+/**
+ * make_device_exclusive_range() - Mark a range for exclusive use by a device
+ * @mm: mm_struct of assoicated target process
+ * @start: start of the region to mark for exclusive device access
+ * @end: end address of region
+ * @pages: returns the pages which were successfully marked for exclusive access
+ * @owner: passed to MMU_NOTIFY_EXCLUSIVE range notifier to allow filtering
+ *
+ * Returns: number of pages found in the range by GUP. A page is marked for
+ * exclusive access only if the page pointer is non-NULL.
+ *
+ * This function finds ptes mapping page(s) to the given address range, locks
+ * them and replaces mappings with special swap entries preventing userspace CPU
+ * access. On fault these entries are replaced with the original mapping after
+ * calling MMU notifiers.
+ *
+ * A driver using this to program access from a device must use a mmu notifier
+ * critical section to hold a device specific lock during programming. Once
+ * programming is complete it should drop the page lock and reference after
+ * which point CPU access to the page will revoke the exclusive access.
+ */
+int make_device_exclusive_range(struct mm_struct *mm, unsigned long start,
+ unsigned long end, struct page **pages,
+ void *owner)
+{
+ long npages = (end - start) >> PAGE_SHIFT;
+ long i;
+
+ npages = get_user_pages_remote(mm, start, npages,
+ FOLL_GET | FOLL_WRITE | FOLL_SPLIT_PMD,
+ pages, NULL, NULL);
+ if (npages < 0)
+ return npages;
+
+ for (i = 0; i < npages; i++, start += PAGE_SIZE) {
+ if (!trylock_page(pages[i])) {
+ put_page(pages[i]);
+ pages[i] = NULL;
+ continue;
+ }
+
+ if (!page_make_device_exclusive(pages[i], mm, start, owner)) {
+ unlock_page(pages[i]);
+ put_page(pages[i]);
+ pages[i] = NULL;
+ }
+ }
+
+ return npages;
+}
+EXPORT_SYMBOL_GPL(make_device_exclusive_range);
+#endif
+
void __put_anon_vma(struct anon_vma *anon_vma)
{
struct anon_vma *root = anon_vma->root;
@@ -1858,7 +2254,7 @@ static struct anon_vma *rmap_walk_anon_lock(struct page *page,
* Find all the mappings of a page using the mapping pointer and the vma chains
* contained in the anon_vma struct it points to.
*
- * When called from try_to_munlock(), the mmap_lock of the mm containing the vma
+ * When called from page_mlock(), the mmap_lock of the mm containing the vma
* where the page was found will be held for write. So, we won't recheck
* vm_flags for that VMA. That should be OK, because that vma shouldn't be
* LOCKED.
@@ -1911,7 +2307,7 @@ static void rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc,
* Find all the mappings of a page using the mapping pointer and the vma chains
* contained in the address_space struct it points to.
*
- * When called from try_to_munlock(), the mmap_lock of the mm containing the vma
+ * When called from page_mlock(), the mmap_lock of the mm containing the vma
* where the page was found will be held for write. So, we won't recheck
* vm_flags for that VMA. That should be OK, because that vma shouldn't be
* LOCKED.
diff --git a/mm/shmem.c b/mm/shmem.c
index 6268b9b4e41a..70d9ce294bb4 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1797,7 +1797,7 @@ unlock:
* vm. If we swap it in we mark it dirty since we also free the swap
* entry since a page cannot live in both the swap and page cache.
*
- * vmf and fault_type are only supplied by shmem_fault:
+ * vma, vmf, and fault_type are only supplied by shmem_fault:
* otherwise they are NULL.
*/
static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
@@ -1832,6 +1832,16 @@ repeat:
page = pagecache_get_page(mapping, index,
FGP_ENTRY | FGP_HEAD | FGP_LOCK, 0);
+
+ if (page && vma && userfaultfd_minor(vma)) {
+ if (!xa_is_value(page)) {
+ unlock_page(page);
+ put_page(page);
+ }
+ *fault_type = handle_userfault(vmf, VM_UFFD_MINOR);
+ return 0;
+ }
+
if (xa_is_value(page)) {
error = shmem_swapin_page(inode, index, &page,
sgp, gfp, vma, fault_type);
@@ -2352,27 +2362,25 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
return inode;
}
-static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
- pmd_t *dst_pmd,
- struct vm_area_struct *dst_vma,
- unsigned long dst_addr,
- unsigned long src_addr,
- bool zeropage,
- struct page **pagep)
+#ifdef CONFIG_USERFAULTFD
+int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
+ pmd_t *dst_pmd,
+ struct vm_area_struct *dst_vma,
+ unsigned long dst_addr,
+ unsigned long src_addr,
+ bool zeropage,
+ struct page **pagep)
{
struct inode *inode = file_inode(dst_vma->vm_file);
struct shmem_inode_info *info = SHMEM_I(inode);
struct address_space *mapping = inode->i_mapping;
gfp_t gfp = mapping_gfp_mask(mapping);
pgoff_t pgoff = linear_page_index(dst_vma, dst_addr);
- spinlock_t *ptl;
void *page_kaddr;
struct page *page;
- pte_t _dst_pte, *dst_pte;
int ret;
- pgoff_t offset, max_off;
+ pgoff_t max_off;
- ret = -ENOMEM;
if (!shmem_inode_acct_block(inode, 1)) {
/*
* We may have got a page, returned -ENOENT triggering a retry,
@@ -2383,15 +2391,16 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
put_page(*pagep);
*pagep = NULL;
}
- goto out;
+ return -ENOMEM;
}
if (!*pagep) {
+ ret = -ENOMEM;
page = shmem_alloc_page(gfp, info, pgoff);
if (!page)
goto out_unacct_blocks;
- if (!zeropage) { /* mcopy_atomic */
+ if (!zeropage) { /* COPY */
page_kaddr = kmap_atomic(page);
ret = copy_from_user(page_kaddr,
(const void __user *)src_addr,
@@ -2401,11 +2410,11 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
/* fallback to copy_from_user outside mmap_lock */
if (unlikely(ret)) {
*pagep = page;
- shmem_inode_unacct_blocks(inode, 1);
+ ret = -ENOENT;
/* don't free the page */
- return -ENOENT;
+ goto out_unacct_blocks;
}
- } else { /* mfill_zeropage_atomic */
+ } else { /* ZEROPAGE */
clear_highpage(page);
}
} else {
@@ -2413,15 +2422,15 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
*pagep = NULL;
}
- VM_BUG_ON(PageLocked(page) || PageSwapBacked(page));
+ VM_BUG_ON(PageLocked(page));
+ VM_BUG_ON(PageSwapBacked(page));
__SetPageLocked(page);
__SetPageSwapBacked(page);
__SetPageUptodate(page);
ret = -EFAULT;
- offset = linear_page_index(dst_vma, dst_addr);
max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
- if (unlikely(offset >= max_off))
+ if (unlikely(pgoff >= max_off))
goto out_release;
ret = shmem_add_to_page_cache(page, mapping, pgoff, NULL,
@@ -2429,32 +2438,10 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
if (ret)
goto out_release;
- _dst_pte = mk_pte(page, dst_vma->vm_page_prot);
- if (dst_vma->vm_flags & VM_WRITE)
- _dst_pte = pte_mkwrite(pte_mkdirty(_dst_pte));
- else {
- /*
- * We don't set the pte dirty if the vma has no
- * VM_WRITE permission, so mark the page dirty or it
- * could be freed from under us. We could do it
- * unconditionally before unlock_page(), but doing it
- * only if VM_WRITE is not set is faster.
- */
- set_page_dirty(page);
- }
-
- dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
-
- ret = -EFAULT;
- max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
- if (unlikely(offset >= max_off))
- goto out_release_unlock;
-
- ret = -EEXIST;
- if (!pte_none(*dst_pte))
- goto out_release_unlock;
-
- lru_cache_add(page);
+ ret = mfill_atomic_install_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
+ page, true, false);
+ if (ret)
+ goto out_delete_from_cache;
spin_lock_irq(&info->lock);
info->alloced++;
@@ -2462,50 +2449,19 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
shmem_recalc_inode(inode);
spin_unlock_irq(&info->lock);
- inc_mm_counter(dst_mm, mm_counter_file(page));
- page_add_file_rmap(page, false);
- set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
-
- /* No need to invalidate - it was non-present before */
- update_mmu_cache(dst_vma, dst_addr, dst_pte);
- pte_unmap_unlock(dst_pte, ptl);
+ SetPageDirty(page);
unlock_page(page);
- ret = 0;
-out:
- return ret;
-out_release_unlock:
- pte_unmap_unlock(dst_pte, ptl);
- ClearPageDirty(page);
+ return 0;
+out_delete_from_cache:
delete_from_page_cache(page);
out_release:
unlock_page(page);
put_page(page);
out_unacct_blocks:
shmem_inode_unacct_blocks(inode, 1);
- goto out;
-}
-
-int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm,
- pmd_t *dst_pmd,
- struct vm_area_struct *dst_vma,
- unsigned long dst_addr,
- unsigned long src_addr,
- struct page **pagep)
-{
- return shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma,
- dst_addr, src_addr, false, pagep);
-}
-
-int shmem_mfill_zeropage_pte(struct mm_struct *dst_mm,
- pmd_t *dst_pmd,
- struct vm_area_struct *dst_vma,
- unsigned long dst_addr)
-{
- struct page *page = NULL;
-
- return shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma,
- dst_addr, 0, true, &page);
+ return ret;
}
+#endif /* CONFIG_USERFAULTFD */
#ifdef CONFIG_TMPFS
static const struct inode_operations shmem_symlink_inode_operations;
@@ -4040,8 +3996,7 @@ bool shmem_huge_enabled(struct vm_area_struct *vma)
loff_t i_size;
pgoff_t off;
- if ((vma->vm_flags & VM_NOHUGEPAGE) ||
- test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
+ if (!transhuge_vma_enabled(vma, vma->vm_flags))
return false;
if (shmem_huge == SHMEM_HUGE_FORCE)
return true;
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index 16183d85a7d5..bdce883f9286 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -27,8 +27,362 @@
#include <linux/spinlock.h>
#include <linux/vmalloc.h>
#include <linux/sched.h>
+#include <linux/pgtable.h>
+#include <linux/bootmem_info.h>
+
#include <asm/dma.h>
#include <asm/pgalloc.h>
+#include <asm/tlbflush.h>
+
+/**
+ * struct vmemmap_remap_walk - walk vmemmap page table
+ *
+ * @remap_pte: called for each lowest-level entry (PTE).
+ * @nr_walked: the number of walked pte.
+ * @reuse_page: the page which is reused for the tail vmemmap pages.
+ * @reuse_addr: the virtual address of the @reuse_page page.
+ * @vmemmap_pages: the list head of the vmemmap pages that can be freed
+ * or is mapped from.
+ */
+struct vmemmap_remap_walk {
+ void (*remap_pte)(pte_t *pte, unsigned long addr,
+ struct vmemmap_remap_walk *walk);
+ unsigned long nr_walked;
+ struct page *reuse_page;
+ unsigned long reuse_addr;
+ struct list_head *vmemmap_pages;
+};
+
+static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start,
+ struct vmemmap_remap_walk *walk)
+{
+ pmd_t __pmd;
+ int i;
+ unsigned long addr = start;
+ struct page *page = pmd_page(*pmd);
+ pte_t *pgtable = pte_alloc_one_kernel(&init_mm);
+
+ if (!pgtable)
+ return -ENOMEM;
+
+ pmd_populate_kernel(&init_mm, &__pmd, pgtable);
+
+ for (i = 0; i < PMD_SIZE / PAGE_SIZE; i++, addr += PAGE_SIZE) {
+ pte_t entry, *pte;
+ pgprot_t pgprot = PAGE_KERNEL;
+
+ entry = mk_pte(page + i, pgprot);
+ pte = pte_offset_kernel(&__pmd, addr);
+ set_pte_at(&init_mm, addr, pte, entry);
+ }
+
+ /* Make pte visible before pmd. See comment in __pte_alloc(). */
+ smp_wmb();
+ pmd_populate_kernel(&init_mm, pmd, pgtable);
+
+ flush_tlb_kernel_range(start, start + PMD_SIZE);
+
+ return 0;
+}
+
+static void vmemmap_pte_range(pmd_t *pmd, unsigned long addr,
+ unsigned long end,
+ struct vmemmap_remap_walk *walk)
+{
+ pte_t *pte = pte_offset_kernel(pmd, addr);
+
+ /*
+ * The reuse_page is found 'first' in table walk before we start
+ * remapping (which is calling @walk->remap_pte).
+ */
+ if (!walk->reuse_page) {
+ walk->reuse_page = pte_page(*pte);
+ /*
+ * Because the reuse address is part of the range that we are
+ * walking, skip the reuse address range.
+ */
+ addr += PAGE_SIZE;
+ pte++;
+ walk->nr_walked++;
+ }
+
+ for (; addr != end; addr += PAGE_SIZE, pte++) {
+ walk->remap_pte(pte, addr, walk);
+ walk->nr_walked++;
+ }
+}
+
+static int vmemmap_pmd_range(pud_t *pud, unsigned long addr,
+ unsigned long end,
+ struct vmemmap_remap_walk *walk)
+{
+ pmd_t *pmd;
+ unsigned long next;
+
+ pmd = pmd_offset(pud, addr);
+ do {
+ if (pmd_leaf(*pmd)) {
+ int ret;
+
+ ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK, walk);
+ if (ret)
+ return ret;
+ }
+ next = pmd_addr_end(addr, end);
+ vmemmap_pte_range(pmd, addr, next, walk);
+ } while (pmd++, addr = next, addr != end);
+
+ return 0;
+}
+
+static int vmemmap_pud_range(p4d_t *p4d, unsigned long addr,
+ unsigned long end,
+ struct vmemmap_remap_walk *walk)
+{
+ pud_t *pud;
+ unsigned long next;
+
+ pud = pud_offset(p4d, addr);
+ do {
+ int ret;
+
+ next = pud_addr_end(addr, end);
+ ret = vmemmap_pmd_range(pud, addr, next, walk);
+ if (ret)
+ return ret;
+ } while (pud++, addr = next, addr != end);
+
+ return 0;
+}
+
+static int vmemmap_p4d_range(pgd_t *pgd, unsigned long addr,
+ unsigned long end,
+ struct vmemmap_remap_walk *walk)
+{
+ p4d_t *p4d;
+ unsigned long next;
+
+ p4d = p4d_offset(pgd, addr);
+ do {
+ int ret;
+
+ next = p4d_addr_end(addr, end);
+ ret = vmemmap_pud_range(p4d, addr, next, walk);
+ if (ret)
+ return ret;
+ } while (p4d++, addr = next, addr != end);
+
+ return 0;
+}
+
+static int vmemmap_remap_range(unsigned long start, unsigned long end,
+ struct vmemmap_remap_walk *walk)
+{
+ unsigned long addr = start;
+ unsigned long next;
+ pgd_t *pgd;
+
+ VM_BUG_ON(!IS_ALIGNED(start, PAGE_SIZE));
+ VM_BUG_ON(!IS_ALIGNED(end, PAGE_SIZE));
+
+ pgd = pgd_offset_k(addr);
+ do {
+ int ret;
+
+ next = pgd_addr_end(addr, end);
+ ret = vmemmap_p4d_range(pgd, addr, next, walk);
+ if (ret)
+ return ret;
+ } while (pgd++, addr = next, addr != end);
+
+ /*
+ * We only change the mapping of the vmemmap virtual address range
+ * [@start + PAGE_SIZE, end), so we only need to flush the TLB which
+ * belongs to the range.
+ */
+ flush_tlb_kernel_range(start + PAGE_SIZE, end);
+
+ return 0;
+}
+
+/*
+ * Free a vmemmap page. A vmemmap page can be allocated from the memblock
+ * allocator or buddy allocator. If the PG_reserved flag is set, it means
+ * that it allocated from the memblock allocator, just free it via the
+ * free_bootmem_page(). Otherwise, use __free_page().
+ */
+static inline void free_vmemmap_page(struct page *page)
+{
+ if (PageReserved(page))
+ free_bootmem_page(page);
+ else
+ __free_page(page);
+}
+
+/* Free a list of the vmemmap pages */
+static void free_vmemmap_page_list(struct list_head *list)
+{
+ struct page *page, *next;
+
+ list_for_each_entry_safe(page, next, list, lru) {
+ list_del(&page->lru);
+ free_vmemmap_page(page);
+ }
+}
+
+static void vmemmap_remap_pte(pte_t *pte, unsigned long addr,
+ struct vmemmap_remap_walk *walk)
+{
+ /*
+ * Remap the tail pages as read-only to catch illegal write operation
+ * to the tail pages.
+ */
+ pgprot_t pgprot = PAGE_KERNEL_RO;
+ pte_t entry = mk_pte(walk->reuse_page, pgprot);
+ struct page *page = pte_page(*pte);
+
+ list_add_tail(&page->lru, walk->vmemmap_pages);
+ set_pte_at(&init_mm, addr, pte, entry);
+}
+
+static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
+ struct vmemmap_remap_walk *walk)
+{
+ pgprot_t pgprot = PAGE_KERNEL;
+ struct page *page;
+ void *to;
+
+ BUG_ON(pte_page(*pte) != walk->reuse_page);
+
+ page = list_first_entry(walk->vmemmap_pages, struct page, lru);
+ list_del(&page->lru);
+ to = page_to_virt(page);
+ copy_page(to, (void *)walk->reuse_addr);
+
+ set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot));
+}
+
+/**
+ * vmemmap_remap_free - remap the vmemmap virtual address range [@start, @end)
+ * to the page which @reuse is mapped to, then free vmemmap
+ * which the range are mapped to.
+ * @start: start address of the vmemmap virtual address range that we want
+ * to remap.
+ * @end: end address of the vmemmap virtual address range that we want to
+ * remap.
+ * @reuse: reuse address.
+ *
+ * Return: %0 on success, negative error code otherwise.
+ */
+int vmemmap_remap_free(unsigned long start, unsigned long end,
+ unsigned long reuse)
+{
+ int ret;
+ LIST_HEAD(vmemmap_pages);
+ struct vmemmap_remap_walk walk = {
+ .remap_pte = vmemmap_remap_pte,
+ .reuse_addr = reuse,
+ .vmemmap_pages = &vmemmap_pages,
+ };
+
+ /*
+ * In order to make remapping routine most efficient for the huge pages,
+ * the routine of vmemmap page table walking has the following rules
+ * (see more details from the vmemmap_pte_range()):
+ *
+ * - The range [@start, @end) and the range [@reuse, @reuse + PAGE_SIZE)
+ * should be continuous.
+ * - The @reuse address is part of the range [@reuse, @end) that we are
+ * walking which is passed to vmemmap_remap_range().
+ * - The @reuse address is the first in the complete range.
+ *
+ * So we need to make sure that @start and @reuse meet the above rules.
+ */
+ BUG_ON(start - reuse != PAGE_SIZE);
+
+ mmap_write_lock(&init_mm);
+ ret = vmemmap_remap_range(reuse, end, &walk);
+ mmap_write_downgrade(&init_mm);
+
+ if (ret && walk.nr_walked) {
+ end = reuse + walk.nr_walked * PAGE_SIZE;
+ /*
+ * vmemmap_pages contains pages from the previous
+ * vmemmap_remap_range call which failed. These
+ * are pages which were removed from the vmemmap.
+ * They will be restored in the following call.
+ */
+ walk = (struct vmemmap_remap_walk) {
+ .remap_pte = vmemmap_restore_pte,
+ .reuse_addr = reuse,
+ .vmemmap_pages = &vmemmap_pages,
+ };
+
+ vmemmap_remap_range(reuse, end, &walk);
+ }
+ mmap_read_unlock(&init_mm);
+
+ free_vmemmap_page_list(&vmemmap_pages);
+
+ return ret;
+}
+
+static int alloc_vmemmap_page_list(unsigned long start, unsigned long end,
+ gfp_t gfp_mask, struct list_head *list)
+{
+ unsigned long nr_pages = (end - start) >> PAGE_SHIFT;
+ int nid = page_to_nid((struct page *)start);
+ struct page *page, *next;
+
+ while (nr_pages--) {
+ page = alloc_pages_node(nid, gfp_mask, 0);
+ if (!page)
+ goto out;
+ list_add_tail(&page->lru, list);
+ }
+
+ return 0;
+out:
+ list_for_each_entry_safe(page, next, list, lru)
+ __free_pages(page, 0);
+ return -ENOMEM;
+}
+
+/**
+ * vmemmap_remap_alloc - remap the vmemmap virtual address range [@start, end)
+ * to the page which is from the @vmemmap_pages
+ * respectively.
+ * @start: start address of the vmemmap virtual address range that we want
+ * to remap.
+ * @end: end address of the vmemmap virtual address range that we want to
+ * remap.
+ * @reuse: reuse address.
+ * @gfp_mask: GFP flag for allocating vmemmap pages.
+ *
+ * Return: %0 on success, negative error code otherwise.
+ */
+int vmemmap_remap_alloc(unsigned long start, unsigned long end,
+ unsigned long reuse, gfp_t gfp_mask)
+{
+ LIST_HEAD(vmemmap_pages);
+ struct vmemmap_remap_walk walk = {
+ .remap_pte = vmemmap_restore_pte,
+ .reuse_addr = reuse,
+ .vmemmap_pages = &vmemmap_pages,
+ };
+
+ /* See the comment in the vmemmap_remap_free(). */
+ BUG_ON(start - reuse != PAGE_SIZE);
+
+ if (alloc_vmemmap_page_list(start, end, gfp_mask, &vmemmap_pages))
+ return -ENOMEM;
+
+ mmap_read_lock(&init_mm);
+ vmemmap_remap_range(reuse, end, &walk);
+ mmap_read_unlock(&init_mm);
+
+ return 0;
+}
/*
* Allocate a block of memory to be used to back the virtual memory map
diff --git a/mm/sparse.c b/mm/sparse.c
index 7272f7a1449d..6326cdf36c4f 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -13,6 +13,7 @@
#include <linux/vmalloc.h>
#include <linux/swap.h>
#include <linux/swapops.h>
+#include <linux/bootmem_info.h>
#include "internal.h"
#include <asm/dma.h>
diff --git a/mm/swap.c b/mm/swap.c
index 6c11db780467..19600430e536 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -554,7 +554,7 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec)
} else {
/*
* The page's writeback ends up during pagevec
- * We moves tha page into tail of inactive.
+ * We move that page into tail of inactive.
*/
add_page_to_lru_list_tail(page, lruvec);
__count_vm_events(PGROTATED, nr_pages);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index e898c879a434..1e07d1c776f2 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -2967,7 +2967,7 @@ static unsigned long read_swap_header(struct swap_info_struct *p,
return 0;
}
- /* swap partition endianess hack... */
+ /* swap partition endianness hack... */
if (swab32(swap_header->info.version) == 1) {
swab32s(&swap_header->info.version);
swab32s(&swap_header->info.last_page);
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 63a73e164d55..0e2132834bc7 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -48,6 +48,78 @@ struct vm_area_struct *find_dst_vma(struct mm_struct *dst_mm,
return dst_vma;
}
+/*
+ * Install PTEs, to map dst_addr (within dst_vma) to page.
+ *
+ * This function handles both MCOPY_ATOMIC_NORMAL and _CONTINUE for both shmem
+ * and anon, and for both shared and private VMAs.
+ */
+int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
+ struct vm_area_struct *dst_vma,
+ unsigned long dst_addr, struct page *page,
+ bool newly_allocated, bool wp_copy)
+{
+ int ret;
+ pte_t _dst_pte, *dst_pte;
+ bool writable = dst_vma->vm_flags & VM_WRITE;
+ bool vm_shared = dst_vma->vm_flags & VM_SHARED;
+ bool page_in_cache = page->mapping;
+ spinlock_t *ptl;
+ struct inode *inode;
+ pgoff_t offset, max_off;
+
+ _dst_pte = mk_pte(page, dst_vma->vm_page_prot);
+ if (page_in_cache && !vm_shared)
+ writable = false;
+ if (writable || !page_in_cache)
+ _dst_pte = pte_mkdirty(_dst_pte);
+ if (writable) {
+ if (wp_copy)
+ _dst_pte = pte_mkuffd_wp(_dst_pte);
+ else
+ _dst_pte = pte_mkwrite(_dst_pte);
+ }
+
+ dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
+
+ if (vma_is_shmem(dst_vma)) {
+ /* serialize against truncate with the page table lock */
+ inode = dst_vma->vm_file->f_inode;
+ offset = linear_page_index(dst_vma, dst_addr);
+ max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
+ ret = -EFAULT;
+ if (unlikely(offset >= max_off))
+ goto out_unlock;
+ }
+
+ ret = -EEXIST;
+ if (!pte_none(*dst_pte))
+ goto out_unlock;
+
+ if (page_in_cache)
+ page_add_file_rmap(page, false);
+ else
+ page_add_new_anon_rmap(page, dst_vma, dst_addr, false);
+
+ /*
+ * Must happen after rmap, as mm_counter() checks mapping (via
+ * PageAnon()), which is set by __page_set_anon_rmap().
+ */
+ inc_mm_counter(dst_mm, mm_counter(page));
+
+ if (newly_allocated)
+ lru_cache_add_inactive_or_unevictable(page, dst_vma);
+
+ set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
+
+ /* No need to invalidate - it was non-present before */
+ update_mmu_cache(dst_vma, dst_addr, dst_pte);
+ ret = 0;
+out_unlock:
+ pte_unmap_unlock(dst_pte, ptl);
+ return ret;
+}
+
static int mcopy_atomic_pte(struct mm_struct *dst_mm,
pmd_t *dst_pmd,
struct vm_area_struct *dst_vma,
@@ -56,13 +128,9 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm,
struct page **pagep,
bool wp_copy)
{
- pte_t _dst_pte, *dst_pte;
- spinlock_t *ptl;
void *page_kaddr;
int ret;
struct page *page;
- pgoff_t offset, max_off;
- struct inode *inode;
if (!*pagep) {
ret = -ENOMEM;
@@ -99,43 +167,12 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm,
if (mem_cgroup_charge(page, dst_mm, GFP_KERNEL))
goto out_release;
- _dst_pte = pte_mkdirty(mk_pte(page, dst_vma->vm_page_prot));
- if (dst_vma->vm_flags & VM_WRITE) {
- if (wp_copy)
- _dst_pte = pte_mkuffd_wp(_dst_pte);
- else
- _dst_pte = pte_mkwrite(_dst_pte);
- }
-
- dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
- if (dst_vma->vm_file) {
- /* the shmem MAP_PRIVATE case requires checking the i_size */
- inode = dst_vma->vm_file->f_inode;
- offset = linear_page_index(dst_vma, dst_addr);
- max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
- ret = -EFAULT;
- if (unlikely(offset >= max_off))
- goto out_release_uncharge_unlock;
- }
- ret = -EEXIST;
- if (!pte_none(*dst_pte))
- goto out_release_uncharge_unlock;
-
- inc_mm_counter(dst_mm, MM_ANONPAGES);
- page_add_new_anon_rmap(page, dst_vma, dst_addr, false);
- lru_cache_add_inactive_or_unevictable(page, dst_vma);
-
- set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
-
- /* No need to invalidate - it was non-present before */
- update_mmu_cache(dst_vma, dst_addr, dst_pte);
-
- pte_unmap_unlock(dst_pte, ptl);
- ret = 0;
+ ret = mfill_atomic_install_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
+ page, true, wp_copy);
+ if (ret)
+ goto out_release;
out:
return ret;
-out_release_uncharge_unlock:
- pte_unmap_unlock(dst_pte, ptl);
out_release:
put_page(page);
goto out;
@@ -176,6 +213,41 @@ out_unlock:
return ret;
}
+/* Handles UFFDIO_CONTINUE for all shmem VMAs (shared or private). */
+static int mcontinue_atomic_pte(struct mm_struct *dst_mm,
+ pmd_t *dst_pmd,
+ struct vm_area_struct *dst_vma,
+ unsigned long dst_addr,
+ bool wp_copy)
+{
+ struct inode *inode = file_inode(dst_vma->vm_file);
+ pgoff_t pgoff = linear_page_index(dst_vma, dst_addr);
+ struct page *page;
+ int ret;
+
+ ret = shmem_getpage(inode, pgoff, &page, SGP_READ);
+ if (ret)
+ goto out;
+ if (!page) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ ret = mfill_atomic_install_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
+ page, false, wp_copy);
+ if (ret)
+ goto out_release;
+
+ unlock_page(page);
+ ret = 0;
+out:
+ return ret;
+out_release:
+ unlock_page(page);
+ put_page(page);
+ goto out;
+}
+
static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
{
pgd_t *pgd;
@@ -209,7 +281,6 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
unsigned long len,
enum mcopy_atomic_mode mode)
{
- int vm_alloc_shared = dst_vma->vm_flags & VM_SHARED;
int vm_shared = dst_vma->vm_flags & VM_SHARED;
ssize_t err;
pte_t *dst_pte;
@@ -308,7 +379,6 @@ retry:
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
i_mmap_unlock_read(mapping);
- vm_alloc_shared = vm_shared;
cond_resched();
@@ -346,54 +416,8 @@ retry:
out_unlock:
mmap_read_unlock(dst_mm);
out:
- if (page) {
- /*
- * We encountered an error and are about to free a newly
- * allocated huge page.
- *
- * Reservation handling is very subtle, and is different for
- * private and shared mappings. See the routine
- * restore_reserve_on_error for details. Unfortunately, we
- * can not call restore_reserve_on_error now as it would
- * require holding mmap_lock.
- *
- * If a reservation for the page existed in the reservation
- * map of a private mapping, the map was modified to indicate
- * the reservation was consumed when the page was allocated.
- * We clear the HPageRestoreReserve flag now so that the global
- * reserve count will not be incremented in free_huge_page.
- * The reservation map will still indicate the reservation
- * was consumed and possibly prevent later page allocation.
- * This is better than leaking a global reservation. If no
- * reservation existed, it is still safe to clear
- * HPageRestoreReserve as no adjustments to reservation counts
- * were made during allocation.
- *
- * The reservation map for shared mappings indicates which
- * pages have reservations. When a huge page is allocated
- * for an address with a reservation, no change is made to
- * the reserve map. In this case HPageRestoreReserve will be
- * set to indicate that the global reservation count should be
- * incremented when the page is freed. This is the desired
- * behavior. However, when a huge page is allocated for an
- * address without a reservation a reservation entry is added
- * to the reservation map, and HPageRestoreReserve will not be
- * set. When the page is freed, the global reserve count will
- * NOT be incremented and it will appear as though we have
- * leaked reserved page. In this case, set HPageRestoreReserve
- * so that the global reserve count will be incremented to
- * match the reservation map entry which was created.
- *
- * Note that vm_alloc_shared is based on the flags of the vma
- * for which the page was originally allocated. dst_vma could
- * be different or NULL on error.
- */
- if (vm_alloc_shared)
- SetHPageRestoreReserve(page);
- else
- ClearHPageRestoreReserve(page);
+ if (page)
put_page(page);
- }
BUG_ON(copied < 0);
BUG_ON(err > 0);
BUG_ON(!copied && !err);
@@ -415,11 +439,16 @@ static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm,
unsigned long dst_addr,
unsigned long src_addr,
struct page **page,
- bool zeropage,
+ enum mcopy_atomic_mode mode,
bool wp_copy)
{
ssize_t err;
+ if (mode == MCOPY_ATOMIC_CONTINUE) {
+ return mcontinue_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
+ wp_copy);
+ }
+
/*
* The normal page fault path for a shmem will invoke the
* fault, fill the hole in the file and COW it right away. The
@@ -431,7 +460,7 @@ static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm,
* and not in the radix tree.
*/
if (!(dst_vma->vm_flags & VM_SHARED)) {
- if (!zeropage)
+ if (mode == MCOPY_ATOMIC_NORMAL)
err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma,
dst_addr, src_addr, page,
wp_copy);
@@ -440,13 +469,10 @@ static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm,
dst_vma, dst_addr);
} else {
VM_WARN_ON_ONCE(wp_copy);
- if (!zeropage)
- err = shmem_mcopy_atomic_pte(dst_mm, dst_pmd,
- dst_vma, dst_addr,
- src_addr, page);
- else
- err = shmem_mfill_zeropage_pte(dst_mm, dst_pmd,
- dst_vma, dst_addr);
+ err = shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma,
+ dst_addr, src_addr,
+ mode != MCOPY_ATOMIC_NORMAL,
+ page);
}
return err;
@@ -467,7 +493,6 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
long copied;
struct page *page;
bool wp_copy;
- bool zeropage = (mcopy_mode == MCOPY_ATOMIC_ZEROPAGE);
/*
* Sanitize the command parameters:
@@ -530,7 +555,7 @@ retry:
if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
goto out_unlock;
- if (mcopy_mode == MCOPY_ATOMIC_CONTINUE)
+ if (!vma_is_shmem(dst_vma) && mcopy_mode == MCOPY_ATOMIC_CONTINUE)
goto out_unlock;
/*
@@ -578,7 +603,7 @@ retry:
BUG_ON(pmd_trans_huge(*dst_pmd));
err = mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
- src_addr, &page, zeropage, wp_copy);
+ src_addr, &page, mcopy_mode, wp_copy);
cond_resched();
if (unlikely(err == -ENOENT)) {
diff --git a/mm/util.c b/mm/util.c
index a8bf17f18a81..a034525e7ba2 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -1010,3 +1010,43 @@ void mem_dump_obj(void *object)
}
EXPORT_SYMBOL_GPL(mem_dump_obj);
#endif
+
+/*
+ * A driver might set a page logically offline -- PageOffline() -- and
+ * turn the page inaccessible in the hypervisor; after that, access to page
+ * content can be fatal.
+ *
+ * Some special PFN walkers -- i.e., /proc/kcore -- read content of random
+ * pages after checking PageOffline(); however, these PFN walkers can race
+ * with drivers that set PageOffline().
+ *
+ * page_offline_freeze()/page_offline_thaw() allows for a subsystem to
+ * synchronize with such drivers, achieving that a page cannot be set
+ * PageOffline() while frozen.
+ *
+ * page_offline_begin()/page_offline_end() is used by drivers that care about
+ * such races when setting a page PageOffline().
+ */
+static DECLARE_RWSEM(page_offline_rwsem);
+
+void page_offline_freeze(void)
+{
+ down_read(&page_offline_rwsem);
+}
+
+void page_offline_thaw(void)
+{
+ up_read(&page_offline_rwsem);
+}
+
+void page_offline_begin(void)
+{
+ down_write(&page_offline_rwsem);
+}
+EXPORT_SYMBOL(page_offline_begin);
+
+void page_offline_end(void)
+{
+ up_write(&page_offline_rwsem);
+}
+EXPORT_SYMBOL(page_offline_end);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index b2ec7f751bd0..d5cd52805149 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -25,6 +25,7 @@
#include <linux/notifier.h>
#include <linux/rbtree.h>
#include <linux/xarray.h>
+#include <linux/io.h>
#include <linux/rcupdate.h>
#include <linux/pfn.h>
#include <linux/kmemleak.h>
@@ -36,6 +37,7 @@
#include <linux/overflow.h>
#include <linux/pgtable.h>
#include <linux/uaccess.h>
+#include <linux/hugetlb.h>
#include <asm/tlbflush.h>
#include <asm/shmparam.h>
@@ -83,10 +85,11 @@ static void free_work(struct work_struct *w)
/*** Page table manipulation functions ***/
static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
phys_addr_t phys_addr, pgprot_t prot,
- pgtbl_mod_mask *mask)
+ unsigned int max_page_shift, pgtbl_mod_mask *mask)
{
pte_t *pte;
u64 pfn;
+ unsigned long size = PAGE_SIZE;
pfn = phys_addr >> PAGE_SHIFT;
pte = pte_alloc_kernel_track(pmd, addr, mask);
@@ -94,9 +97,22 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
return -ENOMEM;
do {
BUG_ON(!pte_none(*pte));
+
+#ifdef CONFIG_HUGETLB_PAGE
+ size = arch_vmap_pte_range_map_size(addr, end, pfn, max_page_shift);
+ if (size != PAGE_SIZE) {
+ pte_t entry = pfn_pte(pfn, prot);
+
+ entry = pte_mkhuge(entry);
+ entry = arch_make_huge_pte(entry, ilog2(size), 0);
+ set_huge_pte_at(&init_mm, addr, pte, entry);
+ pfn += PFN_DOWN(size);
+ continue;
+ }
+#endif
set_pte_at(&init_mm, addr, pte, pfn_pte(pfn, prot));
pfn++;
- } while (pte++, addr += PAGE_SIZE, addr != end);
+ } while (pte += PFN_DOWN(size), addr += size, addr != end);
*mask |= PGTBL_PTE_MODIFIED;
return 0;
}
@@ -145,7 +161,7 @@ static int vmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
continue;
}
- if (vmap_pte_range(pmd, addr, next, phys_addr, prot, mask))
+ if (vmap_pte_range(pmd, addr, next, phys_addr, prot, max_page_shift, mask))
return -ENOMEM;
} while (pmd++, phys_addr += (next - addr), addr = next, addr != end);
return 0;
@@ -1592,6 +1608,7 @@ static DEFINE_MUTEX(vmap_purge_lock);
/* for per-CPU blocks */
static void purge_fragmented_blocks_allcpus(void);
+#ifdef CONFIG_X86_64
/*
* called before a call to iounmap() if the caller wants vm_area_struct's
* immediately freed.
@@ -1600,6 +1617,7 @@ void set_iounmap_nonlazy(void)
{
atomic_long_set(&vmap_lazy_nr, lazy_max_pages()+1);
}
+#endif /* CONFIG_X86_64 */
/*
* Purges all lazily-freed vmap areas.
@@ -2912,8 +2930,7 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
return NULL;
}
- if (vmap_allow_huge && !(vm_flags & VM_NO_HUGE_VMAP) &&
- arch_vmap_pmd_supported(prot)) {
+ if (vmap_allow_huge && !(vm_flags & VM_NO_HUGE_VMAP)) {
unsigned long size_per_node;
/*
@@ -2926,11 +2943,13 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
size_per_node = size;
if (node == NUMA_NO_NODE)
size_per_node /= num_online_nodes();
- if (size_per_node >= PMD_SIZE) {
+ if (arch_vmap_pmd_supported(prot) && size_per_node >= PMD_SIZE)
shift = PMD_SHIFT;
- align = max(real_align, 1UL << shift);
- size = ALIGN(real_size, 1UL << shift);
- }
+ else
+ shift = arch_vmap_pte_supported_shift(size_per_node);
+
+ align = max(real_align, 1UL << shift);
+ size = ALIGN(real_size, 1UL << shift);
}
again:
diff --git a/mm/vmscan.c b/mm/vmscan.c
index d7c3cb8688dd..4620df62f0ff 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1499,7 +1499,8 @@ static unsigned int shrink_page_list(struct list_head *page_list,
if (unlikely(PageTransHuge(page)))
flags |= TTU_SPLIT_HUGE_PMD;
- if (!try_to_unmap(page, flags)) {
+ try_to_unmap(page, flags);
+ if (page_mapped(page)) {
stat->nr_unmap_fail += nr_pages;
if (!was_swapbacked && PageSwapBacked(page))
stat->nr_lazyfree_fail += nr_pages;
@@ -1701,6 +1702,7 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone,
unsigned int nr_reclaimed;
struct page *page, *next;
LIST_HEAD(clean_pages);
+ unsigned int noreclaim_flag;
list_for_each_entry_safe(page, next, page_list, lru) {
if (!PageHuge(page) && page_is_file_lru(page) &&
@@ -1711,8 +1713,17 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone,
}
}
+ /*
+ * We should be safe here since we are only dealing with file pages and
+ * we are not kswapd and therefore cannot write dirty file pages. But
+ * call memalloc_noreclaim_save() anyway, just in case these conditions
+ * change in the future.
+ */
+ noreclaim_flag = memalloc_noreclaim_save();
nr_reclaimed = shrink_page_list(&clean_pages, zone->zone_pgdat, &sc,
&stat, true);
+ memalloc_noreclaim_restore(noreclaim_flag);
+
list_splice(&clean_pages, page_list);
mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE,
-(long)nr_reclaimed);
@@ -1810,7 +1821,7 @@ static __always_inline void update_lru_sizes(struct lruvec *lruvec,
}
-/**
+/*
* Isolating page from the lruvec to fill in @dst list by nr_to_scan times.
*
* lruvec->lru_lock is heavily contended. Some of the functions that
@@ -2306,6 +2317,7 @@ unsigned long reclaim_pages(struct list_head *page_list)
LIST_HEAD(node_page_list);
struct reclaim_stat dummy_stat;
struct page *page;
+ unsigned int noreclaim_flag;
struct scan_control sc = {
.gfp_mask = GFP_KERNEL,
.priority = DEF_PRIORITY,
@@ -2314,6 +2326,8 @@ unsigned long reclaim_pages(struct list_head *page_list)
.may_swap = 1,
};
+ noreclaim_flag = memalloc_noreclaim_save();
+
while (!list_empty(page_list)) {
page = lru_to_page(page_list);
if (nid == NUMA_NO_NODE) {
@@ -2350,6 +2364,8 @@ unsigned long reclaim_pages(struct list_head *page_list)
}
}
+ memalloc_noreclaim_restore(noreclaim_flag);
+
return nr_reclaimed;
}
diff --git a/mm/workingset.c b/mm/workingset.c
index 4f7a306ce75a..5ba3e42446fa 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -168,8 +168,10 @@
* refault distance will immediately activate the refaulting page.
*/
+#define WORKINGSET_SHIFT 1
#define EVICTION_SHIFT ((BITS_PER_LONG - BITS_PER_XA_VALUE) + \
- 1 + NODES_SHIFT + MEM_CGROUP_ID_SHIFT)
+ WORKINGSET_SHIFT + NODES_SHIFT + \
+ MEM_CGROUP_ID_SHIFT)
#define EVICTION_MASK (~0UL >> EVICTION_SHIFT)
/*
@@ -189,7 +191,7 @@ static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction,
eviction &= EVICTION_MASK;
eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid;
eviction = (eviction << NODES_SHIFT) | pgdat->node_id;
- eviction = (eviction << 1) | workingset;
+ eviction = (eviction << WORKINGSET_SHIFT) | workingset;
return xa_mk_value(eviction);
}
@@ -201,8 +203,8 @@ static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat,
int memcgid, nid;
bool workingset;
- workingset = entry & 1;
- entry >>= 1;
+ workingset = entry & ((1UL << WORKINGSET_SHIFT) - 1);
+ entry >>= WORKINGSET_SHIFT;
nid = entry & ((1UL << NODES_SHIFT) - 1);
entry >>= NODES_SHIFT;
memcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1);
diff --git a/mm/z3fold.c b/mm/z3fold.c
index 7fe7adaaad01..b3c0577b8095 100644
--- a/mm/z3fold.c
+++ b/mm/z3fold.c
@@ -62,7 +62,7 @@
#define ZHDR_SIZE_ALIGNED round_up(sizeof(struct z3fold_header), CHUNK_SIZE)
#define ZHDR_CHUNKS (ZHDR_SIZE_ALIGNED >> CHUNK_SHIFT)
#define TOTAL_CHUNKS (PAGE_SIZE >> CHUNK_SHIFT)
-#define NCHUNKS ((PAGE_SIZE - ZHDR_SIZE_ALIGNED) >> CHUNK_SHIFT)
+#define NCHUNKS (TOTAL_CHUNKS - ZHDR_CHUNKS)
#define BUDDY_MASK (0x3)
#define BUDDY_SHIFT 2
@@ -144,6 +144,8 @@ struct z3fold_header {
* @c_handle: cache for z3fold_buddy_slots allocation
* @ops: pointer to a structure of user defined operations specified at
* pool creation time.
+ * @zpool: zpool driver
+ * @zpool_ops: zpool operations structure with an evict callback
* @compact_wq: workqueue for page layout background optimization
* @release_wq: workqueue for safe page release
* @work: work_struct for safe page release
@@ -253,9 +255,8 @@ static inline void z3fold_page_unlock(struct z3fold_header *zhdr)
spin_unlock(&zhdr->page_lock);
}
-
-static inline struct z3fold_header *__get_z3fold_header(unsigned long handle,
- bool lock)
+/* return locked z3fold page if it's not headless */
+static inline struct z3fold_header *get_z3fold_header(unsigned long handle)
{
struct z3fold_buddy_slots *slots;
struct z3fold_header *zhdr;
@@ -269,13 +270,12 @@ static inline struct z3fold_header *__get_z3fold_header(unsigned long handle,
read_lock(&slots->lock);
addr = *(unsigned long *)handle;
zhdr = (struct z3fold_header *)(addr & PAGE_MASK);
- if (lock)
- locked = z3fold_page_trylock(zhdr);
+ locked = z3fold_page_trylock(zhdr);
read_unlock(&slots->lock);
if (locked)
break;
cpu_relax();
- } while (lock);
+ } while (true);
} else {
zhdr = (struct z3fold_header *)(handle & PAGE_MASK);
}
@@ -283,18 +283,6 @@ static inline struct z3fold_header *__get_z3fold_header(unsigned long handle,
return zhdr;
}
-/* Returns the z3fold page where a given handle is stored */
-static inline struct z3fold_header *handle_to_z3fold_header(unsigned long h)
-{
- return __get_z3fold_header(h, false);
-}
-
-/* return locked z3fold page if it's not headless */
-static inline struct z3fold_header *get_z3fold_header(unsigned long h)
-{
- return __get_z3fold_header(h, true);
-}
-
static inline void put_z3fold_header(struct z3fold_header *zhdr)
{
struct page *page = virt_to_page(zhdr);
@@ -998,7 +986,8 @@ static struct z3fold_pool *z3fold_create_pool(const char *name, gfp_t gfp,
goto out_c;
spin_lock_init(&pool->lock);
spin_lock_init(&pool->stale_lock);
- pool->unbuddied = __alloc_percpu(sizeof(struct list_head)*NCHUNKS, 2);
+ pool->unbuddied = __alloc_percpu(sizeof(struct list_head) * NCHUNKS,
+ __alignof__(struct list_head));
if (!pool->unbuddied)
goto out_pool;
for_each_possible_cpu(cpu) {
@@ -1059,6 +1048,7 @@ static void z3fold_destroy_pool(struct z3fold_pool *pool)
destroy_workqueue(pool->compact_wq);
destroy_workqueue(pool->release_wq);
z3fold_unregister_migration(pool);
+ free_percpu(pool->unbuddied);
kfree(pool);
}
@@ -1382,7 +1372,7 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries)
if (zhdr->foreign_handles ||
test_and_set_bit(PAGE_CLAIMED, &page->private)) {
if (kref_put(&zhdr->refcount,
- release_z3fold_page))
+ release_z3fold_page_locked))
atomic64_dec(&pool->pages_nr);
else
z3fold_page_unlock(zhdr);
@@ -1803,8 +1793,11 @@ static int __init init_z3fold(void)
{
int ret;
- /* Make sure the z3fold header is not larger than the page size */
- BUILD_BUG_ON(ZHDR_SIZE_ALIGNED > PAGE_SIZE);
+ /*
+ * Make sure the z3fold header is not larger than the page size and
+ * there has remaining spaces for its buddy.
+ */
+ BUILD_BUG_ON(ZHDR_SIZE_ALIGNED > PAGE_SIZE - CHUNK_SIZE);
ret = z3fold_mount();
if (ret)
return ret;
diff --git a/mm/zbud.c b/mm/zbud.c
index 7ec5f27a68b0..6348932430b8 100644
--- a/mm/zbud.c
+++ b/mm/zbud.c
@@ -51,7 +51,6 @@
#include <linux/preempt.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
-#include <linux/zbud.h>
#include <linux/zpool.h>
/*****************
@@ -73,6 +72,12 @@
#define ZHDR_SIZE_ALIGNED CHUNK_SIZE
#define NCHUNKS ((PAGE_SIZE - ZHDR_SIZE_ALIGNED) >> CHUNK_SHIFT)
+struct zbud_pool;
+
+struct zbud_ops {
+ int (*evict)(struct zbud_pool *pool, unsigned long handle);
+};
+
/**
* struct zbud_pool - stores metadata for each zbud pool
* @lock: protects all pool fields and first|last_chunk fields of any
@@ -87,21 +92,27 @@
* @pages_nr: number of zbud pages in the pool.
* @ops: pointer to a structure of user defined operations specified at
* pool creation time.
+ * @zpool: zpool driver
+ * @zpool_ops: zpool operations structure with an evict callback
*
* This structure is allocated at pool creation time and maintains metadata
* pertaining to a particular zbud pool.
*/
struct zbud_pool {
spinlock_t lock;
- struct list_head unbuddied[NCHUNKS];
- struct list_head buddied;
+ union {
+ /*
+ * Reuse unbuddied[0] as buddied on the ground that
+ * unbuddied[0] is unused.
+ */
+ struct list_head buddied;
+ struct list_head unbuddied[NCHUNKS];
+ };
struct list_head lru;
u64 pages_nr;
const struct zbud_ops *ops;
-#ifdef CONFIG_ZPOOL
struct zpool *zpool;
const struct zpool_ops *zpool_ops;
-#endif
};
/*
@@ -121,104 +132,6 @@ struct zbud_header {
};
/*****************
- * zpool
- ****************/
-
-#ifdef CONFIG_ZPOOL
-
-static int zbud_zpool_evict(struct zbud_pool *pool, unsigned long handle)
-{
- if (pool->zpool && pool->zpool_ops && pool->zpool_ops->evict)
- return pool->zpool_ops->evict(pool->zpool, handle);
- else
- return -ENOENT;
-}
-
-static const struct zbud_ops zbud_zpool_ops = {
- .evict = zbud_zpool_evict
-};
-
-static void *zbud_zpool_create(const char *name, gfp_t gfp,
- const struct zpool_ops *zpool_ops,
- struct zpool *zpool)
-{
- struct zbud_pool *pool;
-
- pool = zbud_create_pool(gfp, zpool_ops ? &zbud_zpool_ops : NULL);
- if (pool) {
- pool->zpool = zpool;
- pool->zpool_ops = zpool_ops;
- }
- return pool;
-}
-
-static void zbud_zpool_destroy(void *pool)
-{
- zbud_destroy_pool(pool);
-}
-
-static int zbud_zpool_malloc(void *pool, size_t size, gfp_t gfp,
- unsigned long *handle)
-{
- return zbud_alloc(pool, size, gfp, handle);
-}
-static void zbud_zpool_free(void *pool, unsigned long handle)
-{
- zbud_free(pool, handle);
-}
-
-static int zbud_zpool_shrink(void *pool, unsigned int pages,
- unsigned int *reclaimed)
-{
- unsigned int total = 0;
- int ret = -EINVAL;
-
- while (total < pages) {
- ret = zbud_reclaim_page(pool, 8);
- if (ret < 0)
- break;
- total++;
- }
-
- if (reclaimed)
- *reclaimed = total;
-
- return ret;
-}
-
-static void *zbud_zpool_map(void *pool, unsigned long handle,
- enum zpool_mapmode mm)
-{
- return zbud_map(pool, handle);
-}
-static void zbud_zpool_unmap(void *pool, unsigned long handle)
-{
- zbud_unmap(pool, handle);
-}
-
-static u64 zbud_zpool_total_size(void *pool)
-{
- return zbud_get_pool_size(pool) * PAGE_SIZE;
-}
-
-static struct zpool_driver zbud_zpool_driver = {
- .type = "zbud",
- .sleep_mapped = true,
- .owner = THIS_MODULE,
- .create = zbud_zpool_create,
- .destroy = zbud_zpool_destroy,
- .malloc = zbud_zpool_malloc,
- .free = zbud_zpool_free,
- .shrink = zbud_zpool_shrink,
- .map = zbud_zpool_map,
- .unmap = zbud_zpool_unmap,
- .total_size = zbud_zpool_total_size,
-};
-
-MODULE_ALIAS("zpool-zbud");
-#endif /* CONFIG_ZPOOL */
-
-/*****************
* Helpers
*****************/
/* Just to make the code easier to read */
@@ -304,7 +217,7 @@ static int num_free_chunks(struct zbud_header *zhdr)
* Return: pointer to the new zbud pool or NULL if the metadata allocation
* failed.
*/
-struct zbud_pool *zbud_create_pool(gfp_t gfp, const struct zbud_ops *ops)
+static struct zbud_pool *zbud_create_pool(gfp_t gfp, const struct zbud_ops *ops)
{
struct zbud_pool *pool;
int i;
@@ -328,7 +241,7 @@ struct zbud_pool *zbud_create_pool(gfp_t gfp, const struct zbud_ops *ops)
*
* The pool should be emptied before this function is called.
*/
-void zbud_destroy_pool(struct zbud_pool *pool)
+static void zbud_destroy_pool(struct zbud_pool *pool)
{
kfree(pool);
}
@@ -352,7 +265,7 @@ void zbud_destroy_pool(struct zbud_pool *pool)
* gfp arguments are invalid or -ENOMEM if the pool was unable to allocate
* a new page.
*/
-int zbud_alloc(struct zbud_pool *pool, size_t size, gfp_t gfp,
+static int zbud_alloc(struct zbud_pool *pool, size_t size, gfp_t gfp,
unsigned long *handle)
{
int chunks, i, freechunks;
@@ -427,7 +340,7 @@ found:
* only sets the first|last_chunks to 0. The page is actually freed
* once both buddies are evicted (see zbud_reclaim_page() below).
*/
-void zbud_free(struct zbud_pool *pool, unsigned long handle)
+static void zbud_free(struct zbud_pool *pool, unsigned long handle)
{
struct zbud_header *zhdr;
int freechunks;
@@ -499,7 +412,7 @@ void zbud_free(struct zbud_pool *pool, unsigned long handle)
* no pages to evict or an eviction handler is not registered, -EAGAIN if
* the retry limit was hit.
*/
-int zbud_reclaim_page(struct zbud_pool *pool, unsigned int retries)
+static int zbud_reclaim_page(struct zbud_pool *pool, unsigned int retries)
{
int i, ret, freechunks;
struct zbud_header *zhdr;
@@ -581,7 +494,7 @@ next:
*
* Returns: a pointer to the mapped allocation
*/
-void *zbud_map(struct zbud_pool *pool, unsigned long handle)
+static void *zbud_map(struct zbud_pool *pool, unsigned long handle)
{
return (void *)(handle);
}
@@ -591,7 +504,7 @@ void *zbud_map(struct zbud_pool *pool, unsigned long handle)
* @pool: pool in which the allocation resides
* @handle: handle associated with the allocation to be unmapped
*/
-void zbud_unmap(struct zbud_pool *pool, unsigned long handle)
+static void zbud_unmap(struct zbud_pool *pool, unsigned long handle)
{
}
@@ -602,30 +515,120 @@ void zbud_unmap(struct zbud_pool *pool, unsigned long handle)
* Returns: size in pages of the given pool. The pool lock need not be
* taken to access pages_nr.
*/
-u64 zbud_get_pool_size(struct zbud_pool *pool)
+static u64 zbud_get_pool_size(struct zbud_pool *pool)
{
return pool->pages_nr;
}
+/*****************
+ * zpool
+ ****************/
+
+static int zbud_zpool_evict(struct zbud_pool *pool, unsigned long handle)
+{
+ if (pool->zpool && pool->zpool_ops && pool->zpool_ops->evict)
+ return pool->zpool_ops->evict(pool->zpool, handle);
+ else
+ return -ENOENT;
+}
+
+static const struct zbud_ops zbud_zpool_ops = {
+ .evict = zbud_zpool_evict
+};
+
+static void *zbud_zpool_create(const char *name, gfp_t gfp,
+ const struct zpool_ops *zpool_ops,
+ struct zpool *zpool)
+{
+ struct zbud_pool *pool;
+
+ pool = zbud_create_pool(gfp, zpool_ops ? &zbud_zpool_ops : NULL);
+ if (pool) {
+ pool->zpool = zpool;
+ pool->zpool_ops = zpool_ops;
+ }
+ return pool;
+}
+
+static void zbud_zpool_destroy(void *pool)
+{
+ zbud_destroy_pool(pool);
+}
+
+static int zbud_zpool_malloc(void *pool, size_t size, gfp_t gfp,
+ unsigned long *handle)
+{
+ return zbud_alloc(pool, size, gfp, handle);
+}
+static void zbud_zpool_free(void *pool, unsigned long handle)
+{
+ zbud_free(pool, handle);
+}
+
+static int zbud_zpool_shrink(void *pool, unsigned int pages,
+ unsigned int *reclaimed)
+{
+ unsigned int total = 0;
+ int ret = -EINVAL;
+
+ while (total < pages) {
+ ret = zbud_reclaim_page(pool, 8);
+ if (ret < 0)
+ break;
+ total++;
+ }
+
+ if (reclaimed)
+ *reclaimed = total;
+
+ return ret;
+}
+
+static void *zbud_zpool_map(void *pool, unsigned long handle,
+ enum zpool_mapmode mm)
+{
+ return zbud_map(pool, handle);
+}
+static void zbud_zpool_unmap(void *pool, unsigned long handle)
+{
+ zbud_unmap(pool, handle);
+}
+
+static u64 zbud_zpool_total_size(void *pool)
+{
+ return zbud_get_pool_size(pool) * PAGE_SIZE;
+}
+
+static struct zpool_driver zbud_zpool_driver = {
+ .type = "zbud",
+ .sleep_mapped = true,
+ .owner = THIS_MODULE,
+ .create = zbud_zpool_create,
+ .destroy = zbud_zpool_destroy,
+ .malloc = zbud_zpool_malloc,
+ .free = zbud_zpool_free,
+ .shrink = zbud_zpool_shrink,
+ .map = zbud_zpool_map,
+ .unmap = zbud_zpool_unmap,
+ .total_size = zbud_zpool_total_size,
+};
+
+MODULE_ALIAS("zpool-zbud");
+
static int __init init_zbud(void)
{
/* Make sure the zbud header will fit in one chunk */
BUILD_BUG_ON(sizeof(struct zbud_header) > ZHDR_SIZE_ALIGNED);
pr_info("loaded\n");
-#ifdef CONFIG_ZPOOL
zpool_register_driver(&zbud_zpool_driver);
-#endif
return 0;
}
static void __exit exit_zbud(void)
{
-#ifdef CONFIG_ZPOOL
zpool_unregister_driver(&zbud_zpool_driver);
-#endif
-
pr_info("unloaded\n");
}
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 19b563bc6c48..68e8831068f4 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -1471,7 +1471,6 @@ static void obj_free(struct size_class *class, unsigned long obj)
unsigned int f_objidx;
void *vaddr;
- obj &= ~OBJ_ALLOCATED_TAG;
obj_to_location(obj, &f_page, &f_objidx);
f_offset = (class->size * f_objidx) & ~PAGE_MASK;
zspage = get_zspage(f_page);
@@ -2163,7 +2162,7 @@ static void async_free_zspage(struct work_struct *work)
VM_BUG_ON(fullness != ZS_EMPTY);
class = pool->size_class[class_idx];
spin_lock(&class->lock);
- __free_zspage(pool, pool->size_class[class_idx], zspage);
+ __free_zspage(pool, class, zspage);
spin_unlock(&class->lock);
}
};
diff --git a/mm/zswap.c b/mm/zswap.c
index 20763267a219..7944e3e57e78 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -967,6 +967,13 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
spin_unlock(&tree->lock);
BUG_ON(offset != entry->offset);
+ src = (u8 *)zhdr + sizeof(struct zswap_header);
+ if (!zpool_can_sleep_mapped(pool)) {
+ memcpy(tmp, src, entry->length);
+ src = tmp;
+ zpool_unmap_handle(pool, handle);
+ }
+
/* try to allocate swap cache page */
switch (zswap_get_swap_cache_page(swpentry, &page)) {
case ZSWAP_SWAPCACHE_FAIL: /* no memory or invalidate happened */
@@ -982,17 +989,7 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
case ZSWAP_SWAPCACHE_NEW: /* page is locked */
/* decompress */
acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
-
dlen = PAGE_SIZE;
- src = (u8 *)zhdr + sizeof(struct zswap_header);
-
- if (!zpool_can_sleep_mapped(pool)) {
-
- memcpy(tmp, src, entry->length);
- src = tmp;
-
- zpool_unmap_handle(pool, handle);
- }
mutex_lock(acomp_ctx->mutex);
sg_init_one(&input, src, entry->length);
@@ -1203,7 +1200,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
zswap_reject_alloc_fail++;
goto put_dstmem;
}
- buf = zpool_map_handle(entry->pool->zpool, handle, ZPOOL_MM_RW);
+ buf = zpool_map_handle(entry->pool->zpool, handle, ZPOOL_MM_WO);
memcpy(buf, &zhdr, hlen);
memcpy(buf + hlen, dst, dlen);
zpool_unmap_handle(entry->pool->zpool, handle);
@@ -1427,18 +1424,11 @@ static int __init zswap_debugfs_init(void)
return 0;
}
-
-static void __exit zswap_debugfs_exit(void)
-{
- debugfs_remove_recursive(zswap_debugfs_root);
-}
#else
static int __init zswap_debugfs_init(void)
{
return 0;
}
-
-static void __exit zswap_debugfs_exit(void) { }
#endif
/*********************************