Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig | 14
-rw-r--r--  mm/Makefile | 1
-rw-r--r--  mm/backing-dev.c | 14
-rw-r--r--  mm/cma.h | 2
-rw-r--r--  mm/compaction.c | 11
-rw-r--r--  mm/debug.c | 18
-rw-r--r--  mm/debug_vm_pgtable.c | 207
-rw-r--r--  mm/dmapool.c | 46
-rw-r--r--  mm/fadvise.c | 9
-rw-r--r--  mm/filemap.c | 150
-rw-r--r--  mm/gup.c | 105
-rw-r--r--  mm/gup_benchmark.c | 38
-rw-r--r--  mm/highmem.c | 4
-rw-r--r--  mm/huge_memory.c | 52
-rw-r--r--  mm/hugetlb.c | 122
-rw-r--r--  mm/hwpoison-inject.c | 18
-rw-r--r--  mm/internal.h | 30
-rw-r--r--  mm/kasan/report.c | 34
-rw-r--r--  mm/khugepaged.c | 27
-rw-r--r--  mm/kmemleak-test.c | 99
-rw-r--r--  mm/kmemleak.c | 8
-rw-r--r--  mm/ksm.c | 2
-rw-r--r--  mm/madvise.c | 198
-rw-r--r--  mm/memblock.c | 106
-rw-r--r--  mm/memcontrol.c | 337
-rw-r--r--  mm/memory-failure.c | 336
-rw-r--r--  mm/memory.c | 212
-rw-r--r--  mm/memory_hotplug.c | 267
-rw-r--r--  mm/mempolicy.c | 8
-rw-r--r--  mm/mempool.c | 18
-rw-r--r--  mm/memremap.c | 309
-rw-r--r--  mm/migrate.c | 87
-rw-r--r--  mm/mincore.c | 28
-rw-r--r--  mm/mmap.c | 143
-rw-r--r--  mm/mmu_notifier.c | 2
-rw-r--r--  mm/mprotect.c | 6
-rw-r--r--  mm/nommu.c | 9
-rw-r--r--  mm/oom_kill.c | 2
-rw-r--r--  mm/page-writeback.c | 19
-rw-r--r--  mm/page_alloc.c | 328
-rw-r--r--  mm/page_counter.c | 2
-rw-r--r--  mm/page_io.c | 42
-rw-r--r--  mm/page_isolation.c | 55
-rw-r--r--  mm/page_owner.c | 10
-rw-r--r--  mm/page_poison.c | 20
-rw-r--r--  mm/page_reporting.c | 4
-rw-r--r--  mm/percpu.c | 3
-rw-r--r--  mm/process_vm_access.c | 86
-rw-r--r--  mm/readahead.c | 148
-rw-r--r--  mm/rmap.c | 10
-rw-r--r--  mm/shmem.c | 30
-rw-r--r--  mm/shuffle.c | 2
-rw-r--r--  mm/slab.c | 6
-rw-r--r--  mm/slab.h | 46
-rw-r--r--  mm/slub.c | 35
-rw-r--r--  mm/sparse.c | 12
-rw-r--r--  mm/swap.c | 79
-rw-r--r--  mm/swap_slots.c | 3
-rw-r--r--  mm/swap_state.c | 38
-rw-r--r--  mm/swapfile.c | 63
-rw-r--r--  mm/truncate.c | 64
-rw-r--r--  mm/util.c | 5
-rw-r--r--  mm/vmalloc.c | 153
-rw-r--r--  mm/vmscan.c | 10
-rw-r--r--  mm/vmstat.c | 8
-rw-r--r--  mm/workingset.c | 15
-rw-r--r--  mm/z3fold.c | 3
-rw-r--r--  mm/zbud.c | 1
-rw-r--r--  mm/zsmalloc.c | 10
69 files changed, 2283 insertions(+), 2106 deletions(-)
diff --git a/mm/Kconfig b/mm/Kconfig
index 6c974888f86f..d42423f884a7 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -152,6 +152,7 @@ config HAVE_BOOTMEM_INFO_NODE
# eventually, we can have this option just 'select SPARSEMEM'
config MEMORY_HOTPLUG
bool "Allow for memory hot-add"
+ select MEMORY_ISOLATION
depends on SPARSEMEM || X86_64_ACPI_NUMA
depends on ARCH_ENABLE_MEMORY_HOTPLUG
depends on 64BIT || BROKEN
@@ -178,7 +179,6 @@ config MEMORY_HOTPLUG_DEFAULT_ONLINE
config MEMORY_HOTREMOVE
bool "Allow for memory hot remove"
- select MEMORY_ISOLATION
select HAVE_BOOTMEM_INFO_NODE if (X86_64 || PPC64)
depends on MEMORY_HOTPLUG && ARCH_ENABLE_MEMORY_HOTREMOVE
depends on MIGRATION
@@ -383,7 +383,7 @@ config NOMMU_INITIAL_TRIM_EXCESS
This option specifies the initial value of this option. The default
of 1 says that all excess pages should be trimmed.
- See Documentation/mm/nommu-mmap.rst for more information.
+ See Documentation/admin-guide/mm/nommu-mmap.rst for more information.
config TRANSPARENT_HUGEPAGE
bool "Transparent Hugepage Support"
@@ -516,13 +516,14 @@ config CMA_DEBUGFS
config CMA_AREAS
int "Maximum count of the CMA areas"
depends on CMA
+ default 19 if NUMA
default 7
help
CMA allows to create CMA areas for particular purpose, mainly,
used as device private area. This parameter sets the maximum
number of CMA area in the system.
- If unsure, leave the default value "7".
+ If unsure, leave the default value "7" in UMA and "19" in NUMA.
config MEM_SOFT_DIRTY
bool "Track memory changes"
@@ -815,6 +816,9 @@ config DEVICE_PRIVATE
memory; i.e., memory that is only accessible from the device (or
group of devices). You likely also want to select HMM_MIRROR.
+config VMAP_PFN
+ bool
+
config FRAME_VECTOR
bool
@@ -831,10 +835,10 @@ config PERCPU_STATS
be used to help understand percpu memory usage.
config GUP_BENCHMARK
- bool "Enable infrastructure for get_user_pages_fast() benchmarking"
+ bool "Enable infrastructure for get_user_pages() and related calls benchmarking"
help
Provides /sys/kernel/debug/gup_benchmark that helps with testing
- performance of get_user_pages_fast().
+ performance of get_user_pages() and related calls.
See tools/testing/selftests/vm/gup_benchmark.c
diff --git a/mm/Makefile b/mm/Makefile
index d5649f1c12c0..d73aed0fc99c 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -94,7 +94,6 @@ obj-$(CONFIG_GUP_BENCHMARK) += gup_benchmark.o
obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
-obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o
obj-$(CONFIG_DEBUG_RODATA_TEST) += rodata_test.o
obj-$(CONFIG_DEBUG_VM_PGTABLE) += debug_vm_pgtable.o
obj-$(CONFIG_PAGE_OWNER) += page_owner.o
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 8e8b00627bb2..408d5051d05b 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -14,9 +14,7 @@
#include <linux/device.h>
#include <trace/events/writeback.h>
-struct backing_dev_info noop_backing_dev_info = {
- .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
-};
+struct backing_dev_info noop_backing_dev_info;
EXPORT_SYMBOL_GPL(noop_backing_dev_info);
static struct class *bdi_class;
@@ -204,10 +202,9 @@ static ssize_t stable_pages_required_show(struct device *dev,
struct device_attribute *attr,
char *page)
{
- struct backing_dev_info *bdi = dev_get_drvdata(dev);
-
- return snprintf(page, PAGE_SIZE-1, "%d\n",
- bdi_cap_stable_pages_required(bdi) ? 1 : 0);
+ dev_warn_once(dev,
+ "the stable_pages_required attribute has been removed. Use the stable_writes queue attribute instead.\n");
+ return snprintf(page, PAGE_SIZE-1, "%d\n", 0);
}
static DEVICE_ATTR_RO(stable_pages_required);
@@ -746,6 +743,9 @@ struct backing_dev_info *bdi_alloc(int node_id)
kfree(bdi);
return NULL;
}
+ bdi->capabilities = BDI_CAP_WRITEBACK | BDI_CAP_WRITEBACK_ACCT;
+ bdi->ra_pages = VM_READAHEAD_PAGES;
+ bdi->io_pages = VM_READAHEAD_PAGES;
return bdi;
}
EXPORT_SYMBOL(bdi_alloc);
diff --git a/mm/cma.h b/mm/cma.h
index 20f6e24bc477..42ae082cb067 100644
--- a/mm/cma.h
+++ b/mm/cma.h
@@ -4,8 +4,6 @@
#include <linux/debugfs.h>
-#define CMA_MAX_NAME 64
-
struct cma {
unsigned long base_pfn;
unsigned long count;
diff --git a/mm/compaction.c b/mm/compaction.c
index 176dcded298e..6e0ee5641788 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -180,11 +180,10 @@ bool compaction_deferred(struct zone *zone, int order)
return false;
/* Avoid possible overflow */
- if (++zone->compact_considered > defer_limit)
+ if (++zone->compact_considered >= defer_limit) {
zone->compact_considered = defer_limit;
-
- if (zone->compact_considered >= defer_limit)
return false;
+ }
trace_mm_compaction_deferred(zone, order);
@@ -626,7 +625,7 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
}
/* Found a free page, will break it into order-0 pages */
- order = page_order(page);
+ order = buddy_order(page);
isolated = __isolate_free_page(page, order);
if (!isolated)
break;
@@ -899,7 +898,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
* potential isolation targets.
*/
if (PageBuddy(page)) {
- unsigned long freepage_order = page_order_unsafe(page);
+ unsigned long freepage_order = buddy_order_unsafe(page);
/*
* Without lock, we cannot be sure that what we got is
@@ -1173,7 +1172,7 @@ static bool suitable_migration_target(struct compact_control *cc,
* the only small danger is that we skip a potentially suitable
* pageblock, so it's not worth to check order for valid range.
*/
- if (page_order_unsafe(page) >= pageblock_order)
+ if (buddy_order_unsafe(page) >= pageblock_order)
return false;
}
diff --git a/mm/debug.c b/mm/debug.c
index ca8d1cacdecc..ccca576b2899 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -102,12 +102,12 @@ void __dump_page(struct page *page, const char *reason)
if (hpage_pincount_available(page)) {
pr_warn("head:%p order:%u compound_mapcount:%d compound_pincount:%d\n",
head, compound_order(head),
- head_mapcount(head),
- head_pincount(head));
+ head_compound_mapcount(head),
+ head_compound_pincount(head));
} else {
pr_warn("head:%p order:%u compound_mapcount:%d\n",
head, compound_order(head),
- head_mapcount(head));
+ head_compound_mapcount(head));
}
}
if (PageKsm(page))
@@ -120,6 +120,7 @@ void __dump_page(struct page *page, const char *reason)
struct hlist_node *dentry_first;
struct dentry *dentry_ptr;
struct dentry dentry;
+ unsigned long ino;
/*
* mapping can be invalid pointer and we don't want to crash
@@ -136,21 +137,22 @@ void __dump_page(struct page *page, const char *reason)
goto out_mapping;
}
- if (get_kernel_nofault(dentry_first, &host->i_dentry.first)) {
+ if (get_kernel_nofault(dentry_first, &host->i_dentry.first) ||
+ get_kernel_nofault(ino, &host->i_ino)) {
pr_warn("aops:%ps with invalid host inode %px\n",
a_ops, host);
goto out_mapping;
}
if (!dentry_first) {
- pr_warn("aops:%ps ino:%lx\n", a_ops, host->i_ino);
+ pr_warn("aops:%ps ino:%lx\n", a_ops, ino);
goto out_mapping;
}
dentry_ptr = container_of(dentry_first, struct dentry, d_u.d_alias);
if (get_kernel_nofault(dentry, dentry_ptr)) {
- pr_warn("aops:%ps with invalid dentry %px\n", a_ops,
- dentry_ptr);
+ pr_warn("aops:%ps ino:%lx with invalid dentry %px\n",
+ a_ops, ino, dentry_ptr);
} else {
/*
* if dentry is corrupted, the %pd handler may still
@@ -158,7 +160,7 @@ void __dump_page(struct page *page, const char *reason)
* corrupted struct page
*/
pr_warn("aops:%ps ino:%lx dentry name:\"%pd\"\n",
- a_ops, host->i_ino, &dentry);
+ a_ops, ino, &dentry);
}
}
out_mapping:
diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index 086309fb9b6f..c05d9dcf7891 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -28,6 +28,7 @@
#include <linux/swapops.h>
#include <linux/start_kernel.h>
#include <linux/sched/mm.h>
+#include <linux/io.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
@@ -44,10 +45,17 @@
* entry type. But these bits might affect the ability to clear entries with
* pxx_clear() because of how dynamic page table folding works on s390. So
* while loading up the entries do not change the lower 4 bits. It does not
- * have affect any other platform.
+ * have affect any other platform. Also avoid the 62nd bit on ppc64 that is
+ * used to mark a pte entry.
*/
-#define S390_MASK_BITS 4
-#define RANDOM_ORVALUE GENMASK(BITS_PER_LONG - 1, S390_MASK_BITS)
+#define S390_SKIP_MASK GENMASK(3, 0)
+#if __BITS_PER_LONG == 64
+#define PPC64_SKIP_MASK GENMASK(62, 62)
+#else
+#define PPC64_SKIP_MASK 0x0
+#endif
+#define ARCH_SKIP_MASK (S390_SKIP_MASK | PPC64_SKIP_MASK)
+#define RANDOM_ORVALUE (GENMASK(BITS_PER_LONG - 1, 0) & ~ARCH_SKIP_MASK)
#define RANDOM_NZVALUE GENMASK(7, 0)
static void __init pte_basic_tests(unsigned long pfn, pgprot_t prot)
@@ -71,15 +79,18 @@ static void __init pte_advanced_tests(struct mm_struct *mm,
{
pte_t pte = pfn_pte(pfn, prot);
+ /*
+ * Architectures optimize set_pte_at by avoiding TLB flush.
+ * This requires set_pte_at to be not used to update an
+ * existing pte entry. Clear pte before we do set_pte_at
+ */
+
pr_debug("Validating PTE advanced\n");
pte = pfn_pte(pfn, prot);
set_pte_at(mm, vaddr, ptep, pte);
ptep_set_wrprotect(mm, vaddr, ptep);
pte = ptep_get(ptep);
WARN_ON(pte_write(pte));
-
- pte = pfn_pte(pfn, prot);
- set_pte_at(mm, vaddr, ptep, pte);
ptep_get_and_clear(mm, vaddr, ptep);
pte = ptep_get(ptep);
WARN_ON(!pte_none(pte));
@@ -93,13 +104,11 @@ static void __init pte_advanced_tests(struct mm_struct *mm,
ptep_set_access_flags(vma, vaddr, ptep, pte, 1);
pte = ptep_get(ptep);
WARN_ON(!(pte_write(pte) && pte_dirty(pte)));
-
- pte = pfn_pte(pfn, prot);
- set_pte_at(mm, vaddr, ptep, pte);
ptep_get_and_clear_full(mm, vaddr, ptep, 1);
pte = ptep_get(ptep);
WARN_ON(!pte_none(pte));
+ pte = pfn_pte(pfn, prot);
pte = pte_mkyoung(pte);
set_pte_at(mm, vaddr, ptep, pte);
ptep_test_and_clear_young(vma, vaddr, ptep);
@@ -111,10 +120,14 @@ static void __init pte_savedwrite_tests(unsigned long pfn, pgprot_t prot)
{
pte_t pte = pfn_pte(pfn, prot);
+ if (!IS_ENABLED(CONFIG_NUMA_BALANCING))
+ return;
+
pr_debug("Validating PTE saved write\n");
WARN_ON(!pte_savedwrite(pte_mk_savedwrite(pte_clear_savedwrite(pte))));
WARN_ON(pte_savedwrite(pte_clear_savedwrite(pte_mk_savedwrite(pte))));
}
+
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static void __init pmd_basic_tests(unsigned long pfn, pgprot_t prot)
{
@@ -141,7 +154,7 @@ static void __init pmd_basic_tests(unsigned long pfn, pgprot_t prot)
static void __init pmd_advanced_tests(struct mm_struct *mm,
struct vm_area_struct *vma, pmd_t *pmdp,
unsigned long pfn, unsigned long vaddr,
- pgprot_t prot)
+ pgprot_t prot, pgtable_t pgtable)
{
pmd_t pmd = pfn_pmd(pfn, prot);
@@ -152,14 +165,13 @@ static void __init pmd_advanced_tests(struct mm_struct *mm,
/* Align the address wrt HPAGE_PMD_SIZE */
vaddr = (vaddr & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE;
+ pgtable_trans_huge_deposit(mm, pmdp, pgtable);
+
pmd = pfn_pmd(pfn, prot);
set_pmd_at(mm, vaddr, pmdp, pmd);
pmdp_set_wrprotect(mm, vaddr, pmdp);
pmd = READ_ONCE(*pmdp);
WARN_ON(pmd_write(pmd));
-
- pmd = pfn_pmd(pfn, prot);
- set_pmd_at(mm, vaddr, pmdp, pmd);
pmdp_huge_get_and_clear(mm, vaddr, pmdp);
pmd = READ_ONCE(*pmdp);
WARN_ON(!pmd_none(pmd));
@@ -173,18 +185,20 @@ static void __init pmd_advanced_tests(struct mm_struct *mm,
pmdp_set_access_flags(vma, vaddr, pmdp, pmd, 1);
pmd = READ_ONCE(*pmdp);
WARN_ON(!(pmd_write(pmd) && pmd_dirty(pmd)));
-
- pmd = pmd_mkhuge(pfn_pmd(pfn, prot));
- set_pmd_at(mm, vaddr, pmdp, pmd);
pmdp_huge_get_and_clear_full(vma, vaddr, pmdp, 1);
pmd = READ_ONCE(*pmdp);
WARN_ON(!pmd_none(pmd));
+ pmd = pmd_mkhuge(pfn_pmd(pfn, prot));
pmd = pmd_mkyoung(pmd);
set_pmd_at(mm, vaddr, pmdp, pmd);
pmdp_test_and_clear_young(vma, vaddr, pmdp);
pmd = READ_ONCE(*pmdp);
WARN_ON(pmd_young(pmd));
+
+ /* Clear the pte entries */
+ pmdp_huge_get_and_clear(mm, vaddr, pmdp);
+ pgtable = pgtable_trans_huge_withdraw(mm, pmdp);
}
static void __init pmd_leaf_tests(unsigned long pfn, pgprot_t prot)
@@ -199,11 +213,12 @@ static void __init pmd_leaf_tests(unsigned long pfn, pgprot_t prot)
WARN_ON(!pmd_leaf(pmd));
}
+#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
static void __init pmd_huge_tests(pmd_t *pmdp, unsigned long pfn, pgprot_t prot)
{
pmd_t pmd;
- if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMAP))
+ if (!arch_ioremap_pmd_supported())
return;
pr_debug("Validating PMD huge\n");
@@ -217,11 +232,17 @@ static void __init pmd_huge_tests(pmd_t *pmdp, unsigned long pfn, pgprot_t prot)
pmd = READ_ONCE(*pmdp);
WARN_ON(!pmd_none(pmd));
}
+#else /* CONFIG_HAVE_ARCH_HUGE_VMAP */
+static void __init pmd_huge_tests(pmd_t *pmdp, unsigned long pfn, pgprot_t prot) { }
+#endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */
static void __init pmd_savedwrite_tests(unsigned long pfn, pgprot_t prot)
{
pmd_t pmd = pfn_pmd(pfn, prot);
+ if (!IS_ENABLED(CONFIG_NUMA_BALANCING))
+ return;
+
pr_debug("Validating PMD saved write\n");
WARN_ON(!pmd_savedwrite(pmd_mk_savedwrite(pmd_clear_savedwrite(pmd))));
WARN_ON(pmd_savedwrite(pmd_clear_savedwrite(pmd_mk_savedwrite(pmd))));
@@ -272,17 +293,9 @@ static void __init pud_advanced_tests(struct mm_struct *mm,
WARN_ON(pud_write(pud));
#ifndef __PAGETABLE_PMD_FOLDED
- pud = pfn_pud(pfn, prot);
- set_pud_at(mm, vaddr, pudp, pud);
pudp_huge_get_and_clear(mm, vaddr, pudp);
pud = READ_ONCE(*pudp);
WARN_ON(!pud_none(pud));
-
- pud = pfn_pud(pfn, prot);
- set_pud_at(mm, vaddr, pudp, pud);
- pudp_huge_get_and_clear_full(mm, vaddr, pudp, 1);
- pud = READ_ONCE(*pudp);
- WARN_ON(!pud_none(pud));
#endif /* __PAGETABLE_PMD_FOLDED */
pud = pfn_pud(pfn, prot);
pud = pud_wrprotect(pud);
@@ -294,11 +307,20 @@ static void __init pud_advanced_tests(struct mm_struct *mm,
pud = READ_ONCE(*pudp);
WARN_ON(!(pud_write(pud) && pud_dirty(pud)));
+#ifndef __PAGETABLE_PMD_FOLDED
+ pudp_huge_get_and_clear_full(mm, vaddr, pudp, 1);
+ pud = READ_ONCE(*pudp);
+ WARN_ON(!pud_none(pud));
+#endif /* __PAGETABLE_PMD_FOLDED */
+
+ pud = pfn_pud(pfn, prot);
pud = pud_mkyoung(pud);
set_pud_at(mm, vaddr, pudp, pud);
pudp_test_and_clear_young(vma, vaddr, pudp);
pud = READ_ONCE(*pudp);
WARN_ON(pud_young(pud));
+
+ pudp_huge_get_and_clear(mm, vaddr, pudp);
}
static void __init pud_leaf_tests(unsigned long pfn, pgprot_t prot)
@@ -313,11 +335,12 @@ static void __init pud_leaf_tests(unsigned long pfn, pgprot_t prot)
WARN_ON(!pud_leaf(pud));
}
+#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t prot)
{
pud_t pud;
- if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMAP))
+ if (!arch_ioremap_pud_supported())
return;
pr_debug("Validating PUD huge\n");
@@ -331,6 +354,10 @@ static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t prot)
pud = READ_ONCE(*pudp);
WARN_ON(!pud_none(pud));
}
+#else /* !CONFIG_HAVE_ARCH_HUGE_VMAP */
+static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t prot) { }
+#endif /* !CONFIG_HAVE_ARCH_HUGE_VMAP */
+
#else /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
static void __init pud_basic_tests(unsigned long pfn, pgprot_t prot) { }
static void __init pud_advanced_tests(struct mm_struct *mm,
@@ -350,7 +377,7 @@ static void __init pud_basic_tests(unsigned long pfn, pgprot_t prot) { }
static void __init pmd_advanced_tests(struct mm_struct *mm,
struct vm_area_struct *vma, pmd_t *pmdp,
unsigned long pfn, unsigned long vaddr,
- pgprot_t prot)
+ pgprot_t prot, pgtable_t pgtable)
{
}
static void __init pud_advanced_tests(struct mm_struct *mm,
@@ -417,8 +444,6 @@ static void __init pud_populate_tests(struct mm_struct *mm, pud_t *pudp,
* This entry points to next level page table page.
* Hence this must not qualify as pud_bad().
*/
- pmd_clear(pmdp);
- pud_clear(pudp);
pud_populate(mm, pudp, pmdp);
pud = READ_ONCE(*pudp);
WARN_ON(pud_bad(pud));
@@ -515,12 +540,15 @@ static void __init pgd_populate_tests(struct mm_struct *mm, pgd_t *pgdp,
#endif /* PAGETABLE_P4D_FOLDED */
static void __init pte_clear_tests(struct mm_struct *mm, pte_t *ptep,
- unsigned long vaddr)
+ unsigned long pfn, unsigned long vaddr,
+ pgprot_t prot)
{
- pte_t pte = ptep_get(ptep);
+ pte_t pte = pfn_pte(pfn, prot);
pr_debug("Validating PTE clear\n");
+#ifndef CONFIG_RISCV
pte = __pte(pte_val(pte) | RANDOM_ORVALUE);
+#endif
set_pte_at(mm, vaddr, ptep, pte);
barrier();
pte_clear(mm, vaddr, ptep);
@@ -550,7 +578,6 @@ static void __init pmd_populate_tests(struct mm_struct *mm, pmd_t *pmdp,
* This entry points to next level page table page.
* Hence this must not qualify as pmd_bad().
*/
- pmd_clear(pmdp);
pmd_populate(mm, pmdp, pgtable);
pmd = READ_ONCE(*pmdp);
WARN_ON(pmd_bad(pmd));
@@ -784,57 +811,8 @@ static void __init hugetlb_basic_tests(unsigned long pfn, pgprot_t prot)
WARN_ON(!pte_huge(pte_mkhuge(pte)));
#endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */
}
-
-static void __init hugetlb_advanced_tests(struct mm_struct *mm,
- struct vm_area_struct *vma,
- pte_t *ptep, unsigned long pfn,
- unsigned long vaddr, pgprot_t prot)
-{
- struct page *page = pfn_to_page(pfn);
- pte_t pte = ptep_get(ptep);
- unsigned long paddr = __pfn_to_phys(pfn) & PMD_MASK;
-
- pr_debug("Validating HugeTLB advanced\n");
- pte = pte_mkhuge(mk_pte(pfn_to_page(PHYS_PFN(paddr)), prot));
- set_huge_pte_at(mm, vaddr, ptep, pte);
- barrier();
- WARN_ON(!pte_same(pte, huge_ptep_get(ptep)));
- huge_pte_clear(mm, vaddr, ptep, PMD_SIZE);
- pte = huge_ptep_get(ptep);
- WARN_ON(!huge_pte_none(pte));
-
- pte = mk_huge_pte(page, prot);
- set_huge_pte_at(mm, vaddr, ptep, pte);
- barrier();
- huge_ptep_set_wrprotect(mm, vaddr, ptep);
- pte = huge_ptep_get(ptep);
- WARN_ON(huge_pte_write(pte));
-
- pte = mk_huge_pte(page, prot);
- set_huge_pte_at(mm, vaddr, ptep, pte);
- barrier();
- huge_ptep_get_and_clear(mm, vaddr, ptep);
- pte = huge_ptep_get(ptep);
- WARN_ON(!huge_pte_none(pte));
-
- pte = mk_huge_pte(page, prot);
- pte = huge_pte_wrprotect(pte);
- set_huge_pte_at(mm, vaddr, ptep, pte);
- barrier();
- pte = huge_pte_mkwrite(pte);
- pte = huge_pte_mkdirty(pte);
- huge_ptep_set_access_flags(vma, vaddr, ptep, pte, 1);
- pte = huge_ptep_get(ptep);
- WARN_ON(!(huge_pte_write(pte) && huge_pte_dirty(pte)));
-}
#else /* !CONFIG_HUGETLB_PAGE */
static void __init hugetlb_basic_tests(unsigned long pfn, pgprot_t prot) { }
-static void __init hugetlb_advanced_tests(struct mm_struct *mm,
- struct vm_area_struct *vma,
- pte_t *ptep, unsigned long pfn,
- unsigned long vaddr, pgprot_t prot)
-{
-}
#endif /* CONFIG_HUGETLB_PAGE */
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -965,7 +943,13 @@ static int __init debug_vm_pgtable(void)
p4dp = p4d_alloc(mm, pgdp, vaddr);
pudp = pud_alloc(mm, p4dp, vaddr);
pmdp = pmd_alloc(mm, pudp, vaddr);
- ptep = pte_alloc_map_lock(mm, pmdp, vaddr, &ptl);
+ /*
+ * Allocate pgtable_t
+ */
+ if (pte_alloc(mm, pmdp)) {
+ pr_err("pgtable allocation failed\n");
+ return 1;
+ }
/*
* Save all the page table page addresses as the page table
@@ -985,32 +969,11 @@ static int __init debug_vm_pgtable(void)
p4d_basic_tests(p4d_aligned, prot);
pgd_basic_tests(pgd_aligned, prot);
- pte_clear_tests(mm, ptep, vaddr);
- pmd_clear_tests(mm, pmdp);
- pud_clear_tests(mm, pudp);
- p4d_clear_tests(mm, p4dp);
- pgd_clear_tests(mm, pgdp);
-
- pte_advanced_tests(mm, vma, ptep, pte_aligned, vaddr, prot);
- pmd_advanced_tests(mm, vma, pmdp, pmd_aligned, vaddr, prot);
- pud_advanced_tests(mm, vma, pudp, pud_aligned, vaddr, prot);
- hugetlb_advanced_tests(mm, vma, ptep, pte_aligned, vaddr, prot);
-
pmd_leaf_tests(pmd_aligned, prot);
pud_leaf_tests(pud_aligned, prot);
- pmd_huge_tests(pmdp, pmd_aligned, prot);
- pud_huge_tests(pudp, pud_aligned, prot);
-
- pte_savedwrite_tests(pte_aligned, prot);
- pmd_savedwrite_tests(pmd_aligned, prot);
-
- pte_unmap_unlock(ptep, ptl);
-
- pmd_populate_tests(mm, pmdp, saved_ptep);
- pud_populate_tests(mm, pudp, saved_pmdp);
- p4d_populate_tests(mm, p4dp, saved_pudp);
- pgd_populate_tests(mm, pgdp, saved_p4dp);
+ pte_savedwrite_tests(pte_aligned, protnone);
+ pmd_savedwrite_tests(pmd_aligned, protnone);
pte_special_tests(pte_aligned, prot);
pte_protnone_tests(pte_aligned, protnone);
@@ -1029,11 +992,43 @@ static int __init debug_vm_pgtable(void)
pmd_swap_tests(pmd_aligned, prot);
swap_migration_tests();
- hugetlb_basic_tests(pte_aligned, prot);
pmd_thp_tests(pmd_aligned, prot);
pud_thp_tests(pud_aligned, prot);
+ hugetlb_basic_tests(pte_aligned, prot);
+
+ /*
+ * Page table modifying tests. They need to hold
+ * proper page table lock.
+ */
+
+ ptep = pte_offset_map_lock(mm, pmdp, vaddr, &ptl);
+ pte_clear_tests(mm, ptep, pte_aligned, vaddr, prot);
+ pte_advanced_tests(mm, vma, ptep, pte_aligned, vaddr, prot);
+ pte_unmap_unlock(ptep, ptl);
+
+ ptl = pmd_lock(mm, pmdp);
+ pmd_clear_tests(mm, pmdp);
+ pmd_advanced_tests(mm, vma, pmdp, pmd_aligned, vaddr, prot, saved_ptep);
+ pmd_huge_tests(pmdp, pmd_aligned, prot);
+ pmd_populate_tests(mm, pmdp, saved_ptep);
+ spin_unlock(ptl);
+
+ ptl = pud_lock(mm, pudp);
+ pud_clear_tests(mm, pudp);
+ pud_advanced_tests(mm, vma, pudp, pud_aligned, vaddr, prot);
+ pud_huge_tests(pudp, pud_aligned, prot);
+ pud_populate_tests(mm, pudp, saved_pmdp);
+ spin_unlock(ptl);
+
+ spin_lock(&mm->page_table_lock);
+ p4d_clear_tests(mm, p4dp);
+ pgd_clear_tests(mm, pgdp);
+ p4d_populate_tests(mm, p4dp, saved_pudp);
+ pgd_populate_tests(mm, pgdp, saved_p4dp);
+ spin_unlock(&mm->page_table_lock);
+
p4d_free(mm, saved_p4dp);
pud_free(mm, saved_pudp);
pmd_free(mm, saved_pmdp);
diff --git a/mm/dmapool.c b/mm/dmapool.c
index f9fb9bbd733e..a97c97232337 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -266,6 +266,7 @@ static void pool_free_page(struct dma_pool *pool, struct dma_page *page)
*/
void dma_pool_destroy(struct dma_pool *pool)
{
+ struct dma_page *page, *tmp;
bool empty = false;
if (unlikely(!pool))
@@ -281,17 +282,13 @@ void dma_pool_destroy(struct dma_pool *pool)
device_remove_file(pool->dev, &dev_attr_pools);
mutex_unlock(&pools_reg_lock);
- while (!list_empty(&pool->page_list)) {
- struct dma_page *page;
- page = list_entry(pool->page_list.next,
- struct dma_page, page_list);
+ list_for_each_entry_safe(page, tmp, &pool->page_list, page_list) {
if (is_page_busy(page)) {
if (pool->dev)
- dev_err(pool->dev,
- "dma_pool_destroy %s, %p busy\n",
+ dev_err(pool->dev, "%s %s, %p busy\n", __func__,
pool->name, page->vaddr);
else
- pr_err("dma_pool_destroy %s, %p busy\n",
+ pr_err("%s %s, %p busy\n", __func__,
pool->name, page->vaddr);
/* leak the still-in-use consistent memory */
list_del(&page->page_list);
@@ -355,12 +352,11 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
if (data[i] == POOL_POISON_FREED)
continue;
if (pool->dev)
- dev_err(pool->dev,
- "dma_pool_alloc %s, %p (corrupted)\n",
- pool->name, retval);
+ dev_err(pool->dev, "%s %s, %p (corrupted)\n",
+ __func__, pool->name, retval);
else
- pr_err("dma_pool_alloc %s, %p (corrupted)\n",
- pool->name, retval);
+ pr_err("%s %s, %p (corrupted)\n",
+ __func__, pool->name, retval);
/*
* Dump the first 4 bytes even if they are not
@@ -416,12 +412,11 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma)
if (!page) {
spin_unlock_irqrestore(&pool->lock, flags);
if (pool->dev)
- dev_err(pool->dev,
- "dma_pool_free %s, %p/%lx (bad dma)\n",
- pool->name, vaddr, (unsigned long)dma);
+ dev_err(pool->dev, "%s %s, %p/%pad (bad dma)\n",
+ __func__, pool->name, vaddr, &dma);
else
- pr_err("dma_pool_free %s, %p/%lx (bad dma)\n",
- pool->name, vaddr, (unsigned long)dma);
+ pr_err("%s %s, %p/%pad (bad dma)\n",
+ __func__, pool->name, vaddr, &dma);
return;
}
@@ -432,12 +427,11 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma)
if ((dma - page->dma) != offset) {
spin_unlock_irqrestore(&pool->lock, flags);
if (pool->dev)
- dev_err(pool->dev,
- "dma_pool_free %s, %p (bad vaddr)/%pad\n",
- pool->name, vaddr, &dma);
+ dev_err(pool->dev, "%s %s, %p (bad vaddr)/%pad\n",
+ __func__, pool->name, vaddr, &dma);
else
- pr_err("dma_pool_free %s, %p (bad vaddr)/%pad\n",
- pool->name, vaddr, &dma);
+ pr_err("%s %s, %p (bad vaddr)/%pad\n",
+ __func__, pool->name, vaddr, &dma);
return;
}
{
@@ -449,11 +443,11 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma)
}
spin_unlock_irqrestore(&pool->lock, flags);
if (pool->dev)
- dev_err(pool->dev, "dma_pool_free %s, dma %pad already free\n",
- pool->name, &dma);
+ dev_err(pool->dev, "%s %s, dma %pad already free\n",
+ __func__, pool->name, &dma);
else
- pr_err("dma_pool_free %s, dma %pad already free\n",
- pool->name, &dma);
+ pr_err("%s %s, dma %pad already free\n",
+ __func__, pool->name, &dma);
return;
}
}
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 0e66f2aaeea3..d6baa4f451c5 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -141,7 +141,7 @@ int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice)
}
if (end_index >= start_index) {
- unsigned long count;
+ unsigned long nr_pagevec = 0;
/*
* It's common to FADV_DONTNEED right after
@@ -154,8 +154,9 @@ int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice)
*/
lru_add_drain();
- count = invalidate_mapping_pages(mapping,
- start_index, end_index);
+ invalidate_mapping_pagevec(mapping,
+ start_index, end_index,
+ &nr_pagevec);
/*
* If fewer pages were invalidated than expected then
@@ -163,7 +164,7 @@ int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice)
* a per-cpu pagevec for a remote CPU. Drain all
* pagevecs and try again.
*/
- if (count < (end_index - start_index + 1)) {
+ if (nr_pagevec) {
lru_add_drain_all();
invalidate_mapping_pages(mapping, start_index,
end_index);
diff --git a/mm/filemap.c b/mm/filemap.c
index 99c49eeae71b..e4101b5bfa82 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -249,7 +249,7 @@ static void page_cache_free_page(struct address_space *mapping,
freepage(page);
if (PageTransHuge(page) && !PageHuge(page)) {
- page_ref_sub(page, HPAGE_PMD_NR);
+ page_ref_sub(page, thp_nr_pages(page));
VM_BUG_ON_PAGE(page_count(page) <= 0, page);
} else {
put_page(page);
@@ -414,7 +414,7 @@ int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
.range_end = end,
};
- if (!mapping_cap_writeback_dirty(mapping) ||
+ if (!mapping_can_writeback(mapping) ||
!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
return 0;
@@ -827,15 +827,14 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
}
EXPORT_SYMBOL_GPL(replace_page_cache_page);
-static int __add_to_page_cache_locked(struct page *page,
- struct address_space *mapping,
- pgoff_t offset, gfp_t gfp_mask,
- void **shadowp)
+noinline int __add_to_page_cache_locked(struct page *page,
+ struct address_space *mapping,
+ pgoff_t offset, gfp_t gfp,
+ void **shadowp)
{
XA_STATE(xas, &mapping->i_pages, offset);
int huge = PageHuge(page);
int error;
- void *old;
VM_BUG_ON_PAGE(!PageLocked(page), page);
VM_BUG_ON_PAGE(PageSwapBacked(page), page);
@@ -846,25 +845,46 @@ static int __add_to_page_cache_locked(struct page *page,
page->index = offset;
if (!huge) {
- error = mem_cgroup_charge(page, current->mm, gfp_mask);
+ error = mem_cgroup_charge(page, current->mm, gfp);
if (error)
goto error;
}
+ gfp &= GFP_RECLAIM_MASK;
+
do {
+ unsigned int order = xa_get_order(xas.xa, xas.xa_index);
+ void *entry, *old = NULL;
+
+ if (order > thp_order(page))
+ xas_split_alloc(&xas, xa_load(xas.xa, xas.xa_index),
+ order, gfp);
xas_lock_irq(&xas);
- old = xas_load(&xas);
- if (old && !xa_is_value(old))
- xas_set_err(&xas, -EEXIST);
+ xas_for_each_conflict(&xas, entry) {
+ old = entry;
+ if (!xa_is_value(entry)) {
+ xas_set_err(&xas, -EEXIST);
+ goto unlock;
+ }
+ }
+
+ if (old) {
+ if (shadowp)
+ *shadowp = old;
+ /* entry may have been split before we acquired lock */
+ order = xa_get_order(xas.xa, xas.xa_index);
+ if (order > thp_order(page)) {
+ xas_split(&xas, old, order);
+ xas_reset(&xas);
+ }
+ }
+
xas_store(&xas, page);
if (xas_error(&xas))
goto unlock;
- if (xa_is_value(old)) {
+ if (old)
mapping->nrexceptional--;
- if (shadowp)
- *shadowp = old;
- }
mapping->nrpages++;
/* hugetlb pages do not participate in page cache accounting */
@@ -872,7 +892,7 @@ static int __add_to_page_cache_locked(struct page *page,
__inc_lruvec_page_state(page, NR_FILE_PAGES);
unlock:
xas_unlock_irq(&xas);
- } while (xas_nomem(&xas, gfp_mask & GFP_RECLAIM_MASK));
+ } while (xas_nomem(&xas, gfp));
if (xas_error(&xas)) {
error = xas_error(&xas);
@@ -1425,7 +1445,7 @@ static inline bool clear_bit_unlock_is_negative_byte(long nr, volatile void *mem
* unlock_page - unlock a locked page
* @page: the page
*
- * Unlocks the page and wakes up sleepers in ___wait_on_page_locked().
+ * Unlocks the page and wakes up sleepers in wait_on_page_locked().
* Also wakes sleepers in wait_on_page_writeback() because the wakeup
* mechanism between PageLocked pages and PageWriteback pages is shared.
* But that's OK - sleepers in wait_on_page_writeback() just go back to sleep.
@@ -1645,19 +1665,19 @@ EXPORT_SYMBOL(page_cache_prev_miss);
/**
* find_get_entry - find and get a page cache entry
* @mapping: the address_space to search
- * @offset: the page cache index
+ * @index: The page cache index.
*
* Looks up the page cache slot at @mapping & @offset. If there is a
- * page cache page, it is returned with an increased refcount.
+ * page cache page, the head page is returned with an increased refcount.
*
* If the slot holds a shadow entry of a previously evicted page, or a
* swap entry from shmem/tmpfs, it is returned.
*
- * Return: the found page or shadow entry, %NULL if nothing is found.
+ * Return: The head page or shadow entry, %NULL if nothing is found.
*/
-struct page *find_get_entry(struct address_space *mapping, pgoff_t offset)
+struct page *find_get_entry(struct address_space *mapping, pgoff_t index)
{
- XA_STATE(xas, &mapping->i_pages, offset);
+ XA_STATE(xas, &mapping->i_pages, index);
struct page *page;
rcu_read_lock();
@@ -1685,7 +1705,6 @@ repeat:
put_page(page);
goto repeat;
}
- page = find_subpage(page, offset);
out:
rcu_read_unlock();
@@ -1693,40 +1712,37 @@ out:
}
/**
- * find_lock_entry - locate, pin and lock a page cache entry
- * @mapping: the address_space to search
- * @offset: the page cache index
+ * find_lock_entry - Locate and lock a page cache entry.
+ * @mapping: The address_space to search.
+ * @index: The page cache index.
*
- * Looks up the page cache slot at @mapping & @offset. If there is a
- * page cache page, it is returned locked and with an increased
- * refcount.
+ * Looks up the page at @mapping & @index. If there is a page in the
+ * cache, the head page is returned locked and with an increased refcount.
*
* If the slot holds a shadow entry of a previously evicted page, or a
* swap entry from shmem/tmpfs, it is returned.
*
- * find_lock_entry() may sleep.
- *
- * Return: the found page or shadow entry, %NULL if nothing is found.
+ * Context: May sleep.
+ * Return: The head page or shadow entry, %NULL if nothing is found.
*/
-struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset)
+struct page *find_lock_entry(struct address_space *mapping, pgoff_t index)
{
struct page *page;
repeat:
- page = find_get_entry(mapping, offset);
+ page = find_get_entry(mapping, index);
if (page && !xa_is_value(page)) {
lock_page(page);
/* Has the page been truncated? */
- if (unlikely(page_mapping(page) != mapping)) {
+ if (unlikely(page->mapping != mapping)) {
unlock_page(page);
put_page(page);
goto repeat;
}
- VM_BUG_ON_PAGE(page_to_pgoff(page) != offset, page);
+ VM_BUG_ON_PAGE(!thp_contains(page, index), page);
}
return page;
}
-EXPORT_SYMBOL(find_lock_entry);
/**
* pagecache_get_page - Find and get a reference to a page.
@@ -1741,6 +1757,8 @@ EXPORT_SYMBOL(find_lock_entry);
*
* * %FGP_ACCESSED - The page will be marked accessed.
* * %FGP_LOCK - The page is returned locked.
+ * * %FGP_HEAD - If the page is present and a THP, return the head page
+ * rather than the exact page specified by the index.
* * %FGP_CREAT - If no page is present then a new page is allocated using
* @gfp_mask and added to the page cache and the VM's LRU list.
* The page is returned locked and with an increased refcount.
@@ -1781,12 +1799,12 @@ repeat:
}
/* Has the page been truncated? */
- if (unlikely(compound_head(page)->mapping != mapping)) {
+ if (unlikely(page->mapping != mapping)) {
unlock_page(page);
put_page(page);
goto repeat;
}
- VM_BUG_ON_PAGE(page->index != index, page);
+ VM_BUG_ON_PAGE(!thp_contains(page, index), page);
}
if (fgp_flags & FGP_ACCESSED)
@@ -1796,11 +1814,13 @@ repeat:
if (page_is_idle(page))
clear_page_idle(page);
}
+ if (!(fgp_flags & FGP_HEAD))
+ page = find_subpage(page, index);
no_page:
if (!page && (fgp_flags & FGP_CREAT)) {
int err;
- if ((fgp_flags & FGP_WRITE) && mapping_cap_account_dirty(mapping))
+ if ((fgp_flags & FGP_WRITE) && mapping_can_writeback(mapping))
gfp_mask |= __GFP_WRITE;
if (fgp_flags & FGP_NOFS)
gfp_mask &= ~__GFP_FS;
@@ -2179,6 +2199,14 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb,
last_index = (*ppos + iter->count + PAGE_SIZE-1) >> PAGE_SHIFT;
offset = *ppos & ~PAGE_MASK;
+ /*
+ * If we've already successfully copied some data, then we
+ * can no longer safely return -EIOCBQUEUED. Hence mark
+ * an async read NOWAIT at that point.
+ */
+ if (written && (iocb->ki_flags & IOCB_WAITQ))
+ iocb->ki_flags |= IOCB_NOWAIT;
+
for (;;) {
struct page *page;
pgoff_t end_index;
@@ -2568,8 +2596,8 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
struct file *file = vmf->vma->vm_file;
struct file_ra_state *ra = &file->f_ra;
struct address_space *mapping = file->f_mapping;
+ DEFINE_READAHEAD(ractl, file, mapping, vmf->pgoff);
struct file *fpin = NULL;
- pgoff_t offset = vmf->pgoff;
unsigned int mmap_miss;
/* If we don't want any read-ahead, don't bother */
@@ -2580,8 +2608,7 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
if (vmf->vma->vm_flags & VM_SEQ_READ) {
fpin = maybe_unlock_mmap_for_io(vmf, fpin);
- page_cache_sync_readahead(mapping, ra, file, offset,
- ra->ra_pages);
+ page_cache_sync_ra(&ractl, ra, ra->ra_pages);
return fpin;
}
@@ -2601,10 +2628,11 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
* mmap read-around
*/
fpin = maybe_unlock_mmap_for_io(vmf, fpin);
- ra->start = max_t(long, 0, offset - ra->ra_pages / 2);
+ ra->start = max_t(long, 0, vmf->pgoff - ra->ra_pages / 2);
ra->size = ra->ra_pages;
ra->async_size = ra->ra_pages / 4;
- ra_submit(ra, mapping, file);
+ ractl._index = ra->start;
+ do_page_cache_ra(&ractl, ra->size, ra->async_size);
return fpin;
}
@@ -2793,42 +2821,42 @@ void filemap_map_pages(struct vm_fault *vmf,
pgoff_t last_pgoff = start_pgoff;
unsigned long max_idx;
XA_STATE(xas, &mapping->i_pages, start_pgoff);
- struct page *page;
+ struct page *head, *page;
unsigned int mmap_miss = READ_ONCE(file->f_ra.mmap_miss);
rcu_read_lock();
- xas_for_each(&xas, page, end_pgoff) {
- if (xas_retry(&xas, page))
+ xas_for_each(&xas, head, end_pgoff) {
+ if (xas_retry(&xas, head))
continue;
- if (xa_is_value(page))
+ if (xa_is_value(head))
goto next;
/*
* Check for a locked page first, as a speculative
* reference may adversely influence page migration.
*/
- if (PageLocked(page))
+ if (PageLocked(head))
goto next;
- if (!page_cache_get_speculative(page))
+ if (!page_cache_get_speculative(head))
goto next;
/* Has the page moved or been split? */
- if (unlikely(page != xas_reload(&xas)))
+ if (unlikely(head != xas_reload(&xas)))
goto skip;
- page = find_subpage(page, xas.xa_index);
+ page = find_subpage(head, xas.xa_index);
- if (!PageUptodate(page) ||
+ if (!PageUptodate(head) ||
PageReadahead(page) ||
PageHWPoison(page))
goto skip;
- if (!trylock_page(page))
+ if (!trylock_page(head))
goto skip;
- if (page->mapping != mapping || !PageUptodate(page))
+ if (head->mapping != mapping || !PageUptodate(head))
goto unlock;
max_idx = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
- if (page->index >= max_idx)
+ if (xas.xa_index >= max_idx)
goto unlock;
if (mmap_miss > 0)
@@ -2840,12 +2868,12 @@ void filemap_map_pages(struct vm_fault *vmf,
last_pgoff = xas.xa_index;
if (alloc_set_pte(vmf, page))
goto unlock;
- unlock_page(page);
+ unlock_page(head);
goto next;
unlock:
- unlock_page(page);
+ unlock_page(head);
skip:
- put_page(page);
+ put_page(head);
next:
/* Huge page is mapped? No need to proceed. */
if (pmd_trans_huge(*vmf->pmd))
@@ -2984,7 +3012,7 @@ filler:
goto out;
/*
- * Page is not up to date and may be locked due one of the following
+ * Page is not up to date and may be locked due to one of the following
* case a: Page is being filled and the page lock is held
* case b: Read/write error clearing the page uptodate status
* case c: Truncation in progress (page locked)
diff --git a/mm/gup.c b/mm/gup.c
index e869c634cc9a..102877ed77a4 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -329,6 +329,13 @@ void unpin_user_pages(struct page **pages, unsigned long npages)
unsigned long index;
/*
+ * If this WARN_ON() fires, then the system *might* be leaking pages (by
+ * leaving them pinned), but probably not. More likely, gup/pup returned
+ * a hard -ERRNO error to the caller, who erroneously passed it here.
+ */
+ if (WARN_ON(IS_ERR_VALUE(npages)))
+ return;
+ /*
* TODO: this can be optimized for huge pages: if a series of pages is
* physically contiguous and part of the same compound page, then a
* single operation to the head page should suffice.
@@ -1483,35 +1490,6 @@ int __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
mmap_read_unlock(mm);
return ret; /* 0 or negative error code */
}
-
-/**
- * get_dump_page() - pin user page in memory while writing it to core dump
- * @addr: user address
- *
- * Returns struct page pointer of user page pinned for dump,
- * to be freed afterwards by put_page().
- *
- * Returns NULL on any kind of failure - a hole must then be inserted into
- * the corefile, to preserve alignment with its headers; and also returns
- * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found -
- * allowing a hole to be left in the corefile to save diskspace.
- *
- * Called without mmap_lock, but after all other threads have been killed.
- */
-#ifdef CONFIG_ELF_CORE
-struct page *get_dump_page(unsigned long addr)
-{
- struct vm_area_struct *vma;
- struct page *page;
-
- if (__get_user_pages(current->mm, addr, 1,
- FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma,
- NULL) < 1)
- return NULL;
- flush_cache_page(vma, addr, page_to_pfn(page));
- return page;
-}
-#endif /* CONFIG_ELF_CORE */
#else /* CONFIG_MMU */
static long __get_user_pages_locked(struct mm_struct *mm, unsigned long start,
unsigned long nr_pages, struct page **pages,
@@ -1557,6 +1535,38 @@ finish_or_fault:
}
#endif /* !CONFIG_MMU */
+/**
+ * get_dump_page() - pin user page in memory while writing it to core dump
+ * @addr: user address
+ *
+ * Returns struct page pointer of user page pinned for dump,
+ * to be freed afterwards by put_page().
+ *
+ * Returns NULL on any kind of failure - a hole must then be inserted into
+ * the corefile, to preserve alignment with its headers; and also returns
+ * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found -
+ * allowing a hole to be left in the corefile to save diskspace.
+ *
+ * Called without mmap_lock (takes and releases the mmap_lock by itself).
+ */
+#ifdef CONFIG_ELF_CORE
+struct page *get_dump_page(unsigned long addr)
+{
+ struct mm_struct *mm = current->mm;
+ struct page *page;
+ int locked = 1;
+ int ret;
+
+ if (mmap_read_lock_killable(mm))
+ return NULL;
+ ret = __get_user_pages_locked(mm, addr, 1, &page, NULL, &locked,
+ FOLL_FORCE | FOLL_DUMP | FOLL_GET);
+ if (locked)
+ mmap_read_unlock(mm);
+ return (ret == 1) ? page : NULL;
+}
+#endif /* CONFIG_ELF_CORE */
+
#if defined(CONFIG_FS_DAX) || defined (CONFIG_CMA)
static bool check_dax_vmas(struct vm_area_struct **vmas, long nr_pages)
{
@@ -1747,6 +1757,25 @@ static __always_inline long __gup_longterm_locked(struct mm_struct *mm,
}
#endif /* CONFIG_FS_DAX || CONFIG_CMA */
+static bool is_valid_gup_flags(unsigned int gup_flags)
+{
+ /*
+ * FOLL_PIN must only be set internally by the pin_user_pages*() APIs,
+ * never directly by the caller, so enforce that with an assertion:
+ */
+ if (WARN_ON_ONCE(gup_flags & FOLL_PIN))
+ return false;
+ /*
+ * FOLL_PIN is a prerequisite to FOLL_LONGTERM. Another way of saying
+ * that is, FOLL_LONGTERM is a specific case, more restrictive case of
+ * FOLL_PIN.
+ */
+ if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM))
+ return false;
+
+ return true;
+}
+
#ifdef CONFIG_MMU
static long __get_user_pages_remote(struct mm_struct *mm,
unsigned long start, unsigned long nr_pages,
@@ -1842,11 +1871,7 @@ long get_user_pages_remote(struct mm_struct *mm,
unsigned int gup_flags, struct page **pages,
struct vm_area_struct **vmas, int *locked)
{
- /*
- * FOLL_PIN must only be set internally by the pin_user_pages*() APIs,
- * never directly by the caller, so enforce that with an assertion:
- */
- if (WARN_ON_ONCE(gup_flags & FOLL_PIN))
+ if (!is_valid_gup_flags(gup_flags))
return -EINVAL;
return __get_user_pages_remote(mm, start, nr_pages, gup_flags,
@@ -1892,11 +1917,7 @@ long get_user_pages(unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages,
struct vm_area_struct **vmas)
{
- /*
- * FOLL_PIN must only be set internally by the pin_user_pages*() APIs,
- * never directly by the caller, so enforce that with an assertion:
- */
- if (WARN_ON_ONCE(gup_flags & FOLL_PIN))
+ if (!is_valid_gup_flags(gup_flags))
return -EINVAL;
return __gup_longterm_locked(current->mm, start, nr_pages,
@@ -2786,11 +2807,7 @@ EXPORT_SYMBOL_GPL(get_user_pages_fast_only);
int get_user_pages_fast(unsigned long start, int nr_pages,
unsigned int gup_flags, struct page **pages)
{
- /*
- * FOLL_PIN must only be set internally by the pin_user_pages*() APIs,
- * never directly by the caller, so enforce that:
- */
- if (WARN_ON_ONCE(gup_flags & FOLL_PIN))
+ if (!is_valid_gup_flags(gup_flags))
return -EINVAL;
/*
diff --git a/mm/gup_benchmark.c b/mm/gup_benchmark.c
index be690fa66a46..8b3e5b5cd8fa 100644
--- a/mm/gup_benchmark.c
+++ b/mm/gup_benchmark.c
@@ -6,10 +6,10 @@
#include <linux/debugfs.h>
#define GUP_FAST_BENCHMARK _IOWR('g', 1, struct gup_benchmark)
-#define GUP_LONGTERM_BENCHMARK _IOWR('g', 2, struct gup_benchmark)
-#define GUP_BENCHMARK _IOWR('g', 3, struct gup_benchmark)
-#define PIN_FAST_BENCHMARK _IOWR('g', 4, struct gup_benchmark)
-#define PIN_BENCHMARK _IOWR('g', 5, struct gup_benchmark)
+#define GUP_BENCHMARK _IOWR('g', 2, struct gup_benchmark)
+#define PIN_FAST_BENCHMARK _IOWR('g', 3, struct gup_benchmark)
+#define PIN_BENCHMARK _IOWR('g', 4, struct gup_benchmark)
+#define PIN_LONGTERM_BENCHMARK _IOWR('g', 5, struct gup_benchmark)
struct gup_benchmark {
__u64 get_delta_usec;
@@ -28,7 +28,6 @@ static void put_back_pages(unsigned int cmd, struct page **pages,
switch (cmd) {
case GUP_FAST_BENCHMARK:
- case GUP_LONGTERM_BENCHMARK:
case GUP_BENCHMARK:
for (i = 0; i < nr_pages; i++)
put_page(pages[i]);
@@ -36,6 +35,7 @@ static void put_back_pages(unsigned int cmd, struct page **pages,
case PIN_FAST_BENCHMARK:
case PIN_BENCHMARK:
+ case PIN_LONGTERM_BENCHMARK:
unpin_user_pages(pages, nr_pages);
break;
}
@@ -50,6 +50,7 @@ static void verify_dma_pinned(unsigned int cmd, struct page **pages,
switch (cmd) {
case PIN_FAST_BENCHMARK:
case PIN_BENCHMARK:
+ case PIN_LONGTERM_BENCHMARK:
for (i = 0; i < nr_pages; i++) {
page = pages[i];
if (WARN(!page_maybe_dma_pinned(page),
@@ -71,6 +72,8 @@ static int __gup_benchmark_ioctl(unsigned int cmd,
int nr;
struct page **pages;
int ret = 0;
+ bool needs_mmap_lock =
+ cmd != GUP_FAST_BENCHMARK && cmd != PIN_FAST_BENCHMARK;
if (gup->size > ULONG_MAX)
return -EINVAL;
@@ -80,6 +83,11 @@ static int __gup_benchmark_ioctl(unsigned int cmd,
if (!pages)
return -ENOMEM;
+ if (needs_mmap_lock && mmap_read_lock_killable(current->mm)) {
+ ret = -EINTR;
+ goto free_pages;
+ }
+
i = 0;
nr = gup->nr_pages_per_call;
start_time = ktime_get();
@@ -101,11 +109,6 @@ static int __gup_benchmark_ioctl(unsigned int cmd,
nr = get_user_pages_fast(addr, nr, gup->flags,
pages + i);
break;
- case GUP_LONGTERM_BENCHMARK:
- nr = get_user_pages(addr, nr,
- gup->flags | FOLL_LONGTERM,
- pages + i, NULL);
- break;
case GUP_BENCHMARK:
nr = get_user_pages(addr, nr, gup->flags, pages + i,
NULL);
@@ -118,10 +121,14 @@ static int __gup_benchmark_ioctl(unsigned int cmd,
nr = pin_user_pages(addr, nr, gup->flags, pages + i,
NULL);
break;
+ case PIN_LONGTERM_BENCHMARK:
+ nr = pin_user_pages(addr, nr,
+ gup->flags | FOLL_LONGTERM,
+ pages + i, NULL);
+ break;
default:
- kvfree(pages);
ret = -EINVAL;
- goto out;
+ goto unlock;
}
if (nr <= 0)
@@ -149,8 +156,11 @@ static int __gup_benchmark_ioctl(unsigned int cmd,
end_time = ktime_get();
gup->put_delta_usec = ktime_us_delta(end_time, start_time);
+unlock:
+ if (needs_mmap_lock)
+ mmap_read_unlock(current->mm);
+free_pages:
kvfree(pages);
-out:
return ret;
}
@@ -162,10 +172,10 @@ static long gup_benchmark_ioctl(struct file *filep, unsigned int cmd,
switch (cmd) {
case GUP_FAST_BENCHMARK:
- case GUP_LONGTERM_BENCHMARK:
case GUP_BENCHMARK:
case PIN_FAST_BENCHMARK:
case PIN_BENCHMARK:
+ case PIN_LONGTERM_BENCHMARK:
break;
default:
return -EINVAL;
diff --git a/mm/highmem.c b/mm/highmem.c
index 64d8dea47dd1..1352a27951e3 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -369,7 +369,7 @@ void kunmap_high(struct page *page)
}
EXPORT_SYMBOL(kunmap_high);
-#endif
+#endif /* CONFIG_HIGHMEM */
#if defined(HASHED_PAGE_VIRTUAL)
@@ -481,4 +481,4 @@ void __init page_address_init(void)
}
}
-#endif /* defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) */
+#endif /* defined(HASHED_PAGE_VIRTUAL) */
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index da397779a6d4..9474dbc150ed 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2306,13 +2306,13 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma,
/*
* If we're also updating the vma->vm_next->vm_start, if the new
- * vm_next->vm_start isn't page aligned and it could previously
+ * vm_next->vm_start isn't hpage aligned and it could previously
* contain an hugepage: check if we need to split an huge pmd.
*/
if (adjust_next > 0) {
struct vm_area_struct *next = vma->vm_next;
unsigned long nstart = next->vm_start;
- nstart += adjust_next << PAGE_SHIFT;
+ nstart += adjust_next;
if (nstart & ~HPAGE_PMD_MASK &&
(nstart & HPAGE_PMD_MASK) >= next->vm_start &&
(nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end)
@@ -2335,13 +2335,13 @@ static void unmap_page(struct page *page)
VM_BUG_ON_PAGE(!unmap_success, page);
}
-static void remap_page(struct page *page)
+static void remap_page(struct page *page, unsigned int nr)
{
int i;
if (PageTransHuge(page)) {
remove_migration_ptes(page, page, true);
} else {
- for (i = 0; i < HPAGE_PMD_NR; i++)
+ for (i = 0; i < nr; i++)
remove_migration_ptes(page + i, page + i, true);
}
}
@@ -2370,6 +2370,9 @@ static void __split_huge_page_tail(struct page *head, int tail,
(1L << PG_workingset) |
(1L << PG_locked) |
(1L << PG_unevictable) |
+#ifdef CONFIG_64BIT
+ (1L << PG_arch_2) |
+#endif
(1L << PG_dirty)));
/* ->mapping in first tail page is compound_mapcount */
@@ -2416,6 +2419,7 @@ static void __split_huge_page(struct page *page, struct list_head *list,
struct lruvec *lruvec;
struct address_space *swap_cache = NULL;
unsigned long offset = 0;
+ unsigned int nr = thp_nr_pages(head);
int i;
lruvec = mem_cgroup_page_lruvec(head, pgdat);
@@ -2431,7 +2435,7 @@ static void __split_huge_page(struct page *page, struct list_head *list,
xa_lock(&swap_cache->i_pages);
}
- for (i = HPAGE_PMD_NR - 1; i >= 1; i--) {
+ for (i = nr - 1; i >= 1; i--) {
__split_huge_page_tail(head, i, lruvec, list);
/* Some pages can be beyond i_size: drop them from page cache */
if (head[i].index >= end) {
@@ -2451,7 +2455,7 @@ static void __split_huge_page(struct page *page, struct list_head *list,
ClearPageCompound(head);
- split_page_owner(head, HPAGE_PMD_ORDER);
+ split_page_owner(head, nr);
/* See comment in __split_huge_page_tail() */
if (PageAnon(head)) {
@@ -2470,9 +2474,15 @@ static void __split_huge_page(struct page *page, struct list_head *list,
spin_unlock_irqrestore(&pgdat->lru_lock, flags);
- remap_page(head);
+ remap_page(head, nr);
+
+ if (PageSwapCache(head)) {
+ swp_entry_t entry = { .val = page_private(head) };
+
+ split_swap_cluster(entry);
+ }
- for (i = 0; i < HPAGE_PMD_NR; i++) {
+ for (i = 0; i < nr; i++) {
struct page *subpage = head + i;
if (subpage == page)
continue;
@@ -2491,7 +2501,7 @@ static void __split_huge_page(struct page *page, struct list_head *list,
int total_mapcount(struct page *page)
{
- int i, compound, ret;
+ int i, compound, nr, ret;
VM_BUG_ON_PAGE(PageTail(page), page);
@@ -2499,16 +2509,17 @@ int total_mapcount(struct page *page)
return atomic_read(&page->_mapcount) + 1;
compound = compound_mapcount(page);
+ nr = compound_nr(page);
if (PageHuge(page))
return compound;
ret = compound;
- for (i = 0; i < HPAGE_PMD_NR; i++)
+ for (i = 0; i < nr; i++)
ret += atomic_read(&page[i]._mapcount) + 1;
/* File pages has compound_mapcount included in _mapcount */
if (!PageAnon(page))
- return ret - compound * HPAGE_PMD_NR;
+ return ret - compound * nr;
if (PageDoubleMap(page))
- ret -= HPAGE_PMD_NR;
+ ret -= nr;
return ret;
}
@@ -2553,14 +2564,14 @@ int page_trans_huge_mapcount(struct page *page, int *total_mapcount)
page = compound_head(page);
_total_mapcount = ret = 0;
- for (i = 0; i < HPAGE_PMD_NR; i++) {
+ for (i = 0; i < thp_nr_pages(page); i++) {
mapcount = atomic_read(&page[i]._mapcount) + 1;
ret = max(ret, mapcount);
_total_mapcount += mapcount;
}
if (PageDoubleMap(page)) {
ret -= 1;
- _total_mapcount -= HPAGE_PMD_NR;
+ _total_mapcount -= thp_nr_pages(page);
}
mapcount = compound_mapcount(page);
ret += mapcount;
@@ -2577,9 +2588,9 @@ bool can_split_huge_page(struct page *page, int *pextra_pins)
/* Additional pins from page cache */
if (PageAnon(page))
- extra_pins = PageSwapCache(page) ? HPAGE_PMD_NR : 0;
+ extra_pins = PageSwapCache(page) ? thp_nr_pages(page) : 0;
else
- extra_pins = HPAGE_PMD_NR;
+ extra_pins = thp_nr_pages(page);
if (pextra_pins)
*pextra_pins = extra_pins;
return total_mapcount(page) == page_count(page) - extra_pins - 1;
@@ -2706,12 +2717,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
}
__split_huge_page(page, list, end, flags);
- if (PageSwapCache(head)) {
- swp_entry_t entry = { .val = page_private(head) };
-
- ret = split_swap_cluster(entry);
- } else
- ret = 0;
+ ret = 0;
} else {
if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) {
pr_alert("total_mapcount: %u, page_count(): %u\n",
@@ -2725,7 +2731,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
fail: if (mapping)
xa_unlock(&mapping->i_pages);
spin_unlock_irqrestore(&pgdata->lru_lock, flags);
- remap_page(head);
+ remap_page(head, thp_nr_pages(head));
ret = -EBUSY;
}
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 67fc6383995b..fe76f8fd5a73 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -240,7 +240,6 @@ get_file_region_entry_from_cache(struct resv_map *resv, long from, long to)
resv->region_cache_count--;
nrg = list_first_entry(&resv->region_cache, struct file_region, link);
- VM_BUG_ON(!nrg);
list_del(&nrg->link);
nrg->from = from;
@@ -309,8 +308,7 @@ static void coalesce_file_region(struct resv_map *resv, struct file_region *rg)
list_del(&rg->link);
kfree(rg);
- coalesce_file_region(resv, prg);
- return;
+ rg = prg;
}
nrg = list_next_entry(rg, link);
@@ -320,22 +318,20 @@ static void coalesce_file_region(struct resv_map *resv, struct file_region *rg)
list_del(&rg->link);
kfree(rg);
-
- coalesce_file_region(resv, nrg);
- return;
}
}
-/* Must be called with resv->lock held. Calling this with count_only == true
- * will count the number of pages to be added but will not modify the linked
- * list. If regions_needed != NULL and count_only == true, then regions_needed
- * will indicate the number of file_regions needed in the cache to carry out to
- * add the regions for this range.
+/*
+ * Must be called with resv->lock held.
+ *
+ * Calling this with regions_needed != NULL will count the number of pages
+ * to be added but will not modify the linked list. And regions_needed will
+ * indicate the number of file_regions needed in the cache to carry out to add
+ * the regions for this range.
*/
static long add_reservation_in_range(struct resv_map *resv, long f, long t,
struct hugetlb_cgroup *h_cg,
- struct hstate *h, long *regions_needed,
- bool count_only)
+ struct hstate *h, long *regions_needed)
{
long add = 0;
struct list_head *head = &resv->regions;
@@ -371,14 +367,14 @@ static long add_reservation_in_range(struct resv_map *resv, long f, long t,
*/
if (rg->from > last_accounted_offset) {
add += rg->from - last_accounted_offset;
- if (!count_only) {
+ if (!regions_needed) {
nrg = get_file_region_entry_from_cache(
resv, last_accounted_offset, rg->from);
record_hugetlb_cgroup_uncharge_info(h_cg, h,
resv, nrg);
list_add(&nrg->link, rg->link.prev);
coalesce_file_region(resv, nrg);
- } else if (regions_needed)
+ } else
*regions_needed += 1;
}
@@ -390,13 +386,13 @@ static long add_reservation_in_range(struct resv_map *resv, long f, long t,
*/
if (last_accounted_offset < t) {
add += t - last_accounted_offset;
- if (!count_only) {
+ if (!regions_needed) {
nrg = get_file_region_entry_from_cache(
resv, last_accounted_offset, t);
record_hugetlb_cgroup_uncharge_info(h_cg, h, resv, nrg);
list_add(&nrg->link, rg->link.prev);
coalesce_file_region(resv, nrg);
- } else if (regions_needed)
+ } else
*regions_needed += 1;
}
@@ -448,11 +444,8 @@ static int allocate_file_region_entries(struct resv_map *resv,
spin_lock(&resv->lock);
- list_for_each_entry_safe(rg, trg, &allocated_regions, link) {
- list_del(&rg->link);
- list_add(&rg->link, &resv->region_cache);
- resv->region_cache_count++;
- }
+ list_splice(&allocated_regions, &resv->region_cache);
+ resv->region_cache_count += to_allocate;
}
return 0;
@@ -492,8 +485,8 @@ static long region_add(struct resv_map *resv, long f, long t,
retry:
/* Count how many regions are actually needed to execute this add. */
- add_reservation_in_range(resv, f, t, NULL, NULL, &actual_regions_needed,
- true);
+ add_reservation_in_range(resv, f, t, NULL, NULL,
+ &actual_regions_needed);
/*
* Check for sufficient descriptors in the cache to accommodate
@@ -521,7 +514,7 @@ retry:
goto retry;
}
- add = add_reservation_in_range(resv, f, t, h_cg, h, NULL, false);
+ add = add_reservation_in_range(resv, f, t, h_cg, h, NULL);
resv->adds_in_progress -= in_regions_needed;
@@ -557,9 +550,9 @@ static long region_chg(struct resv_map *resv, long f, long t,
spin_lock(&resv->lock);
- /* Count how many hugepages in this range are NOT respresented. */
+ /* Count how many hugepages in this range are NOT represented. */
chg = add_reservation_in_range(resv, f, t, NULL, NULL,
- out_regions_needed, true);
+ out_regions_needed);
if (*out_regions_needed == 0)
*out_regions_needed = 1;
@@ -1047,21 +1040,17 @@ static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid)
if (nocma && is_migrate_cma_page(page))
continue;
- if (!PageHWPoison(page))
- break;
+ if (PageHWPoison(page))
+ continue;
+
+ list_move(&page->lru, &h->hugepage_activelist);
+ set_page_refcounted(page);
+ h->free_huge_pages--;
+ h->free_huge_pages_node[nid]--;
+ return page;
}
- /*
- * if 'non-isolated free hugepage' not found on the list,
- * the allocation fails.
- */
- if (&h->hugepage_freelists[nid] == &page->lru)
- return NULL;
- list_move(&page->lru, &h->hugepage_activelist);
- set_page_refcounted(page);
- h->free_huge_pages--;
- h->free_huge_pages_node[nid]--;
- return page;
+ return NULL;
}
static struct page *dequeue_huge_page_nodemask(struct hstate *h, gfp_t gfp_mask, int nid,
@@ -1511,9 +1500,9 @@ static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
{
INIT_LIST_HEAD(&page->lru);
set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
- spin_lock(&hugetlb_lock);
set_hugetlb_cgroup(page, NULL);
set_hugetlb_cgroup_rsvd(page, NULL);
+ spin_lock(&hugetlb_lock);
h->nr_huge_pages++;
h->nr_huge_pages_node[nid]++;
spin_unlock(&hugetlb_lock);
@@ -2423,7 +2412,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
h->resv_huge_pages--;
}
spin_lock(&hugetlb_lock);
- list_move(&page->lru, &h->hugepage_activelist);
+ list_add(&page->lru, &h->hugepage_activelist);
/* Fall through */
}
hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, page);
@@ -3582,18 +3571,20 @@ void hugetlb_report_meminfo(struct seq_file *m)
seq_printf(m, "Hugetlb: %8lu kB\n", total / 1024);
}
-int hugetlb_report_node_meminfo(int nid, char *buf)
+int hugetlb_report_node_meminfo(char *buf, int len, int nid)
{
struct hstate *h = &default_hstate;
+
if (!hugepages_supported())
return 0;
- return sprintf(buf,
- "Node %d HugePages_Total: %5u\n"
- "Node %d HugePages_Free: %5u\n"
- "Node %d HugePages_Surp: %5u\n",
- nid, h->nr_huge_pages_node[nid],
- nid, h->free_huge_pages_node[nid],
- nid, h->surplus_huge_pages_node[nid]);
+
+ return sysfs_emit_at(buf, len,
+ "Node %d HugePages_Total: %5u\n"
+ "Node %d HugePages_Free: %5u\n"
+ "Node %d HugePages_Surp: %5u\n",
+ nid, h->nr_huge_pages_node[nid],
+ nid, h->free_huge_pages_node[nid],
+ nid, h->surplus_huge_pages_node[nid]);
}
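For reference, sysfs_emit_at() appends into the PAGE_SIZE sysfs buffer at the given offset and returns the number of bytes written, so several emits can be chained without overrunning the buffer. A minimal sketch of a sysfs show() callback using it; the attribute and the values are illustrative, not part of this patch:

	static ssize_t stats_show(struct device *dev, struct device_attribute *attr,
				  char *buf)
	{
		int len = 0;

		/* Each call stays within the PAGE_SIZE sysfs buffer. */
		len += sysfs_emit_at(buf, len, "requests: %lu\n", 42UL); /* illustrative */
		len += sysfs_emit_at(buf, len, "errors:   %lu\n", 0UL);  /* illustrative */
		return len;
	}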
void hugetlb_show_meminfo(void)
@@ -3799,23 +3790,23 @@ bool is_hugetlb_entry_migration(pte_t pte)
if (huge_pte_none(pte) || pte_present(pte))
return false;
swp = pte_to_swp_entry(pte);
- if (non_swap_entry(swp) && is_migration_entry(swp))
+ if (is_migration_entry(swp))
return true;
else
return false;
}
-static int is_hugetlb_entry_hwpoisoned(pte_t pte)
+static bool is_hugetlb_entry_hwpoisoned(pte_t pte)
{
swp_entry_t swp;
if (huge_pte_none(pte) || pte_present(pte))
- return 0;
+ return false;
swp = pte_to_swp_entry(pte);
- if (non_swap_entry(swp) && is_hwpoison_entry(swp))
- return 1;
+ if (is_hwpoison_entry(swp))
+ return true;
else
- return 0;
+ return false;
}
int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
@@ -5348,10 +5339,16 @@ void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
* !shared pmd case because we can allocate the pmd later as well, it makes the
* code much cleaner.
*
- * This routine must be called with i_mmap_rwsem held in at least read mode.
- * For hugetlbfs, this prevents removal of any page table entries associated
- * with the address space. This is important as we are setting up sharing
- * based on existing page table entries (mappings).
+ * This routine must be called with i_mmap_rwsem held in at least read mode if
+ * sharing is possible. For hugetlbfs, this prevents removal of any page
+ * table entries associated with the address space. This is important as we
+ * are setting up sharing based on existing page table entries (mappings).
+ *
+ * NOTE: This routine is only called from huge_pte_alloc. Some callers of
+ * huge_pte_alloc know that sharing is not possible and do not take
+ * i_mmap_rwsem as a performance optimization. This is handled by the
+ * if !vma_shareable check at the beginning of the routine. i_mmap_rwsem is
+ * only required for subsequent processing.
*/
pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
{
@@ -5368,6 +5365,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
if (!vma_shareable(vma, addr))
return (pte_t *)pmd_alloc(mm, pud, addr);
+ i_mmap_assert_locked(mapping);
vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) {
if (svma == vma)
continue;
@@ -5708,12 +5706,12 @@ void __init hugetlb_cma_reserve(int order)
reserved = 0;
for_each_node_state(nid, N_ONLINE) {
int res;
- char name[20];
+ char name[CMA_MAX_NAME];
size = min(per_node, hugetlb_cma_size - reserved);
size = round_up(size, PAGE_SIZE << order);
- snprintf(name, 20, "hugetlb%d", nid);
+ snprintf(name, sizeof(name), "hugetlb%d", nid);
res = cma_declare_contiguous_nid(0, size, 0, PAGE_SIZE << order,
0, false, name,
&hugetlb_cma[nid], nid);
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c
index e488876b168a..1ae1ebc2b9b1 100644
--- a/mm/hwpoison-inject.c
+++ b/mm/hwpoison-inject.c
@@ -26,11 +26,6 @@ static int hwpoison_inject(void *data, u64 val)
p = pfn_to_page(pfn);
hpage = compound_head(p);
- /*
- * This implies unable to support free buddy pages.
- */
- if (!get_hwpoison_page(p))
- return 0;
if (!hwpoison_filter_enable)
goto inject;
@@ -40,23 +35,20 @@ static int hwpoison_inject(void *data, u64 val)
* This implies unable to support non-LRU pages.
*/
if (!PageLRU(hpage) && !PageHuge(p))
- goto put_out;
+ return 0;
/*
- * do a racy check with elevated page count, to make sure PG_hwpoison
- * will only be set for the targeted owner (or on a free page).
+ * do a racy check to make sure PG_hwpoison will only be set for
+ * the targeted owner (or on a free page).
* memory_failure() will redo the check reliably inside page lock.
*/
err = hwpoison_filter(hpage);
if (err)
- goto put_out;
+ return 0;
inject:
pr_info("Injecting memory failure at pfn %#lx\n", pfn);
- return memory_failure(pfn, MF_COUNT_INCREASED);
-put_out:
- put_hwpoison_page(p);
- return 0;
+ return memory_failure(pfn, 0);
}
static int hwpoison_unpoison(void *data, u64 val)
diff --git a/mm/internal.h b/mm/internal.h
index 10c677655912..c43ccdddb0f6 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -49,22 +49,20 @@ void unmap_page_range(struct mmu_gather *tlb,
unsigned long addr, unsigned long end,
struct zap_details *details);
-void force_page_cache_readahead(struct address_space *, struct file *,
- pgoff_t index, unsigned long nr_to_read);
-void __do_page_cache_readahead(struct address_space *, struct file *,
- pgoff_t index, unsigned long nr_to_read,
+void do_page_cache_ra(struct readahead_control *, unsigned long nr_to_read,
unsigned long lookahead_size);
-
-/*
- * Submit IO for the read-ahead request in file_ra_state.
- */
-static inline void ra_submit(struct file_ra_state *ra,
- struct address_space *mapping, struct file *filp)
+void force_page_cache_ra(struct readahead_control *, struct file_ra_state *,
+ unsigned long nr);
+static inline void force_page_cache_readahead(struct address_space *mapping,
+ struct file *file, pgoff_t index, unsigned long nr_to_read)
{
- __do_page_cache_readahead(mapping, filp,
- ra->start, ra->size, ra->async_size);
+ DEFINE_READAHEAD(ractl, file, mapping, index);
+ force_page_cache_ra(&ractl, &file->f_ra, nr_to_read);
}
+struct page *find_get_entry(struct address_space *mapping, pgoff_t index);
+struct page *find_lock_entry(struct address_space *mapping, pgoff_t index);
+
/**
* page_evictable - test whether a page is evictable
* @page: the page to test
@@ -272,16 +270,16 @@ int find_suitable_fallback(struct free_area *area, unsigned int order,
* page from being allocated in parallel and returning garbage as the order.
* If a caller does not hold page_zone(page)->lock, it must guarantee that the
* page cannot be allocated or merged in parallel. Alternatively, it must
- * handle invalid values gracefully, and use page_order_unsafe() below.
+ * handle invalid values gracefully, and use buddy_order_unsafe() below.
*/
-static inline unsigned int page_order(struct page *page)
+static inline unsigned int buddy_order(struct page *page)
{
/* PageBuddy() must be checked by the caller */
return page_private(page);
}
/*
- * Like page_order(), but for callers who cannot afford to hold the zone lock.
+ * Like buddy_order(), but for callers who cannot afford to hold the zone lock.
* PageBuddy() should be checked first by the caller to minimize race window,
* and invalid values must be handled gracefully.
*
@@ -291,7 +289,7 @@ static inline unsigned int page_order(struct page *page)
* times, potentially observing different values in the tests and the actual
* use of the result.
*/
-#define page_order_unsafe(page) READ_ONCE(page_private(page))
+#define buddy_order_unsafe(page) READ_ONCE(page_private(page))
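A minimal sketch of the intended usage pattern (illustrative, not taken from this patch): read the order once into a local variable and range-check it, since the page can stop being a buddy at any moment:

	if (PageBuddy(page)) {
		unsigned int order = buddy_order_unsafe(page);

		/* The value may already be stale: only act on it after a
		 * sanity check, and never re-read it for the same decision. */
		if (order < MAX_ORDER)
			pfn += (1UL << order) - 1;	/* illustrative use */
	}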
static inline bool is_cow_mapping(vm_flags_t flags)
{
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
index 4f49fa6cd1aa..00a53f1355ae 100644
--- a/mm/kasan/report.c
+++ b/mm/kasan/report.c
@@ -33,6 +33,8 @@
#include <asm/sections.h>
+#include <kunit/test.h>
+
#include "kasan.h"
#include "../slab.h"
@@ -93,7 +95,7 @@ static void end_report(unsigned long *flags)
pr_err("==================================================================\n");
add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
spin_unlock_irqrestore(&report_lock, *flags);
- if (panic_on_warn) {
+ if (panic_on_warn && !test_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags)) {
/*
* This thread may hit another WARN() in the panic path.
* Resetting this prevents additional WARN() from panicking the
@@ -464,12 +466,37 @@ static bool report_enabled(void)
return !test_and_set_bit(KASAN_BIT_REPORTED, &kasan_flags);
}
+#if IS_ENABLED(CONFIG_KUNIT)
+static void kasan_update_kunit_status(struct kunit *cur_test)
+{
+ struct kunit_resource *resource;
+ struct kunit_kasan_expectation *kasan_data;
+
+ resource = kunit_find_named_resource(cur_test, "kasan_data");
+
+ if (!resource) {
+ kunit_set_failure(cur_test);
+ return;
+ }
+
+ kasan_data = (struct kunit_kasan_expectation *)resource->data;
+ kasan_data->report_found = true;
+ kunit_put_resource(resource);
+}
+#endif /* IS_ENABLED(CONFIG_KUNIT) */
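For context, a rough sketch of the test side that this hook serves, loosely modeled on the KASAN KUnit tests; the resource setup and the final KUNIT_EXPECT_EQ check are assumptions for illustration only:

	struct kunit_kasan_expectation fail_data = {
		.report_expected = true,
		.report_found = false,
	};
	struct kunit_resource resource;

	kunit_add_named_resource(test, NULL, NULL, &resource,
				 "kasan_data", &fail_data);
	/* ...perform the access that should trigger a KASAN report... */
	KUNIT_EXPECT_EQ(test, fail_data.report_expected, fail_data.report_found);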
+
void kasan_report_invalid_free(void *object, unsigned long ip)
{
unsigned long flags;
u8 tag = get_tag(object);
object = reset_tag(object);
+
+#if IS_ENABLED(CONFIG_KUNIT)
+ if (current->kunit_test)
+ kasan_update_kunit_status(current->kunit_test);
+#endif /* IS_ENABLED(CONFIG_KUNIT) */
+
start_report(&flags);
pr_err("BUG: KASAN: double-free or invalid-free in %pS\n", (void *)ip);
print_tags(tag, object);
@@ -488,6 +515,11 @@ static void __kasan_report(unsigned long addr, size_t size, bool is_write,
void *untagged_addr;
unsigned long flags;
+#if IS_ENABLED(CONFIG_KUNIT)
+ if (current->kunit_test)
+ kasan_update_kunit_status(current->kunit_test);
+#endif /* IS_ENABLED(CONFIG_KUNIT) */
+
disable_trace_on_warning();
tagged_addr = (void *)addr;
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index cfa0dba5fd3b..4e3dff13eb70 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -56,6 +56,9 @@ enum scan_result {
#define CREATE_TRACE_POINTS
#include <trace/events/huge_memory.h>
+static struct task_struct *khugepaged_thread __read_mostly;
+static DEFINE_MUTEX(khugepaged_mutex);
+
/* default scan 8*512 pte (or vmas) every 30 second */
static unsigned int khugepaged_pages_to_scan __read_mostly;
static unsigned int khugepaged_pages_collapsed;
@@ -431,7 +434,7 @@ static void insert_to_mm_slots_hash(struct mm_struct *mm,
static inline int khugepaged_test_exit(struct mm_struct *mm)
{
- return atomic_read(&mm->mm_users) == 0 || !mmget_still_valid(mm);
+ return atomic_read(&mm->mm_users) == 0;
}
static bool hugepage_vma_check(struct vm_area_struct *vma,
@@ -914,6 +917,18 @@ static struct page *khugepaged_alloc_hugepage(bool *wait)
static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
{
+ /*
+ * If the hpage allocated earlier was briefly exposed in page cache
+ * before collapse_file() failed, it is possible that racing lookups
+ * have not yet completed, and would then be unpleasantly surprised by
+ * finding the hpage reused for the same mapping at a different offset.
+ * Just release the previous allocation if there is any danger of that.
+ */
+ if (*hpage && page_count(*hpage) > 1) {
+ put_page(*hpage);
+ *hpage = NULL;
+ }
+
if (!*hpage)
*hpage = khugepaged_alloc_hugepage(wait);
@@ -2292,8 +2307,6 @@ static void set_recommended_min_free_kbytes(void)
int start_stop_khugepaged(void)
{
- static struct task_struct *khugepaged_thread __read_mostly;
- static DEFINE_MUTEX(khugepaged_mutex);
int err = 0;
mutex_lock(&khugepaged_mutex);
@@ -2320,3 +2333,11 @@ fail:
mutex_unlock(&khugepaged_mutex);
return err;
}
+
+void khugepaged_min_free_kbytes_update(void)
+{
+ mutex_lock(&khugepaged_mutex);
+ if (khugepaged_enabled() && khugepaged_thread)
+ set_recommended_min_free_kbytes();
+ mutex_unlock(&khugepaged_mutex);
+}
diff --git a/mm/kmemleak-test.c b/mm/kmemleak-test.c
deleted file mode 100644
index e19279ff6aa3..000000000000
--- a/mm/kmemleak-test.c
+++ /dev/null
@@ -1,99 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * mm/kmemleak-test.c
- *
- * Copyright (C) 2008 ARM Limited
- * Written by Catalin Marinas <catalin.marinas@arm.com>
- */
-
-#define pr_fmt(fmt) "kmemleak: " fmt
-
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/slab.h>
-#include <linux/vmalloc.h>
-#include <linux/list.h>
-#include <linux/percpu.h>
-#include <linux/fdtable.h>
-
-#include <linux/kmemleak.h>
-
-struct test_node {
- long header[25];
- struct list_head list;
- long footer[25];
-};
-
-static LIST_HEAD(test_list);
-static DEFINE_PER_CPU(void *, kmemleak_test_pointer);
-
-/*
- * Some very simple testing. This function needs to be extended for
- * proper testing.
- */
-static int __init kmemleak_test_init(void)
-{
- struct test_node *elem;
- int i;
-
- pr_info("Kmemleak testing\n");
-
- /* make some orphan objects */
- pr_info("kmalloc(32) = %p\n", kmalloc(32, GFP_KERNEL));
- pr_info("kmalloc(32) = %p\n", kmalloc(32, GFP_KERNEL));
- pr_info("kmalloc(1024) = %p\n", kmalloc(1024, GFP_KERNEL));
- pr_info("kmalloc(1024) = %p\n", kmalloc(1024, GFP_KERNEL));
- pr_info("kmalloc(2048) = %p\n", kmalloc(2048, GFP_KERNEL));
- pr_info("kmalloc(2048) = %p\n", kmalloc(2048, GFP_KERNEL));
- pr_info("kmalloc(4096) = %p\n", kmalloc(4096, GFP_KERNEL));
- pr_info("kmalloc(4096) = %p\n", kmalloc(4096, GFP_KERNEL));
-#ifndef CONFIG_MODULES
- pr_info("kmem_cache_alloc(files_cachep) = %p\n",
- kmem_cache_alloc(files_cachep, GFP_KERNEL));
- pr_info("kmem_cache_alloc(files_cachep) = %p\n",
- kmem_cache_alloc(files_cachep, GFP_KERNEL));
-#endif
- pr_info("vmalloc(64) = %p\n", vmalloc(64));
- pr_info("vmalloc(64) = %p\n", vmalloc(64));
- pr_info("vmalloc(64) = %p\n", vmalloc(64));
- pr_info("vmalloc(64) = %p\n", vmalloc(64));
- pr_info("vmalloc(64) = %p\n", vmalloc(64));
-
- /*
- * Add elements to a list. They should only appear as orphan
- * after the module is removed.
- */
- for (i = 0; i < 10; i++) {
- elem = kzalloc(sizeof(*elem), GFP_KERNEL);
- pr_info("kzalloc(sizeof(*elem)) = %p\n", elem);
- if (!elem)
- return -ENOMEM;
- INIT_LIST_HEAD(&elem->list);
- list_add_tail(&elem->list, &test_list);
- }
-
- for_each_possible_cpu(i) {
- per_cpu(kmemleak_test_pointer, i) = kmalloc(129, GFP_KERNEL);
- pr_info("kmalloc(129) = %p\n",
- per_cpu(kmemleak_test_pointer, i));
- }
-
- return 0;
-}
-module_init(kmemleak_test_init);
-
-static void __exit kmemleak_test_exit(void)
-{
- struct test_node *elem, *tmp;
-
- /*
- * Remove the list elements without actually freeing the
- * memory.
- */
- list_for_each_entry_safe(elem, tmp, &test_list, list)
- list_del(&elem->list);
-}
-module_exit(kmemleak_test_exit);
-
-MODULE_LICENSE("GPL");
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 5e252d91eb14..c0014d3b91c1 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -1471,15 +1471,15 @@ static void kmemleak_scan(void)
if (kmemleak_stack_scan) {
struct task_struct *p, *g;
- read_lock(&tasklist_lock);
- do_each_thread(g, p) {
+ rcu_read_lock();
+ for_each_process_thread(g, p) {
void *stack = try_get_task_stack(p);
if (stack) {
scan_block(stack, stack + THREAD_SIZE, NULL);
put_task_stack(p);
}
- } while_each_thread(g, p);
- read_unlock(&tasklist_lock);
+ }
+ rcu_read_unlock();
}
/*
diff --git a/mm/ksm.c b/mm/ksm.c
index 9afccc36dbd2..0960750bb316 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -81,7 +81,7 @@
* different KSM page copy of that content
*
* Internally, the regular nodes, "dups" and "chains" are represented
- * using the same :c:type:`struct stable_node` structure.
+ * using the same struct stable_node structure.
*
* In addition to the stable tree, KSM uses a second data structure called the
* unstable tree: this tree holds pointers to pages which have been found to
diff --git a/mm/madvise.c b/mm/madvise.c
index 0e0d61003fc6..416a56b8e757 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -17,6 +17,8 @@
#include <linux/falloc.h>
#include <linux/fadvise.h>
#include <linux/sched.h>
+#include <linux/sched/mm.h>
+#include <linux/uio.h>
#include <linux/ksm.h>
#include <linux/fs.h>
#include <linux/file.h>
@@ -27,7 +29,6 @@
#include <linux/swapops.h>
#include <linux/shmem_fs.h>
#include <linux/mmu_notifier.h>
-#include <linux/sched/mm.h>
#include <asm/tlb.h>
@@ -224,25 +225,28 @@ static void force_shm_swapin_readahead(struct vm_area_struct *vma,
unsigned long start, unsigned long end,
struct address_space *mapping)
{
- pgoff_t index;
+ XA_STATE(xas, &mapping->i_pages, linear_page_index(vma, start));
+ pgoff_t end_index = end / PAGE_SIZE;
struct page *page;
- swp_entry_t swap;
- for (; start < end; start += PAGE_SIZE) {
- index = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
+ rcu_read_lock();
+ xas_for_each(&xas, page, end_index) {
+ swp_entry_t swap;
- page = find_get_entry(mapping, index);
- if (!xa_is_value(page)) {
- if (page)
- put_page(page);
+ if (!xa_is_value(page))
continue;
- }
+ xas_pause(&xas);
+ rcu_read_unlock();
+
swap = radix_to_swp_entry(page);
page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE,
NULL, 0, false);
if (page)
put_page(page);
+
+ rcu_read_lock();
}
+ rcu_read_unlock();
lru_add_drain(); /* Push any new pages onto the LRU now */
}
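The loop above follows the general XArray pattern for sleeping inside an RCU-protected walk; a generic sketch of that pattern (the helpers named below are hypothetical):

	rcu_read_lock();
	xas_for_each(&xas, entry, last_index) {
		if (!needs_sleeping_work(entry))	/* hypothetical predicate */
			continue;
		/* Record the position, drop RCU, then do the sleeping work. */
		xas_pause(&xas);
		rcu_read_unlock();
		do_sleeping_work(entry);		/* hypothetical helper */
		rcu_read_lock();
	}
	rcu_read_unlock();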
@@ -255,6 +259,7 @@ static long madvise_willneed(struct vm_area_struct *vma,
struct vm_area_struct **prev,
unsigned long start, unsigned long end)
{
+ struct mm_struct *mm = vma->vm_mm;
struct file *file = vma->vm_file;
loff_t offset;
@@ -291,10 +296,10 @@ static long madvise_willneed(struct vm_area_struct *vma,
get_file(file);
offset = (loff_t)(start - vma->vm_start)
+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
- mmap_read_unlock(current->mm);
+ mmap_read_unlock(mm);
vfs_fadvise(file, offset, end - start, POSIX_FADV_WILLNEED);
fput(file);
- mmap_read_lock(current->mm);
+ mmap_read_lock(mm);
return 0;
}
@@ -763,6 +768,8 @@ static long madvise_dontneed_free(struct vm_area_struct *vma,
unsigned long start, unsigned long end,
int behavior)
{
+ struct mm_struct *mm = vma->vm_mm;
+
*prev = vma;
if (!can_madv_lru_vma(vma))
return -EINVAL;
@@ -770,8 +777,8 @@ static long madvise_dontneed_free(struct vm_area_struct *vma,
if (!userfaultfd_remove(vma, start, end)) {
*prev = NULL; /* mmap_lock has been dropped, prev is stale */
- mmap_read_lock(current->mm);
- vma = find_vma(current->mm, start);
+ mmap_read_lock(mm);
+ vma = find_vma(mm, start);
if (!vma)
return -ENOMEM;
if (start < vma->vm_start) {
@@ -825,6 +832,7 @@ static long madvise_remove(struct vm_area_struct *vma,
loff_t offset;
int error;
struct file *f;
+ struct mm_struct *mm = vma->vm_mm;
*prev = NULL; /* tell sys_madvise we drop mmap_lock */
@@ -852,13 +860,13 @@ static long madvise_remove(struct vm_area_struct *vma,
get_file(f);
if (userfaultfd_remove(vma, start, end)) {
/* mmap_lock was not released by userfaultfd_remove() */
- mmap_read_unlock(current->mm);
+ mmap_read_unlock(mm);
}
error = vfs_fallocate(f,
FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
offset, end - start);
fput(f);
- mmap_read_lock(current->mm);
+ mmap_read_lock(mm);
return error;
}
@@ -869,7 +877,6 @@ static long madvise_remove(struct vm_area_struct *vma,
static int madvise_inject_error(int behavior,
unsigned long start, unsigned long end)
{
- struct page *page;
struct zone *zone;
unsigned long size;
@@ -879,6 +886,7 @@ static int madvise_inject_error(int behavior,
for (; start < end; start += size) {
unsigned long pfn;
+ struct page *page;
int ret;
ret = get_user_pages_fast(start, 1, 0, &page);
@@ -893,32 +901,23 @@ static int madvise_inject_error(int behavior,
*/
size = page_size(compound_head(page));
- if (PageHWPoison(page)) {
- put_page(page);
- continue;
- }
-
if (behavior == MADV_SOFT_OFFLINE) {
pr_info("Soft offlining pfn %#lx at process virtual address %#lx\n",
- pfn, start);
-
+ pfn, start);
ret = soft_offline_page(pfn, MF_COUNT_INCREASED);
- if (ret)
- return ret;
- continue;
+ } else {
+ pr_info("Injecting memory failure for pfn %#lx at process virtual address %#lx\n",
+ pfn, start);
+ /*
+ * Drop the page reference taken by get_user_pages_fast(). In
+ * the absence of MF_COUNT_INCREASED the memory_failure()
+ * routine is responsible for pinning the page to prevent it
+ * from being released back to the page allocator.
+ */
+ put_page(page);
+ ret = memory_failure(pfn, 0);
}
- pr_info("Injecting memory failure for pfn %#lx at process virtual address %#lx\n",
- pfn, start);
-
- /*
- * Drop the page reference taken by get_user_pages_fast(). In
- * the absence of MF_COUNT_INCREASED the memory_failure()
- * routine is responsible for pinning the page to prevent it
- * from being released back to the page allocator.
- */
- put_page(page);
- ret = memory_failure(pfn, 0);
if (ret)
return ret;
}
@@ -990,6 +989,18 @@ madvise_behavior_valid(int behavior)
}
}
+static bool
+process_madvise_behavior_valid(int behavior)
+{
+ switch (behavior) {
+ case MADV_COLD:
+ case MADV_PAGEOUT:
+ return true;
+ default:
+ return false;
+ }
+}
+
/*
* The madvise(2) system call.
*
@@ -1037,6 +1048,11 @@ madvise_behavior_valid(int behavior)
* MADV_DONTDUMP - the application wants to prevent pages in the given range
* from being included in its core dump.
* MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump.
+ * MADV_COLD - the application is not expected to use this memory soon,
+ * deactivate pages in this range so that they can be reclaimed
+ * easily if memory pressure happens.
+ * MADV_PAGEOUT - the application is not expected to use this memory soon,
+ * page out the pages in this range immediately.
*
* return values:
* zero - success
@@ -1051,7 +1067,7 @@ madvise_behavior_valid(int behavior)
* -EBADF - map exists, but area maps something that isn't a file.
* -EAGAIN - a kernel resource was temporarily unavailable.
*/
-int do_madvise(unsigned long start, size_t len_in, int behavior)
+int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior)
{
unsigned long end, tmp;
struct vm_area_struct *vma, *prev;
@@ -1089,27 +1105,10 @@ int do_madvise(unsigned long start, size_t len_in, int behavior)
write = madvise_need_mmap_write(behavior);
if (write) {
- if (mmap_write_lock_killable(current->mm))
- return -EINTR;
-
- /*
- * We may have stolen the mm from another process
- * that is undergoing core dumping.
- *
- * Right now that's io_uring, in the future it may
- * be remote process management and not "current"
- * at all.
- *
- * We need to fix core dumping to not do this,
- * but for now we have the mmget_still_valid()
- * model.
- */
- if (!mmget_still_valid(current->mm)) {
- mmap_write_unlock(current->mm);
+ if (mmap_write_lock_killable(mm))
return -EINTR;
- }
} else {
- mmap_read_lock(current->mm);
+ mmap_read_lock(mm);
}
/*
@@ -1117,7 +1116,7 @@ int do_madvise(unsigned long start, size_t len_in, int behavior)
* ranges, just ignore them, but return -ENOMEM at the end.
* - different from the way of handling in mlock etc.
*/
- vma = find_vma_prev(current->mm, start, &prev);
+ vma = find_vma_prev(mm, start, &prev);
if (vma && start > vma->vm_start)
prev = vma;
@@ -1154,19 +1153,92 @@ int do_madvise(unsigned long start, size_t len_in, int behavior)
if (prev)
vma = prev->vm_next;
else /* madvise_remove dropped mmap_lock */
- vma = find_vma(current->mm, start);
+ vma = find_vma(mm, start);
}
out:
blk_finish_plug(&plug);
if (write)
- mmap_write_unlock(current->mm);
+ mmap_write_unlock(mm);
else
- mmap_read_unlock(current->mm);
+ mmap_read_unlock(mm);
return error;
}
SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
{
- return do_madvise(start, len_in, behavior);
+ return do_madvise(current->mm, start, len_in, behavior);
+}
+
+SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
+ size_t, vlen, int, behavior, unsigned int, flags)
+{
+ ssize_t ret;
+ struct iovec iovstack[UIO_FASTIOV], iovec;
+ struct iovec *iov = iovstack;
+ struct iov_iter iter;
+ struct pid *pid;
+ struct task_struct *task;
+ struct mm_struct *mm;
+ size_t total_len;
+ unsigned int f_flags;
+
+ if (flags != 0) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = import_iovec(READ, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter);
+ if (ret < 0)
+ goto out;
+
+ pid = pidfd_get_pid(pidfd, &f_flags);
+ if (IS_ERR(pid)) {
+ ret = PTR_ERR(pid);
+ goto free_iov;
+ }
+
+ task = get_pid_task(pid, PIDTYPE_PID);
+ if (!task) {
+ ret = -ESRCH;
+ goto put_pid;
+ }
+
+ if (task->mm != current->mm &&
+ !process_madvise_behavior_valid(behavior)) {
+ ret = -EINVAL;
+ goto release_task;
+ }
+
+ mm = mm_access(task, PTRACE_MODE_ATTACH_FSCREDS);
+ if (IS_ERR_OR_NULL(mm)) {
+ ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH;
+ goto release_task;
+ }
+
+ total_len = iov_iter_count(&iter);
+
+ while (iov_iter_count(&iter)) {
+ iovec = iov_iter_iovec(&iter);
+ ret = do_madvise(mm, (unsigned long)iovec.iov_base,
+ iovec.iov_len, behavior);
+ if (ret < 0)
+ break;
+ iov_iter_advance(&iter, iovec.iov_len);
+ }
+
+ if (ret == 0)
+ ret = total_len - iov_iter_count(&iter);
+
+ mmput(mm);
+ return ret;
+
+release_task:
+ put_task_struct(task);
+put_pid:
+ put_pid(pid);
+free_iov:
+ kfree(iov);
+out:
+ return ret;
}
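A hypothetical userspace sketch of the new syscall (not part of the patch); it assumes the toolchain headers provide __NR_pidfd_open, __NR_process_madvise and MADV_COLD, otherwise those constants must be defined manually, and error handling is kept minimal:

	#define _GNU_SOURCE
	#include <stdio.h>
	#include <sys/mman.h>
	#include <sys/syscall.h>
	#include <sys/uio.h>
	#include <unistd.h>

	int main(void)
	{
		pid_t target = 1234;				/* illustrative pid */
		struct iovec iov = {
			.iov_base = (void *)0x7f0000000000UL,	/* illustrative range */
			.iov_len  = 2UL * 1024 * 1024,
		};
		int pidfd = syscall(__NR_pidfd_open, target, 0);
		long ret;

		if (pidfd < 0) {
			perror("pidfd_open");
			return 1;
		}
		ret = syscall(__NR_process_madvise, pidfd, &iov, 1, MADV_COLD, 0);
		if (ret < 0)
			perror("process_madvise");
		else
			printf("advised %ld bytes\n", ret);
		return 0;
	}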
diff --git a/mm/memblock.c b/mm/memblock.c
index 45f198750be9..b68ee86788af 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -48,12 +48,12 @@
* boot regardless of the possible restrictions and memory hot(un)plug;
* the ``physmem`` type is only available on some architectures.
*
- * Each region is represented by :c:type:`struct memblock_region` that
+ * Each region is represented by struct memblock_region that
* defines the region extents, its attributes and NUMA node id on NUMA
- * systems. Every memory type is described by the :c:type:`struct
- * memblock_type` which contains an array of memory regions along with
+ * systems. Every memory type is described by the struct memblock_type
+ * which contains an array of memory regions along with
* the allocator metadata. The "memory" and "reserved" types are nicely
- * wrapped with :c:type:`struct memblock`. This structure is statically
+ * wrapped with struct memblock. This structure is statically
* initialized at build time. The region arrays are initially sized to
* %INIT_MEMBLOCK_REGIONS for "memory" and %INIT_MEMBLOCK_RESERVED_REGIONS
* for "reserved". The region array for "physmem" is initially sized to
@@ -132,7 +132,26 @@ struct memblock_type physmem = {
};
#endif
-int memblock_debug __initdata_memblock;
+/*
+ * keep a pointer to &memblock.memory in the text section to use it in
+ * __next_mem_range() and its helpers.
+ * For architectures that do not keep memblock data after init, this
+ * pointer will be reset to NULL at memblock_discard()
+ */
+static __refdata struct memblock_type *memblock_memory = &memblock.memory;
+
+#define for_each_memblock_type(i, memblock_type, rgn) \
+ for (i = 0, rgn = &memblock_type->regions[0]; \
+ i < memblock_type->cnt; \
+ i++, rgn = &memblock_type->regions[i])
+
+#define memblock_dbg(fmt, ...) \
+ do { \
+ if (memblock_debug) \
+ pr_info(fmt, ##__VA_ARGS__); \
+ } while (0)
+
+static int memblock_debug __initdata_memblock;
static bool system_has_some_mirror __initdata_memblock = false;
static int memblock_can_resize __initdata_memblock;
static int memblock_memory_in_slab __initdata_memblock = 0;
@@ -391,6 +410,8 @@ void __init memblock_discard(void)
memblock.memory.max);
__memblock_free_late(addr, size);
}
+
+ memblock_memory = NULL;
}
#endif
@@ -941,42 +962,16 @@ int __init_memblock memblock_clear_nomap(phys_addr_t base, phys_addr_t size)
return memblock_setclr_flag(base, size, 0, MEMBLOCK_NOMAP);
}
-/**
- * __next_reserved_mem_region - next function for for_each_reserved_region()
- * @idx: pointer to u64 loop variable
- * @out_start: ptr to phys_addr_t for start address of the region, can be %NULL
- * @out_end: ptr to phys_addr_t for end address of the region, can be %NULL
- *
- * Iterate over all reserved memory regions.
- */
-void __init_memblock __next_reserved_mem_region(u64 *idx,
- phys_addr_t *out_start,
- phys_addr_t *out_end)
-{
- struct memblock_type *type = &memblock.reserved;
-
- if (*idx < type->cnt) {
- struct memblock_region *r = &type->regions[*idx];
- phys_addr_t base = r->base;
- phys_addr_t size = r->size;
-
- if (out_start)
- *out_start = base;
- if (out_end)
- *out_end = base + size - 1;
-
- *idx += 1;
- return;
- }
-
- /* signal end of iteration */
- *idx = ULLONG_MAX;
-}
-
-static bool should_skip_region(struct memblock_region *m, int nid, int flags)
+static bool should_skip_region(struct memblock_type *type,
+ struct memblock_region *m,
+ int nid, int flags)
{
int m_nid = memblock_get_region_node(m);
+ /* we never skip regions when iterating memblock.reserved or physmem */
+ if (type != memblock_memory)
+ return false;
+
/* only memory regions are associated with nodes, check it */
if (nid != NUMA_NO_NODE && nid != m_nid)
return true;
@@ -1041,7 +1036,7 @@ void __next_mem_range(u64 *idx, int nid, enum memblock_flags flags,
phys_addr_t m_end = m->base + m->size;
int m_nid = memblock_get_region_node(m);
- if (should_skip_region(m, nid, flags))
+ if (should_skip_region(type_a, m, nid, flags))
continue;
if (!type_b) {
@@ -1145,7 +1140,7 @@ void __init_memblock __next_mem_range_rev(u64 *idx, int nid,
phys_addr_t m_end = m->base + m->size;
int m_nid = memblock_get_region_node(m);
- if (should_skip_region(m, nid, flags))
+ if (should_skip_region(type_a, m, nid, flags))
continue;
if (!type_b) {
@@ -1649,23 +1644,6 @@ phys_addr_t __init_memblock memblock_reserved_size(void)
return memblock.reserved.total_size;
}
-phys_addr_t __init memblock_mem_size(unsigned long limit_pfn)
-{
- unsigned long pages = 0;
- struct memblock_region *r;
- unsigned long start_pfn, end_pfn;
-
- for_each_memblock(memory, r) {
- start_pfn = memblock_region_memory_base_pfn(r);
- end_pfn = memblock_region_memory_end_pfn(r);
- start_pfn = min_t(unsigned long, start_pfn, limit_pfn);
- end_pfn = min_t(unsigned long, end_pfn, limit_pfn);
- pages += end_pfn - start_pfn;
- }
-
- return PFN_PHYS(pages);
-}
-
/* lowest address */
phys_addr_t __init_memblock memblock_start_of_DRAM(void)
{
@@ -1689,7 +1667,7 @@ static phys_addr_t __init_memblock __find_max_addr(phys_addr_t limit)
* the memory memblock regions, if the @limit exceeds the total size
* of those regions, max_addr will keep original value PHYS_ADDR_MAX
*/
- for_each_memblock(memory, r) {
+ for_each_mem_region(r) {
if (limit <= r->size) {
max_addr = r->base + limit;
break;
@@ -1859,7 +1837,7 @@ void __init_memblock memblock_trim_memory(phys_addr_t align)
phys_addr_t start, end, orig_start, orig_end;
struct memblock_region *r;
- for_each_memblock(memory, r) {
+ for_each_mem_region(r) {
orig_start = r->base;
orig_end = r->base + r->size;
start = round_up(orig_start, align);
@@ -1915,7 +1893,7 @@ static void __init_memblock memblock_dump(struct memblock_type *type)
}
}
-void __init_memblock __memblock_dump_all(void)
+static void __init_memblock __memblock_dump_all(void)
{
pr_info("MEMBLOCK configuration:\n");
pr_info(" memory size = %pa reserved size = %pa\n",
@@ -1929,6 +1907,12 @@ void __init_memblock __memblock_dump_all(void)
#endif
}
+void __init_memblock memblock_dump_all(void)
+{
+ if (memblock_debug)
+ __memblock_dump_all();
+}
+
void __init memblock_allow_resize(void)
{
memblock_can_resize = 1;
@@ -1981,7 +1965,7 @@ static unsigned long __init free_low_memory_core_early(void)
memblock_clear_hotplug(0, -1);
- for_each_reserved_mem_region(i, &start, &end)
+ for_each_reserved_mem_range(i, &start, &end)
reserve_bootmem_region(start, end);
/*
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 6877c765b8d0..3a24e3b619f5 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -73,6 +73,9 @@ EXPORT_SYMBOL(memory_cgrp_subsys);
struct mem_cgroup *root_mem_cgroup __read_mostly;
+/* Active memory cgroup to use from an interrupt context */
+DEFINE_PER_CPU(struct mem_cgroup *, int_active_memcg);
+
/* Socket memory accounting disabled? */
static bool cgroup_memory_nosocket;
@@ -197,14 +200,6 @@ static struct move_charge_struct {
#define MEM_CGROUP_MAX_RECLAIM_LOOPS 100
#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2
-enum charge_type {
- MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
- MEM_CGROUP_CHARGE_TYPE_ANON,
- MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */
- MEM_CGROUP_CHARGE_TYPE_DROP, /* a page was unused swap cache */
- NR_CHARGE_TYPE,
-};
-
/* for encoding cft->private value on file */
enum res_type {
_MEM,
@@ -1069,23 +1064,56 @@ struct mem_cgroup *get_mem_cgroup_from_page(struct page *page)
}
EXPORT_SYMBOL(get_mem_cgroup_from_page);
-/**
- * If current->active_memcg is non-NULL, do not fallback to current->mm->memcg.
- */
-static __always_inline struct mem_cgroup *get_mem_cgroup_from_current(void)
+static __always_inline struct mem_cgroup *active_memcg(void)
{
- if (unlikely(current->active_memcg)) {
- struct mem_cgroup *memcg;
+ if (in_interrupt())
+ return this_cpu_read(int_active_memcg);
+ else
+ return current->active_memcg;
+}
- rcu_read_lock();
+static __always_inline struct mem_cgroup *get_active_memcg(void)
+{
+ struct mem_cgroup *memcg;
+
+ rcu_read_lock();
+ memcg = active_memcg();
+ if (memcg) {
/* current->active_memcg must hold a ref. */
- if (WARN_ON_ONCE(!css_tryget(&current->active_memcg->css)))
+ if (WARN_ON_ONCE(!css_tryget(&memcg->css)))
memcg = root_mem_cgroup;
else
memcg = current->active_memcg;
- rcu_read_unlock();
- return memcg;
}
+ rcu_read_unlock();
+
+ return memcg;
+}
+
+static __always_inline bool memcg_kmem_bypass(void)
+{
+ /* Allow remote memcg charging from any context. */
+ if (unlikely(active_memcg()))
+ return false;
+
+ /* Memcg to charge can't be determined. */
+ if (in_interrupt() || !current->mm || (current->flags & PF_KTHREAD))
+ return true;
+
+ return false;
+}
+
+/**
+ * If active memcg is set, do not fall back to current->mm->memcg.
+ */
+static __always_inline struct mem_cgroup *get_mem_cgroup_from_current(void)
+{
+ if (memcg_kmem_bypass())
+ return NULL;
+
+ if (unlikely(active_memcg()))
+ return get_active_memcg();
+
return get_mem_cgroup_from_mm(current->mm);
}
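The practical effect is that remote charging now also works from interrupt context via the int_active_memcg per-cpu slot. A minimal kernel-side sketch of the pattern, assuming the set_active_memcg() helper from linux/sched/mm.h (variables are illustrative):

	struct mem_cgroup *old_memcg;

	old_memcg = set_active_memcg(memcg);		/* charge to 'memcg' */
	buf = kmalloc(size, GFP_KERNEL | __GFP_ACCOUNT);
	set_active_memcg(old_memcg);			/* restore the previous value */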
@@ -1102,9 +1130,9 @@ static __always_inline struct mem_cgroup *get_mem_cgroup_from_current(void)
* invocations for reference counting, or use mem_cgroup_iter_break()
* to cancel a hierarchy walk before the round-trip is complete.
*
- * Reclaimers can specify a node and a priority level in @reclaim to
- * divide up the memcgs in the hierarchy among all concurrent
- * reclaimers operating on the same node and priority.
+ * Reclaimers can specify a node in @reclaim to divide up the memcgs
+ * in the hierarchy among all concurrent reclaimers operating on the
+ * same node.
*/
struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
struct mem_cgroup *prev,
@@ -1456,6 +1484,70 @@ static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
return false;
}
+struct memory_stat {
+ const char *name;
+ unsigned int ratio;
+ unsigned int idx;
+};
+
+static struct memory_stat memory_stats[] = {
+ { "anon", PAGE_SIZE, NR_ANON_MAPPED },
+ { "file", PAGE_SIZE, NR_FILE_PAGES },
+ { "kernel_stack", 1024, NR_KERNEL_STACK_KB },
+ { "percpu", 1, MEMCG_PERCPU_B },
+ { "sock", PAGE_SIZE, MEMCG_SOCK },
+ { "shmem", PAGE_SIZE, NR_SHMEM },
+ { "file_mapped", PAGE_SIZE, NR_FILE_MAPPED },
+ { "file_dirty", PAGE_SIZE, NR_FILE_DIRTY },
+ { "file_writeback", PAGE_SIZE, NR_WRITEBACK },
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ /*
+ * The ratio will be initialized in memory_stats_init(), because
+ * on some architectures the HPAGE_PMD_SIZE macro is not a
+ * constant (e.g. powerpc).
+ */
+ { "anon_thp", 0, NR_ANON_THPS },
+#endif
+ { "inactive_anon", PAGE_SIZE, NR_INACTIVE_ANON },
+ { "active_anon", PAGE_SIZE, NR_ACTIVE_ANON },
+ { "inactive_file", PAGE_SIZE, NR_INACTIVE_FILE },
+ { "active_file", PAGE_SIZE, NR_ACTIVE_FILE },
+ { "unevictable", PAGE_SIZE, NR_UNEVICTABLE },
+
+ /*
+ * Note: slab_reclaimable and slab_unreclaimable must appear
+ * together, with slab_reclaimable in front.
+ */
+ { "slab_reclaimable", 1, NR_SLAB_RECLAIMABLE_B },
+ { "slab_unreclaimable", 1, NR_SLAB_UNRECLAIMABLE_B },
+
+ /* The memory events */
+ { "workingset_refault_anon", 1, WORKINGSET_REFAULT_ANON },
+ { "workingset_refault_file", 1, WORKINGSET_REFAULT_FILE },
+ { "workingset_activate_anon", 1, WORKINGSET_ACTIVATE_ANON },
+ { "workingset_activate_file", 1, WORKINGSET_ACTIVATE_FILE },
+ { "workingset_restore_anon", 1, WORKINGSET_RESTORE_ANON },
+ { "workingset_restore_file", 1, WORKINGSET_RESTORE_FILE },
+ { "workingset_nodereclaim", 1, WORKINGSET_NODERECLAIM },
+};
+
+static int __init memory_stats_init(void)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ if (memory_stats[i].idx == NR_ANON_THPS)
+ memory_stats[i].ratio = HPAGE_PMD_SIZE;
+#endif
+ VM_BUG_ON(!memory_stats[i].ratio);
+ VM_BUG_ON(memory_stats[i].idx >= MEMCG_NR_STAT);
+ }
+
+ return 0;
+}
+pure_initcall(memory_stats_init);
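The ratio simply converts each per-item counter into bytes before it is printed; for example (illustrative counts, 4 KiB pages):

	anon              3 pages * 4096 -> "anon 12288"
	kernel_stack      16 kB   * 1024 -> "kernel_stack 16384"
	slab_reclaimable  8192 B  * 1    -> "slab_reclaimable 8192"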
+
static char *memory_stat_format(struct mem_cgroup *memcg)
{
struct seq_buf s;
@@ -1476,52 +1568,19 @@ static char *memory_stat_format(struct mem_cgroup *memcg)
* Current memory state:
*/
- seq_buf_printf(&s, "anon %llu\n",
- (u64)memcg_page_state(memcg, NR_ANON_MAPPED) *
- PAGE_SIZE);
- seq_buf_printf(&s, "file %llu\n",
- (u64)memcg_page_state(memcg, NR_FILE_PAGES) *
- PAGE_SIZE);
- seq_buf_printf(&s, "kernel_stack %llu\n",
- (u64)memcg_page_state(memcg, NR_KERNEL_STACK_KB) *
- 1024);
- seq_buf_printf(&s, "slab %llu\n",
- (u64)(memcg_page_state(memcg, NR_SLAB_RECLAIMABLE_B) +
- memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE_B)));
- seq_buf_printf(&s, "percpu %llu\n",
- (u64)memcg_page_state(memcg, MEMCG_PERCPU_B));
- seq_buf_printf(&s, "sock %llu\n",
- (u64)memcg_page_state(memcg, MEMCG_SOCK) *
- PAGE_SIZE);
-
- seq_buf_printf(&s, "shmem %llu\n",
- (u64)memcg_page_state(memcg, NR_SHMEM) *
- PAGE_SIZE);
- seq_buf_printf(&s, "file_mapped %llu\n",
- (u64)memcg_page_state(memcg, NR_FILE_MAPPED) *
- PAGE_SIZE);
- seq_buf_printf(&s, "file_dirty %llu\n",
- (u64)memcg_page_state(memcg, NR_FILE_DIRTY) *
- PAGE_SIZE);
- seq_buf_printf(&s, "file_writeback %llu\n",
- (u64)memcg_page_state(memcg, NR_WRITEBACK) *
- PAGE_SIZE);
-
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- seq_buf_printf(&s, "anon_thp %llu\n",
- (u64)memcg_page_state(memcg, NR_ANON_THPS) *
- HPAGE_PMD_SIZE);
-#endif
+ for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
+ u64 size;
- for (i = 0; i < NR_LRU_LISTS; i++)
- seq_buf_printf(&s, "%s %llu\n", lru_list_name(i),
- (u64)memcg_page_state(memcg, NR_LRU_BASE + i) *
- PAGE_SIZE);
+ size = memcg_page_state(memcg, memory_stats[i].idx);
+ size *= memory_stats[i].ratio;
+ seq_buf_printf(&s, "%s %llu\n", memory_stats[i].name, size);
- seq_buf_printf(&s, "slab_reclaimable %llu\n",
- (u64)memcg_page_state(memcg, NR_SLAB_RECLAIMABLE_B));
- seq_buf_printf(&s, "slab_unreclaimable %llu\n",
- (u64)memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE_B));
+ if (unlikely(memory_stats[i].idx == NR_SLAB_UNRECLAIMABLE_B)) {
+ size = memcg_page_state(memcg, NR_SLAB_RECLAIMABLE_B) +
+ memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE_B);
+ seq_buf_printf(&s, "slab %llu\n", size);
+ }
+ }
/* Accumulated memory events */
@@ -1529,22 +1588,6 @@ static char *memory_stat_format(struct mem_cgroup *memcg)
memcg_events(memcg, PGFAULT));
seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGMAJFAULT),
memcg_events(memcg, PGMAJFAULT));
-
- seq_buf_printf(&s, "workingset_refault_anon %lu\n",
- memcg_page_state(memcg, WORKINGSET_REFAULT_ANON));
- seq_buf_printf(&s, "workingset_refault_file %lu\n",
- memcg_page_state(memcg, WORKINGSET_REFAULT_FILE));
- seq_buf_printf(&s, "workingset_activate_anon %lu\n",
- memcg_page_state(memcg, WORKINGSET_ACTIVATE_ANON));
- seq_buf_printf(&s, "workingset_activate_file %lu\n",
- memcg_page_state(memcg, WORKINGSET_ACTIVATE_FILE));
- seq_buf_printf(&s, "workingset_restore_anon %lu\n",
- memcg_page_state(memcg, WORKINGSET_RESTORE_ANON));
- seq_buf_printf(&s, "workingset_restore_file %lu\n",
- memcg_page_state(memcg, WORKINGSET_RESTORE_FILE));
- seq_buf_printf(&s, "workingset_nodereclaim %lu\n",
- memcg_page_state(memcg, WORKINGSET_NODERECLAIM));
-
seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGREFILL),
memcg_events(memcg, PGREFILL));
seq_buf_printf(&s, "pgscan %lu\n",
@@ -1641,17 +1684,19 @@ void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg)
*/
unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg)
{
- unsigned long max;
+ unsigned long max = READ_ONCE(memcg->memory.max);
- max = READ_ONCE(memcg->memory.max);
- if (mem_cgroup_swappiness(memcg)) {
- unsigned long memsw_max;
- unsigned long swap_max;
+ if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
+ if (mem_cgroup_swappiness(memcg))
+ max += min(READ_ONCE(memcg->swap.max),
+ (unsigned long)total_swap_pages);
+ } else { /* v1 */
+ if (mem_cgroup_swappiness(memcg)) {
+ /* Calculate swap excess capacity from memsw limit */
+ unsigned long swap = READ_ONCE(memcg->memsw.max) - max;
- memsw_max = memcg->memsw.max;
- swap_max = READ_ONCE(memcg->swap.max);
- swap_max = min(swap_max, (unsigned long)total_swap_pages);
- max = min(max + swap_max, memsw_max);
+ max += min(swap, (unsigned long)total_swap_pages);
+ }
}
return max;
}
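Worked examples of the two branches, with illustrative limits and assuming the system has more physical swap than either figure:

	cgroup v2: memory.max = 1 GiB, swap.max = 512 MiB
		   max = 1 GiB + min(512 MiB, total_swap_pages) = 1.5 GiB
	cgroup v1: memory limit = 1 GiB, memsw limit = 1.5 GiB
		   swap excess = 1.5 GiB - 1 GiB = 512 MiB
		   max = 1 GiB + min(512 MiB, total_swap_pages) = 1.5 GiB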
@@ -1817,8 +1862,8 @@ static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
struct mem_cgroup *iter;
/*
- * When a new child is created while the hierarchy is under oom,
- * mem_cgroup_oom_lock() may not be called. Watch for underflow.
+ * Be careful about under_oom underflows because a child memcg
+ * could have been added after mem_cgroup_mark_under_oom().
*/
spin_lock(&memcg_oom_lock);
for_each_mem_cgroup_tree(iter, memcg)
@@ -2888,6 +2933,17 @@ struct mem_cgroup *mem_cgroup_from_obj(void *p)
page = virt_to_head_page(p);
/*
+ * If page->mem_cgroup is set, it's either a simple mem_cgroup pointer
+ * or a pointer to an obj_cgroup vector. In the latter case the lowest
+ * bit of the pointer is set.
+ * The page->mem_cgroup pointer can be asynchronously changed
+ * from NULL to (obj_cgroup_vec | 0x1UL), but can't be changed
+ * from a valid memcg pointer to an objcg vector or back.
+ */
+ if (!page->mem_cgroup)
+ return NULL;
+
+ /*
* Slab objects are accounted individually, not per-page.
* Memcg membership data for each individual object is saved in
* the page->obj_cgroups.
@@ -2913,12 +2969,12 @@ __always_inline struct obj_cgroup *get_obj_cgroup_from_current(void)
struct obj_cgroup *objcg = NULL;
struct mem_cgroup *memcg;
- if (unlikely(!current->mm && !current->active_memcg))
+ if (memcg_kmem_bypass())
return NULL;
rcu_read_lock();
- if (unlikely(current->active_memcg))
- memcg = rcu_dereference(current->active_memcg);
+ if (unlikely(active_memcg()))
+ memcg = active_memcg();
else
memcg = mem_cgroup_from_task(current);
@@ -3039,19 +3095,16 @@ int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order)
struct mem_cgroup *memcg;
int ret = 0;
- if (memcg_kmem_bypass())
- return 0;
-
memcg = get_mem_cgroup_from_current();
- if (!mem_cgroup_is_root(memcg)) {
+ if (memcg && !mem_cgroup_is_root(memcg)) {
ret = __memcg_kmem_charge(memcg, gfp, 1 << order);
if (!ret) {
page->mem_cgroup = memcg;
__SetPageKmemcg(page);
return 0;
}
+ css_put(&memcg->css);
}
- css_put(&memcg->css);
return ret;
}
@@ -4255,17 +4308,16 @@ static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
new->size = size;
/* Copy thresholds (if any) to new array */
- if (thresholds->primary) {
- memcpy(new->entries, thresholds->primary->entries, (size - 1) *
- sizeof(struct mem_cgroup_threshold));
- }
+ if (thresholds->primary)
+ memcpy(new->entries, thresholds->primary->entries,
+ flex_array_size(new, entries, size - 1));
/* Add new threshold */
new->entries[size - 1].eventfd = eventfd;
new->entries[size - 1].threshold = threshold;
/* Sort thresholds. Registering of new threshold isn't time-critical */
- sort(new->entries, size, sizeof(struct mem_cgroup_threshold),
+ sort(new->entries, size, sizeof(*new->entries),
compare_thresholds, NULL);
/* Find current threshold */
@@ -5271,12 +5323,12 @@ static struct cgroup_subsys_state * __ref
mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
{
struct mem_cgroup *parent = mem_cgroup_from_css(parent_css);
- struct mem_cgroup *memcg;
+ struct mem_cgroup *memcg, *old_memcg;
long error = -ENOMEM;
- memalloc_use_memcg(parent);
+ old_memcg = set_active_memcg(parent);
memcg = mem_cgroup_alloc();
- memalloc_unuse_memcg();
+ set_active_memcg(old_memcg);
if (IS_ERR(memcg))
return ERR_CAST(memcg);
@@ -5291,13 +5343,11 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
memcg->use_hierarchy = true;
page_counter_init(&memcg->memory, &parent->memory);
page_counter_init(&memcg->swap, &parent->swap);
- page_counter_init(&memcg->memsw, &parent->memsw);
page_counter_init(&memcg->kmem, &parent->kmem);
page_counter_init(&memcg->tcpmem, &parent->tcpmem);
} else {
page_counter_init(&memcg->memory, NULL);
page_counter_init(&memcg->swap, NULL);
- page_counter_init(&memcg->memsw, NULL);
page_counter_init(&memcg->kmem, NULL);
page_counter_init(&memcg->tcpmem, NULL);
/*
@@ -5426,7 +5476,6 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
page_counter_set_max(&memcg->memory, PAGE_COUNTER_MAX);
page_counter_set_max(&memcg->swap, PAGE_COUNTER_MAX);
- page_counter_set_max(&memcg->memsw, PAGE_COUNTER_MAX);
page_counter_set_max(&memcg->kmem, PAGE_COUNTER_MAX);
page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX);
page_counter_set_min(&memcg->memory, 0);
@@ -5500,7 +5549,7 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
struct page *page = NULL;
swp_entry_t ent = pte_to_swp_entry(ptent);
- if (!(mc.flags & MOVE_ANON) || non_swap_entry(ent))
+ if (!(mc.flags & MOVE_ANON))
return NULL;
/*
@@ -5519,6 +5568,9 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
return page;
}
+ if (non_swap_entry(ent))
+ return NULL;
+
/*
* Because lookup_swap_cache() updates some statistics counter,
* we call find_get_page() with swapper_space directly.
@@ -5539,35 +5591,15 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
unsigned long addr, pte_t ptent, swp_entry_t *entry)
{
- struct page *page = NULL;
- struct address_space *mapping;
- pgoff_t pgoff;
-
if (!vma->vm_file) /* anonymous vma */
return NULL;
if (!(mc.flags & MOVE_FILE))
return NULL;
- mapping = vma->vm_file->f_mapping;
- pgoff = linear_page_index(vma, addr);
-
/* page is moved even if it's not RSS of this task(page-faulted). */
-#ifdef CONFIG_SWAP
/* shmem/tmpfs may report page out on swap: account for that too. */
- if (shmem_mapping(mapping)) {
- page = find_get_entry(mapping, pgoff);
- if (xa_is_value(page)) {
- swp_entry_t swp = radix_to_swp_entry(page);
- *entry = swp;
- page = find_get_page(swap_address_space(swp),
- swp_offset(swp));
- }
- } else
- page = find_get_page(mapping, pgoff);
-#else
- page = find_get_page(mapping, pgoff);
-#endif
- return page;
+ return find_get_incore_page(vma->vm_file->f_mapping,
+ linear_page_index(vma, addr));
}
/**
@@ -5643,7 +5675,7 @@ static int mem_cgroup_move_account(struct page *page,
if (PageDirty(page)) {
struct address_space *mapping = page_mapping(page);
- if (mapping_cap_account_dirty(mapping)) {
+ if (mapping_can_writeback(mapping)) {
__mod_lruvec_state(from_vec, NR_FILE_DIRTY,
-nr_pages);
__mod_lruvec_state(to_vec, NR_FILE_DIRTY,
@@ -6393,6 +6425,35 @@ static int memory_stat_show(struct seq_file *m, void *v)
return 0;
}
+#ifdef CONFIG_NUMA
+static int memory_numa_stat_show(struct seq_file *m, void *v)
+{
+ int i;
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
+
+ for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
+ int nid;
+
+ if (memory_stats[i].idx >= NR_VM_NODE_STAT_ITEMS)
+ continue;
+
+ seq_printf(m, "%s", memory_stats[i].name);
+ for_each_node_state(nid, N_MEMORY) {
+ u64 size;
+ struct lruvec *lruvec;
+
+ lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
+ size = lruvec_page_state(lruvec, memory_stats[i].idx);
+ size *= memory_stats[i].ratio;
+ seq_printf(m, " N%d=%llu", nid, size);
+ }
+ seq_putc(m, '\n');
+ }
+
+ return 0;
+}
+#endif
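The resulting memory.numa_stat file breaks each node-level counter down by NUMA node, one line per stat; illustrative output on a two-node machine (values are made up):

	anon N0=6930432 N1=12288
	file N0=2150400 N1=4096
	kernel_stack N0=32768 N1=0
	...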
+
static int memory_oom_group_show(struct seq_file *m, void *v)
{
struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
@@ -6470,6 +6531,12 @@ static struct cftype memory_files[] = {
.name = "stat",
.seq_show = memory_stat_show,
},
+#ifdef CONFIG_NUMA
+ {
+ .name = "numa_stat",
+ .seq_show = memory_numa_stat_show,
+ },
+#endif
{
.name = "oom.group",
.flags = CFTYPE_NOT_ON_ROOT | CFTYPE_NS_DELEGATABLE,
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index f1aa6433f404..c0bb186bba62 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -65,6 +65,33 @@ int sysctl_memory_failure_recovery __read_mostly = 1;
atomic_long_t num_poisoned_pages __read_mostly = ATOMIC_LONG_INIT(0);
+static bool page_handle_poison(struct page *page, bool hugepage_or_freepage, bool release)
+{
+ if (hugepage_or_freepage) {
+ /*
+ * Doing this check for free pages is also fine since dissolve_free_huge_page
+ * returns 0 for non-hugetlb pages as well.
+ */
+ if (dissolve_free_huge_page(page) || !take_page_off_buddy(page))
+ /*
+ * We could fail to take off the target page from buddy
+ * for example due to a racy page allocation, but that's
+ * acceptable because a soft-offlined page is not broken
+ * and if someone really wants to use it, they should
+ * take it.
+ */
+ return false;
+ }
+
+ SetPageHWPoison(page);
+ if (release)
+ put_page(page);
+ page_ref_inc(page);
+ num_poisoned_pages_inc();
+
+ return true;
+}
+
#if defined(CONFIG_HWPOISON_INJECT) || defined(CONFIG_HWPOISON_INJECT_MODULE)
u32 hwpoison_filter_enable = 0;
@@ -484,11 +511,12 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
struct vm_area_struct *vma;
struct task_struct *tsk;
struct address_space *mapping = page->mapping;
+ pgoff_t pgoff;
i_mmap_lock_read(mapping);
read_lock(&tasklist_lock);
+ pgoff = page_to_pgoff(page);
for_each_process(tsk) {
- pgoff_t pgoff = page_to_pgoff(page);
struct task_struct *t = task_early_kill(tsk, force_early);
if (!t)
@@ -554,6 +582,7 @@ static const char * const action_page_types[] = {
[MF_MSG_BUDDY] = "free buddy page",
[MF_MSG_BUDDY_2ND] = "free buddy page (2nd try)",
[MF_MSG_DAX] = "dax page",
+ [MF_MSG_UNSPLIT_THP] = "unsplit thp",
[MF_MSG_UNKNOWN] = "unknown page",
};
@@ -824,7 +853,6 @@ static int me_huge_page(struct page *p, unsigned long pfn)
#define sc ((1UL << PG_swapcache) | (1UL << PG_swapbacked))
#define unevict (1UL << PG_unevictable)
#define mlock (1UL << PG_mlocked)
-#define writeback (1UL << PG_writeback)
#define lru (1UL << PG_lru)
#define head (1UL << PG_head)
#define slab (1UL << PG_slab)
@@ -873,7 +901,6 @@ static struct page_state {
#undef sc
#undef unevict
#undef mlock
-#undef writeback
#undef lru
#undef head
#undef slab
@@ -925,7 +952,7 @@ static int page_action(struct page_state *ps, struct page *p,
* Return: return 0 if failed to grab the refcount, otherwise true (some
* non-zero value.)
*/
-int get_hwpoison_page(struct page *page)
+static int get_hwpoison_page(struct page *page)
{
struct page *head = compound_head(page);
@@ -954,7 +981,6 @@ int get_hwpoison_page(struct page *page)
return 0;
}
-EXPORT_SYMBOL_GPL(get_hwpoison_page);
/*
* Do all that is necessary to remove user space mappings. Unmap
@@ -1006,7 +1032,7 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
*/
mapping = page_mapping(hpage);
if (!(flags & MF_MUST_KILL) && !PageDirty(hpage) && mapping &&
- mapping_cap_writeback_dirty(mapping)) {
+ mapping_can_writeback(mapping)) {
if (page_mkclean(hpage)) {
SetPageDirty(hpage);
} else {
@@ -1104,6 +1130,25 @@ static int identify_page_state(unsigned long pfn, struct page *p,
return page_action(ps, p, pfn);
}
+static int try_to_split_thp_page(struct page *page, const char *msg)
+{
+ lock_page(page);
+ if (!PageAnon(page) || unlikely(split_huge_page(page))) {
+ unsigned long pfn = page_to_pfn(page);
+
+ unlock_page(page);
+ if (!PageAnon(page))
+ pr_info("%s: %#lx: non anonymous thp\n", msg, pfn);
+ else
+ pr_info("%s: %#lx: thp split failed\n", msg, pfn);
+ put_page(page);
+ return -EBUSY;
+ }
+ unlock_page(page);
+
+ return 0;
+}
+
static int memory_failure_hugetlb(unsigned long pfn, int flags)
{
struct page *p = pfn_to_page(pfn);
@@ -1145,7 +1190,7 @@ static int memory_failure_hugetlb(unsigned long pfn, int flags)
pr_err("Memory failure: %#lx: just unpoisoned\n", pfn);
num_poisoned_pages_dec();
unlock_page(head);
- put_hwpoison_page(head);
+ put_page(head);
return 0;
}
@@ -1326,23 +1371,11 @@ int memory_failure(unsigned long pfn, int flags)
}
if (PageTransHuge(hpage)) {
- lock_page(p);
- if (!PageAnon(p) || unlikely(split_huge_page(p))) {
- unlock_page(p);
- if (!PageAnon(p))
- pr_err("Memory failure: %#lx: non anonymous thp\n",
- pfn);
- else
- pr_err("Memory failure: %#lx: thp split failed\n",
- pfn);
- if (TestClearPageHWPoison(p))
- num_poisoned_pages_dec();
- put_hwpoison_page(p);
+ if (try_to_split_thp_page(p, "Memory Failure") < 0) {
+ action_result(pfn, MF_MSG_UNSPLIT_THP, MF_IGNORED);
return -EBUSY;
}
- unlock_page(p);
VM_BUG_ON_PAGE(!page_count(p), p);
- hpage = compound_head(p);
}
/*
@@ -1382,10 +1415,7 @@ int memory_failure(unsigned long pfn, int flags)
* page_remove_rmap() in try_to_unmap_one(). So to determine page status
* correctly, we save a copy of the page flags at this time.
*/
- if (PageHuge(p))
- page_flags = hpage->flags;
- else
- page_flags = p->flags;
+ page_flags = p->flags;
/*
* unpoison always clear PG_hwpoison inside page lock
@@ -1394,14 +1424,14 @@ int memory_failure(unsigned long pfn, int flags)
pr_err("Memory failure: %#lx: just unpoisoned\n", pfn);
num_poisoned_pages_dec();
unlock_page(p);
- put_hwpoison_page(p);
+ put_page(p);
return 0;
}
if (hwpoison_filter(p)) {
if (TestClearPageHWPoison(p))
num_poisoned_pages_dec();
unlock_page(p);
- put_hwpoison_page(p);
+ put_page(p);
return 0;
}
@@ -1417,11 +1447,8 @@ int memory_failure(unsigned long pfn, int flags)
/*
* Now take care of user space mappings.
* Abort on fail: __delete_from_page_cache() assumes unmapped page.
- *
- * When the raw error page is thp tail page, hpage points to the raw
- * page after thp split.
*/
- if (!hwpoison_user_mappings(p, pfn, flags, &hpage)) {
+ if (!hwpoison_user_mappings(p, pfn, flags, &p)) {
action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED);
res = -EBUSY;
goto out;
@@ -1638,24 +1665,14 @@ int unpoison_memory(unsigned long pfn)
}
unlock_page(page);
- put_hwpoison_page(page);
+ put_page(page);
if (freeit && !(pfn == my_zero_pfn(0) && page_count(p) == 1))
- put_hwpoison_page(page);
+ put_page(page);
return 0;
}
EXPORT_SYMBOL(unpoison_memory);
-static struct page *new_page(struct page *p, unsigned long private)
-{
- struct migration_target_control mtc = {
- .nid = page_to_nid(p),
- .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
- };
-
- return alloc_migration_target(p, (unsigned long)&mtc);
-}
-
/*
* Safely get reference count of an arbitrary page.
* Returns 0 for a free page, -EIO for a zero refcount page
@@ -1680,6 +1697,9 @@ static int __get_any_page(struct page *p, unsigned long pfn, int flags)
} else if (is_free_buddy_page(p)) {
pr_info("%s: %#lx free buddy page\n", __func__, pfn);
ret = 0;
+ } else if (page_count(p)) {
+ /* raced with allocation */
+ ret = -EBUSY;
} else {
pr_info("%s: %#lx: unknown zero refcount page type %lx\n",
__func__, pfn, p->flags);
@@ -1696,12 +1716,15 @@ static int get_any_page(struct page *page, unsigned long pfn, int flags)
{
int ret = __get_any_page(page, pfn, flags);
+ if (ret == -EBUSY)
+ ret = __get_any_page(page, pfn, flags);
+
if (ret == 1 && !PageHuge(page) &&
!PageLRU(page) && !__PageMovable(page)) {
/*
* Try to free it.
*/
- put_hwpoison_page(page);
+ put_page(page);
shake_page(page, 1);
/*
@@ -1710,7 +1733,7 @@ static int get_any_page(struct page *page, unsigned long pfn, int flags)
ret = __get_any_page(page, pfn, 0);
if (ret == 1 && !PageLRU(page)) {
/* Drop page reference which is from __get_any_page() */
- put_hwpoison_page(page);
+ put_page(page);
pr_info("soft_offline: %#lx: unknown non LRU page type %lx (%pGp)\n",
pfn, page->flags, &page->flags);
return -EIO;
@@ -1719,69 +1742,55 @@ static int get_any_page(struct page *page, unsigned long pfn, int flags)
return ret;
}
-static int soft_offline_huge_page(struct page *page, int flags)
+static bool isolate_page(struct page *page, struct list_head *pagelist)
{
- int ret;
- unsigned long pfn = page_to_pfn(page);
- struct page *hpage = compound_head(page);
- LIST_HEAD(pagelist);
+ bool isolated = false;
+ bool lru = PageLRU(page);
- /*
- * This double-check of PageHWPoison is to avoid the race with
- * memory_failure(). See also comment in __soft_offline_page().
- */
- lock_page(hpage);
- if (PageHWPoison(hpage)) {
- unlock_page(hpage);
- put_hwpoison_page(hpage);
- pr_info("soft offline: %#lx hugepage already poisoned\n", pfn);
- return -EBUSY;
+ if (PageHuge(page)) {
+ isolated = isolate_huge_page(page, pagelist);
+ } else {
+ if (lru)
+ isolated = !isolate_lru_page(page);
+ else
+ isolated = !isolate_movable_page(page, ISOLATE_UNEVICTABLE);
+
+ if (isolated)
+ list_add(&page->lru, pagelist);
}
- unlock_page(hpage);
- ret = isolate_huge_page(hpage, &pagelist);
+ if (isolated && lru)
+ inc_node_page_state(page, NR_ISOLATED_ANON +
+ page_is_file_lru(page));
+
/*
- * get_any_page() and isolate_huge_page() takes a refcount each,
- * so need to drop one here.
+	 * If we succeed in isolating the page, we grabbed another refcount on
+	 * the page, so we can safely drop the one we got from get_any_page().
+	 * If we fail to isolate the page, it means that we cannot go further
+	 * and we will return an error, so drop the reference we got from
+	 * get_any_page() as well.
*/
- put_hwpoison_page(hpage);
- if (!ret) {
- pr_info("soft offline: %#lx hugepage failed to isolate\n", pfn);
- return -EBUSY;
- }
-
- ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
- MIGRATE_SYNC, MR_MEMORY_FAILURE);
- if (ret) {
- pr_info("soft offline: %#lx: hugepage migration failed %d, type %lx (%pGp)\n",
- pfn, ret, page->flags, &page->flags);
- if (!list_empty(&pagelist))
- putback_movable_pages(&pagelist);
- if (ret > 0)
- ret = -EIO;
- } else {
- /*
- * We set PG_hwpoison only when the migration source hugepage
- * was successfully dissolved, because otherwise hwpoisoned
- * hugepage remains on free hugepage list, then userspace will
- * find it as SIGBUS by allocation failure. That's not expected
- * in soft-offlining.
- */
- ret = dissolve_free_huge_page(page);
- if (!ret) {
- if (set_hwpoison_free_buddy_page(page))
- num_poisoned_pages_inc();
- else
- ret = -EBUSY;
- }
- }
- return ret;
+ put_page(page);
+ return isolated;
}
-static int __soft_offline_page(struct page *page, int flags)
+/*
+ * __soft_offline_page handles hugetlb-pages and non-hugetlb pages.
+ * If the page is a non-dirty unmapped page-cache page, it is simply invalidated.
+ * If the page is mapped, its contents are migrated to a new page.
+ */
+static int __soft_offline_page(struct page *page)
{
- int ret;
+ int ret = 0;
unsigned long pfn = page_to_pfn(page);
+ struct page *hpage = compound_head(page);
+ char const *msg_page[] = {"page", "hugepage"};
+ bool huge = PageHuge(page);
+ LIST_HEAD(pagelist);
+ struct migration_target_control mtc = {
+ .nid = NUMA_NO_NODE,
+ .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
+ };
/*
* Check PageHWPoison again inside page lock because PageHWPoison
@@ -1790,121 +1799,75 @@ static int __soft_offline_page(struct page *page, int flags)
* so there's no race between soft_offline_page() and memory_failure().
*/
lock_page(page);
- wait_on_page_writeback(page);
+ if (!PageHuge(page))
+ wait_on_page_writeback(page);
if (PageHWPoison(page)) {
unlock_page(page);
- put_hwpoison_page(page);
+ put_page(page);
pr_info("soft offline: %#lx page already poisoned\n", pfn);
- return -EBUSY;
+ return 0;
}
- /*
- * Try to invalidate first. This should work for
- * non dirty unmapped page cache pages.
- */
- ret = invalidate_inode_page(page);
+
+ if (!PageHuge(page))
+ /*
+ * Try to invalidate first. This should work for
+ * non dirty unmapped page cache pages.
+ */
+ ret = invalidate_inode_page(page);
unlock_page(page);
+
/*
* RED-PEN would be better to keep it isolated here, but we
* would need to fix isolation locking first.
*/
- if (ret == 1) {
- put_hwpoison_page(page);
+ if (ret) {
pr_info("soft_offline: %#lx: invalidated\n", pfn);
- SetPageHWPoison(page);
- num_poisoned_pages_inc();
+ page_handle_poison(page, false, true);
return 0;
}
- /*
- * Simple invalidation didn't work.
- * Try to migrate to a new page instead. migrate.c
- * handles a large number of cases for us.
- */
- if (PageLRU(page))
- ret = isolate_lru_page(page);
- else
- ret = isolate_movable_page(page, ISOLATE_UNEVICTABLE);
- /*
- * Drop page reference which is came from get_any_page()
- * successful isolate_lru_page() already took another one.
- */
- put_hwpoison_page(page);
- if (!ret) {
- LIST_HEAD(pagelist);
- /*
- * After isolated lru page, the PageLRU will be cleared,
- * so use !__PageMovable instead for LRU page's mapping
- * cannot have PAGE_MAPPING_MOVABLE.
- */
- if (!__PageMovable(page))
- inc_node_page_state(page, NR_ISOLATED_ANON +
- page_is_file_lru(page));
- list_add(&page->lru, &pagelist);
- ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
- MIGRATE_SYNC, MR_MEMORY_FAILURE);
- if (ret) {
+ if (isolate_page(hpage, &pagelist)) {
+ ret = migrate_pages(&pagelist, alloc_migration_target, NULL,
+ (unsigned long)&mtc, MIGRATE_SYNC, MR_MEMORY_FAILURE);
+ if (!ret) {
+ bool release = !huge;
+
+ if (!page_handle_poison(page, huge, release))
+ ret = -EBUSY;
+ } else {
if (!list_empty(&pagelist))
putback_movable_pages(&pagelist);
- pr_info("soft offline: %#lx: migration failed %d, type %lx (%pGp)\n",
- pfn, ret, page->flags, &page->flags);
+ pr_info("soft offline: %#lx: %s migration failed %d, type %lx (%pGp)\n",
+ pfn, msg_page[huge], ret, page->flags, &page->flags);
if (ret > 0)
ret = -EIO;
}
} else {
- pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx (%pGp)\n",
- pfn, ret, page_count(page), page->flags, &page->flags);
+ pr_info("soft offline: %#lx: %s isolation failed: %d, page count %d, type %lx (%pGp)\n",
+ pfn, msg_page[huge], ret, page_count(page), page->flags, &page->flags);
+ ret = -EBUSY;
}
return ret;
}
-static int soft_offline_in_use_page(struct page *page, int flags)
+static int soft_offline_in_use_page(struct page *page)
{
- int ret;
- int mt;
struct page *hpage = compound_head(page);
- if (!PageHuge(page) && PageTransHuge(hpage)) {
- lock_page(page);
- if (!PageAnon(page) || unlikely(split_huge_page(page))) {
- unlock_page(page);
- if (!PageAnon(page))
- pr_info("soft offline: %#lx: non anonymous thp\n", page_to_pfn(page));
- else
- pr_info("soft offline: %#lx: thp split failed\n", page_to_pfn(page));
- put_hwpoison_page(page);
+ if (!PageHuge(page) && PageTransHuge(hpage))
+ if (try_to_split_thp_page(page, "soft offline") < 0)
return -EBUSY;
- }
- unlock_page(page);
- }
-
- /*
- * Setting MIGRATE_ISOLATE here ensures that the page will be linked
- * to free list immediately (not via pcplist) when released after
- * successful page migration. Otherwise we can't guarantee that the
- * page is really free after put_page() returns, so
- * set_hwpoison_free_buddy_page() highly likely fails.
- */
- mt = get_pageblock_migratetype(page);
- set_pageblock_migratetype(page, MIGRATE_ISOLATE);
- if (PageHuge(page))
- ret = soft_offline_huge_page(page, flags);
- else
- ret = __soft_offline_page(page, flags);
- set_pageblock_migratetype(page, mt);
- return ret;
+ return __soft_offline_page(page);
}
static int soft_offline_free_page(struct page *page)
{
- int rc = dissolve_free_huge_page(page);
+ int rc = 0;
+
+ if (!page_handle_poison(page, true, false))
+ rc = -EBUSY;
- if (!rc) {
- if (set_hwpoison_free_buddy_page(page))
- num_poisoned_pages_inc();
- else
- rc = -EBUSY;
- }
return rc;
}
@@ -1934,6 +1897,7 @@ int soft_offline_page(unsigned long pfn, int flags)
{
int ret;
struct page *page;
+ bool try_again = true;
if (!pfn_valid(pfn))
return -ENXIO;
@@ -1945,18 +1909,22 @@ int soft_offline_page(unsigned long pfn, int flags)
if (PageHWPoison(page)) {
pr_info("soft offline: %#lx page already poisoned\n", pfn);
if (flags & MF_COUNT_INCREASED)
- put_hwpoison_page(page);
- return -EBUSY;
+ put_page(page);
+ return 0;
}
+retry:
get_online_mems();
ret = get_any_page(page, pfn, flags);
put_online_mems();
if (ret > 0)
- ret = soft_offline_in_use_page(page, flags);
+ ret = soft_offline_in_use_page(page);
else if (ret == 0)
- ret = soft_offline_free_page(page);
+ if (soft_offline_free_page(page) && try_again) {
+ try_again = false;
+ goto retry;
+ }
return ret;
}
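
/*
 * Illustrative sketch, not part of the patch: the retry flow that the
 * soft_offline_page() hunk above establishes, reduced to its control flow.
 * Function names follow mm/memory-failure.c; the pfn_valid() check and the
 * get_online_mems()/put_online_mems() bracketing are omitted here.
 */
static int soft_offline_page_sketch(unsigned long pfn, int flags)
{
	struct page *page = pfn_to_page(pfn);
	bool try_again = true;
	int ret;

retry:
	ret = get_any_page(page, pfn, flags);	/* > 0: in use, 0: free, < 0: error */
	if (ret > 0) {
		ret = soft_offline_in_use_page(page);
	} else if (ret == 0 && soft_offline_free_page(page) && try_again) {
		/* Poisoning the free page raced with an allocation; retry once. */
		try_again = false;
		goto retry;
	}
	return ret;
}
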
diff --git a/mm/memory.c b/mm/memory.c
index fcfc4ca36eba..c48f8df6e502 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -65,7 +65,6 @@
#include <linux/gfp.h>
#include <linux/migrate.h>
#include <linux/string.h>
-#include <linux/dma-debug.h>
#include <linux/debugfs.h>
#include <linux/userfaultfd_k.h>
#include <linux/dax.h>
@@ -794,20 +793,17 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
* lock.
*/
static inline int
-copy_present_page(struct mm_struct *dst_mm, struct mm_struct *src_mm,
- pte_t *dst_pte, pte_t *src_pte,
- struct vm_area_struct *vma, struct vm_area_struct *new,
- unsigned long addr, int *rss, struct page **prealloc,
- pte_t pte, struct page *page)
+copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
+ pte_t *dst_pte, pte_t *src_pte, unsigned long addr, int *rss,
+ struct page **prealloc, pte_t pte, struct page *page)
{
+ struct mm_struct *src_mm = src_vma->vm_mm;
struct page *new_page;
- if (!is_cow_mapping(vma->vm_flags))
+ if (!is_cow_mapping(src_vma->vm_flags))
return 1;
/*
- * The trick starts.
- *
* What we want to do is to check whether this page may
* have been pinned by the parent process. If so,
* instead of wrprotect the pte on both sides, we copy
@@ -815,47 +811,16 @@ copy_present_page(struct mm_struct *dst_mm, struct mm_struct *src_mm,
* the pinned page won't be randomly replaced in the
* future.
*
- * To achieve this, we do the following:
- *
- * 1. Write-protect the pte if it's writable. This is
- * to protect concurrent write fast-gup with
- * FOLL_PIN, so that we'll fail the fast-gup with
- * the write bit removed.
- *
- * 2. Check page_maybe_dma_pinned() to see whether this
- * page may have been pinned.
- *
- * The order of these steps is important to serialize
- * against the fast-gup code (gup_pte_range()) on the
- * pte check and try_grab_compound_head(), so that
- * we'll make sure either we'll capture that fast-gup
- * so we'll copy the pinned page here, or we'll fail
- * that fast-gup.
- *
- * NOTE! Even if we don't end up copying the page,
- * we won't undo this wrprotect(), because the normal
- * reference copy will need it anyway.
- */
- if (pte_write(pte))
- ptep_set_wrprotect(src_mm, addr, src_pte);
-
- /*
- * These are the "normally we can just copy by reference"
- * checks.
+ * The page pinning checks are just "has this mm ever
+ * seen pinning", along with the (inexact) check of
+	 * the page count. That might give false positives
+	 * for pinning, but it will work correctly.
*/
if (likely(!atomic_read(&src_mm->has_pinned)))
return 1;
if (likely(!page_maybe_dma_pinned(page)))
return 1;
- /*
- * Uhhuh. It looks like the page might be a pinned page,
- * and we actually need to copy it. Now we can set the
- * source pte back to being writable.
- */
- if (pte_write(pte))
- set_pte_at(src_mm, addr, src_pte, pte);
-
new_page = *prealloc;
if (!new_page)
return -EAGAIN;
@@ -865,16 +830,16 @@ copy_present_page(struct mm_struct *dst_mm, struct mm_struct *src_mm,
* over and copy the page & arm it.
*/
*prealloc = NULL;
- copy_user_highpage(new_page, page, addr, vma);
+ copy_user_highpage(new_page, page, addr, src_vma);
__SetPageUptodate(new_page);
- page_add_new_anon_rmap(new_page, new, addr, false);
- lru_cache_add_inactive_or_unevictable(new_page, new);
+ page_add_new_anon_rmap(new_page, dst_vma, addr, false);
+ lru_cache_add_inactive_or_unevictable(new_page, dst_vma);
rss[mm_counter(new_page)]++;
/* All done, just insert the new page copy in the child */
- pte = mk_pte(new_page, new->vm_page_prot);
- pte = maybe_mkwrite(pte_mkdirty(pte), new);
- set_pte_at(dst_mm, addr, dst_pte, pte);
+ pte = mk_pte(new_page, dst_vma->vm_page_prot);
+ pte = maybe_mkwrite(pte_mkdirty(pte), dst_vma);
+ set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte);
return 0;
}
@@ -883,24 +848,21 @@ copy_present_page(struct mm_struct *dst_mm, struct mm_struct *src_mm,
* is required to copy this pte.
*/
static inline int
-copy_present_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
- pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
- struct vm_area_struct *new,
- unsigned long addr, int *rss, struct page **prealloc)
+copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
+ pte_t *dst_pte, pte_t *src_pte, unsigned long addr, int *rss,
+ struct page **prealloc)
{
- unsigned long vm_flags = vma->vm_flags;
+ struct mm_struct *src_mm = src_vma->vm_mm;
+ unsigned long vm_flags = src_vma->vm_flags;
pte_t pte = *src_pte;
struct page *page;
- page = vm_normal_page(vma, addr, pte);
+ page = vm_normal_page(src_vma, addr, pte);
if (page) {
int retval;
- retval = copy_present_page(dst_mm, src_mm,
- dst_pte, src_pte,
- vma, new,
- addr, rss, prealloc,
- pte, page);
+ retval = copy_present_page(dst_vma, src_vma, dst_pte, src_pte,
+ addr, rss, prealloc, pte, page);
if (retval <= 0)
return retval;
@@ -934,7 +896,7 @@ copy_present_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
if (!(vm_flags & VM_UFFD_WP))
pte = pte_clear_uffd_wp(pte);
- set_pte_at(dst_mm, addr, dst_pte, pte);
+ set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte);
return 0;
}
@@ -957,11 +919,13 @@ page_copy_prealloc(struct mm_struct *src_mm, struct vm_area_struct *vma,
return new_page;
}
-static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
- pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
- struct vm_area_struct *new,
- unsigned long addr, unsigned long end)
+static int
+copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
+ pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
+ unsigned long end)
{
+ struct mm_struct *dst_mm = dst_vma->vm_mm;
+ struct mm_struct *src_mm = src_vma->vm_mm;
pte_t *orig_src_pte, *orig_dst_pte;
pte_t *src_pte, *dst_pte;
spinlock_t *src_ptl, *dst_ptl;
@@ -1004,15 +968,15 @@ again:
if (unlikely(!pte_present(*src_pte))) {
entry.val = copy_nonpresent_pte(dst_mm, src_mm,
dst_pte, src_pte,
- vma, addr, rss);
+ src_vma, addr, rss);
if (entry.val)
break;
progress += 8;
continue;
}
/* copy_present_pte() will clear `*prealloc' if consumed */
- ret = copy_present_pte(dst_mm, src_mm, dst_pte, src_pte,
- vma, new, addr, rss, &prealloc);
+ ret = copy_present_pte(dst_vma, src_vma, dst_pte, src_pte,
+ addr, rss, &prealloc);
/*
* If we need a pre-allocated page for this pte, drop the
* locks, allocate, and try again.
@@ -1047,7 +1011,7 @@ again:
entry.val = 0;
} else if (ret) {
WARN_ON_ONCE(ret != -EAGAIN);
- prealloc = page_copy_prealloc(src_mm, vma, addr);
+ prealloc = page_copy_prealloc(src_mm, src_vma, addr);
if (!prealloc)
return -ENOMEM;
/* We've captured and resolved the error. Reset, try again. */
@@ -1061,11 +1025,13 @@ out:
return ret;
}
-static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
- pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma,
- struct vm_area_struct *new,
- unsigned long addr, unsigned long end)
+static inline int
+copy_pmd_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
+ pud_t *dst_pud, pud_t *src_pud, unsigned long addr,
+ unsigned long end)
{
+ struct mm_struct *dst_mm = dst_vma->vm_mm;
+ struct mm_struct *src_mm = src_vma->vm_mm;
pmd_t *src_pmd, *dst_pmd;
unsigned long next;
@@ -1078,9 +1044,9 @@ static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src
if (is_swap_pmd(*src_pmd) || pmd_trans_huge(*src_pmd)
|| pmd_devmap(*src_pmd)) {
int err;
- VM_BUG_ON_VMA(next-addr != HPAGE_PMD_SIZE, vma);
+ VM_BUG_ON_VMA(next-addr != HPAGE_PMD_SIZE, src_vma);
err = copy_huge_pmd(dst_mm, src_mm,
- dst_pmd, src_pmd, addr, vma);
+ dst_pmd, src_pmd, addr, src_vma);
if (err == -ENOMEM)
return -ENOMEM;
if (!err)
@@ -1089,18 +1055,20 @@ static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src
}
if (pmd_none_or_clear_bad(src_pmd))
continue;
- if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd,
- vma, new, addr, next))
+ if (copy_pte_range(dst_vma, src_vma, dst_pmd, src_pmd,
+ addr, next))
return -ENOMEM;
} while (dst_pmd++, src_pmd++, addr = next, addr != end);
return 0;
}
-static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
- p4d_t *dst_p4d, p4d_t *src_p4d, struct vm_area_struct *vma,
- struct vm_area_struct *new,
- unsigned long addr, unsigned long end)
+static inline int
+copy_pud_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
+ p4d_t *dst_p4d, p4d_t *src_p4d, unsigned long addr,
+ unsigned long end)
{
+ struct mm_struct *dst_mm = dst_vma->vm_mm;
+ struct mm_struct *src_mm = src_vma->vm_mm;
pud_t *src_pud, *dst_pud;
unsigned long next;
@@ -1113,9 +1081,9 @@ static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src
if (pud_trans_huge(*src_pud) || pud_devmap(*src_pud)) {
int err;
- VM_BUG_ON_VMA(next-addr != HPAGE_PUD_SIZE, vma);
+ VM_BUG_ON_VMA(next-addr != HPAGE_PUD_SIZE, src_vma);
err = copy_huge_pud(dst_mm, src_mm,
- dst_pud, src_pud, addr, vma);
+ dst_pud, src_pud, addr, src_vma);
if (err == -ENOMEM)
return -ENOMEM;
if (!err)
@@ -1124,18 +1092,19 @@ static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src
}
if (pud_none_or_clear_bad(src_pud))
continue;
- if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud,
- vma, new, addr, next))
+ if (copy_pmd_range(dst_vma, src_vma, dst_pud, src_pud,
+ addr, next))
return -ENOMEM;
} while (dst_pud++, src_pud++, addr = next, addr != end);
return 0;
}
-static inline int copy_p4d_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
- pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma,
- struct vm_area_struct *new,
- unsigned long addr, unsigned long end)
+static inline int
+copy_p4d_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
+ pgd_t *dst_pgd, pgd_t *src_pgd, unsigned long addr,
+ unsigned long end)
{
+ struct mm_struct *dst_mm = dst_vma->vm_mm;
p4d_t *src_p4d, *dst_p4d;
unsigned long next;
@@ -1147,20 +1116,22 @@ static inline int copy_p4d_range(struct mm_struct *dst_mm, struct mm_struct *src
next = p4d_addr_end(addr, end);
if (p4d_none_or_clear_bad(src_p4d))
continue;
- if (copy_pud_range(dst_mm, src_mm, dst_p4d, src_p4d,
- vma, new, addr, next))
+ if (copy_pud_range(dst_vma, src_vma, dst_p4d, src_p4d,
+ addr, next))
return -ENOMEM;
} while (dst_p4d++, src_p4d++, addr = next, addr != end);
return 0;
}
-int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
- struct vm_area_struct *vma, struct vm_area_struct *new)
+int
+copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
{
pgd_t *src_pgd, *dst_pgd;
unsigned long next;
- unsigned long addr = vma->vm_start;
- unsigned long end = vma->vm_end;
+ unsigned long addr = src_vma->vm_start;
+ unsigned long end = src_vma->vm_end;
+ struct mm_struct *dst_mm = dst_vma->vm_mm;
+ struct mm_struct *src_mm = src_vma->vm_mm;
struct mmu_notifier_range range;
bool is_cow;
int ret;
@@ -1171,19 +1142,19 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
* readonly mappings. The tradeoff is that copy_page_range is more
* efficient than faulting.
*/
- if (!(vma->vm_flags & (VM_HUGETLB | VM_PFNMAP | VM_MIXEDMAP)) &&
- !vma->anon_vma)
+ if (!(src_vma->vm_flags & (VM_HUGETLB | VM_PFNMAP | VM_MIXEDMAP)) &&
+ !src_vma->anon_vma)
return 0;
- if (is_vm_hugetlb_page(vma))
- return copy_hugetlb_page_range(dst_mm, src_mm, vma);
+ if (is_vm_hugetlb_page(src_vma))
+ return copy_hugetlb_page_range(dst_mm, src_mm, src_vma);
- if (unlikely(vma->vm_flags & VM_PFNMAP)) {
+ if (unlikely(src_vma->vm_flags & VM_PFNMAP)) {
/*
* We do not free on error cases below as remove_vma
* gets called on error from higher level routine
*/
- ret = track_pfn_copy(vma);
+ ret = track_pfn_copy(src_vma);
if (ret)
return ret;
}
@@ -1194,11 +1165,11 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
* parent mm. And a permission downgrade will only happen if
* is_cow_mapping() returns true.
*/
- is_cow = is_cow_mapping(vma->vm_flags);
+ is_cow = is_cow_mapping(src_vma->vm_flags);
if (is_cow) {
mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE,
- 0, vma, src_mm, addr, end);
+ 0, src_vma, src_mm, addr, end);
mmu_notifier_invalidate_range_start(&range);
}
@@ -1209,8 +1180,8 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
next = pgd_addr_end(addr, end);
if (pgd_none_or_clear_bad(src_pgd))
continue;
- if (unlikely(copy_p4d_range(dst_mm, src_mm, dst_pgd, src_pgd,
- vma, new, addr, next))) {
+ if (unlikely(copy_p4d_range(dst_vma, src_vma, dst_pgd, src_pgd,
+ addr, next))) {
ret = -ENOMEM;
break;
}
@@ -2420,13 +2391,15 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
arch_enter_lazy_mmu_mode();
- do {
- if (create || !pte_none(*pte)) {
- err = fn(pte++, addr, data);
- if (err)
- break;
- }
- } while (addr += PAGE_SIZE, addr != end);
+ if (fn) {
+ do {
+ if (create || !pte_none(*pte)) {
+ err = fn(pte++, addr, data);
+ if (err)
+ break;
+ }
+ } while (addr += PAGE_SIZE, addr != end);
+ }
*mask |= PGTBL_PTE_MODIFIED;
arch_leave_lazy_mmu_mode();
@@ -3622,7 +3595,7 @@ static vm_fault_t __do_fault(struct vm_fault *vmf)
* unlock_page(A)
* lock_page(B)
* lock_page(B)
- * pte_alloc_pne
+ * pte_alloc_one
* shrink_page_list
* wait_on_page_writeback(A)
* SetPageWriteback(B)
@@ -3630,7 +3603,7 @@ static vm_fault_t __do_fault(struct vm_fault *vmf)
* # flush A, B to clear the writeback
*/
if (pmd_none(*vmf->pmd) && !vmf->prealloc_pte) {
- vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm);
+ vmf->prealloc_pte = pte_alloc_one(vma->vm_mm);
if (!vmf->prealloc_pte)
return VM_FAULT_OOM;
smp_wmb(); /* See comment in __pte_alloc() */
@@ -3738,13 +3711,14 @@ static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
pmd_t entry;
int i;
- vm_fault_t ret;
+ vm_fault_t ret = VM_FAULT_FALLBACK;
if (!transhuge_vma_suitable(vma, haddr))
- return VM_FAULT_FALLBACK;
+ return ret;
- ret = VM_FAULT_FALLBACK;
page = compound_head(page);
+ if (compound_order(page) != HPAGE_PMD_ORDER)
+ return ret;
/*
	 * Archs like ppc64 need additional space to store information
@@ -3797,7 +3771,7 @@ static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
/**
* alloc_set_pte - setup new PTE entry for given page and add reverse page
- * mapping. If needed, the fucntion allocates page table or use pre-allocated.
+ * mapping. If needed, the function allocates a page table or uses a pre-allocated one.
*
* @vmf: fault environment
* @page: page to map
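
/*
 * Illustrative sketch, not part of the patch: after the refactor above,
 * copy_page_range() derives both mm pointers from the VMAs, so the fork
 * path only hands over the destination and source VMAs. The helper name
 * below is hypothetical; the real caller lives in kernel/fork.c, which
 * this diff does not touch.
 */
static int dup_vma_pages(struct vm_area_struct *new_vma,
			 struct vm_area_struct *old_vma)
{
	/* VM_WIPEONFORK mappings start out empty in the child. */
	if (old_vma->vm_flags & VM_WIPEONFORK)
		return 0;
	return copy_page_range(new_vma, old_vma);
}
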
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index ce3e73e3a5c1..b44d4c7ba73b 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -105,7 +105,7 @@ static struct resource *register_memory_resource(u64 start, u64 size,
unsigned long flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
if (strcmp(resource_name, "System RAM"))
- flags |= IORESOURCE_MEM_DRIVER_MANAGED;
+ flags |= IORESOURCE_SYSRAM_DRIVER_MANAGED;
/*
* Make sure value parsed from 'mem=' only restricts memory adding
@@ -353,11 +353,19 @@ int __ref __add_pages(int nid, unsigned long pfn, unsigned long nr_pages,
#ifdef CONFIG_NUMA
int __weak memory_add_physaddr_to_nid(u64 start)
{
- pr_info_once("Unknown target node for memory at 0x%llx, assuming node 0\n",
+ pr_info_once("Unknown online node for memory at 0x%llx, assuming node 0\n",
start);
return 0;
}
EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
+
+int __weak phys_to_target_node(u64 start)
+{
+ pr_info_once("Unknown target node for memory at 0x%llx, assuming node 0\n",
+ start);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(phys_to_target_node);
#endif
/* find the smallest valid pfn in the range [start_pfn, end_pfn) */
@@ -617,31 +625,22 @@ void generic_online_page(struct page *page, unsigned int order)
}
EXPORT_SYMBOL_GPL(generic_online_page);
-static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
- void *arg)
+static void online_pages_range(unsigned long start_pfn, unsigned long nr_pages)
{
const unsigned long end_pfn = start_pfn + nr_pages;
unsigned long pfn;
- int order;
/*
- * Online the pages. The callback might decide to keep some pages
- * PG_reserved (to add them to the buddy later), but we still account
- * them as being online/belonging to this zone ("present").
+ * Online the pages in MAX_ORDER - 1 aligned chunks. The callback might
+ * decide to not expose all pages to the buddy (e.g., expose them
+ * later). We account all pages as being online and belonging to this
+ * zone ("present").
*/
- for (pfn = start_pfn; pfn < end_pfn; pfn += 1ul << order) {
- order = min(MAX_ORDER - 1, get_order(PFN_PHYS(end_pfn - pfn)));
- /* __free_pages_core() wants pfns to be aligned to the order */
- if (WARN_ON_ONCE(!IS_ALIGNED(pfn, 1ul << order)))
- order = 0;
- (*online_page_callback)(pfn_to_page(pfn), order);
- }
+ for (pfn = start_pfn; pfn < end_pfn; pfn += MAX_ORDER_NR_PAGES)
+ (*online_page_callback)(pfn_to_page(pfn), MAX_ORDER - 1);
/* mark all involved sections as online */
online_mem_sections(start_pfn, end_pfn);
-
- *(unsigned long *)arg += nr_pages;
- return 0;
}
/* check which state of node_states will be changed when online memory */
@@ -702,9 +701,14 @@ static void __meminit resize_pgdat_range(struct pglist_data *pgdat, unsigned lon
* Associate the pfn range with the given zone, initializing the memmaps
* and resizing the pgdat/zone data to span the added pages. After this
* call, all affected pages are PG_reserved.
+ *
+ * All aligned pageblocks are initialized to the specified migratetype
+ * (usually MIGRATE_MOVABLE). Besides setting the migratetype, no related
+ * zone stats (e.g., nr_isolate_pageblock) are touched.
*/
void __ref move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
- unsigned long nr_pages, struct vmem_altmap *altmap)
+ unsigned long nr_pages,
+ struct vmem_altmap *altmap, int migratetype)
{
struct pglist_data *pgdat = zone->zone_pgdat;
int nid = pgdat->node_id;
@@ -729,7 +733,7 @@ void __ref move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
* are reserved so nobody should be touching them so we should be safe
*/
memmap_init_zone(nr_pages, nid, zone_idx(zone), start_pfn,
- MEMINIT_HOTPLUG, altmap);
+ MEMINIT_HOTPLUG, altmap, migratetype);
set_zone_contiguous(zone);
}
@@ -795,17 +799,21 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
int online_type, int nid)
{
unsigned long flags;
- unsigned long onlined_pages = 0;
struct zone *zone;
int need_zonelists_rebuild = 0;
int ret;
struct memory_notify arg;
+ /* We can only online full sections (e.g., SECTION_IS_ONLINE) */
+ if (WARN_ON_ONCE(!nr_pages ||
+ !IS_ALIGNED(pfn | nr_pages, PAGES_PER_SECTION)))
+ return -EINVAL;
+
mem_hotplug_begin();
/* associate pfn range with the zone */
zone = zone_for_pfn_range(online_type, nid, pfn, nr_pages);
- move_pfn_range_to_zone(zone, pfn, nr_pages, NULL);
+ move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_ISOLATE);
arg.start_pfn = pfn;
arg.nr_pages = nr_pages;
@@ -817,6 +825,14 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
goto failed_addition;
/*
+	 * Fix up the number of isolated pageblocks before marking the sections
+	 * as online, so that undo_isolate_page_range() works correctly.
+ */
+ spin_lock_irqsave(&zone->lock, flags);
+ zone->nr_isolate_pageblock += nr_pages / pageblock_nr_pages;
+ spin_unlock_irqrestore(&zone->lock, flags);
+
+ /*
* If this zone is not populated, then it is not in zonelist.
* This means the page allocator ignores this zone.
* So, zonelist must be updated after online.
@@ -826,36 +842,29 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
setup_zone_pageset(zone);
}
- ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages,
- online_pages_range);
- if (ret) {
- /* not a single memory resource was applicable */
- if (need_zonelists_rebuild)
- zone_pcp_reset(zone);
- goto failed_addition;
- }
-
- zone->present_pages += onlined_pages;
+ online_pages_range(pfn, nr_pages);
+ zone->present_pages += nr_pages;
pgdat_resize_lock(zone->zone_pgdat, &flags);
- zone->zone_pgdat->node_present_pages += onlined_pages;
+ zone->zone_pgdat->node_present_pages += nr_pages;
pgdat_resize_unlock(zone->zone_pgdat, &flags);
- /*
- * When exposing larger, physically contiguous memory areas to the
- * buddy, shuffling in the buddy (when freeing onlined pages, putting
- * them either to the head or the tail of the freelist) is only helpful
- * for maintaining the shuffle, but not for creating the initial
- * shuffle. Shuffle the whole zone to make sure the just onlined pages
- * are properly distributed across the whole freelist.
- */
- shuffle_zone(zone);
-
node_states_set_node(nid, &arg);
if (need_zonelists_rebuild)
build_all_zonelists(NULL);
zone_pcp_update(zone);
+ /* Basic onlining is complete, allow allocation of onlined pages. */
+ undo_isolate_page_range(pfn, pfn + nr_pages, MIGRATE_MOVABLE);
+
+ /*
+	 * Freshly onlined pages aren't shuffled (e.g., all pages are placed at
+ * the tail of the freelist when undoing isolation). Shuffle the whole
+ * zone to make sure the just onlined pages are properly distributed
+ * across the whole freelist - to create an initial shuffle.
+ */
+ shuffle_zone(zone);
+
init_per_zone_wmark_min();
kswapd_run(nid);
@@ -1027,7 +1036,7 @@ static int online_memory_block(struct memory_block *mem, void *arg)
*
* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG
*/
-int __ref add_memory_resource(int nid, struct resource *res)
+int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
{
struct mhp_params params = { .pgprot = PAGE_KERNEL };
u64 start, size;
@@ -1080,9 +1089,8 @@ int __ref add_memory_resource(int nid, struct resource *res)
}
/* link memory sections under this node.*/
- ret = link_mem_sections(nid, PFN_DOWN(start), PFN_UP(start + size - 1),
- MEMINIT_HOTPLUG);
- BUG_ON(ret);
+ link_mem_sections(nid, PFN_DOWN(start), PFN_UP(start + size - 1),
+ MEMINIT_HOTPLUG);
/* create new memmap entry */
if (!strcmp(res->name, "System RAM"))
@@ -1091,6 +1099,13 @@ int __ref add_memory_resource(int nid, struct resource *res)
/* device_online() will take the lock when calling online_pages() */
mem_hotplug_done();
+ /*
+ * In case we're allowed to merge the resource, flag it and trigger
+ * merging now that adding succeeded.
+ */
+ if (mhp_flags & MEMHP_MERGE_RESOURCE)
+ merge_system_ram_resource(res);
+
/* online pages if requested */
if (memhp_default_online_type != MMOP_OFFLINE)
walk_memory_blocks(start, size, NULL, online_memory_block);
@@ -1107,7 +1122,7 @@ error:
}
/* requires device_hotplug_lock, see add_memory_resource() */
-int __ref __add_memory(int nid, u64 start, u64 size)
+int __ref __add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags)
{
struct resource *res;
int ret;
@@ -1116,18 +1131,18 @@ int __ref __add_memory(int nid, u64 start, u64 size)
if (IS_ERR(res))
return PTR_ERR(res);
- ret = add_memory_resource(nid, res);
+ ret = add_memory_resource(nid, res, mhp_flags);
if (ret < 0)
release_memory_resource(res);
return ret;
}
-int add_memory(int nid, u64 start, u64 size)
+int add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags)
{
int rc;
lock_device_hotplug();
- rc = __add_memory(nid, start, size);
+ rc = __add_memory(nid, start, size, mhp_flags);
unlock_device_hotplug();
return rc;
@@ -1149,14 +1164,14 @@ EXPORT_SYMBOL_GPL(add_memory);
*
* For this memory, no entries in /sys/firmware/memmap ("raw firmware-provided
* memory map") are created. Also, the created memory resource is flagged
- * with IORESOURCE_MEM_DRIVER_MANAGED, so in-kernel users can special-case
+ * with IORESOURCE_SYSRAM_DRIVER_MANAGED, so in-kernel users can special-case
* this memory as well (esp., not place kexec images onto it).
*
* The resource_name (visible via /proc/iomem) has to have the format
* "System RAM ($DRIVER)".
*/
int add_memory_driver_managed(int nid, u64 start, u64 size,
- const char *resource_name)
+ const char *resource_name, mhp_t mhp_flags)
{
struct resource *res;
int rc;
@@ -1174,7 +1189,7 @@ int add_memory_driver_managed(int nid, u64 start, u64 size,
goto out_unlock;
}
- rc = add_memory_resource(nid, res);
+ rc = add_memory_resource(nid, res, mhp_flags);
if (rc < 0)
release_memory_resource(res);
@@ -1275,27 +1290,6 @@ found:
return 0;
}
-static struct page *new_node_page(struct page *page, unsigned long private)
-{
- nodemask_t nmask = node_states[N_MEMORY];
- struct migration_target_control mtc = {
- .nid = page_to_nid(page),
- .nmask = &nmask,
- .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
- };
-
- /*
- * try to allocate from a different node but reuse this node if there
- * are no other online nodes to be used (e.g. we are offlining a part
- * of the only existing node)
- */
- node_clear(mtc.nid, nmask);
- if (nodes_empty(nmask))
- node_set(mtc.nid, nmask);
-
- return alloc_migration_target(page, (unsigned long)&mtc);
-}
-
static int
do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
{
@@ -1355,9 +1349,28 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
put_page(page);
}
if (!list_empty(&source)) {
- /* Allocate a new page from the nearest neighbor node */
- ret = migrate_pages(&source, new_node_page, NULL, 0,
- MIGRATE_SYNC, MR_MEMORY_HOTPLUG);
+ nodemask_t nmask = node_states[N_MEMORY];
+ struct migration_target_control mtc = {
+ .nmask = &nmask,
+ .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
+ };
+
+ /*
+		 * We have checked that the migration range is on a single zone,
+		 * so we can use the nid of the first page for all the others.
+ */
+ mtc.nid = page_to_nid(list_first_entry(&source, struct page, lru));
+
+ /*
+ * try to allocate from a different node but reuse this node
+ * if there are no other online nodes to be used (e.g. we are
+ * offlining a part of the only existing node)
+ */
+ node_clear(mtc.nid, nmask);
+ if (nodes_empty(nmask))
+ node_set(mtc.nid, nmask);
+ ret = migrate_pages(&source, alloc_migration_target, NULL,
+ (unsigned long)&mtc, MIGRATE_SYNC, MR_MEMORY_HOTPLUG);
if (ret) {
list_for_each_entry(page, &source, lru) {
pr_warn("migrating pfn %lx failed ret:%d ",
@@ -1371,28 +1384,6 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
return ret;
}
-/* Mark all sections offline and remove all free pages from the buddy. */
-static int
-offline_isolated_pages_cb(unsigned long start, unsigned long nr_pages,
- void *data)
-{
- unsigned long *offlined_pages = (unsigned long *)data;
-
- *offlined_pages += __offline_isolated_pages(start, start + nr_pages);
- return 0;
-}
-
-/*
- * Check all pages in range, recorded as memory resource, are isolated.
- */
-static int
-check_pages_isolated_cb(unsigned long start_pfn, unsigned long nr_pages,
- void *data)
-{
- return test_pages_isolated(start_pfn, start_pfn + nr_pages,
- MEMORY_OFFLINE);
-}
-
static int __init cmdline_parse_movable_node(char *p)
{
movable_node_enabled = true;
@@ -1476,17 +1467,21 @@ static int count_system_ram_pages_cb(unsigned long start_pfn,
return 0;
}
-static int __ref __offline_pages(unsigned long start_pfn,
- unsigned long end_pfn)
+int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages)
{
- unsigned long pfn, nr_pages = 0;
- unsigned long offlined_pages = 0;
- int ret, node, nr_isolate_pageblock;
+ const unsigned long end_pfn = start_pfn + nr_pages;
+ unsigned long pfn, system_ram_pages = 0;
unsigned long flags;
struct zone *zone;
struct memory_notify arg;
+ int ret, node;
char *reason;
+ /* We can only offline full sections (e.g., SECTION_IS_ONLINE) */
+ if (WARN_ON_ONCE(!nr_pages ||
+ !IS_ALIGNED(start_pfn | nr_pages, PAGES_PER_SECTION)))
+ return -EINVAL;
+
mem_hotplug_begin();
/*
@@ -1497,9 +1492,9 @@ static int __ref __offline_pages(unsigned long start_pfn,
* memory holes PG_reserved, don't need pfn_valid() checks, and can
* avoid using walk_system_ram_range() later.
*/
- walk_system_ram_range(start_pfn, end_pfn - start_pfn, &nr_pages,
+ walk_system_ram_range(start_pfn, nr_pages, &system_ram_pages,
count_system_ram_pages_cb);
- if (nr_pages != end_pfn - start_pfn) {
+ if (system_ram_pages != nr_pages) {
ret = -EINVAL;
reason = "memory holes";
goto failed_removal;
@@ -1519,11 +1514,10 @@ static int __ref __offline_pages(unsigned long start_pfn,
ret = start_isolate_page_range(start_pfn, end_pfn,
MIGRATE_MOVABLE,
MEMORY_OFFLINE | REPORT_FAILURE);
- if (ret < 0) {
+ if (ret) {
reason = "failure to isolate range";
goto failed_removal;
}
- nr_isolate_pageblock = ret;
arg.start_pfn = start_pfn;
arg.nr_pages = nr_pages;
@@ -1573,9 +1567,7 @@ static int __ref __offline_pages(unsigned long start_pfn,
reason = "failure to dissolve huge pages";
goto failed_removal_isolated;
}
- /* check again */
- ret = walk_system_ram_range(start_pfn, end_pfn - start_pfn,
- NULL, check_pages_isolated_cb);
+
/*
* per-cpu pages are drained in start_isolate_page_range, but if
* there are still pages that are not free, make sure that we
@@ -1588,30 +1580,30 @@ static int __ref __offline_pages(unsigned long start_pfn,
* because has_unmovable_pages explicitly checks for
* PageBuddy on freed pages on other zones.
*/
+ ret = test_pages_isolated(start_pfn, end_pfn, MEMORY_OFFLINE);
if (ret)
drain_all_pages(zone);
} while (ret);
- /* Ok, all of our target is isolated.
- We cannot do rollback at this point. */
- walk_system_ram_range(start_pfn, end_pfn - start_pfn,
- &offlined_pages, offline_isolated_pages_cb);
- pr_info("Offlined Pages %ld\n", offlined_pages);
+ /* Mark all sections offline and remove free pages from the buddy. */
+ __offline_isolated_pages(start_pfn, end_pfn);
+ pr_info("Offlined Pages %ld\n", nr_pages);
+
/*
- * Onlining will reset pagetype flags and makes migrate type
- * MOVABLE, so just need to decrease the number of isolated
- * pageblocks zone counter here.
+	 * The memory sections are marked offline and the pageblock flags are
+	 * effectively stale; nobody should be touching them. Fix up the number
+	 * of isolated pageblocks; memory onlining will properly revert this.
*/
spin_lock_irqsave(&zone->lock, flags);
- zone->nr_isolate_pageblock -= nr_isolate_pageblock;
+ zone->nr_isolate_pageblock -= nr_pages / pageblock_nr_pages;
spin_unlock_irqrestore(&zone->lock, flags);
/* removal success */
- adjust_managed_page_count(pfn_to_page(start_pfn), -offlined_pages);
- zone->present_pages -= offlined_pages;
+ adjust_managed_page_count(pfn_to_page(start_pfn), -nr_pages);
+ zone->present_pages -= nr_pages;
pgdat_resize_lock(zone->zone_pgdat, &flags);
- zone->zone_pgdat->node_present_pages -= offlined_pages;
+ zone->zone_pgdat->node_present_pages -= nr_pages;
pgdat_resize_unlock(zone->zone_pgdat, &flags);
init_per_zone_wmark_min();
@@ -1648,11 +1640,6 @@ failed_removal:
return ret;
}
-int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
-{
- return __offline_pages(start_pfn, start_pfn + nr_pages);
-}
-
static int check_memblock_offlined_cb(struct memory_block *mem, void *arg)
{
int ret = !is_memblock_offlined(mem);
@@ -1741,26 +1728,6 @@ void try_offline_node(int nid)
}
EXPORT_SYMBOL(try_offline_node);
-static void __release_memory_resource(resource_size_t start,
- resource_size_t size)
-{
- int ret;
-
- /*
- * When removing memory in the same granularity as it was added,
- * this function never fails. It might only fail if resources
- * have to be adjusted or split. We'll ignore the error, as
- * removing of memory cannot fail.
- */
- ret = release_mem_region_adjustable(&iomem_resource, start, size);
- if (ret) {
- resource_size_t endres = start + size - 1;
-
- pr_warn("Unable to release resource <%pa-%pa> (%d)\n",
- &start, &endres, ret);
- }
-}
-
static int __ref try_remove_memory(int nid, u64 start, u64 size)
{
int rc = 0;
@@ -1794,7 +1761,7 @@ static int __ref try_remove_memory(int nid, u64 start, u64 size)
memblock_remove(start, size);
}
- __release_memory_resource(start, size);
+ release_mem_region_adjustable(start, size);
try_offline_node(nid);
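
/*
 * Illustrative sketch, not part of the patch: both hot-add entry points now
 * take an mhp_t flags argument. MEMHP_MERGE_RESOURCE appears in the hunks
 * above; MHP_NONE ("no special handling") is assumed to be defined next to
 * the mhp_t typedef in include/linux/memory_hotplug.h, which this diff does
 * not show. Node id, start and size are placeholders.
 */
static int example_hot_add(int nid, u64 start, u64 size)
{
	/* Plain hot-add: behaves exactly as before the flags argument existed. */
	return add_memory(nid, start, size, MHP_NONE);
}

static int example_driver_managed_add(int nid, u64 start, u64 size)
{
	/* Ask for the new resource to be merged into adjacent System RAM. */
	return add_memory_driver_managed(nid, start, size,
					 "System RAM (example)",
					 MEMHP_MERGE_RESOURCE);
}
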
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index eddbe4e56c73..3fde772ef5ef 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -875,13 +875,12 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
goto out;
}
- task_lock(current);
ret = mpol_set_nodemask(new, nodes, scratch);
if (ret) {
- task_unlock(current);
mpol_put(new);
goto out;
}
+ task_lock(current);
old = current->mempolicy;
current->mempolicy = new;
if (new && new->mode == MPOL_INTERLEAVE)
@@ -1324,9 +1323,7 @@ static long do_mbind(unsigned long start, unsigned long len,
NODEMASK_SCRATCH(scratch);
if (scratch) {
mmap_write_lock(mm);
- task_lock(current);
err = mpol_set_nodemask(new, nmask, scratch);
- task_unlock(current);
if (err)
mmap_write_unlock(mm);
} else
@@ -1885,8 +1882,7 @@ nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
}
/* Return the node id preferred by the given mempolicy, or the given id */
-static int policy_node(gfp_t gfp, struct mempolicy *policy,
- int nd)
+static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd)
{
if (policy->mode == MPOL_PREFERRED && !(policy->flags & MPOL_F_LOCAL))
nd = policy->v.preferred_node;
diff --git a/mm/mempool.c b/mm/mempool.c
index 79bff63ecf27..f473cdddaff0 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -58,11 +58,10 @@ static void __check_element(mempool_t *pool, void *element, size_t size)
static void check_element(mempool_t *pool, void *element)
{
/* Mempools backed by slab allocator */
- if (pool->free == mempool_free_slab || pool->free == mempool_kfree)
+ if (pool->free == mempool_free_slab || pool->free == mempool_kfree) {
__check_element(pool, element, ksize(element));
-
- /* Mempools backed by page allocator */
- if (pool->free == mempool_free_pages) {
+ } else if (pool->free == mempool_free_pages) {
+ /* Mempools backed by page allocator */
int order = (int)(long)pool->pool_data;
void *addr = kmap_atomic((struct page *)element);
@@ -82,11 +81,10 @@ static void __poison_element(void *element, size_t size)
static void poison_element(mempool_t *pool, void *element)
{
/* Mempools backed by slab allocator */
- if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc)
+ if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc) {
__poison_element(element, ksize(element));
-
- /* Mempools backed by page allocator */
- if (pool->alloc == mempool_alloc_pages) {
+ } else if (pool->alloc == mempool_alloc_pages) {
+ /* Mempools backed by page allocator */
int order = (int)(long)pool->pool_data;
void *addr = kmap_atomic((struct page *)element);
@@ -107,7 +105,7 @@ static __always_inline void kasan_poison_element(mempool_t *pool, void *element)
{
if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc)
kasan_poison_kfree(element, _RET_IP_);
- if (pool->alloc == mempool_alloc_pages)
+ else if (pool->alloc == mempool_alloc_pages)
kasan_free_pages(element, (unsigned long)pool->pool_data);
}
@@ -115,7 +113,7 @@ static void kasan_unpoison_element(mempool_t *pool, void *element)
{
if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc)
kasan_unpoison_slab(element);
- if (pool->alloc == mempool_alloc_pages)
+ else if (pool->alloc == mempool_alloc_pages)
kasan_alloc_pages(element, (unsigned long)pool->pool_data);
}
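
/*
 * Illustrative sketch, not part of the patch: the branches above dispatch on
 * how the mempool was created. A page-backed pool stores the allocation
 * order in pool_data, which is what check_element() and poison_element()
 * cast back out. The element count and order below are arbitrary examples.
 */
static mempool_t *example_page_pool(void)
{
	/* 16 reserved elements, each an order-2 (four page) allocation. */
	return mempool_create(16, mempool_alloc_pages, mempool_free_pages,
			      (void *)(long)2);
}
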
diff --git a/mm/memremap.c b/mm/memremap.c
index 006dace60b1a..73a206d0f645 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -40,12 +40,10 @@ EXPORT_SYMBOL_GPL(memremap_compat_align);
#ifdef CONFIG_DEV_PAGEMAP_OPS
DEFINE_STATIC_KEY_FALSE(devmap_managed_key);
EXPORT_SYMBOL(devmap_managed_key);
-static atomic_t devmap_managed_enable;
static void devmap_managed_enable_put(void)
{
- if (atomic_dec_and_test(&devmap_managed_enable))
- static_branch_disable(&devmap_managed_key);
+ static_branch_dec(&devmap_managed_key);
}
static int devmap_managed_enable_get(struct dev_pagemap *pgmap)
@@ -56,8 +54,7 @@ static int devmap_managed_enable_get(struct dev_pagemap *pgmap)
return -EINVAL;
}
- if (atomic_inc_return(&devmap_managed_enable) == 1)
- static_branch_enable(&devmap_managed_key);
+ static_branch_inc(&devmap_managed_key);
return 0;
}
#else
@@ -70,24 +67,28 @@ static void devmap_managed_enable_put(void)
}
#endif /* CONFIG_DEV_PAGEMAP_OPS */
-static void pgmap_array_delete(struct resource *res)
+static void pgmap_array_delete(struct range *range)
{
- xa_store_range(&pgmap_array, PHYS_PFN(res->start), PHYS_PFN(res->end),
+ xa_store_range(&pgmap_array, PHYS_PFN(range->start), PHYS_PFN(range->end),
NULL, GFP_KERNEL);
synchronize_rcu();
}
-static unsigned long pfn_first(struct dev_pagemap *pgmap)
+static unsigned long pfn_first(struct dev_pagemap *pgmap, int range_id)
{
- return PHYS_PFN(pgmap->res.start) +
- vmem_altmap_offset(pgmap_altmap(pgmap));
+ struct range *range = &pgmap->ranges[range_id];
+ unsigned long pfn = PHYS_PFN(range->start);
+
+ if (range_id)
+ return pfn;
+ return pfn + vmem_altmap_offset(pgmap_altmap(pgmap));
}
-static unsigned long pfn_end(struct dev_pagemap *pgmap)
+static unsigned long pfn_end(struct dev_pagemap *pgmap, int range_id)
{
- const struct resource *res = &pgmap->res;
+ const struct range *range = &pgmap->ranges[range_id];
- return (res->start + resource_size(res)) >> PAGE_SHIFT;
+ return (range->start + range_len(range)) >> PAGE_SHIFT;
}
static unsigned long pfn_next(unsigned long pfn)
@@ -97,8 +98,8 @@ static unsigned long pfn_next(unsigned long pfn)
return pfn + 1;
}
-#define for_each_device_pfn(pfn, map) \
- for (pfn = pfn_first(map); pfn < pfn_end(map); pfn = pfn_next(pfn))
+#define for_each_device_pfn(pfn, map, i) \
+ for (pfn = pfn_first(map, i); pfn < pfn_end(map, i); pfn = pfn_next(pfn))
static void dev_pagemap_kill(struct dev_pagemap *pgmap)
{
@@ -124,39 +125,49 @@ static void dev_pagemap_cleanup(struct dev_pagemap *pgmap)
pgmap->ref = NULL;
}
-void memunmap_pages(struct dev_pagemap *pgmap)
+static void pageunmap_range(struct dev_pagemap *pgmap, int range_id)
{
- struct resource *res = &pgmap->res;
+ struct range *range = &pgmap->ranges[range_id];
struct page *first_page;
- unsigned long pfn;
int nid;
- dev_pagemap_kill(pgmap);
- for_each_device_pfn(pfn, pgmap)
- put_page(pfn_to_page(pfn));
- dev_pagemap_cleanup(pgmap);
-
/* make sure to access a memmap that was actually initialized */
- first_page = pfn_to_page(pfn_first(pgmap));
+ first_page = pfn_to_page(pfn_first(pgmap, range_id));
/* pages are dead and unused, undo the arch mapping */
nid = page_to_nid(first_page);
mem_hotplug_begin();
- remove_pfn_range_from_zone(page_zone(first_page), PHYS_PFN(res->start),
- PHYS_PFN(resource_size(res)));
+ remove_pfn_range_from_zone(page_zone(first_page), PHYS_PFN(range->start),
+ PHYS_PFN(range_len(range)));
if (pgmap->type == MEMORY_DEVICE_PRIVATE) {
- __remove_pages(PHYS_PFN(res->start),
- PHYS_PFN(resource_size(res)), NULL);
+ __remove_pages(PHYS_PFN(range->start),
+ PHYS_PFN(range_len(range)), NULL);
} else {
- arch_remove_memory(nid, res->start, resource_size(res),
+ arch_remove_memory(nid, range->start, range_len(range),
pgmap_altmap(pgmap));
- kasan_remove_zero_shadow(__va(res->start), resource_size(res));
+ kasan_remove_zero_shadow(__va(range->start), range_len(range));
}
mem_hotplug_done();
- untrack_pfn(NULL, PHYS_PFN(res->start), resource_size(res));
- pgmap_array_delete(res);
+ untrack_pfn(NULL, PHYS_PFN(range->start), range_len(range));
+ pgmap_array_delete(range);
+}
+
+void memunmap_pages(struct dev_pagemap *pgmap)
+{
+ unsigned long pfn;
+ int i;
+
+ dev_pagemap_kill(pgmap);
+ for (i = 0; i < pgmap->nr_range; i++)
+ for_each_device_pfn(pfn, pgmap, i)
+ put_page(pfn_to_page(pfn));
+ dev_pagemap_cleanup(pgmap);
+
+ for (i = 0; i < pgmap->nr_range; i++)
+ pageunmap_range(pgmap, i);
+
WARN_ONCE(pgmap->altmap.alloc, "failed to free all reserved pages\n");
devmap_managed_enable_put();
}
@@ -175,6 +186,115 @@ static void dev_pagemap_percpu_release(struct percpu_ref *ref)
complete(&pgmap->done);
}
+static int pagemap_range(struct dev_pagemap *pgmap, struct mhp_params *params,
+ int range_id, int nid)
+{
+ struct range *range = &pgmap->ranges[range_id];
+ struct dev_pagemap *conflict_pgmap;
+ int error, is_ram;
+
+ if (WARN_ONCE(pgmap_altmap(pgmap) && range_id > 0,
+ "altmap not supported for multiple ranges\n"))
+ return -EINVAL;
+
+ conflict_pgmap = get_dev_pagemap(PHYS_PFN(range->start), NULL);
+ if (conflict_pgmap) {
+ WARN(1, "Conflicting mapping in same section\n");
+ put_dev_pagemap(conflict_pgmap);
+ return -ENOMEM;
+ }
+
+ conflict_pgmap = get_dev_pagemap(PHYS_PFN(range->end), NULL);
+ if (conflict_pgmap) {
+ WARN(1, "Conflicting mapping in same section\n");
+ put_dev_pagemap(conflict_pgmap);
+ return -ENOMEM;
+ }
+
+ is_ram = region_intersects(range->start, range_len(range),
+ IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE);
+
+ if (is_ram != REGION_DISJOINT) {
+ WARN_ONCE(1, "attempted on %s region %#llx-%#llx\n",
+ is_ram == REGION_MIXED ? "mixed" : "ram",
+ range->start, range->end);
+ return -ENXIO;
+ }
+
+ error = xa_err(xa_store_range(&pgmap_array, PHYS_PFN(range->start),
+ PHYS_PFN(range->end), pgmap, GFP_KERNEL));
+ if (error)
+ return error;
+
+ if (nid < 0)
+ nid = numa_mem_id();
+
+ error = track_pfn_remap(NULL, &params->pgprot, PHYS_PFN(range->start), 0,
+ range_len(range));
+ if (error)
+ goto err_pfn_remap;
+
+ mem_hotplug_begin();
+
+ /*
+ * For device private memory we call add_pages() as we only need to
+	 * allocate and initialize struct page for the device memory. Moreover,
+	 * the device memory is inaccessible, thus we do not want to
+ * create a linear mapping for the memory like arch_add_memory()
+ * would do.
+ *
+ * For all other device memory types, which are accessible by
+ * the CPU, we do want the linear mapping and thus use
+ * arch_add_memory().
+ */
+ if (pgmap->type == MEMORY_DEVICE_PRIVATE) {
+ error = add_pages(nid, PHYS_PFN(range->start),
+ PHYS_PFN(range_len(range)), params);
+ } else {
+ error = kasan_add_zero_shadow(__va(range->start), range_len(range));
+ if (error) {
+ mem_hotplug_done();
+ goto err_kasan;
+ }
+
+ error = arch_add_memory(nid, range->start, range_len(range),
+ params);
+ }
+
+ if (!error) {
+ struct zone *zone;
+
+ zone = &NODE_DATA(nid)->node_zones[ZONE_DEVICE];
+ move_pfn_range_to_zone(zone, PHYS_PFN(range->start),
+ PHYS_PFN(range_len(range)), params->altmap,
+ MIGRATE_MOVABLE);
+ }
+
+ mem_hotplug_done();
+ if (error)
+ goto err_add_memory;
+
+ /*
+ * Initialization of the pages has been deferred until now in order
+ * to allow us to do the work while not holding the hotplug lock.
+ */
+ memmap_init_zone_device(&NODE_DATA(nid)->node_zones[ZONE_DEVICE],
+ PHYS_PFN(range->start),
+ PHYS_PFN(range_len(range)), pgmap);
+ percpu_ref_get_many(pgmap->ref, pfn_end(pgmap, range_id)
+ - pfn_first(pgmap, range_id));
+ return 0;
+
+err_add_memory:
+ kasan_remove_zero_shadow(__va(range->start), range_len(range));
+err_kasan:
+ untrack_pfn(NULL, PHYS_PFN(range->start), range_len(range));
+err_pfn_remap:
+ pgmap_array_delete(range);
+ return error;
+}
+
/*
* Not device managed version of dev_memremap_pages, undone by
* memunmap_pages(). Please use dev_memremap_pages if you have a struct
@@ -182,17 +302,16 @@ static void dev_pagemap_percpu_release(struct percpu_ref *ref)
*/
void *memremap_pages(struct dev_pagemap *pgmap, int nid)
{
- struct resource *res = &pgmap->res;
- struct dev_pagemap *conflict_pgmap;
struct mhp_params params = {
- /*
- * We do not want any optional features only our own memmap
- */
.altmap = pgmap_altmap(pgmap),
.pgprot = PAGE_KERNEL,
};
- int error, is_ram;
+ const int nr_range = pgmap->nr_range;
bool need_devmap_managed = true;
+ int error, i;
+
+ if (WARN_ONCE(!nr_range, "nr_range must be specified\n"))
+ return ERR_PTR(-EINVAL);
switch (pgmap->type) {
case MEMORY_DEVICE_PRIVATE:
@@ -251,105 +370,27 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid)
return ERR_PTR(error);
}
- conflict_pgmap = get_dev_pagemap(PHYS_PFN(res->start), NULL);
- if (conflict_pgmap) {
- WARN(1, "Conflicting mapping in same section\n");
- put_dev_pagemap(conflict_pgmap);
- error = -ENOMEM;
- goto err_array;
- }
-
- conflict_pgmap = get_dev_pagemap(PHYS_PFN(res->end), NULL);
- if (conflict_pgmap) {
- WARN(1, "Conflicting mapping in same section\n");
- put_dev_pagemap(conflict_pgmap);
- error = -ENOMEM;
- goto err_array;
- }
-
- is_ram = region_intersects(res->start, resource_size(res),
- IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE);
-
- if (is_ram != REGION_DISJOINT) {
- WARN_ONCE(1, "%s attempted on %s region %pr\n", __func__,
- is_ram == REGION_MIXED ? "mixed" : "ram", res);
- error = -ENXIO;
- goto err_array;
- }
-
- error = xa_err(xa_store_range(&pgmap_array, PHYS_PFN(res->start),
- PHYS_PFN(res->end), pgmap, GFP_KERNEL));
- if (error)
- goto err_array;
-
- if (nid < 0)
- nid = numa_mem_id();
-
- error = track_pfn_remap(NULL, &params.pgprot, PHYS_PFN(res->start),
- 0, resource_size(res));
- if (error)
- goto err_pfn_remap;
-
- mem_hotplug_begin();
-
/*
- * For device private memory we call add_pages() as we only need to
- * allocate and initialize struct page for the device memory. More-
- * over the device memory is un-accessible thus we do not want to
- * create a linear mapping for the memory like arch_add_memory()
- * would do.
- *
- * For all other device memory types, which are accessible by
- * the CPU, we do want the linear mapping and thus use
- * arch_add_memory().
+ * Clear the pgmap nr_range as it will be incremented for each
+ * successfully processed range. This communicates how many
+ * regions to unwind in the abort case.
*/
- if (pgmap->type == MEMORY_DEVICE_PRIVATE) {
- error = add_pages(nid, PHYS_PFN(res->start),
- PHYS_PFN(resource_size(res)), &params);
- } else {
- error = kasan_add_zero_shadow(__va(res->start), resource_size(res));
- if (error) {
- mem_hotplug_done();
- goto err_kasan;
- }
-
- error = arch_add_memory(nid, res->start, resource_size(res),
- &params);
+ pgmap->nr_range = 0;
+ error = 0;
+ for (i = 0; i < nr_range; i++) {
+ error = pagemap_range(pgmap, &params, i, nid);
+ if (error)
+ break;
+ pgmap->nr_range++;
}
- if (!error) {
- struct zone *zone;
-
- zone = &NODE_DATA(nid)->node_zones[ZONE_DEVICE];
- move_pfn_range_to_zone(zone, PHYS_PFN(res->start),
- PHYS_PFN(resource_size(res)), params.altmap);
+ if (i < nr_range) {
+ memunmap_pages(pgmap);
+ pgmap->nr_range = nr_range;
+ return ERR_PTR(error);
}
- mem_hotplug_done();
- if (error)
- goto err_add_memory;
-
- /*
- * Initialization of the pages has been deferred until now in order
- * to allow us to do the work while not holding the hotplug lock.
- */
- memmap_init_zone_device(&NODE_DATA(nid)->node_zones[ZONE_DEVICE],
- PHYS_PFN(res->start),
- PHYS_PFN(resource_size(res)), pgmap);
- percpu_ref_get_many(pgmap->ref, pfn_end(pgmap) - pfn_first(pgmap));
- return __va(res->start);
-
- err_add_memory:
- kasan_remove_zero_shadow(__va(res->start), resource_size(res));
- err_kasan:
- untrack_pfn(NULL, PHYS_PFN(res->start), resource_size(res));
- err_pfn_remap:
- pgmap_array_delete(res);
- err_array:
- dev_pagemap_kill(pgmap);
- dev_pagemap_cleanup(pgmap);
- devmap_managed_enable_put();
- return ERR_PTR(error);
+ return __va(pgmap->ranges[0].start);
}
EXPORT_SYMBOL_GPL(memremap_pages);
@@ -369,7 +410,7 @@ EXPORT_SYMBOL_GPL(memremap_pages);
* 'live' on entry and will be killed and reaped at
* devm_memremap_pages_release() time, or if this routine fails.
*
- * 4/ res is expected to be a host memory range that could feasibly be
+ * 4/ range is expected to be a host memory range that could feasibly be
* treated as a "System RAM" range, i.e. not a device mmio range, but
* this is not enforced.
*/
@@ -426,7 +467,7 @@ struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
* In the cached case we're already holding a live reference.
*/
if (pgmap) {
- if (phys >= pgmap->res.start && phys <= pgmap->res.end)
+ if (phys >= pgmap->range.start && phys <= pgmap->range.end)
return pgmap;
put_dev_pagemap(pgmap);
}
@@ -451,8 +492,6 @@ void free_devmap_managed_page(struct page *page)
return;
}
- /* Clear Active bit in case of parallel mark_page_accessed */
- __ClearPageActive(page);
__ClearPageWaiters(page);
mem_cgroup_uncharge(page);
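
/*
 * Illustrative sketch, not part of the patch: a single-range user of the
 * reworked API fills in pgmap->range and pgmap->nr_range instead of the old
 * pgmap->res before calling memremap_pages(). pgmap->type and pgmap->ops
 * are assumed to have been set up by the caller; phys and size are
 * placeholders.
 */
static void *example_map_one_range(struct dev_pagemap *pgmap, u64 phys,
				   u64 size)
{
	pgmap->range.start = phys;
	pgmap->range.end = phys + size - 1;
	pgmap->nr_range = 1;

	/* A negative node id lets memremap_pages() pick numa_mem_id(). */
	return memremap_pages(pgmap, NUMA_NO_NODE);
}
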
diff --git a/mm/migrate.c b/mm/migrate.c
index 04a98bb2f568..5ca5842df5db 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -381,7 +381,7 @@ static int expected_page_refs(struct address_space *mapping, struct page *page)
int expected_count = 1;
/*
- * Device public or private pages have an extra refcount as they are
+ * Device private pages have an extra refcount as they are
* ZONE_DEVICE pages.
*/
expected_count += is_device_private_page(page);
@@ -503,7 +503,7 @@ int migrate_page_move_mapping(struct address_space *mapping,
__dec_lruvec_state(old_lruvec, NR_SHMEM);
__inc_lruvec_state(new_lruvec, NR_SHMEM);
}
- if (dirty && mapping_cap_account_dirty(mapping)) {
+ if (dirty && mapping_can_writeback(mapping)) {
__dec_node_state(oldzone->zone_pgdat, NR_FILE_DIRTY);
__dec_zone_state(oldzone, NR_ZONE_WRITE_PENDING);
__inc_node_state(newzone->zone_pgdat, NR_FILE_DIRTY);
@@ -1223,16 +1223,11 @@ out:
* we want to retry.
*/
if (rc == MIGRATEPAGE_SUCCESS) {
- put_page(page);
- if (reason == MR_MEMORY_FAILURE) {
+ if (reason != MR_MEMORY_FAILURE)
/*
- * Set PG_HWPoison on just freed page
- * intentionally. Although it's rather weird,
- * it's how HWPoison flag works at the moment.
+ * We release the page in page_handle_poison.
*/
- if (set_hwpoison_free_buddy_page(page))
- num_poisoned_pages_inc();
- }
+ put_page(page);
} else {
if (rc != -EAGAIN) {
if (likely(!__PageMovable(page))) {
@@ -1869,33 +1864,27 @@ static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages,
return nr_pages ? -EFAULT : 0;
}
-/*
- * Move a list of pages in the address space of the currently executing
- * process.
- */
-static int kernel_move_pages(pid_t pid, unsigned long nr_pages,
- const void __user * __user *pages,
- const int __user *nodes,
- int __user *status, int flags)
+static struct mm_struct *find_mm_struct(pid_t pid, nodemask_t *mem_nodes)
{
struct task_struct *task;
struct mm_struct *mm;
- int err;
- nodemask_t task_nodes;
- /* Check flags */
- if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL))
- return -EINVAL;
-
- if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
- return -EPERM;
+ /*
+ * There is no need to check if the current process has the right to modify
+ * the specified process when they are the same.
+ */
+ if (!pid) {
+ mmget(current->mm);
+ *mem_nodes = cpuset_mems_allowed(current);
+ return current->mm;
+ }
/* Find the mm_struct */
rcu_read_lock();
- task = pid ? find_task_by_vpid(pid) : current;
+ task = find_task_by_vpid(pid);
if (!task) {
rcu_read_unlock();
- return -ESRCH;
+ return ERR_PTR(-ESRCH);
}
get_task_struct(task);
@@ -1905,22 +1894,47 @@ static int kernel_move_pages(pid_t pid, unsigned long nr_pages,
*/
if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
rcu_read_unlock();
- err = -EPERM;
+ mm = ERR_PTR(-EPERM);
goto out;
}
rcu_read_unlock();
- err = security_task_movememory(task);
- if (err)
+ mm = ERR_PTR(security_task_movememory(task));
+ if (IS_ERR(mm))
goto out;
-
- task_nodes = cpuset_mems_allowed(task);
+ *mem_nodes = cpuset_mems_allowed(task);
mm = get_task_mm(task);
+out:
put_task_struct(task);
-
if (!mm)
+ mm = ERR_PTR(-EINVAL);
+ return mm;
+}
+
+/*
+ * Move a list of pages in the address space of the currently executing
+ * process.
+ */
+static int kernel_move_pages(pid_t pid, unsigned long nr_pages,
+ const void __user * __user *pages,
+ const int __user *nodes,
+ int __user *status, int flags)
+{
+ struct mm_struct *mm;
+ int err;
+ nodemask_t task_nodes;
+
+ /* Check flags */
+ if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL))
return -EINVAL;
+ if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
+ return -EPERM;
+
+ mm = find_mm_struct(pid, &task_nodes);
+ if (IS_ERR(mm))
+ return PTR_ERR(mm);
+
if (nodes)
err = do_pages_move(mm, task_nodes, nr_pages, pages,
nodes, status, flags);
@@ -1929,10 +1943,6 @@ static int kernel_move_pages(pid_t pid, unsigned long nr_pages,
mmput(mm);
return err;
-
-out:
- put_task_struct(task);
- return err;
}
SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
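find_mm_struct() above either returns a referenced mm_struct or encodes the error in the returned pointer, which kernel_move_pages() then unpacks with IS_ERR()/PTR_ERR(). A small self-contained C sketch of that error-pointer convention; the ERR_PTR/IS_ERR/PTR_ERR helpers are re-implemented here only for illustration (in the kernel they come from <linux/err.h>), and struct mm/find_mm are made-up stand-ins:

#include <stdio.h>
#include <errno.h>

#define MAX_ERRNO 4095

/* Minimal stand-ins for the kernel's <linux/err.h> helpers. */
static inline void *ERR_PTR(long error) { return (void *)error; }
static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
static inline int IS_ERR(const void *ptr)
{
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

struct mm { int users; };
static struct mm current_mm = { .users = 1 };

/* Hypothetical lookup: pid 0 means "the calling process". */
static struct mm *find_mm(int pid)
{
	if (pid == 0) {
		current_mm.users++;	/* analogous to mmget() */
		return &current_mm;
	}
	return ERR_PTR(-ESRCH);		/* no such task */
}

int main(void)
{
	struct mm *mm = find_mm(42);

	if (IS_ERR(mm))
		printf("lookup failed: %ld\n", PTR_ERR(mm));

	mm = find_mm(0);
	if (!IS_ERR(mm))
		printf("got mm, users=%d\n", mm->users);
	return 0;
}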
@@ -3077,7 +3087,6 @@ void migrate_vma_finalize(struct migrate_vma *migrate)
remove_migration_ptes(page, newpage, false);
unlock_page(page);
- migrate->cpages--;
if (is_zone_device_page(page))
put_page(page);
diff --git a/mm/mincore.c b/mm/mincore.c
index 453ff112470f..02db1a834021 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -48,7 +48,7 @@ static int mincore_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr,
* and is up to date; i.e. that no page-in operation would be required
* at this time if an application were to map and access this page.
*/
-static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff)
+static unsigned char mincore_page(struct address_space *mapping, pgoff_t index)
{
unsigned char present = 0;
struct page *page;
@@ -59,31 +59,7 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff)
* any other file mapping (ie. marked !present and faulted in with
* tmpfs's .fault). So swapped out tmpfs mappings are tested here.
*/
-#ifdef CONFIG_SWAP
- if (shmem_mapping(mapping)) {
- page = find_get_entry(mapping, pgoff);
- /*
- * shmem/tmpfs may return swap: account for swapcache
- * page too.
- */
- if (xa_is_value(page)) {
- swp_entry_t swp = radix_to_swp_entry(page);
- struct swap_info_struct *si;
-
- /* Prevent swap device to being swapoff under us */
- si = get_swap_device(swp);
- if (si) {
- page = find_get_page(swap_address_space(swp),
- swp_offset(swp));
- put_swap_device(si);
- } else
- page = NULL;
- }
- } else
- page = find_get_page(mapping, pgoff);
-#else
- page = find_get_page(mapping, pgoff);
-#endif
+ page = find_get_incore_page(mapping, index);
if (page) {
present = PageUptodate(page);
put_page(page);
diff --git a/mm/mmap.c b/mm/mmap.c
index 40248d84ad5f..d91ecb00d38c 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -143,7 +143,7 @@ static void __remove_shared_vm_struct(struct vm_area_struct *vma,
struct file *file, struct address_space *mapping)
{
if (vma->vm_flags & VM_DENYWRITE)
- atomic_inc(&file_inode(file)->i_writecount);
+ allow_write_access(file);
if (vma->vm_flags & VM_SHARED)
mapping_unmap_writable(mapping);
@@ -474,8 +474,12 @@ static __always_inline void vma_rb_erase_ignore(struct vm_area_struct *vma,
{
/*
* All rb_subtree_gap values must be consistent prior to erase,
- * with the possible exception of the "next" vma being erased if
- * next->vm_start was reduced.
+ * with the possible exception of
+ *
+ * a. the "next" vma being erased if next->vm_start was reduced in
+ * __vma_adjust() -> __vma_unlink()
+ * b. the vma being erased in detach_vmas_to_be_unmapped() ->
+ * vma_rb_erase()
*/
validate_mm_rb(root, ignore);
@@ -485,13 +489,7 @@ static __always_inline void vma_rb_erase_ignore(struct vm_area_struct *vma,
static __always_inline void vma_rb_erase(struct vm_area_struct *vma,
struct rb_root *root)
{
- /*
- * All rb_subtree_gap values must be consistent prior to erase,
- * with the possible exception of the vma being erased.
- */
- validate_mm_rb(root, vma);
-
- __vma_rb_erase(vma, root);
+ vma_rb_erase_ignore(vma, root, vma);
}
/*
@@ -560,6 +558,50 @@ static int find_vma_links(struct mm_struct *mm, unsigned long addr,
return 0;
}
+/*
+ * vma_next() - Get the next VMA.
+ * @mm: The mm_struct.
+ * @vma: The current vma.
+ *
+ * If @vma is NULL, return the first vma in the mm.
+ *
+ * Returns: The next VMA after @vma.
+ */
+static inline struct vm_area_struct *vma_next(struct mm_struct *mm,
+ struct vm_area_struct *vma)
+{
+ if (!vma)
+ return mm->mmap;
+
+ return vma->vm_next;
+}
+
+/*
+ * munmap_vma_range() - munmap VMAs that overlap a range.
+ * @mm: The mm struct
+ * @start: The start of the range.
+ * @len: The length of the range.
+ * @pprev: pointer to the pointer that will be set to the previous vm_area_struct
+ * @link: pointer to the rb_node link to update
+ * @parent: pointer to the parent rb_node
+ * @uf: the userfaultfd unmap event list
+ *
+ * Find all the vm_area_struct that overlap the range from @start to
+ * @start + @len and munmap them. Set @pprev to the previous vm_area_struct.
+ *
+ * Returns: -ENOMEM on munmap failure or 0 on success.
+ */
+static inline int
+munmap_vma_range(struct mm_struct *mm, unsigned long start, unsigned long len,
+ struct vm_area_struct **pprev, struct rb_node ***link,
+ struct rb_node **parent, struct list_head *uf)
+{
+
+ while (find_vma_links(mm, start, start + len, pprev, link, parent))
+ if (do_munmap(mm, start, len, uf))
+ return -ENOMEM;
+
+ return 0;
+}
static unsigned long count_vma_pages_range(struct mm_struct *mm,
unsigned long addr, unsigned long end)
{
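vma_next() above just hides the "NULL means start at the list head" convention for the mm's VMA list. A tiny sketch of the same idiom on a generic singly linked list (struct node and node_next are illustrative names, not kernel code):

#include <stdio.h>
#include <stddef.h>

struct node { int val; struct node *next; };

/* NULL means "give me the first element", mirroring vma_next(). */
static struct node *node_next(struct node *head, struct node *cur)
{
	return cur ? cur->next : head;
}

int main(void)
{
	struct node c = { 3, NULL }, b = { 2, &c }, a = { 1, &b };
	struct node *head = &a, *it = NULL;

	while ((it = node_next(head, it)))
		printf("%d\n", it->val);
	return 0;
}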
@@ -621,9 +663,9 @@ static void __vma_link_file(struct vm_area_struct *vma)
struct address_space *mapping = file->f_mapping;
if (vma->vm_flags & VM_DENYWRITE)
- atomic_dec(&file_inode(file)->i_writecount);
+ put_write_access(file_inode(file));
if (vma->vm_flags & VM_SHARED)
- atomic_inc(&mapping->i_mmap_writable);
+ mapping_allow_writable(mapping);
flush_dcache_mmap_lock(mapping);
vma_interval_tree_insert(vma, &mapping->i_mmap);
@@ -677,7 +719,7 @@ static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
mm->map_count++;
}
-static __always_inline void __vma_unlink_common(struct mm_struct *mm,
+static __always_inline void __vma_unlink(struct mm_struct *mm,
struct vm_area_struct *vma,
struct vm_area_struct *ignore)
{
@@ -760,7 +802,7 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
* vma expands, overlapping part of the next:
* mprotect case 5 shifting the boundary up.
*/
- adjust_next = (end - next->vm_start) >> PAGE_SHIFT;
+ adjust_next = (end - next->vm_start);
exporter = next;
importer = vma;
VM_WARN_ON(expand != importer);
@@ -770,7 +812,7 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
* split_vma inserting another: so it must be
* mprotect case 4 shifting the boundary down.
*/
- adjust_next = -((vma->vm_end - end) >> PAGE_SHIFT);
+ adjust_next = -(vma->vm_end - end);
exporter = vma;
importer = next;
VM_WARN_ON(expand != importer);
@@ -825,7 +867,7 @@ again:
anon_vma_interval_tree_pre_update_vma(next);
}
- if (root) {
+ if (file) {
flush_dcache_mmap_lock(mapping);
vma_interval_tree_remove(vma, root);
if (adjust_next)
@@ -842,11 +884,11 @@ again:
}
vma->vm_pgoff = pgoff;
if (adjust_next) {
- next->vm_start += adjust_next << PAGE_SHIFT;
- next->vm_pgoff += adjust_next;
+ next->vm_start += adjust_next;
+ next->vm_pgoff += adjust_next >> PAGE_SHIFT;
}
- if (root) {
+ if (file) {
if (adjust_next)
vma_interval_tree_insert(next, root);
vma_interval_tree_insert(vma, root);
@@ -859,7 +901,7 @@ again:
* us to remove next before dropping the locks.
*/
if (remove_next != 3)
- __vma_unlink_common(mm, next, next);
+ __vma_unlink(mm, next, next);
else
/*
* vma is not before next if they've been
@@ -870,7 +912,7 @@ again:
* "next" (which is stored in post-swap()
* "vma").
*/
- __vma_unlink_common(mm, next, vma);
+ __vma_unlink(mm, next, vma);
if (file)
__remove_shared_vm_struct(next, file, mapping);
} else if (insert) {
@@ -897,10 +939,9 @@ again:
anon_vma_interval_tree_post_update_vma(next);
anon_vma_unlock_write(anon_vma);
}
- if (mapping)
- i_mmap_unlock_write(mapping);
- if (root) {
+ if (file) {
+ i_mmap_unlock_write(mapping);
uprobe_mmap(vma);
if (adjust_next)
@@ -1131,10 +1172,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
if (vm_flags & VM_SPECIAL)
return NULL;
- if (prev)
- next = prev->vm_next;
- else
- next = mm->mmap;
+ next = vma_next(mm, prev);
area = next;
if (area && area->vm_end == end) /* cases 6, 7, 8 */
next = next->vm_next;
@@ -1666,7 +1704,7 @@ int vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot)
/* Can the mapping track the dirty pages? */
return vma->vm_file && vma->vm_file->f_mapping &&
- mapping_cap_account_dirty(vma->vm_file->f_mapping);
+ mapping_can_writeback(vma->vm_file->f_mapping);
}
/*
@@ -1710,13 +1748,9 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
return -ENOMEM;
}
- /* Clear old maps */
- while (find_vma_links(mm, addr, addr + len, &prev, &rb_link,
- &rb_parent)) {
- if (do_munmap(mm, addr, len, uf))
- return -ENOMEM;
- }
-
+ /* Clear old maps, set up prev, rb_link, rb_parent, and uf */
+ if (munmap_vma_range(mm, addr, len, &prev, &rb_link, &rb_parent, uf))
+ return -ENOMEM;
/*
* Private writable mapping: check memory availability
*/
@@ -1781,7 +1815,11 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
merge = vma_merge(mm, prev, vma->vm_start, vma->vm_end, vma->vm_flags,
NULL, vma->vm_file, vma->vm_pgoff, NULL, NULL_VM_UFFD_CTX);
if (merge) {
- fput(file);
+ /* ->mmap() can change vma->vm_file and fput the original file. So
+ * fput the vma->vm_file here, or we would add an extra fput for the
+ * original file and ultimately cause a general protection fault.
+ */
+ fput(vma->vm_file);
vm_area_free(vma);
vma = merge;
/* Update vm_flags and possible addr to pick up the change. We don't
@@ -1812,6 +1850,15 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
vma_set_anonymous(vma);
}
+ /* Allow architectures to sanity-check the vm_flags */
+ if (!arch_validate_flags(vma->vm_flags)) {
+ error = -EINVAL;
+ if (file)
+ goto unmap_and_free_vma;
+ else
+ goto free_vma;
+ }
+
vma_link(mm, vma, prev, rb_link, rb_parent);
/* Once vma denies write, undo our temporary denial count */
if (file) {
@@ -2552,7 +2599,7 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr)
if (vma && (vma->vm_start <= addr))
return vma;
/* don't alter vm_end if the coredump is running */
- if (!prev || !mmget_still_valid(mm) || expand_stack(prev, addr))
+ if (!prev || expand_stack(prev, addr))
return NULL;
if (prev->vm_flags & VM_LOCKED)
populate_vma_page_range(prev, addr, prev->vm_end, NULL);
@@ -2578,9 +2625,6 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr)
return vma;
if (!(vma->vm_flags & VM_GROWSDOWN))
return NULL;
- /* don't alter vm_start if the coredump is running */
- if (!mmget_still_valid(mm))
- return NULL;
start = vma->vm_start;
if (expand_stack(vma, addr))
return NULL;
@@ -2625,7 +2669,7 @@ static void unmap_region(struct mm_struct *mm,
struct vm_area_struct *vma, struct vm_area_struct *prev,
unsigned long start, unsigned long end)
{
- struct vm_area_struct *next = prev ? prev->vm_next : mm->mmap;
+ struct vm_area_struct *next = vma_next(mm, prev);
struct mmu_gather tlb;
lru_add_drain();
@@ -2824,7 +2868,7 @@ int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
if (error)
return error;
}
- vma = prev ? prev->vm_next : mm->mmap;
+ vma = vma_next(mm, prev);
if (unlikely(uf)) {
/*
@@ -3042,14 +3086,9 @@ static int do_brk_flags(unsigned long addr, unsigned long len, unsigned long fla
if (error)
return error;
- /*
- * Clear old maps. this also does some error checking for us
- */
- while (find_vma_links(mm, addr, addr + len, &prev, &rb_link,
- &rb_parent)) {
- if (do_munmap(mm, addr, len, uf))
- return -ENOMEM;
- }
+ /* Clear old maps, set up prev, rb_link, rb_parent, and uf */
+ if (munmap_vma_range(mm, addr, len, &prev, &rb_link, &rb_parent, uf))
+ return -ENOMEM;
/* Check against address space limits *after* clearing old maps... */
if (!may_expand_vm(mm, flags, len >> PAGE_SHIFT))
@@ -3223,7 +3262,7 @@ int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
* By setting it to reflect the virtual start address of the
* vma, merges and splits can happen in a seamless way, just
* using the existing file pgoff checks and manipulations.
- * Similarly in do_mmap and in do_brk.
+ * Similarly in do_mmap and in do_brk_flags.
*/
if (vma_is_anonymous(vma)) {
BUG_ON(vma->anon_vma);
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 4fc918163dd3..5654dd19addc 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -913,7 +913,7 @@ static int __mmu_interval_notifier_insert(
return -EOVERFLOW;
/* Must call with a mmget() held */
- if (WARN_ON(atomic_read(&mm->mm_count) <= 0))
+ if (WARN_ON(atomic_read(&mm->mm_users) <= 0))
return -EINVAL;
/* pairs with mmdrop in mmu_interval_notifier_remove() */
diff --git a/mm/mprotect.c b/mm/mprotect.c
index ce8b8a5eacbb..56c02beb6041 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -603,6 +603,12 @@ static int do_mprotect_pkey(unsigned long start, size_t len,
goto out;
}
+ /* Allow architectures to sanity-check the new flags */
+ if (!arch_validate_flags(newflags)) {
+ error = -EINVAL;
+ goto out;
+ }
+
error = security_file_mprotect(vma, reqprot, prot);
if (error)
goto out;
diff --git a/mm/nommu.c b/mm/nommu.c
index 75a327149af1..0faf39b32cdb 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -5,7 +5,7 @@
* Replacement code for mm functions to support CPU's that don't
* have any form of memory management unit (thus no virtual memory).
*
- * See Documentation/mm/nommu-mmap.rst
+ * See Documentation/admin-guide/mm/nommu-mmap.rst
*
* Copyright (c) 2004-2008 David Howells <dhowells@redhat.com>
* Copyright (c) 2000-2003 David McCullough <davidm@snapgear.com>
@@ -354,13 +354,6 @@ void vm_unmap_aliases(void)
}
EXPORT_SYMBOL_GPL(vm_unmap_aliases);
-struct vm_struct *alloc_vm_area(size_t size, pte_t **ptes)
-{
- BUG();
- return NULL;
-}
-EXPORT_SYMBOL_GPL(alloc_vm_area);
-
void free_vm_area(struct vm_struct *area)
{
BUG();
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index e90f25d6385d..8b84661a6410 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -64,6 +64,8 @@ int sysctl_oom_dump_tasks = 1;
* and mark_oom_victim
*/
DEFINE_MUTEX(oom_lock);
+/* Serializes oom_score_adj and oom_score_adj_min updates */
+DEFINE_MUTEX(oom_adj_mutex);
static inline bool is_memcg_oom(struct oom_control *oc)
{
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 4e4ddd67b71e..7709f0e223f5 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1882,7 +1882,7 @@ void balance_dirty_pages_ratelimited(struct address_space *mapping)
int ratelimit;
int *p;
- if (!bdi_cap_account_dirty(bdi))
+ if (!(bdi->capabilities & BDI_CAP_WRITEBACK))
return;
if (inode_cgwb_enabled(inode))
@@ -2423,7 +2423,7 @@ void account_page_dirtied(struct page *page, struct address_space *mapping)
trace_writeback_dirty_page(page, mapping);
- if (mapping_cap_account_dirty(mapping)) {
+ if (mapping_can_writeback(mapping)) {
struct bdi_writeback *wb;
inode_attach_wb(inode, page);
@@ -2450,7 +2450,7 @@ void account_page_dirtied(struct page *page, struct address_space *mapping)
void account_page_cleaned(struct page *page, struct address_space *mapping,
struct bdi_writeback *wb)
{
- if (mapping_cap_account_dirty(mapping)) {
+ if (mapping_can_writeback(mapping)) {
dec_lruvec_page_state(page, NR_FILE_DIRTY);
dec_zone_page_state(page, NR_ZONE_WRITE_PENDING);
dec_wb_stat(wb, WB_RECLAIMABLE);
@@ -2513,7 +2513,7 @@ void account_page_redirty(struct page *page)
{
struct address_space *mapping = page->mapping;
- if (mapping && mapping_cap_account_dirty(mapping)) {
+ if (mapping && mapping_can_writeback(mapping)) {
struct inode *inode = mapping->host;
struct bdi_writeback *wb;
struct wb_lock_cookie cookie = {};
@@ -2625,7 +2625,7 @@ void __cancel_dirty_page(struct page *page)
{
struct address_space *mapping = page_mapping(page);
- if (mapping_cap_account_dirty(mapping)) {
+ if (mapping_can_writeback(mapping)) {
struct inode *inode = mapping->host;
struct bdi_writeback *wb;
struct wb_lock_cookie cookie = {};
@@ -2665,7 +2665,7 @@ int clear_page_dirty_for_io(struct page *page)
VM_BUG_ON_PAGE(!PageLocked(page), page);
- if (mapping && mapping_cap_account_dirty(mapping)) {
+ if (mapping && mapping_can_writeback(mapping)) {
struct inode *inode = mapping->host;
struct bdi_writeback *wb;
struct wb_lock_cookie cookie = {};
@@ -2738,7 +2738,7 @@ int test_clear_page_writeback(struct page *page)
if (ret) {
__xa_clear_mark(&mapping->i_pages, page_index(page),
PAGECACHE_TAG_WRITEBACK);
- if (bdi_cap_account_writeback(bdi)) {
+ if (bdi->capabilities & BDI_CAP_WRITEBACK_ACCT) {
struct bdi_writeback *wb = inode_to_wb(inode);
dec_wb_stat(wb, WB_WRITEBACK);
@@ -2791,7 +2791,7 @@ int __test_set_page_writeback(struct page *page, bool keep_write)
PAGECACHE_TAG_WRITEBACK);
xas_set_mark(&xas, PAGECACHE_TAG_WRITEBACK);
- if (bdi_cap_account_writeback(bdi))
+ if (bdi->capabilities & BDI_CAP_WRITEBACK_ACCT)
inc_wb_stat(inode_to_wb(inode), WB_WRITEBACK);
/*
@@ -2849,7 +2849,8 @@ EXPORT_SYMBOL_GPL(wait_on_page_writeback);
*/
void wait_for_stable_page(struct page *page)
{
- if (bdi_cap_stable_pages_required(inode_to_bdi(page->mapping->host)))
+ page = thp_head(page);
+ if (page->mapping->host->i_sb->s_iflags & SB_I_STABLE_WRITES)
wait_on_page_writeback(page);
}
EXPORT_SYMBOL_GPL(wait_for_stable_page);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 6866533de8e6..23f5066bd4a5 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -69,6 +69,7 @@
#include <linux/nmi.h>
#include <linux/psi.h>
#include <linux/padata.h>
+#include <linux/khugepaged.h>
#include <asm/sections.h>
#include <asm/tlbflush.h>
@@ -77,6 +78,34 @@
#include "shuffle.h"
#include "page_reporting.h"
+/* Free Page Internal flags: for internal, non-pcp variants of free_pages(). */
+typedef int __bitwise fpi_t;
+
+/* No special request */
+#define FPI_NONE ((__force fpi_t)0)
+
+/*
+ * Skip free page reporting notification for the (possibly merged) page.
+ * This does not hinder free page reporting from grabbing the page,
+ * reporting it and marking it "reported" - it only skips notifying
+ * the free page reporting infrastructure about a newly freed page. For
+ * example, used when temporarily pulling a page from a freelist and
+ * putting it back unmodified.
+ */
+#define FPI_SKIP_REPORT_NOTIFY ((__force fpi_t)BIT(0))
+
+/*
+ * Place the (possibly merged) page to the tail of the freelist. Will ignore
+ * page shuffling (relevant code - e.g., memory onlining - is expected to
+ * shuffle the whole zone).
+ *
+ * Note: No code should rely on this flag for correctness - it's purely
+ * to allow for optimizations when handing back either fresh pages
+ * (memory onlining) or untouched pages (page isolation, free page
+ * reporting).
+ */
+#define FPI_TO_TAIL ((__force fpi_t)BIT(1))
+
/* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
static DEFINE_MUTEX(pcp_batch_high_lock);
#define MIN_PERCPU_PAGELIST_FRACTION (8)
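The fpi_t flags defined above form a typed bitmask: callers OR flags together and the free path tests individual bits. A minimal plain-C sketch of that pattern, with the sparse-only __bitwise/__force annotations dropped and only the flag handling of the free path modelled:

#include <stdio.h>

typedef unsigned int fpi_t;

#define FPI_NONE		((fpi_t)0)
#define FPI_SKIP_REPORT_NOTIFY	((fpi_t)(1u << 0))
#define FPI_TO_TAIL		((fpi_t)(1u << 1))

/* Toy free path: only the flag handling is modelled. */
static void free_one(int order, fpi_t fpi_flags)
{
	int to_tail = (fpi_flags & FPI_TO_TAIL) != 0;

	printf("order %d: %s of freelist\n", order, to_tail ? "tail" : "head");
	if (!(fpi_flags & FPI_SKIP_REPORT_NOTIFY))
		printf("order %d: notifying page reporting\n", order);
}

int main(void)
{
	free_one(0, FPI_NONE);
	/* e.g. __putback_isolated_page() passes both flags */
	free_one(3, FPI_SKIP_REPORT_NOTIFY | FPI_TO_TAIL);
	return 0;
}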
@@ -155,16 +184,16 @@ static int __init early_init_on_alloc(char *buf)
int ret;
bool bool_result;
- if (!buf)
- return -EINVAL;
ret = kstrtobool(buf, &bool_result);
+ if (ret)
+ return ret;
if (bool_result && page_poisoning_enabled())
pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, will take precedence over init_on_alloc\n");
if (bool_result)
static_branch_enable(&init_on_alloc);
else
static_branch_disable(&init_on_alloc);
- return ret;
+ return 0;
}
early_param("init_on_alloc", early_init_on_alloc);
@@ -173,16 +202,16 @@ static int __init early_init_on_free(char *buf)
int ret;
bool bool_result;
- if (!buf)
- return -EINVAL;
ret = kstrtobool(buf, &bool_result);
+ if (ret)
+ return ret;
if (bool_result && page_poisoning_enabled())
pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, will take precedence over init_on_free\n");
if (bool_result)
static_branch_enable(&init_on_free);
else
static_branch_disable(&init_on_free);
- return ret;
+ return 0;
}
early_param("init_on_free", early_init_on_free);
@@ -246,7 +275,8 @@ bool pm_suspended_storage(void)
unsigned int pageblock_order __read_mostly;
#endif
-static void __free_pages_ok(struct page *page, unsigned int order);
+static void __free_pages_ok(struct page *page, unsigned int order,
+ fpi_t fpi_flags);
/*
* results with 256, 32 in the lowmem_reserve sysctl:
@@ -658,7 +688,7 @@ out:
void free_compound_page(struct page *page)
{
mem_cgroup_uncharge(page);
- __free_pages_ok(page, compound_order(page));
+ __free_pages_ok(page, compound_order(page), FPI_NONE);
}
void prep_compound_page(struct page *page, unsigned int order)
@@ -762,7 +792,7 @@ static inline void clear_page_guard(struct zone *zone, struct page *page,
unsigned int order, int migratetype) {}
#endif
-static inline void set_page_order(struct page *page, unsigned int order)
+static inline void set_buddy_order(struct page *page, unsigned int order)
{
set_page_private(page, order);
__SetPageBuddy(page);
@@ -787,7 +817,7 @@ static inline bool page_is_buddy(struct page *page, struct page *buddy,
if (!page_is_guard(buddy) && !PageBuddy(buddy))
return false;
- if (page_order(buddy) != order)
+ if (buddy_order(buddy) != order)
return false;
/*
@@ -872,13 +902,17 @@ static inline void add_to_free_list_tail(struct page *page, struct zone *zone,
area->nr_free++;
}
-/* Used for pages which are on another list */
+/*
+ * Used for pages which are on another list. Move the pages to the tail
+ * of the list - so the moved pages won't immediately be considered for
+ * allocation again (e.g., optimization for memory onlining).
+ */
static inline void move_to_free_list(struct page *page, struct zone *zone,
unsigned int order, int migratetype)
{
struct free_area *area = &zone->free_area[order];
- list_move(&page->lru, &area->free_list[migratetype]);
+ list_move_tail(&page->lru, &area->free_list[migratetype]);
}
static inline void del_page_from_free_list(struct page *page, struct zone *zone,
@@ -951,7 +985,7 @@ buddy_merge_likely(unsigned long pfn, unsigned long buddy_pfn,
static inline void __free_one_page(struct page *page,
unsigned long pfn,
struct zone *zone, unsigned int order,
- int migratetype, bool report)
+ int migratetype, fpi_t fpi_flags)
{
struct capture_control *capc = task_capc(zone);
unsigned long buddy_pfn;
@@ -1025,9 +1059,11 @@ continue_merging:
}
done_merging:
- set_page_order(page, order);
+ set_buddy_order(page, order);
- if (is_shuffle_order(order))
+ if (fpi_flags & FPI_TO_TAIL)
+ to_tail = true;
+ else if (is_shuffle_order(order))
to_tail = shuffle_pick_tail();
else
to_tail = buddy_merge_likely(pfn, buddy_pfn, page, order);
@@ -1038,7 +1074,7 @@ done_merging:
add_to_free_list(page, zone, order, migratetype);
/* Notify page reporting subsystem of freed page */
- if (report)
+ if (!(fpi_flags & FPI_SKIP_REPORT_NOTIFY))
page_reporting_notify_free(order);
}
@@ -1173,6 +1209,17 @@ static __always_inline bool free_pages_prepare(struct page *page,
trace_mm_page_free(page, order);
+ if (unlikely(PageHWPoison(page)) && !order) {
+ /*
+ * Do not let hwpoison pages hit pcplists/buddy
+ * Untie memcg state and reset page's owner
+ */
+ if (memcg_kmem_enabled() && PageKmemcg(page))
+ __memcg_kmem_uncharge_page(page, order);
+ reset_page_owner(page, order);
+ return false;
+ }
+
/*
* Check tail pages before head page information is cleared to
* avoid checking PageCompound for order-0 pages.
@@ -1368,7 +1415,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
if (unlikely(isolated_pageblocks))
mt = get_pageblock_migratetype(page);
- __free_one_page(page, page_to_pfn(page), zone, 0, mt, true);
+ __free_one_page(page, page_to_pfn(page), zone, 0, mt, FPI_NONE);
trace_mm_page_pcpu_drain(page, 0, mt);
}
spin_unlock(&zone->lock);
@@ -1377,14 +1424,14 @@ static void free_pcppages_bulk(struct zone *zone, int count,
static void free_one_page(struct zone *zone,
struct page *page, unsigned long pfn,
unsigned int order,
- int migratetype)
+ int migratetype, fpi_t fpi_flags)
{
spin_lock(&zone->lock);
if (unlikely(has_isolate_pageblock(zone) ||
is_migrate_isolate(migratetype))) {
migratetype = get_pfnblock_migratetype(page, pfn);
}
- __free_one_page(page, pfn, zone, order, migratetype, true);
+ __free_one_page(page, pfn, zone, order, migratetype, fpi_flags);
spin_unlock(&zone->lock);
}
@@ -1462,7 +1509,8 @@ void __meminit reserve_bootmem_region(phys_addr_t start, phys_addr_t end)
}
}
-static void __free_pages_ok(struct page *page, unsigned int order)
+static void __free_pages_ok(struct page *page, unsigned int order,
+ fpi_t fpi_flags)
{
unsigned long flags;
int migratetype;
@@ -1474,7 +1522,8 @@ static void __free_pages_ok(struct page *page, unsigned int order)
migratetype = get_pfnblock_migratetype(page, pfn);
local_irq_save(flags);
__count_vm_events(PGFREE, 1 << order);
- free_one_page(page_zone(page), page, pfn, order, migratetype);
+ free_one_page(page_zone(page), page, pfn, order, migratetype,
+ fpi_flags);
local_irq_restore(flags);
}
@@ -1484,6 +1533,11 @@ void __free_pages_core(struct page *page, unsigned int order)
struct page *p = page;
unsigned int loop;
+ /*
+ * When initializing the memmap, __init_single_page() sets the refcount
+ * of all pages to 1 ("allocated"/"not free"). We have to set the
+ * refcount of all involved pages to 0.
+ */
prefetchw(p);
for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
prefetchw(p + 1);
@@ -1494,8 +1548,12 @@ void __free_pages_core(struct page *page, unsigned int order)
set_page_count(p, 0);
atomic_long_add(nr_pages, &page_zone(page)->managed_pages);
- set_page_refcounted(page);
- __free_pages(page, order);
+
+ /*
+ * Bypass PCP and place fresh pages right to the tail, primarily
+ * relevant for memory onlining.
+ */
+ __free_pages_ok(page, order, FPI_TO_TAIL);
}
#ifdef CONFIG_NEED_MULTIPLE_NODES
@@ -2120,7 +2178,7 @@ static inline void expand(struct zone *zone, struct page *page,
continue;
add_to_free_list(&page[size], zone, high, migratetype);
- set_page_order(&page[size], high);
+ set_buddy_order(&page[size], high);
}
}
@@ -2298,7 +2356,7 @@ static inline struct page *__rmqueue_cma_fallback(struct zone *zone,
#endif
/*
- * Move the free pages in a range to the free lists of the requested type.
+ * Move the free pages in a range to the freelist tail of the requested type.
* Note that start_page and end_pages are not aligned on a pageblock
* boundary. If alignment is required, use move_freepages_block()
*/
@@ -2334,7 +2392,7 @@ static int move_freepages(struct zone *zone,
VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);
VM_BUG_ON_PAGE(page_zone(page) != zone, page);
- order = page_order(page);
+ order = buddy_order(page);
move_to_free_list(page, zone, order, migratetype);
page += 1 << order;
pages_moved += 1 << order;
@@ -2458,7 +2516,7 @@ static inline void boost_watermark(struct zone *zone)
static void steal_suitable_fallback(struct zone *zone, struct page *page,
unsigned int alloc_flags, int start_type, bool whole_block)
{
- unsigned int current_order = page_order(page);
+ unsigned int current_order = buddy_order(page);
int free_pages, movable_pages, alike_pages;
int old_block_type;
@@ -3122,7 +3180,8 @@ static void free_unref_page_commit(struct page *page, unsigned long pfn)
*/
if (migratetype >= MIGRATE_PCPTYPES) {
if (unlikely(is_migrate_isolate(migratetype))) {
- free_one_page(zone, page, pfn, 0, migratetype);
+ free_one_page(zone, page, pfn, 0, migratetype,
+ FPI_NONE);
return;
}
migratetype = MIGRATE_MOVABLE;
@@ -3208,7 +3267,7 @@ void split_page(struct page *page, unsigned int order)
for (i = 1; i < (1 << order); i++)
set_page_refcounted(page + i);
- split_page_owner(page, order);
+ split_page_owner(page, 1 << order);
}
EXPORT_SYMBOL_GPL(split_page);
@@ -3277,7 +3336,8 @@ void __putback_isolated_page(struct page *page, unsigned int order, int mt)
lockdep_assert_held(&zone->lock);
/* Return isolated page to tail of freelist. */
- __free_one_page(page, page_to_pfn(page), zone, order, mt, false);
+ __free_one_page(page, page_to_pfn(page), zone, order, mt,
+ FPI_SKIP_REPORT_NOTIFY | FPI_TO_TAIL);
}
/*
@@ -3495,7 +3555,7 @@ static inline bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
#endif /* CONFIG_FAIL_PAGE_ALLOC */
-static noinline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
+noinline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
{
return __should_fail_alloc_page(gfp_mask, order);
}
@@ -3740,8 +3800,8 @@ retry:
*/
no_fallback = alloc_flags & ALLOC_NOFRAGMENT;
z = ac->preferred_zoneref;
- for_next_zone_zonelist_nodemask(zone, z, ac->zonelist,
- ac->highest_zoneidx, ac->nodemask) {
+ for_next_zone_zonelist_nodemask(zone, z, ac->highest_zoneidx,
+ ac->nodemask) {
struct page *page;
unsigned long mark;
@@ -3985,8 +4045,10 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
* success so it is time to admit defeat. We will skip the OOM killer
* because it is very likely that the caller has a more reasonable
* fallback than shooting a random task.
+ *
+ * The OOM killer may not free memory on a specific node.
*/
- if (gfp_mask & __GFP_RETRY_MAYFAIL)
+ if (gfp_mask & (__GFP_RETRY_MAYFAIL | __GFP_THISNODE))
goto out;
/* The OOM killer does not needlessly kill tasks for lowmem */
if (ac->highest_zoneidx < ZONE_NORMAL)
@@ -4003,10 +4065,6 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
* failures more gracefully we should just bail out here.
*/
- /* The OOM killer may not free memory on a specific node */
- if (gfp_mask & __GFP_THISNODE)
- goto out;
-
/* Exhausted what can be done so it's blame time */
if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) {
*did_some_progress = 1;
@@ -4254,13 +4312,12 @@ EXPORT_SYMBOL_GPL(fs_reclaim_release);
#endif
/* Perform direct synchronous page reclaim */
-static int
+static unsigned long
__perform_reclaim(gfp_t gfp_mask, unsigned int order,
const struct alloc_context *ac)
{
- int progress;
unsigned int noreclaim_flag;
- unsigned long pflags;
+ unsigned long pflags, progress;
cond_resched();
@@ -4839,12 +4896,6 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
*alloc_flags = current_alloc_flags(gfp_mask, *alloc_flags);
- return true;
-}
-
-/* Determine whether to spread dirty pages and what the first usable zone */
-static inline void finalise_ac(gfp_t gfp_mask, struct alloc_context *ac)
-{
/* Dirty zone balancing only done in the fast path */
ac->spread_dirty_pages = (gfp_mask & __GFP_WRITE);
@@ -4855,6 +4906,8 @@ static inline void finalise_ac(gfp_t gfp_mask, struct alloc_context *ac)
*/
ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
ac->highest_zoneidx, ac->nodemask);
+
+ return true;
}
/*
@@ -4883,8 +4936,6 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid,
if (!prepare_alloc_pages(gfp_mask, order, preferred_nid, nodemask, &ac, &alloc_mask, &alloc_flags))
return NULL;
- finalise_ac(gfp_mask, &ac);
-
/*
* Forbid the first pass from falling back to types that fragment
* memory until all local zones are considered.
@@ -4953,13 +5004,16 @@ static inline void free_the_page(struct page *page, unsigned int order)
if (order == 0) /* Via pcp? */
free_unref_page(page);
else
- __free_pages_ok(page, order);
+ __free_pages_ok(page, order, FPI_NONE);
}
void __free_pages(struct page *page, unsigned int order)
{
if (put_page_testzero(page))
free_the_page(page, order);
+ else if (!PageHead(page))
+ while (order-- > 0)
+ free_the_page(page + (1 << order), order);
}
EXPORT_SYMBOL(__free_pages);
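The new else branch in __free_pages() handles a non-compound higher-order allocation whose head page is still pinned by another reference: that reference only protects page 0, so the tail portion is released as progressively smaller blocks. A tiny sketch of the index arithmetic that loop performs for order 3 (it frees the blocks at offsets 4, 2 and 1 at orders 2, 1 and 0):

#include <stdio.h>

int main(void)
{
	unsigned int order = 3;

	/* Same index arithmetic as the new __free_pages() tail-freeing loop. */
	while (order-- > 0)
		printf("free block at page offset %u, order %u\n", 1u << order, order);
	return 0;
}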
@@ -5650,7 +5704,6 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask)
int n, val;
int min_val = INT_MAX;
int best_node = NUMA_NO_NODE;
- const struct cpumask *tmp = cpumask_of_node(0);
/* Use the local node if we haven't already */
if (!node_isset(node, *used_node_mask)) {
@@ -5671,8 +5724,7 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask)
val += (n < node);
/* Give preference to headless and unused nodes */
- tmp = cpumask_of_node(n);
- if (!cpumask_empty(tmp))
+ if (!cpumask_empty(cpumask_of_node(n)))
val += PENALTY_FOR_NODE_WITH_CPUS;
/* Slight preference for less loaded node */
@@ -5968,7 +6020,7 @@ overlap_memmap_init(unsigned long zone, unsigned long *pfn)
if (mirrored_kernelcore && zone == ZONE_MOVABLE) {
if (!r || *pfn >= memblock_region_memory_end_pfn(r)) {
- for_each_memblock(memory, r) {
+ for_each_mem_region(r) {
if (*pfn < memblock_region_memory_end_pfn(r))
break;
}
@@ -5986,10 +6038,15 @@ overlap_memmap_init(unsigned long zone, unsigned long *pfn)
* Initially all pages are reserved - free ones are freed
* up by memblock_free_all() once the early boot process is
* done. Non-atomic initialization, single-pass.
+ *
+ * All aligned pageblocks are initialized to the specified migratetype
+ * (usually MIGRATE_MOVABLE). Besides setting the migratetype, no related
+ * zone stats (e.g., nr_isolate_pageblock) are touched.
*/
void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
- unsigned long start_pfn, enum meminit_context context,
- struct vmem_altmap *altmap)
+ unsigned long start_pfn,
+ enum meminit_context context,
+ struct vmem_altmap *altmap, int migratetype)
{
unsigned long pfn, end_pfn = start_pfn + size;
struct page *page;
@@ -6033,19 +6090,12 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
__SetPageReserved(page);
/*
- * Mark the block movable so that blocks are reserved for
- * movable at startup. This will force kernel allocations
- * to reserve their blocks rather than leaking throughout
- * the address space during boot when many long-lived
- * kernel allocations are made.
- *
- * bitmap is created for zone's valid pfn range. but memmap
- * can be created for invalid pages (for alignment)
- * check here not to call set_pageblock_migratetype() against
- * pfn out of zone.
+ * Usually, we want to mark the pageblock MIGRATE_MOVABLE,
+ * such that unmovable allocations won't be scattered all
+ * over the place during system boot.
*/
- if (!(pfn & (pageblock_nr_pages - 1))) {
- set_pageblock_migratetype(page, MIGRATE_MOVABLE);
+ if (IS_ALIGNED(pfn, pageblock_nr_pages)) {
+ set_pageblock_migratetype(page, migratetype);
cond_resched();
}
pfn++;
@@ -6107,15 +6157,10 @@ void __ref memmap_init_zone_device(struct zone *zone,
* the address space during boot when many long-lived
* kernel allocations are made.
*
- * bitmap is created for zone's valid pfn range. but memmap
- * can be created for invalid pages (for alignment)
- * check here not to call set_pageblock_migratetype() against
- * pfn out of zone.
- *
* Please note that MEMINIT_HOTPLUG path doesn't clear memmap
* because this is done early in section_activate()
*/
- if (!(pfn & (pageblock_nr_pages - 1))) {
+ if (IS_ALIGNED(pfn, pageblock_nr_pages)) {
set_pageblock_migratetype(page, MIGRATE_MOVABLE);
cond_resched();
}
@@ -6150,7 +6195,7 @@ void __meminit __weak memmap_init(unsigned long size, int nid,
if (end_pfn > start_pfn) {
size = end_pfn - start_pfn;
memmap_init_zone(size, nid, zone, start_pfn,
- MEMINIT_EARLY, NULL);
+ MEMINIT_EARLY, NULL, MIGRATE_MOVABLE);
}
}
}
@@ -6553,7 +6598,7 @@ static unsigned long __init zone_absent_pages_in_node(int nid,
unsigned long start_pfn, end_pfn;
struct memblock_region *r;
- for_each_memblock(memory, r) {
+ for_each_mem_region(r) {
start_pfn = clamp(memblock_region_memory_base_pfn(r),
zone_start_pfn, zone_end_pfn);
end_pfn = clamp(memblock_region_memory_end_pfn(r),
@@ -6997,8 +7042,7 @@ static void __init init_unavailable_mem(void)
* Loop through unavailable ranges not covered by memblock.memory.
*/
pgcnt = 0;
- for_each_mem_range(i, &memblock.memory, NULL,
- NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end, NULL) {
+ for_each_mem_range(i, &start, &end) {
if (next < start)
pgcnt += init_unavailable_range(PFN_DOWN(next),
PFN_UP(start));
@@ -7148,7 +7192,7 @@ static void __init find_zone_movable_pfns_for_nodes(void)
* options.
*/
if (movable_node_is_enabled()) {
- for_each_memblock(memory, r) {
+ for_each_mem_region(r) {
if (!memblock_is_hotpluggable(r))
continue;
@@ -7169,7 +7213,7 @@ static void __init find_zone_movable_pfns_for_nodes(void)
if (mirrored_kernelcore) {
bool mem_below_4gb_not_mirrored = false;
- for_each_memblock(memory, r) {
+ for_each_mem_region(r) {
if (memblock_is_mirror(r))
continue;
@@ -7904,6 +7948,8 @@ int __meminit init_per_zone_wmark_min(void)
setup_min_slab_ratio();
#endif
+ khugepaged_min_free_kbytes_update();
+
return 0;
}
postcore_initcall(init_per_zone_wmark_min)
@@ -8231,14 +8277,7 @@ struct page *has_unmovable_pages(struct zone *zone, struct page *page,
{
unsigned long iter = 0;
unsigned long pfn = page_to_pfn(page);
-
- /*
- * TODO we could make this much more efficient by not checking every
- * page in the range if we know all of them are in MOVABLE_ZONE and
- * that the movable zone guarantees that pages are migratable but
- * the later is not the case right now unfortunatelly. E.g. movablecore
- * can still lead to having bootmem allocations in zone_movable.
- */
+ unsigned long offset = pfn % pageblock_nr_pages;
if (is_migrate_cma_page(page)) {
/*
@@ -8252,12 +8291,18 @@ struct page *has_unmovable_pages(struct zone *zone, struct page *page,
return page;
}
- for (; iter < pageblock_nr_pages; iter++) {
+ for (; iter < pageblock_nr_pages - offset; iter++) {
if (!pfn_valid_within(pfn + iter))
continue;
page = pfn_to_page(pfn + iter);
+ /*
+ * Both bootmem allocations and memory holes are marked
+ * PG_reserved and are unmovable. We can even have unmovable
+ * allocations inside ZONE_MOVABLE, for example when
+ * specifying "movablecore".
+ */
if (PageReserved(page))
return page;
@@ -8299,7 +8344,7 @@ struct page *has_unmovable_pages(struct zone *zone, struct page *page,
*/
if (!page_ref_count(page)) {
if (PageBuddy(page))
- iter += (1 << page_order(page)) - 1;
+ iter += (1 << buddy_order(page)) - 1;
continue;
}
@@ -8331,14 +8376,6 @@ struct page *has_unmovable_pages(struct zone *zone, struct page *page,
* it. But now, memory offline itself doesn't call
* shrink_node_slabs() and it still to be fixed.
*/
- /*
- * If the page is not RAM, page_count()should be 0.
- * we don't need more check. This is an _used_ not-movable page.
- *
- * The problematic thing here is PG_reserved pages. PG_reserved
- * is set to both of a memory hole page and a _used_ kernel
- * page at boot.
- */
return page;
}
return NULL;
@@ -8472,7 +8509,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
ret = start_isolate_page_range(pfn_max_align_down(start),
pfn_max_align_up(end), migratetype, 0);
- if (ret < 0)
+ if (ret)
return ret;
/*
@@ -8520,7 +8557,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
}
if (outer_start != start) {
- order = page_order(pfn_to_page(outer_start));
+ order = buddy_order(pfn_to_page(outer_start));
/*
* outer_start page could be small order buddy page and
@@ -8708,35 +8745,21 @@ void zone_pcp_reset(struct zone *zone)
#ifdef CONFIG_MEMORY_HOTREMOVE
/*
- * All pages in the range must be in a single zone and isolated
- * before calling this.
+ * All pages in the range must be in a single zone, must not contain holes,
+ * must span full sections, and must be isolated before calling this function.
*/
-unsigned long
-__offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
+void __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
{
+ unsigned long pfn = start_pfn;
struct page *page;
struct zone *zone;
unsigned int order;
- unsigned long pfn;
unsigned long flags;
- unsigned long offlined_pages = 0;
-
- /* find the first valid pfn */
- for (pfn = start_pfn; pfn < end_pfn; pfn++)
- if (pfn_valid(pfn))
- break;
- if (pfn == end_pfn)
- return offlined_pages;
offline_mem_sections(pfn, end_pfn);
zone = page_zone(pfn_to_page(pfn));
spin_lock_irqsave(&zone->lock, flags);
- pfn = start_pfn;
while (pfn < end_pfn) {
- if (!pfn_valid(pfn)) {
- pfn++;
- continue;
- }
page = pfn_to_page(pfn);
/*
* The HWPoisoned page may be not in buddy system, and
@@ -8744,7 +8767,6 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
*/
if (unlikely(!PageBuddy(page) && PageHWPoison(page))) {
pfn++;
- offlined_pages++;
continue;
}
/*
@@ -8755,20 +8777,16 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
BUG_ON(page_count(page));
BUG_ON(PageBuddy(page));
pfn++;
- offlined_pages++;
continue;
}
BUG_ON(page_count(page));
BUG_ON(!PageBuddy(page));
- order = page_order(page);
- offlined_pages += 1 << order;
+ order = buddy_order(page);
del_page_from_free_list(page, zone, order);
pfn += (1 << order);
}
spin_unlock_irqrestore(&zone->lock, flags);
-
- return offlined_pages;
}
#endif
@@ -8783,7 +8801,7 @@ bool is_free_buddy_page(struct page *page)
for (order = 0; order < MAX_ORDER; order++) {
struct page *page_head = page - (pfn & ((1 << order) - 1));
- if (PageBuddy(page_head) && page_order(page_head) >= order)
+ if (PageBuddy(page_head) && buddy_order(page_head) >= order)
break;
}
spin_unlock_irqrestore(&zone->lock, flags);
@@ -8793,30 +8811,70 @@ bool is_free_buddy_page(struct page *page)
#ifdef CONFIG_MEMORY_FAILURE
/*
- * Set PG_hwpoison flag if a given page is confirmed to be a free page. This
- * test is performed under the zone lock to prevent a race against page
- * allocation.
+ * Break down a higher-order page into sub-pages, and keep our target out of
+ * the buddy allocator.
*/
-bool set_hwpoison_free_buddy_page(struct page *page)
+static void break_down_buddy_pages(struct zone *zone, struct page *page,
+ struct page *target, int low, int high,
+ int migratetype)
+{
+ unsigned long size = 1 << high;
+ struct page *current_buddy, *next_page;
+
+ while (high > low) {
+ high--;
+ size >>= 1;
+
+ if (target >= &page[size]) {
+ next_page = page + size;
+ current_buddy = page;
+ } else {
+ next_page = page;
+ current_buddy = page + size;
+ }
+
+ if (set_page_guard(zone, current_buddy, high, migratetype))
+ continue;
+
+ if (current_buddy != target) {
+ add_to_free_list(current_buddy, zone, high, migratetype);
+ set_buddy_order(current_buddy, high);
+ page = next_page;
+ }
+ }
+}
+
+/*
+ * Take a page that will be marked as poisoned off the buddy allocator.
+ */
+bool take_page_off_buddy(struct page *page)
{
struct zone *zone = page_zone(page);
unsigned long pfn = page_to_pfn(page);
unsigned long flags;
unsigned int order;
- bool hwpoisoned = false;
+ bool ret = false;
spin_lock_irqsave(&zone->lock, flags);
for (order = 0; order < MAX_ORDER; order++) {
struct page *page_head = page - (pfn & ((1 << order) - 1));
+ int page_order = buddy_order(page_head);
- if (PageBuddy(page_head) && page_order(page_head) >= order) {
- if (!TestSetPageHWPoison(page))
- hwpoisoned = true;
+ if (PageBuddy(page_head) && page_order >= order) {
+ unsigned long pfn_head = page_to_pfn(page_head);
+ int migratetype = get_pfnblock_migratetype(page_head,
+ pfn_head);
+
+ del_page_from_free_list(page_head, zone, page_order);
+ break_down_buddy_pages(zone, page_head, page, 0,
+ page_order, migratetype);
+ ret = true;
break;
}
+ if (page_count(page_head) > 0)
+ break;
}
spin_unlock_irqrestore(&zone->lock, flags);
-
- return hwpoisoned;
+ return ret;
}
#endif
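take_page_off_buddy() above removes the whole buddy block containing a soon-to-be-poisoned page, and break_down_buddy_pages() returns every half that does not contain the target to the free lists, leaving only the target allocated. A toy userspace model of that splitting, using plain page indices rather than struct page (break_down() is an illustrative name, not the kernel function):

#include <stdio.h>

/*
 * Split a block of 1 << high pages so every half not containing the target
 * is handed back at its order, while the target page stays off the lists.
 */
static void break_down(unsigned long base, unsigned long target, int low, int high)
{
	unsigned long size = 1UL << high;
	unsigned long keep, give_back;

	while (high > low) {
		high--;
		size >>= 1;

		if (target >= base + size) {	/* target lives in the upper half */
			give_back = base;
			keep = base + size;
		} else {			/* target lives in the lower half */
			give_back = base + size;
			keep = base;
		}
		printf("free pages %lu..%lu at order %d\n",
		       give_back, give_back + size - 1, high);
		base = keep;
	}
	printf("page %lu kept off the free lists\n", base);
}

int main(void)
{
	/* e.g. a poisoned pfn 5 inside an order-3 buddy block starting at pfn 0 */
	break_down(0, 5, 0, 3);
	return 0;
}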
diff --git a/mm/page_counter.c b/mm/page_counter.c
index afe22ad335cc..b24a60b28bb0 100644
--- a/mm/page_counter.c
+++ b/mm/page_counter.c
@@ -109,7 +109,7 @@ bool page_counter_try_charge(struct page_counter *counter,
*
* The atomic_long_add_return() implies a full memory
* barrier between incrementing the count and reading
- * the limit. When racing with page_counter_limit(),
+ * the limit. When racing with page_counter_set_max(),
* we either see the new limit or the setter sees the
* counter has changed and retries.
*/
diff --git a/mm/page_io.c b/mm/page_io.c
index e485a6e8a6cd..433df1263349 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -252,6 +252,16 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
unlock_page(page);
goto out;
}
+ /*
+ * Arch code may have to preserve more data than just the page
+ * contents, e.g. memory tags.
+ */
+ ret = arch_prepare_to_swap(page);
+ if (ret) {
+ set_page_dirty(page);
+ unlock_page(page);
+ goto out;
+ }
if (frontswap_store(page) == 0) {
set_page_writeback(page);
unlock_page(page);
@@ -302,7 +312,7 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc,
struct swap_info_struct *sis = page_swap_info(page);
VM_BUG_ON_PAGE(!PageSwapCache(page), page);
- if (data_race(sis->flags & SWP_FS)) {
+ if (data_race(sis->flags & SWP_FS_OPS)) {
struct kiocb kiocb;
struct file *swap_file = sis->swap_file;
struct address_space *mapping = swap_file->f_mapping;
@@ -349,13 +359,11 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc,
return 0;
}
- ret = 0;
bio = get_swap_bio(GFP_NOIO, page, end_write_func);
if (bio == NULL) {
set_page_dirty(page);
unlock_page(page);
- ret = -ENOMEM;
- goto out;
+ return -ENOMEM;
}
bio->bi_opf = REQ_OP_WRITE | REQ_SWAP | wbc_to_write_flags(wbc);
bio_associate_blkg_from_page(bio, page);
@@ -363,8 +371,8 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc,
set_page_writeback(page);
unlock_page(page);
submit_bio(bio);
-out:
- return ret;
+
+ return 0;
}
int swap_readpage(struct page *page, bool synchronous)
@@ -393,7 +401,7 @@ int swap_readpage(struct page *page, bool synchronous)
goto out;
}
- if (data_race(sis->flags & SWP_FS)) {
+ if (data_race(sis->flags & SWP_FS_OPS)) {
struct file *swap_file = sis->swap_file;
struct address_space *mapping = swap_file->f_mapping;
@@ -403,15 +411,17 @@ int swap_readpage(struct page *page, bool synchronous)
goto out;
}
- ret = bdev_read_page(sis->bdev, swap_page_sector(page), page);
- if (!ret) {
- if (trylock_page(page)) {
- swap_slot_free_notify(page);
- unlock_page(page);
- }
+ if (sis->flags & SWP_SYNCHRONOUS_IO) {
+ ret = bdev_read_page(sis->bdev, swap_page_sector(page), page);
+ if (!ret) {
+ if (trylock_page(page)) {
+ swap_slot_free_notify(page);
+ unlock_page(page);
+ }
- count_vm_event(PSWPIN);
- goto out;
+ count_vm_event(PSWPIN);
+ goto out;
+ }
}
ret = 0;
@@ -455,7 +465,7 @@ int swap_set_page_dirty(struct page *page)
{
struct swap_info_struct *sis = page_swap_info(page);
- if (data_race(sis->flags & SWP_FS)) {
+ if (data_race(sis->flags & SWP_FS_OPS)) {
struct address_space *mapping = sis->swap_file->f_mapping;
VM_BUG_ON_PAGE(!PageSwapCache(page), page);
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 63a3db10a8c0..abbf42214485 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -17,22 +17,21 @@
static int set_migratetype_isolate(struct page *page, int migratetype, int isol_flags)
{
- struct page *unmovable = NULL;
- struct zone *zone;
+ struct zone *zone = page_zone(page);
+ struct page *unmovable;
unsigned long flags;
- int ret = -EBUSY;
-
- zone = page_zone(page);
spin_lock_irqsave(&zone->lock, flags);
/*
* We assume the caller intended to SET migrate type to isolate.
* If it is already set, then someone else must have raced and
- * set it before us. Return -EBUSY
+ * set it before us.
*/
- if (is_migrate_isolate_page(page))
- goto out;
+ if (is_migrate_isolate_page(page)) {
+ spin_unlock_irqrestore(&zone->lock, flags);
+ return -EBUSY;
+ }
/*
* FIXME: Now, memory hotplug doesn't call shrink_slab() by itself.
@@ -49,25 +48,21 @@ static int set_migratetype_isolate(struct page *page, int migratetype, int isol_
NULL);
__mod_zone_freepage_state(zone, -nr_pages, mt);
- ret = 0;
+ spin_unlock_irqrestore(&zone->lock, flags);
+ drain_all_pages(zone);
+ return 0;
}
-out:
spin_unlock_irqrestore(&zone->lock, flags);
- if (!ret) {
- drain_all_pages(zone);
- } else {
- WARN_ON_ONCE(zone_idx(zone) == ZONE_MOVABLE);
-
- if ((isol_flags & REPORT_FAILURE) && unmovable)
- /*
- * printk() with zone->lock held will likely trigger a
- * lockdep splat, so defer it here.
- */
- dump_page(unmovable, "unmovable page");
+ if (isol_flags & REPORT_FAILURE) {
+ /*
+ * printk() with zone->lock held will likely trigger a
+ * lockdep splat, so defer it here.
+ */
+ dump_page(unmovable, "unmovable page");
}
- return ret;
+ return -EBUSY;
}
static void unset_migratetype_isolate(struct page *page, unsigned migratetype)
@@ -93,7 +88,7 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype)
* these pages to be merged.
*/
if (PageBuddy(page)) {
- order = page_order(page);
+ order = buddy_order(page);
if (order >= pageblock_order) {
pfn = page_to_pfn(page);
buddy_pfn = __find_buddy_pfn(pfn, order);
@@ -111,6 +106,11 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype)
* If we isolate freepage with more than pageblock_order, there
* should be no freepage in the range, so we could avoid costly
* pageblock scanning for freepage moving.
+ *
+ * We didn't actually touch any of the isolated pages, so place them
+ * at the tail of the freelist. This is an optimization for memory
+ * onlining - just onlined memory won't immediately be considered for
+ * allocation.
*/
if (!isolated_page) {
nr_pages = move_freepages_block(zone, page, migratetype, NULL);
@@ -178,8 +178,7 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
* (e.g. __offline_pages will need to call it after check for isolated range for
* a next retry).
*
- * Return: the number of isolated pageblocks on success and -EBUSY if any part
- * of range cannot be isolated.
+ * Return: 0 on success and -EBUSY if any part of range cannot be isolated.
*/
int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
unsigned migratetype, int flags)
@@ -187,7 +186,6 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
unsigned long pfn;
unsigned long undo_pfn;
struct page *page;
- int nr_isolate_pageblock = 0;
BUG_ON(!IS_ALIGNED(start_pfn, pageblock_nr_pages));
BUG_ON(!IS_ALIGNED(end_pfn, pageblock_nr_pages));
@@ -201,10 +199,9 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
undo_pfn = pfn;
goto undo;
}
- nr_isolate_pageblock++;
}
}
- return nr_isolate_pageblock;
+ return 0;
undo:
for (pfn = start_pfn;
pfn < undo_pfn;
@@ -264,7 +261,7 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn,
* the correct MIGRATE_ISOLATE freelist. There is no
* simple way to verify that as VM_BUG_ON(), though.
*/
- pfn += 1 << page_order(page);
+ pfn += 1 << buddy_order(page);
else if ((flags & MEMORY_OFFLINE) && PageHWPoison(page))
/* A HWPoisoned page cannot be also PageBuddy */
pfn++;
diff --git a/mm/page_owner.c b/mm/page_owner.c
index 360461509423..b735a8eafcdb 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -204,7 +204,7 @@ void __set_page_owner_migrate_reason(struct page *page, int reason)
page_owner->last_migrate_reason = reason;
}
-void __split_page_owner(struct page *page, unsigned int order)
+void __split_page_owner(struct page *page, unsigned int nr)
{
int i;
struct page_ext *page_ext = lookup_page_ext(page);
@@ -213,7 +213,7 @@ void __split_page_owner(struct page *page, unsigned int order)
if (unlikely(!page_ext))
return;
- for (i = 0; i < (1 << order); i++) {
+ for (i = 0; i < nr; i++) {
page_owner = get_page_owner(page_ext);
page_owner->order = 0;
page_ext = page_ext_next(page_ext);
@@ -295,7 +295,7 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m,
if (PageBuddy(page)) {
unsigned long freepage_order;
- freepage_order = page_order_unsafe(page);
+ freepage_order = buddy_order_unsafe(page);
if (freepage_order < MAX_ORDER)
pfn += (1UL << freepage_order) - 1;
continue;
@@ -490,7 +490,7 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos)
page = pfn_to_page(pfn);
if (PageBuddy(page)) {
- unsigned long freepage_order = page_order_unsafe(page);
+ unsigned long freepage_order = buddy_order_unsafe(page);
if (freepage_order < MAX_ORDER)
pfn += (1UL << freepage_order) - 1;
@@ -584,7 +584,7 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone)
* heavy lock contention.
*/
if (PageBuddy(page)) {
- unsigned long order = page_order_unsafe(page);
+ unsigned long order = buddy_order_unsafe(page);
if (order > 0 && order < MAX_ORDER)
pfn += (1UL << order) - 1;
diff --git a/mm/page_poison.c b/mm/page_poison.c
index 34b9181ee5d1..ae0482cded87 100644
--- a/mm/page_poison.c
+++ b/mm/page_poison.c
@@ -8,13 +8,23 @@
#include <linux/ratelimit.h>
#include <linux/kasan.h>
-static bool want_page_poisoning __read_mostly;
+static DEFINE_STATIC_KEY_FALSE_RO(want_page_poisoning);
static int __init early_page_poison_param(char *buf)
{
- if (!buf)
- return -EINVAL;
- return strtobool(buf, &want_page_poisoning);
+ int ret;
+ bool tmp;
+
+ ret = strtobool(buf, &tmp);
+ if (ret)
+ return ret;
+
+ if (tmp)
+ static_branch_enable(&want_page_poisoning);
+ else
+ static_branch_disable(&want_page_poisoning);
+
+ return 0;
}
early_param("page_poison", early_page_poison_param);
@@ -31,7 +41,7 @@ bool page_poisoning_enabled(void)
* Page poisoning is debug page alloc for some arches. If
* either of those options are enabled, enable poisoning.
*/
- return (want_page_poisoning ||
+ return (static_branch_unlikely(&want_page_poisoning) ||
(!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC) &&
debug_pagealloc_enabled()));
}
diff --git a/mm/page_reporting.c b/mm/page_reporting.c
index 3bbd471cfc81..cd8e13d41df4 100644
--- a/mm/page_reporting.c
+++ b/mm/page_reporting.c
@@ -92,7 +92,7 @@ page_reporting_drain(struct page_reporting_dev_info *prdev,
* report on the new larger page when we make our way
* up to that higher order.
*/
- if (PageBuddy(page) && page_order(page) == order)
+ if (PageBuddy(page) && buddy_order(page) == order)
__SetPageReported(page);
} while ((sg = sg_next(sg)));
@@ -178,7 +178,7 @@ page_reporting_cycle(struct page_reporting_dev_info *prdev, struct zone *zone,
* the new head of the free list before we release the
* zone lock.
*/
- if (&page->lru != list && !list_is_first(&page->lru, list))
+ if (!list_is_first(&page->lru, list))
list_rotate_to_front(&page->lru, list);
/* release lock before waiting on report processing */
diff --git a/mm/percpu.c b/mm/percpu.c
index 1ed1a349eab8..66a93f096394 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1584,8 +1584,7 @@ static enum pcpu_chunk_type pcpu_memcg_pre_alloc_hook(size_t size, gfp_t gfp,
{
struct obj_cgroup *objcg;
- if (!memcg_kmem_enabled() || !(gfp & __GFP_ACCOUNT) ||
- memcg_kmem_bypass())
+ if (!memcg_kmem_enabled() || !(gfp & __GFP_ACCOUNT))
return PCPU_CHUNK_ROOT;
objcg = get_obj_cgroup_from_current();
diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c
index 29c052099aff..fd12da80b6f2 100644
--- a/mm/process_vm_access.c
+++ b/mm/process_vm_access.c
@@ -14,10 +14,6 @@
#include <linux/slab.h>
#include <linux/syscalls.h>
-#ifdef CONFIG_COMPAT
-#include <linux/compat.h>
-#endif
-
/**
* process_vm_rw_pages - read/write pages from task specified
* @pages: array of pointers to pages we want to copy
@@ -276,20 +272,17 @@ static ssize_t process_vm_rw(pid_t pid,
if (rc < 0)
return rc;
if (!iov_iter_count(&iter))
- goto free_iovecs;
-
- rc = rw_copy_check_uvector(CHECK_IOVEC_ONLY, rvec, riovcnt, UIO_FASTIOV,
- iovstack_r, &iov_r);
- if (rc <= 0)
- goto free_iovecs;
-
+ goto free_iov_l;
+ iov_r = iovec_from_user(rvec, riovcnt, UIO_FASTIOV, iovstack_r, false);
+ if (IS_ERR(iov_r)) {
+ rc = PTR_ERR(iov_r);
+ goto free_iov_l;
+ }
rc = process_vm_rw_core(pid, &iter, iov_r, riovcnt, flags, vm_write);
-
-free_iovecs:
if (iov_r != iovstack_r)
kfree(iov_r);
+free_iov_l:
kfree(iov_l);
-
return rc;
}
@@ -307,68 +300,3 @@ SYSCALL_DEFINE6(process_vm_writev, pid_t, pid,
{
return process_vm_rw(pid, lvec, liovcnt, rvec, riovcnt, flags, 1);
}
-
-#ifdef CONFIG_COMPAT
-
-static ssize_t
-compat_process_vm_rw(compat_pid_t pid,
- const struct compat_iovec __user *lvec,
- unsigned long liovcnt,
- const struct compat_iovec __user *rvec,
- unsigned long riovcnt,
- unsigned long flags, int vm_write)
-{
- struct iovec iovstack_l[UIO_FASTIOV];
- struct iovec iovstack_r[UIO_FASTIOV];
- struct iovec *iov_l = iovstack_l;
- struct iovec *iov_r = iovstack_r;
- struct iov_iter iter;
- ssize_t rc = -EFAULT;
- int dir = vm_write ? WRITE : READ;
-
- if (flags != 0)
- return -EINVAL;
-
- rc = compat_import_iovec(dir, lvec, liovcnt, UIO_FASTIOV, &iov_l, &iter);
- if (rc < 0)
- return rc;
- if (!iov_iter_count(&iter))
- goto free_iovecs;
- rc = compat_rw_copy_check_uvector(CHECK_IOVEC_ONLY, rvec, riovcnt,
- UIO_FASTIOV, iovstack_r,
- &iov_r);
- if (rc <= 0)
- goto free_iovecs;
-
- rc = process_vm_rw_core(pid, &iter, iov_r, riovcnt, flags, vm_write);
-
-free_iovecs:
- if (iov_r != iovstack_r)
- kfree(iov_r);
- kfree(iov_l);
- return rc;
-}
-
-COMPAT_SYSCALL_DEFINE6(process_vm_readv, compat_pid_t, pid,
- const struct compat_iovec __user *, lvec,
- compat_ulong_t, liovcnt,
- const struct compat_iovec __user *, rvec,
- compat_ulong_t, riovcnt,
- compat_ulong_t, flags)
-{
- return compat_process_vm_rw(pid, lvec, liovcnt, rvec,
- riovcnt, flags, 0);
-}
-
-COMPAT_SYSCALL_DEFINE6(process_vm_writev, compat_pid_t, pid,
- const struct compat_iovec __user *, lvec,
- compat_ulong_t, liovcnt,
- const struct compat_iovec __user *, rvec,
- compat_ulong_t, riovcnt,
- compat_ulong_t, flags)
-{
- return compat_process_vm_rw(pid, lvec, liovcnt, rvec,
- riovcnt, flags, 1);
-}
-
-#endif
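
Both iovec copies now go through the iov_iter helpers: import_iovec() for the local side and iovec_from_user() for the remote side, which return an ERR_PTR-style result instead of filling a count through an out parameter. A hedged sketch of the new calling convention for the remote vector (illustrative caller, not the syscall itself):

    static ssize_t example_copy_remote_iovec(const struct iovec __user *uvec,
                                             unsigned long nr_segs)
    {
            struct iovec iovstack[UIO_FASTIOV];
            struct iovec *iov;

            iov = iovec_from_user(uvec, nr_segs, UIO_FASTIOV, iovstack, false);
            if (IS_ERR(iov))
                    return PTR_ERR(iov);    /* replaces the old "rc <= 0" check */

            /* ... use iov[0 .. nr_segs) ... */

            if (iov != iovstack)
                    kfree(iov);
            return 0;
    }
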
diff --git a/mm/readahead.c b/mm/readahead.c
index 3c9a8dd7c56c..c5b0457415be 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -158,10 +158,8 @@ out:
}
/**
- * page_cache_readahead_unbounded - Start unchecked readahead.
- * @mapping: File address space.
- * @file: This instance of the open file; used for authentication.
- * @index: First page index to read.
+ * page_cache_ra_unbounded - Start unchecked readahead.
+ * @ractl: Readahead control.
* @nr_to_read: The number of pages to read.
* @lookahead_size: Where to start the next readahead.
*
@@ -173,17 +171,13 @@ out:
* Context: File is referenced by caller. Mutexes may be held by caller.
* May sleep, but will not reenter filesystem to reclaim memory.
*/
-void page_cache_readahead_unbounded(struct address_space *mapping,
- struct file *file, pgoff_t index, unsigned long nr_to_read,
- unsigned long lookahead_size)
+void page_cache_ra_unbounded(struct readahead_control *ractl,
+ unsigned long nr_to_read, unsigned long lookahead_size)
{
+ struct address_space *mapping = ractl->mapping;
+ unsigned long index = readahead_index(ractl);
LIST_HEAD(page_pool);
gfp_t gfp_mask = readahead_gfp_mask(mapping);
- struct readahead_control rac = {
- .mapping = mapping,
- .file = file,
- ._index = index,
- };
unsigned long i;
/*
@@ -204,7 +198,7 @@ void page_cache_readahead_unbounded(struct address_space *mapping,
for (i = 0; i < nr_to_read; i++) {
struct page *page = xa_load(&mapping->i_pages, index + i);
- BUG_ON(index + i != rac._index + rac._nr_pages);
+ BUG_ON(index + i != ractl->_index + ractl->_nr_pages);
if (page && !xa_is_value(page)) {
/*
@@ -215,7 +209,7 @@ void page_cache_readahead_unbounded(struct address_space *mapping,
* have a stable reference to this page, and it's
* not worth getting one just for that.
*/
- read_pages(&rac, &page_pool, true);
+ read_pages(ractl, &page_pool, true);
continue;
}
@@ -228,12 +222,12 @@ void page_cache_readahead_unbounded(struct address_space *mapping,
} else if (add_to_page_cache_lru(page, mapping, index + i,
gfp_mask) < 0) {
put_page(page);
- read_pages(&rac, &page_pool, true);
+ read_pages(ractl, &page_pool, true);
continue;
}
if (i == nr_to_read - lookahead_size)
SetPageReadahead(page);
- rac._nr_pages++;
+ ractl->_nr_pages++;
}
/*
@@ -241,22 +235,22 @@ void page_cache_readahead_unbounded(struct address_space *mapping,
* uptodate then the caller will launch readpage again, and
* will then handle the error.
*/
- read_pages(&rac, &page_pool, false);
+ read_pages(ractl, &page_pool, false);
memalloc_nofs_restore(nofs);
}
-EXPORT_SYMBOL_GPL(page_cache_readahead_unbounded);
+EXPORT_SYMBOL_GPL(page_cache_ra_unbounded);
/*
- * __do_page_cache_readahead() actually reads a chunk of disk. It allocates
+ * do_page_cache_ra() actually reads a chunk of disk. It allocates
* the pages first, then submits them for I/O. This avoids the very bad
* behaviour which would occur if page allocations are causing VM writeback.
* We really don't want to intermingle reads and writes like that.
*/
-void __do_page_cache_readahead(struct address_space *mapping,
- struct file *file, pgoff_t index, unsigned long nr_to_read,
- unsigned long lookahead_size)
+void do_page_cache_ra(struct readahead_control *ractl,
+ unsigned long nr_to_read, unsigned long lookahead_size)
{
- struct inode *inode = mapping->host;
+ struct inode *inode = ractl->mapping->host;
+ unsigned long index = readahead_index(ractl);
loff_t isize = i_size_read(inode);
pgoff_t end_index; /* The last page we want to read */
@@ -270,20 +264,19 @@ void __do_page_cache_readahead(struct address_space *mapping,
if (nr_to_read > end_index - index)
nr_to_read = end_index - index + 1;
- page_cache_readahead_unbounded(mapping, file, index, nr_to_read,
- lookahead_size);
+ page_cache_ra_unbounded(ractl, nr_to_read, lookahead_size);
}
/*
* Chunk the readahead into 2 megabyte units, so that we don't pin too much
* memory at once.
*/
-void force_page_cache_readahead(struct address_space *mapping,
- struct file *filp, pgoff_t index, unsigned long nr_to_read)
+void force_page_cache_ra(struct readahead_control *ractl,
+ struct file_ra_state *ra, unsigned long nr_to_read)
{
+ struct address_space *mapping = ractl->mapping;
struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
- struct file_ra_state *ra = &filp->f_ra;
- unsigned long max_pages;
+ unsigned long max_pages, index;
if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages &&
!mapping->a_ops->readahead))
@@ -293,14 +286,16 @@ void force_page_cache_readahead(struct address_space *mapping,
* If the request exceeds the readahead window, allow the read to
* be up to the optimal hardware IO size
*/
+ index = readahead_index(ractl);
max_pages = max_t(unsigned long, bdi->io_pages, ra->ra_pages);
- nr_to_read = min(nr_to_read, max_pages);
+ nr_to_read = min_t(unsigned long, nr_to_read, max_pages);
while (nr_to_read) {
unsigned long this_chunk = (2 * 1024 * 1024) / PAGE_SIZE;
if (this_chunk > nr_to_read)
this_chunk = nr_to_read;
- __do_page_cache_readahead(mapping, filp, index, this_chunk, 0);
+ ractl->_index = index;
+ do_page_cache_ra(ractl, this_chunk, 0);
index += this_chunk;
nr_to_read -= this_chunk;
@@ -437,14 +432,14 @@ static int try_context_readahead(struct address_space *mapping,
/*
* A minimal readahead algorithm for trivial sequential/random reads.
*/
-static void ondemand_readahead(struct address_space *mapping,
- struct file_ra_state *ra, struct file *filp,
- bool hit_readahead_marker, pgoff_t index,
+static void ondemand_readahead(struct readahead_control *ractl,
+ struct file_ra_state *ra, bool hit_readahead_marker,
unsigned long req_size)
{
- struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
+ struct backing_dev_info *bdi = inode_to_bdi(ractl->mapping->host);
unsigned long max_pages = ra->ra_pages;
unsigned long add_pages;
+ unsigned long index = readahead_index(ractl);
pgoff_t prev_index;
/*
@@ -482,7 +477,8 @@ static void ondemand_readahead(struct address_space *mapping,
pgoff_t start;
rcu_read_lock();
- start = page_cache_next_miss(mapping, index + 1, max_pages);
+ start = page_cache_next_miss(ractl->mapping, index + 1,
+ max_pages);
rcu_read_unlock();
if (!start || start - index > max_pages)
@@ -515,14 +511,15 @@ static void ondemand_readahead(struct address_space *mapping,
* Query the page cache and look for the traces(cached history pages)
* that a sequential stream would leave behind.
*/
- if (try_context_readahead(mapping, ra, index, req_size, max_pages))
+ if (try_context_readahead(ractl->mapping, ra, index, req_size,
+ max_pages))
goto readit;
/*
* standalone, small random read
* Read as is, and do not pollute the readahead state.
*/
- __do_page_cache_readahead(mapping, filp, index, req_size, 0);
+ do_page_cache_ra(ractl, req_size, 0);
return;
initial_readahead:
@@ -548,63 +545,42 @@ readit:
}
}
- ra_submit(ra, mapping, filp);
+ ractl->_index = ra->start;
+ do_page_cache_ra(ractl, ra->size, ra->async_size);
}
-/**
- * page_cache_sync_readahead - generic file readahead
- * @mapping: address_space which holds the pagecache and I/O vectors
- * @ra: file_ra_state which holds the readahead state
- * @filp: passed on to ->readpage() and ->readpages()
- * @index: Index of first page to be read.
- * @req_count: Total number of pages being read by the caller.
- *
- * page_cache_sync_readahead() should be called when a cache miss happened:
- * it will submit the read. The readahead logic may decide to piggyback more
- * pages onto the read request if access patterns suggest it will improve
- * performance.
- */
-void page_cache_sync_readahead(struct address_space *mapping,
- struct file_ra_state *ra, struct file *filp,
- pgoff_t index, unsigned long req_count)
+void page_cache_sync_ra(struct readahead_control *ractl,
+ struct file_ra_state *ra, unsigned long req_count)
{
- /* no read-ahead */
- if (!ra->ra_pages)
- return;
+ bool do_forced_ra = ractl->file && (ractl->file->f_mode & FMODE_RANDOM);
- if (blk_cgroup_congested())
- return;
+ /*
+ * Even if read-ahead is disabled, issue this request as read-ahead
+ * as we'll need it to satisfy the requested range. The forced
+ * read-ahead will do the right thing and limit the read to just the
+ * requested range, which we'll set to 1 page for this case.
+ */
+ if (!ra->ra_pages || blk_cgroup_congested()) {
+ if (!ractl->file)
+ return;
+ req_count = 1;
+ do_forced_ra = true;
+ }
/* be dumb */
- if (filp && (filp->f_mode & FMODE_RANDOM)) {
- force_page_cache_readahead(mapping, filp, index, req_count);
+ if (do_forced_ra) {
+ force_page_cache_ra(ractl, ra, req_count);
return;
}
/* do read-ahead */
- ondemand_readahead(mapping, ra, filp, false, index, req_count);
+ ondemand_readahead(ractl, ra, false, req_count);
}
-EXPORT_SYMBOL_GPL(page_cache_sync_readahead);
+EXPORT_SYMBOL_GPL(page_cache_sync_ra);
-/**
- * page_cache_async_readahead - file readahead for marked pages
- * @mapping: address_space which holds the pagecache and I/O vectors
- * @ra: file_ra_state which holds the readahead state
- * @filp: passed on to ->readpage() and ->readpages()
- * @page: The page at @index which triggered the readahead call.
- * @index: Index of first page to be read.
- * @req_count: Total number of pages being read by the caller.
- *
- * page_cache_async_readahead() should be called when a page is used which
- * is marked as PageReadahead; this is a marker to suggest that the application
- * has used up enough of the readahead window that we should start pulling in
- * more pages.
- */
-void
-page_cache_async_readahead(struct address_space *mapping,
- struct file_ra_state *ra, struct file *filp,
- struct page *page, pgoff_t index,
- unsigned long req_count)
+void page_cache_async_ra(struct readahead_control *ractl,
+ struct file_ra_state *ra, struct page *page,
+ unsigned long req_count)
{
/* no read-ahead */
if (!ra->ra_pages)
@@ -621,16 +597,16 @@ page_cache_async_readahead(struct address_space *mapping,
/*
* Defer asynchronous read-ahead on IO congestion.
*/
- if (inode_read_congested(mapping->host))
+ if (inode_read_congested(ractl->mapping->host))
return;
if (blk_cgroup_congested())
return;
/* do read-ahead */
- ondemand_readahead(mapping, ra, filp, true, index, req_count);
+ ondemand_readahead(ractl, ra, true, req_count);
}
-EXPORT_SYMBOL_GPL(page_cache_async_readahead);
+EXPORT_SYMBOL_GPL(page_cache_async_ra);
ssize_t ksys_readahead(int fd, loff_t offset, size_t count)
{
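
All readahead entry points now take a struct readahead_control that bundles the mapping, the file and the start index, mirroring the on-stack rac that page_cache_ra_unbounded() used to build internally. A hedged sketch of how a caller sets one up (field names follow the initializer removed above; the function itself is illustrative):

    static void example_start_readahead(struct file *file, pgoff_t index,
                                        unsigned long req_count)
    {
            struct readahead_control ractl = {
                    .mapping = file->f_mapping,
                    .file = file,
                    ._index = index,
            };

            page_cache_sync_ra(&ractl, &file->f_ra, req_count);
    }
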
diff --git a/mm/rmap.c b/mm/rmap.c
index 9425260774a1..1b84945d655c 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1205,7 +1205,7 @@ void page_add_file_rmap(struct page *page, bool compound)
VM_BUG_ON_PAGE(compound && !PageTransHuge(page), page);
lock_page_memcg(page);
if (compound && PageTransHuge(page)) {
- for (i = 0, nr = 0; i < HPAGE_PMD_NR; i++) {
+ for (i = 0, nr = 0; i < thp_nr_pages(page); i++) {
if (atomic_inc_and_test(&page[i]._mapcount))
nr++;
}
@@ -1246,7 +1246,7 @@ static void page_remove_file_rmap(struct page *page, bool compound)
/* page still mapped by someone else? */
if (compound && PageTransHuge(page)) {
- for (i = 0, nr = 0; i < HPAGE_PMD_NR; i++) {
+ for (i = 0, nr = 0; i < thp_nr_pages(page); i++) {
if (atomic_add_negative(-1, &page[i]._mapcount))
nr++;
}
@@ -1293,7 +1293,7 @@ static void page_remove_anon_compound_rmap(struct page *page)
* Subpages can be mapped with PTEs too. Check how many of
* them are still mapped.
*/
- for (i = 0, nr = 0; i < HPAGE_PMD_NR; i++) {
+ for (i = 0, nr = 0; i < thp_nr_pages(page); i++) {
if (atomic_add_negative(-1, &page[i]._mapcount))
nr++;
}
@@ -1303,10 +1303,10 @@ static void page_remove_anon_compound_rmap(struct page *page)
* page of the compound page is unmapped, but at least one
* small page is still mapped.
*/
- if (nr && nr < HPAGE_PMD_NR)
+ if (nr && nr < thp_nr_pages(page))
deferred_split_huge_page(page);
} else {
- nr = HPAGE_PMD_NR;
+ nr = thp_nr_pages(page);
}
if (unlikely(PageMlocked(page)))
diff --git a/mm/shmem.c b/mm/shmem.c
index 8e2b35ba93ad..537c137698f8 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1736,6 +1736,12 @@ static int shmem_swapin_page(struct inode *inode, pgoff_t index,
}
wait_on_page_writeback(page);
+ /*
+ * Some architectures may have to restore extra metadata to the
+ * physical page after reading from swap.
+ */
+ arch_swap_restore(swap, page);
+
if (shmem_should_replace_page(page, gfp)) {
error = shmem_replace_page(&page, gfp, info, index);
if (error)
@@ -1824,6 +1830,8 @@ repeat:
return error;
}
+ if (page)
+ hindex = page->index;
if (page && sgp == SGP_WRITE)
mark_page_accessed(page);
@@ -1834,11 +1842,10 @@ repeat:
unlock_page(page);
put_page(page);
page = NULL;
+ hindex = index;
}
- if (page || sgp == SGP_READ) {
- *pagep = page;
- return 0;
- }
+ if (page || sgp == SGP_READ)
+ goto out;
/*
* Fast cache lookup did not find it:
@@ -1963,14 +1970,13 @@ clear:
* it now, lest undo on failure cancel our earlier guarantee.
*/
if (sgp != SGP_WRITE && !PageUptodate(page)) {
- struct page *head = compound_head(page);
int i;
- for (i = 0; i < compound_nr(head); i++) {
- clear_highpage(head + i);
- flush_dcache_page(head + i);
+ for (i = 0; i < compound_nr(page); i++) {
+ clear_highpage(page + i);
+ flush_dcache_page(page + i);
}
- SetPageUptodate(head);
+ SetPageUptodate(page);
}
/* Perhaps the file has been truncated since we checked */
@@ -1986,6 +1992,7 @@ clear:
error = -EINVAL;
goto unlock;
}
+out:
*pagep = page + index - hindex;
return 0;
@@ -2269,6 +2276,9 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
vma->vm_flags &= ~(VM_MAYWRITE);
}
+ /* arm64 - allow memory tagging on RAM-based files */
+ vma->vm_flags |= VM_MTE_ALLOWED;
+
file_accessed(file);
vma->vm_ops = &shmem_vm_ops;
if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
@@ -3974,7 +3984,7 @@ static struct file_system_type shmem_fs_type = {
.parameters = shmem_fs_parameters,
#endif
.kill_sb = kill_litter_super,
- .fs_flags = FS_USERNS_MOUNT,
+ .fs_flags = FS_USERNS_MOUNT | FS_THP_SUPPORT,
};
int __init shmem_init(void)
diff --git a/mm/shuffle.c b/mm/shuffle.c
index 9b5cd4b004b0..9c2e145a747a 100644
--- a/mm/shuffle.c
+++ b/mm/shuffle.c
@@ -60,7 +60,7 @@ static struct page * __meminit shuffle_valid_page(struct zone *zone,
* ...is the page on the same list as the page we will
* shuffle it with?
*/
- if (page_order(page) != order)
+ if (buddy_order(page) != order)
return NULL;
return page;
diff --git a/mm/slab.c b/mm/slab.c
index f658e86ec8ce..b1113561b98b 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1062,7 +1062,7 @@ int slab_prepare_cpu(unsigned int cpu)
* Even if all the cpus of a node are down, we don't free the
* kmem_cache_node of any cache. This to avoid a race between cpu_down, and
* a kmalloc allocation from another cpu for memory from the node of
- * the cpu going down. The list3 structure is usually allocated from
+ * the cpu going down. The kmem_cache_node structure is usually allocated from
* kmem_cache_create() and gets destroyed at kmem_cache_destroy().
*/
int slab_dead_cpu(unsigned int cpu)
@@ -2305,8 +2305,6 @@ static void *alloc_slabmgmt(struct kmem_cache *cachep,
/* Slab management obj is off-slab. */
freelist = kmem_cache_alloc_node(cachep->freelist_cache,
local_flags, nodeid);
- if (!freelist)
- return NULL;
} else {
/* We will use last bytes at the slab for freelist */
freelist = addr + (PAGE_SIZE << cachep->gfporder) -
@@ -3440,7 +3438,7 @@ void ___cache_free(struct kmem_cache *cachep, void *objp,
memset(objp, 0, cachep->object_size);
kmemleak_free_recursive(objp, cachep->flags);
objp = cache_free_debugcheck(cachep, objp, caller);
- memcg_slab_free_hook(cachep, virt_to_head_page(objp), objp);
+ memcg_slab_free_hook(cachep, &objp, 1);
/*
* Skip calling cache_free_alien() when the platform is not numa.
diff --git a/mm/slab.h b/mm/slab.h
index 6cc323f1313a..6d7c6a5056ba 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -46,7 +46,6 @@ struct kmem_cache {
#include <linux/kmemleak.h>
#include <linux/random.h>
#include <linux/sched/mm.h>
-#include <linux/kmemleak.h>
/*
* State of the slab allocator.
@@ -281,9 +280,6 @@ static inline struct obj_cgroup *memcg_slab_pre_alloc_hook(struct kmem_cache *s,
{
struct obj_cgroup *objcg;
- if (memcg_kmem_bypass())
- return NULL;
-
objcg = get_obj_cgroup_from_current();
if (!objcg)
return NULL;
@@ -345,30 +341,42 @@ static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s,
obj_cgroup_put(objcg);
}
-static inline void memcg_slab_free_hook(struct kmem_cache *s, struct page *page,
- void *p)
+static inline void memcg_slab_free_hook(struct kmem_cache *s_orig,
+ void **p, int objects)
{
+ struct kmem_cache *s;
struct obj_cgroup *objcg;
+ struct page *page;
unsigned int off;
+ int i;
if (!memcg_kmem_enabled())
return;
- if (!page_has_obj_cgroups(page))
- return;
+ for (i = 0; i < objects; i++) {
+ if (unlikely(!p[i]))
+ continue;
- off = obj_to_index(s, page, p);
- objcg = page_obj_cgroups(page)[off];
- page_obj_cgroups(page)[off] = NULL;
+ page = virt_to_head_page(p[i]);
+ if (!page_has_obj_cgroups(page))
+ continue;
- if (!objcg)
- return;
+ if (!s_orig)
+ s = page->slab_cache;
+ else
+ s = s_orig;
- obj_cgroup_uncharge(objcg, obj_full_size(s));
- mod_objcg_state(objcg, page_pgdat(page), cache_vmstat_idx(s),
- -obj_full_size(s));
+ off = obj_to_index(s, page, p[i]);
+ objcg = page_obj_cgroups(page)[off];
+ if (!objcg)
+ continue;
- obj_cgroup_put(objcg);
+ page_obj_cgroups(page)[off] = NULL;
+ obj_cgroup_uncharge(objcg, obj_full_size(s));
+ mod_objcg_state(objcg, page_pgdat(page), cache_vmstat_idx(s),
+ -obj_full_size(s));
+ obj_cgroup_put(objcg);
+ }
}
#else /* CONFIG_MEMCG_KMEM */
@@ -406,8 +414,8 @@ static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s,
{
}
-static inline void memcg_slab_free_hook(struct kmem_cache *s, struct page *page,
- void *p)
+static inline void memcg_slab_free_hook(struct kmem_cache *s,
+ void **p, int objects)
{
}
#endif /* CONFIG_MEMCG_KMEM */
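
memcg_slab_free_hook() now takes an array of object pointers plus a count so that bulk free paths can uncharge everything in one pass; when the cache argument is NULL the hook derives the cache from each object's page (page->slab_cache). A hedged sketch of a caller:

    static void example_bulk_uncharge(struct kmem_cache *s, void **objs, int nr)
    {
            /* NULL entries in the array are skipped by the hook itself. */
            memcg_slab_free_hook(s, objs, nr);
    }
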
diff --git a/mm/slub.c b/mm/slub.c
index 6d3574013b2f..b30be2385d1c 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1956,7 +1956,7 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
/*
* Racy check. If we mistakenly see no partial slabs then we
* just allocate an empty slab. If we mistakenly try to get a
- * partial slab and there is none available then get_partials()
+ * partial slab and there is none available then get_partial()
* will return NULL.
*/
if (!n || !n->nr_partial)
@@ -2245,7 +2245,8 @@ redo:
}
} else {
m = M_FULL;
- if (kmem_cache_debug(s) && !lock) {
+#ifdef CONFIG_SLUB_DEBUG
+ if ((s->flags & SLAB_STORE_USER) && !lock) {
lock = 1;
/*
* This also ensures that the scanning of full
@@ -2254,6 +2255,7 @@ redo:
*/
spin_lock(&n->list_lock);
}
+#endif
}
if (l != m) {
@@ -2661,6 +2663,8 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
void *freelist;
struct page *page;
+ stat(s, ALLOC_SLOWPATH);
+
page = c->page;
if (!page) {
/*
@@ -2850,7 +2854,6 @@ redo:
page = c->page;
if (unlikely(!object || !node_match(page, node))) {
object = __slab_alloc(s, gfpflags, node, addr, c);
- stat(s, ALLOC_SLOWPATH);
} else {
void *next_object = get_freepointer_safe(s, object);
@@ -3019,20 +3022,21 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
if (likely(!n)) {
- /*
- * If we just froze the page then put it onto the
- * per cpu partial list.
- */
- if (new.frozen && !was_frozen) {
+ if (likely(was_frozen)) {
+ /*
+ * The list lock was not taken therefore no list
+ * activity can be necessary.
+ */
+ stat(s, FREE_FROZEN);
+ } else if (new.frozen) {
+ /*
+ * If we just froze the page then put it onto the
+ * per cpu partial list.
+ */
put_cpu_partial(s, page, 1);
stat(s, CPU_PARTIAL_FREE);
}
- /*
- * The list lock was not taken therefore no list
- * activity can be necessary.
- */
- if (was_frozen)
- stat(s, FREE_FROZEN);
+
return;
}
@@ -3091,7 +3095,7 @@ static __always_inline void do_slab_free(struct kmem_cache *s,
struct kmem_cache_cpu *c;
unsigned long tid;
- memcg_slab_free_hook(s, page, head);
+ memcg_slab_free_hook(s, &head, 1);
redo:
/*
* Determine the currently cpus per cpu slab.
@@ -3253,6 +3257,7 @@ void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
if (WARN_ON(!size))
return;
+ memcg_slab_free_hook(s, p, size);
do {
struct detached_freelist df;
diff --git a/mm/sparse.c b/mm/sparse.c
index fcc3d176f1ea..7bd23f9d6cef 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -291,13 +291,11 @@ static void __init memory_present(int nid, unsigned long start, unsigned long en
*/
static void __init memblocks_present(void)
{
- struct memblock_region *reg;
+ unsigned long start, end;
+ int i, nid;
- for_each_memblock(memory, reg) {
- memory_present(memblock_get_region_node(reg),
- memblock_region_memory_base_pfn(reg),
- memblock_region_memory_end_pfn(reg));
- }
+ for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid)
+ memory_present(nid, start, end);
}
/*
@@ -314,6 +312,7 @@ static unsigned long sparse_encode_mem_map(struct page *mem_map, unsigned long p
return coded_mem_map;
}
+#ifdef CONFIG_MEMORY_HOTPLUG
/*
* Decode mem_map from the coded memmap
*/
@@ -323,6 +322,7 @@ struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pn
coded_mem_map &= SECTION_MAP_MASK;
return ((struct page *)coded_mem_map) + section_nr_to_pfn(pnum);
}
+#endif /* CONFIG_MEMORY_HOTPLUG */
static void __meminit sparse_init_one_section(struct mem_section *ms,
unsigned long pnum, struct page *mem_map,
diff --git a/mm/swap.c b/mm/swap.c
index e7bdf094f76a..47a47681c86b 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -348,7 +348,7 @@ static bool need_activate_page_drain(int cpu)
return pagevec_count(&per_cpu(lru_pvecs.activate_page, cpu)) != 0;
}
-void activate_page(struct page *page)
+static void activate_page(struct page *page)
{
page = compound_head(page);
if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
@@ -368,7 +368,7 @@ static inline void activate_page_drain(int cpu)
{
}
-void activate_page(struct page *page)
+static void activate_page(struct page *page)
{
pg_data_t *pgdat = page_pgdat(page);
@@ -481,9 +481,7 @@ EXPORT_SYMBOL(lru_cache_add);
* @vma: vma in which page is mapped for determining reclaimability
*
* Place @page on the inactive or unevictable LRU list, depending on its
- * evictability. Note that if the page is not evictable, it goes
- * directly back onto it's zone's unevictable list, it does NOT use a
- * per cpu pagevec.
+ * evictability.
*/
void lru_cache_add_inactive_or_unevictable(struct page *page,
struct vm_area_struct *vma)
@@ -763,10 +761,20 @@ static void lru_add_drain_per_cpu(struct work_struct *dummy)
*/
void lru_add_drain_all(void)
{
- static seqcount_t seqcount = SEQCNT_ZERO(seqcount);
- static DEFINE_MUTEX(lock);
+ /*
+ * lru_drain_gen - Global pages generation number
+ *
+ * (A) Definition: global lru_drain_gen = x implies that all generations
+ * 0 < n <= x are already *scheduled* for draining.
+ *
+ * This is an optimization for the highly-contended use case where a
+ * user space workload keeps constantly generating a flow of pages for
+ * each CPU.
+ */
+ static unsigned int lru_drain_gen;
static struct cpumask has_work;
- int cpu, seq;
+ static DEFINE_MUTEX(lock);
+ unsigned cpu, this_gen;
/*
* Make sure nobody triggers this path before mm_percpu_wq is fully
@@ -775,21 +783,54 @@ void lru_add_drain_all(void)
if (WARN_ON(!mm_percpu_wq))
return;
- seq = raw_read_seqcount_latch(&seqcount);
+ /*
+ * Guarantee pagevec counter stores visible by this CPU are visible to
+ * other CPUs before loading the current drain generation.
+ */
+ smp_mb();
+
+ /*
+ * (B) Locally cache global LRU draining generation number
+ *
+ * The read barrier ensures that the counter is loaded before the mutex
+ * is taken. It pairs with smp_mb() inside the mutex critical section
+ * at (D).
+ */
+ this_gen = smp_load_acquire(&lru_drain_gen);
mutex_lock(&lock);
/*
- * Piggyback on drain started and finished while we waited for lock:
- * all pages pended at the time of our enter were drained from vectors.
+ * (C) Exit the draining operation if a newer generation, from another
+ * lru_add_drain_all(), was already scheduled for draining. Check (A).
*/
- if (__read_seqcount_retry(&seqcount, seq))
+ if (unlikely(this_gen != lru_drain_gen))
goto done;
- raw_write_seqcount_latch(&seqcount);
+ /*
+ * (D) Increment global generation number
+ *
+ * Pairs with smp_load_acquire() at (B), outside of the critical
+ * section. Use a full memory barrier to guarantee that the new global
+ * drain generation number is stored before loading pagevec counters.
+ *
+ * This pairing must be done here, before the for_each_online_cpu loop
+ * below which drains the page vectors.
+ *
+ * Let x, y, and z represent some system CPU numbers, where x < y < z.
+	 * Assume CPU #z is in the middle of the for_each_online_cpu loop
+ * below and has already reached CPU #y's per-cpu data. CPU #x comes
+ * along, adds some pages to its per-cpu vectors, then calls
+ * lru_add_drain_all().
+ *
+ * If the paired barrier is done at any later step, e.g. after the
+ * loop, CPU #x will just exit at (C) and miss flushing out all of its
+ * added pages.
+ */
+ WRITE_ONCE(lru_drain_gen, lru_drain_gen + 1);
+ smp_mb();
cpumask_clear(&has_work);
-
for_each_online_cpu(cpu) {
struct work_struct *work = &per_cpu(lru_add_drain_work, cpu);
@@ -801,7 +842,7 @@ void lru_add_drain_all(void)
need_activate_page_drain(cpu)) {
INIT_WORK(work, lru_add_drain_per_cpu);
queue_work_on(cpu, mm_percpu_wq, work);
- cpumask_set_cpu(cpu, &has_work);
+ __cpumask_set_cpu(cpu, &has_work);
}
}
@@ -816,7 +857,7 @@ void lru_add_drain_all(void)
{
lru_add_drain();
}
-#endif
+#endif /* CONFIG_SMP */
/**
* release_pages - batched put_page()
@@ -848,6 +889,7 @@ void release_pages(struct page **pages, int nr)
locked_pgdat = NULL;
}
+ page = compound_head(page);
if (is_huge_zero_page(page))
continue;
@@ -859,7 +901,7 @@ void release_pages(struct page **pages, int nr)
}
/*
* ZONE_DEVICE pages that return 'false' from
- * put_devmap_managed_page() do not require special
+ * page_is_devmap_managed() do not require special
* processing, and instead, expect a call to
* put_page_testzero().
*/
@@ -869,7 +911,6 @@ void release_pages(struct page **pages, int nr)
}
}
- page = compound_head(page);
if (!put_page_testzero(page))
continue;
@@ -900,8 +941,6 @@ void release_pages(struct page **pages, int nr)
del_page_from_lru_list(page, lruvec, page_off_lru(page));
}
- /* Clear Active bit in case of parallel mark_page_accessed */
- __ClearPageActive(page);
__ClearPageWaiters(page);
list_add(&page->lru, &pages_to_free);
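
The seqcount-based piggybacking in lru_add_drain_all() is replaced by an explicit generation counter: snapshot the generation, take the mutex, bail out if a newer drain was already scheduled while waiting, otherwise bump the generation and do the work. A minimal sketch of that pattern with hypothetical names (not part of the patch):

    #include <linux/mutex.h>

    static unsigned int work_gen;
    static DEFINE_MUTEX(work_lock);

    static void do_global_work_once(void)
    {
            unsigned int this_gen;

            smp_mb();                                /* order earlier stores before the load */
            this_gen = smp_load_acquire(&work_gen);  /* (B) snapshot the generation */

            mutex_lock(&work_lock);
            if (this_gen != work_gen)                /* (C) a newer pass already covers us */
                    goto done;

            WRITE_ONCE(work_gen, work_gen + 1);      /* (D) publish the new generation */
            smp_mb();
            /* ... perform the expensive global operation ... */
    done:
            mutex_unlock(&work_lock);
    }
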
diff --git a/mm/swap_slots.c b/mm/swap_slots.c
index 3e6453573a89..0357fbe70645 100644
--- a/mm/swap_slots.c
+++ b/mm/swap_slots.c
@@ -237,7 +237,7 @@ static int free_slot_cache(unsigned int cpu)
return 0;
}
-int enable_swap_slots_cache(void)
+void enable_swap_slots_cache(void)
{
mutex_lock(&swap_slots_cache_enable_mutex);
if (!swap_slot_cache_initialized) {
@@ -255,7 +255,6 @@ int enable_swap_slots_cache(void)
__reenable_swap_slots_cache();
out_unlock:
mutex_unlock(&swap_slots_cache_enable_mutex);
- return 0;
}
/* called with swap slot cache's alloc lock held */
diff --git a/mm/swap_state.c b/mm/swap_state.c
index c16eebb81d8b..ee465827420e 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -21,6 +21,7 @@
#include <linux/vmalloc.h>
#include <linux/swap_slots.h>
#include <linux/huge_mm.h>
+#include <linux/shmem_fs.h>
#include "internal.h"
/*
@@ -245,7 +246,7 @@ int add_to_swap(struct page *page)
goto fail;
/*
* Normally the page will be dirtied in unmap because its pte should be
- * dirty. A special case is MADV_FREE page. The page'e pte could have
+ * dirty. A special case is MADV_FREE page. The page's pte could have
* dirty bit cleared but the page's SwapBacked bit is still set because
* clearing the dirty bit and SwapBacked bit has no lock protected. For
* such page, unmap will not set dirty bit for it, so page reclaim will
@@ -414,6 +415,39 @@ struct page *lookup_swap_cache(swp_entry_t entry, struct vm_area_struct *vma,
return page;
}
+/**
+ * find_get_incore_page - Find and get a page from the page or swap caches.
+ * @mapping: The address_space to search.
+ * @index: The page cache index.
+ *
+ * This differs from find_get_page() in that it will also look for the
+ * page in the swap cache.
+ *
+ * Return: The found page or %NULL.
+ */
+struct page *find_get_incore_page(struct address_space *mapping, pgoff_t index)
+{
+ swp_entry_t swp;
+ struct swap_info_struct *si;
+ struct page *page = find_get_entry(mapping, index);
+
+ if (!page)
+ return page;
+ if (!xa_is_value(page))
+ return find_subpage(page, index);
+ if (!shmem_mapping(mapping))
+ return NULL;
+
+ swp = radix_to_swp_entry(page);
+ /* Prevent swapoff from happening to us */
+ si = get_swap_device(swp);
+ if (!si)
+ return NULL;
+ page = find_get_page(swap_address_space(swp), swp_offset(swp));
+ put_swap_device(si);
+ return page;
+}
+
struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
struct vm_area_struct *vma, unsigned long addr,
bool *new_page_allocated)
@@ -631,7 +665,7 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
goto skip;
/* Test swap type to make sure the dereference is safe */
- if (likely(si->flags & (SWP_BLKDEV | SWP_FS))) {
+ if (likely(si->flags & (SWP_BLKDEV | SWP_FS_OPS))) {
struct inode *inode = si->swap_file->f_mapping->host;
if (inode_read_congested(inode))
goto skip;
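
find_get_incore_page() is the building block for callers, such as mincore-style residency checks, that want the page whether it sits in the page cache or, for shmem, in the swap cache. A hedged sketch of a residency test on top of it (illustrative helper name):

    static bool example_page_is_incore(struct address_space *mapping, pgoff_t index)
    {
            struct page *page = find_get_incore_page(mapping, index);

            if (!page)
                    return false;
            /* The helper returns a referenced page; drop it once inspected. */
            put_page(page);
            return true;
    }
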
diff --git a/mm/swapfile.c b/mm/swapfile.c
index debc94155f74..c4a613688a17 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -717,6 +717,7 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
else
swap_slot_free_notify = NULL;
while (offset <= end) {
+ arch_swap_invalidate_page(si->type, offset);
frontswap_invalidate_page(si->type, offset);
if (swap_slot_free_notify)
swap_slot_free_notify(si->bdev, offset);
@@ -1183,7 +1184,6 @@ static struct swap_info_struct *_swap_info_get(swp_entry_t entry)
bad_free:
pr_err("swap_info_get: %s%08lx\n", Unused_offset, entry.val);
- goto out;
out:
return NULL;
}
@@ -1801,13 +1801,12 @@ int free_swap_and_cache(swp_entry_t entry)
*
* This is needed for the suspend to disk (aka swsusp).
*/
-int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p)
+int swap_type_of(dev_t device, sector_t offset)
{
- struct block_device *bdev = NULL;
int type;
- if (device)
- bdev = bdget(device);
+ if (!device)
+ return -1;
spin_lock(&swap_lock);
for (type = 0; type < nr_swapfiles; type++) {
@@ -1816,30 +1815,34 @@ int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p)
if (!(sis->flags & SWP_WRITEOK))
continue;
- if (!bdev) {
- if (bdev_p)
- *bdev_p = bdgrab(sis->bdev);
-
- spin_unlock(&swap_lock);
- return type;
- }
- if (bdev == sis->bdev) {
+ if (device == sis->bdev->bd_dev) {
struct swap_extent *se = first_se(sis);
if (se->start_block == offset) {
- if (bdev_p)
- *bdev_p = bdgrab(sis->bdev);
-
spin_unlock(&swap_lock);
- bdput(bdev);
return type;
}
}
}
spin_unlock(&swap_lock);
- if (bdev)
- bdput(bdev);
+ return -ENODEV;
+}
+
+int find_first_swap(dev_t *device)
+{
+ int type;
+
+ spin_lock(&swap_lock);
+ for (type = 0; type < nr_swapfiles; type++) {
+ struct swap_info_struct *sis = swap_info[type];
+ if (!(sis->flags & SWP_WRITEOK))
+ continue;
+ *device = sis->bdev->bd_dev;
+ spin_unlock(&swap_lock);
+ return type;
+ }
+ spin_unlock(&swap_lock);
return -ENODEV;
}
@@ -1925,11 +1928,6 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
lru_cache_add_inactive_or_unevictable(page, vma);
}
swap_free(entry);
- /*
- * Move the page to the active list so it is not
- * immediately swapped out again after swapon.
- */
- activate_page(page);
out:
pte_unmap_unlock(pte, ptl);
if (page != swapcache) {
@@ -2433,7 +2431,7 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
if (ret >= 0)
sis->flags |= SWP_ACTIVATED;
if (!ret) {
- sis->flags |= SWP_FS;
+ sis->flags |= SWP_FS_OPS;
ret = add_swap_extent(sis, 0, sis->max, 0);
*span = sis->pages;
}
@@ -2682,6 +2680,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
frontswap_map = frontswap_map_get(p);
spin_unlock(&p->lock);
spin_unlock(&swap_lock);
+ arch_swap_invalidate_area(p->type);
frontswap_invalidate_area(p->type);
frontswap_map_set(p, NULL);
mutex_unlock(&swapon_mutex);
@@ -2920,10 +2919,10 @@ static int claim_swapfile(struct swap_info_struct *p, struct inode *inode)
int error;
if (S_ISBLK(inode->i_mode)) {
- p->bdev = bdgrab(I_BDEV(inode));
- error = blkdev_get(p->bdev,
+ p->bdev = blkdev_get_by_dev(inode->i_rdev,
FMODE_READ | FMODE_WRITE | FMODE_EXCL, p);
- if (error < 0) {
+ if (IS_ERR(p->bdev)) {
+ error = PTR_ERR(p->bdev);
p->bdev = NULL;
return error;
}
@@ -3234,10 +3233,10 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
goto bad_swap_unlock_inode;
}
- if (bdi_cap_stable_pages_required(inode_to_bdi(inode)))
+ if (p->bdev && blk_queue_stable_writes(p->bdev->bd_disk->queue))
p->flags |= SWP_STABLE_WRITES;
- if (bdi_cap_synchronous_io(inode_to_bdi(inode)))
+ if (p->bdev && p->bdev->bd_disk->fops->rw_page)
p->flags |= SWP_SYNCHRONOUS_IO;
if (p->bdev && blk_queue_nonrot(bdev_get_queue(p->bdev))) {
@@ -3343,7 +3342,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
error = inode_drain_writes(inode);
if (error) {
inode->i_flags &= ~S_SWAPFILE;
- goto bad_swap_unlock_inode;
+ goto free_swap_address_space;
}
mutex_lock(&swapon_mutex);
@@ -3368,6 +3367,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
error = 0;
goto out;
+free_swap_address_space:
+ exit_swap_address_space(p->type);
bad_swap_unlock_inode:
inode_unlock(inode);
bad_swap:
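
swap_type_of() now requires a device number and no longer hands back a block_device, while the new find_first_swap() covers the "no device specified" case and reports which device it picked. A hedged sketch of how a hibernation-style caller might combine them (illustrative, not the actual swsusp code):

    static int example_locate_resume_swap(dev_t *dev, sector_t offset)
    {
            if (*dev)
                    return swap_type_of(*dev, offset);  /* negative if not a swap device */

            /* No device given: pick the first writable swap device and report
             * its dev_t back for a later blkdev_get_by_dev().
             */
            return find_first_swap(dev);
    }
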
diff --git a/mm/truncate.c b/mm/truncate.c
index dd9ebc1da356..18cec39a9f53 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -168,7 +168,7 @@ void do_invalidatepage(struct page *page, unsigned int offset,
* becomes orphaned. It will be left on the LRU and may even be mapped into
* user pagetables if we're racing with filemap_fault().
*
- * We need to bale out if page->mapping is no longer equal to the original
+ * We need to bail out if page->mapping is no longer equal to the original
* mapping. This happens a) when the VM reclaimed the page while we waited on
* its lock, b) when a concurrent invalidate_mapping_pages got there first and
* c) when tmpfs swizzles a page between a tmpfs inode and swapper_space.
@@ -177,12 +177,12 @@ static void
truncate_cleanup_page(struct address_space *mapping, struct page *page)
{
if (page_mapped(page)) {
- pgoff_t nr = PageTransHuge(page) ? HPAGE_PMD_NR : 1;
+ unsigned int nr = thp_nr_pages(page);
unmap_mapping_pages(mapping, page->index, nr, false);
}
if (page_has_private(page))
- do_invalidatepage(page, 0, PAGE_SIZE);
+ do_invalidatepage(page, 0, thp_size(page));
/*
* Some filesystems seem to re-dirty the page even after
@@ -528,23 +528,8 @@ void truncate_inode_pages_final(struct address_space *mapping)
}
EXPORT_SYMBOL(truncate_inode_pages_final);
-/**
- * invalidate_mapping_pages - Invalidate all the unlocked pages of one inode
- * @mapping: the address_space which holds the pages to invalidate
- * @start: the offset 'from' which to invalidate
- * @end: the offset 'to' which to invalidate (inclusive)
- *
- * This function only removes the unlocked pages, if you want to
- * remove all the pages of one inode, you must call truncate_inode_pages.
- *
- * invalidate_mapping_pages() will not block on IO activity. It will not
- * invalidate pages which are dirty, locked, under writeback or mapped into
- * pagetables.
- *
- * Return: the number of the pages that were invalidated
- */
-unsigned long invalidate_mapping_pages(struct address_space *mapping,
- pgoff_t start, pgoff_t end)
+unsigned long __invalidate_mapping_pages(struct address_space *mapping,
+ pgoff_t start, pgoff_t end, unsigned long *nr_pagevec)
{
pgoff_t indices[PAGEVEC_SIZE];
struct pagevec pvec;
@@ -610,8 +595,13 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
* Invalidation is a hint that the page is no longer
* of interest and try to speed up its reclaim.
*/
- if (!ret)
+ if (!ret) {
deactivate_file_page(page);
+ /* It is likely on the pagevec of a remote CPU */
+ if (nr_pagevec)
+ (*nr_pagevec)++;
+ }
+
if (PageTransHuge(page))
put_page(page);
count += ret;
@@ -623,8 +613,40 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
}
return count;
}
+
+/**
+ * invalidate_mapping_pages - Invalidate all the unlocked pages of one inode
+ * @mapping: the address_space which holds the pages to invalidate
+ * @start: the offset 'from' which to invalidate
+ * @end: the offset 'to' which to invalidate (inclusive)
+ *
+ * This function only removes the unlocked pages, if you want to
+ * remove all the pages of one inode, you must call truncate_inode_pages.
+ *
+ * invalidate_mapping_pages() will not block on IO activity. It will not
+ * invalidate pages which are dirty, locked, under writeback or mapped into
+ * pagetables.
+ *
+ * Return: the number of the pages that were invalidated
+ */
+unsigned long invalidate_mapping_pages(struct address_space *mapping,
+ pgoff_t start, pgoff_t end)
+{
+ return __invalidate_mapping_pages(mapping, start, end, NULL);
+}
EXPORT_SYMBOL(invalidate_mapping_pages);
+/**
+ * This helper is similar to invalidate_mapping_pages(), except that it
+ * accounts for pages that are likely on a pagevec and counts them in
+ * @nr_pagevec, which will be used by the caller.
+ */
+void invalidate_mapping_pagevec(struct address_space *mapping,
+ pgoff_t start, pgoff_t end, unsigned long *nr_pagevec)
+{
+ __invalidate_mapping_pages(mapping, start, end, nr_pagevec);
+}
+
/*
* This is like invalidate_complete_page(), except it ignores the page's
* refcount. We do this because invalidate_inode_pages2() needs stronger
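
invalidate_mapping_pagevec() behaves like invalidate_mapping_pages() but additionally reports how many pages could not be freed only because they still sit on some CPU's pagevec, letting the caller decide whether draining and retrying is worthwhile. A hedged sketch of such a caller (whether to retry is illustrative, not mandated by this patch):

    static void example_invalidate(struct address_space *mapping,
                                   pgoff_t start, pgoff_t end)
    {
            unsigned long nr_pagevec = 0;

            invalidate_mapping_pagevec(mapping, start, end, &nr_pagevec);

            /* Pages pinned only by remote pagevecs: drain and try once more. */
            if (nr_pagevec) {
                    lru_add_drain_all();
                    invalidate_mapping_pages(mapping, start, end);
            }
    }
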
diff --git a/mm/util.c b/mm/util.c
index 5ef378a2a038..4ddb6e186dd5 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -69,7 +69,8 @@ EXPORT_SYMBOL(kstrdup);
* @s: the string to duplicate
* @gfp: the GFP mask used in the kmalloc() call when allocating memory
*
- * Note: Strings allocated by kstrdup_const should be freed by kfree_const.
+ * Note: Strings allocated by kstrdup_const should be freed by kfree_const and
+ * must not be passed to krealloc().
*
* Return: source string if it is in .rodata section otherwise
* fallback to kstrdup.
@@ -957,7 +958,7 @@ out:
return res;
}
-int memcmp_pages(struct page *page1, struct page *page2)
+int __weak memcmp_pages(struct page *page1, struct page *page2)
{
char *addr1, *addr2;
int ret;
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index be4724b916b3..6ae491a8b210 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1,7 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
- * linux/mm/vmalloc.c
- *
* Copyright (C) 1993 Linus Torvalds
* Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
* SMP-safe vmalloc/vfree/ioremap, Tigran Aivazian <tigran@veritas.com>, May 2000
@@ -2133,7 +2131,7 @@ struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags,
* It is up to the caller to do all required locking to keep the returned
* pointer valid.
*
- * Return: pointer to the found area or %NULL on faulure
+ * Return: the area descriptor on success or %NULL on failure.
*/
struct vm_struct *find_vm_area(const void *addr)
{
@@ -2154,7 +2152,7 @@ struct vm_struct *find_vm_area(const void *addr)
* This function returns the found VM area, but using it is NOT safe
* on SMP machines, except for its size or flags.
*
- * Return: pointer to the found area or %NULL on faulure
+ * Return: the area descriptor on success or %NULL on failure.
*/
struct vm_struct *remove_vm_area(const void *addr)
{
@@ -2321,20 +2319,21 @@ static void __vfree(const void *addr)
}
/**
- * vfree - release memory allocated by vmalloc()
- * @addr: memory base address
+ * vfree - Release memory allocated by vmalloc()
+ * @addr: Memory base address
*
- * Free the virtually continuous memory area starting at @addr, as
- * obtained from vmalloc(), vmalloc_32() or __vmalloc(). If @addr is
- * NULL, no operation is performed.
+ * Free the virtually continuous memory area starting at @addr, as obtained
+ * from one of the vmalloc() family of APIs. This will usually also free the
+ * physical memory underlying the virtual allocation, but that memory is
+ * reference counted, so it will not be freed until the last user goes away.
*
- * Must not be called in NMI context (strictly speaking, only if we don't
- * have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling
- * conventions for vfree() arch-depenedent would be a really bad idea)
+ * If @addr is NULL, no operation is performed.
*
+ * Context:
* May sleep if called *not* from interrupt context.
- *
- * NOTE: assumes that the object at @addr has a size >= sizeof(llist_node)
+ * Must not be called in NMI context (strictly speaking, it could be
+ * if we have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling
+ * conventions for vfree() arch-dependent would be a really bad idea).
*/
void vfree(const void *addr)
{
@@ -2376,8 +2375,11 @@ EXPORT_SYMBOL(vunmap);
* @flags: vm_area->flags
* @prot: page protection for the mapping
*
- * Maps @count pages from @pages into contiguous kernel virtual
- * space.
+ * Maps @count pages from @pages into contiguous kernel virtual space.
+ * If @flags contains %VM_MAP_PUT_PAGES the ownership of the pages array itself
+ * (which must be kmalloc or vmalloc memory) and one reference per page in it
+ * are transferred from the caller to vmap(), and will be freed / dropped when
+ * vfree() is called on the return value.
*
* Return: the address of the area or %NULL on failure
*/
@@ -2403,28 +2405,73 @@ void *vmap(struct page **pages, unsigned int count,
return NULL;
}
+ if (flags & VM_MAP_PUT_PAGES)
+ area->pages = pages;
return area->addr;
}
EXPORT_SYMBOL(vmap);
+#ifdef CONFIG_VMAP_PFN
+struct vmap_pfn_data {
+ unsigned long *pfns;
+ pgprot_t prot;
+ unsigned int idx;
+};
+
+static int vmap_pfn_apply(pte_t *pte, unsigned long addr, void *private)
+{
+ struct vmap_pfn_data *data = private;
+
+ if (WARN_ON_ONCE(pfn_valid(data->pfns[data->idx])))
+ return -EINVAL;
+ *pte = pte_mkspecial(pfn_pte(data->pfns[data->idx++], data->prot));
+ return 0;
+}
+
+/**
+ * vmap_pfn - map an array of PFNs into virtually contiguous space
+ * @pfns: array of PFNs
+ * @count: number of pages to map
+ * @prot: page protection for the mapping
+ *
+ * Maps @count PFNs from @pfns into contiguous kernel virtual space and returns
+ * the start address of the mapping.
+ */
+void *vmap_pfn(unsigned long *pfns, unsigned int count, pgprot_t prot)
+{
+ struct vmap_pfn_data data = { .pfns = pfns, .prot = pgprot_nx(prot) };
+ struct vm_struct *area;
+
+ area = get_vm_area_caller(count * PAGE_SIZE, VM_IOREMAP,
+ __builtin_return_address(0));
+ if (!area)
+ return NULL;
+ if (apply_to_page_range(&init_mm, (unsigned long)area->addr,
+ count * PAGE_SIZE, vmap_pfn_apply, &data)) {
+ free_vm_area(area);
+ return NULL;
+ }
+ return area->addr;
+}
+EXPORT_SYMBOL_GPL(vmap_pfn);
+#endif /* CONFIG_VMAP_PFN */
+
static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
pgprot_t prot, int node)
{
- struct page **pages;
- unsigned int nr_pages, array_size, i;
const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
- const gfp_t alloc_mask = gfp_mask | __GFP_NOWARN;
- const gfp_t highmem_mask = (gfp_mask & (GFP_DMA | GFP_DMA32)) ?
- 0 :
- __GFP_HIGHMEM;
+ unsigned int nr_pages = get_vm_area_size(area) >> PAGE_SHIFT;
+ unsigned int array_size = nr_pages * sizeof(struct page *), i;
+ struct page **pages;
- nr_pages = get_vm_area_size(area) >> PAGE_SHIFT;
- array_size = (nr_pages * sizeof(struct page *));
+ gfp_mask |= __GFP_NOWARN;
+ if (!(gfp_mask & (GFP_DMA | GFP_DMA32)))
+ gfp_mask |= __GFP_HIGHMEM;
/* Please note that the recursion is strictly bounded. */
if (array_size > PAGE_SIZE) {
- pages = __vmalloc_node(array_size, 1, nested_gfp|highmem_mask,
- node, area->caller);
+ pages = __vmalloc_node(array_size, 1, nested_gfp, node,
+ area->caller);
} else {
pages = kmalloc_node(array_size, nested_gfp, node);
}
@@ -2442,12 +2489,12 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
struct page *page;
if (node == NUMA_NO_NODE)
- page = alloc_page(alloc_mask|highmem_mask);
+ page = alloc_page(gfp_mask);
else
- page = alloc_pages_node(node, alloc_mask|highmem_mask, 0);
+ page = alloc_pages_node(node, gfp_mask, 0);
if (unlikely(!page)) {
- /* Successfully allocated i pages, free them in __vunmap() */
+ /* Successfully allocated i pages, free them in __vfree() */
area->nr_pages = i;
atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
goto fail;
@@ -3032,54 +3079,6 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
}
EXPORT_SYMBOL(remap_vmalloc_range);
-static int f(pte_t *pte, unsigned long addr, void *data)
-{
- pte_t ***p = data;
-
- if (p) {
- *(*p) = pte;
- (*p)++;
- }
- return 0;
-}
-
-/**
- * alloc_vm_area - allocate a range of kernel address space
- * @size: size of the area
- * @ptes: returns the PTEs for the address space
- *
- * Returns: NULL on failure, vm_struct on success
- *
- * This function reserves a range of kernel address space, and
- * allocates pagetables to map that range. No actual mappings
- * are created.
- *
- * If @ptes is non-NULL, pointers to the PTEs (in init_mm)
- * allocated for the VM area are returned.
- */
-struct vm_struct *alloc_vm_area(size_t size, pte_t **ptes)
-{
- struct vm_struct *area;
-
- area = get_vm_area_caller(size, VM_IOREMAP,
- __builtin_return_address(0));
- if (area == NULL)
- return NULL;
-
- /*
- * This ensures that page tables are constructed for this region
- * of kernel virtual address space and mapped into init_mm.
- */
- if (apply_to_page_range(&init_mm, (unsigned long)area->addr,
- size, f, ptes ? &ptes : NULL)) {
- free_vm_area(area);
- return NULL;
- }
-
- return area;
-}
-EXPORT_SYMBOL_GPL(alloc_vm_area);
-
void free_vm_area(struct vm_struct *area)
{
struct vm_struct *ret;
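
vmap_pfn() only accepts PFNs without a struct page behind them (pfn_valid() entries are rejected), which is why former alloc_vm_area() users either switch to vmap_pfn() or, as zsmalloc does below, to get_vm_area() plus apply_to_page_range(). A hedged sketch of a vmap_pfn() user for device memory (names and the write-combine protection are illustrative):

    static void *example_map_device_pfns(unsigned long *pfns, unsigned int count)
    {
            /* pfns[] must describe memory without struct pages, e.g. PCI BAR
             * space; the mapping is expected to be torn down with vunmap().
             */
            return vmap_pfn(pfns, count, pgprot_writecombine(PAGE_KERNEL));
    }
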
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 466fc3144fff..1b8f0e059767 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -699,6 +699,9 @@ void drop_slab_node(int nid)
do {
struct mem_cgroup *memcg = NULL;
+ if (fatal_signal_pending(current))
+ return;
+
freed = 0;
memcg = mem_cgroup_iter(NULL, NULL, NULL);
do {
@@ -722,8 +725,7 @@ static inline int is_page_cache_freeable(struct page *page)
* that isolated the page, the page cache and optional buffer
* heads at page->private.
*/
- int page_cache_pins = PageTransHuge(page) && PageSwapCache(page) ?
- HPAGE_PMD_NR : 1;
+ int page_cache_pins = thp_nr_pages(page);
return page_count(page) - page_has_private(page) == 1 + page_cache_pins;
}
@@ -1751,7 +1753,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
* Restrictions:
*
* (1) Must be called with an elevated refcount on the page. This is a
- * fundamentnal difference from isolate_lru_pages (which is called
+ * fundamental difference from isolate_lru_pages (which is called
* without a stable reference).
* (2) the lru_lock must not be held.
* (3) interrupts must be enabled.
@@ -2237,7 +2239,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
unsigned long anon_cost, file_cost, total_cost;
int swappiness = mem_cgroup_swappiness(memcg);
- u64 fraction[2];
+ u64 fraction[ANON_AND_FILE];
u64 denominator = 0; /* gcc */
enum scan_balance scan_balance;
unsigned long ap, fp;
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 4f7b4ee6aa12..698bc0bc18d1 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -325,7 +325,7 @@ void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
t = __this_cpu_read(pcp->stat_threshold);
- if (unlikely(x > t || x < -t)) {
+ if (unlikely(abs(x) > t)) {
zone_page_state_add(x, zone, item);
x = 0;
}
@@ -350,7 +350,7 @@ void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
t = __this_cpu_read(pcp->stat_threshold);
- if (unlikely(x > t || x < -t)) {
+ if (unlikely(abs(x) > t)) {
node_page_state_add(x, pgdat, item);
x = 0;
}
@@ -511,7 +511,7 @@ static inline void mod_zone_state(struct zone *zone,
o = this_cpu_read(*p);
n = delta + o;
- if (n > t || n < -t) {
+ if (abs(n) > t) {
int os = overstep_mode * (t >> 1) ;
/* Overflow must be added to zone counters */
@@ -573,7 +573,7 @@ static inline void mod_node_state(struct pglist_data *pgdat,
o = this_cpu_read(*p);
n = delta + o;
- if (n > t || n < -t) {
+ if (abs(n) > t) {
int os = overstep_mode * (t >> 1) ;
/* Overflow must be added to node counters */
diff --git a/mm/workingset.c b/mm/workingset.c
index 92e66113a577..975a4d2dd02e 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -216,7 +216,7 @@ static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat,
/**
* workingset_age_nonresident - age non-resident entries as LRU ages
- * @memcg: the lruvec that was aged
+ * @lruvec: the lruvec that was aged
* @nr_pages: the number of pages to count
*
* As in-memory pages are aged, non-resident pages need to be aged as
@@ -519,12 +519,11 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
void *arg) __must_hold(lru_lock)
{
struct xa_node *node = container_of(item, struct xa_node, private_list);
- XA_STATE(xas, node->array, 0);
struct address_space *mapping;
int ret;
/*
- * Page cache insertions and deletions synchroneously maintain
+ * Page cache insertions and deletions synchronously maintain
* the shadow node LRU under the i_pages lock and the
* lru_lock. Because the page cache tree is emptied before
* the inode can be destroyed, holding the lru_lock pins any
@@ -559,15 +558,7 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
if (WARN_ON_ONCE(node->count != node->nr_values))
goto out_invalid;
mapping->nrexceptional -= node->nr_values;
- xas.xa_node = xa_parent_locked(&mapping->i_pages, node);
- xas.xa_offset = node->offset;
- xas.xa_shift = node->shift + XA_CHUNK_SHIFT;
- xas_set_update(&xas, workingset_update_node);
- /*
- * We could store a shadow entry here which was the minimum of the
- * shadow entries we were tracking ...
- */
- xas_store(&xas, NULL);
+ xa_delete_node(node, workingset_update_node);
__inc_lruvec_slab_state(node, WORKINGSET_NODERECLAIM);
out_invalid:
diff --git a/mm/z3fold.c b/mm/z3fold.c
index 460b0feced26..18feaa0bc537 100644
--- a/mm/z3fold.c
+++ b/mm/z3fold.c
@@ -212,13 +212,12 @@ static inline struct z3fold_buddy_slots *alloc_slots(struct z3fold_pool *pool,
{
struct z3fold_buddy_slots *slots;
- slots = kmem_cache_alloc(pool->c_handle,
+ slots = kmem_cache_zalloc(pool->c_handle,
(gfp & ~(__GFP_HIGHMEM | __GFP_MOVABLE)));
if (slots) {
/* It will be freed separately in free_handle(). */
kmemleak_not_leak(slots);
- memset(slots->slot, 0, sizeof(slots->slot));
slots->pool = (unsigned long)pool;
rwlock_init(&slots->lock);
}
diff --git a/mm/zbud.c b/mm/zbud.c
index bc93aa4e46fc..c49966ece674 100644
--- a/mm/zbud.c
+++ b/mm/zbud.c
@@ -367,7 +367,6 @@ int zbud_alloc(struct zbud_pool *pool, size_t size, gfp_t gfp,
spin_lock(&pool->lock);
/* First, try to find an unbuddied zbud page. */
- zhdr = NULL;
for_each_unbuddied_list(i, chunks) {
if (!list_empty(&pool->unbuddied[i])) {
zhdr = list_first_entry(&pool->unbuddied[i],
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index c36fdff9a371..918c7b019b3d 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -1122,10 +1122,16 @@ static inline int __zs_cpu_up(struct mapping_area *area)
*/
if (area->vm)
return 0;
- area->vm = alloc_vm_area(PAGE_SIZE * 2, NULL);
+ area->vm = get_vm_area(PAGE_SIZE * 2, 0);
if (!area->vm)
return -ENOMEM;
- return 0;
+
+ /*
+ * Populate ptes in advance to avoid pte allocation with GFP_KERNEL
+	 * in non-preemptible context of zs_map_object.
+ */
+ return apply_to_page_range(&init_mm, (unsigned long)area->vm->addr,
+ PAGE_SIZE * 2, NULL, NULL);
}
static inline void __zs_cpu_down(struct mapping_area *area)