summaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
authorIngo Molnar <mingo@kernel.org>2018-01-30 15:08:27 +0100
committerIngo Molnar <mingo@kernel.org>2018-01-30 15:08:27 +0100
commit7e86548e2cc8d308cb75439480f428137151b0de (patch)
treefa3bcdedb64f4642a21080bc2b4ddc69ce2b2285 /mm
parent64e16720ea0879f8ab4547e3b9758936d483909b (diff)
parentd8a5b80568a9cb66810e75b182018e9edb68e8ff (diff)
downloadlinux-7e86548e2cc8d308cb75439480f428137151b0de.tar.bz2
Merge tag 'v4.15' into x86/pti, to be able to merge dependent changes
Time has come to switch PTI development over to a v4.15 base - we'll still try to make sure that all PTI fixes backport cleanly to v4.14 and earlier. Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig9
-rw-r--r--mm/Kconfig.debug1
-rw-r--r--mm/Makefile3
-rw-r--r--mm/backing-dev.c37
-rw-r--r--mm/balloon_compaction.c28
-rw-r--r--mm/cma.c2
-rw-r--r--mm/compaction.c48
-rw-r--r--mm/debug.c31
-rw-r--r--mm/early_ioremap.c2
-rw-r--r--mm/filemap.c222
-rw-r--r--mm/frame_vector.c14
-rw-r--r--mm/gup.c64
-rw-r--r--mm/gup_benchmark.c100
-rw-r--r--mm/hmm.c3
-rw-r--r--mm/huge_memory.c124
-rw-r--r--mm/hugetlb.c28
-rw-r--r--mm/internal.h1
-rw-r--r--mm/kasan/kasan.c2
-rw-r--r--mm/kasan/report.c8
-rw-r--r--mm/khugepaged.c2
-rw-r--r--mm/kmemcheck.c126
-rw-r--r--mm/kmemleak.c15
-rw-r--r--mm/ksm.c15
-rw-r--r--mm/list_lru.c1
-rw-r--r--mm/madvise.c4
-rw-r--r--mm/memblock.c68
-rw-r--r--mm/memcontrol.c4
-rw-r--r--mm/memory-failure.c2
-rw-r--r--mm/memory.c109
-rw-r--r--mm/memory_hotplug.c50
-rw-r--r--mm/mempolicy.c16
-rw-r--r--mm/mempool.c2
-rw-r--r--mm/migrate.c15
-rw-r--r--mm/mlock.c9
-rw-r--r--mm/mmap.c33
-rw-r--r--mm/mmu_notifier.c11
-rw-r--r--mm/mprotect.c6
-rw-r--r--mm/oom_kill.c71
-rw-r--r--mm/page-writeback.c83
-rw-r--r--mm/page_alloc.c502
-rw-r--r--mm/page_ext.c4
-rw-r--r--mm/page_io.c8
-rw-r--r--mm/page_isolation.c10
-rw-r--r--mm/page_owner.c5
-rw-r--r--mm/page_vma_mapped.c66
-rw-r--r--mm/pagewalk.c6
-rw-r--r--mm/percpu-vm.c2
-rw-r--r--mm/percpu.c7
-rw-r--r--mm/rmap.c65
-rw-r--r--mm/shmem.c59
-rw-r--r--mm/slab.c68
-rw-r--r--mm/slab.h41
-rw-r--r--mm/slab_common.c59
-rw-r--r--mm/slob.c4
-rw-r--r--mm/slub.c67
-rw-r--r--mm/sparse-vmemmap.c34
-rw-r--r--mm/sparse.c16
-rw-r--r--mm/swap.c35
-rw-r--r--mm/swap_slots.c11
-rw-r--r--mm/swap_state.c11
-rw-r--r--mm/swapfile.c21
-rw-r--r--mm/truncate.c149
-rw-r--r--mm/vmscan.c13
-rw-r--r--mm/vmstat.c77
-rw-r--r--mm/workingset.c10
-rw-r--r--mm/z3fold.c10
-rw-r--r--mm/zsmalloc.c3
67 files changed, 1709 insertions, 1023 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 9c4bdddd80c2..03ff7703d322 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -756,3 +756,12 @@ config PERCPU_STATS
This feature collects and exposes statistics via debugfs. The
information includes global and per chunk statistics, which can
be used to help understand percpu memory usage.
+
+config GUP_BENCHMARK
+ bool "Enable infrastructure for get_user_pages_fast() benchmarking"
+ default n
+ help
+ Provides /sys/kernel/debug/gup_benchmark that helps with testing
+ performance of get_user_pages_fast().
+
+ See tools/testing/selftests/vm/gup_benchmark.c
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index 5b0adf1435de..e5e606ee5f71 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -11,7 +11,6 @@ config DEBUG_PAGEALLOC
bool "Debug page memory allocations"
depends on DEBUG_KERNEL
depends on !HIBERNATION || ARCH_SUPPORTS_DEBUG_PAGEALLOC && !PPC && !SPARC
- depends on !KMEMCHECK
select PAGE_EXTENSION
select PAGE_POISONING if !ARCH_SUPPORTS_DEBUG_PAGEALLOC
---help---
diff --git a/mm/Makefile b/mm/Makefile
index 4659b93cba43..e669f02c5a54 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -17,7 +17,6 @@ KCOV_INSTRUMENT_slub.o := n
KCOV_INSTRUMENT_page_alloc.o := n
KCOV_INSTRUMENT_debug-pagealloc.o := n
KCOV_INSTRUMENT_kmemleak.o := n
-KCOV_INSTRUMENT_kmemcheck.o := n
KCOV_INSTRUMENT_memcontrol.o := n
KCOV_INSTRUMENT_mmzone.o := n
KCOV_INSTRUMENT_vmstat.o := n
@@ -70,7 +69,6 @@ obj-$(CONFIG_KSM) += ksm.o
obj-$(CONFIG_PAGE_POISONING) += page_poison.o
obj-$(CONFIG_SLAB) += slab.o
obj-$(CONFIG_SLUB) += slub.o
-obj-$(CONFIG_KMEMCHECK) += kmemcheck.o
obj-$(CONFIG_KASAN) += kasan/
obj-$(CONFIG_FAILSLAB) += failslab.o
obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
@@ -82,6 +80,7 @@ obj-$(CONFIG_PAGE_COUNTER) += page_counter.o
obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o
obj-$(CONFIG_MEMCG_SWAP) += swap_cgroup.o
obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o
+obj-$(CONFIG_GUP_BENCHMARK) += gup_benchmark.o
obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index e19606bb41a0..b5f940ce0143 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -113,11 +113,23 @@ static const struct file_operations bdi_debug_stats_fops = {
.release = single_release,
};
-static void bdi_debug_register(struct backing_dev_info *bdi, const char *name)
+static int bdi_debug_register(struct backing_dev_info *bdi, const char *name)
{
+ if (!bdi_debug_root)
+ return -ENOMEM;
+
bdi->debug_dir = debugfs_create_dir(name, bdi_debug_root);
+ if (!bdi->debug_dir)
+ return -ENOMEM;
+
bdi->debug_stats = debugfs_create_file("stats", 0444, bdi->debug_dir,
bdi, &bdi_debug_stats_fops);
+ if (!bdi->debug_stats) {
+ debugfs_remove(bdi->debug_dir);
+ return -ENOMEM;
+ }
+
+ return 0;
}
static void bdi_debug_unregister(struct backing_dev_info *bdi)
@@ -129,9 +141,10 @@ static void bdi_debug_unregister(struct backing_dev_info *bdi)
static inline void bdi_debug_init(void)
{
}
-static inline void bdi_debug_register(struct backing_dev_info *bdi,
+static inline int bdi_debug_register(struct backing_dev_info *bdi,
const char *name)
{
+ return 0;
}
static inline void bdi_debug_unregister(struct backing_dev_info *bdi)
{
@@ -1072,23 +1085,3 @@ out:
return ret;
}
EXPORT_SYMBOL(wait_iff_congested);
-
-int pdflush_proc_obsolete(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
-{
- char kbuf[] = "0\n";
-
- if (*ppos || *lenp < sizeof(kbuf)) {
- *lenp = 0;
- return 0;
- }
-
- if (copy_to_user(buffer, kbuf, sizeof(kbuf)))
- return -EFAULT;
- pr_warn_once("%s exported in /proc is scheduled for removal\n",
- table->procname);
-
- *lenp = 2;
- *ppos += *lenp;
- return 2;
-}
diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c
index 68d28924ba79..ef858d547e2d 100644
--- a/mm/balloon_compaction.c
+++ b/mm/balloon_compaction.c
@@ -11,22 +11,37 @@
#include <linux/balloon_compaction.h>
/*
+ * balloon_page_alloc - allocates a new page for insertion into the balloon
+ * page list.
+ *
+ * Driver must call it to properly allocate a new enlisted balloon page.
+ * Driver must call balloon_page_enqueue before definitively removing it from
+ * the guest system. This function returns the page address for the recently
+ * allocated page or NULL in the case we fail to allocate a new page this turn.
+ */
+struct page *balloon_page_alloc(void)
+{
+ struct page *page = alloc_page(balloon_mapping_gfp_mask() |
+ __GFP_NOMEMALLOC | __GFP_NORETRY);
+ return page;
+}
+EXPORT_SYMBOL_GPL(balloon_page_alloc);
+
+/*
* balloon_page_enqueue - allocates a new page and inserts it into the balloon
* page list.
* @b_dev_info: balloon device descriptor where we will insert a new page to
+ * @page: new page to enqueue - allocated using balloon_page_alloc.
*
- * Driver must call it to properly allocate a new enlisted balloon page
+ * Driver must call it to properly enqueue a new allocated balloon page
* before definitively removing it from the guest system.
* This function returns the page address for the recently enqueued page or
* NULL in the case we fail to allocate a new page this turn.
*/
-struct page *balloon_page_enqueue(struct balloon_dev_info *b_dev_info)
+void balloon_page_enqueue(struct balloon_dev_info *b_dev_info,
+ struct page *page)
{
unsigned long flags;
- struct page *page = alloc_page(balloon_mapping_gfp_mask() |
- __GFP_NOMEMALLOC | __GFP_NORETRY);
- if (!page)
- return NULL;
/*
* Block others from accessing the 'page' when we get around to
@@ -39,7 +54,6 @@ struct page *balloon_page_enqueue(struct balloon_dev_info *b_dev_info)
__count_vm_event(BALLOON_INFLATE);
spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
unlock_page(page);
- return page;
}
EXPORT_SYMBOL_GPL(balloon_page_enqueue);
diff --git a/mm/cma.c b/mm/cma.c
index 022e52bd8370..0607729abf3b 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -461,7 +461,7 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align,
trace_cma_alloc(pfn, page, count, align);
if (ret && !(gfp_mask & __GFP_NOWARN)) {
- pr_info("%s: alloc failed, req-size: %zu pages, ret: %d\n",
+ pr_err("%s: alloc failed, req-size: %zu pages, ret: %d\n",
__func__, count, ret);
cma_debug_show_areas(cma);
}
diff --git a/mm/compaction.c b/mm/compaction.c
index 85395dc6eb13..10cd757f1006 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -219,6 +219,24 @@ static void reset_cached_positions(struct zone *zone)
}
/*
+ * Compound pages of >= pageblock_order should consistenly be skipped until
+ * released. It is always pointless to compact pages of such order (if they are
+ * migratable), and the pageblocks they occupy cannot contain any free pages.
+ */
+static bool pageblock_skip_persistent(struct page *page)
+{
+ if (!PageCompound(page))
+ return false;
+
+ page = compound_head(page);
+
+ if (compound_order(page) >= pageblock_order)
+ return true;
+
+ return false;
+}
+
+/*
* This function is called to clear all cached information on pageblocks that
* should be skipped for page isolation when the migrate and free page scanner
* meet.
@@ -242,6 +260,8 @@ static void __reset_isolation_suitable(struct zone *zone)
continue;
if (zone != page_zone(page))
continue;
+ if (pageblock_skip_persistent(page))
+ continue;
clear_pageblock_skip(page);
}
@@ -275,7 +295,7 @@ static void update_pageblock_skip(struct compact_control *cc,
struct zone *zone = cc->zone;
unsigned long pfn;
- if (cc->ignore_skip_hint)
+ if (cc->no_set_skip_hint)
return;
if (!page)
@@ -307,7 +327,12 @@ static inline bool isolation_suitable(struct compact_control *cc,
return true;
}
-static void update_pageblock_skip(struct compact_control *cc,
+static inline bool pageblock_skip_persistent(struct page *page)
+{
+ return false;
+}
+
+static inline void update_pageblock_skip(struct compact_control *cc,
struct page *page, unsigned long nr_isolated,
bool migrate_scanner)
{
@@ -449,13 +474,12 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
* and the only danger is skipping too much.
*/
if (PageCompound(page)) {
- unsigned int comp_order = compound_order(page);
+ const unsigned int order = compound_order(page);
- if (likely(comp_order < MAX_ORDER)) {
- blockpfn += (1UL << comp_order) - 1;
- cursor += (1UL << comp_order) - 1;
+ if (likely(order < MAX_ORDER)) {
+ blockpfn += (1UL << order) - 1;
+ cursor += (1UL << order) - 1;
}
-
goto isolate_fail;
}
@@ -772,11 +796,10 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
* danger is skipping too much.
*/
if (PageCompound(page)) {
- unsigned int comp_order = compound_order(page);
-
- if (likely(comp_order < MAX_ORDER))
- low_pfn += (1UL << comp_order) - 1;
+ const unsigned int order = compound_order(page);
+ if (likely(order < MAX_ORDER))
+ low_pfn += (1UL << order) - 1;
goto isolate_fail;
}
@@ -1928,9 +1951,8 @@ static void kcompactd_do_work(pg_data_t *pgdat)
.total_free_scanned = 0,
.classzone_idx = pgdat->kcompactd_classzone_idx,
.mode = MIGRATE_SYNC_LIGHT,
- .ignore_skip_hint = true,
+ .ignore_skip_hint = false,
.gfp_mask = GFP_KERNEL,
-
};
trace_mm_compaction_kcompactd_wake(pgdat->node_id, cc.order,
cc.classzone_idx);
diff --git a/mm/debug.c b/mm/debug.c
index 6726bec731c9..56e2d9125ea5 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -50,7 +50,7 @@ void __dump_page(struct page *page, const char *reason)
*/
int mapcount = PageSlab(page) ? 0 : page_mapcount(page);
- pr_emerg("page:%p count:%d mapcount:%d mapping:%p index:%#lx",
+ pr_emerg("page:%px count:%d mapcount:%d mapping:%px index:%#lx",
page, page_ref_count(page), mapcount,
page->mapping, page_to_pgoff(page));
if (PageCompound(page))
@@ -69,7 +69,7 @@ void __dump_page(struct page *page, const char *reason)
#ifdef CONFIG_MEMCG
if (page->mem_cgroup)
- pr_alert("page->mem_cgroup:%p\n", page->mem_cgroup);
+ pr_alert("page->mem_cgroup:%px\n", page->mem_cgroup);
#endif
}
@@ -84,10 +84,10 @@ EXPORT_SYMBOL(dump_page);
void dump_vma(const struct vm_area_struct *vma)
{
- pr_emerg("vma %p start %p end %p\n"
- "next %p prev %p mm %p\n"
- "prot %lx anon_vma %p vm_ops %p\n"
- "pgoff %lx file %p private_data %p\n"
+ pr_emerg("vma %px start %px end %px\n"
+ "next %px prev %px mm %px\n"
+ "prot %lx anon_vma %px vm_ops %px\n"
+ "pgoff %lx file %px private_data %px\n"
"flags: %#lx(%pGv)\n",
vma, (void *)vma->vm_start, (void *)vma->vm_end, vma->vm_next,
vma->vm_prev, vma->vm_mm,
@@ -100,27 +100,27 @@ EXPORT_SYMBOL(dump_vma);
void dump_mm(const struct mm_struct *mm)
{
- pr_emerg("mm %p mmap %p seqnum %d task_size %lu\n"
+ pr_emerg("mm %px mmap %px seqnum %d task_size %lu\n"
#ifdef CONFIG_MMU
- "get_unmapped_area %p\n"
+ "get_unmapped_area %px\n"
#endif
"mmap_base %lu mmap_legacy_base %lu highest_vm_end %lu\n"
- "pgd %p mm_users %d mm_count %d nr_ptes %lu nr_pmds %lu map_count %d\n"
+ "pgd %px mm_users %d mm_count %d pgtables_bytes %lu map_count %d\n"
"hiwater_rss %lx hiwater_vm %lx total_vm %lx locked_vm %lx\n"
"pinned_vm %lx data_vm %lx exec_vm %lx stack_vm %lx\n"
"start_code %lx end_code %lx start_data %lx end_data %lx\n"
"start_brk %lx brk %lx start_stack %lx\n"
"arg_start %lx arg_end %lx env_start %lx env_end %lx\n"
- "binfmt %p flags %lx core_state %p\n"
+ "binfmt %px flags %lx core_state %px\n"
#ifdef CONFIG_AIO
- "ioctx_table %p\n"
+ "ioctx_table %px\n"
#endif
#ifdef CONFIG_MEMCG
- "owner %p "
+ "owner %px "
#endif
- "exe_file %p\n"
+ "exe_file %px\n"
#ifdef CONFIG_MMU_NOTIFIER
- "mmu_notifier_mm %p\n"
+ "mmu_notifier_mm %px\n"
#endif
#ifdef CONFIG_NUMA_BALANCING
"numa_next_scan %lu numa_scan_offset %lu numa_scan_seq %d\n"
@@ -135,8 +135,7 @@ void dump_mm(const struct mm_struct *mm)
mm->mmap_base, mm->mmap_legacy_base, mm->highest_vm_end,
mm->pgd, atomic_read(&mm->mm_users),
atomic_read(&mm->mm_count),
- atomic_long_read((atomic_long_t *)&mm->nr_ptes),
- mm_nr_pmds((struct mm_struct *)mm),
+ mm_pgtables_bytes(mm),
mm->map_count,
mm->hiwater_rss, mm->hiwater_vm, mm->total_vm, mm->locked_vm,
mm->pinned_vm, mm->data_vm, mm->exec_vm, mm->stack_vm,
diff --git a/mm/early_ioremap.c b/mm/early_ioremap.c
index d04ac1ec0559..1826f191e72c 100644
--- a/mm/early_ioremap.c
+++ b/mm/early_ioremap.c
@@ -111,7 +111,7 @@ __early_ioremap(resource_size_t phys_addr, unsigned long size, pgprot_t prot)
enum fixed_addresses idx;
int i, slot;
- WARN_ON(system_state != SYSTEM_BOOTING);
+ WARN_ON(system_state >= SYSTEM_RUNNING);
slot = -1;
for (i = 0; i < FIX_BTMAPS_SLOTS; i++) {
diff --git a/mm/filemap.c b/mm/filemap.c
index 594d73fef8b4..ee83baaf855d 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -35,6 +35,7 @@
#include <linux/hugetlb.h>
#include <linux/memcontrol.h>
#include <linux/cleancache.h>
+#include <linux/shmem_fs.h>
#include <linux/rmap.h>
#include "internal.h"
@@ -134,7 +135,7 @@ static int page_cache_tree_insert(struct address_space *mapping,
*shadowp = p;
}
__radix_tree_replace(&mapping->page_tree, node, slot, page,
- workingset_update_node, mapping);
+ workingset_lookup_update(mapping));
mapping->nrpages++;
return 0;
}
@@ -162,9 +163,12 @@ static void page_cache_tree_delete(struct address_space *mapping,
radix_tree_clear_tags(&mapping->page_tree, node, slot);
__radix_tree_replace(&mapping->page_tree, node, slot, shadow,
- workingset_update_node, mapping);
+ workingset_lookup_update(mapping));
}
+ page->mapping = NULL;
+ /* Leave page->index set: truncation lookup relies upon it */
+
if (shadow) {
mapping->nrexceptional += nr;
/*
@@ -178,17 +182,11 @@ static void page_cache_tree_delete(struct address_space *mapping,
mapping->nrpages -= nr;
}
-/*
- * Delete a page from the page cache and free it. Caller has to make
- * sure the page is locked and that nobody else uses it - or that usage
- * is safe. The caller must hold the mapping's tree_lock.
- */
-void __delete_from_page_cache(struct page *page, void *shadow)
+static void unaccount_page_cache_page(struct address_space *mapping,
+ struct page *page)
{
- struct address_space *mapping = page->mapping;
- int nr = hpage_nr_pages(page);
+ int nr;
- trace_mm_filemap_delete_from_page_cache(page);
/*
* if we're uptodate, flush out into the cleancache, otherwise
* invalidate any existing cleancache entries. We can't leave
@@ -224,15 +222,12 @@ void __delete_from_page_cache(struct page *page, void *shadow)
}
}
- page_cache_tree_delete(mapping, page, shadow);
-
- page->mapping = NULL;
- /* Leave page->index set: truncation lookup relies upon it */
-
/* hugetlb pages do not participate in page cache accounting. */
if (PageHuge(page))
return;
+ nr = hpage_nr_pages(page);
+
__mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, -nr);
if (PageSwapBacked(page)) {
__mod_node_page_state(page_pgdat(page), NR_SHMEM, -nr);
@@ -243,17 +238,51 @@ void __delete_from_page_cache(struct page *page, void *shadow)
}
/*
- * At this point page must be either written or cleaned by truncate.
- * Dirty page here signals a bug and loss of unwritten data.
+ * At this point page must be either written or cleaned by
+ * truncate. Dirty page here signals a bug and loss of
+ * unwritten data.
*
- * This fixes dirty accounting after removing the page entirely but
- * leaves PageDirty set: it has no effect for truncated page and
- * anyway will be cleared before returning page into buddy allocator.
+ * This fixes dirty accounting after removing the page entirely
+ * but leaves PageDirty set: it has no effect for truncated
+ * page and anyway will be cleared before returning page into
+ * buddy allocator.
*/
if (WARN_ON_ONCE(PageDirty(page)))
account_page_cleaned(page, mapping, inode_to_wb(mapping->host));
}
+/*
+ * Delete a page from the page cache and free it. Caller has to make
+ * sure the page is locked and that nobody else uses it - or that usage
+ * is safe. The caller must hold the mapping's tree_lock.
+ */
+void __delete_from_page_cache(struct page *page, void *shadow)
+{
+ struct address_space *mapping = page->mapping;
+
+ trace_mm_filemap_delete_from_page_cache(page);
+
+ unaccount_page_cache_page(mapping, page);
+ page_cache_tree_delete(mapping, page, shadow);
+}
+
+static void page_cache_free_page(struct address_space *mapping,
+ struct page *page)
+{
+ void (*freepage)(struct page *);
+
+ freepage = mapping->a_ops->freepage;
+ if (freepage)
+ freepage(page);
+
+ if (PageTransHuge(page) && !PageHuge(page)) {
+ page_ref_sub(page, HPAGE_PMD_NR);
+ VM_BUG_ON_PAGE(page_count(page) <= 0, page);
+ } else {
+ put_page(page);
+ }
+}
+
/**
* delete_from_page_cache - delete page from page cache
* @page: the page which the kernel is trying to remove from page cache
@@ -266,27 +295,98 @@ void delete_from_page_cache(struct page *page)
{
struct address_space *mapping = page_mapping(page);
unsigned long flags;
- void (*freepage)(struct page *);
BUG_ON(!PageLocked(page));
-
- freepage = mapping->a_ops->freepage;
-
spin_lock_irqsave(&mapping->tree_lock, flags);
__delete_from_page_cache(page, NULL);
spin_unlock_irqrestore(&mapping->tree_lock, flags);
- if (freepage)
- freepage(page);
+ page_cache_free_page(mapping, page);
+}
+EXPORT_SYMBOL(delete_from_page_cache);
- if (PageTransHuge(page) && !PageHuge(page)) {
- page_ref_sub(page, HPAGE_PMD_NR);
- VM_BUG_ON_PAGE(page_count(page) <= 0, page);
- } else {
- put_page(page);
+/*
+ * page_cache_tree_delete_batch - delete several pages from page cache
+ * @mapping: the mapping to which pages belong
+ * @pvec: pagevec with pages to delete
+ *
+ * The function walks over mapping->page_tree and removes pages passed in @pvec
+ * from the radix tree. The function expects @pvec to be sorted by page index.
+ * It tolerates holes in @pvec (radix tree entries at those indices are not
+ * modified). The function expects only THP head pages to be present in the
+ * @pvec and takes care to delete all corresponding tail pages from the radix
+ * tree as well.
+ *
+ * The function expects mapping->tree_lock to be held.
+ */
+static void
+page_cache_tree_delete_batch(struct address_space *mapping,
+ struct pagevec *pvec)
+{
+ struct radix_tree_iter iter;
+ void **slot;
+ int total_pages = 0;
+ int i = 0, tail_pages = 0;
+ struct page *page;
+ pgoff_t start;
+
+ start = pvec->pages[0]->index;
+ radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
+ if (i >= pagevec_count(pvec) && !tail_pages)
+ break;
+ page = radix_tree_deref_slot_protected(slot,
+ &mapping->tree_lock);
+ if (radix_tree_exceptional_entry(page))
+ continue;
+ if (!tail_pages) {
+ /*
+ * Some page got inserted in our range? Skip it. We
+ * have our pages locked so they are protected from
+ * being removed.
+ */
+ if (page != pvec->pages[i])
+ continue;
+ WARN_ON_ONCE(!PageLocked(page));
+ if (PageTransHuge(page) && !PageHuge(page))
+ tail_pages = HPAGE_PMD_NR - 1;
+ page->mapping = NULL;
+ /*
+ * Leave page->index set: truncation lookup relies
+ * upon it
+ */
+ i++;
+ } else {
+ tail_pages--;
+ }
+ radix_tree_clear_tags(&mapping->page_tree, iter.node, slot);
+ __radix_tree_replace(&mapping->page_tree, iter.node, slot, NULL,
+ workingset_lookup_update(mapping));
+ total_pages++;
}
+ mapping->nrpages -= total_pages;
+}
+
+void delete_from_page_cache_batch(struct address_space *mapping,
+ struct pagevec *pvec)
+{
+ int i;
+ unsigned long flags;
+
+ if (!pagevec_count(pvec))
+ return;
+
+ spin_lock_irqsave(&mapping->tree_lock, flags);
+ for (i = 0; i < pagevec_count(pvec); i++) {
+ trace_mm_filemap_delete_from_page_cache(pvec->pages[i]);
+
+ unaccount_page_cache_page(mapping, pvec->pages[i]);
+ }
+ page_cache_tree_delete_batch(mapping, pvec);
+ spin_unlock_irqrestore(&mapping->tree_lock, flags);
+
+ for (i = 0; i < pagevec_count(pvec); i++)
+ page_cache_free_page(mapping, pvec->pages[i]);
}
-EXPORT_SYMBOL(delete_from_page_cache);
int filemap_check_errors(struct address_space *mapping)
{
@@ -419,20 +519,18 @@ static void __filemap_fdatawait_range(struct address_space *mapping,
if (end_byte < start_byte)
return;
- pagevec_init(&pvec, 0);
- while ((index <= end) &&
- (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
- PAGECACHE_TAG_WRITEBACK,
- min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1)) != 0) {
+ pagevec_init(&pvec);
+ while (index <= end) {
unsigned i;
+ nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index,
+ end, PAGECACHE_TAG_WRITEBACK);
+ if (!nr_pages)
+ break;
+
for (i = 0; i < nr_pages; i++) {
struct page *page = pvec.pages[i];
- /* until radix tree lookup accepts end_index */
- if (page->index > end)
- continue;
-
wait_on_page_writeback(page);
ClearPageError(page);
}
@@ -1041,6 +1139,7 @@ int wait_on_page_bit_killable(struct page *page, int bit_nr)
wait_queue_head_t *q = page_waitqueue(page);
return wait_on_page_bit_common(q, page, bit_nr, TASK_KILLABLE, false);
}
+EXPORT_SYMBOL(wait_on_page_bit_killable);
/**
* add_page_wait_queue - Add an arbitrary waiter to a page's wait queue
@@ -1754,9 +1853,10 @@ repeat:
EXPORT_SYMBOL(find_get_pages_contig);
/**
- * find_get_pages_tag - find and return pages that match @tag
+ * find_get_pages_range_tag - find and return pages in given range matching @tag
* @mapping: the address_space to search
* @index: the starting page index
+ * @end: The final page index (inclusive)
* @tag: the tag index
* @nr_pages: the maximum number of pages
* @pages: where the resulting pages are placed
@@ -1764,8 +1864,9 @@ EXPORT_SYMBOL(find_get_pages_contig);
* Like find_get_pages, except we only return pages which are tagged with
* @tag. We update @index to index the next page for the traversal.
*/
-unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
- int tag, unsigned int nr_pages, struct page **pages)
+unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index,
+ pgoff_t end, int tag, unsigned int nr_pages,
+ struct page **pages)
{
struct radix_tree_iter iter;
void **slot;
@@ -1778,6 +1879,9 @@ unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
radix_tree_for_each_tagged(slot, &mapping->page_tree,
&iter, *index, tag) {
struct page *head, *page;
+
+ if (iter.index > end)
+ break;
repeat:
page = radix_tree_deref_slot(slot);
if (unlikely(!page))
@@ -1819,18 +1923,28 @@ repeat:
}
pages[ret] = page;
- if (++ret == nr_pages)
- break;
+ if (++ret == nr_pages) {
+ *index = pages[ret - 1]->index + 1;
+ goto out;
+ }
}
+ /*
+ * We come here when we got at @end. We take care to not overflow the
+ * index @index as it confuses some of the callers. This breaks the
+ * iteration when there is page at index -1 but that is already broken
+ * anyway.
+ */
+ if (end == (pgoff_t)-1)
+ *index = (pgoff_t)-1;
+ else
+ *index = end + 1;
+out:
rcu_read_unlock();
- if (ret)
- *index = pages[ret - 1]->index + 1;
-
return ret;
}
-EXPORT_SYMBOL(find_get_pages_tag);
+EXPORT_SYMBOL(find_get_pages_range_tag);
/**
* find_get_entries_tag - find and return entries that match @tag
@@ -2159,7 +2273,7 @@ no_cached_page:
* Ok, it wasn't cached, so we need to create a new
* page..
*/
- page = page_cache_alloc_cold(mapping);
+ page = page_cache_alloc(mapping);
if (!page) {
error = -ENOMEM;
goto out;
@@ -2271,7 +2385,7 @@ static int page_cache_read(struct file *file, pgoff_t offset, gfp_t gfp_mask)
int ret;
do {
- page = __page_cache_alloc(gfp_mask|__GFP_COLD);
+ page = __page_cache_alloc(gfp_mask);
if (!page)
return -ENOMEM;
@@ -2675,7 +2789,7 @@ static struct page *do_read_cache_page(struct address_space *mapping,
repeat:
page = find_get_page(mapping, index);
if (!page) {
- page = __page_cache_alloc(gfp | __GFP_COLD);
+ page = __page_cache_alloc(gfp);
if (!page)
return ERR_PTR(-ENOMEM);
err = add_to_page_cache_lru(page, mapping, index, gfp);
diff --git a/mm/frame_vector.c b/mm/frame_vector.c
index 2f98df0d460e..c64dca6e27c2 100644
--- a/mm/frame_vector.c
+++ b/mm/frame_vector.c
@@ -53,6 +53,20 @@ int get_vaddr_frames(unsigned long start, unsigned int nr_frames,
ret = -EFAULT;
goto out;
}
+
+ /*
+ * While get_vaddr_frames() could be used for transient (kernel
+ * controlled lifetime) pinning of memory pages all current
+ * users establish long term (userspace controlled lifetime)
+ * page pinning. Treat get_vaddr_frames() like
+ * get_user_pages_longterm() and disallow it for filesystem-dax
+ * mappings.
+ */
+ if (vma_is_fsdax(vma)) {
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+
if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) {
vec->got_ref = true;
vec->is_pfns = false;
diff --git a/mm/gup.c b/mm/gup.c
index dfcde13f289a..e0d82b6706d7 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1095,6 +1095,70 @@ long get_user_pages(unsigned long start, unsigned long nr_pages,
}
EXPORT_SYMBOL(get_user_pages);
+#ifdef CONFIG_FS_DAX
+/*
+ * This is the same as get_user_pages() in that it assumes we are
+ * operating on the current task's mm, but it goes further to validate
+ * that the vmas associated with the address range are suitable for
+ * longterm elevated page reference counts. For example, filesystem-dax
+ * mappings are subject to the lifetime enforced by the filesystem and
+ * we need guarantees that longterm users like RDMA and V4L2 only
+ * establish mappings that have a kernel enforced revocation mechanism.
+ *
+ * "longterm" == userspace controlled elevated page count lifetime.
+ * Contrast this to iov_iter_get_pages() usages which are transient.
+ */
+long get_user_pages_longterm(unsigned long start, unsigned long nr_pages,
+ unsigned int gup_flags, struct page **pages,
+ struct vm_area_struct **vmas_arg)
+{
+ struct vm_area_struct **vmas = vmas_arg;
+ struct vm_area_struct *vma_prev = NULL;
+ long rc, i;
+
+ if (!pages)
+ return -EINVAL;
+
+ if (!vmas) {
+ vmas = kcalloc(nr_pages, sizeof(struct vm_area_struct *),
+ GFP_KERNEL);
+ if (!vmas)
+ return -ENOMEM;
+ }
+
+ rc = get_user_pages(start, nr_pages, gup_flags, pages, vmas);
+
+ for (i = 0; i < rc; i++) {
+ struct vm_area_struct *vma = vmas[i];
+
+ if (vma == vma_prev)
+ continue;
+
+ vma_prev = vma;
+
+ if (vma_is_fsdax(vma))
+ break;
+ }
+
+ /*
+ * Either get_user_pages() failed, or the vma validation
+ * succeeded, in either case we don't need to put_page() before
+ * returning.
+ */
+ if (i >= rc)
+ goto out;
+
+ for (i = 0; i < rc; i++)
+ put_page(pages[i]);
+ rc = -EOPNOTSUPP;
+out:
+ if (vmas != vmas_arg)
+ kfree(vmas);
+ return rc;
+}
+EXPORT_SYMBOL(get_user_pages_longterm);
+#endif /* CONFIG_FS_DAX */
+
/**
* populate_vma_page_range() - populate a range of pages in the vma.
* @vma: target vma
diff --git a/mm/gup_benchmark.c b/mm/gup_benchmark.c
new file mode 100644
index 000000000000..5c8e2abeaa15
--- /dev/null
+++ b/mm/gup_benchmark.c
@@ -0,0 +1,100 @@
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/ktime.h>
+#include <linux/debugfs.h>
+
+#define GUP_FAST_BENCHMARK _IOWR('g', 1, struct gup_benchmark)
+
+struct gup_benchmark {
+ __u64 delta_usec;
+ __u64 addr;
+ __u64 size;
+ __u32 nr_pages_per_call;
+ __u32 flags;
+};
+
+static int __gup_benchmark_ioctl(unsigned int cmd,
+ struct gup_benchmark *gup)
+{
+ ktime_t start_time, end_time;
+ unsigned long i, nr, nr_pages, addr, next;
+ struct page **pages;
+
+ nr_pages = gup->size / PAGE_SIZE;
+ pages = kvmalloc(sizeof(void *) * nr_pages, GFP_KERNEL);
+ if (!pages)
+ return -ENOMEM;
+
+ i = 0;
+ nr = gup->nr_pages_per_call;
+ start_time = ktime_get();
+ for (addr = gup->addr; addr < gup->addr + gup->size; addr = next) {
+ if (nr != gup->nr_pages_per_call)
+ break;
+
+ next = addr + nr * PAGE_SIZE;
+ if (next > gup->addr + gup->size) {
+ next = gup->addr + gup->size;
+ nr = (next - addr) / PAGE_SIZE;
+ }
+
+ nr = get_user_pages_fast(addr, nr, gup->flags & 1, pages + i);
+ i += nr;
+ }
+ end_time = ktime_get();
+
+ gup->delta_usec = ktime_us_delta(end_time, start_time);
+ gup->size = addr - gup->addr;
+
+ for (i = 0; i < nr_pages; i++) {
+ if (!pages[i])
+ break;
+ put_page(pages[i]);
+ }
+
+ kvfree(pages);
+ return 0;
+}
+
+static long gup_benchmark_ioctl(struct file *filep, unsigned int cmd,
+ unsigned long arg)
+{
+ struct gup_benchmark gup;
+ int ret;
+
+ if (cmd != GUP_FAST_BENCHMARK)
+ return -EINVAL;
+
+ if (copy_from_user(&gup, (void __user *)arg, sizeof(gup)))
+ return -EFAULT;
+
+ ret = __gup_benchmark_ioctl(cmd, &gup);
+ if (ret)
+ return ret;
+
+ if (copy_to_user((void __user *)arg, &gup, sizeof(gup)))
+ return -EFAULT;
+
+ return 0;
+}
+
+static const struct file_operations gup_benchmark_fops = {
+ .open = nonseekable_open,
+ .unlocked_ioctl = gup_benchmark_ioctl,
+};
+
+static int gup_benchmark_init(void)
+{
+ void *ret;
+
+ ret = debugfs_create_file_unsafe("gup_benchmark", 0600, NULL, NULL,
+ &gup_benchmark_fops);
+ if (!ret)
+ pr_warn("Failed to create gup_benchmark in debugfs");
+
+ return 0;
+}
+
+late_initcall(gup_benchmark_init);
diff --git a/mm/hmm.c b/mm/hmm.c
index a88a847bccba..ea19742a5d60 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -803,11 +803,10 @@ static RADIX_TREE(hmm_devmem_radix, GFP_KERNEL);
static void hmm_devmem_radix_release(struct resource *resource)
{
- resource_size_t key, align_start, align_size, align_end;
+ resource_size_t key, align_start, align_size;
align_start = resource->start & ~(PA_SECTION_SIZE - 1);
align_size = ALIGN(resource_size(resource), PA_SECTION_SIZE);
- align_end = align_start + align_size - 1;
mutex_lock(&hmm_devmem_lock);
for (key = resource->start;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 1981ed697dab..0e7ded98d114 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -39,10 +39,10 @@
#include "internal.h"
/*
- * By default transparent hugepage support is disabled in order that avoid
- * to risk increase the memory footprint of applications without a guaranteed
- * benefit. When transparent hugepage support is enabled, is for all mappings,
- * and khugepaged scans all mappings.
+ * By default, transparent hugepage support is disabled in order to avoid
+ * risking an increased memory footprint for applications that are not
+ * guaranteed to benefit from it. When transparent hugepage support is
+ * enabled, it is for all mappings, and khugepaged scans all mappings.
* Defrag is invoked by khugepaged hugepage allocations and by page faults
* for all hugepage allocations.
*/
@@ -606,7 +606,7 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page,
pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
- atomic_long_inc(&vma->vm_mm->nr_ptes);
+ mm_inc_nr_ptes(vma->vm_mm);
spin_unlock(vmf->ptl);
count_vm_event(THP_FAULT_ALLOC);
}
@@ -662,7 +662,7 @@ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
if (pgtable)
pgtable_trans_huge_deposit(mm, pmd, pgtable);
set_pmd_at(mm, haddr, pmd, entry);
- atomic_long_inc(&mm->nr_ptes);
+ mm_inc_nr_ptes(mm);
return true;
}
@@ -747,7 +747,7 @@ static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
if (pgtable) {
pgtable_trans_huge_deposit(mm, pmd, pgtable);
- atomic_long_inc(&mm->nr_ptes);
+ mm_inc_nr_ptes(mm);
}
set_pmd_at(mm, addr, pmd, entry);
@@ -842,20 +842,15 @@ EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud);
#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
static void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
- pmd_t *pmd)
+ pmd_t *pmd, int flags)
{
pmd_t _pmd;
- /*
- * We should set the dirty bit only for FOLL_WRITE but for now
- * the dirty bit in the pmd is meaningless. And if the dirty
- * bit will become meaningful and we'll only set it with
- * FOLL_WRITE, an atomic set_bit will be required on the pmd to
- * set the young bit, instead of the current set_pmd_at.
- */
- _pmd = pmd_mkyoung(pmd_mkdirty(*pmd));
+ _pmd = pmd_mkyoung(*pmd);
+ if (flags & FOLL_WRITE)
+ _pmd = pmd_mkdirty(_pmd);
if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK,
- pmd, _pmd, 1))
+ pmd, _pmd, flags & FOLL_WRITE))
update_mmu_cache_pmd(vma, addr, pmd);
}
@@ -884,7 +879,7 @@ struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
return NULL;
if (flags & FOLL_TOUCH)
- touch_pmd(vma, addr, pmd);
+ touch_pmd(vma, addr, pmd, flags);
/*
* device mapped pages can only be returned if the
@@ -942,7 +937,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
set_pmd_at(src_mm, addr, src_pmd, pmd);
}
add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
- atomic_long_inc(&dst_mm->nr_ptes);
+ mm_inc_nr_ptes(dst_mm);
pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
set_pmd_at(dst_mm, addr, dst_pmd, pmd);
ret = 0;
@@ -978,7 +973,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
get_page(src_page);
page_dup_rmap(src_page, true);
add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
- atomic_long_inc(&dst_mm->nr_ptes);
+ mm_inc_nr_ptes(dst_mm);
pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
pmdp_set_wrprotect(src_mm, addr, src_pmd);
@@ -995,20 +990,15 @@ out:
#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
static void touch_pud(struct vm_area_struct *vma, unsigned long addr,
- pud_t *pud)
+ pud_t *pud, int flags)
{
pud_t _pud;
- /*
- * We should set the dirty bit only for FOLL_WRITE but for now
- * the dirty bit in the pud is meaningless. And if the dirty
- * bit will become meaningful and we'll only set it with
- * FOLL_WRITE, an atomic set_bit will be required on the pud to
- * set the young bit, instead of the current set_pud_at.
- */
- _pud = pud_mkyoung(pud_mkdirty(*pud));
+ _pud = pud_mkyoung(*pud);
+ if (flags & FOLL_WRITE)
+ _pud = pud_mkdirty(_pud);
if (pudp_set_access_flags(vma, addr & HPAGE_PUD_MASK,
- pud, _pud, 1))
+ pud, _pud, flags & FOLL_WRITE))
update_mmu_cache_pud(vma, addr, pud);
}
@@ -1031,7 +1021,7 @@ struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr,
return NULL;
if (flags & FOLL_TOUCH)
- touch_pud(vma, addr, pud);
+ touch_pud(vma, addr, pud, flags);
/*
* device mapped pages can only be returned if the
@@ -1189,8 +1179,15 @@ static int do_huge_pmd_wp_page_fallback(struct vm_fault *vmf, pmd_t orig_pmd,
goto out_free_pages;
VM_BUG_ON_PAGE(!PageHead(page), page);
+ /*
+ * Leave pmd empty until pte is filled note we must notify here as
+ * concurrent CPU thread might write to new page before the call to
+ * mmu_notifier_invalidate_range_end() happens which can lead to a
+ * device seeing memory write in different order than CPU.
+ *
+ * See Documentation/vm/mmu_notifier.txt
+ */
pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd);
- /* leave pmd empty until pte is filled */
pgtable = pgtable_trans_huge_withdraw(vma->vm_mm, vmf->pmd);
pmd_populate(vma->vm_mm, &_pmd, pgtable);
@@ -1216,7 +1213,12 @@ static int do_huge_pmd_wp_page_fallback(struct vm_fault *vmf, pmd_t orig_pmd,
page_remove_rmap(page, true);
spin_unlock(vmf->ptl);
- mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end);
+ /*
+ * No need to double call mmu_notifier->invalidate_range() callback as
+ * the above pmdp_huge_clear_flush_notify() did already call it.
+ */
+ mmu_notifier_invalidate_range_only_end(vma->vm_mm, mmun_start,
+ mmun_end);
ret |= VM_FAULT_WRITE;
put_page(page);
@@ -1365,7 +1367,12 @@ alloc:
}
spin_unlock(vmf->ptl);
out_mn:
- mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end);
+ /*
+ * No need to double call mmu_notifier->invalidate_range() callback as
+ * the above pmdp_huge_clear_flush_notify() did already call it.
+ */
+ mmu_notifier_invalidate_range_only_end(vma->vm_mm, mmun_start,
+ mmun_end);
out:
return ret;
out_unlock:
@@ -1407,7 +1414,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
page = pmd_page(*pmd);
VM_BUG_ON_PAGE(!PageHead(page) && !is_zone_device_page(page), page);
if (flags & FOLL_TOUCH)
- touch_pmd(vma, addr, pmd);
+ touch_pmd(vma, addr, pmd, flags);
if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
/*
* We don't mlock() pte-mapped THPs. This way we can avoid
@@ -1678,7 +1685,7 @@ static inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd)
pgtable = pgtable_trans_huge_withdraw(mm, pmd);
pte_free(mm, pgtable);
- atomic_long_dec(&mm->nr_ptes);
+ mm_dec_nr_ptes(mm);
}
int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
@@ -2017,7 +2024,12 @@ void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
out:
spin_unlock(ptl);
- mmu_notifier_invalidate_range_end(mm, haddr, haddr + HPAGE_PUD_SIZE);
+ /*
+ * No need to double call mmu_notifier->invalidate_range() callback as
+ * the above pudp_huge_clear_flush_notify() did already call it.
+ */
+ mmu_notifier_invalidate_range_only_end(mm, haddr, haddr +
+ HPAGE_PUD_SIZE);
}
#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
@@ -2029,8 +2041,15 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
pmd_t _pmd;
int i;
- /* leave pmd empty until pte is filled */
- pmdp_huge_clear_flush_notify(vma, haddr, pmd);
+ /*
+ * Leave pmd empty until pte is filled note that it is fine to delay
+ * notification until mmu_notifier_invalidate_range_end() as we are
+ * replacing a zero pmd write protected page with a zero pte write
+ * protected page.
+ *
+ * See Documentation/vm/mmu_notifier.txt
+ */
+ pmdp_huge_clear_flush(vma, haddr, pmd);
pgtable = pgtable_trans_huge_withdraw(mm, pmd);
pmd_populate(mm, &_pmd, pgtable);
@@ -2085,6 +2104,15 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
add_mm_counter(mm, MM_FILEPAGES, -HPAGE_PMD_NR);
return;
} else if (is_huge_zero_pmd(*pmd)) {
+ /*
+ * FIXME: Do we want to invalidate secondary mmu by calling
+ * mmu_notifier_invalidate_range() see comments below inside
+ * __split_huge_pmd() ?
+ *
+ * We are going from a zero huge page write protected to zero
+ * small page also write protected so it does not seems useful
+ * to invalidate secondary mmu at this time.
+ */
return __split_huge_zero_page_pmd(vma, haddr, pmd);
}
@@ -2220,7 +2248,21 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
__split_huge_pmd_locked(vma, pmd, haddr, freeze);
out:
spin_unlock(ptl);
- mmu_notifier_invalidate_range_end(mm, haddr, haddr + HPAGE_PMD_SIZE);
+ /*
+ * No need to double call mmu_notifier->invalidate_range() callback.
+ * They are 3 cases to consider inside __split_huge_pmd_locked():
+ * 1) pmdp_huge_clear_flush_notify() call invalidate_range() obvious
+ * 2) __split_huge_zero_page_pmd() read only zero page and any write
+ * fault will trigger a flush_notify before pointing to a new page
+ * (it is fine if the secondary mmu keeps pointing to the old zero
+ * page in the meantime)
+ * 3) Split a huge pmd into pte pointing to the same page. No need
+ * to invalidate secondary tlb entry they are all still valid.
+ * any further changes to individual pte will notify. So no need
+ * to call mmu_notifier->invalidate_range()
+ */
+ mmu_notifier_invalidate_range_only_end(mm, haddr, haddr +
+ HPAGE_PMD_SIZE);
}
void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address,
@@ -2718,7 +2760,7 @@ static unsigned long deferred_split_count(struct shrinker *shrink,
struct shrink_control *sc)
{
struct pglist_data *pgdata = NODE_DATA(sc->nid);
- return ACCESS_ONCE(pgdata->split_queue_len);
+ return READ_ONCE(pgdata->split_queue_len);
}
static unsigned long deferred_split_scan(struct shrinker *shrink,
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 2d2ff5e8bf2b..9a334f5fb730 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3125,6 +3125,13 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma)
}
}
+static int hugetlb_vm_op_split(struct vm_area_struct *vma, unsigned long addr)
+{
+ if (addr & ~(huge_page_mask(hstate_vma(vma))))
+ return -EINVAL;
+ return 0;
+}
+
/*
* We cannot handle pagefaults against hugetlb pages at all. They cause
* handle_mm_fault() to try to instantiate regular-sized pages in the
@@ -3141,6 +3148,7 @@ const struct vm_operations_struct hugetlb_vm_ops = {
.fault = hugetlb_vm_op_fault,
.open = hugetlb_vm_op_open,
.close = hugetlb_vm_op_close,
+ .split = hugetlb_vm_op_split,
};
static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
@@ -3256,9 +3264,14 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
set_huge_swap_pte_at(dst, addr, dst_pte, entry, sz);
} else {
if (cow) {
+ /*
+ * No need to notify as we are downgrading page
+ * table protection not changing it to point
+ * to a new page.
+ *
+ * See Documentation/vm/mmu_notifier.txt
+ */
huge_ptep_set_wrprotect(src, addr, src_pte);
- mmu_notifier_invalidate_range(src, mmun_start,
- mmun_end);
}
entry = huge_ptep_get(src_pte);
ptepage = pte_page(entry);
@@ -4318,7 +4331,12 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
* and that page table be reused and filled with junk.
*/
flush_hugetlb_tlb_range(vma, start, end);
- mmu_notifier_invalidate_range(mm, start, end);
+ /*
+ * No need to call mmu_notifier_invalidate_range() we are downgrading
+ * page table protection not changing it to point to a new page.
+ *
+ * See Documentation/vm/mmu_notifier.txt
+ */
i_mmap_unlock_write(vma->vm_file->f_mapping);
mmu_notifier_invalidate_range_end(mm, start, end);
@@ -4617,7 +4635,9 @@ pte_t *huge_pte_alloc(struct mm_struct *mm,
pte_t *pte = NULL;
pgd = pgd_offset(mm, addr);
- p4d = p4d_offset(pgd, addr);
+ p4d = p4d_alloc(mm, pgd, addr);
+ if (!p4d)
+ return NULL;
pud = pud_alloc(mm, p4d, addr);
if (pud) {
if (sz == PUD_SIZE) {
diff --git a/mm/internal.h b/mm/internal.h
index 1df011f62480..e6bd35182dae 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -198,6 +198,7 @@ struct compact_control {
const int classzone_idx; /* zone index of a direct compactor */
enum migrate_mode mode; /* Async or sync migration mode */
bool ignore_skip_hint; /* Scan blocks even if marked skip */
+ bool no_set_skip_hint; /* Don't mark blocks for skipping */
bool ignore_block_suitable; /* Scan blocks considered unsuitable */
bool direct_compaction; /* False from kcompactd or /proc/... */
bool whole_zone; /* Whole zone should/has been scanned */
diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c
index 6f319fb81718..405bba487df5 100644
--- a/mm/kasan/kasan.c
+++ b/mm/kasan/kasan.c
@@ -337,7 +337,7 @@ static size_t optimal_redzone(size_t object_size)
}
void kasan_cache_create(struct kmem_cache *cache, size_t *size,
- unsigned long *flags)
+ slab_flags_t *flags)
{
int redzone_adjust;
int orig_size = *size;
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
index 6bcfb01ba038..410c8235e671 100644
--- a/mm/kasan/report.c
+++ b/mm/kasan/report.c
@@ -134,7 +134,7 @@ static void print_error_description(struct kasan_access_info *info)
pr_err("BUG: KASAN: %s in %pS\n",
bug_type, (void *)info->ip);
- pr_err("%s of size %zu at addr %p by task %s/%d\n",
+ pr_err("%s of size %zu at addr %px by task %s/%d\n",
info->is_write ? "Write" : "Read", info->access_size,
info->access_addr, current->comm, task_pid_nr(current));
}
@@ -206,7 +206,7 @@ static void describe_object_addr(struct kmem_cache *cache, void *object,
const char *rel_type;
int rel_bytes;
- pr_err("The buggy address belongs to the object at %p\n"
+ pr_err("The buggy address belongs to the object at %px\n"
" which belongs to the cache %s of size %d\n",
object, cache->name, cache->object_size);
@@ -225,7 +225,7 @@ static void describe_object_addr(struct kmem_cache *cache, void *object,
}
pr_err("The buggy address is located %d bytes %s of\n"
- " %d-byte region [%p, %p)\n",
+ " %d-byte region [%px, %px)\n",
rel_bytes, rel_type, cache->object_size, (void *)object_addr,
(void *)(object_addr + cache->object_size));
}
@@ -302,7 +302,7 @@ static void print_shadow_for_address(const void *addr)
char shadow_buf[SHADOW_BYTES_PER_ROW];
snprintf(buffer, sizeof(buffer),
- (i == 0) ? ">%p: " : " %p: ", kaddr);
+ (i == 0) ? ">%px: " : " %px: ", kaddr);
/*
* We should not pass a shadow pointer to generic
* function, because generic functions may try to
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 43cb3043311b..ea4ff259b671 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1270,7 +1270,7 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
_pmd = pmdp_collapse_flush(vma, addr, pmd);
spin_unlock(ptl);
up_write(&vma->vm_mm->mmap_sem);
- atomic_long_dec(&vma->vm_mm->nr_ptes);
+ mm_dec_nr_ptes(vma->vm_mm);
pte_free(vma->vm_mm, pmd_pgtable(_pmd));
}
}
diff --git a/mm/kmemcheck.c b/mm/kmemcheck.c
deleted file mode 100644
index 800d64b854ea..000000000000
--- a/mm/kmemcheck.c
+++ /dev/null
@@ -1,126 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <linux/gfp.h>
-#include <linux/mm_types.h>
-#include <linux/mm.h>
-#include <linux/slab.h>
-#include "slab.h"
-#include <linux/kmemcheck.h>
-
-void kmemcheck_alloc_shadow(struct page *page, int order, gfp_t flags, int node)
-{
- struct page *shadow;
- int pages;
- int i;
-
- pages = 1 << order;
-
- /*
- * With kmemcheck enabled, we need to allocate a memory area for the
- * shadow bits as well.
- */
- shadow = alloc_pages_node(node, flags | __GFP_NOTRACK, order);
- if (!shadow) {
- if (printk_ratelimit())
- pr_err("kmemcheck: failed to allocate shadow bitmap\n");
- return;
- }
-
- for(i = 0; i < pages; ++i)
- page[i].shadow = page_address(&shadow[i]);
-
- /*
- * Mark it as non-present for the MMU so that our accesses to
- * this memory will trigger a page fault and let us analyze
- * the memory accesses.
- */
- kmemcheck_hide_pages(page, pages);
-}
-
-void kmemcheck_free_shadow(struct page *page, int order)
-{
- struct page *shadow;
- int pages;
- int i;
-
- if (!kmemcheck_page_is_tracked(page))
- return;
-
- pages = 1 << order;
-
- kmemcheck_show_pages(page, pages);
-
- shadow = virt_to_page(page[0].shadow);
-
- for(i = 0; i < pages; ++i)
- page[i].shadow = NULL;
-
- __free_pages(shadow, order);
-}
-
-void kmemcheck_slab_alloc(struct kmem_cache *s, gfp_t gfpflags, void *object,
- size_t size)
-{
- if (unlikely(!object)) /* Skip object if allocation failed */
- return;
-
- /*
- * Has already been memset(), which initializes the shadow for us
- * as well.
- */
- if (gfpflags & __GFP_ZERO)
- return;
-
- /* No need to initialize the shadow of a non-tracked slab. */
- if (s->flags & SLAB_NOTRACK)
- return;
-
- if (!kmemcheck_enabled || gfpflags & __GFP_NOTRACK) {
- /*
- * Allow notracked objects to be allocated from
- * tracked caches. Note however that these objects
- * will still get page faults on access, they just
- * won't ever be flagged as uninitialized. If page
- * faults are not acceptable, the slab cache itself
- * should be marked NOTRACK.
- */
- kmemcheck_mark_initialized(object, size);
- } else if (!s->ctor) {
- /*
- * New objects should be marked uninitialized before
- * they're returned to the called.
- */
- kmemcheck_mark_uninitialized(object, size);
- }
-}
-
-void kmemcheck_slab_free(struct kmem_cache *s, void *object, size_t size)
-{
- /* TODO: RCU freeing is unsupported for now; hide false positives. */
- if (!s->ctor && !(s->flags & SLAB_TYPESAFE_BY_RCU))
- kmemcheck_mark_freed(object, size);
-}
-
-void kmemcheck_pagealloc_alloc(struct page *page, unsigned int order,
- gfp_t gfpflags)
-{
- int pages;
-
- if (gfpflags & (__GFP_HIGHMEM | __GFP_NOTRACK))
- return;
-
- pages = 1 << order;
-
- /*
- * NOTE: We choose to track GFP_ZERO pages too; in fact, they
- * can become uninitialized by copying uninitialized memory
- * into them.
- */
-
- /* XXX: Can use zone->node for node? */
- kmemcheck_alloc_shadow(page, order, gfpflags, -1);
-
- if (gfpflags & __GFP_ZERO)
- kmemcheck_mark_initialized_pages(page, pages);
- else
- kmemcheck_mark_uninitialized_pages(page, pages);
-}
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 7780cd83a495..f656ca27f6c2 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -110,7 +110,6 @@
#include <linux/atomic.h>
#include <linux/kasan.h>
-#include <linux/kmemcheck.h>
#include <linux/kmemleak.h>
#include <linux/memory_hotplug.h>
@@ -128,7 +127,7 @@
/* GFP bitmask for kmemleak internal allocations */
#define gfp_kmemleak_mask(gfp) (((gfp) & (GFP_KERNEL | GFP_ATOMIC)) | \
__GFP_NORETRY | __GFP_NOMEMALLOC | \
- __GFP_NOWARN)
+ __GFP_NOWARN | __GFP_NOFAIL)
/* scanning area inside a memory block */
struct kmemleak_scan_area {
@@ -1238,9 +1237,6 @@ static bool update_checksum(struct kmemleak_object *object)
{
u32 old_csum = object->checksum;
- if (!kmemcheck_is_obj_initialized(object->pointer, object->size))
- return false;
-
kasan_disable_current();
object->checksum = crc32(0, (void *)object->pointer, object->size);
kasan_enable_current();
@@ -1314,11 +1310,6 @@ static void scan_block(void *_start, void *_end,
if (scan_should_stop())
break;
- /* don't scan uninitialized memory */
- if (!kmemcheck_is_obj_initialized((unsigned long)ptr,
- BYTES_PER_POINTER))
- continue;
-
kasan_disable_current();
pointer = *ptr;
kasan_enable_current();
@@ -1532,6 +1523,8 @@ static void kmemleak_scan(void)
if (page_count(page) == 0)
continue;
scan_block(page, page + 1, NULL);
+ if (!(pfn & 63))
+ cond_resched();
}
}
put_online_mems();
@@ -2104,7 +2097,7 @@ static int __init kmemleak_late_init(void)
return -ENOMEM;
}
- dentry = debugfs_create_file("kmemleak", S_IRUGO, NULL, NULL,
+ dentry = debugfs_create_file("kmemleak", 0644, NULL, NULL,
&kmemleak_fops);
if (!dentry)
pr_warn("Failed to create the debugfs kmemleak file\n");
diff --git a/mm/ksm.c b/mm/ksm.c
index 6cb60f46cce5..be8f4576f842 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1052,8 +1052,13 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
* So we clear the pte and flush the tlb before the check
* this assure us that no O_DIRECT can happen after the check
* or in the middle of the check.
+ *
+ * No need to notify as we are downgrading page table to read
+ * only not changing it to point to a new page.
+ *
+ * See Documentation/vm/mmu_notifier.txt
*/
- entry = ptep_clear_flush_notify(vma, pvmw.address, pvmw.pte);
+ entry = ptep_clear_flush(vma, pvmw.address, pvmw.pte);
/*
* Check that no O_DIRECT or similar I/O is in progress on the
* page
@@ -1136,7 +1141,13 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
}
flush_cache_page(vma, addr, pte_pfn(*ptep));
- ptep_clear_flush_notify(vma, addr, ptep);
+ /*
+ * No need to notify as we are replacing a read only page with another
+ * read only page with the same content.
+ *
+ * See Documentation/vm/mmu_notifier.txt
+ */
+ ptep_clear_flush(vma, addr, ptep);
set_pte_at_notify(mm, addr, ptep, newpte);
page_remove_rmap(page, false);
diff --git a/mm/list_lru.c b/mm/list_lru.c
index f141f0c80ff3..fd41e969ede5 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -221,6 +221,7 @@ restart:
switch (ret) {
case LRU_REMOVED_RETRY:
assert_spin_locked(&nlru->lock);
+ /* fall through */
case LRU_REMOVED:
isolated++;
nlru->nr_items--;
diff --git a/mm/madvise.c b/mm/madvise.c
index 375cf32087e4..751e97aa2210 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -276,15 +276,14 @@ static long madvise_willneed(struct vm_area_struct *vma,
{
struct file *file = vma->vm_file;
+ *prev = vma;
#ifdef CONFIG_SWAP
if (!file) {
- *prev = vma;
force_swapin_readahead(vma, start, end);
return 0;
}
if (shmem_mapping(file->f_mapping)) {
- *prev = vma;
force_shm_swapin_readahead(vma, start, end,
file->f_mapping);
return 0;
@@ -299,7 +298,6 @@ static long madvise_willneed(struct vm_area_struct *vma,
return 0;
}
- *prev = vma;
start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
if (end > vma->vm_end)
end = vma->vm_end;
diff --git a/mm/memblock.c b/mm/memblock.c
index 91205780e6b1..46aacdfa4f4d 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -533,7 +533,7 @@ repeat:
base = obase;
nr_new = 0;
- for_each_memblock_type(type, rgn) {
+ for_each_memblock_type(idx, type, rgn) {
phys_addr_t rbase = rgn->base;
phys_addr_t rend = rbase + rgn->size;
@@ -637,7 +637,7 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type,
if (memblock_double_array(type, base, size) < 0)
return -ENOMEM;
- for_each_memblock_type(type, rgn) {
+ for_each_memblock_type(idx, type, rgn) {
phys_addr_t rbase = rgn->base;
phys_addr_t rend = rbase + rgn->size;
@@ -1327,7 +1327,6 @@ again:
return NULL;
done:
ptr = phys_to_virt(alloc);
- memset(ptr, 0, size);
/*
* The min_count is set to 0 so that bootmem allocated blocks
@@ -1341,6 +1340,45 @@ done:
}
/**
+ * memblock_virt_alloc_try_nid_raw - allocate boot memory block without zeroing
+ * memory and without panicking
+ * @size: size of memory block to be allocated in bytes
+ * @align: alignment of the region and block's size
+ * @min_addr: the lower bound of the memory region from where the allocation
+ * is preferred (phys address)
+ * @max_addr: the upper bound of the memory region from where the allocation
+ * is preferred (phys address), or %BOOTMEM_ALLOC_ACCESSIBLE to
+ * allocate only from memory limited by memblock.current_limit value
+ * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
+ *
+ * Public function, provides additional debug information (including caller
+ * info), if enabled. Does not zero allocated memory, does not panic if request
+ * cannot be satisfied.
+ *
+ * RETURNS:
+ * Virtual address of allocated memory block on success, NULL on failure.
+ */
+void * __init memblock_virt_alloc_try_nid_raw(
+ phys_addr_t size, phys_addr_t align,
+ phys_addr_t min_addr, phys_addr_t max_addr,
+ int nid)
+{
+ void *ptr;
+
+ memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx %pF\n",
+ __func__, (u64)size, (u64)align, nid, (u64)min_addr,
+ (u64)max_addr, (void *)_RET_IP_);
+
+ ptr = memblock_virt_alloc_internal(size, align,
+ min_addr, max_addr, nid);
+#ifdef CONFIG_DEBUG_VM
+ if (ptr && size > 0)
+ memset(ptr, 0xff, size);
+#endif
+ return ptr;
+}
+
+/**
* memblock_virt_alloc_try_nid_nopanic - allocate boot memory block
* @size: size of memory block to be allocated in bytes
* @align: alignment of the region and block's size
@@ -1351,8 +1389,8 @@ done:
* allocate only from memory limited by memblock.current_limit value
* @nid: nid of the free area to find, %NUMA_NO_NODE for any node
*
- * Public version of _memblock_virt_alloc_try_nid_nopanic() which provides
- * additional debug information (including caller info), if enabled.
+ * Public function, provides additional debug information (including caller
+ * info), if enabled. This function zeroes the allocated memory.
*
* RETURNS:
* Virtual address of allocated memory block on success, NULL on failure.
@@ -1362,11 +1400,17 @@ void * __init memblock_virt_alloc_try_nid_nopanic(
phys_addr_t min_addr, phys_addr_t max_addr,
int nid)
{
+ void *ptr;
+
memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx %pF\n",
__func__, (u64)size, (u64)align, nid, (u64)min_addr,
(u64)max_addr, (void *)_RET_IP_);
- return memblock_virt_alloc_internal(size, align, min_addr,
- max_addr, nid);
+
+ ptr = memblock_virt_alloc_internal(size, align,
+ min_addr, max_addr, nid);
+ if (ptr)
+ memset(ptr, 0, size);
+ return ptr;
}
/**
@@ -1380,7 +1424,7 @@ void * __init memblock_virt_alloc_try_nid_nopanic(
* allocate only from memory limited by memblock.current_limit value
* @nid: nid of the free area to find, %NUMA_NO_NODE for any node
*
- * Public panicking version of _memblock_virt_alloc_try_nid_nopanic()
+ * Public panicking version of memblock_virt_alloc_try_nid_nopanic()
* which provides debug information (including caller info), if enabled,
* and panics if the request can not be satisfied.
*
@@ -1399,8 +1443,10 @@ void * __init memblock_virt_alloc_try_nid(
(u64)max_addr, (void *)_RET_IP_);
ptr = memblock_virt_alloc_internal(size, align,
min_addr, max_addr, nid);
- if (ptr)
+ if (ptr) {
+ memset(ptr, 0, size);
return ptr;
+ }
panic("%s: Failed to allocate %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx\n",
__func__, (u64)size, (u64)align, nid, (u64)min_addr,
@@ -1715,7 +1761,7 @@ static void __init_memblock memblock_dump(struct memblock_type *type)
pr_info(" %s.cnt = 0x%lx\n", type->name, type->cnt);
- for_each_memblock_type(type, rgn) {
+ for_each_memblock_type(idx, type, rgn) {
char nid_buf[32] = "";
base = rgn->base;
@@ -1739,7 +1785,7 @@ memblock_reserved_memory_within(phys_addr_t start_addr, phys_addr_t end_addr)
unsigned long size = 0;
int idx;
- for_each_memblock_type((&memblock.reserved), rgn) {
+ for_each_memblock_type(idx, (&memblock.reserved), rgn) {
phys_addr_t start, end;
if (rgn->base + rgn->size < start_addr)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 661f046ad318..ac2ffd5e02b9 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4049,7 +4049,7 @@ static struct cftype mem_cgroup_legacy_files[] = {
.write = mem_cgroup_reset,
.read_u64 = mem_cgroup_read_u64,
},
-#ifdef CONFIG_SLABINFO
+#if defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG)
{
.name = "kmem.slabinfo",
.seq_start = memcg_slab_start,
@@ -6044,7 +6044,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
memcg_check_events(memcg, page);
if (!mem_cgroup_is_root(memcg))
- css_put(&memcg->css);
+ css_put_many(&memcg->css, nr_entries);
}
/**
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 88366626c0b7..4acdf393a801 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1587,7 +1587,7 @@ static int soft_offline_huge_page(struct page *page, int flags)
ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
MIGRATE_SYNC, MR_MEMORY_FAILURE);
if (ret) {
- pr_info("soft offline: %#lx: migration failed %d, type %lx (%pGp)\n",
+ pr_info("soft offline: %#lx: hugepage migration failed %d, type %lx (%pGp)\n",
pfn, ret, page->flags, &page->flags);
if (!list_empty(&pagelist))
putback_movable_pages(&pagelist);
diff --git a/mm/memory.c b/mm/memory.c
index a728bed16c20..793004608332 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -438,7 +438,7 @@ static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
pgtable_t token = pmd_pgtable(*pmd);
pmd_clear(pmd);
pte_free_tlb(tlb, token, addr);
- atomic_long_dec(&tlb->mm->nr_ptes);
+ mm_dec_nr_ptes(tlb->mm);
}
static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
@@ -506,6 +506,7 @@ static inline void free_pud_range(struct mmu_gather *tlb, p4d_t *p4d,
pud = pud_offset(p4d, start);
p4d_clear(p4d);
pud_free_tlb(tlb, pud, start);
+ mm_dec_nr_puds(tlb->mm);
}
static inline void free_p4d_range(struct mmu_gather *tlb, pgd_t *pgd,
@@ -665,7 +666,7 @@ int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
ptl = pmd_lock(mm, pmd);
if (likely(pmd_none(*pmd))) { /* Has another populated it ? */
- atomic_long_inc(&mm->nr_ptes);
+ mm_inc_nr_ptes(mm);
pmd_populate(mm, pmd, new);
new = NULL;
}
@@ -2554,7 +2555,11 @@ static int wp_page_copy(struct vm_fault *vmf)
put_page(new_page);
pte_unmap_unlock(vmf->pte, vmf->ptl);
- mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+ /*
+ * No need to double call mmu_notifier->invalidate_range() callback as
+ * the above ptep_clear_flush_notify() did already call it.
+ */
+ mmu_notifier_invalidate_range_only_end(mm, mmun_start, mmun_end);
if (old_page) {
/*
* Don't let another task, with possibly unlocked vma,
@@ -2842,7 +2847,7 @@ EXPORT_SYMBOL(unmap_mapping_range);
int do_swap_page(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
- struct page *page = NULL, *swapcache;
+ struct page *page = NULL, *swapcache = NULL;
struct mem_cgroup *memcg;
struct vma_swap_readahead swap_ra;
swp_entry_t entry;
@@ -2852,8 +2857,11 @@ int do_swap_page(struct vm_fault *vmf)
int ret = 0;
bool vma_readahead = swap_use_vma_readahead();
- if (vma_readahead)
+ if (vma_readahead) {
page = swap_readahead_detect(vmf, &swap_ra);
+ swapcache = page;
+ }
+
if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte)) {
if (page)
put_page(page);
@@ -2881,17 +2889,39 @@ int do_swap_page(struct vm_fault *vmf)
}
goto out;
}
+
+
delayacct_set_flag(DELAYACCT_PF_SWAPIN);
- if (!page)
+ if (!page) {
page = lookup_swap_cache(entry, vma_readahead ? vma : NULL,
vmf->address);
+ swapcache = page;
+ }
+
if (!page) {
- if (vma_readahead)
- page = do_swap_page_readahead(entry,
- GFP_HIGHUSER_MOVABLE, vmf, &swap_ra);
- else
- page = swapin_readahead(entry,
- GFP_HIGHUSER_MOVABLE, vma, vmf->address);
+ struct swap_info_struct *si = swp_swap_info(entry);
+
+ if (si->flags & SWP_SYNCHRONOUS_IO &&
+ __swap_count(si, entry) == 1) {
+ /* skip swapcache */
+ page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address);
+ if (page) {
+ __SetPageLocked(page);
+ __SetPageSwapBacked(page);
+ set_page_private(page, entry.val);
+ lru_cache_add_anon(page);
+ swap_readpage(page, true);
+ }
+ } else {
+ if (vma_readahead)
+ page = do_swap_page_readahead(entry,
+ GFP_HIGHUSER_MOVABLE, vmf, &swap_ra);
+ else
+ page = swapin_readahead(entry,
+ GFP_HIGHUSER_MOVABLE, vma, vmf->address);
+ swapcache = page;
+ }
+
if (!page) {
/*
* Back out if somebody else faulted in this pte
@@ -2920,7 +2950,6 @@ int do_swap_page(struct vm_fault *vmf)
goto out_release;
}
- swapcache = page;
locked = lock_page_or_retry(page, vma->vm_mm, vmf->flags);
delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
@@ -2935,7 +2964,8 @@ int do_swap_page(struct vm_fault *vmf)
* test below, are not enough to exclude that. Even if it is still
* swapcache, we need to check that the page's swap has not changed.
*/
- if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val))
+ if (unlikely((!PageSwapCache(page) ||
+ page_private(page) != entry.val)) && swapcache)
goto out_page;
page = ksm_might_need_to_copy(page, vma, vmf->address);
@@ -2988,14 +3018,16 @@ int do_swap_page(struct vm_fault *vmf)
pte = pte_mksoft_dirty(pte);
set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
vmf->orig_pte = pte;
- if (page == swapcache) {
- do_page_add_anon_rmap(page, vma, vmf->address, exclusive);
- mem_cgroup_commit_charge(page, memcg, true, false);
- activate_page(page);
- } else { /* ksm created a completely new copy */
+
+ /* ksm created a completely new copy */
+ if (unlikely(page != swapcache && swapcache)) {
page_add_new_anon_rmap(page, vma, vmf->address, false);
mem_cgroup_commit_charge(page, memcg, false, false);
lru_cache_add_active_or_unevictable(page, vma);
+ } else {
+ do_page_add_anon_rmap(page, vma, vmf->address, exclusive);
+ mem_cgroup_commit_charge(page, memcg, true, false);
+ activate_page(page);
}
swap_free(entry);
@@ -3003,7 +3035,7 @@ int do_swap_page(struct vm_fault *vmf)
(vma->vm_flags & VM_LOCKED) || PageMlocked(page))
try_to_free_swap(page);
unlock_page(page);
- if (page != swapcache) {
+ if (page != swapcache && swapcache) {
/*
* Hold the lock to avoid the swap entry to be reused
* until we take the PT lock for the pte_same() check
@@ -3036,7 +3068,7 @@ out_page:
unlock_page(page);
out_release:
put_page(page);
- if (page != swapcache) {
+ if (page != swapcache && swapcache) {
unlock_page(swapcache);
put_page(swapcache);
}
@@ -3212,7 +3244,7 @@ static int pte_alloc_one_map(struct vm_fault *vmf)
goto map_pte;
}
- atomic_long_inc(&vma->vm_mm->nr_ptes);
+ mm_inc_nr_ptes(vma->vm_mm);
pmd_populate(vma->vm_mm, vmf->pmd, vmf->prealloc_pte);
spin_unlock(vmf->ptl);
vmf->prealloc_pte = NULL;
@@ -3271,7 +3303,7 @@ static void deposit_prealloc_pte(struct vm_fault *vmf)
* We are going to consume the prealloc table,
* count that as nr_ptes.
*/
- atomic_long_inc(&vma->vm_mm->nr_ptes);
+ mm_inc_nr_ptes(vma->vm_mm);
vmf->prealloc_pte = NULL;
}
@@ -3805,7 +3837,8 @@ static inline int create_huge_pmd(struct vm_fault *vmf)
return VM_FAULT_FALLBACK;
}
-static int wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd)
+/* `inline' is required to avoid gcc 4.1.2 build error */
+static inline int wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd)
{
if (vma_is_anonymous(vmf->vma))
return do_huge_pmd_wp_page(vmf, orig_pmd);
@@ -3891,9 +3924,9 @@ static int handle_pte_fault(struct vm_fault *vmf)
/*
* some architectures can have larger ptes than wordsize,
* e.g.ppc44x-defconfig has CONFIG_PTE_64BIT=y and
- * CONFIG_32BIT=y, so READ_ONCE or ACCESS_ONCE cannot guarantee
- * atomic accesses. The code below just needs a consistent
- * view for the ifs and we later double check anyway with the
+ * CONFIG_32BIT=y, so READ_ONCE cannot guarantee atomic
+ * accesses. The code below just needs a consistent view
+ * for the ifs and we later double check anyway with the
* ptl lock held. So here a barrier will do.
*/
barrier();
@@ -4124,15 +4157,17 @@ int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address)
spin_lock(&mm->page_table_lock);
#ifndef __ARCH_HAS_5LEVEL_HACK
- if (p4d_present(*p4d)) /* Another has populated it */
- pud_free(mm, new);
- else
+ if (!p4d_present(*p4d)) {
+ mm_inc_nr_puds(mm);
p4d_populate(mm, p4d, new);
-#else
- if (pgd_present(*p4d)) /* Another has populated it */
+ } else /* Another has populated it */
pud_free(mm, new);
- else
+#else
+ if (!pgd_present(*p4d)) {
+ mm_inc_nr_puds(mm);
pgd_populate(mm, p4d, new);
+ } else /* Another has populated it */
+ pud_free(mm, new);
#endif /* __ARCH_HAS_5LEVEL_HACK */
spin_unlock(&mm->page_table_lock);
return 0;
@@ -4457,17 +4492,15 @@ void print_vma_addr(char *prefix, unsigned long ip)
struct vm_area_struct *vma;
/*
- * Do not print if we are in atomic
- * contexts (in exception stacks, etc.):
+ * we might be running from an atomic context so we cannot sleep
*/
- if (preempt_count())
+ if (!down_read_trylock(&mm->mmap_sem))
return;
- down_read(&mm->mmap_sem);
vma = find_vma(mm, ip);
if (vma && vma->vm_file) {
struct file *f = vma->vm_file;
- char *buf = (char *)__get_free_page(GFP_KERNEL);
+ char *buf = (char *)__get_free_page(GFP_NOWAIT);
if (buf) {
char *p;
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index d4b5f29906b9..c52aa05b106c 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -265,7 +265,7 @@ static int __meminit __add_section(int nid, unsigned long phys_start_pfn,
/*
* Make all the pages reserved so that nobody will stumble over half
* initialized state.
- * FIXME: We also have to associate it with a node because pfn_to_node
+ * FIXME: We also have to associate it with a node because page_to_nid
* relies on having page with the proper node.
*/
for (i = 0; i < PAGES_PER_SECTION; i++) {
@@ -1590,11 +1590,11 @@ static void node_states_clear_node(int node, struct memory_notify *arg)
}
static int __ref __offline_pages(unsigned long start_pfn,
- unsigned long end_pfn, unsigned long timeout)
+ unsigned long end_pfn)
{
- unsigned long pfn, nr_pages, expire;
+ unsigned long pfn, nr_pages;
long offlined_pages;
- int ret, drain, retry_max, node;
+ int ret, node;
unsigned long flags;
unsigned long valid_start, valid_end;
struct zone *zone;
@@ -1630,44 +1630,22 @@ static int __ref __offline_pages(unsigned long start_pfn,
goto failed_removal;
pfn = start_pfn;
- expire = jiffies + timeout;
- drain = 0;
- retry_max = 5;
repeat:
/* start memory hot removal */
- ret = -EAGAIN;
- if (time_after(jiffies, expire))
- goto failed_removal;
ret = -EINTR;
if (signal_pending(current))
goto failed_removal;
- ret = 0;
- if (drain) {
- lru_add_drain_all_cpuslocked();
- cond_resched();
- drain_all_pages(zone);
- }
+
+ cond_resched();
+ lru_add_drain_all_cpuslocked();
+ drain_all_pages(zone);
pfn = scan_movable_pages(start_pfn, end_pfn);
if (pfn) { /* We have movable pages */
ret = do_migrate_range(pfn, end_pfn);
- if (!ret) {
- drain = 1;
- goto repeat;
- } else {
- if (ret < 0)
- if (--retry_max == 0)
- goto failed_removal;
- yield();
- drain = 1;
- goto repeat;
- }
+ goto repeat;
}
- /* drain all zone's lru pagevec, this is asynchronous... */
- lru_add_drain_all_cpuslocked();
- yield();
- /* drain pcp pages, this is synchronous. */
- drain_all_pages(zone);
+
/*
* dissolve free hugepages in the memory block before doing offlining
* actually in order to make hugetlbfs's object counting consistent.
@@ -1677,10 +1655,8 @@ repeat:
goto failed_removal;
/* check again */
offlined_pages = check_pages_isolated(start_pfn, end_pfn);
- if (offlined_pages < 0) {
- ret = -EBUSY;
- goto failed_removal;
- }
+ if (offlined_pages < 0)
+ goto repeat;
pr_info("Offlined Pages %ld\n", offlined_pages);
/* Ok, all of our target is isolated.
We cannot do rollback at this point. */
@@ -1728,7 +1704,7 @@ failed_removal:
/* Must be protected by mem_hotplug_begin() or a device_lock */
int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
{
- return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ);
+ return __offline_pages(start_pfn, start_pfn + nr_pages);
}
#endif /* CONFIG_MEMORY_HOTREMOVE */
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index a2af6d58a68f..4ce44d3ff03d 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -85,6 +85,7 @@
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/compat.h>
+#include <linux/ptrace.h>
#include <linux/swap.h>
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
@@ -1365,7 +1366,6 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
const unsigned long __user *, old_nodes,
const unsigned long __user *, new_nodes)
{
- const struct cred *cred = current_cred(), *tcred;
struct mm_struct *mm = NULL;
struct task_struct *task;
nodemask_t task_nodes;
@@ -1401,15 +1401,10 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
err = -EINVAL;
/*
- * Check if this process has the right to modify the specified
- * process. The right exists if the process has administrative
- * capabilities, superuser privileges or the same
- * userid as the target process.
+ * Check if this process has the right to modify the specified process.
+ * Use the regular "ptrace_may_access()" checks.
*/
- tcred = __task_cred(task);
- if (!uid_eq(cred->euid, tcred->suid) && !uid_eq(cred->euid, tcred->uid) &&
- !uid_eq(cred->uid, tcred->suid) && !uid_eq(cred->uid, tcred->uid) &&
- !capable(CAP_SYS_NICE)) {
+ if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
rcu_read_unlock();
err = -EPERM;
goto out_put;
@@ -1920,6 +1915,9 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
struct page *page;
page = __alloc_pages(gfp, order, nid);
+ /* skip NUMA_INTERLEAVE_HIT counter update if numa stats is disabled */
+ if (!static_branch_likely(&vm_numa_stat_key))
+ return page;
if (page && page_to_nid(page) == nid) {
preempt_disable();
__inc_numa_state(page_zone(page), NUMA_INTERLEAVE_HIT);
diff --git a/mm/mempool.c b/mm/mempool.c
index c4a23cdae3f0..7d8c5a0010a2 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -189,7 +189,7 @@ mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn,
pool = kzalloc_node(sizeof(*pool), gfp_mask, node_id);
if (!pool)
return NULL;
- pool->elements = kmalloc_node(min_nr * sizeof(void *),
+ pool->elements = kmalloc_array_node(min_nr, sizeof(void *),
gfp_mask, node_id);
if (!pool->elements) {
kfree(pool);
diff --git a/mm/migrate.c b/mm/migrate.c
index 1236449b4777..4d0be47a322a 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -2089,7 +2089,11 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
set_page_owner_migrate_reason(new_page, MR_NUMA_MISPLACED);
spin_unlock(ptl);
- mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+ /*
+ * No need to double call mmu_notifier->invalidate_range() callback as
+ * the above pmdp_huge_clear_flush_notify() did already call it.
+ */
+ mmu_notifier_invalidate_range_only_end(mm, mmun_start, mmun_end);
/* Take an "isolate" reference and put new page on the LRU. */
get_page(new_page);
@@ -2805,9 +2809,14 @@ static void migrate_vma_pages(struct migrate_vma *migrate)
migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
}
+ /*
+ * No need to double call mmu_notifier->invalidate_range() callback as
+ * the above ptep_clear_flush_notify() inside migrate_vma_insert_page()
+ * did already call it.
+ */
if (notified)
- mmu_notifier_invalidate_range_end(mm, mmu_start,
- migrate->end);
+ mmu_notifier_invalidate_range_only_end(mm, mmu_start,
+ migrate->end);
}
/*
diff --git a/mm/mlock.c b/mm/mlock.c
index 46af369c13e5..30472d438794 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -289,7 +289,7 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone)
struct pagevec pvec_putback;
int pgrescued = 0;
- pagevec_init(&pvec_putback, 0);
+ pagevec_init(&pvec_putback);
/* Phase 1: page isolation */
spin_lock_irq(zone_lru_lock(zone));
@@ -448,7 +448,7 @@ void munlock_vma_pages_range(struct vm_area_struct *vma,
struct pagevec pvec;
struct zone *zone;
- pagevec_init(&pvec, 0);
+ pagevec_init(&pvec);
/*
* Although FOLL_DUMP is intended for get_dump_page(),
* it just so happens that its special treatment of the
@@ -670,8 +670,6 @@ static __must_check int do_mlock(unsigned long start, size_t len, vm_flags_t fla
if (!can_do_mlock())
return -EPERM;
- lru_add_drain_all(); /* flush pagevec */
-
len = PAGE_ALIGN(len + (offset_in_page(start)));
start &= PAGE_MASK;
@@ -798,9 +796,6 @@ SYSCALL_DEFINE1(mlockall, int, flags)
if (!can_do_mlock())
return -EPERM;
- if (flags & MCL_CURRENT)
- lru_add_drain_all(); /* flush pagevec */
-
lock_limit = rlimit(RLIMIT_MEMLOCK);
lock_limit >>= PAGE_SHIFT;
diff --git a/mm/mmap.c b/mm/mmap.c
index 680506faceae..9efdc021ad22 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1387,9 +1387,24 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
if (file) {
struct inode *inode = file_inode(file);
+ unsigned long flags_mask;
+
+ flags_mask = LEGACY_MAP_MASK | file->f_op->mmap_supported_flags;
switch (flags & MAP_TYPE) {
case MAP_SHARED:
+ /*
+ * Force use of MAP_SHARED_VALIDATE with non-legacy
+ * flags. E.g. MAP_SYNC is dangerous to use with
+ * MAP_SHARED as you don't know which consistency model
+ * you will get. We silently ignore unsupported flags
+ * with MAP_SHARED to preserve backward compatibility.
+ */
+ flags &= LEGACY_MAP_MASK;
+ /* fall through */
+ case MAP_SHARED_VALIDATE:
+ if (flags & ~flags_mask)
+ return -EOPNOTSUPP;
if ((prot&PROT_WRITE) && !(file->f_mode&FMODE_WRITE))
return -EACCES;
@@ -2540,9 +2555,11 @@ int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
struct vm_area_struct *new;
int err;
- if (is_vm_hugetlb_page(vma) && (addr &
- ~(huge_page_mask(hstate_vma(vma)))))
- return -EINVAL;
+ if (vma->vm_ops && vma->vm_ops->split) {
+ err = vma->vm_ops->split(vma, addr);
+ if (err)
+ return err;
+ }
new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
if (!new)
@@ -3002,20 +3019,20 @@ void exit_mmap(struct mm_struct *mm)
/* Use -1 here to ensure all VMAs in the mm are unmapped */
unmap_vmas(&tlb, vma, 0, -1);
- set_bit(MMF_OOM_SKIP, &mm->flags);
- if (unlikely(tsk_is_oom_victim(current))) {
+ if (unlikely(mm_is_oom_victim(mm))) {
/*
* Wait for oom_reap_task() to stop working on this
* mm. Because MMF_OOM_SKIP is already set before
* calling down_read(), oom_reap_task() will not run
* on this "mm" post up_write().
*
- * tsk_is_oom_victim() cannot be set from under us
- * either because current->mm is already set to NULL
+ * mm_is_oom_victim() cannot be set from under us
+ * either because victim->mm is already set to NULL
* under task_lock before calling mmput and oom_mm is
- * set not NULL by the OOM killer only if current->mm
+ * set not NULL by the OOM killer only if victim->mm
* is found not NULL while holding the task_lock.
*/
+ set_bit(MMF_OOM_SKIP, &mm->flags);
down_write(&mm->mmap_sem);
up_write(&mm->mmap_sem);
}
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 314285284e6e..96edb33fd09a 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -190,7 +190,9 @@ void __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range_start);
void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
- unsigned long start, unsigned long end)
+ unsigned long start,
+ unsigned long end,
+ bool only_end)
{
struct mmu_notifier *mn;
int id;
@@ -204,8 +206,13 @@ void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
* subsystem registers either invalidate_range_start()/end() or
* invalidate_range(), so this will be no additional overhead
* (besides the pointer check).
+ *
+ * We skip call to invalidate_range() if we know it is safe ie
+ * call site use mmu_notifier_invalidate_range_only_end() which
+ * is safe to do when we know that a call to invalidate_range()
+ * already happen under page table lock.
*/
- if (mn->ops->invalidate_range)
+ if (!only_end && mn->ops->invalidate_range)
mn->ops->invalidate_range(mn, mm, start, end);
if (mn->ops->invalidate_range_end)
mn->ops->invalidate_range_end(mn, mm, start, end);
diff --git a/mm/mprotect.c b/mm/mprotect.c
index ec39f730a0bf..58b629bb70de 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -166,7 +166,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
next = pmd_addr_end(addr, end);
if (!is_swap_pmd(*pmd) && !pmd_trans_huge(*pmd) && !pmd_devmap(*pmd)
&& pmd_none_or_clear_bad(pmd))
- continue;
+ goto next;
/* invoke the mmu notifier if the pmd is populated */
if (!mni_start) {
@@ -188,7 +188,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
}
/* huge pmd was handled */
- continue;
+ goto next;
}
}
/* fall through, the trans huge pmd just split */
@@ -196,6 +196,8 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
this_pages = change_pte_range(vma, pmd, addr, next, newprot,
dirty_accountable, prot_numa);
pages += this_pages;
+next:
+ cond_resched();
} while (pmd++, addr = next, addr != end);
if (mni_start)
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index dee0f75c3013..29f855551efe 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -44,6 +44,7 @@
#include <asm/tlb.h>
#include "internal.h"
+#include "slab.h"
#define CREATE_TRACE_POINTS
#include <trace/events/oom.h>
@@ -161,6 +162,25 @@ static bool oom_unkillable_task(struct task_struct *p,
return false;
}
+/*
+ * Print out unreclaimble slabs info when unreclaimable slabs amount is greater
+ * than all user memory (LRU pages)
+ */
+static bool is_dump_unreclaim_slabs(void)
+{
+ unsigned long nr_lru;
+
+ nr_lru = global_node_page_state(NR_ACTIVE_ANON) +
+ global_node_page_state(NR_INACTIVE_ANON) +
+ global_node_page_state(NR_ACTIVE_FILE) +
+ global_node_page_state(NR_INACTIVE_FILE) +
+ global_node_page_state(NR_ISOLATED_ANON) +
+ global_node_page_state(NR_ISOLATED_FILE) +
+ global_node_page_state(NR_UNEVICTABLE);
+
+ return (global_node_page_state(NR_SLAB_UNRECLAIMABLE) > nr_lru);
+}
+
/**
* oom_badness - heuristic function to determine which candidate task to kill
* @p: task struct of which task we should calculate
@@ -201,7 +221,7 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
* task's rss, pagetable and swap space use.
*/
points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS) +
- atomic_long_read(&p->mm->nr_ptes) + mm_nr_pmds(p->mm);
+ mm_pgtables_bytes(p->mm) / PAGE_SIZE;
task_unlock(p);
/*
@@ -369,15 +389,15 @@ static void select_bad_process(struct oom_control *oc)
* Dumps the current memory state of all eligible tasks. Tasks not in the same
* memcg, not in the same cpuset, or bound to a disjoint set of mempolicy nodes
* are not shown.
- * State information includes task's pid, uid, tgid, vm size, rss, nr_ptes,
- * swapents, oom_score_adj value, and name.
+ * State information includes task's pid, uid, tgid, vm size, rss,
+ * pgtables_bytes, swapents, oom_score_adj value, and name.
*/
static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
{
struct task_struct *p;
struct task_struct *task;
- pr_info("[ pid ] uid tgid total_vm rss nr_ptes nr_pmds swapents oom_score_adj name\n");
+ pr_info("[ pid ] uid tgid total_vm rss pgtables_bytes swapents oom_score_adj name\n");
rcu_read_lock();
for_each_process(p) {
if (oom_unkillable_task(p, memcg, nodemask))
@@ -393,11 +413,10 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
continue;
}
- pr_info("[%5d] %5d %5d %8lu %8lu %7ld %7ld %8lu %5hd %s\n",
+ pr_info("[%5d] %5d %5d %8lu %8lu %8ld %8lu %5hd %s\n",
task->pid, from_kuid(&init_user_ns, task_uid(task)),
task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
- atomic_long_read(&task->mm->nr_ptes),
- mm_nr_pmds(task->mm),
+ mm_pgtables_bytes(task->mm),
get_mm_counter(task->mm, MM_SWAPENTS),
task->signal->oom_score_adj, task->comm);
task_unlock(task);
@@ -407,23 +426,22 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
static void dump_header(struct oom_control *oc, struct task_struct *p)
{
- pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), nodemask=",
- current->comm, oc->gfp_mask, &oc->gfp_mask);
- if (oc->nodemask)
- pr_cont("%*pbl", nodemask_pr_args(oc->nodemask));
- else
- pr_cont("(null)");
- pr_cont(", order=%d, oom_score_adj=%hd\n",
- oc->order, current->signal->oom_score_adj);
+ pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), nodemask=%*pbl, order=%d, oom_score_adj=%hd\n",
+ current->comm, oc->gfp_mask, &oc->gfp_mask,
+ nodemask_pr_args(oc->nodemask), oc->order,
+ current->signal->oom_score_adj);
if (!IS_ENABLED(CONFIG_COMPACTION) && oc->order)
pr_warn("COMPACTION is disabled!!!\n");
cpuset_print_current_mems_allowed();
dump_stack();
- if (oc->memcg)
+ if (is_memcg_oom(oc))
mem_cgroup_print_oom_info(oc->memcg, p);
- else
+ else {
show_mem(SHOW_MEM_FILTER_NODES, oc->nodemask);
+ if (is_dump_unreclaim_slabs())
+ dump_unreclaimable_slab();
+ }
if (sysctl_oom_dump_tasks)
dump_tasks(oc->memcg, oc->nodemask);
}
@@ -532,7 +550,6 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
*/
set_bit(MMF_UNSTABLE, &mm->flags);
- tlb_gather_mmu(&tlb, mm, 0, -1);
for (vma = mm->mmap ; vma; vma = vma->vm_next) {
if (!can_madv_dontneed_vma(vma))
continue;
@@ -547,11 +564,13 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
* we do not want to block exit_mmap by keeping mm ref
* count elevated without a good reason.
*/
- if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED))
+ if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED)) {
+ tlb_gather_mmu(&tlb, mm, vma->vm_start, vma->vm_end);
unmap_page_range(&tlb, vma, vma->vm_start, vma->vm_end,
NULL);
+ tlb_finish_mmu(&tlb, vma->vm_start, vma->vm_end);
+ }
}
- tlb_finish_mmu(&tlb, 0, -1);
pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
task_pid_nr(tsk), tsk->comm,
K(get_mm_counter(mm, MM_ANONPAGES)),
@@ -618,9 +637,6 @@ static int oom_reaper(void *unused)
static void wake_oom_reaper(struct task_struct *tsk)
{
- if (!oom_reaper_th)
- return;
-
/* tsk is already queued? */
if (tsk == oom_reaper_list || tsk->oom_reaper_list)
return;
@@ -638,11 +654,6 @@ static void wake_oom_reaper(struct task_struct *tsk)
static int __init oom_init(void)
{
oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper");
- if (IS_ERR(oom_reaper_th)) {
- pr_err("Unable to start OOM reaper %ld. Continuing regardless\n",
- PTR_ERR(oom_reaper_th));
- oom_reaper_th = NULL;
- }
return 0;
}
subsys_initcall(oom_init)
@@ -672,8 +683,10 @@ static void mark_oom_victim(struct task_struct *tsk)
return;
/* oom_mm is bound to the signal struct life time. */
- if (!cmpxchg(&tsk->signal->oom_mm, NULL, mm))
+ if (!cmpxchg(&tsk->signal->oom_mm, NULL, mm)) {
mmgrab(tsk->signal->oom_mm);
+ set_bit(MMF_OOM_VICTIM, &mm->flags);
+ }
/*
* Make sure that the task is woken up from uninterruptible sleep
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 0b9c5cbe8eba..586f31261c83 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -625,9 +625,9 @@ EXPORT_SYMBOL_GPL(wb_writeout_inc);
* On idle system, we can be called long after we scheduled because we use
* deferred timers so count with missed periods.
*/
-static void writeout_period(unsigned long t)
+static void writeout_period(struct timer_list *t)
{
- struct wb_domain *dom = (void *)t;
+ struct wb_domain *dom = from_timer(dom, t, period_timer);
int miss_periods = (jiffies - dom->period_time) /
VM_COMPLETIONS_PERIOD_LEN;
@@ -650,8 +650,7 @@ int wb_domain_init(struct wb_domain *dom, gfp_t gfp)
spin_lock_init(&dom->lock);
- setup_deferrable_timer(&dom->period_timer, writeout_period,
- (unsigned long)dom);
+ timer_setup(&dom->period_timer, writeout_period, TIMER_DEFERRABLE);
dom->dirty_limit_tstamp = jiffies;
@@ -1543,7 +1542,7 @@ static inline void wb_dirty_limits(struct dirty_throttle_control *dtc)
* actually dirty; with m+n sitting in the percpu
* deltas.
*/
- if (dtc->wb_thresh < 2 * wb_stat_error(wb)) {
+ if (dtc->wb_thresh < 2 * wb_stat_error()) {
wb_reclaimable = wb_stat_sum(wb, WB_RECLAIMABLE);
dtc->wb_dirty = wb_reclaimable + wb_stat_sum(wb, WB_WRITEBACK);
} else {
@@ -1559,8 +1558,7 @@ static inline void wb_dirty_limits(struct dirty_throttle_control *dtc)
* If we're over `background_thresh' then the writeback threads are woken to
* perform some writeout.
*/
-static void balance_dirty_pages(struct address_space *mapping,
- struct bdi_writeback *wb,
+static void balance_dirty_pages(struct bdi_writeback *wb,
unsigned long pages_dirtied)
{
struct dirty_throttle_control gdtc_stor = { GDTC_INIT(wb) };
@@ -1802,7 +1800,7 @@ pause:
* more page. However wb_dirty has accounting errors. So use
* the larger and more IO friendly wb_stat_error.
*/
- if (sdtc->wb_dirty <= wb_stat_error(wb))
+ if (sdtc->wb_dirty <= wb_stat_error())
break;
if (fatal_signal_pending(current))
@@ -1910,7 +1908,7 @@ void balance_dirty_pages_ratelimited(struct address_space *mapping)
preempt_enable();
if (unlikely(current->nr_dirtied >= ratelimit))
- balance_dirty_pages(mapping, wb, current->nr_dirtied);
+ balance_dirty_pages(wb, current->nr_dirtied);
wb_put(wb);
}
@@ -1972,31 +1970,32 @@ bool wb_over_bg_thresh(struct bdi_writeback *wb)
int dirty_writeback_centisecs_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *length, loff_t *ppos)
{
- proc_dointvec(table, write, buffer, length, ppos);
- return 0;
-}
+ unsigned int old_interval = dirty_writeback_interval;
+ int ret;
-#ifdef CONFIG_BLOCK
-void laptop_mode_timer_fn(unsigned long data)
-{
- struct request_queue *q = (struct request_queue *)data;
- int nr_pages = global_node_page_state(NR_FILE_DIRTY) +
- global_node_page_state(NR_UNSTABLE_NFS);
- struct bdi_writeback *wb;
+ ret = proc_dointvec(table, write, buffer, length, ppos);
/*
- * We want to write everything out, not just down to the dirty
- * threshold
+ * Writing 0 to dirty_writeback_interval will disable periodic writeback
+ * and a different non-zero value will wakeup the writeback threads.
+ * wb_wakeup_delayed() would be more appropriate, but it's a pain to
+ * iterate over all bdis and wbs.
+ * The reason we do this is to make the change take effect immediately.
*/
- if (!bdi_has_dirty_io(q->backing_dev_info))
- return;
+ if (!ret && write && dirty_writeback_interval &&
+ dirty_writeback_interval != old_interval)
+ wakeup_flusher_threads(WB_REASON_PERIODIC);
- rcu_read_lock();
- list_for_each_entry_rcu(wb, &q->backing_dev_info->wb_list, bdi_node)
- if (wb_has_dirty_io(wb))
- wb_start_writeback(wb, nr_pages, true,
- WB_REASON_LAPTOP_TIMER);
- rcu_read_unlock();
+ return ret;
+}
+
+#ifdef CONFIG_BLOCK
+void laptop_mode_timer_fn(struct timer_list *t)
+{
+ struct backing_dev_info *backing_dev_info =
+ from_timer(backing_dev_info, t, laptop_mode_wb_timer);
+
+ wakeup_flusher_threads_bdi(backing_dev_info, WB_REASON_LAPTOP_TIMER);
}
/*
@@ -2167,7 +2166,7 @@ int write_cache_pages(struct address_space *mapping,
int range_whole = 0;
int tag;
- pagevec_init(&pvec, 0);
+ pagevec_init(&pvec);
if (wbc->range_cyclic) {
writeback_index = mapping->writeback_index; /* prev offset */
index = writeback_index;
@@ -2194,30 +2193,14 @@ retry:
while (!done && (index <= end)) {
int i;
- nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
- min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
+ nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end,
+ tag);
if (nr_pages == 0)
break;
for (i = 0; i < nr_pages; i++) {
struct page *page = pvec.pages[i];
- /*
- * At this point, the page may be truncated or
- * invalidated (changing page->mapping to NULL), or
- * even swizzled back from swapper_space to tmpfs file
- * mapping. However, page->index will not change
- * because we have a reference on the page.
- */
- if (page->index > end) {
- /*
- * can't be range_cyclic (1st pass) because
- * end == -1 in that case.
- */
- done = 1;
- break;
- }
-
done_index = page->index;
lock_page(page);
@@ -2623,7 +2606,7 @@ EXPORT_SYMBOL(set_page_dirty_lock);
* page without actually doing it through the VM. Can you say "ext3 is
* horribly ugly"? Thought you could.
*/
-void cancel_dirty_page(struct page *page)
+void __cancel_dirty_page(struct page *page)
{
struct address_space *mapping = page_mapping(page);
@@ -2644,7 +2627,7 @@ void cancel_dirty_page(struct page *page)
ClearPageDirty(page);
}
}
-EXPORT_SYMBOL(cancel_dirty_page);
+EXPORT_SYMBOL(__cancel_dirty_page);
/*
* Clear a page's dirty flag, while caring for dirty memory accounting.
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 8dfd13f724d9..76c9688b6a0a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -24,7 +24,6 @@
#include <linux/memblock.h>
#include <linux/compiler.h>
#include <linux/kernel.h>
-#include <linux/kmemcheck.h>
#include <linux/kasan.h>
#include <linux/module.h>
#include <linux/suspend.h>
@@ -83,6 +82,8 @@ DEFINE_PER_CPU(int, numa_node);
EXPORT_PER_CPU_SYMBOL(numa_node);
#endif
+DEFINE_STATIC_KEY_TRUE(vm_numa_stat_key);
+
#ifdef CONFIG_HAVE_MEMORYLESS_NODES
/*
* N.B., Do NOT reference the '_numa_mem_' per cpu variable directly.
@@ -290,28 +291,37 @@ EXPORT_SYMBOL(nr_online_nodes);
int page_group_by_mobility_disabled __read_mostly;
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
+
+/*
+ * Determine how many pages need to be initialized durig early boot
+ * (non-deferred initialization).
+ * The value of first_deferred_pfn will be set later, once non-deferred pages
+ * are initialized, but for now set it ULONG_MAX.
+ */
static inline void reset_deferred_meminit(pg_data_t *pgdat)
{
- unsigned long max_initialise;
- unsigned long reserved_lowmem;
+ phys_addr_t start_addr, end_addr;
+ unsigned long max_pgcnt;
+ unsigned long reserved;
/*
* Initialise at least 2G of a node but also take into account that
* two large system hashes that can take up 1GB for 0.25TB/node.
*/
- max_initialise = max(2UL << (30 - PAGE_SHIFT),
- (pgdat->node_spanned_pages >> 8));
+ max_pgcnt = max(2UL << (30 - PAGE_SHIFT),
+ (pgdat->node_spanned_pages >> 8));
/*
* Compensate the all the memblock reservations (e.g. crash kernel)
* from the initial estimation to make sure we will initialize enough
* memory to boot.
*/
- reserved_lowmem = memblock_reserved_memory_within(pgdat->node_start_pfn,
- pgdat->node_start_pfn + max_initialise);
- max_initialise += reserved_lowmem;
+ start_addr = PFN_PHYS(pgdat->node_start_pfn);
+ end_addr = PFN_PHYS(pgdat->node_start_pfn + max_pgcnt);
+ reserved = memblock_reserved_memory_within(start_addr, end_addr);
+ max_pgcnt += PHYS_PFN(reserved);
- pgdat->static_init_size = min(max_initialise, pgdat->node_spanned_pages);
+ pgdat->static_init_pgcnt = min(max_pgcnt, pgdat->node_spanned_pages);
pgdat->first_deferred_pfn = ULONG_MAX;
}
@@ -338,7 +348,7 @@ static inline bool update_defer_init(pg_data_t *pgdat,
if (zone_end < pgdat_end_pfn(pgdat))
return true;
(*nr_initialised)++;
- if ((*nr_initialised > pgdat->static_init_size) &&
+ if ((*nr_initialised > pgdat->static_init_pgcnt) &&
(pfn & (PAGES_PER_SECTION - 1)) == 0) {
pgdat->first_deferred_pfn = pfn;
return false;
@@ -1013,7 +1023,6 @@ static __always_inline bool free_pages_prepare(struct page *page,
VM_BUG_ON_PAGE(PageTail(page), page);
trace_mm_page_free(page, order);
- kmemcheck_free_shadow(page, order);
/*
* Check tail pages before head page information is cleared to
@@ -1170,6 +1179,7 @@ static void free_one_page(struct zone *zone,
static void __meminit __init_single_page(struct page *page, unsigned long pfn,
unsigned long zone, int nid)
{
+ mm_zero_struct_page(page);
set_page_links(page, zone, nid, pfn);
init_page_count(page);
page_mapcount_reset(page);
@@ -1410,14 +1420,17 @@ void clear_zone_contiguous(struct zone *zone)
}
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
-static void __init deferred_free_range(struct page *page,
- unsigned long pfn, int nr_pages)
+static void __init deferred_free_range(unsigned long pfn,
+ unsigned long nr_pages)
{
- int i;
+ struct page *page;
+ unsigned long i;
- if (!page)
+ if (!nr_pages)
return;
+ page = pfn_to_page(pfn);
+
/* Free a large naturally-aligned chunk if possible */
if (nr_pages == pageblock_nr_pages &&
(pfn & (pageblock_nr_pages - 1)) == 0) {
@@ -1443,19 +1456,109 @@ static inline void __init pgdat_init_report_one_done(void)
complete(&pgdat_init_all_done_comp);
}
+/*
+ * Helper for deferred_init_range, free the given range, reset the counters, and
+ * return number of pages freed.
+ */
+static inline unsigned long __init __def_free(unsigned long *nr_free,
+ unsigned long *free_base_pfn,
+ struct page **page)
+{
+ unsigned long nr = *nr_free;
+
+ deferred_free_range(*free_base_pfn, nr);
+ *free_base_pfn = 0;
+ *nr_free = 0;
+ *page = NULL;
+
+ return nr;
+}
+
+static unsigned long __init deferred_init_range(int nid, int zid,
+ unsigned long start_pfn,
+ unsigned long end_pfn)
+{
+ struct mminit_pfnnid_cache nid_init_state = { };
+ unsigned long nr_pgmask = pageblock_nr_pages - 1;
+ unsigned long free_base_pfn = 0;
+ unsigned long nr_pages = 0;
+ unsigned long nr_free = 0;
+ struct page *page = NULL;
+ unsigned long pfn;
+
+ /*
+ * First we check if pfn is valid on architectures where it is possible
+ * to have holes within pageblock_nr_pages. On systems where it is not
+ * possible, this function is optimized out.
+ *
+ * Then, we check if a current large page is valid by only checking the
+ * validity of the head pfn.
+ *
+ * meminit_pfn_in_nid is checked on systems where pfns can interleave
+ * within a node: a pfn is between start and end of a node, but does not
+ * belong to this memory node.
+ *
+ * Finally, we minimize pfn page lookups and scheduler checks by
+ * performing it only once every pageblock_nr_pages.
+ *
+ * We do it in two loops: first we initialize struct page, than free to
+ * buddy allocator, becuse while we are freeing pages we can access
+ * pages that are ahead (computing buddy page in __free_one_page()).
+ */
+ for (pfn = start_pfn; pfn < end_pfn; pfn++) {
+ if (!pfn_valid_within(pfn))
+ continue;
+ if ((pfn & nr_pgmask) || pfn_valid(pfn)) {
+ if (meminit_pfn_in_nid(pfn, nid, &nid_init_state)) {
+ if (page && (pfn & nr_pgmask))
+ page++;
+ else
+ page = pfn_to_page(pfn);
+ __init_single_page(page, pfn, zid, nid);
+ cond_resched();
+ }
+ }
+ }
+
+ page = NULL;
+ for (pfn = start_pfn; pfn < end_pfn; pfn++) {
+ if (!pfn_valid_within(pfn)) {
+ nr_pages += __def_free(&nr_free, &free_base_pfn, &page);
+ } else if (!(pfn & nr_pgmask) && !pfn_valid(pfn)) {
+ nr_pages += __def_free(&nr_free, &free_base_pfn, &page);
+ } else if (!meminit_pfn_in_nid(pfn, nid, &nid_init_state)) {
+ nr_pages += __def_free(&nr_free, &free_base_pfn, &page);
+ } else if (page && (pfn & nr_pgmask)) {
+ page++;
+ nr_free++;
+ } else {
+ nr_pages += __def_free(&nr_free, &free_base_pfn, &page);
+ page = pfn_to_page(pfn);
+ free_base_pfn = pfn;
+ nr_free = 1;
+ cond_resched();
+ }
+ }
+ /* Free the last block of pages to allocator */
+ nr_pages += __def_free(&nr_free, &free_base_pfn, &page);
+
+ return nr_pages;
+}
+
/* Initialise remaining memory on a node */
static int __init deferred_init_memmap(void *data)
{
pg_data_t *pgdat = data;
int nid = pgdat->node_id;
- struct mminit_pfnnid_cache nid_init_state = { };
unsigned long start = jiffies;
unsigned long nr_pages = 0;
- unsigned long walk_start, walk_end;
- int i, zid;
+ unsigned long spfn, epfn;
+ phys_addr_t spa, epa;
+ int zid;
struct zone *zone;
unsigned long first_init_pfn = pgdat->first_deferred_pfn;
const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
+ u64 i;
if (first_init_pfn == ULONG_MAX) {
pgdat_init_report_one_done();
@@ -1477,83 +1580,12 @@ static int __init deferred_init_memmap(void *data)
if (first_init_pfn < zone_end_pfn(zone))
break;
}
+ first_init_pfn = max(zone->zone_start_pfn, first_init_pfn);
- for_each_mem_pfn_range(i, nid, &walk_start, &walk_end, NULL) {
- unsigned long pfn, end_pfn;
- struct page *page = NULL;
- struct page *free_base_page = NULL;
- unsigned long free_base_pfn = 0;
- int nr_to_free = 0;
-
- end_pfn = min(walk_end, zone_end_pfn(zone));
- pfn = first_init_pfn;
- if (pfn < walk_start)
- pfn = walk_start;
- if (pfn < zone->zone_start_pfn)
- pfn = zone->zone_start_pfn;
-
- for (; pfn < end_pfn; pfn++) {
- if (!pfn_valid_within(pfn))
- goto free_range;
-
- /*
- * Ensure pfn_valid is checked every
- * pageblock_nr_pages for memory holes
- */
- if ((pfn & (pageblock_nr_pages - 1)) == 0) {
- if (!pfn_valid(pfn)) {
- page = NULL;
- goto free_range;
- }
- }
-
- if (!meminit_pfn_in_nid(pfn, nid, &nid_init_state)) {
- page = NULL;
- goto free_range;
- }
-
- /* Minimise pfn page lookups and scheduler checks */
- if (page && (pfn & (pageblock_nr_pages - 1)) != 0) {
- page++;
- } else {
- nr_pages += nr_to_free;
- deferred_free_range(free_base_page,
- free_base_pfn, nr_to_free);
- free_base_page = NULL;
- free_base_pfn = nr_to_free = 0;
-
- page = pfn_to_page(pfn);
- cond_resched();
- }
-
- if (page->flags) {
- VM_BUG_ON(page_zone(page) != zone);
- goto free_range;
- }
-
- __init_single_page(page, pfn, zid, nid);
- if (!free_base_page) {
- free_base_page = page;
- free_base_pfn = pfn;
- nr_to_free = 0;
- }
- nr_to_free++;
-
- /* Where possible, batch up pages for a single free */
- continue;
-free_range:
- /* Free the current block of pages to allocator */
- nr_pages += nr_to_free;
- deferred_free_range(free_base_page, free_base_pfn,
- nr_to_free);
- free_base_page = NULL;
- free_base_pfn = nr_to_free = 0;
- }
- /* Free the last block of pages to allocator */
- nr_pages += nr_to_free;
- deferred_free_range(free_base_page, free_base_pfn, nr_to_free);
-
- first_init_pfn = max(end_pfn, first_init_pfn);
+ for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) {
+ spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa));
+ epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa));
+ nr_pages += deferred_init_range(nid, zid, spfn, epfn);
}
/* Sanity check that the next zone really is unpopulated */
@@ -1792,7 +1824,7 @@ static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags
* Go through the free lists for the given migratetype and remove
* the smallest available page from the freelists
*/
-static inline
+static __always_inline
struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
int migratetype)
{
@@ -1836,7 +1868,7 @@ static int fallbacks[MIGRATE_TYPES][4] = {
};
#ifdef CONFIG_CMA
-static struct page *__rmqueue_cma_fallback(struct zone *zone,
+static __always_inline struct page *__rmqueue_cma_fallback(struct zone *zone,
unsigned int order)
{
return __rmqueue_smallest(zone, order, MIGRATE_CMA);
@@ -2217,7 +2249,7 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
* deviation from the rest of this file, to make the for loop
* condition simpler.
*/
-static inline bool
+static __always_inline bool
__rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
{
struct free_area *area;
@@ -2289,8 +2321,8 @@ do_steal:
* Do the hard work of removing an element from the buddy allocator.
* Call me with the zone->lock already held.
*/
-static struct page *__rmqueue(struct zone *zone, unsigned int order,
- int migratetype)
+static __always_inline struct page *
+__rmqueue(struct zone *zone, unsigned int order, int migratetype)
{
struct page *page;
@@ -2315,7 +2347,7 @@ retry:
*/
static int rmqueue_bulk(struct zone *zone, unsigned int order,
unsigned long count, struct list_head *list,
- int migratetype, bool cold)
+ int migratetype)
{
int i, alloced = 0;
@@ -2329,19 +2361,16 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
continue;
/*
- * Split buddy pages returned by expand() are received here
- * in physical page order. The page is added to the callers and
- * list and the list head then moves forward. From the callers
- * perspective, the linked list is ordered by page number in
- * some conditions. This is useful for IO devices that can
- * merge IO requests if the physical pages are ordered
- * properly.
+ * Split buddy pages returned by expand() are received here in
+ * physical page order. The page is added to the tail of
+ * caller's list. From the callers perspective, the linked list
+ * is ordered by page number under some conditions. This is
+ * useful for IO devices that can forward direction from the
+ * head, thus also in the physical page order. This is useful
+ * for IO devices that can merge IO requests if the physical
+ * pages are ordered properly.
*/
- if (likely(!cold))
- list_add(&page->lru, list);
- else
- list_add_tail(&page->lru, list);
- list = &page->lru;
+ list_add_tail(&page->lru, list);
alloced++;
if (is_migrate_cma(get_pcppage_migratetype(page)))
__mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
@@ -2478,10 +2507,6 @@ void drain_all_pages(struct zone *zone)
if (WARN_ON_ONCE(!mm_percpu_wq))
return;
- /* Workqueues cannot recurse */
- if (current->flags & PF_WQ_WORKER)
- return;
-
/*
* Do not drain if one is already in progress unless it's specific to
* a zone. Such callers are primarily CMA and memory hotplug and need
@@ -2590,24 +2615,25 @@ void mark_free_pages(struct zone *zone)
}
#endif /* CONFIG_PM */
-/*
- * Free a 0-order page
- * cold == true ? free a cold page : free a hot page
- */
-void free_hot_cold_page(struct page *page, bool cold)
+static bool free_unref_page_prepare(struct page *page, unsigned long pfn)
{
- struct zone *zone = page_zone(page);
- struct per_cpu_pages *pcp;
- unsigned long flags;
- unsigned long pfn = page_to_pfn(page);
int migratetype;
if (!free_pcp_prepare(page))
- return;
+ return false;
migratetype = get_pfnblock_migratetype(page, pfn);
set_pcppage_migratetype(page, migratetype);
- local_irq_save(flags);
+ return true;
+}
+
+static void free_unref_page_commit(struct page *page, unsigned long pfn)
+{
+ struct zone *zone = page_zone(page);
+ struct per_cpu_pages *pcp;
+ int migratetype;
+
+ migratetype = get_pcppage_migratetype(page);
__count_vm_event(PGFREE);
/*
@@ -2620,38 +2646,73 @@ void free_hot_cold_page(struct page *page, bool cold)
if (migratetype >= MIGRATE_PCPTYPES) {
if (unlikely(is_migrate_isolate(migratetype))) {
free_one_page(zone, page, pfn, 0, migratetype);
- goto out;
+ return;
}
migratetype = MIGRATE_MOVABLE;
}
pcp = &this_cpu_ptr(zone->pageset)->pcp;
- if (!cold)
- list_add(&page->lru, &pcp->lists[migratetype]);
- else
- list_add_tail(&page->lru, &pcp->lists[migratetype]);
+ list_add(&page->lru, &pcp->lists[migratetype]);
pcp->count++;
if (pcp->count >= pcp->high) {
unsigned long batch = READ_ONCE(pcp->batch);
free_pcppages_bulk(zone, batch, pcp);
pcp->count -= batch;
}
+}
-out:
+/*
+ * Free a 0-order page
+ */
+void free_unref_page(struct page *page)
+{
+ unsigned long flags;
+ unsigned long pfn = page_to_pfn(page);
+
+ if (!free_unref_page_prepare(page, pfn))
+ return;
+
+ local_irq_save(flags);
+ free_unref_page_commit(page, pfn);
local_irq_restore(flags);
}
/*
* Free a list of 0-order pages
*/
-void free_hot_cold_page_list(struct list_head *list, bool cold)
+void free_unref_page_list(struct list_head *list)
{
struct page *page, *next;
+ unsigned long flags, pfn;
+ int batch_count = 0;
+ /* Prepare pages for freeing */
list_for_each_entry_safe(page, next, list, lru) {
- trace_mm_page_free_batched(page, cold);
- free_hot_cold_page(page, cold);
+ pfn = page_to_pfn(page);
+ if (!free_unref_page_prepare(page, pfn))
+ list_del(&page->lru);
+ set_page_private(page, pfn);
}
+
+ local_irq_save(flags);
+ list_for_each_entry_safe(page, next, list, lru) {
+ unsigned long pfn = page_private(page);
+
+ set_page_private(page, 0);
+ trace_mm_page_free_batched(page);
+ free_unref_page_commit(page, pfn);
+
+ /*
+ * Guard against excessive IRQ disabled times when we get
+ * a large list of pages to free.
+ */
+ if (++batch_count == SWAP_CLUSTER_MAX) {
+ local_irq_restore(flags);
+ batch_count = 0;
+ local_irq_save(flags);
+ }
+ }
+ local_irq_restore(flags);
}
/*
@@ -2669,15 +2730,6 @@ void split_page(struct page *page, unsigned int order)
VM_BUG_ON_PAGE(PageCompound(page), page);
VM_BUG_ON_PAGE(!page_count(page), page);
-#ifdef CONFIG_KMEMCHECK
- /*
- * Split shadow pages too, because free(page[0]) would
- * otherwise free the whole shadow.
- */
- if (kmemcheck_page_is_tracked(page))
- split_page(virt_to_page(page[0].shadow), order);
-#endif
-
for (i = 1; i < (1 << order); i++)
set_page_refcounted(page + i);
split_page_owner(page, order);
@@ -2743,6 +2795,10 @@ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z)
#ifdef CONFIG_NUMA
enum numa_stat_item local_stat = NUMA_LOCAL;
+ /* skip numa counters update if numa stats is disabled */
+ if (!static_branch_likely(&vm_numa_stat_key))
+ return;
+
if (z->node != numa_node_id())
local_stat = NUMA_OTHER;
@@ -2758,7 +2814,7 @@ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z)
/* Remove page from the per-cpu list, caller must protect the list */
static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,
- bool cold, struct per_cpu_pages *pcp,
+ struct per_cpu_pages *pcp,
struct list_head *list)
{
struct page *page;
@@ -2767,16 +2823,12 @@ static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,
if (list_empty(list)) {
pcp->count += rmqueue_bulk(zone, 0,
pcp->batch, list,
- migratetype, cold);
+ migratetype);
if (unlikely(list_empty(list)))
return NULL;
}
- if (cold)
- page = list_last_entry(list, struct page, lru);
- else
- page = list_first_entry(list, struct page, lru);
-
+ page = list_first_entry(list, struct page, lru);
list_del(&page->lru);
pcp->count--;
} while (check_new_pcp(page));
@@ -2791,14 +2843,13 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone,
{
struct per_cpu_pages *pcp;
struct list_head *list;
- bool cold = ((gfp_flags & __GFP_COLD) != 0);
struct page *page;
unsigned long flags;
local_irq_save(flags);
pcp = &this_cpu_ptr(zone->pageset)->pcp;
list = &pcp->lists[migratetype];
- page = __rmqueue_pcplist(zone, migratetype, cold, pcp, list);
+ page = __rmqueue_pcplist(zone, migratetype, pcp, list);
if (page) {
__count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
zone_statistics(preferred_zone, zone);
@@ -3006,9 +3057,6 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
if (!area->nr_free)
continue;
- if (alloc_harder)
- return true;
-
for (mt = 0; mt < MIGRATE_PCPTYPES; mt++) {
if (!list_empty(&area->free_list[mt]))
return true;
@@ -3020,6 +3068,9 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
return true;
}
#endif
+ if (alloc_harder &&
+ !list_empty(&area->free_list[MIGRATE_HIGHATOMIC]))
+ return true;
}
return false;
}
@@ -3235,20 +3286,14 @@ void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...)
if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs))
return;
- pr_warn("%s: ", current->comm);
-
va_start(args, fmt);
vaf.fmt = fmt;
vaf.va = &args;
- pr_cont("%pV", &vaf);
+ pr_warn("%s: %pV, mode:%#x(%pGg), nodemask=%*pbl\n",
+ current->comm, &vaf, gfp_mask, &gfp_mask,
+ nodemask_pr_args(nodemask));
va_end(args);
- pr_cont(", mode:%#x(%pGg), nodemask=", gfp_mask, &gfp_mask);
- if (nodemask)
- pr_cont("%*pbl\n", nodemask_pr_args(nodemask));
- else
- pr_cont("(null)\n");
-
cpuset_print_current_mems_allowed();
dump_stack();
@@ -3868,8 +3913,6 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
enum compact_result compact_result;
int compaction_retries;
int no_progress_loops;
- unsigned long alloc_start = jiffies;
- unsigned int stall_timeout = 10 * HZ;
unsigned int cpuset_mems_cookie;
int reserve_flags;
@@ -4001,14 +4044,6 @@ retry:
if (!can_direct_reclaim)
goto nopage;
- /* Make sure we know about allocations which stall for too long */
- if (time_after(jiffies, alloc_start + stall_timeout)) {
- warn_alloc(gfp_mask & ~__GFP_NOWARN, ac->nodemask,
- "page allocation stalls for %ums, order:%u",
- jiffies_to_msecs(jiffies-alloc_start), order);
- stall_timeout += 10 * HZ;
- }
-
/* Avoid recursion of direct reclaim */
if (current->flags & PF_MEMALLOC)
goto nopage;
@@ -4223,9 +4258,6 @@ out:
page = NULL;
}
- if (kmemcheck_enabled && page)
- kmemcheck_pagealloc_alloc(page, order, gfp_mask);
-
trace_mm_page_alloc(page, order, alloc_mask, ac.migratetype);
return page;
@@ -4262,7 +4294,7 @@ void __free_pages(struct page *page, unsigned int order)
{
if (put_page_testzero(page)) {
if (order == 0)
- free_hot_cold_page(page, false);
+ free_unref_page(page);
else
__free_pages_ok(page, order);
}
@@ -4320,7 +4352,7 @@ void __page_frag_cache_drain(struct page *page, unsigned int count)
unsigned int order = compound_order(page);
if (order == 0)
- free_hot_cold_page(page, false);
+ free_unref_page(page);
else
__free_pages_ok(page, order);
}
@@ -5646,16 +5678,6 @@ void __init sparse_memory_present_with_active_regions(int nid)
unsigned long start_pfn, end_pfn;
int i, this_nid;
-#ifdef CONFIG_SPARSEMEM_EXTREME
- if (!mem_section) {
- unsigned long size, align;
-
- size = sizeof(struct mem_section) * NR_SECTION_ROOTS;
- align = 1 << (INTERNODE_CACHE_SHIFT);
- mem_section = memblock_virt_alloc(size, align);
- }
-#endif
-
for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid)
memory_present(this_nid, start_pfn, end_pfn);
}
@@ -6136,6 +6158,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
}
}
+#ifdef CONFIG_FLAT_NODE_MEM_MAP
static void __ref alloc_node_mem_map(struct pglist_data *pgdat)
{
unsigned long __maybe_unused start = 0;
@@ -6145,7 +6168,6 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat)
if (!pgdat->node_spanned_pages)
return;
-#ifdef CONFIG_FLAT_NODE_MEM_MAP
start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
offset = pgdat->node_start_pfn - start;
/* ia64 gets its own node_mem_map, before this, without bootmem */
@@ -6167,6 +6189,9 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat)
pgdat->node_id);
pgdat->node_mem_map = map + offset;
}
+ pr_debug("%s: node %d, pgdat %08lx, node_mem_map %08lx\n",
+ __func__, pgdat->node_id, (unsigned long)pgdat,
+ (unsigned long)pgdat->node_mem_map);
#ifndef CONFIG_NEED_MULTIPLE_NODES
/*
* With no DISCONTIG, the global mem_map is just set as node 0's
@@ -6179,8 +6204,10 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat)
#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
}
#endif
-#endif /* CONFIG_FLAT_NODE_MEM_MAP */
}
+#else
+static void __ref alloc_node_mem_map(struct pglist_data *pgdat) { }
+#endif /* CONFIG_FLAT_NODE_MEM_MAP */
void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
unsigned long node_start_pfn, unsigned long *zholes_size)
@@ -6207,16 +6234,51 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
zones_size, zholes_size);
alloc_node_mem_map(pgdat);
-#ifdef CONFIG_FLAT_NODE_MEM_MAP
- printk(KERN_DEBUG "free_area_init_node: node %d, pgdat %08lx, node_mem_map %08lx\n",
- nid, (unsigned long)pgdat,
- (unsigned long)pgdat->node_mem_map);
-#endif
reset_deferred_meminit(pgdat);
free_area_init_core(pgdat);
}
+#ifdef CONFIG_HAVE_MEMBLOCK
+/*
+ * Only struct pages that are backed by physical memory are zeroed and
+ * initialized by going through __init_single_page(). But, there are some
+ * struct pages which are reserved in memblock allocator and their fields
+ * may be accessed (for example page_to_pfn() on some configuration accesses
+ * flags). We must explicitly zero those struct pages.
+ */
+void __paginginit zero_resv_unavail(void)
+{
+ phys_addr_t start, end;
+ unsigned long pfn;
+ u64 i, pgcnt;
+
+ /*
+ * Loop through ranges that are reserved, but do not have reported
+ * physical memory backing.
+ */
+ pgcnt = 0;
+ for_each_resv_unavail_range(i, &start, &end) {
+ for (pfn = PFN_DOWN(start); pfn < PFN_UP(end); pfn++) {
+ if (!pfn_valid(ALIGN_DOWN(pfn, pageblock_nr_pages)))
+ continue;
+ mm_zero_struct_page(pfn_to_page(pfn));
+ pgcnt++;
+ }
+ }
+
+ /*
+ * Struct pages that do not have backing memory. This could be because
+ * firmware is using some of this memory, or for some other reasons.
+ * Once memblock is changed so such behaviour is not allowed: i.e.
+ * list of "reserved" memory must be a subset of list of "memory", then
+ * this code can be removed.
+ */
+ if (pgcnt)
+ pr_info("Reserved but unavailable: %lld pages", pgcnt);
+}
+#endif /* CONFIG_HAVE_MEMBLOCK */
+
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
#if MAX_NUMNODES > 1
@@ -6640,6 +6702,7 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
node_set_state(nid, N_MEMORY);
check_for_memory(pgdat, nid);
}
+ zero_resv_unavail();
}
static int __init cmdline_parse_core(char *p, unsigned long *core)
@@ -6803,6 +6866,7 @@ void __init free_area_init(unsigned long *zones_size)
{
free_area_init_node(0, zones_size,
__pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
+ zero_resv_unavail();
}
static int page_alloc_cpu_dead(unsigned int cpu)
@@ -7315,18 +7379,17 @@ void *__init alloc_large_system_hash(const char *tablename,
log2qty = ilog2(numentries);
- /*
- * memblock allocator returns zeroed memory already, so HASH_ZERO is
- * currently not used when HASH_EARLY is specified.
- */
gfp_flags = (flags & HASH_ZERO) ? GFP_ATOMIC | __GFP_ZERO : GFP_ATOMIC;
do {
size = bucketsize << log2qty;
- if (flags & HASH_EARLY)
- table = memblock_virt_alloc_nopanic(size, 0);
- else if (hashdist)
+ if (flags & HASH_EARLY) {
+ if (flags & HASH_ZERO)
+ table = memblock_virt_alloc_nopanic(size, 0);
+ else
+ table = memblock_virt_alloc_raw(size, 0);
+ } else if (hashdist) {
table = __vmalloc(size, gfp_flags, PAGE_KERNEL);
- else {
+ } else {
/*
* If bucketsize is not a power-of-two, we may free
* some pages at the end of hash table which
@@ -7363,10 +7426,10 @@ void *__init alloc_large_system_hash(const char *tablename,
* race condition. So you can't expect this function should be exact.
*/
bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
+ int migratetype,
bool skip_hwpoisoned_pages)
{
unsigned long pfn, iter, found;
- int mt;
/*
* For avoiding noise data, lru_add_drain_all() should be called
@@ -7374,8 +7437,14 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
*/
if (zone_idx(zone) == ZONE_MOVABLE)
return false;
- mt = get_pageblock_migratetype(page);
- if (mt == MIGRATE_MOVABLE || is_migrate_cma(mt))
+
+ /*
+ * CMA allocations (alloc_contig_range) really need to mark isolate
+ * CMA pageblocks even when they are not movable in fact so consider
+ * them movable here.
+ */
+ if (is_migrate_cma(migratetype) &&
+ is_migrate_cma(get_pageblock_migratetype(page)))
return false;
pfn = page_to_pfn(page);
@@ -7387,6 +7456,9 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
page = pfn_to_page(check);
+ if (PageReserved(page))
+ return true;
+
/*
* Hugepages are not in LRU lists, but they're movable.
* We need not scan over tail pages bacause we don't
@@ -7460,7 +7532,7 @@ bool is_pageblock_removable_nolock(struct page *page)
if (!zone_spans_pfn(zone, pfn))
return false;
- return !has_unmovable_pages(zone, page, 0, true);
+ return !has_unmovable_pages(zone, page, 0, MIGRATE_MOVABLE, true);
}
#if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA)
@@ -7556,6 +7628,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
.zone = page_zone(pfn_to_page(start)),
.mode = MIGRATE_SYNC,
.ignore_skip_hint = true,
+ .no_set_skip_hint = true,
.gfp_mask = current_gfp_context(gfp_mask),
};
INIT_LIST_HEAD(&cc.migratepages);
@@ -7592,11 +7665,18 @@ int alloc_contig_range(unsigned long start, unsigned long end,
/*
* In case of -EBUSY, we'd like to know which page causes problem.
- * So, just fall through. We will check it in test_pages_isolated().
+ * So, just fall through. test_pages_isolated() has a tracepoint
+ * which will report the busy page.
+ *
+ * It is possible that busy pages could become available before
+ * the call to test_pages_isolated, and the range will actually be
+ * allocated. So, if we fall through be sure to clear ret so that
+ * -EBUSY is not accidentally used or returned to caller.
*/
ret = __alloc_contig_migrate_range(&cc, start, end);
if (ret && ret != -EBUSY)
goto done;
+ ret =0;
/*
* Pages from [start, end) are within a MAX_ORDER_NR_PAGES
diff --git a/mm/page_ext.c b/mm/page_ext.c
index 4f0367d472c4..2c16216c29b6 100644
--- a/mm/page_ext.c
+++ b/mm/page_ext.c
@@ -125,7 +125,6 @@ struct page_ext *lookup_page_ext(struct page *page)
struct page_ext *base;
base = NODE_DATA(page_to_nid(page))->node_page_ext;
-#if defined(CONFIG_DEBUG_VM)
/*
* The sanity checks the page allocator does upon freeing a
* page can reach here before the page_ext arrays are
@@ -134,7 +133,6 @@ struct page_ext *lookup_page_ext(struct page *page)
*/
if (unlikely(!base))
return NULL;
-#endif
index = pfn - round_down(node_start_pfn(page_to_nid(page)),
MAX_ORDER_NR_PAGES);
return get_entry(base, index);
@@ -199,7 +197,6 @@ struct page_ext *lookup_page_ext(struct page *page)
{
unsigned long pfn = page_to_pfn(page);
struct mem_section *section = __pfn_to_section(pfn);
-#if defined(CONFIG_DEBUG_VM)
/*
* The sanity checks the page allocator does upon freeing a
* page can reach here before the page_ext arrays are
@@ -208,7 +205,6 @@ struct page_ext *lookup_page_ext(struct page *page)
*/
if (!section->page_ext)
return NULL;
-#endif
return get_entry(section->page_ext, pfn);
}
diff --git a/mm/page_io.c b/mm/page_io.c
index 5d882de3fbfd..e93f1a4cacd7 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -347,7 +347,7 @@ out:
return ret;
}
-int swap_readpage(struct page *page, bool do_poll)
+int swap_readpage(struct page *page, bool synchronous)
{
struct bio *bio;
int ret = 0;
@@ -355,7 +355,7 @@ int swap_readpage(struct page *page, bool do_poll)
blk_qc_t qc;
struct gendisk *disk;
- VM_BUG_ON_PAGE(!PageSwapCache(page), page);
+ VM_BUG_ON_PAGE(!PageSwapCache(page) && !synchronous, page);
VM_BUG_ON_PAGE(!PageLocked(page), page);
VM_BUG_ON_PAGE(PageUptodate(page), page);
if (frontswap_load(page) == 0) {
@@ -403,12 +403,12 @@ int swap_readpage(struct page *page, bool do_poll)
count_vm_event(PSWPIN);
bio_get(bio);
qc = submit_bio(bio);
- while (do_poll) {
+ while (synchronous) {
set_current_state(TASK_UNINTERRUPTIBLE);
if (!READ_ONCE(bio->bi_private))
break;
- if (!blk_mq_poll(disk->queue, qc))
+ if (!blk_poll(disk->queue, qc))
break;
}
__set_current_state(TASK_RUNNING);
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 44f213935bf6..165ed8117bd1 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -15,7 +15,7 @@
#define CREATE_TRACE_POINTS
#include <trace/events/page_isolation.h>
-static int set_migratetype_isolate(struct page *page,
+static int set_migratetype_isolate(struct page *page, int migratetype,
bool skip_hwpoisoned_pages)
{
struct zone *zone;
@@ -52,7 +52,7 @@ static int set_migratetype_isolate(struct page *page,
* FIXME: Now, memory hotplug doesn't call shrink_slab() by itself.
* We just check MOVABLE pages.
*/
- if (!has_unmovable_pages(zone, page, arg.pages_found,
+ if (!has_unmovable_pages(zone, page, arg.pages_found, migratetype,
skip_hwpoisoned_pages))
ret = 0;
@@ -64,14 +64,14 @@ static int set_migratetype_isolate(struct page *page,
out:
if (!ret) {
unsigned long nr_pages;
- int migratetype = get_pageblock_migratetype(page);
+ int mt = get_pageblock_migratetype(page);
set_pageblock_migratetype(page, MIGRATE_ISOLATE);
zone->nr_isolate_pageblock++;
nr_pages = move_freepages_block(zone, page, MIGRATE_ISOLATE,
NULL);
- __mod_zone_freepage_state(zone, -nr_pages, migratetype);
+ __mod_zone_freepage_state(zone, -nr_pages, mt);
}
spin_unlock_irqrestore(&zone->lock, flags);
@@ -183,7 +183,7 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
pfn += pageblock_nr_pages) {
page = __first_valid_page(pfn, pageblock_nr_pages);
if (page &&
- set_migratetype_isolate(page, skip_hwpoisoned_pages)) {
+ set_migratetype_isolate(page, migratetype, skip_hwpoisoned_pages)) {
undo_pfn = pfn;
goto undo;
}
diff --git a/mm/page_owner.c b/mm/page_owner.c
index 4f44b95b9d1e..270a8219ccd0 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -20,9 +20,9 @@
#define PAGE_OWNER_STACK_DEPTH (16)
struct page_owner {
- unsigned int order;
+ unsigned short order;
+ short last_migrate_reason;
gfp_t gfp_mask;
- int last_migrate_reason;
depot_stack_handle_t handle;
};
@@ -616,7 +616,6 @@ static void init_early_allocated_pages(void)
{
pg_data_t *pgdat;
- drain_all_pages(NULL);
for_each_online_pgdat(pgdat)
init_zones_in_node(pgdat);
}
diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
index d22b84310f6d..ae3c2a35d61b 100644
--- a/mm/page_vma_mapped.c
+++ b/mm/page_vma_mapped.c
@@ -30,10 +30,37 @@ static bool map_pte(struct page_vma_mapped_walk *pvmw)
return true;
}
+static inline bool pfn_in_hpage(struct page *hpage, unsigned long pfn)
+{
+ unsigned long hpage_pfn = page_to_pfn(hpage);
+
+ /* THP can be referenced by any subpage */
+ return pfn >= hpage_pfn && pfn - hpage_pfn < hpage_nr_pages(hpage);
+}
+
+/**
+ * check_pte - check if @pvmw->page is mapped at the @pvmw->pte
+ *
+ * page_vma_mapped_walk() found a place where @pvmw->page is *potentially*
+ * mapped. check_pte() has to validate this.
+ *
+ * @pvmw->pte may point to empty PTE, swap PTE or PTE pointing to arbitrary
+ * page.
+ *
+ * If PVMW_MIGRATION flag is set, returns true if @pvmw->pte contains migration
+ * entry that points to @pvmw->page or any subpage in case of THP.
+ *
+ * If PVMW_MIGRATION flag is not set, returns true if @pvmw->pte points to
+ * @pvmw->page or any subpage in case of THP.
+ *
+ * Otherwise, return false.
+ *
+ */
static bool check_pte(struct page_vma_mapped_walk *pvmw)
{
+ unsigned long pfn;
+
if (pvmw->flags & PVMW_MIGRATION) {
-#ifdef CONFIG_MIGRATION
swp_entry_t entry;
if (!is_swap_pte(*pvmw->pte))
return false;
@@ -41,38 +68,25 @@ static bool check_pte(struct page_vma_mapped_walk *pvmw)
if (!is_migration_entry(entry))
return false;
- if (migration_entry_to_page(entry) - pvmw->page >=
- hpage_nr_pages(pvmw->page)) {
- return false;
- }
- if (migration_entry_to_page(entry) < pvmw->page)
- return false;
-#else
- WARN_ON_ONCE(1);
-#endif
- } else {
- if (is_swap_pte(*pvmw->pte)) {
- swp_entry_t entry;
- entry = pte_to_swp_entry(*pvmw->pte);
- if (is_device_private_entry(entry) &&
- device_private_entry_to_page(entry) == pvmw->page)
- return true;
- }
+ pfn = migration_entry_to_pfn(entry);
+ } else if (is_swap_pte(*pvmw->pte)) {
+ swp_entry_t entry;
- if (!pte_present(*pvmw->pte))
+ /* Handle un-addressable ZONE_DEVICE memory */
+ entry = pte_to_swp_entry(*pvmw->pte);
+ if (!is_device_private_entry(entry))
return false;
- /* THP can be referenced by any subpage */
- if (pte_page(*pvmw->pte) - pvmw->page >=
- hpage_nr_pages(pvmw->page)) {
- return false;
- }
- if (pte_page(*pvmw->pte) < pvmw->page)
+ pfn = device_private_entry_to_pfn(entry);
+ } else {
+ if (!pte_present(*pvmw->pte))
return false;
+
+ pfn = pte_pfn(*pvmw->pte);
}
- return true;
+ return pfn_in_hpage(pvmw->page, pfn);
}
/**
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 8bd4afa83cb8..23a3e415ac2c 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -188,8 +188,12 @@ static int walk_hugetlb_range(unsigned long addr, unsigned long end,
do {
next = hugetlb_entry_end(h, addr, end);
pte = huge_pte_offset(walk->mm, addr & hmask, sz);
- if (pte && walk->hugetlb_entry)
+
+ if (pte)
err = walk->hugetlb_entry(pte, hmask, addr, next, walk);
+ else if (walk->pte_hole)
+ err = walk->pte_hole(addr, next, walk);
+
if (err)
break;
} while (addr = next, addr != end);
diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c
index 15dab691ea70..9158e5a81391 100644
--- a/mm/percpu-vm.c
+++ b/mm/percpu-vm.c
@@ -81,7 +81,7 @@ static void pcpu_free_pages(struct pcpu_chunk *chunk,
static int pcpu_alloc_pages(struct pcpu_chunk *chunk,
struct page **pages, int page_start, int page_end)
{
- const gfp_t gfp = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD;
+ const gfp_t gfp = GFP_KERNEL | __GFP_HIGHMEM;
unsigned int cpu, tcpu;
int i;
diff --git a/mm/percpu.c b/mm/percpu.c
index a0e0c82c1e4c..50e7fdf84055 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1856,7 +1856,7 @@ struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups,
__alignof__(ai->groups[0].cpu_map[0]));
ai_size = base_size + nr_units * sizeof(ai->groups[0].cpu_map[0]);
- ptr = memblock_virt_alloc_nopanic(PFN_ALIGN(ai_size), 0);
+ ptr = memblock_virt_alloc_nopanic(PFN_ALIGN(ai_size), PAGE_SIZE);
if (!ptr)
return NULL;
ai = ptr;
@@ -2719,6 +2719,11 @@ void __init setup_per_cpu_areas(void)
if (pcpu_setup_first_chunk(ai, fc) < 0)
panic("Failed to initialize percpu areas.");
+#ifdef CONFIG_CRIS
+#warning "the CRIS architecture has physical and virtual addresses confused"
+#else
+ pcpu_free_alloc_info(ai);
+#endif
}
#endif /* CONFIG_SMP */
diff --git a/mm/rmap.c b/mm/rmap.c
index b874c4761e84..47db27f8049e 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -899,7 +899,7 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma,
mmu_notifier_invalidate_range_start(vma->vm_mm, start, end);
while (page_vma_mapped_walk(&pvmw)) {
- unsigned long cstart, cend;
+ unsigned long cstart;
int ret = 0;
cstart = address = pvmw.address;
@@ -915,7 +915,6 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma,
entry = pte_wrprotect(entry);
entry = pte_mkclean(entry);
set_pte_at(vma->vm_mm, address, pte, entry);
- cend = cstart + PAGE_SIZE;
ret = 1;
} else {
#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
@@ -931,7 +930,6 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma,
entry = pmd_mkclean(entry);
set_pmd_at(vma->vm_mm, address, pmd, entry);
cstart &= PMD_MASK;
- cend = cstart + PMD_SIZE;
ret = 1;
#else
/* unexpected pmd-mapped page? */
@@ -939,10 +937,15 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma,
#endif
}
- if (ret) {
- mmu_notifier_invalidate_range(vma->vm_mm, cstart, cend);
+ /*
+ * No need to call mmu_notifier_invalidate_range() as we are
+ * downgrading page table protection not changing it to point
+ * to a new page.
+ *
+ * See Documentation/vm/mmu_notifier.txt
+ */
+ if (ret)
(*cleaned)++;
- }
}
mmu_notifier_invalidate_range_end(vma->vm_mm, start, end);
@@ -1318,7 +1321,7 @@ void page_remove_rmap(struct page *page, bool compound)
* It would be tidy to reset the PageAnon mapping here,
* but that might overwrite a racing page_add_anon_rmap
* which increments mapcount after us but sets mapping
- * before us: so leave the reset to free_hot_cold_page,
+ * before us: so leave the reset to free_unref_page,
* and remember that it's only reliable while mapped.
* Leaving it set also helps swapoff to reinstate ptes
* faster for those pages still in swapcache.
@@ -1426,6 +1429,10 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
if (pte_soft_dirty(pteval))
swp_pte = pte_swp_mksoft_dirty(swp_pte);
set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte);
+ /*
+ * No need to invalidate here it will synchronize on
+ * against the special swap migration pte.
+ */
goto discard;
}
@@ -1483,6 +1490,9 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
* will take care of the rest.
*/
dec_mm_counter(mm, mm_counter(page));
+ /* We have to invalidate as we cleared the pte */
+ mmu_notifier_invalidate_range(mm, address,
+ address + PAGE_SIZE);
} else if (IS_ENABLED(CONFIG_MIGRATION) &&
(flags & (TTU_MIGRATION|TTU_SPLIT_FREEZE))) {
swp_entry_t entry;
@@ -1498,6 +1508,10 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
if (pte_soft_dirty(pteval))
swp_pte = pte_swp_mksoft_dirty(swp_pte);
set_pte_at(mm, address, pvmw.pte, swp_pte);
+ /*
+ * No need to invalidate here it will synchronize on
+ * against the special swap migration pte.
+ */
} else if (PageAnon(page)) {
swp_entry_t entry = { .val = page_private(subpage) };
pte_t swp_pte;
@@ -1509,6 +1523,8 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
WARN_ON_ONCE(1);
ret = false;
/* We have to invalidate as we cleared the pte */
+ mmu_notifier_invalidate_range(mm, address,
+ address + PAGE_SIZE);
page_vma_mapped_walk_done(&pvmw);
break;
}
@@ -1516,6 +1532,9 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
/* MADV_FREE page check */
if (!PageSwapBacked(page)) {
if (!PageDirty(page)) {
+ /* Invalidate as we cleared the pte */
+ mmu_notifier_invalidate_range(mm,
+ address, address + PAGE_SIZE);
dec_mm_counter(mm, MM_ANONPAGES);
goto discard;
}
@@ -1549,13 +1568,39 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
if (pte_soft_dirty(pteval))
swp_pte = pte_swp_mksoft_dirty(swp_pte);
set_pte_at(mm, address, pvmw.pte, swp_pte);
- } else
+ /* Invalidate as we cleared the pte */
+ mmu_notifier_invalidate_range(mm, address,
+ address + PAGE_SIZE);
+ } else {
+ /*
+ * We should not need to notify here as we reach this
+ * case only from freeze_page() itself only call from
+ * split_huge_page_to_list() so everything below must
+ * be true:
+ * - page is not anonymous
+ * - page is locked
+ *
+ * So as it is a locked file back page thus it can not
+ * be remove from the page cache and replace by a new
+ * page before mmu_notifier_invalidate_range_end so no
+ * concurrent thread might update its page table to
+ * point at new page while a device still is using this
+ * page.
+ *
+ * See Documentation/vm/mmu_notifier.txt
+ */
dec_mm_counter(mm, mm_counter_file(page));
+ }
discard:
+ /*
+ * No need to call mmu_notifier_invalidate_range() it has be
+ * done above for all cases requiring it to happen under page
+ * table lock before mmu_notifier_invalidate_range_end()
+ *
+ * See Documentation/vm/mmu_notifier.txt
+ */
page_remove_rmap(subpage, PageHuge(page));
put_page(page);
- mmu_notifier_invalidate_range(mm, address,
- address + PAGE_SIZE);
}
mmu_notifier_invalidate_range_end(vma->vm_mm, start, end);
diff --git a/mm/shmem.c b/mm/shmem.c
index 07a1d22807be..7fbe67be86fa 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -338,7 +338,7 @@ static int shmem_radix_tree_replace(struct address_space *mapping,
if (item != expected)
return -ENOENT;
__radix_tree_replace(&mapping->page_tree, node, pslot,
- replacement, NULL, NULL);
+ replacement, NULL);
return 0;
}
@@ -747,7 +747,7 @@ void shmem_unlock_mapping(struct address_space *mapping)
pgoff_t indices[PAGEVEC_SIZE];
pgoff_t index = 0;
- pagevec_init(&pvec, 0);
+ pagevec_init(&pvec);
/*
* Minor point, but we might as well stop if someone else SHM_LOCKs it.
*/
@@ -790,7 +790,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
if (lend == -1)
end = -1; /* unsigned, so actually very big */
- pagevec_init(&pvec, 0);
+ pagevec_init(&pvec);
index = start;
while (index < end) {
pvec.nr = find_get_entries(mapping, index,
@@ -2528,7 +2528,7 @@ static pgoff_t shmem_seek_hole_data(struct address_space *mapping,
bool done = false;
int i;
- pagevec_init(&pvec, 0);
+ pagevec_init(&pvec);
pvec.nr = 1; /* start small: we may be there already */
while (!done) {
pvec.nr = find_get_entries(mapping, index,
@@ -3202,7 +3202,6 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
int len;
struct inode *inode;
struct page *page;
- struct shmem_inode_info *info;
len = strlen(symname) + 1;
if (len > PAGE_SIZE)
@@ -3222,7 +3221,6 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
error = 0;
}
- info = SHMEM_I(inode);
inode->i_size = len-1;
if (len <= SHORT_SYMLINK_LEN) {
inode->i_link = kmemdup(symname, len, GFP_KERNEL);
@@ -3778,7 +3776,7 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent)
* tmpfs instance, limiting inodes to one per page of lowmem;
* but the internal instance is left unlimited.
*/
- if (!(sb->s_flags & MS_KERNMOUNT)) {
+ if (!(sb->s_flags & SB_KERNMOUNT)) {
sbinfo->max_blocks = shmem_default_max_blocks();
sbinfo->max_inodes = shmem_default_max_inodes();
if (shmem_parse_options(data, sbinfo, false)) {
@@ -3786,12 +3784,12 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent)
goto failed;
}
} else {
- sb->s_flags |= MS_NOUSER;
+ sb->s_flags |= SB_NOUSER;
}
sb->s_export_op = &shmem_export_ops;
- sb->s_flags |= MS_NOSEC;
+ sb->s_flags |= SB_NOSEC;
#else
- sb->s_flags |= MS_NOUSER;
+ sb->s_flags |= SB_NOUSER;
#endif
spin_lock_init(&sbinfo->stat_lock);
@@ -3811,7 +3809,7 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent)
sb->s_xattr = shmem_xattr_handlers;
#endif
#ifdef CONFIG_TMPFS_POSIX_ACL
- sb->s_flags |= MS_POSIXACL;
+ sb->s_flags |= SB_POSIXACL;
#endif
uuid_gen(&sb->s_uuid);
@@ -3862,12 +3860,11 @@ static void shmem_init_inode(void *foo)
inode_init_once(&info->vfs_inode);
}
-static int shmem_init_inodecache(void)
+static void shmem_init_inodecache(void)
{
shmem_inode_cachep = kmem_cache_create("shmem_inode_cache",
sizeof(struct shmem_inode_info),
0, SLAB_PANIC|SLAB_ACCOUNT, shmem_init_inode);
- return 0;
}
static void shmem_destroy_inodecache(void)
@@ -3991,9 +3988,7 @@ int __init shmem_init(void)
if (shmem_inode_cachep)
return 0;
- error = shmem_init_inodecache();
- if (error)
- goto out3;
+ shmem_init_inodecache();
error = register_filesystem(&shmem_fs_type);
if (error) {
@@ -4020,7 +4015,6 @@ out1:
unregister_filesystem(&shmem_fs_type);
out2:
shmem_destroy_inodecache();
-out3:
shm_mnt = ERR_PTR(error);
return error;
}
@@ -4102,6 +4096,7 @@ bool shmem_huge_enabled(struct vm_area_struct *vma)
if (i_size >= HPAGE_PMD_SIZE &&
i_size >> PAGE_SHIFT >= off)
return true;
+ /* fall through */
case SHMEM_HUGE_ADVISE:
/* TODO: implement fadvise() hints */
return (vma->vm_flags & VM_HUGEPAGE);
@@ -4183,7 +4178,7 @@ static const struct dentry_operations anon_ops = {
.d_dname = simple_dname
};
-static struct file *__shmem_file_setup(const char *name, loff_t size,
+static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name, loff_t size,
unsigned long flags, unsigned int i_flags)
{
struct file *res;
@@ -4192,8 +4187,8 @@ static struct file *__shmem_file_setup(const char *name, loff_t size,
struct super_block *sb;
struct qstr this;
- if (IS_ERR(shm_mnt))
- return ERR_CAST(shm_mnt);
+ if (IS_ERR(mnt))
+ return ERR_CAST(mnt);
if (size < 0 || size > MAX_LFS_FILESIZE)
return ERR_PTR(-EINVAL);
@@ -4205,8 +4200,8 @@ static struct file *__shmem_file_setup(const char *name, loff_t size,
this.name = name;
this.len = strlen(name);
this.hash = 0; /* will go */
- sb = shm_mnt->mnt_sb;
- path.mnt = mntget(shm_mnt);
+ sb = mnt->mnt_sb;
+ path.mnt = mntget(mnt);
path.dentry = d_alloc_pseudo(sb, &this);
if (!path.dentry)
goto put_memory;
@@ -4251,7 +4246,7 @@ put_path:
*/
struct file *shmem_kernel_file_setup(const char *name, loff_t size, unsigned long flags)
{
- return __shmem_file_setup(name, size, flags, S_PRIVATE);
+ return __shmem_file_setup(shm_mnt, name, size, flags, S_PRIVATE);
}
/**
@@ -4262,11 +4257,25 @@ struct file *shmem_kernel_file_setup(const char *name, loff_t size, unsigned lon
*/
struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags)
{
- return __shmem_file_setup(name, size, flags, 0);
+ return __shmem_file_setup(shm_mnt, name, size, flags, 0);
}
EXPORT_SYMBOL_GPL(shmem_file_setup);
/**
+ * shmem_file_setup_with_mnt - get an unlinked file living in tmpfs
+ * @mnt: the tmpfs mount where the file will be created
+ * @name: name for dentry (to be seen in /proc/<pid>/maps
+ * @size: size to be set for the file
+ * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
+ */
+struct file *shmem_file_setup_with_mnt(struct vfsmount *mnt, const char *name,
+ loff_t size, unsigned long flags)
+{
+ return __shmem_file_setup(mnt, name, size, flags, 0);
+}
+EXPORT_SYMBOL_GPL(shmem_file_setup_with_mnt);
+
+/**
* shmem_zero_setup - setup a shared anonymous mapping
* @vma: the vma to be mmapped is prepared by do_mmap_pgoff
*/
@@ -4281,7 +4290,7 @@ int shmem_zero_setup(struct vm_area_struct *vma)
* accessible to the user through its mapping, use S_PRIVATE flag to
* bypass file security, in the same way as shmem_kernel_file_setup().
*/
- file = __shmem_file_setup("dev/zero", size, vma->vm_flags, S_PRIVATE);
+ file = shmem_kernel_file_setup("dev/zero", size, vma->vm_flags);
if (IS_ERR(file))
return PTR_ERR(file);
diff --git a/mm/slab.c b/mm/slab.c
index b7095884fd93..4e51ef954026 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -114,7 +114,6 @@
#include <linux/rtmutex.h>
#include <linux/reciprocal_div.h>
#include <linux/debugobjects.h>
-#include <linux/kmemcheck.h>
#include <linux/memory.h>
#include <linux/prefetch.h>
#include <linux/sched/task_stack.h>
@@ -252,8 +251,8 @@ static void kmem_cache_node_init(struct kmem_cache_node *parent)
MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid); \
} while (0)
-#define CFLGS_OBJFREELIST_SLAB (0x40000000UL)
-#define CFLGS_OFF_SLAB (0x80000000UL)
+#define CFLGS_OBJFREELIST_SLAB ((slab_flags_t __force)0x40000000U)
+#define CFLGS_OFF_SLAB ((slab_flags_t __force)0x80000000U)
#define OBJFREELIST_SLAB(x) ((x)->flags & CFLGS_OBJFREELIST_SLAB)
#define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB)
@@ -441,7 +440,7 @@ static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep)
* Calculate the number of objects and left-over bytes for a given buffer size.
*/
static unsigned int cache_estimate(unsigned long gfporder, size_t buffer_size,
- unsigned long flags, size_t *left_over)
+ slab_flags_t flags, size_t *left_over)
{
unsigned int num;
size_t slab_size = PAGE_SIZE << gfporder;
@@ -1410,10 +1409,8 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags,
int nr_pages;
flags |= cachep->allocflags;
- if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
- flags |= __GFP_RECLAIMABLE;
- page = __alloc_pages_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder);
+ page = __alloc_pages_node(nodeid, flags, cachep->gfporder);
if (!page) {
slab_out_of_memory(cachep, flags, nodeid);
return NULL;
@@ -1435,15 +1432,6 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags,
if (sk_memalloc_socks() && page_is_pfmemalloc(page))
SetPageSlabPfmemalloc(page);
- if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) {
- kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid);
-
- if (cachep->ctor)
- kmemcheck_mark_uninitialized_pages(page, nr_pages);
- else
- kmemcheck_mark_unallocated_pages(page, nr_pages);
- }
-
return page;
}
@@ -1455,8 +1443,6 @@ static void kmem_freepages(struct kmem_cache *cachep, struct page *page)
int order = cachep->gfporder;
unsigned long nr_freed = (1 << order);
- kmemcheck_free_shadow(page, order);
-
if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
mod_lruvec_page_state(page, NR_SLAB_RECLAIMABLE, -nr_freed);
else
@@ -1598,11 +1584,8 @@ static void print_objinfo(struct kmem_cache *cachep, void *objp, int lines)
*dbg_redzone2(cachep, objp));
}
- if (cachep->flags & SLAB_STORE_USER) {
- pr_err("Last user: [<%p>](%pSR)\n",
- *dbg_userword(cachep, objp),
- *dbg_userword(cachep, objp));
- }
+ if (cachep->flags & SLAB_STORE_USER)
+ pr_err("Last user: (%pSR)\n", *dbg_userword(cachep, objp));
realobj = (char *)objp + obj_offset(cachep);
size = cachep->object_size;
for (i = 0; i < size && lines; i += 16, lines--) {
@@ -1635,7 +1618,7 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp)
/* Mismatch ! */
/* Print header */
if (lines == 0) {
- pr_err("Slab corruption (%s): %s start=%p, len=%d\n",
+ pr_err("Slab corruption (%s): %s start=%px, len=%d\n",
print_tainted(), cachep->name,
realobj, size);
print_objinfo(cachep, objp, 0);
@@ -1664,13 +1647,13 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp)
if (objnr) {
objp = index_to_obj(cachep, page, objnr - 1);
realobj = (char *)objp + obj_offset(cachep);
- pr_err("Prev obj: start=%p, len=%d\n", realobj, size);
+ pr_err("Prev obj: start=%px, len=%d\n", realobj, size);
print_objinfo(cachep, objp, 2);
}
if (objnr + 1 < cachep->num) {
objp = index_to_obj(cachep, page, objnr + 1);
realobj = (char *)objp + obj_offset(cachep);
- pr_err("Next obj: start=%p, len=%d\n", realobj, size);
+ pr_err("Next obj: start=%px, len=%d\n", realobj, size);
print_objinfo(cachep, objp, 2);
}
}
@@ -1761,7 +1744,7 @@ static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list)
* towards high-order requests, this should be changed.
*/
static size_t calculate_slab_order(struct kmem_cache *cachep,
- size_t size, unsigned long flags)
+ size_t size, slab_flags_t flags)
{
size_t left_over = 0;
int gfporder;
@@ -1888,8 +1871,8 @@ static int __ref setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
return 0;
}
-unsigned long kmem_cache_flags(unsigned long object_size,
- unsigned long flags, const char *name,
+slab_flags_t kmem_cache_flags(unsigned long object_size,
+ slab_flags_t flags, const char *name,
void (*ctor)(void *))
{
return flags;
@@ -1897,7 +1880,7 @@ unsigned long kmem_cache_flags(unsigned long object_size,
struct kmem_cache *
__kmem_cache_alias(const char *name, size_t size, size_t align,
- unsigned long flags, void (*ctor)(void *))
+ slab_flags_t flags, void (*ctor)(void *))
{
struct kmem_cache *cachep;
@@ -1915,7 +1898,7 @@ __kmem_cache_alias(const char *name, size_t size, size_t align,
}
static bool set_objfreelist_slab_cache(struct kmem_cache *cachep,
- size_t size, unsigned long flags)
+ size_t size, slab_flags_t flags)
{
size_t left;
@@ -1938,7 +1921,7 @@ static bool set_objfreelist_slab_cache(struct kmem_cache *cachep,
}
static bool set_off_slab_cache(struct kmem_cache *cachep,
- size_t size, unsigned long flags)
+ size_t size, slab_flags_t flags)
{
size_t left;
@@ -1972,7 +1955,7 @@ static bool set_off_slab_cache(struct kmem_cache *cachep,
}
static bool set_on_slab_cache(struct kmem_cache *cachep,
- size_t size, unsigned long flags)
+ size_t size, slab_flags_t flags)
{
size_t left;
@@ -2008,8 +1991,7 @@ static bool set_on_slab_cache(struct kmem_cache *cachep,
* cacheline. This can be beneficial if you're counting cycles as closely
* as davem.
*/
-int
-__kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
+int __kmem_cache_create(struct kmem_cache *cachep, slab_flags_t flags)
{
size_t ralign = BYTES_PER_WORD;
gfp_t gfp;
@@ -2144,6 +2126,8 @@ done:
cachep->allocflags = __GFP_COMP;
if (flags & SLAB_CACHE_DMA)
cachep->allocflags |= GFP_DMA;
+ if (flags & SLAB_RECLAIM_ACCOUNT)
+ cachep->allocflags |= __GFP_RECLAIMABLE;
cachep->size = size;
cachep->reciprocal_buffer_size = reciprocal_value(size);
@@ -2621,7 +2605,7 @@ static void slab_put_obj(struct kmem_cache *cachep,
/* Verify double free bug */
for (i = page->active; i < cachep->num; i++) {
if (get_free_obj(page, i) == objnr) {
- pr_err("slab: double free detected in cache '%s', objp %p\n",
+ pr_err("slab: double free detected in cache '%s', objp %px\n",
cachep->name, objp);
BUG();
}
@@ -2785,7 +2769,7 @@ static inline void verify_redzone_free(struct kmem_cache *cache, void *obj)
else
slab_error(cache, "memory outside object was overwritten");
- pr_err("%p: redzone 1:0x%llx, redzone 2:0x%llx\n",
+ pr_err("%px: redzone 1:0x%llx, redzone 2:0x%llx\n",
obj, redzone1, redzone2);
}
@@ -3091,7 +3075,7 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
if (*dbg_redzone1(cachep, objp) != RED_INACTIVE ||
*dbg_redzone2(cachep, objp) != RED_INACTIVE) {
slab_error(cachep, "double free, or memory outside object was overwritten");
- pr_err("%p: redzone 1:0x%llx, redzone 2:0x%llx\n",
+ pr_err("%px: redzone 1:0x%llx, redzone 2:0x%llx\n",
objp, *dbg_redzone1(cachep, objp),
*dbg_redzone2(cachep, objp));
}
@@ -3104,7 +3088,7 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
cachep->ctor(objp);
if (ARCH_SLAB_MINALIGN &&
((unsigned long)objp & (ARCH_SLAB_MINALIGN-1))) {
- pr_err("0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n",
+ pr_err("0x%px: not aligned to ARCH_SLAB_MINALIGN=%d\n",
objp, (int)ARCH_SLAB_MINALIGN);
}
return objp;
@@ -3516,8 +3500,6 @@ void ___cache_free(struct kmem_cache *cachep, void *objp,
kmemleak_free_recursive(objp, cachep->flags);
objp = cache_free_debugcheck(cachep, objp, caller);
- kmemcheck_slab_free(cachep, objp, cachep->object_size);
-
/*
* Skip calling cache_free_alien() when the platform is not numa.
* This will avoid cache misses that happen while accessing slabp (which
@@ -4097,7 +4079,6 @@ out:
schedule_delayed_work(work, round_jiffies_relative(REAPTIMEOUT_AC));
}
-#ifdef CONFIG_SLABINFO
void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo)
{
unsigned long active_objs, num_objs, active_slabs;
@@ -4299,7 +4280,7 @@ static void show_symbol(struct seq_file *m, unsigned long address)
return;
}
#endif
- seq_printf(m, "%p", (void *)address);
+ seq_printf(m, "%px", (void *)address);
}
static int leaks_show(struct seq_file *m, void *p)
@@ -4405,7 +4386,6 @@ static int __init slab_proc_init(void)
return 0;
}
module_init(slab_proc_init);
-#endif
#ifdef CONFIG_HARDENED_USERCOPY
/*
diff --git a/mm/slab.h b/mm/slab.h
index 86d7c7d860f9..ad657ffa44e5 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -21,7 +21,7 @@ struct kmem_cache {
unsigned int object_size;/* The original size of the object */
unsigned int size; /* The aligned/padded/added on size */
unsigned int align; /* Alignment as calculated */
- unsigned long flags; /* Active flags on the slab */
+ slab_flags_t flags; /* Active flags on the slab */
const char *name; /* Slab name for sysfs */
int refcount; /* Use counter */
void (*ctor)(void *); /* Called on object slot creation */
@@ -40,7 +40,6 @@ struct kmem_cache {
#include <linux/memcontrol.h>
#include <linux/fault-inject.h>
-#include <linux/kmemcheck.h>
#include <linux/kasan.h>
#include <linux/kmemleak.h>
#include <linux/random.h>
@@ -79,13 +78,13 @@ extern const struct kmalloc_info_struct {
unsigned long size;
} kmalloc_info[];
-unsigned long calculate_alignment(unsigned long flags,
+unsigned long calculate_alignment(slab_flags_t flags,
unsigned long align, unsigned long size);
#ifndef CONFIG_SLOB
/* Kmalloc array related functions */
void setup_kmalloc_cache_index_table(void);
-void create_kmalloc_caches(unsigned long);
+void create_kmalloc_caches(slab_flags_t);
/* Find the kmalloc slab corresponding for a certain size */
struct kmem_cache *kmalloc_slab(size_t, gfp_t);
@@ -93,32 +92,32 @@ struct kmem_cache *kmalloc_slab(size_t, gfp_t);
/* Functions provided by the slab allocators */
-extern int __kmem_cache_create(struct kmem_cache *, unsigned long flags);
+int __kmem_cache_create(struct kmem_cache *, slab_flags_t flags);
extern struct kmem_cache *create_kmalloc_cache(const char *name, size_t size,
- unsigned long flags);
+ slab_flags_t flags);
extern void create_boot_cache(struct kmem_cache *, const char *name,
- size_t size, unsigned long flags);
+ size_t size, slab_flags_t flags);
int slab_unmergeable(struct kmem_cache *s);
struct kmem_cache *find_mergeable(size_t size, size_t align,
- unsigned long flags, const char *name, void (*ctor)(void *));
+ slab_flags_t flags, const char *name, void (*ctor)(void *));
#ifndef CONFIG_SLOB
struct kmem_cache *
__kmem_cache_alias(const char *name, size_t size, size_t align,
- unsigned long flags, void (*ctor)(void *));
+ slab_flags_t flags, void (*ctor)(void *));
-unsigned long kmem_cache_flags(unsigned long object_size,
- unsigned long flags, const char *name,
+slab_flags_t kmem_cache_flags(unsigned long object_size,
+ slab_flags_t flags, const char *name,
void (*ctor)(void *));
#else
static inline struct kmem_cache *
__kmem_cache_alias(const char *name, size_t size, size_t align,
- unsigned long flags, void (*ctor)(void *))
+ slab_flags_t flags, void (*ctor)(void *))
{ return NULL; }
-static inline unsigned long kmem_cache_flags(unsigned long object_size,
- unsigned long flags, const char *name,
+static inline slab_flags_t kmem_cache_flags(unsigned long object_size,
+ slab_flags_t flags, const char *name,
void (*ctor)(void *))
{
return flags;
@@ -142,10 +141,10 @@ static inline unsigned long kmem_cache_flags(unsigned long object_size,
#if defined(CONFIG_SLAB)
#define SLAB_CACHE_FLAGS (SLAB_MEM_SPREAD | SLAB_NOLEAKTRACE | \
SLAB_RECLAIM_ACCOUNT | SLAB_TEMPORARY | \
- SLAB_NOTRACK | SLAB_ACCOUNT)
+ SLAB_ACCOUNT)
#elif defined(CONFIG_SLUB)
#define SLAB_CACHE_FLAGS (SLAB_NOLEAKTRACE | SLAB_RECLAIM_ACCOUNT | \
- SLAB_TEMPORARY | SLAB_NOTRACK | SLAB_ACCOUNT)
+ SLAB_TEMPORARY | SLAB_ACCOUNT)
#else
#define SLAB_CACHE_FLAGS (0)
#endif
@@ -164,7 +163,6 @@ static inline unsigned long kmem_cache_flags(unsigned long object_size,
SLAB_NOLEAKTRACE | \
SLAB_RECLAIM_ACCOUNT | \
SLAB_TEMPORARY | \
- SLAB_NOTRACK | \
SLAB_ACCOUNT)
int __kmem_cache_shutdown(struct kmem_cache *);
@@ -439,7 +437,6 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags,
for (i = 0; i < size; i++) {
void *object = p[i];
- kmemcheck_slab_alloc(s, flags, object, slab_ksize(s));
kmemleak_alloc_recursive(object, s->object_size, 1,
s->flags, flags);
kasan_slab_alloc(s, object, flags);
@@ -506,6 +503,14 @@ void *memcg_slab_next(struct seq_file *m, void *p, loff_t *pos);
void memcg_slab_stop(struct seq_file *m, void *p);
int memcg_slab_show(struct seq_file *m, void *p);
+#if defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG)
+void dump_unreclaimable_slab(void);
+#else
+static inline void dump_unreclaimable_slab(void)
+{
+}
+#endif
+
void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr);
#ifdef CONFIG_SLAB_FREELIST_RANDOM
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 0d7fe71ff5e4..c8cb36774ba1 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -44,7 +44,7 @@ static DECLARE_WORK(slab_caches_to_rcu_destroy_work,
SLAB_FAILSLAB | SLAB_KASAN)
#define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | \
- SLAB_NOTRACK | SLAB_ACCOUNT)
+ SLAB_ACCOUNT)
/*
* Merge control. If this is set then no merging of slab caches will occur.
@@ -291,7 +291,7 @@ int slab_unmergeable(struct kmem_cache *s)
}
struct kmem_cache *find_mergeable(size_t size, size_t align,
- unsigned long flags, const char *name, void (*ctor)(void *))
+ slab_flags_t flags, const char *name, void (*ctor)(void *))
{
struct kmem_cache *s;
@@ -341,7 +341,7 @@ struct kmem_cache *find_mergeable(size_t size, size_t align,
* Figure out what the alignment of the objects will be given a set of
* flags, a user specified alignment and the size of the objects.
*/
-unsigned long calculate_alignment(unsigned long flags,
+unsigned long calculate_alignment(slab_flags_t flags,
unsigned long align, unsigned long size)
{
/*
@@ -366,7 +366,7 @@ unsigned long calculate_alignment(unsigned long flags,
static struct kmem_cache *create_cache(const char *name,
size_t object_size, size_t size, size_t align,
- unsigned long flags, void (*ctor)(void *),
+ slab_flags_t flags, void (*ctor)(void *),
struct mem_cgroup *memcg, struct kmem_cache *root_cache)
{
struct kmem_cache *s;
@@ -431,7 +431,7 @@ out_free_cache:
*/
struct kmem_cache *
kmem_cache_create(const char *name, size_t size, size_t align,
- unsigned long flags, void (*ctor)(void *))
+ slab_flags_t flags, void (*ctor)(void *))
{
struct kmem_cache *s = NULL;
const char *cache_name;
@@ -879,7 +879,7 @@ bool slab_is_available(void)
#ifndef CONFIG_SLOB
/* Create a cache during boot when no slab services are available yet */
void __init create_boot_cache(struct kmem_cache *s, const char *name, size_t size,
- unsigned long flags)
+ slab_flags_t flags)
{
int err;
@@ -899,7 +899,7 @@ void __init create_boot_cache(struct kmem_cache *s, const char *name, size_t siz
}
struct kmem_cache *__init create_kmalloc_cache(const char *name, size_t size,
- unsigned long flags)
+ slab_flags_t flags)
{
struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
@@ -1057,7 +1057,7 @@ void __init setup_kmalloc_cache_index_table(void)
}
}
-static void __init new_kmalloc_cache(int idx, unsigned long flags)
+static void __init new_kmalloc_cache(int idx, slab_flags_t flags)
{
kmalloc_caches[idx] = create_kmalloc_cache(kmalloc_info[idx].name,
kmalloc_info[idx].size, flags);
@@ -1068,7 +1068,7 @@ static void __init new_kmalloc_cache(int idx, unsigned long flags)
* may already have been created because they were needed to
* enable allocations for slab creation.
*/
-void __init create_kmalloc_caches(unsigned long flags)
+void __init create_kmalloc_caches(slab_flags_t flags)
{
int i;
@@ -1184,8 +1184,7 @@ void cache_random_seq_destroy(struct kmem_cache *cachep)
}
#endif /* CONFIG_SLAB_FREELIST_RANDOM */
-#ifdef CONFIG_SLABINFO
-
+#if defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG)
#ifdef CONFIG_SLAB
#define SLABINFO_RIGHTS (S_IWUSR | S_IRUSR)
#else
@@ -1281,7 +1280,41 @@ static int slab_show(struct seq_file *m, void *p)
return 0;
}
-#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB)
+void dump_unreclaimable_slab(void)
+{
+ struct kmem_cache *s, *s2;
+ struct slabinfo sinfo;
+
+ /*
+ * Here acquiring slab_mutex is risky since we don't prefer to get
+ * sleep in oom path. But, without mutex hold, it may introduce a
+ * risk of crash.
+ * Use mutex_trylock to protect the list traverse, dump nothing
+ * without acquiring the mutex.
+ */
+ if (!mutex_trylock(&slab_mutex)) {
+ pr_warn("excessive unreclaimable slab but cannot dump stats\n");
+ return;
+ }
+
+ pr_info("Unreclaimable slab info:\n");
+ pr_info("Name Used Total\n");
+
+ list_for_each_entry_safe(s, s2, &slab_caches, list) {
+ if (!is_root_cache(s) || (s->flags & SLAB_RECLAIM_ACCOUNT))
+ continue;
+
+ get_slabinfo(s, &sinfo);
+
+ if (sinfo.num_objs > 0)
+ pr_info("%-17s %10luKB %10luKB\n", cache_name(s),
+ (sinfo.active_objs * s->size) / 1024,
+ (sinfo.num_objs * s->size) / 1024);
+ }
+ mutex_unlock(&slab_mutex);
+}
+
+#if defined(CONFIG_MEMCG)
void *memcg_slab_start(struct seq_file *m, loff_t *pos)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
@@ -1355,7 +1388,7 @@ static int __init slab_proc_init(void)
return 0;
}
module_init(slab_proc_init);
-#endif /* CONFIG_SLABINFO */
+#endif /* CONFIG_SLAB || CONFIG_SLUB_DEBUG */
static __always_inline void *__do_krealloc(const void *p, size_t new_size,
gfp_t flags)
diff --git a/mm/slob.c b/mm/slob.c
index 10249160b693..623e8a5c46ce 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -330,7 +330,7 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node)
BUG_ON(!b);
spin_unlock_irqrestore(&slob_lock, flags);
}
- if (unlikely((gfp & __GFP_ZERO) && b))
+ if (unlikely(gfp & __GFP_ZERO))
memset(b, 0, size);
return b;
}
@@ -524,7 +524,7 @@ size_t ksize(const void *block)
}
EXPORT_SYMBOL(ksize);
-int __kmem_cache_create(struct kmem_cache *c, unsigned long flags)
+int __kmem_cache_create(struct kmem_cache *c, slab_flags_t flags)
{
if (flags & SLAB_TYPESAFE_BY_RCU) {
/* leave room for rcu footer at the end of object */
diff --git a/mm/slub.c b/mm/slub.c
index 1efbb8123037..cfd56e5a35fb 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -22,7 +22,6 @@
#include <linux/notifier.h>
#include <linux/seq_file.h>
#include <linux/kasan.h>
-#include <linux/kmemcheck.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/mempolicy.h>
@@ -193,8 +192,10 @@ static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s)
#define MAX_OBJS_PER_PAGE 32767 /* since page.objects is u15 */
/* Internal SLUB flags */
-#define __OBJECT_POISON 0x80000000UL /* Poison object */
-#define __CMPXCHG_DOUBLE 0x40000000UL /* Use cmpxchg_double */
+/* Poison object */
+#define __OBJECT_POISON ((slab_flags_t __force)0x80000000U)
+/* Use cmpxchg_double */
+#define __CMPXCHG_DOUBLE ((slab_flags_t __force)0x40000000U)
/*
* Tracking user of a slab.
@@ -485,9 +486,9 @@ static inline void *restore_red_left(struct kmem_cache *s, void *p)
* Debug settings:
*/
#if defined(CONFIG_SLUB_DEBUG_ON)
-static int slub_debug = DEBUG_DEFAULT_FLAGS;
+static slab_flags_t slub_debug = DEBUG_DEFAULT_FLAGS;
#else
-static int slub_debug;
+static slab_flags_t slub_debug;
#endif
static char *slub_debug_slabs;
@@ -1289,8 +1290,8 @@ out:
__setup("slub_debug", setup_slub_debug);
-unsigned long kmem_cache_flags(unsigned long object_size,
- unsigned long flags, const char *name,
+slab_flags_t kmem_cache_flags(unsigned long object_size,
+ slab_flags_t flags, const char *name,
void (*ctor)(void *))
{
/*
@@ -1322,8 +1323,8 @@ static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n,
struct page *page) {}
static inline void remove_full(struct kmem_cache *s, struct kmem_cache_node *n,
struct page *page) {}
-unsigned long kmem_cache_flags(unsigned long object_size,
- unsigned long flags, const char *name,
+slab_flags_t kmem_cache_flags(unsigned long object_size,
+ slab_flags_t flags, const char *name,
void (*ctor)(void *))
{
return flags;
@@ -1370,12 +1371,11 @@ static inline void *slab_free_hook(struct kmem_cache *s, void *x)
* So in order to make the debug calls that expect irqs to be
* disabled we need to disable interrupts temporarily.
*/
-#if defined(CONFIG_KMEMCHECK) || defined(CONFIG_LOCKDEP)
+#ifdef CONFIG_LOCKDEP
{
unsigned long flags;
local_irq_save(flags);
- kmemcheck_slab_free(s, x, s->object_size);
debug_check_no_locks_freed(x, s->object_size);
local_irq_restore(flags);
}
@@ -1399,8 +1399,7 @@ static inline void slab_free_freelist_hook(struct kmem_cache *s,
* Compiler cannot detect this function can be removed if slab_free_hook()
* evaluates to nothing. Thus, catch all relevant config debug options here.
*/
-#if defined(CONFIG_KMEMCHECK) || \
- defined(CONFIG_LOCKDEP) || \
+#if defined(CONFIG_LOCKDEP) || \
defined(CONFIG_DEBUG_KMEMLEAK) || \
defined(CONFIG_DEBUG_OBJECTS_FREE) || \
defined(CONFIG_KASAN)
@@ -1436,8 +1435,6 @@ static inline struct page *alloc_slab_page(struct kmem_cache *s,
struct page *page;
int order = oo_order(oo);
- flags |= __GFP_NOTRACK;
-
if (node == NUMA_NO_NODE)
page = alloc_pages(flags, order);
else
@@ -1596,22 +1593,6 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
stat(s, ORDER_FALLBACK);
}
- if (kmemcheck_enabled &&
- !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) {
- int pages = 1 << oo_order(oo);
-
- kmemcheck_alloc_shadow(page, oo_order(oo), alloc_gfp, node);
-
- /*
- * Objects from caches that have a constructor don't get
- * cleared when they're allocated, so we need to do it here.
- */
- if (s->ctor)
- kmemcheck_mark_uninitialized_pages(page, pages);
- else
- kmemcheck_mark_unallocated_pages(page, pages);
- }
-
page->objects = oo_objects(oo);
order = compound_order(page);
@@ -1687,8 +1668,6 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
check_object(s, page, p, SLUB_RED_INACTIVE);
}
- kmemcheck_free_shadow(page, compound_order(page));
-
mod_lruvec_page_state(page,
(s->flags & SLAB_RECLAIM_ACCOUNT) ?
NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
@@ -3477,7 +3456,7 @@ static void set_cpu_partial(struct kmem_cache *s)
*/
static int calculate_sizes(struct kmem_cache *s, int forced_order)
{
- unsigned long flags = s->flags;
+ slab_flags_t flags = s->flags;
size_t size = s->object_size;
int order;
@@ -3593,7 +3572,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
return !!oo_objects(s->oo);
}
-static int kmem_cache_open(struct kmem_cache *s, unsigned long flags)
+static int kmem_cache_open(struct kmem_cache *s, slab_flags_t flags)
{
s->flags = kmem_cache_flags(s->size, flags, s->name, s->ctor);
s->reserved = 0;
@@ -3655,7 +3634,7 @@ error:
if (flags & SLAB_PANIC)
panic("Cannot create slab %s size=%lu realsize=%u order=%u offset=%u flags=%lx\n",
s->name, (unsigned long)s->size, s->size,
- oo_order(s->oo), s->offset, flags);
+ oo_order(s->oo), s->offset, (unsigned long)flags);
return -EINVAL;
}
@@ -3792,7 +3771,7 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node)
struct page *page;
void *ptr = NULL;
- flags |= __GFP_COMP | __GFP_NOTRACK;
+ flags |= __GFP_COMP;
page = alloc_pages_node(node, flags, get_order(size));
if (page)
ptr = page_address(page);
@@ -4245,7 +4224,7 @@ void __init kmem_cache_init_late(void)
struct kmem_cache *
__kmem_cache_alias(const char *name, size_t size, size_t align,
- unsigned long flags, void (*ctor)(void *))
+ slab_flags_t flags, void (*ctor)(void *))
{
struct kmem_cache *s, *c;
@@ -4275,7 +4254,7 @@ __kmem_cache_alias(const char *name, size_t size, size_t align,
return s;
}
-int __kmem_cache_create(struct kmem_cache *s, unsigned long flags)
+int __kmem_cache_create(struct kmem_cache *s, slab_flags_t flags)
{
int err;
@@ -5655,8 +5634,6 @@ static char *create_unique_id(struct kmem_cache *s)
*p++ = 'a';
if (s->flags & SLAB_CONSISTENCY_CHECKS)
*p++ = 'F';
- if (!(s->flags & SLAB_NOTRACK))
- *p++ = 't';
if (s->flags & SLAB_ACCOUNT)
*p++ = 'A';
if (p != name + 1)
@@ -5704,6 +5681,10 @@ static int sysfs_slab_add(struct kmem_cache *s)
return 0;
}
+ if (!unmergeable && disable_higher_order_debug &&
+ (slub_debug & DEBUG_METADATA_FLAGS))
+ unmergeable = 1;
+
if (unmergeable) {
/*
* Slabcache can never be merged so we can use the name proper.
@@ -5852,7 +5833,7 @@ __initcall(slab_sysfs_init);
/*
* The /proc/slabinfo ABI
*/
-#ifdef CONFIG_SLABINFO
+#ifdef CONFIG_SLUB_DEBUG
void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo)
{
unsigned long nr_slabs = 0;
@@ -5884,4 +5865,4 @@ ssize_t slabinfo_write(struct file *file, const char __user *buffer,
{
return -EIO;
}
-#endif /* CONFIG_SLABINFO */
+#endif /* CONFIG_SLUB_DEBUG */
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index 478ce6d4a2c4..17acf01791fa 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -42,7 +42,7 @@ static void * __ref __earlyonly_bootmem_alloc(int node,
unsigned long align,
unsigned long goal)
{
- return memblock_virt_alloc_try_nid(size, align, goal,
+ return memblock_virt_alloc_try_nid_raw(size, align, goal,
BOOTMEM_ALLOC_ACCESSIBLE, node);
}
@@ -53,13 +53,20 @@ void * __meminit vmemmap_alloc_block(unsigned long size, int node)
{
/* If the main allocator is up use that, fallback to bootmem. */
if (slab_is_available()) {
+ gfp_t gfp_mask = GFP_KERNEL|__GFP_RETRY_MAYFAIL|__GFP_NOWARN;
+ int order = get_order(size);
+ static bool warned;
struct page *page;
- page = alloc_pages_node(node,
- GFP_KERNEL | __GFP_ZERO | __GFP_RETRY_MAYFAIL,
- get_order(size));
+ page = alloc_pages_node(node, gfp_mask, order);
if (page)
return page_address(page);
+
+ if (!warned) {
+ warn_alloc(gfp_mask & ~__GFP_NOWARN, NULL,
+ "vmemmap alloc failure: order:%u", order);
+ warned = true;
+ }
return NULL;
} else
return __earlyonly_bootmem_alloc(node, size, size,
@@ -180,11 +187,22 @@ pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node)
return pte;
}
+static void * __meminit vmemmap_alloc_block_zero(unsigned long size, int node)
+{
+ void *p = vmemmap_alloc_block(size, node);
+
+ if (!p)
+ return NULL;
+ memset(p, 0, size);
+
+ return p;
+}
+
pmd_t * __meminit vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node)
{
pmd_t *pmd = pmd_offset(pud, addr);
if (pmd_none(*pmd)) {
- void *p = vmemmap_alloc_block(PAGE_SIZE, node);
+ void *p = vmemmap_alloc_block_zero(PAGE_SIZE, node);
if (!p)
return NULL;
pmd_populate_kernel(&init_mm, pmd, p);
@@ -196,7 +214,7 @@ pud_t * __meminit vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node)
{
pud_t *pud = pud_offset(p4d, addr);
if (pud_none(*pud)) {
- void *p = vmemmap_alloc_block(PAGE_SIZE, node);
+ void *p = vmemmap_alloc_block_zero(PAGE_SIZE, node);
if (!p)
return NULL;
pud_populate(&init_mm, pud, p);
@@ -208,7 +226,7 @@ p4d_t * __meminit vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node)
{
p4d_t *p4d = p4d_offset(pgd, addr);
if (p4d_none(*p4d)) {
- void *p = vmemmap_alloc_block(PAGE_SIZE, node);
+ void *p = vmemmap_alloc_block_zero(PAGE_SIZE, node);
if (!p)
return NULL;
p4d_populate(&init_mm, p4d, p);
@@ -220,7 +238,7 @@ pgd_t * __meminit vmemmap_pgd_populate(unsigned long addr, int node)
{
pgd_t *pgd = pgd_offset_k(addr);
if (pgd_none(*pgd)) {
- void *p = vmemmap_alloc_block(PAGE_SIZE, node);
+ void *p = vmemmap_alloc_block_zero(PAGE_SIZE, node);
if (!p)
return NULL;
pgd_populate(&init_mm, pgd, p);
diff --git a/mm/sparse.c b/mm/sparse.c
index 044138852baf..2609aba121e8 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -207,6 +207,16 @@ void __init memory_present(int nid, unsigned long start, unsigned long end)
{
unsigned long pfn;
+#ifdef CONFIG_SPARSEMEM_EXTREME
+ if (unlikely(!mem_section)) {
+ unsigned long size, align;
+
+ size = sizeof(struct mem_section*) * NR_SECTION_ROOTS;
+ align = 1 << (INTERNODE_CACHE_SHIFT);
+ mem_section = memblock_virt_alloc(size, align);
+ }
+#endif
+
start &= PAGE_SECTION_MASK;
mminit_validate_memmodel_limits(&start, &end);
for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) {
@@ -443,9 +453,9 @@ void __init sparse_mem_maps_populate_node(struct page **map_map,
}
size = PAGE_ALIGN(size);
- map = memblock_virt_alloc_try_nid(size * map_count,
- PAGE_SIZE, __pa(MAX_DMA_ADDRESS),
- BOOTMEM_ALLOC_ACCESSIBLE, nodeid);
+ map = memblock_virt_alloc_try_nid_raw(size * map_count,
+ PAGE_SIZE, __pa(MAX_DMA_ADDRESS),
+ BOOTMEM_ALLOC_ACCESSIBLE, nodeid);
if (map) {
for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
if (!present_section_nr(pnum))
diff --git a/mm/swap.c b/mm/swap.c
index a77d68f2c1b6..38e1b6374a97 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -76,7 +76,7 @@ static void __page_cache_release(struct page *page)
static void __put_single_page(struct page *page)
{
__page_cache_release(page);
- free_hot_cold_page(page, false);
+ free_unref_page(page);
}
static void __put_compound_page(struct page *page)
@@ -210,7 +210,7 @@ static void pagevec_lru_move_fn(struct pagevec *pvec,
}
if (pgdat)
spin_unlock_irqrestore(&pgdat->lru_lock, flags);
- release_pages(pvec->pages, pvec->nr, pvec->cold);
+ release_pages(pvec->pages, pvec->nr);
pagevec_reinit(pvec);
}
@@ -740,7 +740,7 @@ void lru_add_drain_all(void)
* Decrement the reference count on all the pages in @pages. If it
* fell to zero, remove the page from the LRU and free it.
*/
-void release_pages(struct page **pages, int nr, bool cold)
+void release_pages(struct page **pages, int nr)
{
int i;
LIST_HEAD(pages_to_free);
@@ -817,7 +817,7 @@ void release_pages(struct page **pages, int nr, bool cold)
spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags);
mem_cgroup_uncharge_list(&pages_to_free);
- free_hot_cold_page_list(&pages_to_free, cold);
+ free_unref_page_list(&pages_to_free);
}
EXPORT_SYMBOL(release_pages);
@@ -833,8 +833,11 @@ EXPORT_SYMBOL(release_pages);
*/
void __pagevec_release(struct pagevec *pvec)
{
- lru_add_drain();
- release_pages(pvec->pages, pagevec_count(pvec), pvec->cold);
+ if (!pvec->percpu_pvec_drained) {
+ lru_add_drain();
+ pvec->percpu_pvec_drained = true;
+ }
+ release_pages(pvec->pages, pagevec_count(pvec));
pagevec_reinit(pvec);
}
EXPORT_SYMBOL(__pagevec_release);
@@ -986,15 +989,25 @@ unsigned pagevec_lookup_range(struct pagevec *pvec,
}
EXPORT_SYMBOL(pagevec_lookup_range);
-unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping,
- pgoff_t *index, int tag, unsigned nr_pages)
+unsigned pagevec_lookup_range_tag(struct pagevec *pvec,
+ struct address_space *mapping, pgoff_t *index, pgoff_t end,
+ int tag)
{
- pvec->nr = find_get_pages_tag(mapping, index, tag,
- nr_pages, pvec->pages);
+ pvec->nr = find_get_pages_range_tag(mapping, index, end, tag,
+ PAGEVEC_SIZE, pvec->pages);
return pagevec_count(pvec);
}
-EXPORT_SYMBOL(pagevec_lookup_tag);
+EXPORT_SYMBOL(pagevec_lookup_range_tag);
+unsigned pagevec_lookup_range_nr_tag(struct pagevec *pvec,
+ struct address_space *mapping, pgoff_t *index, pgoff_t end,
+ int tag, unsigned max_pages)
+{
+ pvec->nr = find_get_pages_range_tag(mapping, index, end, tag,
+ min_t(unsigned int, max_pages, PAGEVEC_SIZE), pvec->pages);
+ return pagevec_count(pvec);
+}
+EXPORT_SYMBOL(pagevec_lookup_range_nr_tag);
/*
* Perform any setup for the swap system
*/
diff --git a/mm/swap_slots.c b/mm/swap_slots.c
index d81cfc5a43d5..bebc19292018 100644
--- a/mm/swap_slots.c
+++ b/mm/swap_slots.c
@@ -149,6 +149,13 @@ static int alloc_swap_slot_cache(unsigned int cpu)
cache->nr = 0;
cache->cur = 0;
cache->n_ret = 0;
+ /*
+ * We initialized alloc_lock and free_lock earlier. We use
+ * !cache->slots or !cache->slots_ret to know if it is safe to acquire
+ * the corresponding lock and use the cache. Memory barrier below
+ * ensures the assumption.
+ */
+ mb();
cache->slots = slots;
slots = NULL;
cache->slots_ret = slots_ret;
@@ -275,7 +282,7 @@ int free_swap_slot(swp_entry_t entry)
struct swap_slots_cache *cache;
cache = raw_cpu_ptr(&swp_slots);
- if (use_swap_slot_cache && cache->slots_ret) {
+ if (likely(use_swap_slot_cache && cache->slots_ret)) {
spin_lock_irq(&cache->free_lock);
/* Swap slots cache may be deactivated before acquiring lock */
if (!use_swap_slot_cache || !cache->slots_ret) {
@@ -326,7 +333,7 @@ swp_entry_t get_swap_page(struct page *page)
*/
cache = raw_cpu_ptr(&swp_slots);
- if (check_cache_active()) {
+ if (likely(check_cache_active() && cache->slots)) {
mutex_lock(&cache->alloc_lock);
if (cache->slots) {
repeat:
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 326439428daf..39ae7cfad90f 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -36,9 +36,9 @@ static const struct address_space_operations swap_aops = {
#endif
};
-struct address_space *swapper_spaces[MAX_SWAPFILES];
-static unsigned int nr_swapper_spaces[MAX_SWAPFILES];
-bool swap_vma_readahead = true;
+struct address_space *swapper_spaces[MAX_SWAPFILES] __read_mostly;
+static unsigned int nr_swapper_spaces[MAX_SWAPFILES] __read_mostly;
+bool swap_vma_readahead __read_mostly = true;
#define SWAP_RA_WIN_SHIFT (PAGE_SHIFT / 2)
#define SWAP_RA_HITS_MASK ((1UL << SWAP_RA_WIN_SHIFT) - 1)
@@ -319,7 +319,7 @@ void free_pages_and_swap_cache(struct page **pages, int nr)
lru_add_drain();
for (i = 0; i < nr; i++)
free_swap_cache(pagep[i]);
- release_pages(pagep, nr, false);
+ release_pages(pagep, nr);
}
/*
@@ -559,6 +559,7 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
unsigned long offset = entry_offset;
unsigned long start_offset, end_offset;
unsigned long mask;
+ struct swap_info_struct *si = swp_swap_info(entry);
struct blk_plug plug;
bool do_poll = true, page_allocated;
@@ -572,6 +573,8 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
end_offset = offset | mask;
if (!start_offset) /* First page is swap header. */
start_offset++;
+ if (end_offset >= si->max)
+ end_offset = si->max - 1;
blk_start_plug(&plug);
for (offset = start_offset; offset <= end_offset ; offset++) {
diff --git a/mm/swapfile.c b/mm/swapfile.c
index e47a21e64764..3074b02eaa09 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1328,6 +1328,13 @@ int page_swapcount(struct page *page)
return count;
}
+int __swap_count(struct swap_info_struct *si, swp_entry_t entry)
+{
+ pgoff_t offset = swp_offset(entry);
+
+ return swap_count(si->swap_map[offset]);
+}
+
static int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry)
{
int count = 0;
@@ -3169,6 +3176,9 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
if (bdi_cap_stable_pages_required(inode_to_bdi(inode)))
p->flags |= SWP_STABLE_WRITES;
+ if (bdi_cap_synchronous_io(inode_to_bdi(inode)))
+ p->flags |= SWP_SYNCHRONOUS_IO;
+
if (p->bdev && blk_queue_nonrot(bdev_get_queue(p->bdev))) {
int cpu;
unsigned long ci, nr_cluster;
@@ -3452,10 +3462,15 @@ int swapcache_prepare(swp_entry_t entry)
return __swap_duplicate(entry, SWAP_HAS_CACHE);
}
+struct swap_info_struct *swp_swap_info(swp_entry_t entry)
+{
+ return swap_info[swp_type(entry)];
+}
+
struct swap_info_struct *page_swap_info(struct page *page)
{
- swp_entry_t swap = { .val = page_private(page) };
- return swap_info[swp_type(swap)];
+ swp_entry_t entry = { .val = page_private(page) };
+ return swp_swap_info(entry);
}
/*
@@ -3463,7 +3478,6 @@ struct swap_info_struct *page_swap_info(struct page *page)
*/
struct address_space *__page_file_mapping(struct page *page)
{
- VM_BUG_ON_PAGE(!PageSwapCache(page), page);
return page_swap_info(page)->swap_file->f_mapping;
}
EXPORT_SYMBOL_GPL(__page_file_mapping);
@@ -3471,7 +3485,6 @@ EXPORT_SYMBOL_GPL(__page_file_mapping);
pgoff_t __page_file_index(struct page *page)
{
swp_entry_t swap = { .val = page_private(page) };
- VM_BUG_ON_PAGE(!PageSwapCache(page), page);
return swp_offset(swap);
}
EXPORT_SYMBOL_GPL(__page_file_index);
diff --git a/mm/truncate.c b/mm/truncate.c
index 2330223841fb..e4b4cf0f4070 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -25,44 +25,85 @@
#include <linux/rmap.h>
#include "internal.h"
-static void clear_shadow_entry(struct address_space *mapping, pgoff_t index,
- void *entry)
+/*
+ * Regular page slots are stabilized by the page lock even without the tree
+ * itself locked. These unlocked entries need verification under the tree
+ * lock.
+ */
+static inline void __clear_shadow_entry(struct address_space *mapping,
+ pgoff_t index, void *entry)
{
struct radix_tree_node *node;
void **slot;
- spin_lock_irq(&mapping->tree_lock);
- /*
- * Regular page slots are stabilized by the page lock even
- * without the tree itself locked. These unlocked entries
- * need verification under the tree lock.
- */
if (!__radix_tree_lookup(&mapping->page_tree, index, &node, &slot))
- goto unlock;
+ return;
if (*slot != entry)
- goto unlock;
+ return;
__radix_tree_replace(&mapping->page_tree, node, slot, NULL,
- workingset_update_node, mapping);
+ workingset_update_node);
mapping->nrexceptional--;
-unlock:
+}
+
+static void clear_shadow_entry(struct address_space *mapping, pgoff_t index,
+ void *entry)
+{
+ spin_lock_irq(&mapping->tree_lock);
+ __clear_shadow_entry(mapping, index, entry);
spin_unlock_irq(&mapping->tree_lock);
}
/*
- * Unconditionally remove exceptional entry. Usually called from truncate path.
+ * Unconditionally remove exceptional entries. Usually called from truncate
+ * path. Note that the pagevec may be altered by this function by removing
+ * exceptional entries similar to what pagevec_remove_exceptionals does.
*/
-static void truncate_exceptional_entry(struct address_space *mapping,
- pgoff_t index, void *entry)
+static void truncate_exceptional_pvec_entries(struct address_space *mapping,
+ struct pagevec *pvec, pgoff_t *indices,
+ pgoff_t end)
{
+ int i, j;
+ bool dax, lock;
+
/* Handled by shmem itself */
if (shmem_mapping(mapping))
return;
- if (dax_mapping(mapping)) {
- dax_delete_mapping_entry(mapping, index);
+ for (j = 0; j < pagevec_count(pvec); j++)
+ if (radix_tree_exceptional_entry(pvec->pages[j]))
+ break;
+
+ if (j == pagevec_count(pvec))
return;
+
+ dax = dax_mapping(mapping);
+ lock = !dax && indices[j] < end;
+ if (lock)
+ spin_lock_irq(&mapping->tree_lock);
+
+ for (i = j; i < pagevec_count(pvec); i++) {
+ struct page *page = pvec->pages[i];
+ pgoff_t index = indices[i];
+
+ if (!radix_tree_exceptional_entry(page)) {
+ pvec->pages[j++] = page;
+ continue;
+ }
+
+ if (index >= end)
+ continue;
+
+ if (unlikely(dax)) {
+ dax_delete_mapping_entry(mapping, index);
+ continue;
+ }
+
+ __clear_shadow_entry(mapping, index, page);
}
- clear_shadow_entry(mapping, index, entry);
+
+ if (lock)
+ spin_unlock_irq(&mapping->tree_lock);
+ pvec->nr = j;
}
/*
@@ -134,11 +175,17 @@ void do_invalidatepage(struct page *page, unsigned int offset,
* its lock, b) when a concurrent invalidate_mapping_pages got there first and
* c) when tmpfs swizzles a page between a tmpfs inode and swapper_space.
*/
-static int
-truncate_complete_page(struct address_space *mapping, struct page *page)
+static void
+truncate_cleanup_page(struct address_space *mapping, struct page *page)
{
- if (page->mapping != mapping)
- return -EIO;
+ if (page_mapped(page)) {
+ loff_t holelen;
+
+ holelen = PageTransHuge(page) ? HPAGE_PMD_SIZE : PAGE_SIZE;
+ unmap_mapping_range(mapping,
+ (loff_t)page->index << PAGE_SHIFT,
+ holelen, 0);
+ }
if (page_has_private(page))
do_invalidatepage(page, 0, PAGE_SIZE);
@@ -150,8 +197,6 @@ truncate_complete_page(struct address_space *mapping, struct page *page)
*/
cancel_dirty_page(page);
ClearPageMappedToDisk(page);
- delete_from_page_cache(page);
- return 0;
}
/*
@@ -180,16 +225,14 @@ invalidate_complete_page(struct address_space *mapping, struct page *page)
int truncate_inode_page(struct address_space *mapping, struct page *page)
{
- loff_t holelen;
VM_BUG_ON_PAGE(PageTail(page), page);
- holelen = PageTransHuge(page) ? HPAGE_PMD_SIZE : PAGE_SIZE;
- if (page_mapped(page)) {
- unmap_mapping_range(mapping,
- (loff_t)page->index << PAGE_SHIFT,
- holelen, 0);
- }
- return truncate_complete_page(mapping, page);
+ if (page->mapping != mapping)
+ return -EIO;
+
+ truncate_cleanup_page(mapping, page);
+ delete_from_page_cache(page);
+ return 0;
}
/*
@@ -287,11 +330,19 @@ void truncate_inode_pages_range(struct address_space *mapping,
else
end = (lend + 1) >> PAGE_SHIFT;
- pagevec_init(&pvec, 0);
+ pagevec_init(&pvec);
index = start;
while (index < end && pagevec_lookup_entries(&pvec, mapping, index,
min(end - index, (pgoff_t)PAGEVEC_SIZE),
indices)) {
+ /*
+ * Pagevec array has exceptional entries and we may also fail
+ * to lock some pages. So we store pages that can be deleted
+ * in a new pagevec.
+ */
+ struct pagevec locked_pvec;
+
+ pagevec_init(&locked_pvec);
for (i = 0; i < pagevec_count(&pvec); i++) {
struct page *page = pvec.pages[i];
@@ -300,11 +351,8 @@ void truncate_inode_pages_range(struct address_space *mapping,
if (index >= end)
break;
- if (radix_tree_exceptional_entry(page)) {
- truncate_exceptional_entry(mapping, index,
- page);
+ if (radix_tree_exceptional_entry(page))
continue;
- }
if (!trylock_page(page))
continue;
@@ -313,15 +361,22 @@ void truncate_inode_pages_range(struct address_space *mapping,
unlock_page(page);
continue;
}
- truncate_inode_page(mapping, page);
- unlock_page(page);
+ if (page->mapping != mapping) {
+ unlock_page(page);
+ continue;
+ }
+ pagevec_add(&locked_pvec, page);
}
- pagevec_remove_exceptionals(&pvec);
+ for (i = 0; i < pagevec_count(&locked_pvec); i++)
+ truncate_cleanup_page(mapping, locked_pvec.pages[i]);
+ delete_from_page_cache_batch(mapping, &locked_pvec);
+ for (i = 0; i < pagevec_count(&locked_pvec); i++)
+ unlock_page(locked_pvec.pages[i]);
+ truncate_exceptional_pvec_entries(mapping, &pvec, indices, end);
pagevec_release(&pvec);
cond_resched();
index++;
}
-
if (partial_start) {
struct page *page = find_lock_page(mapping, start - 1);
if (page) {
@@ -379,6 +434,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
pagevec_release(&pvec);
break;
}
+
for (i = 0; i < pagevec_count(&pvec); i++) {
struct page *page = pvec.pages[i];
@@ -390,11 +446,8 @@ void truncate_inode_pages_range(struct address_space *mapping,
break;
}
- if (radix_tree_exceptional_entry(page)) {
- truncate_exceptional_entry(mapping, index,
- page);
+ if (radix_tree_exceptional_entry(page))
continue;
- }
lock_page(page);
WARN_ON(page_to_index(page) != index);
@@ -402,7 +455,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
truncate_inode_page(mapping, page);
unlock_page(page);
}
- pagevec_remove_exceptionals(&pvec);
+ truncate_exceptional_pvec_entries(mapping, &pvec, indices, end);
pagevec_release(&pvec);
index++;
}
@@ -500,7 +553,7 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
unsigned long count = 0;
int i;
- pagevec_init(&pvec, 0);
+ pagevec_init(&pvec);
while (index <= end && pagevec_lookup_entries(&pvec, mapping, index,
min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1,
indices)) {
@@ -630,7 +683,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
if (mapping->nrpages == 0 && mapping->nrexceptional == 0)
goto out;
- pagevec_init(&pvec, 0);
+ pagevec_init(&pvec);
index = start;
while (index <= end && pagevec_lookup_entries(&pvec, mapping, index,
min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1,
diff --git a/mm/vmscan.c b/mm/vmscan.c
index eb2f0315b8c0..47d5ced51f2d 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -297,10 +297,13 @@ EXPORT_SYMBOL(register_shrinker);
*/
void unregister_shrinker(struct shrinker *shrinker)
{
+ if (!shrinker->nr_deferred)
+ return;
down_write(&shrinker_rwsem);
list_del(&shrinker->list);
up_write(&shrinker_rwsem);
kfree(shrinker->nr_deferred);
+ shrinker->nr_deferred = NULL;
}
EXPORT_SYMBOL(unregister_shrinker);
@@ -1349,7 +1352,7 @@ keep:
mem_cgroup_uncharge_list(&free_pages);
try_to_unmap_flush();
- free_hot_cold_page_list(&free_pages, true);
+ free_unref_page_list(&free_pages);
list_splice(&ret_pages, page_list);
count_vm_events(PGACTIVATE, pgactivate);
@@ -1824,7 +1827,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
spin_unlock_irq(&pgdat->lru_lock);
mem_cgroup_uncharge_list(&page_list);
- free_hot_cold_page_list(&page_list, true);
+ free_unref_page_list(&page_list);
/*
* If reclaim is isolating dirty pages under writeback, it implies
@@ -1868,7 +1871,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
* also allow kswapd to start writing pages during reclaim.
*/
if (stat.nr_unqueued_dirty == nr_taken) {
- wakeup_flusher_threads(0, WB_REASON_VMSCAN);
+ wakeup_flusher_threads(WB_REASON_VMSCAN);
set_bit(PGDAT_DIRTY, &pgdat->flags);
}
@@ -2063,7 +2066,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
spin_unlock_irq(&pgdat->lru_lock);
mem_cgroup_uncharge_list(&l_hold);
- free_hot_cold_page_list(&l_hold, true);
+ free_unref_page_list(&l_hold);
trace_mm_vmscan_lru_shrink_active(pgdat->node_id, nr_taken, nr_activate,
nr_deactivate, nr_rotated, sc->priority, file);
}
@@ -2082,7 +2085,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
* If that fails and refaulting is observed, the inactive list grows.
*
* The inactive_ratio is the target ratio of ACTIVE to INACTIVE pages
- * on this LRU, maintained by the pageout code. A zone->inactive_ratio
+ * on this LRU, maintained by the pageout code. An inactive_ratio
* of 3 means 3:1 or 25% of the pages are kept on the inactive list.
*
* total target max
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 4bb13e72ac97..40b2db6db6b1 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -32,6 +32,77 @@
#define NUMA_STATS_THRESHOLD (U16_MAX - 2)
+#ifdef CONFIG_NUMA
+int sysctl_vm_numa_stat = ENABLE_NUMA_STAT;
+
+/* zero numa counters within a zone */
+static void zero_zone_numa_counters(struct zone *zone)
+{
+ int item, cpu;
+
+ for (item = 0; item < NR_VM_NUMA_STAT_ITEMS; item++) {
+ atomic_long_set(&zone->vm_numa_stat[item], 0);
+ for_each_online_cpu(cpu)
+ per_cpu_ptr(zone->pageset, cpu)->vm_numa_stat_diff[item]
+ = 0;
+ }
+}
+
+/* zero numa counters of all the populated zones */
+static void zero_zones_numa_counters(void)
+{
+ struct zone *zone;
+
+ for_each_populated_zone(zone)
+ zero_zone_numa_counters(zone);
+}
+
+/* zero global numa counters */
+static void zero_global_numa_counters(void)
+{
+ int item;
+
+ for (item = 0; item < NR_VM_NUMA_STAT_ITEMS; item++)
+ atomic_long_set(&vm_numa_stat[item], 0);
+}
+
+static void invalid_numa_statistics(void)
+{
+ zero_zones_numa_counters();
+ zero_global_numa_counters();
+}
+
+static DEFINE_MUTEX(vm_numa_stat_lock);
+
+int sysctl_vm_numa_stat_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *length, loff_t *ppos)
+{
+ int ret, oldval;
+
+ mutex_lock(&vm_numa_stat_lock);
+ if (write)
+ oldval = sysctl_vm_numa_stat;
+ ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
+ if (ret || !write)
+ goto out;
+
+ if (oldval == sysctl_vm_numa_stat)
+ goto out;
+ else if (sysctl_vm_numa_stat == ENABLE_NUMA_STAT) {
+ static_branch_enable(&vm_numa_stat_key);
+ pr_info("enable numa statistics\n");
+ } else {
+ static_branch_disable(&vm_numa_stat_key);
+ invalid_numa_statistics();
+ pr_info("disable numa statistics, and clear numa counters\n");
+ }
+
+out:
+ mutex_unlock(&vm_numa_stat_lock);
+ return ret;
+}
+#endif
+
#ifdef CONFIG_VM_EVENT_COUNTERS
DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}};
EXPORT_PER_CPU_SYMBOL(vm_event_states);
@@ -1564,11 +1635,9 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
}
seq_printf(m,
"\n node_unreclaimable: %u"
- "\n start_pfn: %lu"
- "\n node_inactive_ratio: %u",
+ "\n start_pfn: %lu",
pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES,
- zone->zone_start_pfn,
- zone->zone_pgdat->inactive_ratio);
+ zone->zone_start_pfn);
seq_putc(m, '\n');
}
diff --git a/mm/workingset.c b/mm/workingset.c
index b997c9de28f6..b7d616a3bbbe 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -340,14 +340,8 @@ out:
static struct list_lru shadow_nodes;
-void workingset_update_node(struct radix_tree_node *node, void *private)
+void workingset_update_node(struct radix_tree_node *node)
{
- struct address_space *mapping = private;
-
- /* Only regular page cache has shadow entries */
- if (dax_mapping(mapping) || shmem_mapping(mapping))
- return;
-
/*
* Track non-empty nodes that contain only shadow entries;
* unlink those that contain pages or are being freed.
@@ -475,7 +469,7 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
goto out_invalid;
inc_lruvec_page_state(virt_to_page(node), WORKINGSET_NODERECLAIM);
__radix_tree_delete_node(&mapping->page_tree, node,
- workingset_update_node, mapping);
+ workingset_lookup_update(mapping));
out_invalid:
spin_unlock(&mapping->tree_lock);
diff --git a/mm/z3fold.c b/mm/z3fold.c
index b2ba2ba585f3..39e19125d6a0 100644
--- a/mm/z3fold.c
+++ b/mm/z3fold.c
@@ -404,8 +404,7 @@ static void do_compact_page(struct z3fold_header *zhdr, bool locked)
WARN_ON(z3fold_page_trylock(zhdr));
else
z3fold_page_lock(zhdr);
- if (test_bit(PAGE_STALE, &page->private) ||
- !test_and_clear_bit(NEEDS_COMPACTING, &page->private)) {
+ if (WARN_ON(!test_and_clear_bit(NEEDS_COMPACTING, &page->private))) {
z3fold_page_unlock(zhdr);
return;
}
@@ -413,6 +412,11 @@ static void do_compact_page(struct z3fold_header *zhdr, bool locked)
list_del_init(&zhdr->buddy);
spin_unlock(&pool->lock);
+ if (kref_put(&zhdr->refcount, release_z3fold_page_locked)) {
+ atomic64_dec(&pool->pages_nr);
+ return;
+ }
+
z3fold_compact_page(zhdr);
unbuddied = get_cpu_ptr(pool->unbuddied);
fchunks = num_free_chunks(zhdr);
@@ -753,9 +757,11 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
list_del_init(&zhdr->buddy);
spin_unlock(&pool->lock);
zhdr->cpu = -1;
+ kref_get(&zhdr->refcount);
do_compact_page(zhdr, true);
return;
}
+ kref_get(&zhdr->refcount);
queue_work_on(zhdr->cpu, pool->compact_wq, &zhdr->work);
z3fold_page_unlock(zhdr);
}
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 7c38e850a8fc..683c0651098c 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -53,6 +53,7 @@
#include <linux/mount.h>
#include <linux/migrate.h>
#include <linux/pagemap.h>
+#include <linux/fs.h>
#define ZSPAGE_MAGIC 0x58
@@ -1349,7 +1350,7 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
* pools/users, we can't allow mapping in interrupt context
* because it can corrupt another users mappings.
*/
- WARN_ON_ONCE(in_interrupt());
+ BUG_ON(in_interrupt());
/* From now on, migration cannot move the object */
pin_tag(handle);