summaryrefslogtreecommitdiffstats
path: root/include
diff options
context:
space:
mode:
author Linus Torvalds <torvalds@linux-foundation.org> 2020-06-03 20:24:15 -0700
committer Linus Torvalds <torvalds@linux-foundation.org> 2020-06-03 20:24:15 -0700
commit ee01c4d72adffb7d424535adf630f2955748fa8b (patch)
tree 9ea9f40473e105e936e7477ab7dc7248d899af21 /include
parent c444eb564fb16645c172d550359cb3d75fe8a040 (diff)
parent 09587a09ada2ed7c39aedfa2681152b5ac5641ee (diff)
download linux-ee01c4d72adffb7d424535adf630f2955748fa8b.tar.bz2
Merge branch 'akpm' (patches from Andrew)
Merge more updates from Andrew Morton:
 "More mm/ work, plenty more to come

  Subsystems affected by this patch series: slub, memcg, gup, kasan,
  pagealloc, hugetlb, vmscan, tools, mempolicy, memblock, hugetlbfs,
  thp, mmap, kconfig"

* akpm: (131 commits)
  arm64: mm: use ARCH_HAS_DEBUG_WX instead of arch defined
  x86: mm: use ARCH_HAS_DEBUG_WX instead of arch defined
  riscv: support DEBUG_WX
  mm: add DEBUG_WX support
  drivers/base/memory.c: cache memory blocks in xarray to accelerate lookup
  mm/thp: rename pmd_mknotpresent() as pmd_mkinvalid()
  powerpc/mm: drop platform defined pmd_mknotpresent()
  mm: thp: don't need to drain lru cache when splitting and mlocking THP
  hugetlbfs: get unmapped area below TASK_UNMAPPED_BASE for hugetlbfs
  sparc32: register memory occupied by kernel as memblock.memory
  include/linux/memblock.h: fix minor typo and unclear comment
  mm, mempolicy: fix up gup usage in lookup_node
  tools/vm/page_owner_sort.c: filter out unneeded line
  mm: swap: memcg: fix memcg stats for huge pages
  mm: swap: fix vmstats for huge pages
  mm: vmscan: limit the range of LRU type balancing
  mm: vmscan: reclaim writepage is IO cost
  mm: vmscan: determine anon/file pressure balance at the reclaim root
  mm: balance LRU lists based on relative thrashing
  mm: only count actual rotations as LRU reclaim cost
  ...
Diffstat (limited to 'include')
-rw-r--r-- include/asm-generic/hugetlb.h 2
-rw-r--r-- include/linux/compaction.h 9
-rw-r--r-- include/linux/gfp.h 7
-rw-r--r-- include/linux/hugetlb.h 16
-rw-r--r-- include/linux/memblock.h 15
-rw-r--r-- include/linux/memcontrol.h 66
-rw-r--r-- include/linux/mm.h 48
-rw-r--r-- include/linux/mmzone.h 46
-rw-r--r-- include/linux/padata.h 43
-rw-r--r-- include/linux/string.h 60
-rw-r--r-- include/linux/swap.h 11
-rw-r--r-- include/linux/vm_event_item.h 4
-rw-r--r-- include/linux/vmstat.h 2
-rw-r--r-- include/trace/events/compaction.h 22
-rw-r--r-- include/trace/events/huge_memory.h 3
-rw-r--r-- include/trace/events/vmscan.h 14
16 files changed, 209 insertions, 159 deletions
diff --git a/include/asm-generic/hugetlb.h b/include/asm-generic/hugetlb.h
index 822f433ac95c..40f85decc2ee 100644
--- a/include/asm-generic/hugetlb.h
+++ b/include/asm-generic/hugetlb.h
@@ -122,7 +122,7 @@ static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma,
#ifndef __HAVE_ARCH_HUGE_PTEP_GET
static inline pte_t huge_ptep_get(pte_t *ptep)
{
- return *ptep;
+ return READ_ONCE(*ptep);
}
#endif
diff --git a/include/linux/compaction.h b/include/linux/compaction.h
index a0eabfbeb0e1..6fa0eea3f530 100644
--- a/include/linux/compaction.h
+++ b/include/linux/compaction.h
@@ -97,7 +97,7 @@ extern enum compact_result try_to_compact_pages(gfp_t gfp_mask,
struct page **page);
extern void reset_isolation_suitable(pg_data_t *pgdat);
extern enum compact_result compaction_suitable(struct zone *zone, int order,
- unsigned int alloc_flags, int classzone_idx);
+ unsigned int alloc_flags, int highest_zoneidx);
extern void defer_compaction(struct zone *zone, int order);
extern bool compaction_deferred(struct zone *zone, int order);
@@ -182,7 +182,7 @@ bool compaction_zonelist_suitable(struct alloc_context *ac, int order,
extern int kcompactd_run(int nid);
extern void kcompactd_stop(int nid);
-extern void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx);
+extern void wakeup_kcompactd(pg_data_t *pgdat, int order, int highest_zoneidx);
#else
static inline void reset_isolation_suitable(pg_data_t *pgdat)
@@ -190,7 +190,7 @@ static inline void reset_isolation_suitable(pg_data_t *pgdat)
}
static inline enum compact_result compaction_suitable(struct zone *zone, int order,
- int alloc_flags, int classzone_idx)
+ int alloc_flags, int highest_zoneidx)
{
return COMPACT_SKIPPED;
}
@@ -232,7 +232,8 @@ static inline void kcompactd_stop(int nid)
{
}
-static inline void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx)
+static inline void wakeup_kcompactd(pg_data_t *pgdat,
+ int order, int highest_zoneidx)
{
}
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 4aba4c86c626..67a0774e080b 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -110,6 +110,11 @@ struct vm_area_struct;
* the caller guarantees the allocation will allow more memory to be freed
* very shortly e.g. process exiting or swapping. Users either should
* be the MM or co-ordinating closely with the VM (e.g. swap over NFS).
+ * Users of this flag have to be extremely careful to not deplete the reserve
+ * completely and implement a throttling mechanism which controls the
+ * consumption of the reserve based on the amount of freed memory.
+ * Usage of a pre-allocated pool (e.g. mempool) should be always considered
+ * before using this flag.
*
* %__GFP_NOMEMALLOC is used to explicitly forbid access to emergency reserves.
* This takes precedence over the %__GFP_MEMALLOC flag if both are set.
@@ -307,7 +312,7 @@ struct vm_area_struct;
#define GFP_MOVABLE_MASK (__GFP_RECLAIMABLE|__GFP_MOVABLE)
#define GFP_MOVABLE_SHIFT 3
-static inline int gfpflags_to_migratetype(const gfp_t gfp_flags)
+static inline int gfp_migratetype(const gfp_t gfp_flags)
{
VM_WARN_ON((gfp_flags & GFP_MOVABLE_MASK) == GFP_MOVABLE_MASK);
BUILD_BUG_ON((1UL << GFP_MOVABLE_SHIFT) != ___GFP_MOVABLE);
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 92c21c5ccc58..0cced410e0bd 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -518,8 +518,8 @@ int huge_add_to_page_cache(struct page *page, struct address_space *mapping,
int __init __alloc_bootmem_huge_page(struct hstate *h);
int __init alloc_bootmem_huge_page(struct hstate *h);
-void __init hugetlb_bad_size(void);
void __init hugetlb_add_hstate(unsigned order);
+bool __init arch_hugetlb_valid_size(unsigned long size);
struct hstate *size_to_hstate(unsigned long size);
#ifndef HUGE_MAX_HSTATE
@@ -590,6 +590,20 @@ static inline unsigned int blocks_per_huge_page(struct hstate *h)
#include <asm/hugetlb.h>
+#ifndef is_hugepage_only_range
+static inline int is_hugepage_only_range(struct mm_struct *mm,
+ unsigned long addr, unsigned long len)
+{
+ return 0;
+}
+#define is_hugepage_only_range is_hugepage_only_range
+#endif
+
+#ifndef arch_clear_hugepage_flags
+static inline void arch_clear_hugepage_flags(struct page *page) { }
+#define arch_clear_hugepage_flags arch_clear_hugepage_flags
+#endif
+
#ifndef arch_make_huge_pte
static inline pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma,
struct page *page, int writable)
diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 6bc37a731d27..017fae833d4a 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -41,7 +41,7 @@ enum memblock_flags {
/**
* struct memblock_region - represents a memory region
- * @base: physical address of the region
+ * @base: base address of the region
* @size: size of the region
* @flags: memory region attributes
* @nid: NUMA node id
@@ -50,7 +50,7 @@ struct memblock_region {
phys_addr_t base;
phys_addr_t size;
enum memblock_flags flags;
-#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
+#ifdef CONFIG_NEED_MULTIPLE_NODES
int nid;
#endif
};
@@ -75,7 +75,7 @@ struct memblock_type {
* struct memblock - memblock allocator metadata
* @bottom_up: is bottom up direction?
* @current_limit: physical address of the current allocation limit
- * @memory: usabe memory regions
+ * @memory: usable memory regions
* @reserved: reserved memory regions
* @physmem: all physical memory
*/
@@ -215,7 +215,6 @@ static inline bool memblock_is_nomap(struct memblock_region *m)
return m->flags & MEMBLOCK_NOMAP;
}
-#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
int memblock_search_pfn_nid(unsigned long pfn, unsigned long *start_pfn,
unsigned long *end_pfn);
void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn,
@@ -234,7 +233,6 @@ void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn,
#define for_each_mem_pfn_range(i, nid, p_start, p_end, p_nid) \
for (i = -1, __next_mem_pfn_range(&i, nid, p_start, p_end, p_nid); \
i >= 0; __next_mem_pfn_range(&i, nid, p_start, p_end, p_nid))
-#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
void __next_mem_pfn_range_in_zone(u64 *idx, struct zone *zone,
@@ -275,6 +273,9 @@ void __next_mem_pfn_range_in_zone(u64 *idx, struct zone *zone,
#define for_each_free_mem_pfn_range_in_zone_from(i, zone, p_start, p_end) \
for (; i != U64_MAX; \
__next_mem_pfn_range_in_zone(&i, zone, p_start, p_end))
+
+int __init deferred_page_init_max_threads(const struct cpumask *node_cpumask);
+
#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
/**
@@ -310,10 +311,10 @@ void __next_mem_pfn_range_in_zone(u64 *idx, struct zone *zone,
for_each_mem_range_rev(i, &memblock.memory, &memblock.reserved, \
nid, flags, p_start, p_end, p_nid)
-#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
int memblock_set_node(phys_addr_t base, phys_addr_t size,
struct memblock_type *type, int nid);
+#ifdef CONFIG_NEED_MULTIPLE_NODES
static inline void memblock_set_region_node(struct memblock_region *r, int nid)
{
r->nid = nid;
@@ -332,7 +333,7 @@ static inline int memblock_get_region_node(const struct memblock_region *r)
{
return 0;
}
-#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
+#endif /* CONFIG_NEED_MULTIPLE_NODES */
/* Flags for memblock allocation APIs */
#define MEMBLOCK_ALLOC_ANYWHERE (~(phys_addr_t)0)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index bfe9533bb67e..e77197a62809 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -29,10 +29,7 @@ struct kmem_cache;
/* Cgroup-specific page state, on top of universal node page state */
enum memcg_stat_item {
- MEMCG_CACHE = NR_VM_NODE_STAT_ITEMS,
- MEMCG_RSS,
- MEMCG_RSS_HUGE,
- MEMCG_SWAP,
+ MEMCG_SWAP = NR_VM_NODE_STAT_ITEMS,
MEMCG_SOCK,
/* XXX: why are these zone and not node counters? */
MEMCG_KERNEL_STACK_KB,
@@ -358,16 +355,8 @@ static inline unsigned long mem_cgroup_protection(struct mem_cgroup *memcg,
enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root,
struct mem_cgroup *memcg);
-int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
- gfp_t gfp_mask, struct mem_cgroup **memcgp,
- bool compound);
-int mem_cgroup_try_charge_delay(struct page *page, struct mm_struct *mm,
- gfp_t gfp_mask, struct mem_cgroup **memcgp,
- bool compound);
-void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
- bool lrucare, bool compound);
-void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg,
- bool compound);
+int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask);
+
void mem_cgroup_uncharge(struct page *page);
void mem_cgroup_uncharge_list(struct list_head *page_list);
@@ -568,7 +557,7 @@ struct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim,
void mem_cgroup_print_oom_group(struct mem_cgroup *memcg);
#ifdef CONFIG_MEMCG_SWAP
-extern int do_swap_account;
+extern bool cgroup_memory_noswap;
#endif
struct mem_cgroup *lock_page_memcg(struct page *page);
@@ -708,16 +697,17 @@ static inline void mod_lruvec_state(struct lruvec *lruvec,
static inline void __mod_lruvec_page_state(struct page *page,
enum node_stat_item idx, int val)
{
+ struct page *head = compound_head(page); /* rmap on tail pages */
pg_data_t *pgdat = page_pgdat(page);
struct lruvec *lruvec;
/* Untracked pages have no memcg, no lruvec. Update only the node */
- if (!page->mem_cgroup) {
+ if (!head->mem_cgroup) {
__mod_node_page_state(pgdat, idx, val);
return;
}
- lruvec = mem_cgroup_lruvec(page->mem_cgroup, pgdat);
+ lruvec = mem_cgroup_lruvec(head->mem_cgroup, pgdat);
__mod_lruvec_state(lruvec, idx, val);
}
@@ -847,37 +837,12 @@ static inline enum mem_cgroup_protection mem_cgroup_protected(
return MEMCG_PROT_NONE;
}
-static inline int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
- gfp_t gfp_mask,
- struct mem_cgroup **memcgp,
- bool compound)
-{
- *memcgp = NULL;
- return 0;
-}
-
-static inline int mem_cgroup_try_charge_delay(struct page *page,
- struct mm_struct *mm,
- gfp_t gfp_mask,
- struct mem_cgroup **memcgp,
- bool compound)
+static inline int mem_cgroup_charge(struct page *page, struct mm_struct *mm,
+ gfp_t gfp_mask)
{
- *memcgp = NULL;
return 0;
}
-static inline void mem_cgroup_commit_charge(struct page *page,
- struct mem_cgroup *memcg,
- bool lrucare, bool compound)
-{
-}
-
-static inline void mem_cgroup_cancel_charge(struct page *page,
- struct mem_cgroup *memcg,
- bool compound)
-{
-}
-
static inline void mem_cgroup_uncharge(struct page *page)
{
}
@@ -1277,6 +1242,19 @@ static inline void dec_lruvec_page_state(struct page *page,
mod_lruvec_page_state(page, idx, -1);
}
+static inline struct lruvec *parent_lruvec(struct lruvec *lruvec)
+{
+ struct mem_cgroup *memcg;
+
+ memcg = lruvec_memcg(lruvec);
+ if (!memcg)
+ return NULL;
+ memcg = parent_mem_cgroup(memcg);
+ if (!memcg)
+ return NULL;
+ return mem_cgroup_lruvec(memcg, lruvec_pgdat(lruvec));
+}
+
#ifdef CONFIG_CGROUP_WRITEBACK
struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 59adb47efc55..66e0977f970a 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -501,7 +501,6 @@ struct vm_fault {
pte_t orig_pte; /* Value of PTE at the time of fault */
struct page *cow_page; /* Page handler may use for COW fault */
- struct mem_cgroup *memcg; /* Cgroup cow_page belongs to */
struct page *page; /* ->fault handlers should return a
* page here, unless VM_FAULT_NOPAGE
* is set (which is also implied by
@@ -867,7 +866,7 @@ enum compound_dtor_id {
#endif
NR_COMPOUND_DTORS,
};
-extern compound_page_dtor * const compound_page_dtors[];
+extern compound_page_dtor * const compound_page_dtors[NR_COMPOUND_DTORS];
static inline void set_compound_page_dtor(struct page *page,
enum compound_dtor_id compound_dtor)
@@ -876,10 +875,10 @@ static inline void set_compound_page_dtor(struct page *page,
page[1].compound_dtor = compound_dtor;
}
-static inline compound_page_dtor *get_compound_page_dtor(struct page *page)
+static inline void destroy_compound_page(struct page *page)
{
VM_BUG_ON_PAGE(page[1].compound_dtor >= NR_COMPOUND_DTORS, page);
- return compound_page_dtors[page[1].compound_dtor];
+ compound_page_dtors[page[1].compound_dtor](page);
}
static inline unsigned int compound_order(struct page *page)
@@ -946,8 +945,7 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
return pte;
}
-vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg,
- struct page *page);
+vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct page *page);
vm_fault_t finish_fault(struct vm_fault *vmf);
vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf);
#endif
@@ -1827,6 +1825,8 @@ extern int mprotect_fixup(struct vm_area_struct *vma,
*/
int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
struct page **pages);
+int pin_user_pages_fast_only(unsigned long start, int nr_pages,
+ unsigned int gup_flags, struct page **pages);
/*
* per-process(per-mm_struct) statistics.
*/
@@ -2327,9 +2327,7 @@ static inline spinlock_t *pud_lock(struct mm_struct *mm, pud_t *pud)
}
extern void __init pagecache_init(void);
-extern void free_area_init(unsigned long * zones_size);
-extern void __init free_area_init_node(int nid, unsigned long * zones_size,
- unsigned long zone_start_pfn, unsigned long *zholes_size);
+extern void __init free_area_init_memoryless_node(int nid);
extern void free_initmem(void);
/*
@@ -2399,34 +2397,26 @@ static inline unsigned long get_num_physpages(void)
return phys_pages;
}
-#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
/*
- * With CONFIG_HAVE_MEMBLOCK_NODE_MAP set, an architecture may initialise its
- * zones, allocate the backing mem_map and account for memory holes in a more
- * architecture independent manner. This is a substitute for creating the
- * zone_sizes[] and zholes_size[] arrays and passing them to
- * free_area_init_node()
+ * Using memblock node mappings, an architecture may initialise its
+ * zones, allocate the backing mem_map and account for memory holes in an
+ * architecture independent manner.
*
* An architecture is expected to register range of page frames backed by
* physical memory with memblock_add[_node]() before calling
- * free_area_init_nodes() passing in the PFN each zone ends at. At a basic
+ * free_area_init() passing in the PFN each zone ends at. At a basic
* usage, an architecture is expected to do something like
*
* unsigned long max_zone_pfns[MAX_NR_ZONES] = {max_dma, max_normal_pfn,
* max_highmem_pfn};
* for_each_valid_physical_page_range()
* memblock_add_node(base, size, nid)
- * free_area_init_nodes(max_zone_pfns);
+ * free_area_init(max_zone_pfns);
*
- * free_bootmem_with_active_regions() calls free_bootmem_node() for each
- * registered physical page range. Similarly
* sparse_memory_present_with_active_regions() calls memory_present() for
* each range when SPARSEMEM is enabled.
- *
- * See mm/page_alloc.c for more information on each function exposed by
- * CONFIG_HAVE_MEMBLOCK_NODE_MAP.
*/
-extern void free_area_init_nodes(unsigned long *max_zone_pfn);
+void free_area_init(unsigned long *max_zone_pfn);
unsigned long node_map_pfn_alignment(void);
unsigned long __absent_pages_in_range(int nid, unsigned long start_pfn,
unsigned long end_pfn);
@@ -2435,16 +2425,10 @@ extern unsigned long absent_pages_in_range(unsigned long start_pfn,
extern void get_pfn_range_for_nid(unsigned int nid,
unsigned long *start_pfn, unsigned long *end_pfn);
extern unsigned long find_min_pfn_with_active_regions(void);
-extern void free_bootmem_with_active_regions(int nid,
- unsigned long max_low_pfn);
extern void sparse_memory_present_with_active_regions(int nid);
-#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
-
-#if !defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) && \
- !defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID)
-static inline int __early_pfn_to_nid(unsigned long pfn,
- struct mminit_pfnnid_cache *state)
+#ifndef CONFIG_NEED_MULTIPLE_NODES
+static inline int early_pfn_to_nid(unsigned long pfn)
{
return 0;
}
@@ -2480,6 +2464,7 @@ extern void setup_per_cpu_pageset(void);
extern int min_free_kbytes;
extern int watermark_boost_factor;
extern int watermark_scale_factor;
+extern bool arch_has_descending_max_zone_pfns(void);
/* nommu.c */
extern atomic_long_t mmap_pages_allocated;
@@ -2816,6 +2801,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
#define FOLL_LONGTERM 0x10000 /* mapping lifetime is indefinite: see below */
#define FOLL_SPLIT_PMD 0x20000 /* split huge pmd before returning */
#define FOLL_PIN 0x40000 /* pages must be released via unpin_user_page */
+#define FOLL_FAST_ONLY 0x80000 /* gup_fast: prevent fall-back to slow gup */
/*
* FOLL_PIN and FOLL_LONGTERM may be used in various combinations with each
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index f5b6ccf41141..df1f08486d81 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -242,19 +242,6 @@ static inline bool is_active_lru(enum lru_list lru)
return (lru == LRU_ACTIVE_ANON || lru == LRU_ACTIVE_FILE);
}
-struct zone_reclaim_stat {
- /*
- * The pageout code in vmscan.c keeps track of how many of the
- * mem/swap backed and file backed pages are referenced.
- * The higher the rotated/scanned ratio, the more valuable
- * that cache is.
- *
- * The anon LRU stats live in [0], file LRU stats in [1]
- */
- unsigned long recent_rotated[2];
- unsigned long recent_scanned[2];
-};
-
enum lruvec_flags {
LRUVEC_CONGESTED, /* lruvec has many dirty pages
* backed by a congested BDI
@@ -263,7 +250,13 @@ enum lruvec_flags {
struct lruvec {
struct list_head lists[NR_LRU_LISTS];
- struct zone_reclaim_stat reclaim_stat;
+ /*
+ * These track the cost of reclaiming one LRU - file or anon -
+ * over the other. As the observed cost of reclaiming one LRU
+ * increases, the reclaim scan balance tips toward the other.
+ */
+ unsigned long anon_cost;
+ unsigned long file_cost;
/* Evictions & activations on the inactive file list */
atomic_long_t inactive_age;
/* Refaults at the time of last reclaim cycle */
@@ -680,6 +673,8 @@ typedef struct pglist_data {
/*
* Must be held any time you expect node_start_pfn,
* node_present_pages, node_spanned_pages or nr_zones to stay constant.
+ * Also synchronizes pgdat->first_deferred_pfn during deferred page
+ * init.
*
* pgdat_resize_lock() and pgdat_resize_unlock() are provided to
* manipulate node_size_lock without checking for CONFIG_MEMORY_HOTPLUG
@@ -699,13 +694,13 @@ typedef struct pglist_data {
struct task_struct *kswapd; /* Protected by
mem_hotplug_begin/end() */
int kswapd_order;
- enum zone_type kswapd_classzone_idx;
+ enum zone_type kswapd_highest_zoneidx;
int kswapd_failures; /* Number of 'reclaimed == 0' runs */
#ifdef CONFIG_COMPACTION
int kcompactd_max_order;
- enum zone_type kcompactd_classzone_idx;
+ enum zone_type kcompactd_highest_zoneidx;
wait_queue_head_t kcompactd_wait;
struct task_struct *kcompactd;
#endif
@@ -783,15 +778,15 @@ static inline bool pgdat_is_empty(pg_data_t *pgdat)
void build_all_zonelists(pg_data_t *pgdat);
void wakeup_kswapd(struct zone *zone, gfp_t gfp_mask, int order,
- enum zone_type classzone_idx);
+ enum zone_type highest_zoneidx);
bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
- int classzone_idx, unsigned int alloc_flags,
+ int highest_zoneidx, unsigned int alloc_flags,
long free_pages);
bool zone_watermark_ok(struct zone *z, unsigned int order,
- unsigned long mark, int classzone_idx,
+ unsigned long mark, int highest_zoneidx,
unsigned int alloc_flags);
bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
- unsigned long mark, int classzone_idx);
+ unsigned long mark, int highest_zoneidx);
enum memmap_context {
MEMMAP_EARLY,
MEMMAP_HOTPLUG,
@@ -876,7 +871,7 @@ extern int movable_zone;
#ifdef CONFIG_HIGHMEM
static inline int zone_movable_is_highmem(void)
{
-#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
+#ifdef CONFIG_NEED_MULTIPLE_NODES
return movable_zone == ZONE_HIGHMEM;
#else
return (ZONE_MOVABLE - 1) == ZONE_HIGHMEM;
@@ -1079,15 +1074,6 @@ static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist,
#include <asm/sparsemem.h>
#endif
-#if !defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID) && \
- !defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP)
-static inline unsigned long early_pfn_to_nid(unsigned long pfn)
-{
- BUILD_BUG_ON(IS_ENABLED(CONFIG_NUMA));
- return 0;
-}
-#endif
-
#ifdef CONFIG_FLATMEM
#define pfn_to_nid(pfn) (0)
#endif
diff --git a/include/linux/padata.h b/include/linux/padata.h
index 693cae9bfe66..7302efff5e65 100644
--- a/include/linux/padata.h
+++ b/include/linux/padata.h
@@ -4,6 +4,9 @@
*
* Copyright (C) 2008, 2009 secunet Security Networks AG
* Copyright (C) 2008, 2009 Steffen Klassert <steffen.klassert@secunet.com>
+ *
+ * Copyright (c) 2020 Oracle and/or its affiliates.
+ * Author: Daniel Jordan <daniel.m.jordan@oracle.com>
*/
#ifndef PADATA_H
@@ -24,7 +27,6 @@
* @list: List entry, to attach to the padata lists.
* @pd: Pointer to the internal control structure.
* @cb_cpu: Callback cpu for serialization.
- * @cpu: Cpu for parallelization.
* @seq_nr: Sequence number of the parallelized data object.
* @info: Used to pass information from the parallel to the serial function.
* @parallel: Parallel execution function.
@@ -34,7 +36,6 @@ struct padata_priv {
struct list_head list;
struct parallel_data *pd;
int cb_cpu;
- int cpu;
unsigned int seq_nr;
int info;
void (*parallel)(struct padata_priv *padata);
@@ -68,15 +69,11 @@ struct padata_serial_queue {
/**
* struct padata_parallel_queue - The percpu padata parallel queue
*
- * @parallel: List to wait for parallelization.
* @reorder: List to wait for reordering after parallel processing.
- * @work: work struct for parallelization.
* @num_obj: Number of objects that are processed by this cpu.
*/
struct padata_parallel_queue {
- struct padata_list parallel;
struct padata_list reorder;
- struct work_struct work;
atomic_t num_obj;
};
@@ -111,7 +108,7 @@ struct parallel_data {
struct padata_parallel_queue __percpu *pqueue;
struct padata_serial_queue __percpu *squeue;
atomic_t refcnt;
- atomic_t seq_nr;
+ unsigned int seq_nr;
unsigned int processed;
int cpu;
struct padata_cpumask cpumask;
@@ -137,6 +134,31 @@ struct padata_shell {
};
/**
+ * struct padata_mt_job - represents one multithreaded job
+ *
+ * @thread_fn: Called for each chunk of work that a padata thread does.
+ * @fn_arg: The thread function argument.
+ * @start: The start of the job (units are job-specific).
+ * @size: size of this node's work (units are job-specific).
+ * @align: Ranges passed to the thread function fall on this boundary, with the
+ * possible exceptions of the beginning and end of the job.
+ * @min_chunk: The minimum chunk size in job-specific units. This allows
+ * the client to communicate the minimum amount of work that's
+ * appropriate for one worker thread to do at once.
+ * @max_threads: Max threads to use for the job, actual number may be less
+ * depending on task size and minimum chunk size.
+ */
+struct padata_mt_job {
+ void (*thread_fn)(unsigned long start, unsigned long end, void *arg);
+ void *fn_arg;
+ unsigned long start;
+ unsigned long size;
+ unsigned long align;
+ unsigned long min_chunk;
+ int max_threads;
+};
+
+/**
* struct padata_instance - The overall control structure.
*
* @cpu_online_node: Linkage for CPU online callback.
@@ -166,6 +188,12 @@ struct padata_instance {
#define PADATA_INVALID 4
};
+#ifdef CONFIG_PADATA
+extern void __init padata_init(void);
+#else
+static inline void __init padata_init(void) {}
+#endif
+
extern struct padata_instance *padata_alloc_possible(const char *name);
extern void padata_free(struct padata_instance *pinst);
extern struct padata_shell *padata_alloc_shell(struct padata_instance *pinst);
@@ -173,6 +201,7 @@ extern void padata_free_shell(struct padata_shell *ps);
extern int padata_do_parallel(struct padata_shell *ps,
struct padata_priv *padata, int *cb_cpu);
extern void padata_do_serial(struct padata_priv *padata);
+extern void __init padata_do_multithreaded(struct padata_mt_job *job);
extern int padata_set_cpumask(struct padata_instance *pinst, int cpumask_type,
cpumask_var_t cpumask);
extern int padata_start(struct padata_instance *pinst);
diff --git a/include/linux/string.h b/include/linux/string.h
index 6dfbb2efa815..9b7a0632e87a 100644
--- a/include/linux/string.h
+++ b/include/linux/string.h
@@ -272,6 +272,31 @@ void __read_overflow3(void) __compiletime_error("detected read beyond size of ob
void __write_overflow(void) __compiletime_error("detected write beyond size of object passed as 1st parameter");
#if !defined(__NO_FORTIFY) && defined(__OPTIMIZE__) && defined(CONFIG_FORTIFY_SOURCE)
+
+#ifdef CONFIG_KASAN
+extern void *__underlying_memchr(const void *p, int c, __kernel_size_t size) __RENAME(memchr);
+extern int __underlying_memcmp(const void *p, const void *q, __kernel_size_t size) __RENAME(memcmp);
+extern void *__underlying_memcpy(void *p, const void *q, __kernel_size_t size) __RENAME(memcpy);
+extern void *__underlying_memmove(void *p, const void *q, __kernel_size_t size) __RENAME(memmove);
+extern void *__underlying_memset(void *p, int c, __kernel_size_t size) __RENAME(memset);
+extern char *__underlying_strcat(char *p, const char *q) __RENAME(strcat);
+extern char *__underlying_strcpy(char *p, const char *q) __RENAME(strcpy);
+extern __kernel_size_t __underlying_strlen(const char *p) __RENAME(strlen);
+extern char *__underlying_strncat(char *p, const char *q, __kernel_size_t count) __RENAME(strncat);
+extern char *__underlying_strncpy(char *p, const char *q, __kernel_size_t size) __RENAME(strncpy);
+#else
+#define __underlying_memchr __builtin_memchr
+#define __underlying_memcmp __builtin_memcmp
+#define __underlying_memcpy __builtin_memcpy
+#define __underlying_memmove __builtin_memmove
+#define __underlying_memset __builtin_memset
+#define __underlying_strcat __builtin_strcat
+#define __underlying_strcpy __builtin_strcpy
+#define __underlying_strlen __builtin_strlen
+#define __underlying_strncat __builtin_strncat
+#define __underlying_strncpy __builtin_strncpy
+#endif
+
__FORTIFY_INLINE char *strncpy(char *p, const char *q, __kernel_size_t size)
{
size_t p_size = __builtin_object_size(p, 0);
@@ -279,14 +304,14 @@ __FORTIFY_INLINE char *strncpy(char *p, const char *q, __kernel_size_t size)
__write_overflow();
if (p_size < size)
fortify_panic(__func__);
- return __builtin_strncpy(p, q, size);
+ return __underlying_strncpy(p, q, size);
}
__FORTIFY_INLINE char *strcat(char *p, const char *q)
{
size_t p_size = __builtin_object_size(p, 0);
if (p_size == (size_t)-1)
- return __builtin_strcat(p, q);
+ return __underlying_strcat(p, q);
if (strlcat(p, q, p_size) >= p_size)
fortify_panic(__func__);
return p;
@@ -300,7 +325,7 @@ __FORTIFY_INLINE __kernel_size_t strlen(const char *p)
/* Work around gcc excess stack consumption issue */
if (p_size == (size_t)-1 ||
(__builtin_constant_p(p[p_size - 1]) && p[p_size - 1] == '\0'))
- return __builtin_strlen(p);
+ return __underlying_strlen(p);
ret = strnlen(p, p_size);
if (p_size <= ret)
fortify_panic(__func__);
@@ -333,7 +358,7 @@ __FORTIFY_INLINE size_t strlcpy(char *p, const char *q, size_t size)
__write_overflow();
if (len >= p_size)
fortify_panic(__func__);
- __builtin_memcpy(p, q, len);
+ __underlying_memcpy(p, q, len);
p[len] = '\0';
}
return ret;
@@ -346,12 +371,12 @@ __FORTIFY_INLINE char *strncat(char *p, const char *q, __kernel_size_t count)
size_t p_size = __builtin_object_size(p, 0);
size_t q_size = __builtin_object_size(q, 0);
if (p_size == (size_t)-1 && q_size == (size_t)-1)
- return __builtin_strncat(p, q, count);
+ return __underlying_strncat(p, q, count);
p_len = strlen(p);
copy_len = strnlen(q, count);
if (p_size < p_len + copy_len + 1)
fortify_panic(__func__);
- __builtin_memcpy(p + p_len, q, copy_len);
+ __underlying_memcpy(p + p_len, q, copy_len);
p[p_len + copy_len] = '\0';
return p;
}
@@ -363,7 +388,7 @@ __FORTIFY_INLINE void *memset(void *p, int c, __kernel_size_t size)
__write_overflow();
if (p_size < size)
fortify_panic(__func__);
- return __builtin_memset(p, c, size);
+ return __underlying_memset(p, c, size);
}
__FORTIFY_INLINE void *memcpy(void *p, const void *q, __kernel_size_t size)
@@ -378,7 +403,7 @@ __FORTIFY_INLINE void *memcpy(void *p, const void *q, __kernel_size_t size)
}
if (p_size < size || q_size < size)
fortify_panic(__func__);
- return __builtin_memcpy(p, q, size);
+ return __underlying_memcpy(p, q, size);
}
__FORTIFY_INLINE void *memmove(void *p, const void *q, __kernel_size_t size)
@@ -393,7 +418,7 @@ __FORTIFY_INLINE void *memmove(void *p, const void *q, __kernel_size_t size)
}
if (p_size < size || q_size < size)
fortify_panic(__func__);
- return __builtin_memmove(p, q, size);
+ return __underlying_memmove(p, q, size);
}
extern void *__real_memscan(void *, int, __kernel_size_t) __RENAME(memscan);
@@ -419,7 +444,7 @@ __FORTIFY_INLINE int memcmp(const void *p, const void *q, __kernel_size_t size)
}
if (p_size < size || q_size < size)
fortify_panic(__func__);
- return __builtin_memcmp(p, q, size);
+ return __underlying_memcmp(p, q, size);
}
__FORTIFY_INLINE void *memchr(const void *p, int c, __kernel_size_t size)
@@ -429,7 +454,7 @@ __FORTIFY_INLINE void *memchr(const void *p, int c, __kernel_size_t size)
__read_overflow();
if (p_size < size)
fortify_panic(__func__);
- return __builtin_memchr(p, c, size);
+ return __underlying_memchr(p, c, size);
}
void *__real_memchr_inv(const void *s, int c, size_t n) __RENAME(memchr_inv);
@@ -460,11 +485,22 @@ __FORTIFY_INLINE char *strcpy(char *p, const char *q)
size_t p_size = __builtin_object_size(p, 0);
size_t q_size = __builtin_object_size(q, 0);
if (p_size == (size_t)-1 && q_size == (size_t)-1)
- return __builtin_strcpy(p, q);
+ return __underlying_strcpy(p, q);
memcpy(p, q, strlen(q) + 1);
return p;
}
+/* Don't use these outside the FORTIFY_SOURCE implementation */
+#undef __underlying_memchr
+#undef __underlying_memcmp
+#undef __underlying_memcpy
+#undef __underlying_memmove
+#undef __underlying_memset
+#undef __underlying_strcat
+#undef __underlying_strcpy
+#undef __underlying_strlen
+#undef __underlying_strncat
+#undef __underlying_strncpy
#endif
/**
diff --git a/include/linux/swap.h b/include/linux/swap.h
index e92176fc8824..4c5974bb9ba9 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -334,9 +334,10 @@ extern unsigned long nr_free_pagecache_pages(void);
/* linux/mm/swap.c */
+extern void lru_note_cost(struct lruvec *lruvec, bool file,
+ unsigned int nr_pages);
+extern void lru_note_cost_page(struct page *);
extern void lru_cache_add(struct page *);
-extern void lru_cache_add_anon(struct page *page);
-extern void lru_cache_add_file(struct page *page);
extern void lru_add_page_tail(struct page *page, struct page *page_tail,
struct lruvec *lruvec, struct list_head *head);
extern void activate_page(struct page *);
@@ -651,11 +652,9 @@ static inline int mem_cgroup_swappiness(struct mem_cgroup *mem)
#endif
#if defined(CONFIG_SWAP) && defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
-extern void mem_cgroup_throttle_swaprate(struct mem_cgroup *memcg, int node,
- gfp_t gfp_mask);
+extern void cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask);
#else
-static inline void mem_cgroup_throttle_swaprate(struct mem_cgroup *memcg,
- int node, gfp_t gfp_mask)
+static inline void cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask)
{
}
#endif
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index ffef0f279747..24fc7c3ae7d6 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -35,6 +35,10 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
PGSCAN_KSWAPD,
PGSCAN_DIRECT,
PGSCAN_DIRECT_THROTTLE,
+ PGSCAN_ANON,
+ PGSCAN_FILE,
+ PGSTEAL_ANON,
+ PGSTEAL_FILE,
#ifdef CONFIG_NUMA
PGSCAN_ZONE_RECLAIM_FAILED,
#endif
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index cb507151710f..aa961088c551 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -26,9 +26,11 @@ struct reclaim_stat {
unsigned nr_congested;
unsigned nr_writeback;
unsigned nr_immediate;
+ unsigned nr_pageout;
unsigned nr_activate[2];
unsigned nr_ref_keep;
unsigned nr_unmap_fail;
+ unsigned nr_lazyfree_fail;
};
enum writeback_stat_item {
diff --git a/include/trace/events/compaction.h b/include/trace/events/compaction.h
index e5bf6ee4e814..54e5bf081171 100644
--- a/include/trace/events/compaction.h
+++ b/include/trace/events/compaction.h
@@ -314,40 +314,44 @@ TRACE_EVENT(mm_compaction_kcompactd_sleep,
DECLARE_EVENT_CLASS(kcompactd_wake_template,
- TP_PROTO(int nid, int order, enum zone_type classzone_idx),
+ TP_PROTO(int nid, int order, enum zone_type highest_zoneidx),
- TP_ARGS(nid, order, classzone_idx),
+ TP_ARGS(nid, order, highest_zoneidx),
TP_STRUCT__entry(
__field(int, nid)
__field(int, order)
- __field(enum zone_type, classzone_idx)
+ __field(enum zone_type, highest_zoneidx)
),
TP_fast_assign(
__entry->nid = nid;
__entry->order = order;
- __entry->classzone_idx = classzone_idx;
+ __entry->highest_zoneidx = highest_zoneidx;
),
+ /*
+ * classzone_idx is the previous name of highest_zoneidx.
+ * The old name is kept in the output because of the tracepoint's ABI requirement.
+ */
TP_printk("nid=%d order=%d classzone_idx=%-8s",
__entry->nid,
__entry->order,
- __print_symbolic(__entry->classzone_idx, ZONE_TYPE))
+ __print_symbolic(__entry->highest_zoneidx, ZONE_TYPE))
);
DEFINE_EVENT(kcompactd_wake_template, mm_compaction_wakeup_kcompactd,
- TP_PROTO(int nid, int order, enum zone_type classzone_idx),
+ TP_PROTO(int nid, int order, enum zone_type highest_zoneidx),
- TP_ARGS(nid, order, classzone_idx)
+ TP_ARGS(nid, order, highest_zoneidx)
);
DEFINE_EVENT(kcompactd_wake_template, mm_compaction_kcompactd_wake,
- TP_PROTO(int nid, int order, enum zone_type classzone_idx),
+ TP_PROTO(int nid, int order, enum zone_type highest_zoneidx),
- TP_ARGS(nid, order, classzone_idx)
+ TP_ARGS(nid, order, highest_zoneidx)
);
#endif
diff --git a/include/trace/events/huge_memory.h b/include/trace/events/huge_memory.h
index 70e32ff096ec..4fdb14a81108 100644
--- a/include/trace/events/huge_memory.h
+++ b/include/trace/events/huge_memory.h
@@ -12,6 +12,8 @@
EM( SCAN_SUCCEED, "succeeded") \
EM( SCAN_PMD_NULL, "pmd_null") \
EM( SCAN_EXCEED_NONE_PTE, "exceed_none_pte") \
+ EM( SCAN_EXCEED_SWAP_PTE, "exceed_swap_pte") \
+ EM( SCAN_EXCEED_SHARED_PTE, "exceed_shared_pte") \
EM( SCAN_PTE_NON_PRESENT, "pte_non_present") \
EM( SCAN_PTE_UFFD_WP, "pte_uffd_wp") \
EM( SCAN_PAGE_RO, "no_writable_page") \
@@ -31,7 +33,6 @@
EM( SCAN_DEL_PAGE_LRU, "could_not_delete_page_from_lru")\
EM( SCAN_ALLOC_HUGE_PAGE_FAIL, "alloc_huge_page_failed") \
EM( SCAN_CGROUP_CHARGE_FAIL, "ccgroup_charge_failed") \
- EM( SCAN_EXCEED_SWAP_PTE, "exceed_swap_pte") \
EM( SCAN_TRUNCATED, "truncated") \
EMe(SCAN_PAGE_HAS_PRIVATE, "page_has_private") \
diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h
index 74bb594ccb25..2070df64958e 100644
--- a/include/trace/events/vmscan.h
+++ b/include/trace/events/vmscan.h
@@ -265,7 +265,7 @@ TRACE_EVENT(mm_shrink_slab_end,
);
TRACE_EVENT(mm_vmscan_lru_isolate,
- TP_PROTO(int classzone_idx,
+ TP_PROTO(int highest_zoneidx,
int order,
unsigned long nr_requested,
unsigned long nr_scanned,
@@ -274,10 +274,10 @@ TRACE_EVENT(mm_vmscan_lru_isolate,
isolate_mode_t isolate_mode,
int lru),
- TP_ARGS(classzone_idx, order, nr_requested, nr_scanned, nr_skipped, nr_taken, isolate_mode, lru),
+ TP_ARGS(highest_zoneidx, order, nr_requested, nr_scanned, nr_skipped, nr_taken, isolate_mode, lru),
TP_STRUCT__entry(
- __field(int, classzone_idx)
+ __field(int, highest_zoneidx)
__field(int, order)
__field(unsigned long, nr_requested)
__field(unsigned long, nr_scanned)
@@ -288,7 +288,7 @@ TRACE_EVENT(mm_vmscan_lru_isolate,
),
TP_fast_assign(
- __entry->classzone_idx = classzone_idx;
+ __entry->highest_zoneidx = highest_zoneidx;
__entry->order = order;
__entry->nr_requested = nr_requested;
__entry->nr_scanned = nr_scanned;
@@ -298,9 +298,13 @@ TRACE_EVENT(mm_vmscan_lru_isolate,
__entry->lru = lru;
),
+ /*
+ * classzone is the previous name of highest_zoneidx.
+ * The old name is kept in the output because of the tracepoint's ABI requirement.
+ */
TP_printk("isolate_mode=%d classzone=%d order=%d nr_requested=%lu nr_scanned=%lu nr_skipped=%lu nr_taken=%lu lru=%s",
__entry->isolate_mode,
- __entry->classzone_idx,
+ __entry->highest_zoneidx,
__entry->order,
__entry->nr_requested,
__entry->nr_scanned,