diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2020-06-03 20:24:15 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2020-06-03 20:24:15 -0700 |
commit | ee01c4d72adffb7d424535adf630f2955748fa8b (patch) | |
tree | 9ea9f40473e105e936e7477ab7dc7248d899af21 /include | |
parent | c444eb564fb16645c172d550359cb3d75fe8a040 (diff) | |
parent | 09587a09ada2ed7c39aedfa2681152b5ac5641ee (diff) | |
download | linux-ee01c4d72adffb7d424535adf630f2955748fa8b.tar.bz2 |
Merge branch 'akpm' (patches from Andrew)
Merge more updates from Andrew Morton:
"More mm/ work, plenty more to come
Subsystems affected by this patch series: slub, memcg, gup, kasan,
pagealloc, hugetlb, vmscan, tools, mempolicy, memblock, hugetlbfs,
thp, mmap, kconfig"
* akpm: (131 commits)
arm64: mm: use ARCH_HAS_DEBUG_WX instead of arch defined
x86: mm: use ARCH_HAS_DEBUG_WX instead of arch defined
riscv: support DEBUG_WX
mm: add DEBUG_WX support
drivers/base/memory.c: cache memory blocks in xarray to accelerate lookup
mm/thp: rename pmd_mknotpresent() as pmd_mkinvalid()
powerpc/mm: drop platform defined pmd_mknotpresent()
mm: thp: don't need to drain lru cache when splitting and mlocking THP
hugetlbfs: get unmapped area below TASK_UNMAPPED_BASE for hugetlbfs
sparc32: register memory occupied by kernel as memblock.memory
include/linux/memblock.h: fix minor typo and unclear comment
mm, mempolicy: fix up gup usage in lookup_node
tools/vm/page_owner_sort.c: filter out unneeded line
mm: swap: memcg: fix memcg stats for huge pages
mm: swap: fix vmstats for huge pages
mm: vmscan: limit the range of LRU type balancing
mm: vmscan: reclaim writepage is IO cost
mm: vmscan: determine anon/file pressure balance at the reclaim root
mm: balance LRU lists based on relative thrashing
mm: only count actual rotations as LRU reclaim cost
...
Diffstat (limited to 'include')
-rw-r--r-- | include/asm-generic/hugetlb.h | 2 | ||||
-rw-r--r-- | include/linux/compaction.h | 9 | ||||
-rw-r--r-- | include/linux/gfp.h | 7 | ||||
-rw-r--r-- | include/linux/hugetlb.h | 16 | ||||
-rw-r--r-- | include/linux/memblock.h | 15 | ||||
-rw-r--r-- | include/linux/memcontrol.h | 66 | ||||
-rw-r--r-- | include/linux/mm.h | 48 | ||||
-rw-r--r-- | include/linux/mmzone.h | 46 | ||||
-rw-r--r-- | include/linux/padata.h | 43 | ||||
-rw-r--r-- | include/linux/string.h | 60 | ||||
-rw-r--r-- | include/linux/swap.h | 11 | ||||
-rw-r--r-- | include/linux/vm_event_item.h | 4 | ||||
-rw-r--r-- | include/linux/vmstat.h | 2 | ||||
-rw-r--r-- | include/trace/events/compaction.h | 22 | ||||
-rw-r--r-- | include/trace/events/huge_memory.h | 3 | ||||
-rw-r--r-- | include/trace/events/vmscan.h | 14 |
16 files changed, 209 insertions, 159 deletions
diff --git a/include/asm-generic/hugetlb.h b/include/asm-generic/hugetlb.h index 822f433ac95c..40f85decc2ee 100644 --- a/include/asm-generic/hugetlb.h +++ b/include/asm-generic/hugetlb.h @@ -122,7 +122,7 @@ static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma, #ifndef __HAVE_ARCH_HUGE_PTEP_GET static inline pte_t huge_ptep_get(pte_t *ptep) { - return *ptep; + return READ_ONCE(*ptep); } #endif diff --git a/include/linux/compaction.h b/include/linux/compaction.h index a0eabfbeb0e1..6fa0eea3f530 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -97,7 +97,7 @@ extern enum compact_result try_to_compact_pages(gfp_t gfp_mask, struct page **page); extern void reset_isolation_suitable(pg_data_t *pgdat); extern enum compact_result compaction_suitable(struct zone *zone, int order, - unsigned int alloc_flags, int classzone_idx); + unsigned int alloc_flags, int highest_zoneidx); extern void defer_compaction(struct zone *zone, int order); extern bool compaction_deferred(struct zone *zone, int order); @@ -182,7 +182,7 @@ bool compaction_zonelist_suitable(struct alloc_context *ac, int order, extern int kcompactd_run(int nid); extern void kcompactd_stop(int nid); -extern void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx); +extern void wakeup_kcompactd(pg_data_t *pgdat, int order, int highest_zoneidx); #else static inline void reset_isolation_suitable(pg_data_t *pgdat) @@ -190,7 +190,7 @@ static inline void reset_isolation_suitable(pg_data_t *pgdat) } static inline enum compact_result compaction_suitable(struct zone *zone, int order, - int alloc_flags, int classzone_idx) + int alloc_flags, int highest_zoneidx) { return COMPACT_SKIPPED; } @@ -232,7 +232,8 @@ static inline void kcompactd_stop(int nid) { } -static inline void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx) +static inline void wakeup_kcompactd(pg_data_t *pgdat, + int order, int highest_zoneidx) { } diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 4aba4c86c626..67a0774e080b 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -110,6 +110,11 @@ struct vm_area_struct; * the caller guarantees the allocation will allow more memory to be freed * very shortly e.g. process exiting or swapping. Users either should * be the MM or co-ordinating closely with the VM (e.g. swap over NFS). + * Users of this flag have to be extremely careful to not deplete the reserve + * completely and implement a throttling mechanism which controls the + * consumption of the reserve based on the amount of freed memory. + * Usage of a pre-allocated pool (e.g. mempool) should be always considered + * before using this flag. * * %__GFP_NOMEMALLOC is used to explicitly forbid access to emergency reserves. * This takes precedence over the %__GFP_MEMALLOC flag if both are set. @@ -307,7 +312,7 @@ struct vm_area_struct; #define GFP_MOVABLE_MASK (__GFP_RECLAIMABLE|__GFP_MOVABLE) #define GFP_MOVABLE_SHIFT 3 -static inline int gfpflags_to_migratetype(const gfp_t gfp_flags) +static inline int gfp_migratetype(const gfp_t gfp_flags) { VM_WARN_ON((gfp_flags & GFP_MOVABLE_MASK) == GFP_MOVABLE_MASK); BUILD_BUG_ON((1UL << GFP_MOVABLE_SHIFT) != ___GFP_MOVABLE); diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 92c21c5ccc58..0cced410e0bd 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -518,8 +518,8 @@ int huge_add_to_page_cache(struct page *page, struct address_space *mapping, int __init __alloc_bootmem_huge_page(struct hstate *h); int __init alloc_bootmem_huge_page(struct hstate *h); -void __init hugetlb_bad_size(void); void __init hugetlb_add_hstate(unsigned order); +bool __init arch_hugetlb_valid_size(unsigned long size); struct hstate *size_to_hstate(unsigned long size); #ifndef HUGE_MAX_HSTATE @@ -590,6 +590,20 @@ static inline unsigned int blocks_per_huge_page(struct hstate *h) #include <asm/hugetlb.h> +#ifndef is_hugepage_only_range +static inline int is_hugepage_only_range(struct mm_struct *mm, + unsigned long addr, unsigned long len) +{ + return 0; +} +#define is_hugepage_only_range is_hugepage_only_range +#endif + +#ifndef arch_clear_hugepage_flags +static inline void arch_clear_hugepage_flags(struct page *page) { } +#define arch_clear_hugepage_flags arch_clear_hugepage_flags +#endif + #ifndef arch_make_huge_pte static inline pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma, struct page *page, int writable) diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 6bc37a731d27..017fae833d4a 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -41,7 +41,7 @@ enum memblock_flags { /** * struct memblock_region - represents a memory region - * @base: physical address of the region + * @base: base address of the region * @size: size of the region * @flags: memory region attributes * @nid: NUMA node id @@ -50,7 +50,7 @@ struct memblock_region { phys_addr_t base; phys_addr_t size; enum memblock_flags flags; -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP +#ifdef CONFIG_NEED_MULTIPLE_NODES int nid; #endif }; @@ -75,7 +75,7 @@ struct memblock_type { * struct memblock - memblock allocator metadata * @bottom_up: is bottom up direction? * @current_limit: physical address of the current allocation limit - * @memory: usabe memory regions + * @memory: usable memory regions * @reserved: reserved memory regions * @physmem: all physical memory */ @@ -215,7 +215,6 @@ static inline bool memblock_is_nomap(struct memblock_region *m) return m->flags & MEMBLOCK_NOMAP; } -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP int memblock_search_pfn_nid(unsigned long pfn, unsigned long *start_pfn, unsigned long *end_pfn); void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn, @@ -234,7 +233,6 @@ void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn, #define for_each_mem_pfn_range(i, nid, p_start, p_end, p_nid) \ for (i = -1, __next_mem_pfn_range(&i, nid, p_start, p_end, p_nid); \ i >= 0; __next_mem_pfn_range(&i, nid, p_start, p_end, p_nid)) -#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT void __next_mem_pfn_range_in_zone(u64 *idx, struct zone *zone, @@ -275,6 +273,9 @@ void __next_mem_pfn_range_in_zone(u64 *idx, struct zone *zone, #define for_each_free_mem_pfn_range_in_zone_from(i, zone, p_start, p_end) \ for (; i != U64_MAX; \ __next_mem_pfn_range_in_zone(&i, zone, p_start, p_end)) + +int __init deferred_page_init_max_threads(const struct cpumask *node_cpumask); + #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ /** @@ -310,10 +311,10 @@ void __next_mem_pfn_range_in_zone(u64 *idx, struct zone *zone, for_each_mem_range_rev(i, &memblock.memory, &memblock.reserved, \ nid, flags, p_start, p_end, p_nid) -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP int memblock_set_node(phys_addr_t base, phys_addr_t size, struct memblock_type *type, int nid); +#ifdef CONFIG_NEED_MULTIPLE_NODES static inline void memblock_set_region_node(struct memblock_region *r, int nid) { r->nid = nid; @@ -332,7 +333,7 @@ static inline int memblock_get_region_node(const struct memblock_region *r) { return 0; } -#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ +#endif /* CONFIG_NEED_MULTIPLE_NODES */ /* Flags for memblock allocation APIs */ #define MEMBLOCK_ALLOC_ANYWHERE (~(phys_addr_t)0) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index bfe9533bb67e..e77197a62809 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -29,10 +29,7 @@ struct kmem_cache; /* Cgroup-specific page state, on top of universal node page state */ enum memcg_stat_item { - MEMCG_CACHE = NR_VM_NODE_STAT_ITEMS, - MEMCG_RSS, - MEMCG_RSS_HUGE, - MEMCG_SWAP, + MEMCG_SWAP = NR_VM_NODE_STAT_ITEMS, MEMCG_SOCK, /* XXX: why are these zone and not node counters? */ MEMCG_KERNEL_STACK_KB, @@ -358,16 +355,8 @@ static inline unsigned long mem_cgroup_protection(struct mem_cgroup *memcg, enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root, struct mem_cgroup *memcg); -int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, - gfp_t gfp_mask, struct mem_cgroup **memcgp, - bool compound); -int mem_cgroup_try_charge_delay(struct page *page, struct mm_struct *mm, - gfp_t gfp_mask, struct mem_cgroup **memcgp, - bool compound); -void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg, - bool lrucare, bool compound); -void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg, - bool compound); +int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask); + void mem_cgroup_uncharge(struct page *page); void mem_cgroup_uncharge_list(struct list_head *page_list); @@ -568,7 +557,7 @@ struct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim, void mem_cgroup_print_oom_group(struct mem_cgroup *memcg); #ifdef CONFIG_MEMCG_SWAP -extern int do_swap_account; +extern bool cgroup_memory_noswap; #endif struct mem_cgroup *lock_page_memcg(struct page *page); @@ -708,16 +697,17 @@ static inline void mod_lruvec_state(struct lruvec *lruvec, static inline void __mod_lruvec_page_state(struct page *page, enum node_stat_item idx, int val) { + struct page *head = compound_head(page); /* rmap on tail pages */ pg_data_t *pgdat = page_pgdat(page); struct lruvec *lruvec; /* Untracked pages have no memcg, no lruvec. Update only the node */ - if (!page->mem_cgroup) { + if (!head->mem_cgroup) { __mod_node_page_state(pgdat, idx, val); return; } - lruvec = mem_cgroup_lruvec(page->mem_cgroup, pgdat); + lruvec = mem_cgroup_lruvec(head->mem_cgroup, pgdat); __mod_lruvec_state(lruvec, idx, val); } @@ -847,37 +837,12 @@ static inline enum mem_cgroup_protection mem_cgroup_protected( return MEMCG_PROT_NONE; } -static inline int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, - gfp_t gfp_mask, - struct mem_cgroup **memcgp, - bool compound) -{ - *memcgp = NULL; - return 0; -} - -static inline int mem_cgroup_try_charge_delay(struct page *page, - struct mm_struct *mm, - gfp_t gfp_mask, - struct mem_cgroup **memcgp, - bool compound) +static inline int mem_cgroup_charge(struct page *page, struct mm_struct *mm, + gfp_t gfp_mask) { - *memcgp = NULL; return 0; } -static inline void mem_cgroup_commit_charge(struct page *page, - struct mem_cgroup *memcg, - bool lrucare, bool compound) -{ -} - -static inline void mem_cgroup_cancel_charge(struct page *page, - struct mem_cgroup *memcg, - bool compound) -{ -} - static inline void mem_cgroup_uncharge(struct page *page) { } @@ -1277,6 +1242,19 @@ static inline void dec_lruvec_page_state(struct page *page, mod_lruvec_page_state(page, idx, -1); } +static inline struct lruvec *parent_lruvec(struct lruvec *lruvec) +{ + struct mem_cgroup *memcg; + + memcg = lruvec_memcg(lruvec); + if (!memcg) + return NULL; + memcg = parent_mem_cgroup(memcg); + if (!memcg) + return NULL; + return mem_cgroup_lruvec(memcg, lruvec_pgdat(lruvec)); +} + #ifdef CONFIG_CGROUP_WRITEBACK struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb); diff --git a/include/linux/mm.h b/include/linux/mm.h index 59adb47efc55..66e0977f970a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -501,7 +501,6 @@ struct vm_fault { pte_t orig_pte; /* Value of PTE at the time of fault */ struct page *cow_page; /* Page handler may use for COW fault */ - struct mem_cgroup *memcg; /* Cgroup cow_page belongs to */ struct page *page; /* ->fault handlers should return a * page here, unless VM_FAULT_NOPAGE * is set (which is also implied by @@ -867,7 +866,7 @@ enum compound_dtor_id { #endif NR_COMPOUND_DTORS, }; -extern compound_page_dtor * const compound_page_dtors[]; +extern compound_page_dtor * const compound_page_dtors[NR_COMPOUND_DTORS]; static inline void set_compound_page_dtor(struct page *page, enum compound_dtor_id compound_dtor) @@ -876,10 +875,10 @@ static inline void set_compound_page_dtor(struct page *page, page[1].compound_dtor = compound_dtor; } -static inline compound_page_dtor *get_compound_page_dtor(struct page *page) +static inline void destroy_compound_page(struct page *page) { VM_BUG_ON_PAGE(page[1].compound_dtor >= NR_COMPOUND_DTORS, page); - return compound_page_dtors[page[1].compound_dtor]; + compound_page_dtors[page[1].compound_dtor](page); } static inline unsigned int compound_order(struct page *page) @@ -946,8 +945,7 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) return pte; } -vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg, - struct page *page); +vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct page *page); vm_fault_t finish_fault(struct vm_fault *vmf); vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf); #endif @@ -1827,6 +1825,8 @@ extern int mprotect_fixup(struct vm_area_struct *vma, */ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, struct page **pages); +int pin_user_pages_fast_only(unsigned long start, int nr_pages, + unsigned int gup_flags, struct page **pages); /* * per-process(per-mm_struct) statistics. */ @@ -2327,9 +2327,7 @@ static inline spinlock_t *pud_lock(struct mm_struct *mm, pud_t *pud) } extern void __init pagecache_init(void); -extern void free_area_init(unsigned long * zones_size); -extern void __init free_area_init_node(int nid, unsigned long * zones_size, - unsigned long zone_start_pfn, unsigned long *zholes_size); +extern void __init free_area_init_memoryless_node(int nid); extern void free_initmem(void); /* @@ -2399,34 +2397,26 @@ static inline unsigned long get_num_physpages(void) return phys_pages; } -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP /* - * With CONFIG_HAVE_MEMBLOCK_NODE_MAP set, an architecture may initialise its - * zones, allocate the backing mem_map and account for memory holes in a more - * architecture independent manner. This is a substitute for creating the - * zone_sizes[] and zholes_size[] arrays and passing them to - * free_area_init_node() + * Using memblock node mappings, an architecture may initialise its + * zones, allocate the backing mem_map and account for memory holes in an + * architecture independent manner. * * An architecture is expected to register range of page frames backed by * physical memory with memblock_add[_node]() before calling - * free_area_init_nodes() passing in the PFN each zone ends at. At a basic + * free_area_init() passing in the PFN each zone ends at. At a basic * usage, an architecture is expected to do something like * * unsigned long max_zone_pfns[MAX_NR_ZONES] = {max_dma, max_normal_pfn, * max_highmem_pfn}; * for_each_valid_physical_page_range() * memblock_add_node(base, size, nid) - * free_area_init_nodes(max_zone_pfns); + * free_area_init(max_zone_pfns); * - * free_bootmem_with_active_regions() calls free_bootmem_node() for each - * registered physical page range. Similarly * sparse_memory_present_with_active_regions() calls memory_present() for * each range when SPARSEMEM is enabled. - * - * See mm/page_alloc.c for more information on each function exposed by - * CONFIG_HAVE_MEMBLOCK_NODE_MAP. */ -extern void free_area_init_nodes(unsigned long *max_zone_pfn); +void free_area_init(unsigned long *max_zone_pfn); unsigned long node_map_pfn_alignment(void); unsigned long __absent_pages_in_range(int nid, unsigned long start_pfn, unsigned long end_pfn); @@ -2435,16 +2425,10 @@ extern unsigned long absent_pages_in_range(unsigned long start_pfn, extern void get_pfn_range_for_nid(unsigned int nid, unsigned long *start_pfn, unsigned long *end_pfn); extern unsigned long find_min_pfn_with_active_regions(void); -extern void free_bootmem_with_active_regions(int nid, - unsigned long max_low_pfn); extern void sparse_memory_present_with_active_regions(int nid); -#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ - -#if !defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) && \ - !defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID) -static inline int __early_pfn_to_nid(unsigned long pfn, - struct mminit_pfnnid_cache *state) +#ifndef CONFIG_NEED_MULTIPLE_NODES +static inline int early_pfn_to_nid(unsigned long pfn) { return 0; } @@ -2480,6 +2464,7 @@ extern void setup_per_cpu_pageset(void); extern int min_free_kbytes; extern int watermark_boost_factor; extern int watermark_scale_factor; +extern bool arch_has_descending_max_zone_pfns(void); /* nommu.c */ extern atomic_long_t mmap_pages_allocated; @@ -2816,6 +2801,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, #define FOLL_LONGTERM 0x10000 /* mapping lifetime is indefinite: see below */ #define FOLL_SPLIT_PMD 0x20000 /* split huge pmd before returning */ #define FOLL_PIN 0x40000 /* pages must be released via unpin_user_page */ +#define FOLL_FAST_ONLY 0x80000 /* gup_fast: prevent fall-back to slow gup */ /* * FOLL_PIN and FOLL_LONGTERM may be used in various combinations with each diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index f5b6ccf41141..df1f08486d81 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -242,19 +242,6 @@ static inline bool is_active_lru(enum lru_list lru) return (lru == LRU_ACTIVE_ANON || lru == LRU_ACTIVE_FILE); } -struct zone_reclaim_stat { - /* - * The pageout code in vmscan.c keeps track of how many of the - * mem/swap backed and file backed pages are referenced. - * The higher the rotated/scanned ratio, the more valuable - * that cache is. - * - * The anon LRU stats live in [0], file LRU stats in [1] - */ - unsigned long recent_rotated[2]; - unsigned long recent_scanned[2]; -}; - enum lruvec_flags { LRUVEC_CONGESTED, /* lruvec has many dirty pages * backed by a congested BDI @@ -263,7 +250,13 @@ enum lruvec_flags { struct lruvec { struct list_head lists[NR_LRU_LISTS]; - struct zone_reclaim_stat reclaim_stat; + /* + * These track the cost of reclaiming one LRU - file or anon - + * over the other. As the observed cost of reclaiming one LRU + * increases, the reclaim scan balance tips toward the other. + */ + unsigned long anon_cost; + unsigned long file_cost; /* Evictions & activations on the inactive file list */ atomic_long_t inactive_age; /* Refaults at the time of last reclaim cycle */ @@ -680,6 +673,8 @@ typedef struct pglist_data { /* * Must be held any time you expect node_start_pfn, * node_present_pages, node_spanned_pages or nr_zones to stay constant. + * Also synchronizes pgdat->first_deferred_pfn during deferred page + * init. * * pgdat_resize_lock() and pgdat_resize_unlock() are provided to * manipulate node_size_lock without checking for CONFIG_MEMORY_HOTPLUG @@ -699,13 +694,13 @@ typedef struct pglist_data { struct task_struct *kswapd; /* Protected by mem_hotplug_begin/end() */ int kswapd_order; - enum zone_type kswapd_classzone_idx; + enum zone_type kswapd_highest_zoneidx; int kswapd_failures; /* Number of 'reclaimed == 0' runs */ #ifdef CONFIG_COMPACTION int kcompactd_max_order; - enum zone_type kcompactd_classzone_idx; + enum zone_type kcompactd_highest_zoneidx; wait_queue_head_t kcompactd_wait; struct task_struct *kcompactd; #endif @@ -783,15 +778,15 @@ static inline bool pgdat_is_empty(pg_data_t *pgdat) void build_all_zonelists(pg_data_t *pgdat); void wakeup_kswapd(struct zone *zone, gfp_t gfp_mask, int order, - enum zone_type classzone_idx); + enum zone_type highest_zoneidx); bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, - int classzone_idx, unsigned int alloc_flags, + int highest_zoneidx, unsigned int alloc_flags, long free_pages); bool zone_watermark_ok(struct zone *z, unsigned int order, - unsigned long mark, int classzone_idx, + unsigned long mark, int highest_zoneidx, unsigned int alloc_flags); bool zone_watermark_ok_safe(struct zone *z, unsigned int order, - unsigned long mark, int classzone_idx); + unsigned long mark, int highest_zoneidx); enum memmap_context { MEMMAP_EARLY, MEMMAP_HOTPLUG, @@ -876,7 +871,7 @@ extern int movable_zone; #ifdef CONFIG_HIGHMEM static inline int zone_movable_is_highmem(void) { -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP +#ifdef CONFIG_NEED_MULTIPLE_NODES return movable_zone == ZONE_HIGHMEM; #else return (ZONE_MOVABLE - 1) == ZONE_HIGHMEM; @@ -1079,15 +1074,6 @@ static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist, #include <asm/sparsemem.h> #endif -#if !defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID) && \ - !defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) -static inline unsigned long early_pfn_to_nid(unsigned long pfn) -{ - BUILD_BUG_ON(IS_ENABLED(CONFIG_NUMA)); - return 0; -} -#endif - #ifdef CONFIG_FLATMEM #define pfn_to_nid(pfn) (0) #endif diff --git a/include/linux/padata.h b/include/linux/padata.h index 693cae9bfe66..7302efff5e65 100644 --- a/include/linux/padata.h +++ b/include/linux/padata.h @@ -4,6 +4,9 @@ * * Copyright (C) 2008, 2009 secunet Security Networks AG * Copyright (C) 2008, 2009 Steffen Klassert <steffen.klassert@secunet.com> + * + * Copyright (c) 2020 Oracle and/or its affiliates. + * Author: Daniel Jordan <daniel.m.jordan@oracle.com> */ #ifndef PADATA_H @@ -24,7 +27,6 @@ * @list: List entry, to attach to the padata lists. * @pd: Pointer to the internal control structure. * @cb_cpu: Callback cpu for serializatioon. - * @cpu: Cpu for parallelization. * @seq_nr: Sequence number of the parallelized data object. * @info: Used to pass information from the parallel to the serial function. * @parallel: Parallel execution function. @@ -34,7 +36,6 @@ struct padata_priv { struct list_head list; struct parallel_data *pd; int cb_cpu; - int cpu; unsigned int seq_nr; int info; void (*parallel)(struct padata_priv *padata); @@ -68,15 +69,11 @@ struct padata_serial_queue { /** * struct padata_parallel_queue - The percpu padata parallel queue * - * @parallel: List to wait for parallelization. * @reorder: List to wait for reordering after parallel processing. - * @work: work struct for parallelization. * @num_obj: Number of objects that are processed by this cpu. */ struct padata_parallel_queue { - struct padata_list parallel; struct padata_list reorder; - struct work_struct work; atomic_t num_obj; }; @@ -111,7 +108,7 @@ struct parallel_data { struct padata_parallel_queue __percpu *pqueue; struct padata_serial_queue __percpu *squeue; atomic_t refcnt; - atomic_t seq_nr; + unsigned int seq_nr; unsigned int processed; int cpu; struct padata_cpumask cpumask; @@ -137,6 +134,31 @@ struct padata_shell { }; /** + * struct padata_mt_job - represents one multithreaded job + * + * @thread_fn: Called for each chunk of work that a padata thread does. + * @fn_arg: The thread function argument. + * @start: The start of the job (units are job-specific). + * @size: size of this node's work (units are job-specific). + * @align: Ranges passed to the thread function fall on this boundary, with the + * possible exceptions of the beginning and end of the job. + * @min_chunk: The minimum chunk size in job-specific units. This allows + * the client to communicate the minimum amount of work that's + * appropriate for one worker thread to do at once. + * @max_threads: Max threads to use for the job, actual number may be less + * depending on task size and minimum chunk size. + */ +struct padata_mt_job { + void (*thread_fn)(unsigned long start, unsigned long end, void *arg); + void *fn_arg; + unsigned long start; + unsigned long size; + unsigned long align; + unsigned long min_chunk; + int max_threads; +}; + +/** * struct padata_instance - The overall control structure. * * @cpu_online_node: Linkage for CPU online callback. @@ -166,6 +188,12 @@ struct padata_instance { #define PADATA_INVALID 4 }; +#ifdef CONFIG_PADATA +extern void __init padata_init(void); +#else +static inline void __init padata_init(void) {} +#endif + extern struct padata_instance *padata_alloc_possible(const char *name); extern void padata_free(struct padata_instance *pinst); extern struct padata_shell *padata_alloc_shell(struct padata_instance *pinst); @@ -173,6 +201,7 @@ extern void padata_free_shell(struct padata_shell *ps); extern int padata_do_parallel(struct padata_shell *ps, struct padata_priv *padata, int *cb_cpu); extern void padata_do_serial(struct padata_priv *padata); +extern void __init padata_do_multithreaded(struct padata_mt_job *job); extern int padata_set_cpumask(struct padata_instance *pinst, int cpumask_type, cpumask_var_t cpumask); extern int padata_start(struct padata_instance *pinst); diff --git a/include/linux/string.h b/include/linux/string.h index 6dfbb2efa815..9b7a0632e87a 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -272,6 +272,31 @@ void __read_overflow3(void) __compiletime_error("detected read beyond size of ob void __write_overflow(void) __compiletime_error("detected write beyond size of object passed as 1st parameter"); #if !defined(__NO_FORTIFY) && defined(__OPTIMIZE__) && defined(CONFIG_FORTIFY_SOURCE) + +#ifdef CONFIG_KASAN +extern void *__underlying_memchr(const void *p, int c, __kernel_size_t size) __RENAME(memchr); +extern int __underlying_memcmp(const void *p, const void *q, __kernel_size_t size) __RENAME(memcmp); +extern void *__underlying_memcpy(void *p, const void *q, __kernel_size_t size) __RENAME(memcpy); +extern void *__underlying_memmove(void *p, const void *q, __kernel_size_t size) __RENAME(memmove); +extern void *__underlying_memset(void *p, int c, __kernel_size_t size) __RENAME(memset); +extern char *__underlying_strcat(char *p, const char *q) __RENAME(strcat); +extern char *__underlying_strcpy(char *p, const char *q) __RENAME(strcpy); +extern __kernel_size_t __underlying_strlen(const char *p) __RENAME(strlen); +extern char *__underlying_strncat(char *p, const char *q, __kernel_size_t count) __RENAME(strncat); +extern char *__underlying_strncpy(char *p, const char *q, __kernel_size_t size) __RENAME(strncpy); +#else +#define __underlying_memchr __builtin_memchr +#define __underlying_memcmp __builtin_memcmp +#define __underlying_memcpy __builtin_memcpy +#define __underlying_memmove __builtin_memmove +#define __underlying_memset __builtin_memset +#define __underlying_strcat __builtin_strcat +#define __underlying_strcpy __builtin_strcpy +#define __underlying_strlen __builtin_strlen +#define __underlying_strncat __builtin_strncat +#define __underlying_strncpy __builtin_strncpy +#endif + __FORTIFY_INLINE char *strncpy(char *p, const char *q, __kernel_size_t size) { size_t p_size = __builtin_object_size(p, 0); @@ -279,14 +304,14 @@ __FORTIFY_INLINE char *strncpy(char *p, const char *q, __kernel_size_t size) __write_overflow(); if (p_size < size) fortify_panic(__func__); - return __builtin_strncpy(p, q, size); + return __underlying_strncpy(p, q, size); } __FORTIFY_INLINE char *strcat(char *p, const char *q) { size_t p_size = __builtin_object_size(p, 0); if (p_size == (size_t)-1) - return __builtin_strcat(p, q); + return __underlying_strcat(p, q); if (strlcat(p, q, p_size) >= p_size) fortify_panic(__func__); return p; @@ -300,7 +325,7 @@ __FORTIFY_INLINE __kernel_size_t strlen(const char *p) /* Work around gcc excess stack consumption issue */ if (p_size == (size_t)-1 || (__builtin_constant_p(p[p_size - 1]) && p[p_size - 1] == '\0')) - return __builtin_strlen(p); + return __underlying_strlen(p); ret = strnlen(p, p_size); if (p_size <= ret) fortify_panic(__func__); @@ -333,7 +358,7 @@ __FORTIFY_INLINE size_t strlcpy(char *p, const char *q, size_t size) __write_overflow(); if (len >= p_size) fortify_panic(__func__); - __builtin_memcpy(p, q, len); + __underlying_memcpy(p, q, len); p[len] = '\0'; } return ret; @@ -346,12 +371,12 @@ __FORTIFY_INLINE char *strncat(char *p, const char *q, __kernel_size_t count) size_t p_size = __builtin_object_size(p, 0); size_t q_size = __builtin_object_size(q, 0); if (p_size == (size_t)-1 && q_size == (size_t)-1) - return __builtin_strncat(p, q, count); + return __underlying_strncat(p, q, count); p_len = strlen(p); copy_len = strnlen(q, count); if (p_size < p_len + copy_len + 1) fortify_panic(__func__); - __builtin_memcpy(p + p_len, q, copy_len); + __underlying_memcpy(p + p_len, q, copy_len); p[p_len + copy_len] = '\0'; return p; } @@ -363,7 +388,7 @@ __FORTIFY_INLINE void *memset(void *p, int c, __kernel_size_t size) __write_overflow(); if (p_size < size) fortify_panic(__func__); - return __builtin_memset(p, c, size); + return __underlying_memset(p, c, size); } __FORTIFY_INLINE void *memcpy(void *p, const void *q, __kernel_size_t size) @@ -378,7 +403,7 @@ __FORTIFY_INLINE void *memcpy(void *p, const void *q, __kernel_size_t size) } if (p_size < size || q_size < size) fortify_panic(__func__); - return __builtin_memcpy(p, q, size); + return __underlying_memcpy(p, q, size); } __FORTIFY_INLINE void *memmove(void *p, const void *q, __kernel_size_t size) @@ -393,7 +418,7 @@ __FORTIFY_INLINE void *memmove(void *p, const void *q, __kernel_size_t size) } if (p_size < size || q_size < size) fortify_panic(__func__); - return __builtin_memmove(p, q, size); + return __underlying_memmove(p, q, size); } extern void *__real_memscan(void *, int, __kernel_size_t) __RENAME(memscan); @@ -419,7 +444,7 @@ __FORTIFY_INLINE int memcmp(const void *p, const void *q, __kernel_size_t size) } if (p_size < size || q_size < size) fortify_panic(__func__); - return __builtin_memcmp(p, q, size); + return __underlying_memcmp(p, q, size); } __FORTIFY_INLINE void *memchr(const void *p, int c, __kernel_size_t size) @@ -429,7 +454,7 @@ __FORTIFY_INLINE void *memchr(const void *p, int c, __kernel_size_t size) __read_overflow(); if (p_size < size) fortify_panic(__func__); - return __builtin_memchr(p, c, size); + return __underlying_memchr(p, c, size); } void *__real_memchr_inv(const void *s, int c, size_t n) __RENAME(memchr_inv); @@ -460,11 +485,22 @@ __FORTIFY_INLINE char *strcpy(char *p, const char *q) size_t p_size = __builtin_object_size(p, 0); size_t q_size = __builtin_object_size(q, 0); if (p_size == (size_t)-1 && q_size == (size_t)-1) - return __builtin_strcpy(p, q); + return __underlying_strcpy(p, q); memcpy(p, q, strlen(q) + 1); return p; } +/* Don't use these outside the FORITFY_SOURCE implementation */ +#undef __underlying_memchr +#undef __underlying_memcmp +#undef __underlying_memcpy +#undef __underlying_memmove +#undef __underlying_memset +#undef __underlying_strcat +#undef __underlying_strcpy +#undef __underlying_strlen +#undef __underlying_strncat +#undef __underlying_strncpy #endif /** diff --git a/include/linux/swap.h b/include/linux/swap.h index e92176fc8824..4c5974bb9ba9 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -334,9 +334,10 @@ extern unsigned long nr_free_pagecache_pages(void); /* linux/mm/swap.c */ +extern void lru_note_cost(struct lruvec *lruvec, bool file, + unsigned int nr_pages); +extern void lru_note_cost_page(struct page *); extern void lru_cache_add(struct page *); -extern void lru_cache_add_anon(struct page *page); -extern void lru_cache_add_file(struct page *page); extern void lru_add_page_tail(struct page *page, struct page *page_tail, struct lruvec *lruvec, struct list_head *head); extern void activate_page(struct page *); @@ -651,11 +652,9 @@ static inline int mem_cgroup_swappiness(struct mem_cgroup *mem) #endif #if defined(CONFIG_SWAP) && defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP) -extern void mem_cgroup_throttle_swaprate(struct mem_cgroup *memcg, int node, - gfp_t gfp_mask); +extern void cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask); #else -static inline void mem_cgroup_throttle_swaprate(struct mem_cgroup *memcg, - int node, gfp_t gfp_mask) +static inline void cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask) { } #endif diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index ffef0f279747..24fc7c3ae7d6 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -35,6 +35,10 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, PGSCAN_KSWAPD, PGSCAN_DIRECT, PGSCAN_DIRECT_THROTTLE, + PGSCAN_ANON, + PGSCAN_FILE, + PGSTEAL_ANON, + PGSTEAL_FILE, #ifdef CONFIG_NUMA PGSCAN_ZONE_RECLAIM_FAILED, #endif diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index cb507151710f..aa961088c551 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -26,9 +26,11 @@ struct reclaim_stat { unsigned nr_congested; unsigned nr_writeback; unsigned nr_immediate; + unsigned nr_pageout; unsigned nr_activate[2]; unsigned nr_ref_keep; unsigned nr_unmap_fail; + unsigned nr_lazyfree_fail; }; enum writeback_stat_item { diff --git a/include/trace/events/compaction.h b/include/trace/events/compaction.h index e5bf6ee4e814..54e5bf081171 100644 --- a/include/trace/events/compaction.h +++ b/include/trace/events/compaction.h @@ -314,40 +314,44 @@ TRACE_EVENT(mm_compaction_kcompactd_sleep, DECLARE_EVENT_CLASS(kcompactd_wake_template, - TP_PROTO(int nid, int order, enum zone_type classzone_idx), + TP_PROTO(int nid, int order, enum zone_type highest_zoneidx), - TP_ARGS(nid, order, classzone_idx), + TP_ARGS(nid, order, highest_zoneidx), TP_STRUCT__entry( __field(int, nid) __field(int, order) - __field(enum zone_type, classzone_idx) + __field(enum zone_type, highest_zoneidx) ), TP_fast_assign( __entry->nid = nid; __entry->order = order; - __entry->classzone_idx = classzone_idx; + __entry->highest_zoneidx = highest_zoneidx; ), + /* + * classzone_idx is previous name of the highest_zoneidx. + * Reason not to change it is the ABI requirement of the tracepoint. + */ TP_printk("nid=%d order=%d classzone_idx=%-8s", __entry->nid, __entry->order, - __print_symbolic(__entry->classzone_idx, ZONE_TYPE)) + __print_symbolic(__entry->highest_zoneidx, ZONE_TYPE)) ); DEFINE_EVENT(kcompactd_wake_template, mm_compaction_wakeup_kcompactd, - TP_PROTO(int nid, int order, enum zone_type classzone_idx), + TP_PROTO(int nid, int order, enum zone_type highest_zoneidx), - TP_ARGS(nid, order, classzone_idx) + TP_ARGS(nid, order, highest_zoneidx) ); DEFINE_EVENT(kcompactd_wake_template, mm_compaction_kcompactd_wake, - TP_PROTO(int nid, int order, enum zone_type classzone_idx), + TP_PROTO(int nid, int order, enum zone_type highest_zoneidx), - TP_ARGS(nid, order, classzone_idx) + TP_ARGS(nid, order, highest_zoneidx) ); #endif diff --git a/include/trace/events/huge_memory.h b/include/trace/events/huge_memory.h index 70e32ff096ec..4fdb14a81108 100644 --- a/include/trace/events/huge_memory.h +++ b/include/trace/events/huge_memory.h @@ -12,6 +12,8 @@ EM( SCAN_SUCCEED, "succeeded") \ EM( SCAN_PMD_NULL, "pmd_null") \ EM( SCAN_EXCEED_NONE_PTE, "exceed_none_pte") \ + EM( SCAN_EXCEED_SWAP_PTE, "exceed_swap_pte") \ + EM( SCAN_EXCEED_SHARED_PTE, "exceed_shared_pte") \ EM( SCAN_PTE_NON_PRESENT, "pte_non_present") \ EM( SCAN_PTE_UFFD_WP, "pte_uffd_wp") \ EM( SCAN_PAGE_RO, "no_writable_page") \ @@ -31,7 +33,6 @@ EM( SCAN_DEL_PAGE_LRU, "could_not_delete_page_from_lru")\ EM( SCAN_ALLOC_HUGE_PAGE_FAIL, "alloc_huge_page_failed") \ EM( SCAN_CGROUP_CHARGE_FAIL, "ccgroup_charge_failed") \ - EM( SCAN_EXCEED_SWAP_PTE, "exceed_swap_pte") \ EM( SCAN_TRUNCATED, "truncated") \ EMe(SCAN_PAGE_HAS_PRIVATE, "page_has_private") \ diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h index 74bb594ccb25..2070df64958e 100644 --- a/include/trace/events/vmscan.h +++ b/include/trace/events/vmscan.h @@ -265,7 +265,7 @@ TRACE_EVENT(mm_shrink_slab_end, ); TRACE_EVENT(mm_vmscan_lru_isolate, - TP_PROTO(int classzone_idx, + TP_PROTO(int highest_zoneidx, int order, unsigned long nr_requested, unsigned long nr_scanned, @@ -274,10 +274,10 @@ TRACE_EVENT(mm_vmscan_lru_isolate, isolate_mode_t isolate_mode, int lru), - TP_ARGS(classzone_idx, order, nr_requested, nr_scanned, nr_skipped, nr_taken, isolate_mode, lru), + TP_ARGS(highest_zoneidx, order, nr_requested, nr_scanned, nr_skipped, nr_taken, isolate_mode, lru), TP_STRUCT__entry( - __field(int, classzone_idx) + __field(int, highest_zoneidx) __field(int, order) __field(unsigned long, nr_requested) __field(unsigned long, nr_scanned) @@ -288,7 +288,7 @@ TRACE_EVENT(mm_vmscan_lru_isolate, ), TP_fast_assign( - __entry->classzone_idx = classzone_idx; + __entry->highest_zoneidx = highest_zoneidx; __entry->order = order; __entry->nr_requested = nr_requested; __entry->nr_scanned = nr_scanned; @@ -298,9 +298,13 @@ TRACE_EVENT(mm_vmscan_lru_isolate, __entry->lru = lru; ), + /* + * classzone is previous name of the highest_zoneidx. + * Reason not to change it is the ABI requirement of the tracepoint. + */ TP_printk("isolate_mode=%d classzone=%d order=%d nr_requested=%lu nr_scanned=%lu nr_skipped=%lu nr_taken=%lu lru=%s", __entry->isolate_mode, - __entry->classzone_idx, + __entry->highest_zoneidx, __entry->order, __entry->nr_requested, __entry->nr_scanned, |