From b2770da6425406cf3f6d3fddbf9086b1db0106a1 Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Wed, 6 Sep 2017 16:18:35 -0700 Subject: mm: add vm_insert_mixed_mkwrite() When servicing mmap() reads from file holes the current DAX code allocates a page cache page of all zeroes and places the struct page pointer in the mapping->page_tree radix tree. This has three major drawbacks: 1) It consumes memory unnecessarily. For every 4k page that is read via a DAX mmap() over a hole, we allocate a new page cache page. This means that if you read 1GiB worth of pages, you end up using 1GiB of zeroed memory. 2) It is slower than using a common zero page because each page fault has more work to do. Instead of just inserting a common zero page we have to allocate a page cache page, zero it, and then insert it. 3) The fact that we had to check for both DAX exceptional entries and for page cache pages in the radix tree made the DAX code more complex. This series solves these issues by following the lead of the DAX PMD code and using a common 4k zero page instead. This reduces memory usage and decreases latencies for some workloads, and it simplifies the DAX code, removing over 100 lines in total. This patch (of 5): To be able to use the common 4k zero page in DAX we need to have our PTE fault path look more like our PMD fault path where a PTE entry can be marked as dirty and writeable as it is first inserted rather than waiting for a follow-up dax_pfn_mkwrite() => finish_mkwrite_fault() call. Right now we can rely on having a dax_pfn_mkwrite() call because we can distinguish between these two cases in do_wp_page(): case 1: 4k zero page => writable DAX storage case 2: read-only DAX storage => writeable DAX storage This distinction is made by via vm_normal_page(). vm_normal_page() returns false for the common 4k zero page, though, just as it does for DAX ptes. Instead of special casing the DAX + 4k zero page case we will simplify our DAX PTE page fault sequence so that it matches our DAX PMD sequence, and get rid of the dax_pfn_mkwrite() helper. We will instead use dax_iomap_fault() to handle write-protection faults. This means that insert_pfn() needs to follow the lead of insert_pfn_pmd() and allow us to pass in a 'mkwrite' flag. If 'mkwrite' is set insert_pfn() will do the work that was previously done by wp_page_reuse() as part of the dax_pfn_mkwrite() call path. Link: http://lkml.kernel.org/r/20170724170616.25810-2-ross.zwisler@linux.intel.com Signed-off-by: Ross Zwisler Reviewed-by: Jan Kara Acked-by: Kirill A. Shutemov Cc: "Darrick J. Wong" Cc: "Theodore Ts'o" Cc: Alexander Viro Cc: Andreas Dilger Cc: Christoph Hellwig Cc: Dan Williams Cc: Dave Chinner Cc: Ingo Molnar Cc: Jonathan Corbet Cc: Matthew Wilcox Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 50 +++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 43 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 56e48e4593cb..71c0b6f98a62 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1676,7 +1676,7 @@ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, EXPORT_SYMBOL(vm_insert_page); static int insert_pfn(struct vm_area_struct *vma, unsigned long addr, - pfn_t pfn, pgprot_t prot) + pfn_t pfn, pgprot_t prot, bool mkwrite) { struct mm_struct *mm = vma->vm_mm; int retval; @@ -1688,14 +1688,35 @@ static int insert_pfn(struct vm_area_struct *vma, unsigned long addr, if (!pte) goto out; retval = -EBUSY; - if (!pte_none(*pte)) - goto out_unlock; + if (!pte_none(*pte)) { + if (mkwrite) { + /* + * For read faults on private mappings the PFN passed + * in may not match the PFN we have mapped if the + * mapped PFN is a writeable COW page. In the mkwrite + * case we are creating a writable PTE for a shared + * mapping and we expect the PFNs to match. + */ + if (WARN_ON_ONCE(pte_pfn(*pte) != pfn_t_to_pfn(pfn))) + goto out_unlock; + entry = *pte; + goto out_mkwrite; + } else + goto out_unlock; + } /* Ok, finally just insert the thing.. */ if (pfn_t_devmap(pfn)) entry = pte_mkdevmap(pfn_t_pte(pfn, prot)); else entry = pte_mkspecial(pfn_t_pte(pfn, prot)); + +out_mkwrite: + if (mkwrite) { + entry = pte_mkyoung(entry); + entry = maybe_mkwrite(pte_mkdirty(entry), vma); + } + set_pte_at(mm, addr, pte, entry); update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? */ @@ -1766,14 +1787,15 @@ int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV)); - ret = insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot); + ret = insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot, + false); return ret; } EXPORT_SYMBOL(vm_insert_pfn_prot); -int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, - pfn_t pfn) +static int __vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, + pfn_t pfn, bool mkwrite) { pgprot_t pgprot = vma->vm_page_prot; @@ -1802,10 +1824,24 @@ int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, page = pfn_to_page(pfn_t_to_pfn(pfn)); return insert_page(vma, addr, page, pgprot); } - return insert_pfn(vma, addr, pfn, pgprot); + return insert_pfn(vma, addr, pfn, pgprot, mkwrite); +} + +int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, + pfn_t pfn) +{ + return __vm_insert_mixed(vma, addr, pfn, false); + } EXPORT_SYMBOL(vm_insert_mixed); +int vm_insert_mixed_mkwrite(struct vm_area_struct *vma, unsigned long addr, + pfn_t pfn) +{ + return __vm_insert_mixed(vma, addr, pfn, true); +} +EXPORT_SYMBOL(vm_insert_mixed_mkwrite); + /* * maps a range of physical memory into the requested pages. the old * mappings are removed. any references to nonexistent pages results -- cgit v1.2.3 From d01ad197ac3b50a99ea668697acefe12e73c5fea Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Wed, 6 Sep 2017 16:18:47 -0700 Subject: dax: remove DAX code from page_cache_tree_insert() Now that we no longer insert struct page pointers in DAX radix trees we can remove the special casing for DAX in page_cache_tree_insert(). This also allows us to make dax_wake_mapping_entry_waiter() local to fs/dax.c, removing it from dax.h. Link: http://lkml.kernel.org/r/20170724170616.25810-5-ross.zwisler@linux.intel.com Signed-off-by: Ross Zwisler Suggested-by: Jan Kara Reviewed-by: Jan Kara Cc: "Darrick J. Wong" Cc: "Theodore Ts'o" Cc: Alexander Viro Cc: Andreas Dilger Cc: Christoph Hellwig Cc: Dan Williams Cc: Dave Chinner Cc: Ingo Molnar Cc: Jonathan Corbet Cc: Matthew Wilcox Cc: Steven Rostedt Cc: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/dax.c | 2 +- include/linux/dax.h | 2 -- mm/filemap.c | 13 ++----------- 3 files changed, 3 insertions(+), 14 deletions(-) (limited to 'mm') diff --git a/fs/dax.c b/fs/dax.c index ab67ae30ccbf..8f70e3b59b91 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -127,7 +127,7 @@ static int wake_exceptional_entry_func(wait_queue_entry_t *wait, unsigned int mo * correct waitqueue where tasks might be waiting for that old 'entry' and * wake them. */ -void dax_wake_mapping_entry_waiter(struct address_space *mapping, +static void dax_wake_mapping_entry_waiter(struct address_space *mapping, pgoff_t index, void *entry, bool wake_all) { struct exceptional_entry_key key; diff --git a/include/linux/dax.h b/include/linux/dax.h index b3518559f0da..319e0d997446 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -123,8 +123,6 @@ int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size, int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index); int dax_invalidate_mapping_entry_sync(struct address_space *mapping, pgoff_t index); -void dax_wake_mapping_entry_waiter(struct address_space *mapping, - pgoff_t index, void *entry, bool wake_all); #ifdef CONFIG_FS_DAX int __dax_zero_page_range(struct block_device *bdev, diff --git a/mm/filemap.c b/mm/filemap.c index 65b4b6e7f7bd..dad935769055 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -130,17 +130,8 @@ static int page_cache_tree_insert(struct address_space *mapping, return -EEXIST; mapping->nrexceptional--; - if (!dax_mapping(mapping)) { - if (shadowp) - *shadowp = p; - } else { - /* DAX can replace empty locked entry with a hole */ - WARN_ON_ONCE(p != - dax_radix_locked_entry(0, RADIX_DAX_EMPTY)); - /* Wakeup waiters for exceptional entry lock */ - dax_wake_mapping_entry_waiter(mapping, page->index, p, - true); - } + if (shadowp) + *shadowp = p; } __radix_tree_replace(&mapping->page_tree, node, slot, page, workingset_update_node, mapping); -- cgit v1.2.3 From ea37df54d2b7950d607800ee417a1d59b95068c2 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Wed, 6 Sep 2017 16:19:15 -0700 Subject: slub: tidy up initialization ordering - free_kmem_cache_nodes() frees the cache node before nulling out a reference to it - init_kmem_cache_nodes() publishes the cache node before initializing it Neither of these matter at runtime because the cache nodes cannot be looked up by any other thread. But it's neater and more consistent to reorder these. Link: http://lkml.kernel.org/r/20170707083408.40410-1-glider@google.com Signed-off-by: Alexander Potapenko Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index e8b4e31162ca..3e90d791dd41 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3358,8 +3358,8 @@ static void free_kmem_cache_nodes(struct kmem_cache *s) struct kmem_cache_node *n; for_each_kmem_cache_node(s, node, n) { - kmem_cache_free(kmem_cache_node, n); s->node[node] = NULL; + kmem_cache_free(kmem_cache_node, n); } } @@ -3389,8 +3389,8 @@ static int init_kmem_cache_nodes(struct kmem_cache *s) return 0; } - s->node[node] = n; init_kmem_cache_node(n); + s->node[node] = n; } return 1; } -- cgit v1.2.3 From 2482ddec670fb83717d129012bc558777cb159f7 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 6 Sep 2017 16:19:18 -0700 Subject: mm: add SLUB free list pointer obfuscation This SLUB free list pointer obfuscation code is modified from Brad Spengler/PaX Team's code in the last public patch of grsecurity/PaX based on my understanding of the code. Changes or omissions from the original code are mine and don't reflect the original grsecurity/PaX code. This adds a per-cache random value to SLUB caches that is XORed with their freelist pointer address and value. This adds nearly zero overhead and frustrates the very common heap overflow exploitation method of overwriting freelist pointers. A recent example of the attack is written up here: http://cyseclabs.com/blog/cve-2016-6187-heap-off-by-one-exploit and there is a section dedicated to the technique the book "A Guide to Kernel Exploitation: Attacking the Core". This is based on patches by Daniel Micay, and refactored to minimize the use of #ifdef. With 200-count cycles of "hackbench -g 20 -l 1000" I saw the following run times: before: mean 10.11882499999999999995 variance .03320378329145728642 stdev .18221905304181911048 after: mean 10.12654000000000000014 variance .04700556623115577889 stdev .21680767106160192064 The difference gets lost in the noise, but if the above is to be taken literally, using CONFIG_FREELIST_HARDENED is 0.07% slower. Link: http://lkml.kernel.org/r/20170802180609.GA66807@beast Signed-off-by: Kees Cook Suggested-by: Daniel Micay Cc: Rik van Riel Cc: Tycho Andersen Cc: Alexander Popov Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slub_def.h | 4 ++++ init/Kconfig | 9 +++++++++ mm/slub.c | 42 +++++++++++++++++++++++++++++++++++++----- 3 files changed, 50 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index cc0faf3a90be..0783b622311e 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -115,6 +115,10 @@ struct kmem_cache { #endif #endif +#ifdef CONFIG_SLAB_FREELIST_HARDENED + unsigned long random; +#endif + #ifdef CONFIG_NUMA /* * Defragmentation by allocating from a remote node. diff --git a/init/Kconfig b/init/Kconfig index 5f0ef850e808..78cb2461012e 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1576,6 +1576,15 @@ config SLAB_FREELIST_RANDOM security feature reduces the predictability of the kernel slab allocator against heap overflows. +config SLAB_FREELIST_HARDENED + bool "Harden slab freelist metadata" + depends on SLUB + help + Many kernel heap attacks try to target slab cache metadata and + other infrastructure. This options makes minor performance + sacrifies to harden the kernel slab allocator against common + freelist exploit methods. + config SLUB_CPU_PARTIAL default y depends on SLUB && SMP diff --git a/mm/slub.c b/mm/slub.c index 3e90d791dd41..6c87c2c6af24 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -34,6 +34,7 @@ #include #include #include +#include #include @@ -238,30 +239,58 @@ static inline void stat(const struct kmem_cache *s, enum stat_item si) * Core slab cache functions *******************************************************************/ +/* + * Returns freelist pointer (ptr). With hardening, this is obfuscated + * with an XOR of the address where the pointer is held and a per-cache + * random number. + */ +static inline void *freelist_ptr(const struct kmem_cache *s, void *ptr, + unsigned long ptr_addr) +{ +#ifdef CONFIG_SLAB_FREELIST_HARDENED + return (void *)((unsigned long)ptr ^ s->random ^ ptr_addr); +#else + return ptr; +#endif +} + +/* Returns the freelist pointer recorded at location ptr_addr. */ +static inline void *freelist_dereference(const struct kmem_cache *s, + void *ptr_addr) +{ + return freelist_ptr(s, (void *)*(unsigned long *)(ptr_addr), + (unsigned long)ptr_addr); +} + static inline void *get_freepointer(struct kmem_cache *s, void *object) { - return *(void **)(object + s->offset); + return freelist_dereference(s, object + s->offset); } static void prefetch_freepointer(const struct kmem_cache *s, void *object) { - prefetch(object + s->offset); + if (object) + prefetch(freelist_dereference(s, object + s->offset)); } static inline void *get_freepointer_safe(struct kmem_cache *s, void *object) { + unsigned long freepointer_addr; void *p; if (!debug_pagealloc_enabled()) return get_freepointer(s, object); - probe_kernel_read(&p, (void **)(object + s->offset), sizeof(p)); - return p; + freepointer_addr = (unsigned long)object + s->offset; + probe_kernel_read(&p, (void **)freepointer_addr, sizeof(p)); + return freelist_ptr(s, p, freepointer_addr); } static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp) { - *(void **)(object + s->offset) = fp; + unsigned long freeptr_addr = (unsigned long)object + s->offset; + + *(void **)freeptr_addr = freelist_ptr(s, fp, freeptr_addr); } /* Loop over all objects in a slab */ @@ -3563,6 +3592,9 @@ static int kmem_cache_open(struct kmem_cache *s, unsigned long flags) { s->flags = kmem_cache_flags(s->size, flags, s->name, s->ctor); s->reserved = 0; +#ifdef CONFIG_SLAB_FREELIST_HARDENED + s->random = get_random_long(); +#endif if (need_reserve_slab_rcu && (s->flags & SLAB_TYPESAFE_BY_RCU)) s->reserved = sizeof(struct rcu_head); -- cgit v1.2.3 From ce6fa91b93630396ca220c33dd38ffc62686d499 Mon Sep 17 00:00:00 2001 From: Alexander Popov Date: Wed, 6 Sep 2017 16:19:22 -0700 Subject: mm/slub.c: add a naive detection of double free or corruption Add an assertion similar to "fasttop" check in GNU C Library allocator as a part of SLAB_FREELIST_HARDENED feature. An object added to a singly linked freelist should not point to itself. That helps to detect some double free errors (e.g. CVE-2017-2636) without slub_debug and KASAN. Link: http://lkml.kernel.org/r/1502468246-1262-1-git-send-email-alex.popov@linux.com Signed-off-by: Alexander Popov Acked-by: Christoph Lameter Cc: Kees Cook Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Paul E McKenney Cc: Ingo Molnar Cc: Tejun Heo Cc: Andy Lutomirski Cc: Nicolas Pitre Cc: Rik van Riel Cc: Tycho Andersen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 6c87c2c6af24..16a60f871f39 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -290,6 +290,10 @@ static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp) { unsigned long freeptr_addr = (unsigned long)object + s->offset; +#ifdef CONFIG_SLAB_FREELIST_HARDENED + BUG_ON(object == fp); /* naive detection of double free or corruption */ +#endif + *(void **)freeptr_addr = freelist_ptr(s, fp, freeptr_addr); } -- cgit v1.2.3 From d460acb5bdffc19b492b70b8f416c24dc03c474e Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Wed, 6 Sep 2017 16:19:26 -0700 Subject: mm: track actual nr_scanned during shrink_slab() Some shrinkers may only be able to free a bunch of objects at a time, and so free more than the requested nr_to_scan in one pass. Whilst other shrinkers may find themselves even unable to scan as many objects as they counted, and so underreport. Account for the extra freed/scanned objects against the total number of objects we intend to scan, otherwise we may end up penalising the slab far more than intended. Similarly, we want to add the underperforming scan to the deferred pass so that we try harder and harder in future passes. Link: http://lkml.kernel.org/r/20170822135325.9191-1-chris@chris-wilson.co.uk Signed-off-by: Chris Wilson Cc: Michal Hocko Cc: Johannes Weiner Cc: Hillf Danton Cc: Minchan Kim Cc: Vlastimil Babka Cc: Mel Gorman Cc: Shaohua Li Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Joonas Lahtinen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/shrinker.h | 7 +++++++ mm/vmscan.c | 7 ++++--- 2 files changed, 11 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h index 4fcacd915d45..51d189615bda 100644 --- a/include/linux/shrinker.h +++ b/include/linux/shrinker.h @@ -18,6 +18,13 @@ struct shrink_control { */ unsigned long nr_to_scan; + /* + * How many objects did scan_objects process? + * This defaults to nr_to_scan before every call, but the callee + * should track its actual progress. + */ + unsigned long nr_scanned; + /* current node being shrunk (for NUMA aware shrinkers) */ int nid; diff --git a/mm/vmscan.c b/mm/vmscan.c index f957afe900ec..095817820a56 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -393,14 +393,15 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, unsigned long nr_to_scan = min(batch_size, total_scan); shrinkctl->nr_to_scan = nr_to_scan; + shrinkctl->nr_scanned = nr_to_scan; ret = shrinker->scan_objects(shrinker, shrinkctl); if (ret == SHRINK_STOP) break; freed += ret; - count_vm_events(SLABS_SCANNED, nr_to_scan); - total_scan -= nr_to_scan; - scanned += nr_to_scan; + count_vm_events(SLABS_SCANNED, shrinkctl->nr_scanned); + total_scan -= shrinkctl->nr_scanned; + scanned += shrinkctl->nr_scanned; cond_resched(); } -- cgit v1.2.3 From c11525830f92dea58353ace0f074ece5d9ef37c8 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Wed, 6 Sep 2017 16:19:33 -0700 Subject: mm/memory_hotplug: just build zonelist for newly added node Commit 9adb62a5df9c ("mm/hotplug: correctly setup fallback zonelists when creating new pgdat") tries to build the correct zonelist for a newly added node, while it is not necessary to rebuild it for already exist nodes. In build_zonelists(), it will iterate on nodes with memory. For a newly added node, it will have memory until node_states_set_node() is called in online_pages(). This patch avoids rebuilding the zonelists for already existing nodes. build_zonelists_node() uses managed_zone(zone) checks, so it should not include empty zones anyway. So effectively we avoid some pointless work under stop_machine(). [akpm@linux-foundation.org: tweak comment text] [akpm@linux-foundation.org: coding-style tweak, per Vlastimil] Link: http://lkml.kernel.org/r/20170626035822.50155-1-richard.weiyang@gmail.com Signed-off-by: Wei Yang Acked-by: Michal Hocko Acked-by: Vlastimil Babka Cc: Jiang Liu Cc: Xishi Qiu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 9327a940e373..dcc8a1cf55b6 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5282,14 +5282,18 @@ static int __build_all_zonelists(void *data) memset(node_load, 0, sizeof(node_load)); #endif + /* + * This node is hotadded and no memory is yet present. So just + * building zonelists is fine - no need to touch other nodes. + */ if (self && !node_online(self->node_id)) { build_zonelists(self); - } - - for_each_online_node(nid) { - pg_data_t *pgdat = NODE_DATA(nid); + } else { + for_each_online_node(nid) { + pg_data_t *pgdat = NODE_DATA(nid); - build_zonelists(pgdat); + build_zonelists(pgdat); + } } /* -- cgit v1.2.3 From e5e68930263377c6d4f6da0ff06f36b55d83a83f Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 6 Sep 2017 16:19:37 -0700 Subject: mm, memory_hotplug: display allowed zones in the preferred ordering Prior to commit f1dd2cd13c4b ("mm, memory_hotplug: do not associate hotadded memory to zones until online") we used to allow to change the valid zone types of a memory block if it is adjacent to a different zone type. This fact was reflected in memoryNN/valid_zones by the ordering of printed zones. The first one was default (echo online > memoryNN/state) and the other one could be onlined explicitly by online_{movable,kernel}. This behavior was removed by the said patch and as such the ordering was not all that important. In most cases a kernel zone would be default anyway. The only exception is movable_node handled by "mm, memory_hotplug: support movable_node for hotpluggable nodes". Let's reintroduce this behavior again because later patch will remove the zone overlap restriction and so user will be allowed to online kernel resp. movable block regardless of its placement. Original behavior will then become significant again because it would be non-trivial for users to see what is the default zone to online into. Implementation is really simple. Pull out zone selection out of move_pfn_range into zone_for_pfn_range helper and use it in show_valid_zones to display the zone for default onlining and then both kernel and movable if they are allowed. Default online zone is not duplicated. Link: http://lkml.kernel.org/r/20170714121233.16861-2-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Joonsoo Kim Acked-by: Vlastimil Babka Cc: Mel Gorman Cc: Andrea Arcangeli Cc: Reza Arbab Cc: Yasuaki Ishimatsu Cc: Xishi Qiu Cc: Kani Toshimitsu Cc: Cc: Daniel Kiper Cc: Igor Mammedov Cc: Vitaly Kuznetsov Cc: Wei Yang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/memory.c | 33 +++++++++++++------ include/linux/memory_hotplug.h | 2 +- mm/memory_hotplug.c | 73 ++++++++++++++++++++++++------------------ 3 files changed, 65 insertions(+), 43 deletions(-) (limited to 'mm') diff --git a/drivers/base/memory.c b/drivers/base/memory.c index c7c4e0325cdb..26383af9900c 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -388,6 +388,22 @@ static ssize_t show_phys_device(struct device *dev, } #ifdef CONFIG_MEMORY_HOTREMOVE +static void print_allowed_zone(char *buf, int nid, unsigned long start_pfn, + unsigned long nr_pages, int online_type, + struct zone *default_zone) +{ + struct zone *zone; + + if (!allow_online_pfn_range(nid, start_pfn, nr_pages, online_type)) + return; + + zone = zone_for_pfn_range(online_type, nid, start_pfn, nr_pages); + if (zone != default_zone) { + strcat(buf, " "); + strcat(buf, zone->name); + } +} + static ssize_t show_valid_zones(struct device *dev, struct device_attribute *attr, char *buf) { @@ -395,7 +411,7 @@ static ssize_t show_valid_zones(struct device *dev, unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr); unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block; unsigned long valid_start_pfn, valid_end_pfn; - bool append = false; + struct zone *default_zone; int nid; /* @@ -418,16 +434,13 @@ static ssize_t show_valid_zones(struct device *dev, } nid = pfn_to_nid(start_pfn); - if (allow_online_pfn_range(nid, start_pfn, nr_pages, MMOP_ONLINE_KERNEL)) { - strcat(buf, default_zone_for_pfn(nid, start_pfn, nr_pages)->name); - append = true; - } + default_zone = zone_for_pfn_range(MMOP_ONLINE_KEEP, nid, start_pfn, nr_pages); + strcat(buf, default_zone->name); - if (allow_online_pfn_range(nid, start_pfn, nr_pages, MMOP_ONLINE_MOVABLE)) { - if (append) - strcat(buf, " "); - strcat(buf, NODE_DATA(nid)->node_zones[ZONE_MOVABLE].name); - } + print_allowed_zone(buf, nid, start_pfn, nr_pages, MMOP_ONLINE_KERNEL, + default_zone); + print_allowed_zone(buf, nid, start_pfn, nr_pages, MMOP_ONLINE_MOVABLE, + default_zone); out: strcat(buf, "\n"); diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index c8a5056a5ae0..5e6e4cc36ff4 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -319,6 +319,6 @@ extern struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pnum); extern bool allow_online_pfn_range(int nid, unsigned long pfn, unsigned long nr_pages, int online_type); -extern struct zone *default_zone_for_pfn(int nid, unsigned long pfn, +extern struct zone *zone_for_pfn_range(int online_type, int nid, unsigned start_pfn, unsigned long nr_pages); #endif /* __LINUX_MEMORY_HOTPLUG_H */ diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 8dccc317aac2..e342624622a1 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -773,31 +773,6 @@ static void node_states_set_node(int node, struct memory_notify *arg) node_set_state(node, N_MEMORY); } -bool allow_online_pfn_range(int nid, unsigned long pfn, unsigned long nr_pages, int online_type) -{ - struct pglist_data *pgdat = NODE_DATA(nid); - struct zone *movable_zone = &pgdat->node_zones[ZONE_MOVABLE]; - struct zone *default_zone = default_zone_for_pfn(nid, pfn, nr_pages); - - /* - * TODO there shouldn't be any inherent reason to have ZONE_NORMAL - * physically before ZONE_MOVABLE. All we need is they do not - * overlap. Historically we didn't allow ZONE_NORMAL after ZONE_MOVABLE - * though so let's stick with it for simplicity for now. - * TODO make sure we do not overlap with ZONE_DEVICE - */ - if (online_type == MMOP_ONLINE_KERNEL) { - if (zone_is_empty(movable_zone)) - return true; - return movable_zone->zone_start_pfn >= pfn + nr_pages; - } else if (online_type == MMOP_ONLINE_MOVABLE) { - return zone_end_pfn(default_zone) <= pfn; - } - - /* MMOP_ONLINE_KEEP will always succeed and inherits the current zone */ - return online_type == MMOP_ONLINE_KEEP; -} - static void __meminit resize_zone_range(struct zone *zone, unsigned long start_pfn, unsigned long nr_pages) { @@ -856,7 +831,7 @@ void __ref move_pfn_range_to_zone(struct zone *zone, * If no kernel zone covers this pfn range it will automatically go * to the ZONE_NORMAL. */ -struct zone *default_zone_for_pfn(int nid, unsigned long start_pfn, +static struct zone *default_zone_for_pfn(int nid, unsigned long start_pfn, unsigned long nr_pages) { struct pglist_data *pgdat = NODE_DATA(nid); @@ -872,6 +847,31 @@ struct zone *default_zone_for_pfn(int nid, unsigned long start_pfn, return &pgdat->node_zones[ZONE_NORMAL]; } +bool allow_online_pfn_range(int nid, unsigned long pfn, unsigned long nr_pages, int online_type) +{ + struct pglist_data *pgdat = NODE_DATA(nid); + struct zone *movable_zone = &pgdat->node_zones[ZONE_MOVABLE]; + struct zone *default_zone = default_zone_for_pfn(nid, pfn, nr_pages); + + /* + * TODO there shouldn't be any inherent reason to have ZONE_NORMAL + * physically before ZONE_MOVABLE. All we need is they do not + * overlap. Historically we didn't allow ZONE_NORMAL after ZONE_MOVABLE + * though so let's stick with it for simplicity for now. + * TODO make sure we do not overlap with ZONE_DEVICE + */ + if (online_type == MMOP_ONLINE_KERNEL) { + if (zone_is_empty(movable_zone)) + return true; + return movable_zone->zone_start_pfn >= pfn + nr_pages; + } else if (online_type == MMOP_ONLINE_MOVABLE) { + return zone_end_pfn(default_zone) <= pfn; + } + + /* MMOP_ONLINE_KEEP will always succeed and inherits the current zone */ + return online_type == MMOP_ONLINE_KEEP; +} + static inline bool movable_pfn_range(int nid, struct zone *default_zone, unsigned long start_pfn, unsigned long nr_pages) { @@ -885,12 +885,8 @@ static inline bool movable_pfn_range(int nid, struct zone *default_zone, return !zone_intersects(default_zone, start_pfn, nr_pages); } -/* - * Associates the given pfn range with the given node and the zone appropriate - * for the given online type. - */ -static struct zone * __meminit move_pfn_range(int online_type, int nid, - unsigned long start_pfn, unsigned long nr_pages) +struct zone * zone_for_pfn_range(int online_type, int nid, unsigned start_pfn, + unsigned long nr_pages) { struct pglist_data *pgdat = NODE_DATA(nid); struct zone *zone = default_zone_for_pfn(nid, start_pfn, nr_pages); @@ -909,6 +905,19 @@ static struct zone * __meminit move_pfn_range(int online_type, int nid, zone = &pgdat->node_zones[ZONE_MOVABLE]; } + return zone; +} + +/* + * Associates the given pfn range with the given node and the zone appropriate + * for the given online type. + */ +static struct zone * __meminit move_pfn_range(int online_type, int nid, + unsigned long start_pfn, unsigned long nr_pages) +{ + struct zone *zone; + + zone = zone_for_pfn_range(online_type, nid, start_pfn, nr_pages); move_pfn_range_to_zone(zone, start_pfn, nr_pages); return zone; } -- cgit v1.2.3 From c6f03e2903c9ecd8fd709a5b3fa8cf0a8ae0b3da Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 6 Sep 2017 16:19:40 -0700 Subject: mm, memory_hotplug: remove zone restrictions Historically we have enforced that any kernel zone (e.g ZONE_NORMAL) has to precede the Movable zone in the physical memory range. The purpose of the movable zone is, however, not bound to any physical memory restriction. It merely defines a class of migrateable and reclaimable memory. There are users (e.g. CMA) who might want to reserve specific physical memory ranges for their own purpose. Moreover our pfn walkers have to be prepared for zones overlapping in the physical range already because we do support interleaving NUMA nodes and therefore zones can interleave as well. This means we can allow each memory block to be associated with a different zone. Loosen the current onlining semantic and allow explicit onlining type on any memblock. That means that online_{kernel,movable} will be allowed regardless of the physical address of the memblock as long as it is offline of course. This might result in moveble zone overlapping with other kernel zones. Default onlining then becomes a bit tricky but still sensible. echo online > memoryXY/state will online the given block to 1) the default zone if the given range is outside of any zone 2) the enclosing zone if such a zone doesn't interleave with any other zone 3) the default zone if more zones interleave for this range where default zone is movable zone only if movable_node is enabled otherwise it is a kernel zone. Here is an example of the semantic with (movable_node is not present but it work in an analogous way). We start with following memblocks, all of them offline: memory34/valid_zones:Normal Movable memory35/valid_zones:Normal Movable memory36/valid_zones:Normal Movable memory37/valid_zones:Normal Movable memory38/valid_zones:Normal Movable memory39/valid_zones:Normal Movable memory40/valid_zones:Normal Movable memory41/valid_zones:Normal Movable Now, we online block 34 in default mode and block 37 as movable root@test1:/sys/devices/system/node/node1# echo online > memory34/state root@test1:/sys/devices/system/node/node1# echo online_movable > memory37/state memory34/valid_zones:Normal memory35/valid_zones:Normal Movable memory36/valid_zones:Normal Movable memory37/valid_zones:Movable memory38/valid_zones:Normal Movable memory39/valid_zones:Normal Movable memory40/valid_zones:Normal Movable memory41/valid_zones:Normal Movable As we can see all other blocks can still be onlined both into Normal and Movable zones and the Normal is default because the Movable zone spans only block37 now. root@test1:/sys/devices/system/node/node1# echo online_movable > memory41/state memory34/valid_zones:Normal memory35/valid_zones:Normal Movable memory36/valid_zones:Normal Movable memory37/valid_zones:Movable memory38/valid_zones:Movable Normal memory39/valid_zones:Movable Normal memory40/valid_zones:Movable Normal memory41/valid_zones:Movable Now the default zone for blocks 37-41 has changed because movable zone spans that range. root@test1:/sys/devices/system/node/node1# echo online_kernel > memory39/state memory34/valid_zones:Normal memory35/valid_zones:Normal Movable memory36/valid_zones:Normal Movable memory37/valid_zones:Movable memory38/valid_zones:Normal Movable memory39/valid_zones:Normal memory40/valid_zones:Movable Normal memory41/valid_zones:Movable Note that the block 39 now belongs to the zone Normal and so block38 falls into Normal by default as well. For completness root@test1:/sys/devices/system/node/node1# for i in memory[34]? do echo online > $i/state 2>/dev/null done memory34/valid_zones:Normal memory35/valid_zones:Normal memory36/valid_zones:Normal memory37/valid_zones:Movable memory38/valid_zones:Normal memory39/valid_zones:Normal memory40/valid_zones:Movable memory41/valid_zones:Movable Implementation wise the change is quite straightforward. We can get rid of allow_online_pfn_range altogether. online_pages allows only offline nodes already. The original default_zone_for_pfn will become default_kernel_zone_for_pfn. New default_zone_for_pfn implements the above semantic. zone_for_pfn_range is slightly reorganized to implement kernel and movable online type explicitly and MMOP_ONLINE_KEEP becomes a catch all default behavior. Link: http://lkml.kernel.org/r/20170714121233.16861-3-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Joonsoo Kim Acked-by: Vlastimil Babka Acked-by: Reza Arbab Cc: Mel Gorman Cc: Andrea Arcangeli Cc: Yasuaki Ishimatsu Cc: Xishi Qiu Cc: Kani Toshimitsu Cc: Cc: Daniel Kiper Cc: Igor Mammedov Cc: Vitaly Kuznetsov Cc: Wei Yang Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/memory.c | 3 --- mm/memory_hotplug.c | 74 ++++++++++++++++----------------------------------- 2 files changed, 23 insertions(+), 54 deletions(-) (limited to 'mm') diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 26383af9900c..4e3b61cda520 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -394,9 +394,6 @@ static void print_allowed_zone(char *buf, int nid, unsigned long start_pfn, { struct zone *zone; - if (!allow_online_pfn_range(nid, start_pfn, nr_pages, online_type)) - return; - zone = zone_for_pfn_range(online_type, nid, start_pfn, nr_pages); if (zone != default_zone) { strcat(buf, " "); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index e342624622a1..3e69984346da 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -831,7 +831,7 @@ void __ref move_pfn_range_to_zone(struct zone *zone, * If no kernel zone covers this pfn range it will automatically go * to the ZONE_NORMAL. */ -static struct zone *default_zone_for_pfn(int nid, unsigned long start_pfn, +static struct zone *default_kernel_zone_for_pfn(int nid, unsigned long start_pfn, unsigned long nr_pages) { struct pglist_data *pgdat = NODE_DATA(nid); @@ -847,65 +847,40 @@ static struct zone *default_zone_for_pfn(int nid, unsigned long start_pfn, return &pgdat->node_zones[ZONE_NORMAL]; } -bool allow_online_pfn_range(int nid, unsigned long pfn, unsigned long nr_pages, int online_type) +static inline struct zone *default_zone_for_pfn(int nid, unsigned long start_pfn, + unsigned long nr_pages) { - struct pglist_data *pgdat = NODE_DATA(nid); - struct zone *movable_zone = &pgdat->node_zones[ZONE_MOVABLE]; - struct zone *default_zone = default_zone_for_pfn(nid, pfn, nr_pages); + struct zone *kernel_zone = default_kernel_zone_for_pfn(nid, start_pfn, + nr_pages); + struct zone *movable_zone = &NODE_DATA(nid)->node_zones[ZONE_MOVABLE]; + bool in_kernel = zone_intersects(kernel_zone, start_pfn, nr_pages); + bool in_movable = zone_intersects(movable_zone, start_pfn, nr_pages); /* - * TODO there shouldn't be any inherent reason to have ZONE_NORMAL - * physically before ZONE_MOVABLE. All we need is they do not - * overlap. Historically we didn't allow ZONE_NORMAL after ZONE_MOVABLE - * though so let's stick with it for simplicity for now. - * TODO make sure we do not overlap with ZONE_DEVICE + * We inherit the existing zone in a simple case where zones do not + * overlap in the given range */ - if (online_type == MMOP_ONLINE_KERNEL) { - if (zone_is_empty(movable_zone)) - return true; - return movable_zone->zone_start_pfn >= pfn + nr_pages; - } else if (online_type == MMOP_ONLINE_MOVABLE) { - return zone_end_pfn(default_zone) <= pfn; - } - - /* MMOP_ONLINE_KEEP will always succeed and inherits the current zone */ - return online_type == MMOP_ONLINE_KEEP; -} - -static inline bool movable_pfn_range(int nid, struct zone *default_zone, - unsigned long start_pfn, unsigned long nr_pages) -{ - if (!allow_online_pfn_range(nid, start_pfn, nr_pages, - MMOP_ONLINE_KERNEL)) - return true; - - if (!movable_node_is_enabled()) - return false; + if (in_kernel ^ in_movable) + return (in_kernel) ? kernel_zone : movable_zone; - return !zone_intersects(default_zone, start_pfn, nr_pages); + /* + * If the range doesn't belong to any zone or two zones overlap in the + * given range then we use movable zone only if movable_node is + * enabled because we always online to a kernel zone by default. + */ + return movable_node_enabled ? movable_zone : kernel_zone; } struct zone * zone_for_pfn_range(int online_type, int nid, unsigned start_pfn, unsigned long nr_pages) { - struct pglist_data *pgdat = NODE_DATA(nid); - struct zone *zone = default_zone_for_pfn(nid, start_pfn, nr_pages); + if (online_type == MMOP_ONLINE_KERNEL) + return default_kernel_zone_for_pfn(nid, start_pfn, nr_pages); - if (online_type == MMOP_ONLINE_KEEP) { - struct zone *movable_zone = &pgdat->node_zones[ZONE_MOVABLE]; - /* - * MMOP_ONLINE_KEEP defaults to MMOP_ONLINE_KERNEL but use - * movable zone if that is not possible (e.g. we are within - * or past the existing movable zone). movable_node overrides - * this default and defaults to movable zone - */ - if (movable_pfn_range(nid, zone, start_pfn, nr_pages)) - zone = movable_zone; - } else if (online_type == MMOP_ONLINE_MOVABLE) { - zone = &pgdat->node_zones[ZONE_MOVABLE]; - } + if (online_type == MMOP_ONLINE_MOVABLE) + return &NODE_DATA(nid)->node_zones[ZONE_MOVABLE]; - return zone; + return default_zone_for_pfn(nid, start_pfn, nr_pages); } /* @@ -934,9 +909,6 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ struct memory_notify arg; nid = pfn_to_nid(pfn); - if (!allow_online_pfn_range(nid, pfn, nr_pages, online_type)) - return -EINVAL; - /* associate pfn range with the zone */ zone = move_pfn_range(online_type, nid, pfn, nr_pages); -- cgit v1.2.3 From c9bff3eebc09be23fbc868f5e6731666d23cbea3 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 6 Sep 2017 16:20:13 -0700 Subject: mm, page_alloc: rip out ZONELIST_ORDER_ZONE Patch series "cleanup zonelists initialization", v1. This is aimed at cleaning up the zonelists initialization code we have but the primary motivation was bug report [2] which got resolved but the usage of stop_machine is just too ugly to live. Most patches are straightforward but 3 of them need a special consideration. Patch 1 removes zone ordered zonelists completely. I am CCing linux-api because this is a user visible change. As I argue in the patch description I do not think we have a strong usecase for it these days. I have kept sysctl in place and warn into the log if somebody tries to configure zone lists ordering. If somebody has a real usecase for it we can revert this patch but I do not expect anybody will actually notice runtime differences. This patch is not strictly needed for the rest but it made patch 6 easier to implement. Patch 7 removes stop_machine from build_all_zonelists without adding any special synchronization between iterators and updater which I _believe_ is acceptable as explained in the changelog. I hope I am not missing anything. Patch 8 then removes zonelists_mutex which is kind of ugly as well and not really needed AFAICS but a care should be taken when double checking my thinking. This patch (of 9): Supporting zone ordered zonelists costs us just a lot of code while the usefulness is arguable if existent at all. Mel has already made node ordering default on 64b systems. 32b systems are still using ZONELIST_ORDER_ZONE because it is considered better to fallback to a different NUMA node rather than consume precious lowmem zones. This argument is, however, weaken by the fact that the memory reclaim has been reworked to be node rather than zone oriented. This means that lowmem requests have to skip over all highmem pages on LRUs already and so zone ordering doesn't save the reclaim time much. So the only advantage of the zone ordering is under a light memory pressure when highmem requests do not ever hit into lowmem zones and the lowmem pressure doesn't need to reclaim. Considering that 32b NUMA systems are rather suboptimal already and it is generally advisable to use 64b kernel on such a HW I believe we should rather care about the code maintainability and just get rid of ZONELIST_ORDER_ZONE altogether. Keep systcl in place and warn if somebody tries to set zone ordering either from kernel command line or the sysctl. [mhocko@suse.com: reading vm.numa_zonelist_order will never terminate] Link: http://lkml.kernel.org/r/20170721143915.14161-2-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Johannes Weiner Cc: Joonsoo Kim Cc: Shaohua Li Cc: Toshi Kani Cc: Abdul Haleem Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/kernel-parameters.txt | 2 +- Documentation/sysctl/vm.txt | 4 +- Documentation/vm/numa | 7 +- include/linux/mmzone.h | 2 +- mm/page_alloc.c | 179 +++--------------------- 5 files changed, 28 insertions(+), 166 deletions(-) (limited to 'mm') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 6996b7727b85..86b0e8ec8ad7 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2783,7 +2783,7 @@ Allowed values are enable and disable numa_zonelist_order= [KNL, BOOT] Select zonelist order for NUMA. - one of ['zone', 'node', 'default'] can be specified + 'node', 'default' can be specified This can be set from sysctl after boot. See Documentation/sysctl/vm.txt for details. diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index 48244c42ff52..9baf66a9ef4e 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -572,7 +572,9 @@ See Documentation/nommu-mmap.txt for more information. numa_zonelist_order -This sysctl is only for NUMA. +This sysctl is only for NUMA and it is deprecated. Anything but +Node order will fail! + 'where the memory is allocated from' is controlled by zonelists. (This documentation ignores ZONE_HIGHMEM/ZONE_DMA32 for simple explanation. you may be able to read ZONE_DMA as ZONE_DMA32...) diff --git a/Documentation/vm/numa b/Documentation/vm/numa index a08f71647714..a31b85b9bb88 100644 --- a/Documentation/vm/numa +++ b/Documentation/vm/numa @@ -79,11 +79,8 @@ memory, Linux must decide whether to order the zonelists such that allocations fall back to the same zone type on a different node, or to a different zone type on the same node. This is an important consideration because some zones, such as DMA or DMA32, represent relatively scarce resources. Linux chooses -a default zonelist order based on the sizes of the various zone types relative -to the total memory of the node and the total memory of the system. The -default zonelist order may be overridden using the numa_zonelist_order kernel -boot parameter or sysctl. [see Documentation/admin-guide/kernel-parameters.rst and -Documentation/sysctl/vm.txt] +a default Node ordered zonelist. This means it tries to fallback to other zones +from the same node before using remote nodes which are ordered by NUMA distance. By default, Linux will attempt to satisfy memory allocation requests from the node to which the CPU that executes the request is assigned. Specifically, diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index fc14b8b3f6ce..bfdc37b77d88 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -896,7 +896,7 @@ int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *, int, extern int numa_zonelist_order_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); extern char numa_zonelist_order[]; -#define NUMA_ZONELIST_ORDER_LEN 16 /* string buffer size */ +#define NUMA_ZONELIST_ORDER_LEN 16 #ifndef CONFIG_NEED_MULTIPLE_NODES diff --git a/mm/page_alloc.c b/mm/page_alloc.c index dcc8a1cf55b6..6b23df1be909 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4858,52 +4858,18 @@ static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, return nr_zones; } - -/* - * zonelist_order: - * 0 = automatic detection of better ordering. - * 1 = order by ([node] distance, -zonetype) - * 2 = order by (-zonetype, [node] distance) - * - * If not NUMA, ZONELIST_ORDER_ZONE and ZONELIST_ORDER_NODE will create - * the same zonelist. So only NUMA can configure this param. - */ -#define ZONELIST_ORDER_DEFAULT 0 -#define ZONELIST_ORDER_NODE 1 -#define ZONELIST_ORDER_ZONE 2 - -/* zonelist order in the kernel. - * set_zonelist_order() will set this to NODE or ZONE. - */ -static int current_zonelist_order = ZONELIST_ORDER_DEFAULT; -static char zonelist_order_name[3][8] = {"Default", "Node", "Zone"}; - - #ifdef CONFIG_NUMA -/* The value user specified ....changed by config */ -static int user_zonelist_order = ZONELIST_ORDER_DEFAULT; -/* string for sysctl */ -#define NUMA_ZONELIST_ORDER_LEN 16 -char numa_zonelist_order[16] = "default"; - -/* - * interface for configure zonelist ordering. - * command line option "numa_zonelist_order" - * = "[dD]efault - default, automatic configuration. - * = "[nN]ode - order by node locality, then by zone within node - * = "[zZ]one - order by zone, then by locality within zone - */ static int __parse_numa_zonelist_order(char *s) { - if (*s == 'd' || *s == 'D') { - user_zonelist_order = ZONELIST_ORDER_DEFAULT; - } else if (*s == 'n' || *s == 'N') { - user_zonelist_order = ZONELIST_ORDER_NODE; - } else if (*s == 'z' || *s == 'Z') { - user_zonelist_order = ZONELIST_ORDER_ZONE; - } else { - pr_warn("Ignoring invalid numa_zonelist_order value: %s\n", s); + /* + * We used to support different zonlists modes but they turned + * out to be just not useful. Let's keep the warning in place + * if somebody still use the cmd line parameter so that we do + * not fail it silently + */ + if (!(*s == 'd' || *s == 'D' || *s == 'n' || *s == 'N')) { + pr_warn("Ignoring unsupported numa_zonelist_order value: %s\n", s); return -EINVAL; } return 0; @@ -4911,19 +4877,15 @@ static int __parse_numa_zonelist_order(char *s) static __init int setup_numa_zonelist_order(char *s) { - int ret; - if (!s) return 0; - ret = __parse_numa_zonelist_order(s); - if (ret == 0) - strlcpy(numa_zonelist_order, s, NUMA_ZONELIST_ORDER_LEN); - - return ret; + return __parse_numa_zonelist_order(s); } early_param("numa_zonelist_order", setup_numa_zonelist_order); +char numa_zonelist_order[] = "Node"; + /* * sysctl handler for numa_zonelist_order */ @@ -4931,42 +4893,17 @@ int numa_zonelist_order_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { - char saved_string[NUMA_ZONELIST_ORDER_LEN]; + char *str; int ret; - static DEFINE_MUTEX(zl_order_mutex); - mutex_lock(&zl_order_mutex); - if (write) { - if (strlen((char *)table->data) >= NUMA_ZONELIST_ORDER_LEN) { - ret = -EINVAL; - goto out; - } - strcpy(saved_string, (char *)table->data); - } - ret = proc_dostring(table, write, buffer, length, ppos); - if (ret) - goto out; - if (write) { - int oldval = user_zonelist_order; + if (!write) + return proc_dostring(table, write, buffer, length, ppos); + str = memdup_user_nul(buffer, 16); + if (IS_ERR(str)) + return PTR_ERR(str); - ret = __parse_numa_zonelist_order((char *)table->data); - if (ret) { - /* - * bogus value. restore saved string - */ - strncpy((char *)table->data, saved_string, - NUMA_ZONELIST_ORDER_LEN); - user_zonelist_order = oldval; - } else if (oldval != user_zonelist_order) { - mem_hotplug_begin(); - mutex_lock(&zonelists_mutex); - build_all_zonelists(NULL, NULL); - mutex_unlock(&zonelists_mutex); - mem_hotplug_done(); - } - } -out: - mutex_unlock(&zl_order_mutex); + ret = __parse_numa_zonelist_order(str); + kfree(str); return ret; } @@ -5075,70 +5012,12 @@ static void build_thisnode_zonelists(pg_data_t *pgdat) */ static int node_order[MAX_NUMNODES]; -static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes) -{ - int pos, j, node; - int zone_type; /* needs to be signed */ - struct zone *z; - struct zonelist *zonelist; - - zonelist = &pgdat->node_zonelists[ZONELIST_FALLBACK]; - pos = 0; - for (zone_type = MAX_NR_ZONES - 1; zone_type >= 0; zone_type--) { - for (j = 0; j < nr_nodes; j++) { - node = node_order[j]; - z = &NODE_DATA(node)->node_zones[zone_type]; - if (managed_zone(z)) { - zoneref_set_zone(z, - &zonelist->_zonerefs[pos++]); - check_highest_zone(zone_type); - } - } - } - zonelist->_zonerefs[pos].zone = NULL; - zonelist->_zonerefs[pos].zone_idx = 0; -} - -#if defined(CONFIG_64BIT) -/* - * Devices that require DMA32/DMA are relatively rare and do not justify a - * penalty to every machine in case the specialised case applies. Default - * to Node-ordering on 64-bit NUMA machines - */ -static int default_zonelist_order(void) -{ - return ZONELIST_ORDER_NODE; -} -#else -/* - * On 32-bit, the Normal zone needs to be preserved for allocations accessible - * by the kernel. If processes running on node 0 deplete the low memory zone - * then reclaim will occur more frequency increasing stalls and potentially - * be easier to OOM if a large percentage of the zone is under writeback or - * dirty. The problem is significantly worse if CONFIG_HIGHPTE is not set. - * Hence, default to zone ordering on 32-bit. - */ -static int default_zonelist_order(void) -{ - return ZONELIST_ORDER_ZONE; -} -#endif /* CONFIG_64BIT */ - -static void set_zonelist_order(void) -{ - if (user_zonelist_order == ZONELIST_ORDER_DEFAULT) - current_zonelist_order = default_zonelist_order(); - else - current_zonelist_order = user_zonelist_order; -} - static void build_zonelists(pg_data_t *pgdat) { int i, node, load; nodemask_t used_mask; int local_node, prev_node; struct zonelist *zonelist; - unsigned int order = current_zonelist_order; /* initialize zonelists */ for (i = 0; i < MAX_ZONELISTS; i++) { @@ -5168,15 +5047,7 @@ static void build_zonelists(pg_data_t *pgdat) prev_node = node; load--; - if (order == ZONELIST_ORDER_NODE) - build_zonelists_in_node_order(pgdat, node); - else - node_order[i++] = node; /* remember order */ - } - - if (order == ZONELIST_ORDER_ZONE) { - /* calculate node order -- i.e., DMA last! */ - build_zonelists_in_zone_order(pgdat, i); + build_zonelists_in_node_order(pgdat, node); } build_thisnode_zonelists(pgdat); @@ -5204,11 +5075,6 @@ static void setup_min_unmapped_ratio(void); static void setup_min_slab_ratio(void); #else /* CONFIG_NUMA */ -static void set_zonelist_order(void) -{ - current_zonelist_order = ZONELIST_ORDER_ZONE; -} - static void build_zonelists(pg_data_t *pgdat) { int node, local_node; @@ -5348,8 +5214,6 @@ build_all_zonelists_init(void) */ void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone) { - set_zonelist_order(); - if (system_state == SYSTEM_BOOTING) { build_all_zonelists_init(); } else { @@ -5375,9 +5239,8 @@ void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone) else page_group_by_mobility_disabled = 0; - pr_info("Built %i zonelists in %s order, mobility grouping %s. Total pages: %ld\n", + pr_info("Built %i zonelists, mobility grouping %s. Total pages: %ld\n", nr_online_nodes, - zonelist_order_name[current_zonelist_order], page_group_by_mobility_disabled ? "off" : "on", vm_total_pages); #ifdef CONFIG_NUMA -- cgit v1.2.3 From afb6ebb3faeb382f7c8b4478f2a84cee37bb8610 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 6 Sep 2017 16:20:17 -0700 Subject: mm, page_alloc: remove boot pageset initialization from memory hotplug boot_pageset is a boot time hack which gets superseded by normal pagesets later in the boot process. It makes zero sense to reinitialize it again and again during memory hotplug. Link: http://lkml.kernel.org/r/20170721143915.14161-3-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Johannes Weiner Cc: Joonsoo Kim Cc: Shaohua Li Cc: Toshi Kani Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 40 ++++++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 18 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6b23df1be909..94e64784a8be 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5141,7 +5141,7 @@ DEFINE_MUTEX(zonelists_mutex); static int __build_all_zonelists(void *data) { int nid; - int cpu; + int __maybe_unused cpu; pg_data_t *self = data; #ifdef CONFIG_NUMA @@ -5162,23 +5162,8 @@ static int __build_all_zonelists(void *data) } } - /* - * Initialize the boot_pagesets that are going to be used - * for bootstrapping processors. The real pagesets for - * each zone will be allocated later when the per cpu - * allocator is available. - * - * boot_pagesets are used also for bootstrapping offline - * cpus if the system is already booted because the pagesets - * are needed to initialize allocators on a specific cpu too. - * F.e. the percpu allocator needs the page allocator which - * needs the percpu allocator in order to allocate its pagesets - * (a chicken-egg dilemma). - */ - for_each_possible_cpu(cpu) { - setup_pageset(&per_cpu(boot_pageset, cpu), 0); - #ifdef CONFIG_HAVE_MEMORYLESS_NODES + for_each_possible_cpu(cpu) { /* * We now know the "local memory node" for each node-- * i.e., the node of the first zone in the generic zonelist. @@ -5189,8 +5174,8 @@ static int __build_all_zonelists(void *data) */ if (cpu_online(cpu)) set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu))); -#endif } +#endif return 0; } @@ -5198,7 +5183,26 @@ static int __build_all_zonelists(void *data) static noinline void __init build_all_zonelists_init(void) { + int cpu; + __build_all_zonelists(NULL); + + /* + * Initialize the boot_pagesets that are going to be used + * for bootstrapping processors. The real pagesets for + * each zone will be allocated later when the per cpu + * allocator is available. + * + * boot_pagesets are used also for bootstrapping offline + * cpus if the system is already booted because the pagesets + * are needed to initialize allocators on a specific cpu too. + * F.e. the percpu allocator needs the page allocator which + * needs the percpu allocator in order to allocate its pagesets + * (a chicken-egg dilemma). + */ + for_each_possible_cpu(cpu) + setup_pageset(&per_cpu(boot_pageset, cpu), 0); + mminit_verify_zonelist(); cpuset_init_current_mems_allowed(); } -- cgit v1.2.3 From d9c9a0b9729c35c96ef7bee4ded56d9441bebd58 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 6 Sep 2017 16:20:20 -0700 Subject: mm, page_alloc: do not set_cpu_numa_mem on empty nodes initialization __build_all_zonelists reinitializes each online cpu local node for CONFIG_HAVE_MEMORYLESS_NODES. This makes sense because previously memory less nodes could gain some memory during memory hotplug and so the local node should be changed for CPUs close to such a node. It makes less sense to do that unconditionally for a newly creaded NUMA node which is still offline and without any memory. Let's also simplify the cpu loop and use for_each_online_cpu instead of an explicit cpu_online check for all possible cpus. Link: http://lkml.kernel.org/r/20170721143915.14161-4-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Johannes Weiner Cc: Joonsoo Kim Cc: Shaohua Li Cc: Toshi Kani Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 94e64784a8be..94fb4370e000 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5160,10 +5160,8 @@ static int __build_all_zonelists(void *data) build_zonelists(pgdat); } - } #ifdef CONFIG_HAVE_MEMORYLESS_NODES - for_each_possible_cpu(cpu) { /* * We now know the "local memory node" for each node-- * i.e., the node of the first zone in the generic zonelist. @@ -5172,10 +5170,10 @@ static int __build_all_zonelists(void *data) * secondary cpus' numa_mem as they come on-line. During * node/memory hotplug, we'll fixup all on-line cpus. */ - if (cpu_online(cpu)) + for_each_online_cpu(cpu) set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu))); - } #endif + } return 0; } -- cgit v1.2.3 From 72675e131eb418c78980c1e683c0c25a25b61221 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 6 Sep 2017 16:20:24 -0700 Subject: mm, memory_hotplug: drop zone from build_all_zonelists build_all_zonelists gets a zone parameter to initialize zone's pagesets. There is only a single user which gives a non-NULL zone parameter and that one doesn't really need the rest of the build_all_zonelists (see commit 6dcd73d7011b ("memory-hotplug: allocate zone's pcp before onlining pages")). Therefore remove setup_zone_pageset from build_all_zonelists and call it from its only user directly. This will also remove a pointless zonlists rebuilding which is always good. Link: http://lkml.kernel.org/r/20170721143915.14161-5-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Vlastimil Babka Cc: Johannes Weiner Cc: Joonsoo Kim Cc: Mel Gorman Cc: Shaohua Li Cc: Toshi Kani Cc: Wen Congyang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 2 +- init/main.c | 2 +- mm/internal.h | 1 + mm/memory_hotplug.c | 10 +++++----- mm/page_alloc.c | 13 +++---------- 5 files changed, 11 insertions(+), 17 deletions(-) (limited to 'mm') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index bfdc37b77d88..551f68bec2fa 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -771,7 +771,7 @@ static inline bool is_dev_zone(const struct zone *zone) #include extern struct mutex zonelists_mutex; -void build_all_zonelists(pg_data_t *pgdat, struct zone *zone); +void build_all_zonelists(pg_data_t *pgdat); void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx); bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, int classzone_idx, unsigned int alloc_flags, diff --git a/init/main.c b/init/main.c index 8828fc148670..a21a1a8708a8 100644 --- a/init/main.c +++ b/init/main.c @@ -542,7 +542,7 @@ asmlinkage __visible void __init start_kernel(void) boot_cpu_state_init(); smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */ - build_all_zonelists(NULL, NULL); + build_all_zonelists(NULL); page_alloc_init(); pr_notice("Kernel command line: %s\n", boot_command_line); diff --git a/mm/internal.h b/mm/internal.h index 4ef49fc55e58..781c0d54d75a 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -525,4 +525,5 @@ static inline bool is_migrate_highatomic_page(struct page *page) return get_pageblock_migratetype(page) == MIGRATE_HIGHATOMIC; } +void setup_zone_pageset(struct zone *zone); #endif /* __MM_INTERNAL_H */ diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 3e69984346da..c4df7d3c64d1 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -929,7 +929,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ mutex_lock(&zonelists_mutex); if (!populated_zone(zone)) { need_zonelists_rebuild = 1; - build_all_zonelists(NULL, zone); + setup_zone_pageset(zone); } ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages, @@ -950,7 +950,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ if (onlined_pages) { node_states_set_node(nid, &arg); if (need_zonelists_rebuild) - build_all_zonelists(NULL, NULL); + build_all_zonelists(NULL); else zone_pcp_update(zone); } @@ -1028,7 +1028,7 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) * to access not-initialized zonelist, build here. */ mutex_lock(&zonelists_mutex); - build_all_zonelists(pgdat, NULL); + build_all_zonelists(pgdat); mutex_unlock(&zonelists_mutex); /* @@ -1084,7 +1084,7 @@ int try_online_node(int nid) if (pgdat->node_zonelists->_zonerefs->zone == NULL) { mutex_lock(&zonelists_mutex); - build_all_zonelists(NULL, NULL); + build_all_zonelists(NULL); mutex_unlock(&zonelists_mutex); } @@ -1704,7 +1704,7 @@ repeat: if (!populated_zone(zone)) { zone_pcp_reset(zone); mutex_lock(&zonelists_mutex); - build_all_zonelists(NULL, NULL); + build_all_zonelists(NULL); mutex_unlock(&zonelists_mutex); } else zone_pcp_update(zone); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 94fb4370e000..2523d5b3b22f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5129,7 +5129,6 @@ static void build_zonelists(pg_data_t *pgdat) static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch); static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset); static DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats); -static void setup_zone_pageset(struct zone *zone); /* * Global mutex to protect against size modification of zonelists @@ -5209,20 +5208,14 @@ build_all_zonelists_init(void) * Called with zonelists_mutex held always * unless system_state == SYSTEM_BOOTING. * - * __ref due to (1) call of __meminit annotated setup_zone_pageset - * [we're only called with non-NULL zone through __meminit paths] and - * (2) call of __init annotated helper build_all_zonelists_init + * __ref due to call of __init annotated helper build_all_zonelists_init * [protected by SYSTEM_BOOTING]. */ -void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone) +void __ref build_all_zonelists(pg_data_t *pgdat) { if (system_state == SYSTEM_BOOTING) { build_all_zonelists_init(); } else { -#ifdef CONFIG_MEMORY_HOTPLUG - if (zone) - setup_zone_pageset(zone); -#endif /* we have to stop all cpus to guarantee there is no user of zonelist */ stop_machine_cpuslocked(__build_all_zonelists, pgdat, NULL); @@ -5496,7 +5489,7 @@ static void __meminit zone_pageset_init(struct zone *zone, int cpu) pageset_set_high_and_batch(zone, pcp); } -static void __meminit setup_zone_pageset(struct zone *zone) +void __meminit setup_zone_pageset(struct zone *zone) { int cpu; zone->pageset = alloc_percpu(struct per_cpu_pageset); -- cgit v1.2.3 From 34ad1296571f7a004a761e3afc18e79428a726a8 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 6 Sep 2017 16:20:27 -0700 Subject: mm, memory_hotplug: remove explicit build_all_zonelists from try_online_node try_online_node calls hotadd_new_pgdat which already calls build_all_zonelists. So the additional call is redundant. Even though hotadd_new_pgdat will only initialize zonelists of the new node this is the right thing to do because such a node doesn't have any memory so other zonelists would ignore all the zones from this node anyway. Link: http://lkml.kernel.org/r/20170721143915.14161-6-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Vlastimil Babka Cc: Toshi Kani Cc: Johannes Weiner Cc: Joonsoo Kim Cc: Mel Gorman Cc: Shaohua Li Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index c4df7d3c64d1..2f0c7ebc7624 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1081,13 +1081,6 @@ int try_online_node(int nid) node_set_online(nid); ret = register_one_node(nid); BUG_ON(ret); - - if (pgdat->node_zonelists->_zonerefs->zone == NULL) { - mutex_lock(&zonelists_mutex); - build_all_zonelists(NULL); - mutex_unlock(&zonelists_mutex); - } - out: mem_hotplug_done(); return ret; -- cgit v1.2.3 From 9d3be21bf9c0b849a13e6b51e9c2ce7ccdf50851 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 6 Sep 2017 16:20:30 -0700 Subject: mm, page_alloc: simplify zonelist initialization build_zonelists gradually builds zonelists from the nearest to the most distant node. As we do not know how many populated zones we will have in each node we rely on the _zoneref to terminate initialized part of the zonelist by a NULL zone. While this is functionally correct it is quite suboptimal because we cannot allow updaters to race with zonelists users because they could see an empty zonelist and fail the allocation or hit the OOM killer in the worst case. We can do much better, though. We can store the node ordering into an already existing node_order array and then give this array to build_zonelists_in_node_order and do the whole initialization at once. zonelists consumers still might see halfway initialized state but that should be much more tolerateable because the list will not be empty and they would either see some zone twice or skip over some zone(s) in the worst case which shouldn't lead to immediate failures. While at it let's simplify build_zonelists_node which is rather confusing now. It gets an index into the zoneref array and returns the updated index for the next iteration. Let's rename the function to build_zonerefs_node to better reflect its purpose and give it zoneref array to update. The function doesn't the index anymore. It just returns the number of added zones so that the caller can advance the zonered array start for the next update. This patch alone doesn't introduce any functional change yet, though, it is merely a preparatory work for later changes. Link: http://lkml.kernel.org/r/20170721143915.14161-7-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Vlastimil Babka Cc: Johannes Weiner Cc: Joonsoo Kim Cc: Mel Gorman Cc: Shaohua Li Cc: Toshi Kani Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 81 +++++++++++++++++++++++++++++---------------------------- 1 file changed, 41 insertions(+), 40 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2523d5b3b22f..36a2f18c5e0a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4839,18 +4839,17 @@ static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref) * * Add all populated zones of a node to the zonelist. */ -static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, - int nr_zones) +static int build_zonerefs_node(pg_data_t *pgdat, struct zoneref *zonerefs) { struct zone *zone; enum zone_type zone_type = MAX_NR_ZONES; + int nr_zones = 0; do { zone_type--; zone = pgdat->node_zones + zone_type; if (managed_zone(zone)) { - zoneref_set_zone(zone, - &zonelist->_zonerefs[nr_zones++]); + zoneref_set_zone(zone, &zonerefs[nr_zones++]); check_highest_zone(zone_type); } } while (zone_type); @@ -4977,17 +4976,24 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask) * This results in maximum locality--normal zone overflows into local * DMA zone, if any--but risks exhausting DMA zone. */ -static void build_zonelists_in_node_order(pg_data_t *pgdat, int node) +static void build_zonelists_in_node_order(pg_data_t *pgdat, int *node_order, + unsigned nr_nodes) { - int j; - struct zonelist *zonelist; + struct zoneref *zonerefs; + int i; + + zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK]._zonerefs; + + for (i = 0; i < nr_nodes; i++) { + int nr_zones; - zonelist = &pgdat->node_zonelists[ZONELIST_FALLBACK]; - for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++) - ; - j = build_zonelists_node(NODE_DATA(node), zonelist, j); - zonelist->_zonerefs[j].zone = NULL; - zonelist->_zonerefs[j].zone_idx = 0; + pg_data_t *node = NODE_DATA(node_order[i]); + + nr_zones = build_zonerefs_node(node, zonerefs); + zonerefs += nr_zones; + } + zonerefs->zone = NULL; + zonerefs->zone_idx = 0; } /* @@ -4995,13 +5001,14 @@ static void build_zonelists_in_node_order(pg_data_t *pgdat, int node) */ static void build_thisnode_zonelists(pg_data_t *pgdat) { - int j; - struct zonelist *zonelist; + struct zoneref *zonerefs; + int nr_zones; - zonelist = &pgdat->node_zonelists[ZONELIST_NOFALLBACK]; - j = build_zonelists_node(pgdat, zonelist, 0); - zonelist->_zonerefs[j].zone = NULL; - zonelist->_zonerefs[j].zone_idx = 0; + zonerefs = pgdat->node_zonelists[ZONELIST_NOFALLBACK]._zonerefs; + nr_zones = build_zonerefs_node(pgdat, zonerefs); + zonerefs += nr_zones; + zonerefs->zone = NULL; + zonerefs->zone_idx = 0; } /* @@ -5010,21 +5017,13 @@ static void build_thisnode_zonelists(pg_data_t *pgdat) * exhausted, but results in overflowing to remote node while memory * may still exist in local DMA zone. */ -static int node_order[MAX_NUMNODES]; static void build_zonelists(pg_data_t *pgdat) { - int i, node, load; + static int node_order[MAX_NUMNODES]; + int node, load, nr_nodes = 0; nodemask_t used_mask; int local_node, prev_node; - struct zonelist *zonelist; - - /* initialize zonelists */ - for (i = 0; i < MAX_ZONELISTS; i++) { - zonelist = pgdat->node_zonelists + i; - zonelist->_zonerefs[0].zone = NULL; - zonelist->_zonerefs[0].zone_idx = 0; - } /* NUMA-aware ordering of nodes */ local_node = pgdat->node_id; @@ -5033,8 +5032,6 @@ static void build_zonelists(pg_data_t *pgdat) nodes_clear(used_mask); memset(node_order, 0, sizeof(node_order)); - i = 0; - while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { /* * We don't want to pressure a particular node. @@ -5045,11 +5042,12 @@ static void build_zonelists(pg_data_t *pgdat) node_distance(local_node, prev_node)) node_load[node] = load; + node_order[nr_nodes++] = node; prev_node = node; load--; - build_zonelists_in_node_order(pgdat, node); } + build_zonelists_in_node_order(pgdat, node_order, nr_nodes); build_thisnode_zonelists(pgdat); } @@ -5078,13 +5076,14 @@ static void setup_min_slab_ratio(void); static void build_zonelists(pg_data_t *pgdat) { int node, local_node; - enum zone_type j; - struct zonelist *zonelist; + struct zoneref *zonerefs; + int nr_zones; local_node = pgdat->node_id; - zonelist = &pgdat->node_zonelists[ZONELIST_FALLBACK]; - j = build_zonelists_node(pgdat, zonelist, 0); + zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK]._zonerefs; + nr_zones = build_zonerefs_node(pgdat, zonerefs); + zonerefs += nr_zones; /* * Now we build the zonelist so that it contains the zones @@ -5097,16 +5096,18 @@ static void build_zonelists(pg_data_t *pgdat) for (node = local_node + 1; node < MAX_NUMNODES; node++) { if (!node_online(node)) continue; - j = build_zonelists_node(NODE_DATA(node), zonelist, j); + nr_zones = build_zonerefs_node(NODE_DATA(node), zonerefs); + zonerefs += nr_zones; } for (node = 0; node < local_node; node++) { if (!node_online(node)) continue; - j = build_zonelists_node(NODE_DATA(node), zonelist, j); + nr_zones = build_zonerefs_node(NODE_DATA(node), zonerefs); + zonerefs += nr_zones; } - zonelist->_zonerefs[j].zone = NULL; - zonelist->_zonerefs[j].zone_idx = 0; + zonerefs->zone = NULL; + zonerefs->zone_idx = 0; } #endif /* CONFIG_NUMA */ -- cgit v1.2.3 From 11cd8638c37f6c400cc472cc52b6eccb505aba6e Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 6 Sep 2017 16:20:34 -0700 Subject: mm, page_alloc: remove stop_machine from build_all_zonelists build_all_zonelists has been (ab)using stop_machine to make sure that zonelists do not change while somebody is looking at them. This is is just a gross hack because a) it complicates the context from which we can call build_all_zonelists (see 3f906ba23689 ("mm/memory-hotplug: switch locking to a percpu rwsem")) and b) is is not really necessary especially after "mm, page_alloc: simplify zonelist initialization" and c) it doesn't really provide the protection it claims (see below). Updates of the zonelists happen very seldom, basically only when a zone becomes populated during memory online or when it loses all the memory during offline. A racing iteration over zonelists could either miss a zone or try to work on one zone twice. Both of these are something we can live with occasionally because there will always be at least one zone visible so we are not likely to fail allocation too easily for example. Please note that the original stop_machine approach doesn't really provide a better exclusion because the iteration might be interrupted half way (unless the whole iteration is preempt disabled which is not the case in most cases) so the some zones could still be seen twice or a zone missed. I have run the pathological online/offline of the single memblock in the movable zone while stressing the same small node with some memory pressure. Node 1, zone DMA pages free 0 min 0 low 0 high 0 spanned 0 present 0 managed 0 protection: (0, 943, 943, 943) Node 1, zone DMA32 pages free 227310 min 8294 low 10367 high 12440 spanned 262112 present 262112 managed 241436 protection: (0, 0, 0, 0) Node 1, zone Normal pages free 0 min 0 low 0 high 0 spanned 0 present 0 managed 0 protection: (0, 0, 0, 1024) Node 1, zone Movable pages free 32722 min 85 low 117 high 149 spanned 32768 present 32768 managed 32768 protection: (0, 0, 0, 0) root@test1:/sys/devices/system/node/node1# while true do echo offline > memory34/state echo online_movable > memory34/state done root@test1:/mnt/data/test/linux-3.7-rc5# numactl --preferred=1 make -j4 and it survived without any unexpected behavior. While this is not really a great testing coverage it should exercise the allocation path quite a lot. Link: http://lkml.kernel.org/r/20170721143915.14161-8-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Vlastimil Babka Cc: Johannes Weiner Cc: Joonsoo Kim Cc: Mel Gorman Cc: Shaohua Li Cc: Toshi Kani Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 36a2f18c5e0a..e3086d0fd945 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5137,8 +5137,7 @@ static DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats); */ DEFINE_MUTEX(zonelists_mutex); -/* return values int ....just for stop_machine() */ -static int __build_all_zonelists(void *data) +static void __build_all_zonelists(void *data) { int nid; int __maybe_unused cpu; @@ -5174,8 +5173,6 @@ static int __build_all_zonelists(void *data) set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu))); #endif } - - return 0; } static noinline void __init @@ -5217,9 +5214,7 @@ void __ref build_all_zonelists(pg_data_t *pgdat) if (system_state == SYSTEM_BOOTING) { build_all_zonelists_init(); } else { - /* we have to stop all cpus to guarantee there is no user - of zonelist */ - stop_machine_cpuslocked(__build_all_zonelists, pgdat, NULL); + __build_all_zonelists(pgdat); /* cpuset refresh routine should be here */ } vm_total_pages = nr_free_pagecache_pages(); -- cgit v1.2.3 From b93e0f329e24f3615aa551fd9b99a75fb7c9195f Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 6 Sep 2017 16:20:37 -0700 Subject: mm, memory_hotplug: get rid of zonelists_mutex zonelists_mutex was introduced by commit 4eaf3f64397c ("mem-hotplug: fix potential race while building zonelist for new populated zone") to protect zonelist building from races. This is no longer needed though because both memory online and offline are fully serialized. New users have grown since then. Notably setup_per_zone_wmarks wants to prevent from races between memory hotplug, khugepaged setup and manual min_free_kbytes update via sysctl (see cfd3da1e49bb ("mm: Serialize access to min_free_kbytes"). Let's add a private lock for that purpose. This will not prevent from seeing halfway through memory hotplug operation but that shouldn't be a big deal becuse memory hotplug will update watermarks explicitly so we will eventually get a full picture. The lock just makes sure we won't race when updating watermarks leading to weird results. Also __build_all_zonelists manipulates global data so add a private lock for it as well. This doesn't seem to be necessary today but it is more robust to have a lock there. While we are at it make sure we document that memory online/offline depends on a full serialization either via mem_hotplug_begin() or device_lock. Link: http://lkml.kernel.org/r/20170721143915.14161-9-mhocko@kernel.org Signed-off-by: Michal Hocko Cc: Johannes Weiner Cc: Joonsoo Kim Cc: Mel Gorman Cc: Shaohua Li Cc: Toshi Kani Cc: Vlastimil Babka Cc: Haicheng Li Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 1 - mm/memory_hotplug.c | 12 ++---------- mm/page_alloc.c | 18 +++++++++--------- 3 files changed, 11 insertions(+), 20 deletions(-) (limited to 'mm') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 551f68bec2fa..e7e92c8f4883 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -770,7 +770,6 @@ static inline bool is_dev_zone(const struct zone *zone) #include -extern struct mutex zonelists_mutex; void build_all_zonelists(pg_data_t *pgdat); void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx); bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 2f0c7ebc7624..73bf17df6899 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -897,7 +897,7 @@ static struct zone * __meminit move_pfn_range(int online_type, int nid, return zone; } -/* Must be protected by mem_hotplug_begin() */ +/* Must be protected by mem_hotplug_begin() or a device_lock */ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type) { unsigned long flags; @@ -926,7 +926,6 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ * This means the page allocator ignores this zone. * So, zonelist must be updated after online. */ - mutex_lock(&zonelists_mutex); if (!populated_zone(zone)) { need_zonelists_rebuild = 1; setup_zone_pageset(zone); @@ -937,7 +936,6 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ if (ret) { if (need_zonelists_rebuild) zone_pcp_reset(zone); - mutex_unlock(&zonelists_mutex); goto failed_addition; } @@ -955,8 +953,6 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ zone_pcp_update(zone); } - mutex_unlock(&zonelists_mutex); - init_per_zone_wmark_min(); if (onlined_pages) { @@ -1027,9 +1023,7 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) * The node we allocated has no zone fallback lists. For avoiding * to access not-initialized zonelist, build here. */ - mutex_lock(&zonelists_mutex); build_all_zonelists(pgdat); - mutex_unlock(&zonelists_mutex); /* * zone->managed_pages is set to an approximate value in @@ -1696,9 +1690,7 @@ repeat: if (!populated_zone(zone)) { zone_pcp_reset(zone); - mutex_lock(&zonelists_mutex); build_all_zonelists(NULL); - mutex_unlock(&zonelists_mutex); } else zone_pcp_update(zone); @@ -1724,7 +1716,7 @@ failed_removal: return ret; } -/* Must be protected by mem_hotplug_begin() */ +/* Must be protected by mem_hotplug_begin() or a device_lock */ int offline_pages(unsigned long start_pfn, unsigned long nr_pages) { return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e3086d0fd945..0bea94af0423 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5131,17 +5131,14 @@ static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch); static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset); static DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats); -/* - * Global mutex to protect against size modification of zonelists - * as well as to serialize pageset setup for the new populated zone. - */ -DEFINE_MUTEX(zonelists_mutex); - static void __build_all_zonelists(void *data) { int nid; int __maybe_unused cpu; pg_data_t *self = data; + static DEFINE_SPINLOCK(lock); + + spin_lock(&lock); #ifdef CONFIG_NUMA memset(node_load, 0, sizeof(node_load)); @@ -5173,6 +5170,8 @@ static void __build_all_zonelists(void *data) set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu))); #endif } + + spin_unlock(&lock); } static noinline void __init @@ -5203,7 +5202,6 @@ build_all_zonelists_init(void) } /* - * Called with zonelists_mutex held always * unless system_state == SYSTEM_BOOTING. * * __ref due to call of __init annotated helper build_all_zonelists_init @@ -6939,9 +6937,11 @@ static void __setup_per_zone_wmarks(void) */ void setup_per_zone_wmarks(void) { - mutex_lock(&zonelists_mutex); + static DEFINE_SPINLOCK(lock); + + spin_lock(&lock); __setup_per_zone_wmarks(); - mutex_unlock(&zonelists_mutex); + spin_unlock(&lock); } /* -- cgit v1.2.3 From b95046b0472f7a805fa28fbcfc7205a76ff7a7d0 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 6 Sep 2017 16:20:41 -0700 Subject: mm, sparse, page_ext: drop ugly N_HIGH_MEMORY branches for allocations Commit f52407ce2dea ("memory hotplug: alloc page from other node in memory online") has introduced N_HIGH_MEMORY checks to only use NUMA aware allocations when there is some memory present because the respective node might not have any memory yet at the time and so it could fail or even OOM. Things have changed since then though. Zonelists are now always initialized before we do any allocations even for hotplug (see 959ecc48fc75 ("mm/memory_hotplug.c: fix building of node hotplug zonelist")). Therefore these checks are not really needed. In fact caller of the allocator should never care about whether the node is populated because that might change at any time. Link: http://lkml.kernel.org/r/20170721143915.14161-10-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Vlastimil Babka Cc: Shaohua Li Cc: Joonsoo Kim Cc: Johannes Weiner Cc: Mel Gorman Cc: Toshi Kani Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_ext.c | 5 +---- mm/sparse-vmemmap.c | 11 +++-------- mm/sparse.c | 10 +++------- 3 files changed, 7 insertions(+), 19 deletions(-) (limited to 'mm') diff --git a/mm/page_ext.c b/mm/page_ext.c index 88ccc044b09a..714ce79256c5 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -222,10 +222,7 @@ static void *__meminit alloc_page_ext(size_t size, int nid) return addr; } - if (node_state(nid, N_HIGH_MEMORY)) - addr = vzalloc_node(size, nid); - else - addr = vzalloc(size); + addr = vzalloc_node(size, nid); return addr; } diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index c50b1a14d55e..d1a39b8051e0 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -54,14 +54,9 @@ void * __meminit vmemmap_alloc_block(unsigned long size, int node) if (slab_is_available()) { struct page *page; - if (node_state(node, N_HIGH_MEMORY)) - page = alloc_pages_node( - node, GFP_KERNEL | __GFP_ZERO | __GFP_RETRY_MAYFAIL, - get_order(size)); - else - page = alloc_pages( - GFP_KERNEL | __GFP_ZERO | __GFP_RETRY_MAYFAIL, - get_order(size)); + page = alloc_pages_node(node, + GFP_KERNEL | __GFP_ZERO | __GFP_RETRY_MAYFAIL, + get_order(size)); if (page) return page_address(page); return NULL; diff --git a/mm/sparse.c b/mm/sparse.c index 7b4be3fd5cac..a9783acf2bb9 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -65,14 +65,10 @@ static noinline struct mem_section __ref *sparse_index_alloc(int nid) unsigned long array_size = SECTIONS_PER_ROOT * sizeof(struct mem_section); - if (slab_is_available()) { - if (node_state(nid, N_HIGH_MEMORY)) - section = kzalloc_node(array_size, GFP_KERNEL, nid); - else - section = kzalloc(array_size, GFP_KERNEL); - } else { + if (slab_is_available()) + section = kzalloc_node(array_size, GFP_KERNEL, nid); + else section = memblock_virt_alloc_node(array_size, nid); - } return section; } -- cgit v1.2.3 From dab4ead1a9d88361c85a8209c7e23a8fd124e8d7 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 6 Sep 2017 16:20:44 -0700 Subject: mm, page_owner: make init_pages_in_zone() faster In init_pages_in_zone() we currently use the generic set_page_owner() function to initialize page_owner info for early allocated pages. This means we needlessly do lookup_page_ext() twice for each page, and more importantly save_stack(), which has to unwind the stack and find the corresponding stack depot handle. Because the stack is always the same for the initialization, unwind it once in init_pages_in_zone() and reuse the handle. Also avoid the repeated lookup_page_ext(). This can significantly reduce boot times with page_owner=on on large machines, especially for kernels built without frame pointer, where the stack unwinding is noticeably slower. [vbabka@suse.cz: don't duplicate code of __set_page_owner(), per Michal Hocko] [akpm@linux-foundation.org: coding-style fixes] [vbabka@suse.cz: create statically allocated fake stack trace for early allocated pages, per Michal] Link: http://lkml.kernel.org/r/45813564-2342-fc8d-d31a-f4b68a724325@suse.cz Link: http://lkml.kernel.org/r/20170720134029.25268-2-vbabka@suse.cz Signed-off-by: Vlastimil Babka Acked-by: Michal Hocko Cc: Joonsoo Kim Cc: Mel Gorman Cc: Yang Shi Cc: Laura Abbott Cc: Vinayak Menon Cc: zhong jiang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_owner.c | 52 ++++++++++++++++++++++++++++++++-------------------- 1 file changed, 32 insertions(+), 20 deletions(-) (limited to 'mm') diff --git a/mm/page_owner.c b/mm/page_owner.c index 0fd9dcf2c5dc..33634f74d0b2 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -30,6 +30,7 @@ DEFINE_STATIC_KEY_FALSE(page_owner_inited); static depot_stack_handle_t dummy_handle; static depot_stack_handle_t failure_handle; +static depot_stack_handle_t early_handle; static void init_early_allocated_pages(void); @@ -53,7 +54,7 @@ static bool need_page_owner(void) return true; } -static noinline void register_dummy_stack(void) +static __always_inline depot_stack_handle_t create_dummy_stack(void) { unsigned long entries[4]; struct stack_trace dummy; @@ -64,21 +65,22 @@ static noinline void register_dummy_stack(void) dummy.skip = 0; save_stack_trace(&dummy); - dummy_handle = depot_save_stack(&dummy, GFP_KERNEL); + return depot_save_stack(&dummy, GFP_KERNEL); } -static noinline void register_failure_stack(void) +static noinline void register_dummy_stack(void) { - unsigned long entries[4]; - struct stack_trace failure; + dummy_handle = create_dummy_stack(); +} - failure.nr_entries = 0; - failure.max_entries = ARRAY_SIZE(entries); - failure.entries = &entries[0]; - failure.skip = 0; +static noinline void register_failure_stack(void) +{ + failure_handle = create_dummy_stack(); +} - save_stack_trace(&failure); - failure_handle = depot_save_stack(&failure, GFP_KERNEL); +static noinline void register_early_stack(void) +{ + early_handle = create_dummy_stack(); } static void init_page_owner(void) @@ -88,6 +90,7 @@ static void init_page_owner(void) register_dummy_stack(); register_failure_stack(); + register_early_stack(); static_branch_enable(&page_owner_inited); init_early_allocated_pages(); } @@ -165,17 +168,13 @@ static noinline depot_stack_handle_t save_stack(gfp_t flags) return handle; } -noinline void __set_page_owner(struct page *page, unsigned int order, - gfp_t gfp_mask) +static inline void __set_page_owner_handle(struct page_ext *page_ext, + depot_stack_handle_t handle, unsigned int order, gfp_t gfp_mask) { - struct page_ext *page_ext = lookup_page_ext(page); struct page_owner *page_owner; - if (unlikely(!page_ext)) - return; - page_owner = get_page_owner(page_ext); - page_owner->handle = save_stack(gfp_mask); + page_owner->handle = handle; page_owner->order = order; page_owner->gfp_mask = gfp_mask; page_owner->last_migrate_reason = -1; @@ -183,6 +182,19 @@ noinline void __set_page_owner(struct page *page, unsigned int order, __set_bit(PAGE_EXT_OWNER, &page_ext->flags); } +noinline void __set_page_owner(struct page *page, unsigned int order, + gfp_t gfp_mask) +{ + struct page_ext *page_ext = lookup_page_ext(page); + depot_stack_handle_t handle; + + if (unlikely(!page_ext)) + return; + + handle = save_stack(gfp_mask); + __set_page_owner_handle(page_ext, handle, order, gfp_mask); +} + void __set_page_owner_migrate_reason(struct page *page, int reason) { struct page_ext *page_ext = lookup_page_ext(page); @@ -565,12 +577,12 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone) if (unlikely(!page_ext)) continue; - /* Maybe overraping zone */ + /* Maybe overlapping zone */ if (test_bit(PAGE_EXT_OWNER, &page_ext->flags)) continue; /* Found early allocated page */ - set_page_owner(page, 0, 0); + __set_page_owner_handle(page_ext, early_handle, 0, 0); count++; } } -- cgit v1.2.3 From 0fc542b7dd90a7080fc1a8c846d13a4ddad509ba Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 6 Sep 2017 16:20:48 -0700 Subject: mm, page_ext: periodically reschedule during page_ext_init() page_ext_init() can take long on large machines, so add a cond_resched() point after each section is processed. This will allow moving the init to a later point at boot without triggering lockup reports. Link: http://lkml.kernel.org/r/20170720134029.25268-3-vbabka@suse.cz Signed-off-by: Vlastimil Babka Acked-by: Michal Hocko Cc: Joonsoo Kim Cc: Mel Gorman Cc: Yang Shi Cc: Laura Abbott Cc: Vinayak Menon Cc: zhong jiang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_ext.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/page_ext.c b/mm/page_ext.c index 714ce79256c5..32f18911deda 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -406,6 +406,7 @@ void __init page_ext_init(void) continue; if (init_section_page_ext(pfn, nid)) goto oom; + cond_resched(); } } hotplug_memory_notifier(page_ext_callback, 0); -- cgit v1.2.3 From 10903027948d768d9639b31e9a555802e2dabafc Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 6 Sep 2017 16:20:51 -0700 Subject: mm, page_owner: don't grab zone->lock for init_pages_in_zone() init_pages_in_zone() is run under zone->lock, which means a long lock time and disabled interrupts on large machines. This is currently not an issue since it runs early in boot, but a later patch will change that. However, like other pfn scanners, we don't actually need zone->lock even when other cpus are running. The only potentially dangerous operation here is reading bogus buddy page owner due to race, and we already know how to handle that. The worst that can happen is that we skip some early allocated pages, which should not affect the debugging power of page_owner noticeably. Link: http://lkml.kernel.org/r/20170720134029.25268-4-vbabka@suse.cz Signed-off-by: Vlastimil Babka Acked-by: Michal Hocko Cc: Joonsoo Kim Cc: Mel Gorman Cc: Yang Shi Cc: Laura Abbott Cc: Vinayak Menon Cc: zhong jiang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_owner.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/page_owner.c b/mm/page_owner.c index 33634f74d0b2..8e2d7137510c 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -562,11 +562,17 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone) continue; /* - * We are safe to check buddy flag and order, because - * this is init stage and only single thread runs. + * To avoid having to grab zone->lock, be a little + * careful when reading buddy page order. The only + * danger is that we skip too much and potentially miss + * some early allocated pages, which is better than + * heavy lock contention. */ if (PageBuddy(page)) { - pfn += (1UL << page_order(page)) - 1; + unsigned long order = page_order_unsafe(page); + + if (order > 0 && order < MAX_ORDER) + pfn += (1UL << order) - 1; continue; } @@ -585,6 +591,7 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone) __set_page_owner_handle(page_ext, early_handle, 0, 0); count++; } + cond_resched(); } pr_info("Node %d, zone %8s: page owner found early allocated %lu pages\n", @@ -595,15 +602,12 @@ static void init_zones_in_node(pg_data_t *pgdat) { struct zone *zone; struct zone *node_zones = pgdat->node_zones; - unsigned long flags; for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { if (!populated_zone(zone)) continue; - spin_lock_irqsave(&zone->lock, flags); init_pages_in_zone(pgdat, zone); - spin_unlock_irqrestore(&zone->lock, flags); } } -- cgit v1.2.3 From dba58d3b8c5045ad89c1c95d33d01451e3964db7 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 6 Sep 2017 16:20:55 -0700 Subject: mm/mremap: fail map duplication attempts for private mappings mremap will attempt to create a 'duplicate' mapping if old_size == 0 is specified. In the case of private mappings, mremap will actually create a fresh separate private mapping unrelated to the original. This does not fit with the design semantics of mremap as the intention is to create a new mapping based on the original. Therefore, return EINVAL in the case where an attempt is made to duplicate a private mapping. Also, print a warning message (once) if such an attempt is made. Link: http://lkml.kernel.org/r/cb9d9f6a-7095-582f-15a5-62643d65c736@oracle.com Signed-off-by: Mike Kravetz Acked-by: Michal Hocko Cc: Andrea Arcangeli Cc: Aaron Lu Cc: "Kirill A . Shutemov" Cc: Vlastimil Babka Cc: Anshuman Khandual Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mremap.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'mm') diff --git a/mm/mremap.c b/mm/mremap.c index 3f23715d3c69..7395564daa6c 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -384,6 +384,19 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr, if (!vma || vma->vm_start > addr) return ERR_PTR(-EFAULT); + /* + * !old_len is a special case where an attempt is made to 'duplicate' + * a mapping. This makes no sense for private mappings as it will + * instead create a fresh/new mapping unrelated to the original. This + * is contrary to the basic idea of mremap which creates new mappings + * based on the original. There are no known use cases for this + * behavior. As a result, fail such attempts. + */ + if (!old_len && !(vma->vm_flags & (VM_SHARED | VM_MAYSHARE))) { + pr_warn_once("%s (%d): attempted to duplicate a private mapping with mremap. This is not supported.\n", current->comm, current->pid); + return ERR_PTR(-EINVAL); + } + if (is_vm_hugetlb_page(vma)) return ERR_PTR(-EINVAL); -- cgit v1.2.3 From 09180ca4b3bc7b1ffdbae62a75f716dfc0685861 Mon Sep 17 00:00:00 2001 From: Oliver O'Halloran Date: Wed, 6 Sep 2017 16:20:58 -0700 Subject: mm/gup: make __gup_device_* require THP These functions are the only bits of generic code that use {pud,pmd}_pfn() without checking for CONFIG_TRANSPARENT_HUGEPAGE. This works fine on x86, the only arch with devmap support, since the *_pfn() functions are always defined there, but this isn't true for every architecture. Link: http://lkml.kernel.org/r/20170626063833.11094-1-oohall@gmail.com Signed-off-by: Oliver O'Halloran Cc: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/gup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/gup.c b/mm/gup.c index 23f01c40c88f..33d651deeae2 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1352,7 +1352,7 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, } #endif /* __HAVE_ARCH_PTE_SPECIAL */ -#ifdef __HAVE_ARCH_PTE_DEVMAP +#if defined(__HAVE_ARCH_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE) static int __gup_device_huge(unsigned long pfn, unsigned long addr, unsigned long end, struct page **pages, int *nr) { -- cgit v1.2.3 From 9b19df292c666b57c407fed2496827c6aba05be2 Mon Sep 17 00:00:00 2001 From: Punit Agrawal Date: Wed, 6 Sep 2017 16:21:01 -0700 Subject: mm/hugetlb.c: make huge_pte_offset() consistent and document behaviour When walking the page tables to resolve an address that points to !p*d_present() entry, huge_pte_offset() returns inconsistent values depending on the level of page table (PUD or PMD). It returns NULL in the case of a PUD entry while in the case of a PMD entry, it returns a pointer to the page table entry. A similar inconsitency exists when handling swap entries - returns NULL for a PUD entry while a pointer to the pte_t is retured for the PMD entry. Update huge_pte_offset() to make the behaviour consistent - return a pointer to the pte_t for hugepage or swap entries. Only return NULL in instances where we have a p*d_none() entry and the size parameter doesn't match the hugepage size at this level of the page table. Document the behaviour to clarify the expected behaviour of this function. This is to set clear semantics for architecture specific implementations of huge_pte_offset(). Discussions on the arm64 implementation of huge_pte_offset() (http://www.spinics.net/lists/linux-mm/msg133699.html) showed that there is benefit from returning a pte_t* in the case of p*d_none(). The fault handling code in hugetlb_fault() can handle p*d_none() entries and saves an extra round trip to huge_pte_alloc(). Other callers of huge_pte_offset() should be ok as well. [punit.agrawal@arm.com: v2] Link: http://lkml.kernel.org/r/20170725154114.24131-2-punit.agrawal@arm.com Signed-off-by: Punit Agrawal Reviewed-by: Catalin Marinas Reviewed-by: Mike Kravetz Reviewed-by: Catalin Marinas Acked-by: Michal Hocko Cc: Naoya Horiguchi Cc: Steve Capper Cc: Will Deacon Cc: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 31e207cb399b..1d54a131bdd5 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4600,6 +4600,15 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, return pte; } +/* + * huge_pte_offset() - Walk the page table to resolve the hugepage + * entry at address @addr + * + * Return: Pointer to page table or swap entry (PUD or PMD) for + * address @addr, or NULL if a p*d_none() entry is encountered and the + * size @sz doesn't match the hugepage size at this level of the page + * table. + */ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, unsigned long sz) { @@ -4614,13 +4623,22 @@ pte_t *huge_pte_offset(struct mm_struct *mm, p4d = p4d_offset(pgd, addr); if (!p4d_present(*p4d)) return NULL; + pud = pud_offset(p4d, addr); - if (!pud_present(*pud)) + if (sz != PUD_SIZE && pud_none(*pud)) return NULL; - if (pud_huge(*pud)) + /* hugepage or swap? */ + if (pud_huge(*pud) || !pud_present(*pud)) return (pte_t *)pud; + pmd = pmd_offset(pud, addr); - return (pte_t *) pmd; + if (sz != PMD_SIZE && pmd_none(*pmd)) + return NULL; + /* hugepage or swap? */ + if (pmd_huge(*pmd) || !pmd_present(*pmd)) + return (pte_t *)pmd; + + return NULL; } #endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */ -- cgit v1.2.3 From 4647706ebeee6e50f7b9f922b095f4ec94d581c3 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 6 Sep 2017 16:21:05 -0700 Subject: mm: always flush VMA ranges affected by zap_page_range Nadav Amit report zap_page_range only specifies that the caller protect the VMA list but does not specify whether it is held for read or write with callers using either. madvise holds mmap_sem for read meaning that a parallel zap operation can unmap PTEs which are then potentially skipped by madvise which potentially returns with stale TLB entries present. While the API could be extended, it would be a difficult API to use. This patch causes zap_page_range() to always consider flushing the full affected range. For small ranges or sparsely populated mappings, this may result in one additional spurious TLB flush. For larger ranges, it is possible that the TLB has already been flushed and the overhead is negligible. Either way, this approach is safer overall and avoids stale entries being present when madvise returns. This can be illustrated with the following program provided by Nadav Amit and slightly modified. With the patch applied, it has an exit code of 0 indicating a stale TLB entry did not leak to userspace. ---8<--- volatile int sync_step = 0; volatile char *p; static inline unsigned long rdtsc() { unsigned long hi, lo; __asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi)); return lo | (hi << 32); } static inline void wait_rdtsc(unsigned long cycles) { unsigned long tsc = rdtsc(); while (rdtsc() - tsc < cycles); } void *big_madvise_thread(void *ign) { sync_step = 1; while (sync_step != 2); madvise((void*)p, PAGE_SIZE * N_PAGES, MADV_DONTNEED); } int main(void) { pthread_t aux_thread; p = mmap(0, PAGE_SIZE * N_PAGES, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); memset((void*)p, 8, PAGE_SIZE * N_PAGES); pthread_create(&aux_thread, NULL, big_madvise_thread, NULL); while (sync_step != 1); *p = 8; // Cache in TLB sync_step = 2; wait_rdtsc(100000); madvise((void*)p, PAGE_SIZE, MADV_DONTNEED); printf("data: %d (%s)\n", *p, (*p == 8 ? "stale, broken" : "cleared, fine")); return *p == 8 ? -1 : 0; } ---8<--- Link: http://lkml.kernel.org/r/20170725101230.5v7gvnjmcnkzzql3@techsingularity.net Signed-off-by: Mel Gorman Reported-by: Nadav Amit Cc: Andy Lutomirski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 71c0b6f98a62..1416485e278c 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1513,8 +1513,20 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long start, tlb_gather_mmu(&tlb, mm, start, end); update_hiwater_rss(mm); mmu_notifier_invalidate_range_start(mm, start, end); - for ( ; vma && vma->vm_start < end; vma = vma->vm_next) + for ( ; vma && vma->vm_start < end; vma = vma->vm_next) { unmap_single_vma(&tlb, vma, start, end, NULL); + + /* + * zap_page_range does not specify whether mmap_sem should be + * held for read or write. That allows parallel zap_page_range + * operations to unmap a PTE and defer a flush meaning that + * this call observes pte_none and fails to flush the TLB. + * Rather than adding a complex API, ensure that no stale + * TLB entries exist when this call returns. + */ + flush_tlb_range(vma, start, end); + } + mmu_notifier_invalidate_range_end(mm, start, end); tlb_finish_mmu(&tlb, start, end); } -- cgit v1.2.3 From 77ff465799c60294e248000cd22ae8171da3304c Mon Sep 17 00:00:00 2001 From: Hui Zhu Date: Wed, 6 Sep 2017 16:21:08 -0700 Subject: zsmalloc: zs_page_migrate: skip unnecessary loops but not return -EBUSY if zspage is not inuse Getting -EBUSY from zs_page_migrate will make migration slow (retry) or fail (zs_page_putback will schedule_work free_work, but it cannot ensure the success). I noticed this issue because my Kernel patched (https://lkml.org/lkml/2014/5/28/113) that will remove retry in __alloc_contig_migrate_range. This retry will handle the -EBUSY because it will re-isolate the page and re-call migrate_pages. Without it will make cma_alloc fail at once with -EBUSY. According to the review from Minchan Kim in https://lkml.org/lkml/2014/5/28/113, I update the patch to skip unnecessary loops but not return -EBUSY if zspage is not inuse. Following is what I got with highalloc-performance in a vbox with 2 cpu 1G memory 512 zram as swap. And the swappiness is set to 100. ori ne orig new Minor Faults 50805113 50830235 Major Faults 43918 56530 Swap Ins 42087 55680 Swap Outs 89718 104700 Allocation stalls 0 0 DMA allocs 57787 52364 DMA32 allocs 47964599 48043563 Normal allocs 0 0 Movable allocs 0 0 Direct pages scanned 45493 23167 Kswapd pages scanned 1565222 1725078 Kswapd pages reclaimed 1342222 1503037 Direct pages reclaimed 45615 25186 Kswapd efficiency 85% 87% Kswapd velocity 1897.101 1949.042 Direct efficiency 100% 108% Direct velocity 55.139 26.175 Percentage direct scans 2% 1% Zone normal velocity 1952.240 1975.217 Zone dma32 velocity 0.000 0.000 Zone dma velocity 0.000 0.000 Page writes by reclaim 89764.000 105233.000 Page writes file 46 533 Page writes anon 89718 104700 Page reclaim immediate 21457 3699 Sector Reads 3259688 3441368 Sector Writes 3667252 3754836 Page rescued immediate 0 0 Slabs scanned 1042872 1160855 Direct inode steals 8042 10089 Kswapd inode steals 54295 29170 Kswapd skipped wait 0 0 THP fault alloc 175 154 THP collapse alloc 226 289 THP splits 0 0 THP fault fallback 11 14 THP collapse fail 3 2 Compaction stalls 536 646 Compaction success 322 358 Compaction failures 214 288 Page migrate success 119608 111063 Page migrate failure 2723 2593 Compaction pages isolated 250179 232652 Compaction migrate scanned 9131832 9942306 Compaction free scanned 2093272 2613998 Compaction cost 192 189 NUMA alloc hit 47124555 47193990 NUMA alloc miss 0 0 NUMA interleave hit 0 0 NUMA alloc local 47124555 47193990 NUMA base PTE updates 0 0 NUMA huge PMD updates 0 0 NUMA page range updates 0 0 NUMA hint faults 0 0 NUMA hint local faults 0 0 NUMA hint local percent 100 100 NUMA pages migrated 0 0 AutoNUMA cost 0% 0% [akpm@linux-foundation.org: remove newline, per Minchan] Link: http://lkml.kernel.org/r/1500889535-19648-1-git-send-email-zhuhui@xiaomi.com Signed-off-by: Hui Zhu Acked-by: Minchan Kim Reviewed-by: Sergey Senozhatsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/zsmalloc.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 308acb9d814b..62457eb82330 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -1983,8 +1983,11 @@ int zs_page_migrate(struct address_space *mapping, struct page *newpage, spin_lock(&class->lock); if (!get_zspage_inuse(zspage)) { - ret = -EBUSY; - goto unlock_class; + /* + * Set "offset" to end of the page so that every loops + * skips unnecessary object scanning. + */ + offset = PAGE_SIZE; } pos = offset; @@ -2052,7 +2055,6 @@ unpin_objects: } } kunmap_atomic(s_addr); -unlock_class: spin_unlock(&class->lock); migrate_write_unlock(zspage); -- cgit v1.2.3 From db73ee0d463799223244e96e7b7eea73b4a6ec31 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 6 Sep 2017 16:21:11 -0700 Subject: mm, vmscan: do not loop on too_many_isolated for ever Tetsuo Handa has reported[1][2][3] that direct reclaimers might get stuck in too_many_isolated loop basically for ever because the last few pages on the LRU lists are isolated by the kswapd which is stuck on fs locks when doing the pageout or slab reclaim. This in turn means that there is nobody to actually trigger the oom killer and the system is basically unusable. too_many_isolated has been introduced by commit 35cd78156c49 ("vmscan: throttle direct reclaim when too many pages are isolated already") to prevent from pre-mature oom killer invocations because back then no reclaim progress could indeed trigger the OOM killer too early. But since the oom detection rework in commit 0a0337e0d1d1 ("mm, oom: rework oom detection") the allocation/reclaim retry loop considers all the reclaimable pages and throttles the allocation at that layer so we can loosen the direct reclaim throttling. Make shrink_inactive_list loop over too_many_isolated bounded and returns immediately when the situation hasn't resolved after the first sleep. Replace congestion_wait by a simple schedule_timeout_interruptible because we are not really waiting on the IO congestion in this path. Please note that this patch can theoretically cause the OOM killer to trigger earlier while there are many pages isolated for the reclaim which makes progress only very slowly. This would be obvious from the oom report as the number of isolated pages are printed there. If we ever hit this should_reclaim_retry should consider those numbers in the evaluation in one way or another. [1] http://lkml.kernel.org/r/201602092349.ACG81273.OSVtMJQHLOFOFF@I-love.SAKURA.ne.jp [2] http://lkml.kernel.org/r/201702212335.DJB30777.JOFMHSFtVLQOOF@I-love.SAKURA.ne.jp [3] http://lkml.kernel.org/r/201706300914.CEH95859.FMQOLVFHJFtOOS@I-love.SAKURA.ne.jp [mhocko@suse.com: switch to uninterruptible sleep] Link: http://lkml.kernel.org/r/20170724065048.GB25221@dhcp22.suse.cz Link: http://lkml.kernel.org/r/20170710074842.23175-1-mhocko@kernel.org Signed-off-by: Michal Hocko Reported-by: Tetsuo Handa Tested-by: Tetsuo Handa Acked-by: Mel Gorman Acked-by: Vlastimil Babka Acked-by: Rik van Riel Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 095817820a56..1638814c7848 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1743,9 +1743,15 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, int file = is_file_lru(lru); struct pglist_data *pgdat = lruvec_pgdat(lruvec); struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; + bool stalled = false; while (unlikely(too_many_isolated(pgdat, file, sc))) { - congestion_wait(BLK_RW_ASYNC, HZ/10); + if (stalled) + return 0; + + /* wait a bit for the reclaimer. */ + msleep(100); + stalled = true; /* We are about to die and free our memory. Return now. */ if (fatal_signal_pending(current)) -- cgit v1.2.3 From d72dc8a25afc71ce90ee92bdd77550e9beb85d4d Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 6 Sep 2017 16:21:18 -0700 Subject: mm: make pagevec_lookup() update index Make pagevec_lookup() (and underlying find_get_pages()) update index to the next page where iteration should continue. Most callers want this and also pagevec_lookup_tag() already does this. Link: http://lkml.kernel.org/r/20170726114704.7626-3-jack@suse.cz Signed-off-by: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/buffer.c | 10 ++++------ fs/ext4/file.c | 4 +--- fs/ext4/inode.c | 8 ++------ fs/fscache/page.c | 5 ++--- fs/hugetlbfs/inode.c | 17 ++++++++--------- fs/nilfs2/page.c | 3 +-- fs/ramfs/file-nommu.c | 2 +- include/linux/pagemap.h | 2 +- include/linux/pagevec.h | 2 +- mm/filemap.c | 11 ++++++++--- mm/swap.c | 5 +++-- 11 files changed, 32 insertions(+), 37 deletions(-) (limited to 'mm') diff --git a/fs/buffer.c b/fs/buffer.c index 5715dac7821f..5b20893708e2 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1633,13 +1633,12 @@ void clean_bdev_aliases(struct block_device *bdev, sector_t block, sector_t len) end = (block + len - 1) >> (PAGE_SHIFT - bd_inode->i_blkbits); pagevec_init(&pvec, 0); - while (index <= end && pagevec_lookup(&pvec, bd_mapping, index, + while (index <= end && pagevec_lookup(&pvec, bd_mapping, &index, min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; - index = page->index; - if (index > end) + if (page->index > end) break; if (!page_has_buffers(page)) continue; @@ -1670,7 +1669,6 @@ unlock_page: } pagevec_release(&pvec); cond_resched(); - index++; } } EXPORT_SYMBOL(clean_bdev_aliases); @@ -3552,7 +3550,8 @@ page_cache_seek_hole_data(struct inode *inode, loff_t offset, loff_t length, unsigned want, nr_pages, i; want = min_t(unsigned, end - index, PAGEVEC_SIZE); - nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index, want); + nr_pages = pagevec_lookup(&pvec, inode->i_mapping, &index, + want); if (nr_pages == 0) break; @@ -3594,7 +3593,6 @@ page_cache_seek_hole_data(struct inode *inode, loff_t offset, loff_t length, if (nr_pages < want) break; - index = pvec.pages[i - 1]->index + 1; pagevec_release(&pvec); } while (index < end); diff --git a/fs/ext4/file.c b/fs/ext4/file.c index f28ac999dfba..1c73b21fdc13 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -468,7 +468,7 @@ static int ext4_find_unwritten_pgoff(struct inode *inode, unsigned long nr_pages; num = min_t(pgoff_t, end - index, PAGEVEC_SIZE - 1) + 1; - nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index, + nr_pages = pagevec_lookup(&pvec, inode->i_mapping, &index, (pgoff_t)num); if (nr_pages == 0) break; @@ -536,8 +536,6 @@ next: /* The no. of pages is less than our desired, we are done. */ if (nr_pages < num) break; - - index = pvec.pages[i - 1]->index + 1; pagevec_release(&pvec); } while (index <= end); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index c774bdc22759..b3ce1c6d9f23 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1720,7 +1720,7 @@ static void mpage_release_unused_pages(struct mpage_da_data *mpd, pagevec_init(&pvec, 0); while (index <= end) { - nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); + nr_pages = pagevec_lookup(&pvec, mapping, &index, PAGEVEC_SIZE); if (nr_pages == 0) break; for (i = 0; i < nr_pages; i++) { @@ -1737,7 +1737,6 @@ static void mpage_release_unused_pages(struct mpage_da_data *mpd, } unlock_page(page); } - index = pvec.pages[nr_pages - 1]->index + 1; pagevec_release(&pvec); } } @@ -2348,7 +2347,7 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) pagevec_init(&pvec, 0); while (start <= end) { - nr_pages = pagevec_lookup(&pvec, inode->i_mapping, start, + nr_pages = pagevec_lookup(&pvec, inode->i_mapping, &start, PAGEVEC_SIZE); if (nr_pages == 0) break; @@ -2357,8 +2356,6 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) if (page->index > end) break; - /* Up to 'end' pages must be contiguous */ - BUG_ON(page->index != start); bh = head = page_buffers(page); do { if (lblk < mpd->map.m_lblk) @@ -2403,7 +2400,6 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) pagevec_release(&pvec); return err; } - start++; } pagevec_release(&pvec); } diff --git a/fs/fscache/page.c b/fs/fscache/page.c index c8c4f79c7ce1..83018861dcd2 100644 --- a/fs/fscache/page.c +++ b/fs/fscache/page.c @@ -1178,11 +1178,10 @@ void __fscache_uncache_all_inode_pages(struct fscache_cookie *cookie, pagevec_init(&pvec, 0); next = 0; do { - if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) + if (!pagevec_lookup(&pvec, mapping, &next, PAGEVEC_SIZE)) break; for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; - next = page->index; if (PageFsCache(page)) { __fscache_wait_on_page_write(cookie, page); __fscache_uncache_page(cookie, page); @@ -1190,7 +1189,7 @@ void __fscache_uncache_all_inode_pages(struct fscache_cookie *cookie, } pagevec_release(&pvec); cond_resched(); - } while (++next); + } while (next); _leave(""); } diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 28d2753be094..b9678ce91e25 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -401,7 +401,7 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, const pgoff_t end = lend >> huge_page_shift(h); struct vm_area_struct pseudo_vma; struct pagevec pvec; - pgoff_t next; + pgoff_t next, index; int i, freed = 0; long lookup_nr = PAGEVEC_SIZE; bool truncate_op = (lend == LLONG_MAX); @@ -420,7 +420,7 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, /* * When no more pages are found, we are done. */ - if (!pagevec_lookup(&pvec, mapping, next, lookup_nr)) + if (!pagevec_lookup(&pvec, mapping, &next, lookup_nr)) break; for (i = 0; i < pagevec_count(&pvec); ++i) { @@ -432,13 +432,13 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, * only possible in the punch hole case as end is * max page offset in the truncate case. */ - next = page->index; - if (next >= end) + index = page->index; + if (index >= end) break; hash = hugetlb_fault_mutex_hash(h, current->mm, &pseudo_vma, - mapping, next, 0); + mapping, index, 0); mutex_lock(&hugetlb_fault_mutex_table[hash]); /* @@ -455,8 +455,8 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, i_mmap_lock_write(mapping); hugetlb_vmdelete_list(&mapping->i_mmap, - next * pages_per_huge_page(h), - (next + 1) * pages_per_huge_page(h)); + index * pages_per_huge_page(h), + (index + 1) * pages_per_huge_page(h)); i_mmap_unlock_write(mapping); } @@ -475,14 +475,13 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, freed++; if (!truncate_op) { if (unlikely(hugetlb_unreserve_pages(inode, - next, next + 1, 1))) + index, index + 1, 1))) hugetlb_fix_reserve_counts(inode); } unlock_page(page); mutex_unlock(&hugetlb_fault_mutex_table[hash]); } - ++next; huge_pagevec_release(&pvec); cond_resched(); } diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index f11a3ad2df0c..382a36c72d72 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -312,10 +312,9 @@ void nilfs_copy_back_pages(struct address_space *dmap, pagevec_init(&pvec, 0); repeat: - n = pagevec_lookup(&pvec, smap, index, PAGEVEC_SIZE); + n = pagevec_lookup(&pvec, smap, &index, PAGEVEC_SIZE); if (!n) return; - index = pvec.pages[n - 1]->index + 1; for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i], *dpage; diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index 2ef7ce75c062..3ac1f2387083 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c @@ -228,7 +228,7 @@ static unsigned long ramfs_nommu_get_unmapped_area(struct file *file, if (!pages) goto out_free; - nr = find_get_pages(inode->i_mapping, pgoff, lpages, pages); + nr = find_get_pages(inode->i_mapping, &pgoff, lpages, pages); if (nr != lpages) goto out_free_pages; /* leave if some pages were missing */ diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 79b36f57c3ba..249b1b5964c7 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -353,7 +353,7 @@ struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset); unsigned find_get_entries(struct address_space *mapping, pgoff_t start, unsigned int nr_entries, struct page **entries, pgoff_t *indices); -unsigned find_get_pages(struct address_space *mapping, pgoff_t start, +unsigned find_get_pages(struct address_space *mapping, pgoff_t *start, unsigned int nr_pages, struct page **pages); unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t start, unsigned int nr_pages, struct page **pages); diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h index b45d391b4540..c395a5bb58b2 100644 --- a/include/linux/pagevec.h +++ b/include/linux/pagevec.h @@ -28,7 +28,7 @@ unsigned pagevec_lookup_entries(struct pagevec *pvec, pgoff_t *indices); void pagevec_remove_exceptionals(struct pagevec *pvec); unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping, - pgoff_t start, unsigned nr_pages); + pgoff_t *start, unsigned nr_pages); unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping, pgoff_t *index, int tag, unsigned nr_pages); diff --git a/mm/filemap.c b/mm/filemap.c index dad935769055..ab9011408d81 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -403,7 +403,7 @@ bool filemap_range_has_page(struct address_space *mapping, return false; pagevec_init(&pvec, 0); - if (!pagevec_lookup(&pvec, mapping, index, 1)) + if (!pagevec_lookup(&pvec, mapping, &index, 1)) return false; ret = (pvec.pages[0]->index <= end); pagevec_release(&pvec); @@ -1569,10 +1569,11 @@ export: * * The search returns a group of mapping-contiguous pages with ascending * indexes. There may be holes in the indices due to not-present pages. + * We also update @start to index the next page for the traversal. * * find_get_pages() returns the number of pages which were found. */ -unsigned find_get_pages(struct address_space *mapping, pgoff_t start, +unsigned find_get_pages(struct address_space *mapping, pgoff_t *start, unsigned int nr_pages, struct page **pages) { struct radix_tree_iter iter; @@ -1583,7 +1584,7 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t start, return 0; rcu_read_lock(); - radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { + radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, *start) { struct page *head, *page; repeat: page = radix_tree_deref_slot(slot); @@ -1625,6 +1626,10 @@ repeat: } rcu_read_unlock(); + + if (ret) + *start = pages[ret - 1]->index + 1; + return ret; } diff --git a/mm/swap.c b/mm/swap.c index 60b1d2a75852..4bffd1198ce5 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -957,12 +957,13 @@ void pagevec_remove_exceptionals(struct pagevec *pvec) * reference against the pages in @pvec. * * The search returns a group of mapping-contiguous pages with ascending - * indexes. There may be holes in the indices due to not-present pages. + * indexes. There may be holes in the indices due to not-present pages. We + * also update @start to index the next page for the traversal. * * pagevec_lookup() returns the number of pages which were found. */ unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping, - pgoff_t start, unsigned nr_pages) + pgoff_t *start, unsigned nr_pages) { pvec->nr = find_get_pages(mapping, start, nr_pages, pvec->pages); return pagevec_count(pvec); -- cgit v1.2.3 From b947cee4b96306037e166ff1ea5156c0ecdd7d91 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 6 Sep 2017 16:21:21 -0700 Subject: mm: implement find_get_pages_range() Implement a variant of find_get_pages() that stops iterating at given index. This may be substantial performance gain if the mapping is sparse. See following commit for details. Furthermore lots of users of this function (through pagevec_lookup()) actually want a range lookup and all of them are currently open-coding this. Also create corresponding pagevec_lookup_range() function. Link: http://lkml.kernel.org/r/20170726114704.7626-4-jack@suse.cz Signed-off-by: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagemap.h | 12 ++++++++++-- include/linux/pagevec.h | 13 +++++++++++-- mm/filemap.c | 42 ++++++++++++++++++++++++++++++------------ mm/swap.c | 22 ++++++++++++++-------- 4 files changed, 65 insertions(+), 24 deletions(-) (limited to 'mm') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 249b1b5964c7..5bbd6780f205 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -353,8 +353,16 @@ struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset); unsigned find_get_entries(struct address_space *mapping, pgoff_t start, unsigned int nr_entries, struct page **entries, pgoff_t *indices); -unsigned find_get_pages(struct address_space *mapping, pgoff_t *start, - unsigned int nr_pages, struct page **pages); +unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start, + pgoff_t end, unsigned int nr_pages, + struct page **pages); +static inline unsigned find_get_pages(struct address_space *mapping, + pgoff_t *start, unsigned int nr_pages, + struct page **pages) +{ + return find_get_pages_range(mapping, start, (pgoff_t)-1, nr_pages, + pages); +} unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t start, unsigned int nr_pages, struct page **pages); unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h index c395a5bb58b2..7df056910437 100644 --- a/include/linux/pagevec.h +++ b/include/linux/pagevec.h @@ -27,8 +27,17 @@ unsigned pagevec_lookup_entries(struct pagevec *pvec, pgoff_t start, unsigned nr_entries, pgoff_t *indices); void pagevec_remove_exceptionals(struct pagevec *pvec); -unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping, - pgoff_t *start, unsigned nr_pages); +unsigned pagevec_lookup_range(struct pagevec *pvec, + struct address_space *mapping, + pgoff_t *start, pgoff_t end, unsigned nr_pages); +static inline unsigned pagevec_lookup(struct pagevec *pvec, + struct address_space *mapping, + pgoff_t *start, unsigned nr_pages) +{ + return pagevec_lookup_range(pvec, mapping, start, (pgoff_t)-1, + nr_pages); +} + unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping, pgoff_t *index, int tag, unsigned nr_pages); diff --git a/mm/filemap.c b/mm/filemap.c index ab9011408d81..129883f160a7 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1557,24 +1557,29 @@ export: } /** - * find_get_pages - gang pagecache lookup + * find_get_pages_range - gang pagecache lookup * @mapping: The address_space to search * @start: The starting page index + * @end: The final page index (inclusive) * @nr_pages: The maximum number of pages * @pages: Where the resulting pages are placed * - * find_get_pages() will search for and return a group of up to - * @nr_pages pages in the mapping. The pages are placed at @pages. - * find_get_pages() takes a reference against the returned pages. + * find_get_pages_range() will search for and return a group of up to @nr_pages + * pages in the mapping starting at index @start and up to index @end + * (inclusive). The pages are placed at @pages. find_get_pages_range() takes + * a reference against the returned pages. * * The search returns a group of mapping-contiguous pages with ascending * indexes. There may be holes in the indices due to not-present pages. * We also update @start to index the next page for the traversal. * - * find_get_pages() returns the number of pages which were found. + * find_get_pages_range() returns the number of pages which were found. If this + * number is smaller than @nr_pages, the end of specified range has been + * reached. */ -unsigned find_get_pages(struct address_space *mapping, pgoff_t *start, - unsigned int nr_pages, struct page **pages) +unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start, + pgoff_t end, unsigned int nr_pages, + struct page **pages) { struct radix_tree_iter iter; void **slot; @@ -1586,6 +1591,9 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t *start, rcu_read_lock(); radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, *start) { struct page *head, *page; + + if (iter.index > end) + break; repeat: page = radix_tree_deref_slot(slot); if (unlikely(!page)) @@ -1621,15 +1629,25 @@ repeat: } pages[ret] = page; - if (++ret == nr_pages) - break; + if (++ret == nr_pages) { + *start = pages[ret - 1]->index + 1; + goto out; + } } + /* + * We come here when there is no page beyond @end. We take care to not + * overflow the index @start as it confuses some of the callers. This + * breaks the iteration when there is page at index -1 but that is + * already broken anyway. + */ + if (end == (pgoff_t)-1) + *start = (pgoff_t)-1; + else + *start = end + 1; +out: rcu_read_unlock(); - if (ret) - *start = pages[ret - 1]->index + 1; - return ret; } diff --git a/mm/swap.c b/mm/swap.c index 4bffd1198ce5..e06e9aa2478e 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -946,29 +946,35 @@ void pagevec_remove_exceptionals(struct pagevec *pvec) } /** - * pagevec_lookup - gang pagecache lookup + * pagevec_lookup_range - gang pagecache lookup * @pvec: Where the resulting pages are placed * @mapping: The address_space to search * @start: The starting page index + * @end: The final page index * @nr_pages: The maximum number of pages * - * pagevec_lookup() will search for and return a group of up to @nr_pages pages - * in the mapping. The pages are placed in @pvec. pagevec_lookup() takes a + * pagevec_lookup_range() will search for and return a group of up to @nr_pages + * pages in the mapping starting from index @start and upto index @end + * (inclusive). The pages are placed in @pvec. pagevec_lookup() takes a * reference against the pages in @pvec. * * The search returns a group of mapping-contiguous pages with ascending * indexes. There may be holes in the indices due to not-present pages. We * also update @start to index the next page for the traversal. * - * pagevec_lookup() returns the number of pages which were found. + * pagevec_lookup_range() returns the number of pages which were found. If this + * number is smaller than @nr_pages, the end of specified range has been + * reached. */ -unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping, - pgoff_t *start, unsigned nr_pages) +unsigned pagevec_lookup_range(struct pagevec *pvec, + struct address_space *mapping, pgoff_t *start, pgoff_t end, + unsigned nr_pages) { - pvec->nr = find_get_pages(mapping, start, nr_pages, pvec->pages); + pvec->nr = find_get_pages_range(mapping, start, end, nr_pages, + pvec->pages); return pagevec_count(pvec); } -EXPORT_SYMBOL(pagevec_lookup); +EXPORT_SYMBOL(pagevec_lookup_range); unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping, pgoff_t *index, int tag, unsigned nr_pages) -- cgit v1.2.3 From f7b68046873724129798c405e1a4e326b409c08f Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 6 Sep 2017 16:21:40 -0700 Subject: mm: use find_get_pages_range() in filemap_range_has_page() We want only pages from given range in filemap_range_has_page(), furthermore we want at most a single page. So use find_get_pages_range() instead of pagevec_lookup() and remove unnecessary code. Link: http://lkml.kernel.org/r/20170726114704.7626-10-jack@suse.cz Signed-off-by: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 129883f160a7..84617a05db50 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -393,8 +393,7 @@ bool filemap_range_has_page(struct address_space *mapping, { pgoff_t index = start_byte >> PAGE_SHIFT; pgoff_t end = end_byte >> PAGE_SHIFT; - struct pagevec pvec; - bool ret; + struct page *page; if (end_byte < start_byte) return false; @@ -402,12 +401,10 @@ bool filemap_range_has_page(struct address_space *mapping, if (mapping->nrpages == 0) return false; - pagevec_init(&pvec, 0); - if (!pagevec_lookup(&pvec, mapping, &index, 1)) + if (!find_get_pages_range(mapping, &index, end, 1, &page)) return false; - ret = (pvec.pages[0]->index <= end); - pagevec_release(&pvec); - return ret; + put_page(page); + return true; } EXPORT_SYMBOL(filemap_range_has_page); -- cgit v1.2.3 From 397162ffa2ed1cadffe05c324c6ddc53647f9c62 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 6 Sep 2017 16:21:43 -0700 Subject: mm: remove nr_pages argument from pagevec_lookup{,_range}() All users of pagevec_lookup() and pagevec_lookup_range() now pass PAGEVEC_SIZE as a desired number of pages. Just drop the argument. Link: http://lkml.kernel.org/r/20170726114704.7626-11-jack@suse.cz Signed-off-by: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/buffer.c | 5 ++--- fs/ext4/file.c | 2 +- fs/ext4/inode.c | 5 ++--- fs/fscache/page.c | 2 +- fs/hugetlbfs/inode.c | 3 +-- fs/nilfs2/page.c | 2 +- include/linux/pagevec.h | 7 +++---- mm/swap.c | 5 ++--- 8 files changed, 13 insertions(+), 18 deletions(-) (limited to 'mm') diff --git a/fs/buffer.c b/fs/buffer.c index 8770b58ca569..50da0e102ca0 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1633,8 +1633,7 @@ void clean_bdev_aliases(struct block_device *bdev, sector_t block, sector_t len) end = (block + len - 1) >> (PAGE_SHIFT - bd_inode->i_blkbits); pagevec_init(&pvec, 0); - while (pagevec_lookup_range(&pvec, bd_mapping, &index, end, - PAGEVEC_SIZE)) { + while (pagevec_lookup_range(&pvec, bd_mapping, &index, end)) { count = pagevec_count(&pvec); for (i = 0; i < count; i++) { struct page *page = pvec.pages[i]; @@ -3552,7 +3551,7 @@ page_cache_seek_hole_data(struct inode *inode, loff_t offset, loff_t length, unsigned nr_pages, i; nr_pages = pagevec_lookup_range(&pvec, inode->i_mapping, &index, - end - 1, PAGEVEC_SIZE); + end - 1); if (nr_pages == 0) break; diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 726344809721..484c1326dd6a 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -468,7 +468,7 @@ static int ext4_find_unwritten_pgoff(struct inode *inode, unsigned long nr_pages; nr_pages = pagevec_lookup_range(&pvec, inode->i_mapping, - &index, end, PAGEVEC_SIZE); + &index, end); if (nr_pages == 0) break; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 6103ce0430df..ce68a3483666 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1720,8 +1720,7 @@ static void mpage_release_unused_pages(struct mpage_da_data *mpd, pagevec_init(&pvec, 0); while (index <= end) { - nr_pages = pagevec_lookup_range(&pvec, mapping, &index, end, - PAGEVEC_SIZE); + nr_pages = pagevec_lookup_range(&pvec, mapping, &index, end); if (nr_pages == 0) break; for (i = 0; i < nr_pages; i++) { @@ -2348,7 +2347,7 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) pagevec_init(&pvec, 0); while (start <= end) { nr_pages = pagevec_lookup_range(&pvec, inode->i_mapping, - &start, end, PAGEVEC_SIZE); + &start, end); if (nr_pages == 0) break; for (i = 0; i < nr_pages; i++) { diff --git a/fs/fscache/page.c b/fs/fscache/page.c index 83018861dcd2..0ad3fd3ad0b4 100644 --- a/fs/fscache/page.c +++ b/fs/fscache/page.c @@ -1178,7 +1178,7 @@ void __fscache_uncache_all_inode_pages(struct fscache_cookie *cookie, pagevec_init(&pvec, 0); next = 0; do { - if (!pagevec_lookup(&pvec, mapping, &next, PAGEVEC_SIZE)) + if (!pagevec_lookup(&pvec, mapping, &next)) break; for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 8931236f3ef4..7c02b3f738e1 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -413,8 +413,7 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, /* * When no more pages are found, we are done. */ - if (!pagevec_lookup_range(&pvec, mapping, &next, end - 1, - PAGEVEC_SIZE)) + if (!pagevec_lookup_range(&pvec, mapping, &next, end - 1)) break; for (i = 0; i < pagevec_count(&pvec); ++i) { diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index 382a36c72d72..8616c46d33da 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -312,7 +312,7 @@ void nilfs_copy_back_pages(struct address_space *dmap, pagevec_init(&pvec, 0); repeat: - n = pagevec_lookup(&pvec, smap, &index, PAGEVEC_SIZE); + n = pagevec_lookup(&pvec, smap, &index); if (!n) return; diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h index 7df056910437..4dcd5506f1ed 100644 --- a/include/linux/pagevec.h +++ b/include/linux/pagevec.h @@ -29,13 +29,12 @@ unsigned pagevec_lookup_entries(struct pagevec *pvec, void pagevec_remove_exceptionals(struct pagevec *pvec); unsigned pagevec_lookup_range(struct pagevec *pvec, struct address_space *mapping, - pgoff_t *start, pgoff_t end, unsigned nr_pages); + pgoff_t *start, pgoff_t end); static inline unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping, - pgoff_t *start, unsigned nr_pages) + pgoff_t *start) { - return pagevec_lookup_range(pvec, mapping, start, (pgoff_t)-1, - nr_pages); + return pagevec_lookup_range(pvec, mapping, start, (pgoff_t)-1); } unsigned pagevec_lookup_tag(struct pagevec *pvec, diff --git a/mm/swap.c b/mm/swap.c index e06e9aa2478e..62d96b8e5eb3 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -967,10 +967,9 @@ void pagevec_remove_exceptionals(struct pagevec *pvec) * reached. */ unsigned pagevec_lookup_range(struct pagevec *pvec, - struct address_space *mapping, pgoff_t *start, pgoff_t end, - unsigned nr_pages) + struct address_space *mapping, pgoff_t *start, pgoff_t end) { - pvec->nr = find_get_pages_range(mapping, start, end, nr_pages, + pvec->nr = find_get_pages_range(mapping, start, end, PAGEVEC_SIZE, pvec->pages); return pagevec_count(pvec); } -- cgit v1.2.3 From 63677c745d63ba75ef97a7728c85168ddd6c0040 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Wed, 6 Sep 2017 16:21:47 -0700 Subject: mm, memcg: reset memory.low during memcg offlining A removed memory cgroup with a defined memory.low and some belonging pagecache has very low chances to be freed. If a cgroup has been removed, there is likely no memory pressure inside the cgroup, and the pagecache is protected from the external pressure by the defined low limit. The cgroup will be freed only after the reclaim of all belonging pages. And it will not happen until there are any reclaimable memory in the system. That means, there is a good chance, that a cold pagecache will reside in the memory for an undefined amount of time, wasting system resources. This problem was fixed earlier by fa06235b8eb0 ("cgroup: reset css on destruction"), but it's not a best way to do it, as we can't really reset all limits/counters during cgroup offlining. Link: http://lkml.kernel.org/r/20170727130428.28856-1-guro@fb.com Signed-off-by: Roman Gushchin Acked-by: Johannes Weiner Acked-by: Michal Hocko Cc: Vladimir Davydov Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e09741af816f..2926c44519b6 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4319,6 +4319,8 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) } spin_unlock(&memcg->event_list_lock); + memcg->low = 0; + memcg_offline_kmem(memcg); wb_memcg_offline(memcg); -- cgit v1.2.3 From f907c26a9166a1f4076940b9146d54bd1043db77 Mon Sep 17 00:00:00 2001 From: Arvind Yadav Date: Wed, 6 Sep 2017 16:21:53 -0700 Subject: mm/ksm.c: constify attribute_group structures attribute_group are not supposed to change at runtime. All functions working with attribute_group provided by work with const attribute_group. So mark the non-const structs as const. Link: http://lkml.kernel.org/r/1501157167-3706-2-git-send-email-arvind.yadav.cs@gmail.com Signed-off-by: Arvind Yadav Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/ksm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/ksm.c b/mm/ksm.c index db20f8436bc3..15dd7415f7b3 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -3043,7 +3043,7 @@ static struct attribute *ksm_attrs[] = { NULL, }; -static struct attribute_group ksm_attr_group = { +static const struct attribute_group ksm_attr_group = { .attrs = ksm_attrs, .name = "ksm", }; -- cgit v1.2.3 From 1fdaaa2329d15559de8446a8d3d4d72635704eab Mon Sep 17 00:00:00 2001 From: Arvind Yadav Date: Wed, 6 Sep 2017 16:21:56 -0700 Subject: mm/slub.c: constify attribute_group structures attribute_group are not supposed to change at runtime. All functions working with attribute_group provided by work with const attribute_group. So mark the non-const structs as const. Link: http://lkml.kernel.org/r/1501157186-3749-1-git-send-email-arvind.yadav.cs@gmail.com Signed-off-by: Arvind Yadav Acked-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 16a60f871f39..ddb04576b342 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -5459,7 +5459,7 @@ static struct attribute *slab_attrs[] = { NULL }; -static struct attribute_group slab_attr_group = { +static const struct attribute_group slab_attr_group = { .attrs = slab_attrs, }; -- cgit v1.2.3 From fd147cbb6a337673cc267358a602d23f085325f6 Mon Sep 17 00:00:00 2001 From: Arvind Yadav Date: Wed, 6 Sep 2017 16:21:59 -0700 Subject: mm/page_idle.c: constify attribute_group structures attribute_group are not supposed to change at runtime. All functions working with attribute_group provided by work with const attribute_group. So mark the non-const structs as const. Link: http://lkml.kernel.org/r/1501157221-3832-1-git-send-email-arvind.yadav.cs@gmail.com Signed-off-by: Arvind Yadav Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_idle.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page_idle.c b/mm/page_idle.c index 1b0f48c62316..4bd03a8d809e 100644 --- a/mm/page_idle.c +++ b/mm/page_idle.c @@ -204,7 +204,7 @@ static struct bin_attribute *page_idle_bin_attrs[] = { NULL, }; -static struct attribute_group page_idle_attr_group = { +static const struct attribute_group page_idle_attr_group = { .bin_attrs = page_idle_bin_attrs, .name = "page_idle", }; -- cgit v1.2.3 From 8aa95a21bc51205112b3a416810666616287b49b Mon Sep 17 00:00:00 2001 From: Arvind Yadav Date: Wed, 6 Sep 2017 16:22:03 -0700 Subject: mm/huge_memory.c: constify attribute_group structures attribute_group are not supposed to change at runtime. All functions working with attribute_group provided by work with const attribute_group. So mark the non-const structs as const. Link: http://lkml.kernel.org/r/1501157240-3876-1-git-send-email-arvind.yadav.cs@gmail.com Signed-off-by: Arvind Yadav Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 3644ff918434..7f0b3f6db923 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -328,7 +328,7 @@ static struct attribute *hugepage_attr[] = { NULL, }; -static struct attribute_group hugepage_attr_group = { +static const struct attribute_group hugepage_attr_group = { .attrs = hugepage_attr, }; -- cgit v1.2.3 From 67e5ed969944d7c6d93f658a188240cf60c49f71 Mon Sep 17 00:00:00 2001 From: Arvind Yadav Date: Wed, 6 Sep 2017 16:22:06 -0700 Subject: mm/hugetlb.c: constify attribute_group structures attribute_group are not supposed to change at runtime. All functions working with attribute_group provided by work with const attribute_group. So mark the non-const structs as const. Link: http://lkml.kernel.org/r/1501157260-3922-1-git-send-email-arvind.yadav.cs@gmail.com Signed-off-by: Arvind Yadav Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 1d54a131bdd5..cafd60316e68 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2569,13 +2569,13 @@ static struct attribute *hstate_attrs[] = { NULL, }; -static struct attribute_group hstate_attr_group = { +static const struct attribute_group hstate_attr_group = { .attrs = hstate_attrs, }; static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent, struct kobject **hstate_kobjs, - struct attribute_group *hstate_attr_group) + const struct attribute_group *hstate_attr_group) { int retval; int hi = hstate_index(h); @@ -2633,7 +2633,7 @@ static struct attribute *per_node_hstate_attrs[] = { NULL, }; -static struct attribute_group per_node_hstate_attr_group = { +static const struct attribute_group per_node_hstate_attr_group = { .attrs = per_node_hstate_attrs, }; -- cgit v1.2.3 From 04fecbf51b3cb79a628d50b65797c4866342b8d2 Mon Sep 17 00:00:00 2001 From: Matthias Kaehlcke Date: Wed, 6 Sep 2017 16:22:09 -0700 Subject: mm: memcontrol: use int for event/state parameter in several functions Several functions use an enum type as parameter for an event/state, but are called in some locations with an argument of a different enum type. Adjust the interface of these functions to reality by changing the parameter to int. This fixes a ton of enum-conversion warnings that are generated when building the kernel with clang. [mka@chromium.org: also change parameter type of inc/dec/mod_memcg_page_state()] Link: http://lkml.kernel.org/r/20170728213442.93823-1-mka@chromium.org Link: http://lkml.kernel.org/r/20170727211004.34435-1-mka@chromium.org Signed-off-by: Matthias Kaehlcke Acked-by: Johannes Weiner Acked-by: Michal Hocko Cc: Vladimir Davydov Cc: Doug Anderson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 52 ++++++++++++++++++++++++++++------------------ mm/memcontrol.c | 4 +++- 2 files changed, 35 insertions(+), 21 deletions(-) (limited to 'mm') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 9b15a4bcfa77..69966c461d1c 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -488,8 +488,9 @@ struct mem_cgroup *lock_page_memcg(struct page *page); void __unlock_page_memcg(struct mem_cgroup *memcg); void unlock_page_memcg(struct page *page); +/* idx can be of type enum memcg_stat_item or node_stat_item */ static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, - enum memcg_stat_item idx) + int idx) { long val = 0; int cpu; @@ -503,15 +504,17 @@ static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, return val; } +/* idx can be of type enum memcg_stat_item or node_stat_item */ static inline void __mod_memcg_state(struct mem_cgroup *memcg, - enum memcg_stat_item idx, int val) + int idx, int val) { if (!mem_cgroup_disabled()) __this_cpu_add(memcg->stat->count[idx], val); } +/* idx can be of type enum memcg_stat_item or node_stat_item */ static inline void mod_memcg_state(struct mem_cgroup *memcg, - enum memcg_stat_item idx, int val) + int idx, int val) { if (!mem_cgroup_disabled()) this_cpu_add(memcg->stat->count[idx], val); @@ -535,14 +538,14 @@ static inline void mod_memcg_state(struct mem_cgroup *memcg, * Kernel pages are an exception to this, since they'll never move. */ static inline void __mod_memcg_page_state(struct page *page, - enum memcg_stat_item idx, int val) + int idx, int val) { if (page->mem_cgroup) __mod_memcg_state(page->mem_cgroup, idx, val); } static inline void mod_memcg_page_state(struct page *page, - enum memcg_stat_item idx, int val) + int idx, int val) { if (page->mem_cgroup) mod_memcg_state(page->mem_cgroup, idx, val); @@ -632,8 +635,9 @@ static inline void count_memcg_events(struct mem_cgroup *memcg, this_cpu_add(memcg->stat->events[idx], count); } +/* idx can be of type enum memcg_stat_item or node_stat_item */ static inline void count_memcg_page_event(struct page *page, - enum memcg_stat_item idx) + int idx) { if (page->mem_cgroup) count_memcg_events(page->mem_cgroup, idx, 1); @@ -846,31 +850,31 @@ static inline bool mem_cgroup_oom_synchronize(bool wait) } static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, - enum memcg_stat_item idx) + int idx) { return 0; } static inline void __mod_memcg_state(struct mem_cgroup *memcg, - enum memcg_stat_item idx, + int idx, int nr) { } static inline void mod_memcg_state(struct mem_cgroup *memcg, - enum memcg_stat_item idx, + int idx, int nr) { } static inline void __mod_memcg_page_state(struct page *page, - enum memcg_stat_item idx, + int idx, int nr) { } static inline void mod_memcg_page_state(struct page *page, - enum memcg_stat_item idx, + int idx, int nr) { } @@ -924,7 +928,7 @@ static inline void count_memcg_events(struct mem_cgroup *memcg, } static inline void count_memcg_page_event(struct page *page, - enum memcg_stat_item idx) + int idx) { } @@ -934,26 +938,30 @@ void count_memcg_event_mm(struct mm_struct *mm, enum vm_event_item idx) } #endif /* CONFIG_MEMCG */ +/* idx can be of type enum memcg_stat_item or node_stat_item */ static inline void __inc_memcg_state(struct mem_cgroup *memcg, - enum memcg_stat_item idx) + int idx) { __mod_memcg_state(memcg, idx, 1); } +/* idx can be of type enum memcg_stat_item or node_stat_item */ static inline void __dec_memcg_state(struct mem_cgroup *memcg, - enum memcg_stat_item idx) + int idx) { __mod_memcg_state(memcg, idx, -1); } +/* idx can be of type enum memcg_stat_item or node_stat_item */ static inline void __inc_memcg_page_state(struct page *page, - enum memcg_stat_item idx) + int idx) { __mod_memcg_page_state(page, idx, 1); } +/* idx can be of type enum memcg_stat_item or node_stat_item */ static inline void __dec_memcg_page_state(struct page *page, - enum memcg_stat_item idx) + int idx) { __mod_memcg_page_state(page, idx, -1); } @@ -982,26 +990,30 @@ static inline void __dec_lruvec_page_state(struct page *page, __mod_lruvec_page_state(page, idx, -1); } +/* idx can be of type enum memcg_stat_item or node_stat_item */ static inline void inc_memcg_state(struct mem_cgroup *memcg, - enum memcg_stat_item idx) + int idx) { mod_memcg_state(memcg, idx, 1); } +/* idx can be of type enum memcg_stat_item or node_stat_item */ static inline void dec_memcg_state(struct mem_cgroup *memcg, - enum memcg_stat_item idx) + int idx) { mod_memcg_state(memcg, idx, -1); } +/* idx can be of type enum memcg_stat_item or node_stat_item */ static inline void inc_memcg_page_state(struct page *page, - enum memcg_stat_item idx) + int idx) { mod_memcg_page_state(page, idx, 1); } +/* idx can be of type enum memcg_stat_item or node_stat_item */ static inline void dec_memcg_page_state(struct page *page, - enum memcg_stat_item idx) + int idx) { mod_memcg_page_state(page, idx, -1); } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2926c44519b6..3d3f7b1686ac 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -550,10 +550,12 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz) * value, and reading all cpu value can be performance bottleneck in some * common workload, threshold and synchronization as vmstat[] should be * implemented. + * + * The parameter idx can be of type enum memcg_event_item or vm_event_item. */ static unsigned long memcg_sum_events(struct mem_cgroup *memcg, - enum memcg_event_item event) + int event) { unsigned long val = 0; int cpu; -- cgit v1.2.3 From a3aea839e42ef8d76bb58091ab7f5a45a85ea299 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Wed, 6 Sep 2017 16:22:12 -0700 Subject: mm, THP, swap: support to clear swap cache flag for THP swapped out Patch series "mm, THP, swap: Delay splitting THP after swapped out", v3. This is the second step of THP (Transparent Huge Page) swap optimization. In the first step, the splitting huge page is delayed from almost the first step of swapping out to after allocating the swap space for the THP and adding the THP into the swap cache. In the second step, the splitting is delayed further to after the swapping out finished. The plan is to delay splitting THP step by step, finally avoid splitting THP for the THP swapping out and swap out/in the THP as a whole. In the patchset, more operations for the anonymous THP reclaiming, such as TLB flushing, writing the THP to the swap device, removing the THP from the swap cache are batched. So that the performance of anonymous THP swapping out are improved. During the development, the following scenarios/code paths have been checked, - swap out/in - swap off - write protect page fault - madvise_free - process exit - split huge page With the patchset, the swap out throughput improves 42% (from about 5.81GB/s to about 8.25GB/s) in the vm-scalability swap-w-seq test case with 16 processes. At the same time, the IPI (reflect TLB flushing) reduced about 78.9%. The test is done on a Xeon E5 v3 system. The swap device used is a RAM simulated PMEM (persistent memory) device. To test the sequential swapping out, the test case creates 8 processes, which sequentially allocate and write to the anonymous pages until the RAM and part of the swap device is used up. Below is the part of the cover letter for the first step patchset of THP swap optimization which applies to all steps. ========================= Recently, the performance of the storage devices improved so fast that we cannot saturate the disk bandwidth with single logical CPU when do page swap out even on a high-end server machine. Because the performance of the storage device improved faster than that of single logical CPU. And it seems that the trend will not change in the near future. On the other hand, the THP becomes more and more popular because of increased memory size. So it becomes necessary to optimize THP swap performance. The advantages of the THP swap support include: - Batch the swap operations for the THP to reduce TLB flushing and lock acquiring/releasing, including allocating/freeing the swap space, adding/deleting to/from the swap cache, and writing/reading the swap space, etc. This will help improve the performance of the THP swap. - The THP swap space read/write will be 2M sequential IO. It is particularly helpful for the swap read, which are usually 4k random IO. This will improve the performance of the THP swap too. - It will help the memory fragmentation, especially when the THP is heavily used by the applications. The 2M continuous pages will be free up after THP swapping out. - It will improve the THP utilization on the system with the swap turned on. Because the speed for khugepaged to collapse the normal pages into the THP is quite slow. After the THP is split during the swapping out, it will take quite long time for the normal pages to collapse back into the THP after being swapped in. The high THP utilization helps the efficiency of the page based memory management too. There are some concerns regarding THP swap in, mainly because possible enlarged read/write IO size (for swap in/out) may put more overhead on the storage device. To deal with that, the THP swap in should be turned on only when necessary. For example, it can be selected via "always/never/madvise" logic, to be turned on globally, turned off globally, or turned on only for VMA with MADV_HUGEPAGE, etc. This patch (of 12): Previously, swapcache_free_cluster() is used only in the error path of shrink_page_list() to free the swap cluster just allocated if the THP (Transparent Huge Page) is failed to be split. In this patch, it is enhanced to clear the swap cache flag (SWAP_HAS_CACHE) for the swap cluster that holds the contents of THP swapped out. This will be used in delaying splitting THP after swapping out support. Because there is no THP swapping in as a whole support yet, after clearing the swap cache flag, the swap cluster backing the THP swapped out will be split. So that the swap slots in the swap cluster can be swapped in as normal pages later. Link: http://lkml.kernel.org/r/20170724051840.2309-2-ying.huang@intel.com Signed-off-by: "Huang, Ying" Acked-by: Rik van Riel Cc: Johannes Weiner Cc: Minchan Kim Cc: Hugh Dickins Cc: Shaohua Li Cc: "Kirill A . Shutemov" Cc: Andrea Arcangeli Cc: Dan Williams Cc: Jens Axboe Cc: Michal Hocko Cc: Ross Zwisler [for brd.c, zram_drv.c, pmem.c] Cc: Vishal L Verma Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 32 +++++++++++++++++++++++++------- 1 file changed, 25 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/swapfile.c b/mm/swapfile.c index 6ba4aab2db0b..c32e9b23d642 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1168,22 +1168,40 @@ static void swapcache_free_cluster(swp_entry_t entry) struct swap_cluster_info *ci; struct swap_info_struct *si; unsigned char *map; - unsigned int i; + unsigned int i, free_entries = 0; + unsigned char val; - si = swap_info_get(entry); + si = _swap_info_get(entry); if (!si) return; ci = lock_cluster(si, offset); map = si->swap_map + offset; for (i = 0; i < SWAPFILE_CLUSTER; i++) { - VM_BUG_ON(map[i] != SWAP_HAS_CACHE); - map[i] = 0; + val = map[i]; + VM_BUG_ON(!(val & SWAP_HAS_CACHE)); + if (val == SWAP_HAS_CACHE) + free_entries++; + } + if (!free_entries) { + for (i = 0; i < SWAPFILE_CLUSTER; i++) + map[i] &= ~SWAP_HAS_CACHE; } unlock_cluster(ci); - mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER); - swap_free_cluster(si, idx); - spin_unlock(&si->lock); + if (free_entries == SWAPFILE_CLUSTER) { + spin_lock(&si->lock); + ci = lock_cluster(si, offset); + memset(map, 0, SWAPFILE_CLUSTER); + unlock_cluster(ci); + mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER); + swap_free_cluster(si, idx); + spin_unlock(&si->lock); + } else if (free_entries) { + for (i = 0; i < SWAPFILE_CLUSTER; i++, entry.val++) { + if (!__swap_entry_free(si, entry, SWAP_HAS_CACHE)) + free_swap_slot(entry); + } + } } #else static inline void swapcache_free_cluster(swp_entry_t entry) -- cgit v1.2.3 From e07098294adfd03d582af7626752255e3d170393 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Wed, 6 Sep 2017 16:22:16 -0700 Subject: mm, THP, swap: support to reclaim swap space for THP swapped out The normal swap slot reclaiming can be done when the swap count reaches SWAP_HAS_CACHE. But for the swap slot which is backing a THP, all swap slots backing one THP must be reclaimed together, because the swap slot may be used again when the THP is swapped out again later. So the swap slots backing one THP can be reclaimed together when the swap count for all swap slots for the THP reached SWAP_HAS_CACHE. In the patch, the functions to check whether the swap count for all swap slots backing one THP reached SWAP_HAS_CACHE are implemented and used when checking whether a swap slot can be reclaimed. To make it easier to determine whether a swap slot is backing a THP, a new swap cluster flag named CLUSTER_FLAG_HUGE is added to mark a swap cluster which is backing a THP (Transparent Huge Page). Because THP swap in as a whole isn't supported now. After deleting the THP from the swap cache (for example, swapping out finished), the CLUSTER_FLAG_HUGE flag will be cleared. So that, the normal pages inside THP can be swapped in individually. [ying.huang@intel.com: fix swap_page_trans_huge_swapped on HDD] Link: http://lkml.kernel.org/r/874ltsm0bi.fsf@yhuang-dev.intel.com Link: http://lkml.kernel.org/r/20170724051840.2309-3-ying.huang@intel.com Signed-off-by: "Huang, Ying" Acked-by: Rik van Riel Cc: Johannes Weiner Cc: Minchan Kim Cc: Hugh Dickins Cc: Shaohua Li Cc: "Kirill A . Shutemov" Cc: Andrea Arcangeli Cc: Dan Williams Cc: Jens Axboe Cc: Michal Hocko Cc: Ross Zwisler [for brd.c, zram_drv.c, pmem.c] Cc: Vishal L Verma Cc: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 1 + mm/swapfile.c | 78 +++++++++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 72 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/include/linux/swap.h b/include/linux/swap.h index d83d28e53e62..964b4f1fba4a 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -188,6 +188,7 @@ struct swap_cluster_info { }; #define CLUSTER_FLAG_FREE 1 /* This cluster is free */ #define CLUSTER_FLAG_NEXT_NULL 2 /* This cluster has no next cluster */ +#define CLUSTER_FLAG_HUGE 4 /* This cluster is backing a transparent huge page */ /* * We assign a cluster to each CPU, so each CPU can allocate swap entry from diff --git a/mm/swapfile.c b/mm/swapfile.c index c32e9b23d642..164d9624d7d2 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -265,6 +265,16 @@ static inline void cluster_set_null(struct swap_cluster_info *info) info->data = 0; } +static inline bool cluster_is_huge(struct swap_cluster_info *info) +{ + return info->flags & CLUSTER_FLAG_HUGE; +} + +static inline void cluster_clear_huge(struct swap_cluster_info *info) +{ + info->flags &= ~CLUSTER_FLAG_HUGE; +} + static inline struct swap_cluster_info *lock_cluster(struct swap_info_struct *si, unsigned long offset) { @@ -846,7 +856,7 @@ static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot) offset = idx * SWAPFILE_CLUSTER; ci = lock_cluster(si, offset); alloc_cluster(si, idx); - cluster_set_count_flag(ci, SWAPFILE_CLUSTER, 0); + cluster_set_count_flag(ci, SWAPFILE_CLUSTER, CLUSTER_FLAG_HUGE); map = si->swap_map + offset; for (i = 0; i < SWAPFILE_CLUSTER; i++) @@ -1176,6 +1186,7 @@ static void swapcache_free_cluster(swp_entry_t entry) return; ci = lock_cluster(si, offset); + VM_BUG_ON(!cluster_is_huge(ci)); map = si->swap_map + offset; for (i = 0; i < SWAPFILE_CLUSTER; i++) { val = map[i]; @@ -1187,6 +1198,7 @@ static void swapcache_free_cluster(swp_entry_t entry) for (i = 0; i < SWAPFILE_CLUSTER; i++) map[i] &= ~SWAP_HAS_CACHE; } + cluster_clear_huge(ci); unlock_cluster(ci); if (free_entries == SWAPFILE_CLUSTER) { spin_lock(&si->lock); @@ -1350,6 +1362,54 @@ out: return count; } +#ifdef CONFIG_THP_SWAP +static bool swap_page_trans_huge_swapped(struct swap_info_struct *si, + swp_entry_t entry) +{ + struct swap_cluster_info *ci; + unsigned char *map = si->swap_map; + unsigned long roffset = swp_offset(entry); + unsigned long offset = round_down(roffset, SWAPFILE_CLUSTER); + int i; + bool ret = false; + + ci = lock_cluster_or_swap_info(si, offset); + if (!ci || !cluster_is_huge(ci)) { + if (map[roffset] != SWAP_HAS_CACHE) + ret = true; + goto unlock_out; + } + for (i = 0; i < SWAPFILE_CLUSTER; i++) { + if (map[offset + i] != SWAP_HAS_CACHE) { + ret = true; + break; + } + } +unlock_out: + unlock_cluster_or_swap_info(si, ci); + return ret; +} + +static bool page_swapped(struct page *page) +{ + swp_entry_t entry; + struct swap_info_struct *si; + + if (likely(!PageTransCompound(page))) + return page_swapcount(page) != 0; + + page = compound_head(page); + entry.val = page_private(page); + si = _swap_info_get(entry); + if (si) + return swap_page_trans_huge_swapped(si, entry); + return false; +} +#else +#define swap_page_trans_huge_swapped(si, entry) swap_swapcount(si, entry) +#define page_swapped(page) (page_swapcount(page) != 0) +#endif + /* * We can write to an anon page without COW if there are no other references * to it. And as a side-effect, free up its swap: because the old content @@ -1404,7 +1464,7 @@ int try_to_free_swap(struct page *page) return 0; if (PageWriteback(page)) return 0; - if (page_swapcount(page)) + if (page_swapped(page)) return 0; /* @@ -1425,6 +1485,7 @@ int try_to_free_swap(struct page *page) if (pm_suspended_storage()) return 0; + page = compound_head(page); delete_from_swap_cache(page); SetPageDirty(page); return 1; @@ -1446,7 +1507,8 @@ int free_swap_and_cache(swp_entry_t entry) p = _swap_info_get(entry); if (p) { count = __swap_entry_free(p, entry, 1); - if (count == SWAP_HAS_CACHE) { + if (count == SWAP_HAS_CACHE && + !swap_page_trans_huge_swapped(p, entry)) { page = find_get_page(swap_address_space(entry), swp_offset(entry)); if (page && !trylock_page(page)) { @@ -1463,7 +1525,8 @@ int free_swap_and_cache(swp_entry_t entry) */ if (PageSwapCache(page) && !PageWriteback(page) && (!page_mapped(page) || mem_cgroup_swap_full(page)) && - !swap_swapcount(p, entry)) { + !swap_page_trans_huge_swapped(p, entry)) { + page = compound_head(page); delete_from_swap_cache(page); SetPageDirty(page); } @@ -2017,7 +2080,7 @@ int try_to_unuse(unsigned int type, bool frontswap, .sync_mode = WB_SYNC_NONE, }; - swap_writepage(page, &wbc); + swap_writepage(compound_head(page), &wbc); lock_page(page); wait_on_page_writeback(page); } @@ -2030,8 +2093,9 @@ int try_to_unuse(unsigned int type, bool frontswap, * delete, since it may not have been written out to swap yet. */ if (PageSwapCache(page) && - likely(page_private(page) == entry.val)) - delete_from_swap_cache(page); + likely(page_private(page) == entry.val) && + !page_swapped(page)) + delete_from_swap_cache(compound_head(page)); /* * So we could skip searching mms once swap count went -- cgit v1.2.3 From ba3c4ce6def4915093be80585ff69f780630f32f Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Wed, 6 Sep 2017 16:22:19 -0700 Subject: mm, THP, swap: make reuse_swap_page() works for THP swapped out After supporting to delay THP (Transparent Huge Page) splitting after swapped out, it is possible that some page table mappings of the THP are turned into swap entries. So reuse_swap_page() need to check the swap count in addition to the map count as before. This patch done that. In the huge PMD write protect fault handler, in addition to the page map count, the swap count need to be checked too, so the page lock need to be acquired too when calling reuse_swap_page() in addition to the page table lock. [ying.huang@intel.com: silence a compiler warning] Link: http://lkml.kernel.org/r/87bmnzizjy.fsf@yhuang-dev.intel.com Link: http://lkml.kernel.org/r/20170724051840.2309-4-ying.huang@intel.com Signed-off-by: "Huang, Ying" Cc: Johannes Weiner Cc: Minchan Kim Cc: Hugh Dickins Cc: Shaohua Li Cc: Rik van Riel Cc: Andrea Arcangeli Cc: "Kirill A . Shutemov" Cc: Dan Williams Cc: Jens Axboe Cc: Michal Hocko Cc: Ross Zwisler [for brd.c, zram_drv.c, pmem.c] Cc: Vishal L Verma Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 4 +- mm/huge_memory.c | 16 +++++++- mm/memory.c | 6 +-- mm/swapfile.c | 102 ++++++++++++++++++++++++++++++++++++++++++++++----- 4 files changed, 113 insertions(+), 15 deletions(-) (limited to 'mm') diff --git a/include/linux/swap.h b/include/linux/swap.h index 964b4f1fba4a..7176ba780e83 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -510,8 +510,8 @@ static inline int swp_swapcount(swp_entry_t entry) return 0; } -#define reuse_swap_page(page, total_mapcount) \ - (page_trans_huge_mapcount(page, total_mapcount) == 1) +#define reuse_swap_page(page, total_map_swapcount) \ + (page_trans_huge_mapcount(page, total_map_swapcount) == 1) static inline int try_to_free_swap(struct page *page) { diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 7f0b3f6db923..02dfe635c9fe 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1240,15 +1240,29 @@ int do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd) * We can only reuse the page if nobody else maps the huge page or it's * part. */ - if (page_trans_huge_mapcount(page, NULL) == 1) { + if (!trylock_page(page)) { + get_page(page); + spin_unlock(vmf->ptl); + lock_page(page); + spin_lock(vmf->ptl); + if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) { + unlock_page(page); + put_page(page); + goto out_unlock; + } + put_page(page); + } + if (reuse_swap_page(page, NULL)) { pmd_t entry; entry = pmd_mkyoung(orig_pmd); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1)) update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); ret |= VM_FAULT_WRITE; + unlock_page(page); goto out_unlock; } + unlock_page(page); get_page(page); spin_unlock(vmf->ptl); alloc: diff --git a/mm/memory.c b/mm/memory.c index 1416485e278c..3dd8bb46391b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2619,7 +2619,7 @@ static int do_wp_page(struct vm_fault *vmf) * not dirty accountable. */ if (PageAnon(vmf->page) && !PageKsm(vmf->page)) { - int total_mapcount; + int total_map_swapcount; if (!trylock_page(vmf->page)) { get_page(vmf->page); pte_unmap_unlock(vmf->pte, vmf->ptl); @@ -2634,8 +2634,8 @@ static int do_wp_page(struct vm_fault *vmf) } put_page(vmf->page); } - if (reuse_swap_page(vmf->page, &total_mapcount)) { - if (total_mapcount == 1) { + if (reuse_swap_page(vmf->page, &total_map_swapcount)) { + if (total_map_swapcount == 1) { /* * The page is all ours. Move it to * our anon_vma so the rmap code will diff --git a/mm/swapfile.c b/mm/swapfile.c index 164d9624d7d2..2bfbfb87123a 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1405,9 +1405,89 @@ static bool page_swapped(struct page *page) return swap_page_trans_huge_swapped(si, entry); return false; } + +static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount, + int *total_swapcount) +{ + int i, map_swapcount, _total_mapcount, _total_swapcount; + unsigned long offset = 0; + struct swap_info_struct *si; + struct swap_cluster_info *ci = NULL; + unsigned char *map = NULL; + int mapcount, swapcount = 0; + + /* hugetlbfs shouldn't call it */ + VM_BUG_ON_PAGE(PageHuge(page), page); + + if (likely(!PageTransCompound(page))) { + mapcount = atomic_read(&page->_mapcount) + 1; + if (total_mapcount) + *total_mapcount = mapcount; + if (PageSwapCache(page)) + swapcount = page_swapcount(page); + if (total_swapcount) + *total_swapcount = swapcount; + return mapcount + swapcount; + } + + page = compound_head(page); + + _total_mapcount = _total_swapcount = map_swapcount = 0; + if (PageSwapCache(page)) { + swp_entry_t entry; + + entry.val = page_private(page); + si = _swap_info_get(entry); + if (si) { + map = si->swap_map; + offset = swp_offset(entry); + } + } + if (map) + ci = lock_cluster(si, offset); + for (i = 0; i < HPAGE_PMD_NR; i++) { + mapcount = atomic_read(&page[i]._mapcount) + 1; + _total_mapcount += mapcount; + if (map) { + swapcount = swap_count(map[offset + i]); + _total_swapcount += swapcount; + } + map_swapcount = max(map_swapcount, mapcount + swapcount); + } + unlock_cluster(ci); + if (PageDoubleMap(page)) { + map_swapcount -= 1; + _total_mapcount -= HPAGE_PMD_NR; + } + mapcount = compound_mapcount(page); + map_swapcount += mapcount; + _total_mapcount += mapcount; + if (total_mapcount) + *total_mapcount = _total_mapcount; + if (total_swapcount) + *total_swapcount = _total_swapcount; + + return map_swapcount; +} #else #define swap_page_trans_huge_swapped(si, entry) swap_swapcount(si, entry) #define page_swapped(page) (page_swapcount(page) != 0) + +static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount, + int *total_swapcount) +{ + int mapcount, swapcount = 0; + + /* hugetlbfs shouldn't call it */ + VM_BUG_ON_PAGE(PageHuge(page), page); + + mapcount = page_trans_huge_mapcount(page, total_mapcount); + if (PageSwapCache(page)) + swapcount = page_swapcount(page); + if (total_swapcount) + *total_swapcount = swapcount; + return mapcount + swapcount; +} #endif /* @@ -1416,23 +1496,27 @@ static bool page_swapped(struct page *page) * on disk will never be read, and seeking back there to write new content * later would only waste time away from clustering. * - * NOTE: total_mapcount should not be relied upon by the caller if + * NOTE: total_map_swapcount should not be relied upon by the caller if * reuse_swap_page() returns false, but it may be always overwritten * (see the other implementation for CONFIG_SWAP=n). */ -bool reuse_swap_page(struct page *page, int *total_mapcount) +bool reuse_swap_page(struct page *page, int *total_map_swapcount) { - int count; + int count, total_mapcount, total_swapcount; VM_BUG_ON_PAGE(!PageLocked(page), page); if (unlikely(PageKsm(page))) return false; - count = page_trans_huge_mapcount(page, total_mapcount); - if (count <= 1 && PageSwapCache(page)) { - count += page_swapcount(page); - if (count != 1) - goto out; + count = page_trans_huge_map_swapcount(page, &total_mapcount, + &total_swapcount); + if (total_map_swapcount) + *total_map_swapcount = total_mapcount + total_swapcount; + if (count == 1 && PageSwapCache(page) && + (likely(!PageTransCompound(page)) || + /* The remaining swap count will be freed soon */ + total_swapcount == page_swapcount(page))) { if (!PageWriteback(page)) { + page = compound_head(page); delete_from_swap_cache(page); SetPageDirty(page); } else { @@ -1448,7 +1532,7 @@ bool reuse_swap_page(struct page *page, int *total_mapcount) spin_unlock(&p->lock); } } -out: + return count <= 1; } -- cgit v1.2.3 From f0eea189e8e969b66e03bac8a7d92888ba267854 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Wed, 6 Sep 2017 16:22:23 -0700 Subject: mm, THP, swap: don't allocate huge cluster for file backed swap device It's hard to write a whole transparent huge page (THP) to a file backed swap device during swapping out and the file backed swap device isn't very popular. So the huge cluster allocation for the file backed swap device is disabled. Link: http://lkml.kernel.org/r/20170724051840.2309-5-ying.huang@intel.com Signed-off-by: "Huang, Ying" Cc: Johannes Weiner Cc: Minchan Kim Cc: Hugh Dickins Cc: Shaohua Li Cc: Rik van Riel Cc: "Kirill A . Shutemov" Cc: Andrea Arcangeli Cc: Dan Williams Cc: Jens Axboe Cc: Michal Hocko Cc: Ross Zwisler [for brd.c, zram_drv.c, pmem.c] Cc: Vishal L Verma Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/swapfile.c b/mm/swapfile.c index 2bfbfb87123a..267b1fe41844 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -948,9 +948,10 @@ start_over: spin_unlock(&si->lock); goto nextsi; } - if (cluster) - n_ret = swap_alloc_cluster(si, swp_entries); - else + if (cluster) { + if (!(si->flags & SWP_FILE)) + n_ret = swap_alloc_cluster(si, swp_entries); + } else n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE, n_goal, swp_entries); spin_unlock(&si->lock); -- cgit v1.2.3 From 225311a46411c37e20e73d99f4382f141e12f6f9 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Wed, 6 Sep 2017 16:22:30 -0700 Subject: mm: test code to write THP to swap device as a whole To support delay splitting THP (Transparent Huge Page) after swapped out, we need to enhance swap writing code to support to write a THP as a whole. This will improve swap write IO performance. As Ming Lei pointed out, this should be based on multipage bvec support, which hasn't been merged yet. So this patch is only for testing the functionality of the other patches in the series. And will be reimplemented after multipage bvec support is merged. Link: http://lkml.kernel.org/r/20170724051840.2309-7-ying.huang@intel.com Signed-off-by: "Huang, Ying" Cc: "Kirill A . Shutemov" Cc: Andrea Arcangeli Cc: Dan Williams Cc: Hugh Dickins Cc: Jens Axboe Cc: Johannes Weiner Cc: Michal Hocko Cc: Minchan Kim Cc: Rik van Riel Cc: Ross Zwisler [for brd.c, zram_drv.c, pmem.c] Cc: Shaohua Li Cc: Vishal L Verma Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bio.h | 8 ++++++++ include/linux/page-flags.h | 4 ++-- include/linux/vm_event_item.h | 1 + mm/page_io.c | 21 ++++++++++++++++----- mm/vmstat.c | 1 + 5 files changed, 28 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/include/linux/bio.h b/include/linux/bio.h index 7b1cf4ba0902..1f0720de8990 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -38,7 +38,15 @@ #define BIO_BUG_ON #endif +#ifdef CONFIG_THP_SWAP +#if HPAGE_PMD_NR > 256 +#define BIO_MAX_PAGES HPAGE_PMD_NR +#else #define BIO_MAX_PAGES 256 +#endif +#else +#define BIO_MAX_PAGES 256 +#endif #define bio_prio(bio) (bio)->bi_ioprio #define bio_set_prio(bio, prio) ((bio)->bi_ioprio = prio) diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index d33e3280c8ad..ba2d470d2d0a 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -303,8 +303,8 @@ PAGEFLAG(OwnerPriv1, owner_priv_1, PF_ANY) * Only test-and-set exist for PG_writeback. The unconditional operators are * risky: they bypass page accounting. */ -TESTPAGEFLAG(Writeback, writeback, PF_NO_COMPOUND) - TESTSCFLAG(Writeback, writeback, PF_NO_COMPOUND) +TESTPAGEFLAG(Writeback, writeback, PF_NO_TAIL) + TESTSCFLAG(Writeback, writeback, PF_NO_TAIL) PAGEFLAG(MappedToDisk, mappedtodisk, PF_NO_TAIL) /* PG_readahead is only used for reads; PG_reclaim is only for writes */ diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index 37e8d31a4632..c75024e80eed 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -85,6 +85,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, #endif THP_ZERO_PAGE_ALLOC, THP_ZERO_PAGE_ALLOC_FAILED, + THP_SWPOUT, #endif #ifdef CONFIG_MEMORY_BALLOON BALLOON_INFLATE, diff --git a/mm/page_io.c b/mm/page_io.c index 5f61b54ee1f3..20139b90125a 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -28,16 +28,18 @@ static struct bio *get_swap_bio(gfp_t gfp_flags, struct page *page, bio_end_io_t end_io) { + int i, nr = hpage_nr_pages(page); struct bio *bio; - bio = bio_alloc(gfp_flags, 1); + bio = bio_alloc(gfp_flags, nr); if (bio) { bio->bi_iter.bi_sector = map_swap_page(page, &bio->bi_bdev); bio->bi_iter.bi_sector <<= PAGE_SHIFT - 9; bio->bi_end_io = end_io; - bio_add_page(bio, page, PAGE_SIZE, 0); - BUG_ON(bio->bi_iter.bi_size != PAGE_SIZE); + for (i = 0; i < nr; i++) + bio_add_page(bio, page + i, PAGE_SIZE, 0); + VM_BUG_ON(bio->bi_iter.bi_size != PAGE_SIZE * nr); } return bio; } @@ -262,6 +264,15 @@ static sector_t swap_page_sector(struct page *page) return (sector_t)__page_file_index(page) << (PAGE_SHIFT - 9); } +static inline void count_swpout_vm_event(struct page *page) +{ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + if (unlikely(PageTransHuge(page))) + count_vm_event(THP_SWPOUT); +#endif + count_vm_events(PSWPOUT, hpage_nr_pages(page)); +} + int __swap_writepage(struct page *page, struct writeback_control *wbc, bio_end_io_t end_write_func) { @@ -313,7 +324,7 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc, ret = bdev_write_page(sis->bdev, swap_page_sector(page), page, wbc); if (!ret) { - count_vm_event(PSWPOUT); + count_swpout_vm_event(page); return 0; } @@ -326,7 +337,7 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc, goto out; } bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc); - count_vm_event(PSWPOUT); + count_swpout_vm_event(page); set_page_writeback(page); unlock_page(page); submit_bio(bio); diff --git a/mm/vmstat.c b/mm/vmstat.c index 9a4441bbeef2..bccf426453cd 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1071,6 +1071,7 @@ const char * const vmstat_text[] = { #endif "thp_zero_page_alloc", "thp_zero_page_alloc_failed", + "thp_swpout", #endif #ifdef CONFIG_MEMORY_BALLOON "balloon_inflate", -- cgit v1.2.3 From 59807685a7e77e8c8fe5925613968841538d53d7 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Wed, 6 Sep 2017 16:22:34 -0700 Subject: mm, THP, swap: support splitting THP for THP swap out After adding swapping out support for THP (Transparent Huge Page), it is possible that a THP in swap cache (partly swapped out) need to be split. To split such a THP, the swap cluster backing the THP need to be split too, that is, the CLUSTER_FLAG_HUGE flag need to be cleared for the swap cluster. The patch implemented this. And because the THP swap writing needs the THP keeps as huge page during writing. The PageWriteback flag is checked before splitting. Link: http://lkml.kernel.org/r/20170724051840.2309-8-ying.huang@intel.com Signed-off-by: "Huang, Ying" Cc: Johannes Weiner Cc: Minchan Kim Cc: Hugh Dickins Cc: Shaohua Li Cc: Rik van Riel Cc: Andrea Arcangeli Cc: "Kirill A . Shutemov" Cc: Dan Williams Cc: Jens Axboe Cc: Michal Hocko Cc: Ross Zwisler [for brd.c, zram_drv.c, pmem.c] Cc: Vishal L Verma Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 9 +++++++++ mm/huge_memory.c | 10 +++++++++- mm/swapfile.c | 15 +++++++++++++++ 3 files changed, 33 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/include/linux/swap.h b/include/linux/swap.h index 7176ba780e83..461cf107ad52 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -527,6 +527,15 @@ static inline swp_entry_t get_swap_page(struct page *page) #endif /* CONFIG_SWAP */ +#ifdef CONFIG_THP_SWAP +extern int split_swap_cluster(swp_entry_t entry); +#else +static inline int split_swap_cluster(swp_entry_t entry) +{ + return 0; +} +#endif + #ifdef CONFIG_MEMCG static inline int mem_cgroup_swappiness(struct mem_cgroup *memcg) { diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 02dfe635c9fe..772048c233d1 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2481,6 +2481,9 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(!PageCompound(page), page); + if (PageWriteback(page)) + return -EBUSY; + if (PageAnon(head)) { /* * The caller does not necessarily hold an mmap_sem that would @@ -2558,7 +2561,12 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) __dec_node_page_state(page, NR_SHMEM_THPS); spin_unlock(&pgdata->split_queue_lock); __split_huge_page(page, list, flags); - ret = 0; + if (PageSwapCache(head)) { + swp_entry_t entry = { .val = page_private(head) }; + + ret = split_swap_cluster(entry); + } else + ret = 0; } else { if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) { pr_alert("total_mapcount: %u, page_count(): %u\n", diff --git a/mm/swapfile.c b/mm/swapfile.c index 267b1fe41844..42eff9e4e972 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1216,6 +1216,21 @@ static void swapcache_free_cluster(swp_entry_t entry) } } } + +int split_swap_cluster(swp_entry_t entry) +{ + struct swap_info_struct *si; + struct swap_cluster_info *ci; + unsigned long offset = swp_offset(entry); + + si = _swap_info_get(entry); + if (!si) + return -EBUSY; + ci = lock_cluster(si, offset); + cluster_clear_huge(ci); + unlock_cluster(ci); + return 0; +} #else static inline void swapcache_free_cluster(swp_entry_t entry) { -- cgit v1.2.3 From 3e14a57b2416b7c94189b95baffd673cf5e0d0a3 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Wed, 6 Sep 2017 16:22:37 -0700 Subject: memcg, THP, swap: support move mem cgroup charge for THP swapped out PTE mapped THP (Transparent Huge Page) will be ignored when moving memory cgroup charge. But for THP which is in the swap cache, the memory cgroup charge for the swap of a tail-page may be moved in current implementation. That isn't correct, because the swap charge for all sub-pages of a THP should be moved together. Following the processing of the PTE mapped THP, the mem cgroup charge moving for the swap entry for a tail-page of a THP is ignored too. Link: http://lkml.kernel.org/r/20170724051840.2309-9-ying.huang@intel.com Signed-off-by: "Huang, Ying" Cc: Johannes Weiner Cc: Minchan Kim Cc: Michal Hocko Cc: Andrea Arcangeli Cc: "Kirill A . Shutemov" Cc: Dan Williams Cc: Hugh Dickins Cc: Jens Axboe Cc: Rik van Riel Cc: Ross Zwisler [for brd.c, zram_drv.c, pmem.c] Cc: Shaohua Li Cc: Vishal L Verma Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 3d3f7b1686ac..2ae315e4d9f6 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4639,8 +4639,11 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, if (!ret || !target) put_page(page); } - /* There is a swap entry and a page doesn't exist or isn't charged */ - if (ent.val && !ret && + /* + * There is a swap entry and a page doesn't exist or isn't charged. + * But we cannot move a tail-page in a THP. + */ + if (ent.val && !ret && (!page || !PageTransCompound(page)) && mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) { ret = MC_TARGET_SWAP; if (target) -- cgit v1.2.3 From abe2895b76047bf5430990f2584cd91f76692218 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Wed, 6 Sep 2017 16:22:41 -0700 Subject: memcg, THP, swap: avoid to duplicated charge THP in swap cache For a THP (Transparent Huge Page), tail_page->mem_cgroup is NULL. So to check whether the page is charged already, we need to check the head page. This is not an issue before because it is impossible for a THP to be in the swap cache before. But after we add delaying splitting THP after swapped out support, it is possible now. Link: http://lkml.kernel.org/r/20170724051840.2309-10-ying.huang@intel.com Signed-off-by: "Huang, Ying" Cc: Johannes Weiner Cc: Minchan Kim Cc: Michal Hocko Cc: Andrea Arcangeli Cc: "Kirill A . Shutemov" Cc: Dan Williams Cc: Hugh Dickins Cc: Jens Axboe Cc: Rik van Riel Cc: Ross Zwisler [for brd.c, zram_drv.c, pmem.c] Cc: Shaohua Li Cc: Vishal L Verma Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2ae315e4d9f6..e7f47a38fe4f 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5430,7 +5430,7 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, * in turn serializes uncharging. */ VM_BUG_ON_PAGE(!PageLocked(page), page); - if (page->mem_cgroup) + if (compound_head(page)->mem_cgroup) goto out; if (do_swap_account) { -- cgit v1.2.3 From d6810d730022016d9c0f389452b86b035dba1492 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Wed, 6 Sep 2017 16:22:45 -0700 Subject: memcg, THP, swap: make mem_cgroup_swapout() support THP This patch makes mem_cgroup_swapout() works for the transparent huge page (THP). Which will move the memory cgroup charge from memory to swap for a THP. This will be used for the THP swap support. Where a THP may be swapped out as a whole to a set of (HPAGE_PMD_NR) continuous swap slots on the swap device. Link: http://lkml.kernel.org/r/20170724051840.2309-11-ying.huang@intel.com Signed-off-by: "Huang, Ying" Cc: Johannes Weiner Cc: Minchan Kim Cc: Michal Hocko Cc: Andrea Arcangeli Cc: "Kirill A . Shutemov" Cc: Dan Williams Cc: Hugh Dickins Cc: Jens Axboe Cc: Rik van Riel Cc: Ross Zwisler [for brd.c, zram_drv.c, pmem.c] Cc: Shaohua Li Cc: Vishal L Verma Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e7f47a38fe4f..c1f9b79817d7 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4654,8 +4654,8 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* - * We don't consider swapping or file mapped pages because THP does not - * support them for now. + * We don't consider PMD mapped swapping or file mapped pages because THP does + * not support them for now. * Caller should make sure that pmd_trans_huge(pmd) is true. */ static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, @@ -5913,6 +5913,7 @@ static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg) void mem_cgroup_swapout(struct page *page, swp_entry_t entry) { struct mem_cgroup *memcg, *swap_memcg; + unsigned int nr_entries; unsigned short oldid; VM_BUG_ON_PAGE(PageLRU(page), page); @@ -5933,19 +5934,24 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) * ancestor for the swap instead and transfer the memory+swap charge. */ swap_memcg = mem_cgroup_id_get_online(memcg); - oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg), 1); + nr_entries = hpage_nr_pages(page); + /* Get references for the tail pages, too */ + if (nr_entries > 1) + mem_cgroup_id_get_many(swap_memcg, nr_entries - 1); + oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg), + nr_entries); VM_BUG_ON_PAGE(oldid, page); - mem_cgroup_swap_statistics(swap_memcg, 1); + mem_cgroup_swap_statistics(swap_memcg, nr_entries); page->mem_cgroup = NULL; if (!mem_cgroup_is_root(memcg)) - page_counter_uncharge(&memcg->memory, 1); + page_counter_uncharge(&memcg->memory, nr_entries); if (memcg != swap_memcg) { if (!mem_cgroup_is_root(swap_memcg)) - page_counter_charge(&swap_memcg->memsw, 1); - page_counter_uncharge(&memcg->memsw, 1); + page_counter_charge(&swap_memcg->memsw, nr_entries); + page_counter_uncharge(&memcg->memsw, nr_entries); } /* @@ -5955,7 +5961,8 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) * only synchronisation we have for udpating the per-CPU variables. */ VM_BUG_ON(!irqs_disabled()); - mem_cgroup_charge_statistics(memcg, page, false, -1); + mem_cgroup_charge_statistics(memcg, page, PageTransHuge(page), + -nr_entries); memcg_check_events(memcg, page); if (!mem_cgroup_is_root(memcg)) -- cgit v1.2.3 From bd4c82c22c367e068acb1ec9ec02be2fac3e09e2 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Wed, 6 Sep 2017 16:22:49 -0700 Subject: mm, THP, swap: delay splitting THP after swapped out In this patch, splitting transparent huge page (THP) during swapping out is delayed from after adding the THP into the swap cache to after swapping out finishes. After the patch, more operations for the anonymous THP reclaiming, such as writing the THP to the swap device, removing the THP from the swap cache could be batched. So that the performance of anonymous THP swapping out could be improved. This is the second step for the THP swap support. The plan is to delay splitting the THP step by step and avoid splitting the THP finally. With the patchset, the swap out throughput improves 42% (from about 5.81GB/s to about 8.25GB/s) in the vm-scalability swap-w-seq test case with 16 processes. At the same time, the IPI (reflect TLB flushing) reduced about 78.9%. The test is done on a Xeon E5 v3 system. The swap device used is a RAM simulated PMEM (persistent memory) device. To test the sequential swapping out, the test case creates 8 processes, which sequentially allocate and write to the anonymous pages until the RAM and part of the swap device is used up. Link: http://lkml.kernel.org/r/20170724051840.2309-12-ying.huang@intel.com Signed-off-by: "Huang, Ying" Cc: Johannes Weiner Cc: Minchan Kim Cc: Hugh Dickins Cc: Shaohua Li Cc: Rik van Riel Cc: Andrea Arcangeli Cc: "Kirill A . Shutemov" Cc: Michal Hocko Cc: Dan Williams Cc: Jens Axboe Cc: Ross Zwisler [for brd.c, zram_drv.c, pmem.c] Cc: Vishal L Verma Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 95 +++++++++++++++++++++++++++++++++---------------------------- 1 file changed, 52 insertions(+), 43 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 1638814c7848..6fbf707c0ce2 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -536,7 +536,9 @@ static inline int is_page_cache_freeable(struct page *page) * that isolated the page, the page cache radix tree and * optional buffer heads at page->private. */ - return page_count(page) - page_has_private(page) == 2; + int radix_pins = PageTransHuge(page) && PageSwapCache(page) ? + HPAGE_PMD_NR : 1; + return page_count(page) - page_has_private(page) == 1 + radix_pins; } static int may_write_to_inode(struct inode *inode, struct scan_control *sc) @@ -666,6 +668,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, bool reclaimed) { unsigned long flags; + int refcount; BUG_ON(!PageLocked(page)); BUG_ON(mapping != page_mapping(page)); @@ -696,11 +699,15 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, * Note that if SetPageDirty is always performed via set_page_dirty, * and thus under tree_lock, then this ordering is not required. */ - if (!page_ref_freeze(page, 2)) + if (unlikely(PageTransHuge(page)) && PageSwapCache(page)) + refcount = 1 + HPAGE_PMD_NR; + else + refcount = 2; + if (!page_ref_freeze(page, refcount)) goto cannot_free; /* note: atomic_cmpxchg in page_freeze_refs provides the smp_rmb */ if (unlikely(PageDirty(page))) { - page_ref_unfreeze(page, 2); + page_ref_unfreeze(page, refcount); goto cannot_free; } @@ -1122,58 +1129,56 @@ static unsigned long shrink_page_list(struct list_head *page_list, * Try to allocate it some swap space here. * Lazyfree page could be freed directly */ - if (PageAnon(page) && PageSwapBacked(page) && - !PageSwapCache(page)) { - if (!(sc->gfp_mask & __GFP_IO)) - goto keep_locked; - if (PageTransHuge(page)) { - /* cannot split THP, skip it */ - if (!can_split_huge_page(page, NULL)) - goto activate_locked; - /* - * Split pages without a PMD map right - * away. Chances are some or all of the - * tail pages can be freed without IO. - */ - if (!compound_mapcount(page) && - split_huge_page_to_list(page, page_list)) - goto activate_locked; - } - if (!add_to_swap(page)) { - if (!PageTransHuge(page)) - goto activate_locked; - /* Split THP and swap individual base pages */ - if (split_huge_page_to_list(page, page_list)) - goto activate_locked; - if (!add_to_swap(page)) - goto activate_locked; - } - - /* XXX: We don't support THP writes */ - if (PageTransHuge(page) && - split_huge_page_to_list(page, page_list)) { - delete_from_swap_cache(page); - goto activate_locked; - } + if (PageAnon(page) && PageSwapBacked(page)) { + if (!PageSwapCache(page)) { + if (!(sc->gfp_mask & __GFP_IO)) + goto keep_locked; + if (PageTransHuge(page)) { + /* cannot split THP, skip it */ + if (!can_split_huge_page(page, NULL)) + goto activate_locked; + /* + * Split pages without a PMD map right + * away. Chances are some or all of the + * tail pages can be freed without IO. + */ + if (!compound_mapcount(page) && + split_huge_page_to_list(page, + page_list)) + goto activate_locked; + } + if (!add_to_swap(page)) { + if (!PageTransHuge(page)) + goto activate_locked; + /* Fallback to swap normal pages */ + if (split_huge_page_to_list(page, + page_list)) + goto activate_locked; + if (!add_to_swap(page)) + goto activate_locked; + } - may_enter_fs = 1; + may_enter_fs = 1; - /* Adding to swap updated mapping */ - mapping = page_mapping(page); + /* Adding to swap updated mapping */ + mapping = page_mapping(page); + } } else if (unlikely(PageTransHuge(page))) { /* Split file THP */ if (split_huge_page_to_list(page, page_list)) goto keep_locked; } - VM_BUG_ON_PAGE(PageTransHuge(page), page); - /* * The page is mapped into the page tables of one or more * processes. Try to unmap it here. */ if (page_mapped(page)) { - if (!try_to_unmap(page, ttu_flags | TTU_BATCH_FLUSH)) { + enum ttu_flags flags = ttu_flags | TTU_BATCH_FLUSH; + + if (unlikely(PageTransHuge(page))) + flags |= TTU_SPLIT_HUGE_PMD; + if (!try_to_unmap(page, flags)) { nr_unmap_fail++; goto activate_locked; } @@ -1313,7 +1318,11 @@ free_it: * Is there need to periodically free_page_list? It would * appear not as the counts should be low */ - list_add(&page->lru, &free_pages); + if (unlikely(PageTransHuge(page))) { + mem_cgroup_uncharge(page); + (*get_compound_page_dtor(page))(page); + } else + list_add(&page->lru, &free_pages); continue; activate_locked: -- cgit v1.2.3 From fe490cc0fe9e6ee48cc48bb5dc463bc5f0f1428f Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Wed, 6 Sep 2017 16:22:52 -0700 Subject: mm, THP, swap: add THP swapping out fallback counting When swapping out THP (Transparent Huge Page), instead of swapping out the THP as a whole, sometimes we have to fallback to split the THP into normal pages before swapping, because no free swap clusters are available, or cgroup limit is exceeded, etc. To count the number of the fallback, a new VM event THP_SWPOUT_FALLBACK is added, and counted when we fallback to split the THP. Link: http://lkml.kernel.org/r/20170724051840.2309-13-ying.huang@intel.com Signed-off-by: "Huang, Ying" Cc: Johannes Weiner Cc: Minchan Kim Cc: Hugh Dickins Cc: Shaohua Li Cc: Rik van Riel Cc: Andrea Arcangeli Cc: "Kirill A . Shutemov" Cc: Michal Hocko Cc: Dan Williams Cc: Jens Axboe Cc: Ross Zwisler [for brd.c, zram_drv.c, pmem.c] Cc: Vishal L Verma Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vm_event_item.h | 1 + mm/vmscan.c | 3 +++ mm/vmstat.c | 1 + 3 files changed, 5 insertions(+) (limited to 'mm') diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index c75024e80eed..e02820fc2861 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -86,6 +86,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, THP_ZERO_PAGE_ALLOC, THP_ZERO_PAGE_ALLOC_FAILED, THP_SWPOUT, + THP_SWPOUT_FALLBACK, #endif #ifdef CONFIG_MEMORY_BALLOON BALLOON_INFLATE, diff --git a/mm/vmscan.c b/mm/vmscan.c index 6fbf707c0ce2..13d711dd8776 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1154,6 +1154,9 @@ static unsigned long shrink_page_list(struct list_head *page_list, if (split_huge_page_to_list(page, page_list)) goto activate_locked; +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + count_vm_event(THP_SWPOUT_FALLBACK); +#endif if (!add_to_swap(page)) goto activate_locked; } diff --git a/mm/vmstat.c b/mm/vmstat.c index bccf426453cd..e131b51654c7 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1072,6 +1072,7 @@ const char * const vmstat_text[] = { "thp_zero_page_alloc", "thp_zero_page_alloc_failed", "thp_swpout", + "thp_swpout_fallback", #endif #ifdef CONFIG_MEMORY_BALLOON "balloon_inflate", -- cgit v1.2.3 From b1cc94ab2f2ba31fcb2c59df0b9cf03f6d720553 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Wed, 6 Sep 2017 16:22:56 -0700 Subject: shmem: shmem_charge: verify max_block is not exceeded before inode update Patch series "userfaultfd: enable zeropage support for shmem". These patches enable support for UFFDIO_ZEROPAGE for shared memory. The first two patches are not strictly related to userfaultfd, they are just minor refactoring to reduce amount of code duplication. This patch (of 7): Currently we update inode and shmem_inode_info before verifying that used_blocks will not exceed max_blocks. In case it will, we undo the update. Let's switch the order and move the verification of the blocks count before the inode and shmem_inode_info update. Link: http://lkml.kernel.org/r/1497939652-16528-2-git-send-email-rppt@linux.vnet.ibm.com Signed-off-by: Mike Rapoport Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: "Kirill A. Shutemov" Cc: Hillf Danton Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index fbcb3c96a186..35b524085c44 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -266,6 +266,14 @@ bool shmem_charge(struct inode *inode, long pages) if (shmem_acct_block(info->flags, pages)) return false; + + if (sbinfo->max_blocks) { + if (percpu_counter_compare(&sbinfo->used_blocks, + sbinfo->max_blocks - pages) > 0) + goto unacct; + percpu_counter_add(&sbinfo->used_blocks, pages); + } + spin_lock_irqsave(&info->lock, flags); info->alloced += pages; inode->i_blocks += pages * BLOCKS_PER_PAGE; @@ -273,20 +281,11 @@ bool shmem_charge(struct inode *inode, long pages) spin_unlock_irqrestore(&info->lock, flags); inode->i_mapping->nrpages += pages; - if (!sbinfo->max_blocks) - return true; - if (percpu_counter_compare(&sbinfo->used_blocks, - sbinfo->max_blocks - pages) > 0) { - inode->i_mapping->nrpages -= pages; - spin_lock_irqsave(&info->lock, flags); - info->alloced -= pages; - shmem_recalc_inode(inode); - spin_unlock_irqrestore(&info->lock, flags); - shmem_unacct_blocks(info->flags, pages); - return false; - } - percpu_counter_add(&sbinfo->used_blocks, pages); return true; + +unacct: + shmem_unacct_blocks(info->flags, pages); + return false; } void shmem_uncharge(struct inode *inode, long pages) -- cgit v1.2.3 From 0f0796945614b7523987f7eea32407421af4b1ee Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Wed, 6 Sep 2017 16:22:59 -0700 Subject: shmem: introduce shmem_inode_acct_block The shmem_acct_block and the update of used_blocks are following one another in all the places they are used. Combine these two into a helper function. Link: http://lkml.kernel.org/r/1497939652-16528-3-git-send-email-rppt@linux.vnet.ibm.com Signed-off-by: Mike Rapoport Cc: "Kirill A. Shutemov" Cc: Andrea Arcangeli Cc: Hillf Danton Cc: Hugh Dickins Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 102 ++++++++++++++++++++++++++++--------------------------------- 1 file changed, 46 insertions(+), 56 deletions(-) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index 35b524085c44..b7d84c4f2a5c 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -188,6 +188,38 @@ static inline void shmem_unacct_blocks(unsigned long flags, long pages) vm_unacct_memory(pages * VM_ACCT(PAGE_SIZE)); } +static inline bool shmem_inode_acct_block(struct inode *inode, long pages) +{ + struct shmem_inode_info *info = SHMEM_I(inode); + struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); + + if (shmem_acct_block(info->flags, pages)) + return false; + + if (sbinfo->max_blocks) { + if (percpu_counter_compare(&sbinfo->used_blocks, + sbinfo->max_blocks - pages) > 0) + goto unacct; + percpu_counter_add(&sbinfo->used_blocks, pages); + } + + return true; + +unacct: + shmem_unacct_blocks(info->flags, pages); + return false; +} + +static inline void shmem_inode_unacct_blocks(struct inode *inode, long pages) +{ + struct shmem_inode_info *info = SHMEM_I(inode); + struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); + + if (sbinfo->max_blocks) + percpu_counter_sub(&sbinfo->used_blocks, pages); + shmem_unacct_blocks(info->flags, pages); +} + static const struct super_operations shmem_ops; static const struct address_space_operations shmem_aops; static const struct file_operations shmem_file_operations; @@ -249,31 +281,20 @@ static void shmem_recalc_inode(struct inode *inode) freed = info->alloced - info->swapped - inode->i_mapping->nrpages; if (freed > 0) { - struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); - if (sbinfo->max_blocks) - percpu_counter_add(&sbinfo->used_blocks, -freed); info->alloced -= freed; inode->i_blocks -= freed * BLOCKS_PER_PAGE; - shmem_unacct_blocks(info->flags, freed); + shmem_inode_unacct_blocks(inode, freed); } } bool shmem_charge(struct inode *inode, long pages) { struct shmem_inode_info *info = SHMEM_I(inode); - struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); unsigned long flags; - if (shmem_acct_block(info->flags, pages)) + if (!shmem_inode_acct_block(inode, pages)) return false; - if (sbinfo->max_blocks) { - if (percpu_counter_compare(&sbinfo->used_blocks, - sbinfo->max_blocks - pages) > 0) - goto unacct; - percpu_counter_add(&sbinfo->used_blocks, pages); - } - spin_lock_irqsave(&info->lock, flags); info->alloced += pages; inode->i_blocks += pages * BLOCKS_PER_PAGE; @@ -282,16 +303,11 @@ bool shmem_charge(struct inode *inode, long pages) inode->i_mapping->nrpages += pages; return true; - -unacct: - shmem_unacct_blocks(info->flags, pages); - return false; } void shmem_uncharge(struct inode *inode, long pages) { struct shmem_inode_info *info = SHMEM_I(inode); - struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); unsigned long flags; spin_lock_irqsave(&info->lock, flags); @@ -300,9 +316,7 @@ void shmem_uncharge(struct inode *inode, long pages) shmem_recalc_inode(inode); spin_unlock_irqrestore(&info->lock, flags); - if (sbinfo->max_blocks) - percpu_counter_sub(&sbinfo->used_blocks, pages); - shmem_unacct_blocks(info->flags, pages); + shmem_inode_unacct_blocks(inode, pages); } /* @@ -1451,9 +1465,10 @@ static struct page *shmem_alloc_page(gfp_t gfp, } static struct page *shmem_alloc_and_acct_page(gfp_t gfp, - struct shmem_inode_info *info, struct shmem_sb_info *sbinfo, + struct inode *inode, pgoff_t index, bool huge) { + struct shmem_inode_info *info = SHMEM_I(inode); struct page *page; int nr; int err = -ENOSPC; @@ -1462,14 +1477,8 @@ static struct page *shmem_alloc_and_acct_page(gfp_t gfp, huge = false; nr = huge ? HPAGE_PMD_NR : 1; - if (shmem_acct_block(info->flags, nr)) + if (!shmem_inode_acct_block(inode, nr)) goto failed; - if (sbinfo->max_blocks) { - if (percpu_counter_compare(&sbinfo->used_blocks, - sbinfo->max_blocks - nr) > 0) - goto unacct; - percpu_counter_add(&sbinfo->used_blocks, nr); - } if (huge) page = shmem_alloc_hugepage(gfp, info, index); @@ -1482,10 +1491,7 @@ static struct page *shmem_alloc_and_acct_page(gfp_t gfp, } err = -ENOMEM; - if (sbinfo->max_blocks) - percpu_counter_add(&sbinfo->used_blocks, -nr); -unacct: - shmem_unacct_blocks(info->flags, nr); + shmem_inode_unacct_blocks(inode, nr); failed: return ERR_PTR(err); } @@ -1750,10 +1756,9 @@ repeat: } alloc_huge: - page = shmem_alloc_and_acct_page(gfp, info, sbinfo, - index, true); + page = shmem_alloc_and_acct_page(gfp, inode, index, true); if (IS_ERR(page)) { -alloc_nohuge: page = shmem_alloc_and_acct_page(gfp, info, sbinfo, +alloc_nohuge: page = shmem_alloc_and_acct_page(gfp, inode, index, false); } if (IS_ERR(page)) { @@ -1875,10 +1880,7 @@ clear: * Error recovery. */ unacct: - if (sbinfo->max_blocks) - percpu_counter_sub(&sbinfo->used_blocks, - 1 << compound_order(page)); - shmem_unacct_blocks(info->flags, 1 << compound_order(page)); + shmem_inode_unacct_blocks(inode, 1 << compound_order(page)); if (PageTransHuge(page)) { unlock_page(page); @@ -2214,7 +2216,6 @@ int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, { struct inode *inode = file_inode(dst_vma->vm_file); struct shmem_inode_info *info = SHMEM_I(inode); - struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); struct address_space *mapping = inode->i_mapping; gfp_t gfp = mapping_gfp_mask(mapping); pgoff_t pgoff = linear_page_index(dst_vma, dst_addr); @@ -2226,19 +2227,13 @@ int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, int ret; ret = -ENOMEM; - if (shmem_acct_block(info->flags, 1)) + if (!shmem_inode_acct_block(inode, 1)) goto out; - if (sbinfo->max_blocks) { - if (percpu_counter_compare(&sbinfo->used_blocks, - sbinfo->max_blocks) >= 0) - goto out_unacct_blocks; - percpu_counter_inc(&sbinfo->used_blocks); - } if (!*pagep) { page = shmem_alloc_page(gfp, info, pgoff); if (!page) - goto out_dec_used_blocks; + goto out_unacct_blocks; page_kaddr = kmap_atomic(page); ret = copy_from_user(page_kaddr, (const void __user *)src_addr, @@ -2248,9 +2243,7 @@ int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, /* fallback to copy_from_user outside mmap_sem */ if (unlikely(ret)) { *pagep = page; - if (sbinfo->max_blocks) - percpu_counter_add(&sbinfo->used_blocks, -1); - shmem_unacct_blocks(info->flags, 1); + shmem_inode_unacct_blocks(inode, 1); /* don't free the page */ return -EFAULT; } @@ -2313,11 +2306,8 @@ out_release_uncharge: out_release: unlock_page(page); put_page(page); -out_dec_used_blocks: - if (sbinfo->max_blocks) - percpu_counter_add(&sbinfo->used_blocks, -1); out_unacct_blocks: - shmem_unacct_blocks(info->flags, 1); + shmem_inode_unacct_blocks(inode, 1); goto out; } -- cgit v1.2.3 From 8d10396342063c79e92c4e46215370ab7b988569 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Wed, 6 Sep 2017 16:23:02 -0700 Subject: userfaultfd: shmem: add shmem_mfill_zeropage_pte for userfaultfd support shmem_mfill_zeropage_pte is the low level routine that implements the userfaultfd UFFDIO_ZEROPAGE command. Since for shmem mappings zero pages are always allocated and accounted, the new method is a slight extension of the existing shmem_mcopy_atomic_pte. Link: http://lkml.kernel.org/r/1497939652-16528-4-git-send-email-rppt@linux.vnet.ibm.com Signed-off-by: Mike Rapoport Cc: "Kirill A. Shutemov" Cc: Andrea Arcangeli Cc: Hillf Danton Cc: Hugh Dickins Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/shmem_fs.h | 6 +++++ mm/shmem.c | 62 +++++++++++++++++++++++++++++++++++------------- 2 files changed, 51 insertions(+), 17 deletions(-) (limited to 'mm') diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index a7d6bd2a918f..b6c3540e07bc 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -137,9 +137,15 @@ extern int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd, unsigned long dst_addr, unsigned long src_addr, struct page **pagep); +extern int shmem_mfill_zeropage_pte(struct mm_struct *dst_mm, + pmd_t *dst_pmd, + struct vm_area_struct *dst_vma, + unsigned long dst_addr); #else #define shmem_mcopy_atomic_pte(dst_mm, dst_pte, dst_vma, dst_addr, \ src_addr, pagep) ({ BUG(); 0; }) +#define shmem_mfill_zeropage_pte(dst_mm, dst_pmd, dst_vma, \ + dst_addr) ({ BUG(); 0; }) #endif #endif diff --git a/mm/shmem.c b/mm/shmem.c index b7d84c4f2a5c..64bdc91187f7 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2207,12 +2207,13 @@ bool shmem_mapping(struct address_space *mapping) return mapping->a_ops == &shmem_aops; } -int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, - pmd_t *dst_pmd, - struct vm_area_struct *dst_vma, - unsigned long dst_addr, - unsigned long src_addr, - struct page **pagep) +static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, + pmd_t *dst_pmd, + struct vm_area_struct *dst_vma, + unsigned long dst_addr, + unsigned long src_addr, + bool zeropage, + struct page **pagep) { struct inode *inode = file_inode(dst_vma->vm_file); struct shmem_inode_info *info = SHMEM_I(inode); @@ -2235,17 +2236,22 @@ int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, if (!page) goto out_unacct_blocks; - page_kaddr = kmap_atomic(page); - ret = copy_from_user(page_kaddr, (const void __user *)src_addr, - PAGE_SIZE); - kunmap_atomic(page_kaddr); - - /* fallback to copy_from_user outside mmap_sem */ - if (unlikely(ret)) { - *pagep = page; - shmem_inode_unacct_blocks(inode, 1); - /* don't free the page */ - return -EFAULT; + if (!zeropage) { /* mcopy_atomic */ + page_kaddr = kmap_atomic(page); + ret = copy_from_user(page_kaddr, + (const void __user *)src_addr, + PAGE_SIZE); + kunmap_atomic(page_kaddr); + + /* fallback to copy_from_user outside mmap_sem */ + if (unlikely(ret)) { + *pagep = page; + shmem_inode_unacct_blocks(inode, 1); + /* don't free the page */ + return -EFAULT; + } + } else { /* mfill_zeropage_atomic */ + clear_highpage(page); } } else { page = *pagep; @@ -2311,6 +2317,28 @@ out_unacct_blocks: goto out; } +int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, + pmd_t *dst_pmd, + struct vm_area_struct *dst_vma, + unsigned long dst_addr, + unsigned long src_addr, + struct page **pagep) +{ + return shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, + dst_addr, src_addr, false, pagep); +} + +int shmem_mfill_zeropage_pte(struct mm_struct *dst_mm, + pmd_t *dst_pmd, + struct vm_area_struct *dst_vma, + unsigned long dst_addr) +{ + struct page *page = NULL; + + return shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, + dst_addr, 0, true, &page); +} + #ifdef CONFIG_TMPFS static const struct inode_operations shmem_symlink_inode_operations; static const struct inode_operations shmem_short_symlink_operations; -- cgit v1.2.3 From 3217d3c79b5d7aabf62daa4db8cf757abedc9f28 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Wed, 6 Sep 2017 16:23:06 -0700 Subject: userfaultfd: mcopy_atomic: introduce mfill_atomic_pte helper Shuffle the code a bit to improve readability. Link: http://lkml.kernel.org/r/1497939652-16528-5-git-send-email-rppt@linux.vnet.ibm.com Signed-off-by: Mike Rapoport Cc: "Kirill A. Shutemov" Cc: Andrea Arcangeli Cc: Hillf Danton Cc: Hugh Dickins Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/userfaultfd.c | 46 ++++++++++++++++++++++++++++++---------------- 1 file changed, 30 insertions(+), 16 deletions(-) (limited to 'mm') diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 8bcb501bce60..48c015c80120 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -371,6 +371,34 @@ extern ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm, bool zeropage); #endif /* CONFIG_HUGETLB_PAGE */ +static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm, + pmd_t *dst_pmd, + struct vm_area_struct *dst_vma, + unsigned long dst_addr, + unsigned long src_addr, + struct page **page, + bool zeropage) +{ + ssize_t err; + + if (vma_is_anonymous(dst_vma)) { + if (!zeropage) + err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma, + dst_addr, src_addr, page); + else + err = mfill_zeropage_pte(dst_mm, dst_pmd, + dst_vma, dst_addr); + } else { + err = -EINVAL; /* if zeropage is true return -EINVAL */ + if (likely(!zeropage)) + err = shmem_mcopy_atomic_pte(dst_mm, dst_pmd, + dst_vma, dst_addr, + src_addr, page); + } + + return err; +} + static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start, unsigned long src_start, @@ -487,22 +515,8 @@ retry: BUG_ON(pmd_none(*dst_pmd)); BUG_ON(pmd_trans_huge(*dst_pmd)); - if (vma_is_anonymous(dst_vma)) { - if (!zeropage) - err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma, - dst_addr, src_addr, - &page); - else - err = mfill_zeropage_pte(dst_mm, dst_pmd, - dst_vma, dst_addr); - } else { - err = -EINVAL; /* if zeropage is true return -EINVAL */ - if (likely(!zeropage)) - err = shmem_mcopy_atomic_pte(dst_mm, dst_pmd, - dst_vma, dst_addr, - src_addr, &page); - } - + err = mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr, + src_addr, &page, zeropage); cond_resched(); if (unlikely(err == -EFAULT)) { -- cgit v1.2.3 From 8fb44e5403ca86e33411dfa12dd298ed5ab1c3f7 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Wed, 6 Sep 2017 16:23:09 -0700 Subject: userfaultfd: shmem: wire up shmem_mfill_zeropage_pte For shmem VMAs we can use shmem_mfill_zeropage_pte for UFFDIO_ZEROPAGE Link: http://lkml.kernel.org/r/1497939652-16528-6-git-send-email-rppt@linux.vnet.ibm.com Signed-off-by: Mike Rapoport Cc: "Kirill A. Shutemov" Cc: Andrea Arcangeli Cc: Hillf Danton Cc: Hugh Dickins Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/userfaultfd.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 48c015c80120..81192701964d 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -389,11 +389,13 @@ static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm, err = mfill_zeropage_pte(dst_mm, dst_pmd, dst_vma, dst_addr); } else { - err = -EINVAL; /* if zeropage is true return -EINVAL */ - if (likely(!zeropage)) + if (!zeropage) err = shmem_mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr, src_addr, page); + else + err = shmem_mfill_zeropage_pte(dst_mm, dst_pmd, + dst_vma, dst_addr); } return err; -- cgit v1.2.3 From c41f012ade0b95b0a6e25c7150673e0554736165 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 6 Sep 2017 16:23:36 -0700 Subject: mm: rename global_page_state to global_zone_page_state global_page_state is error prone as a recent bug report pointed out [1]. It only returns proper values for zone based counters as the enum it gets suggests. We already have global_node_page_state so let's rename global_page_state to global_zone_page_state to be more explicit here. All existing users seems to be correct: $ git grep "global_page_state(NR_" | sed 's@.*(\(NR_[A-Z_]*\)).*@\1@' | sort | uniq -c 2 NR_BOUNCE 2 NR_FREE_CMA_PAGES 11 NR_FREE_PAGES 1 NR_KERNEL_STACK_KB 1 NR_MLOCK 2 NR_PAGETABLE This patch shouldn't introduce any functional change. [1] http://lkml.kernel.org/r/201707260628.v6Q6SmaS030814@www262.sakura.ne.jp Link: http://lkml.kernel.org/r/20170801134256.5400-2-hannes@cmpxchg.org Signed-off-by: Michal Hocko Signed-off-by: Johannes Weiner Cc: Tetsuo Handa Cc: Josef Bacik Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/meminfo.c | 10 +++++----- include/linux/swap.h | 4 ++-- include/linux/vmstat.h | 4 ++-- mm/mmap.c | 6 +++--- mm/nommu.c | 4 ++-- mm/page-writeback.c | 4 ++-- mm/page_alloc.c | 12 ++++++------ mm/util.c | 2 +- mm/vmstat.c | 4 ++-- 9 files changed, 25 insertions(+), 25 deletions(-) (limited to 'mm') diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 509a61668d90..cdd979724c74 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -80,7 +80,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) show_val_kb(m, "Active(file): ", pages[LRU_ACTIVE_FILE]); show_val_kb(m, "Inactive(file): ", pages[LRU_INACTIVE_FILE]); show_val_kb(m, "Unevictable: ", pages[LRU_UNEVICTABLE]); - show_val_kb(m, "Mlocked: ", global_page_state(NR_MLOCK)); + show_val_kb(m, "Mlocked: ", global_zone_page_state(NR_MLOCK)); #ifdef CONFIG_HIGHMEM show_val_kb(m, "HighTotal: ", i.totalhigh); @@ -114,9 +114,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v) show_val_kb(m, "SUnreclaim: ", global_node_page_state(NR_SLAB_UNRECLAIMABLE)); seq_printf(m, "KernelStack: %8lu kB\n", - global_page_state(NR_KERNEL_STACK_KB)); + global_zone_page_state(NR_KERNEL_STACK_KB)); show_val_kb(m, "PageTables: ", - global_page_state(NR_PAGETABLE)); + global_zone_page_state(NR_PAGETABLE)); #ifdef CONFIG_QUICKLIST show_val_kb(m, "Quicklists: ", quicklist_total_size()); #endif @@ -124,7 +124,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) show_val_kb(m, "NFS_Unstable: ", global_node_page_state(NR_UNSTABLE_NFS)); show_val_kb(m, "Bounce: ", - global_page_state(NR_BOUNCE)); + global_zone_page_state(NR_BOUNCE)); show_val_kb(m, "WritebackTmp: ", global_node_page_state(NR_WRITEBACK_TEMP)); show_val_kb(m, "CommitLimit: ", vm_commit_limit()); @@ -151,7 +151,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) #ifdef CONFIG_CMA show_val_kb(m, "CmaTotal: ", totalcma_pages); show_val_kb(m, "CmaFree: ", - global_page_state(NR_FREE_CMA_PAGES)); + global_zone_page_state(NR_FREE_CMA_PAGES)); #endif hugetlb_report_meminfo(m); diff --git a/include/linux/swap.h b/include/linux/swap.h index 461cf107ad52..76f1632eea5a 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -263,8 +263,8 @@ extern unsigned long totalreserve_pages; extern unsigned long nr_free_buffer_pages(void); extern unsigned long nr_free_pagecache_pages(void); -/* Definition of global_page_state not available yet */ -#define nr_free_pages() global_page_state(NR_FREE_PAGES) +/* Definition of global_zone_page_state not available yet */ +#define nr_free_pages() global_zone_page_state(NR_FREE_PAGES) /* linux/mm/swap.c */ diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index b3d85f30d424..97e11ab573f0 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -123,7 +123,7 @@ static inline void node_page_state_add(long x, struct pglist_data *pgdat, atomic_long_add(x, &vm_node_stat[item]); } -static inline unsigned long global_page_state(enum zone_stat_item item) +static inline unsigned long global_zone_page_state(enum zone_stat_item item) { long x = atomic_long_read(&vm_zone_stat[item]); #ifdef CONFIG_SMP @@ -199,7 +199,7 @@ extern unsigned long sum_zone_node_page_state(int node, extern unsigned long node_page_state(struct pglist_data *pgdat, enum node_stat_item item); #else -#define sum_zone_node_page_state(node, item) global_page_state(item) +#define sum_zone_node_page_state(node, item) global_zone_page_state(item) #define node_page_state(node, item) global_node_page_state(item) #endif /* CONFIG_NUMA */ diff --git a/mm/mmap.c b/mm/mmap.c index f19efcf75418..9800e29763f4 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -3514,7 +3514,7 @@ static int init_user_reserve(void) { unsigned long free_kbytes; - free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); + free_kbytes = global_zone_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17); return 0; @@ -3535,7 +3535,7 @@ static int init_admin_reserve(void) { unsigned long free_kbytes; - free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); + free_kbytes = global_zone_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13); return 0; @@ -3579,7 +3579,7 @@ static int reserve_mem_notifier(struct notifier_block *nb, break; case MEM_OFFLINE: - free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); + free_kbytes = global_zone_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); if (sysctl_user_reserve_kbytes > free_kbytes) { init_user_reserve(); diff --git a/mm/nommu.c b/mm/nommu.c index fc184f597d59..53d5175a5c14 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1962,7 +1962,7 @@ static int __meminit init_user_reserve(void) { unsigned long free_kbytes; - free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); + free_kbytes = global_zone_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17); return 0; @@ -1983,7 +1983,7 @@ static int __meminit init_admin_reserve(void) { unsigned long free_kbytes; - free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); + free_kbytes = global_zone_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13); return 0; diff --git a/mm/page-writeback.c b/mm/page-writeback.c index bf050ab025b7..0b9c5cbe8eba 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -363,7 +363,7 @@ static unsigned long global_dirtyable_memory(void) { unsigned long x; - x = global_page_state(NR_FREE_PAGES); + x = global_zone_page_state(NR_FREE_PAGES); /* * Pages reserved for the kernel should not be considered * dirtyable, to prevent a situation where reclaim has to @@ -1405,7 +1405,7 @@ void wb_update_bandwidth(struct bdi_writeback *wb, unsigned long start_time) * will look to see if it needs to start dirty throttling. * * If dirty_poll_interval is too low, big NUMA machines will call the expensive - * global_page_state() too often. So scale it near-sqrt to the safety margin + * global_zone_page_state() too often. So scale it near-sqrt to the safety margin * (the number of pages we may dirty without exceeding the dirty limits). */ static unsigned long dirty_poll_interval(unsigned long dirty, diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 0bea94af0423..a4562c058ec4 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4509,7 +4509,7 @@ long si_mem_available(void) * Estimate the amount of memory available for userspace allocations, * without causing swapping. */ - available = global_page_state(NR_FREE_PAGES) - totalreserve_pages; + available = global_zone_page_state(NR_FREE_PAGES) - totalreserve_pages; /* * Not all the page cache can be freed, otherwise the system will @@ -4538,7 +4538,7 @@ void si_meminfo(struct sysinfo *val) { val->totalram = totalram_pages; val->sharedram = global_node_page_state(NR_SHMEM); - val->freeram = global_page_state(NR_FREE_PAGES); + val->freeram = global_zone_page_state(NR_FREE_PAGES); val->bufferram = nr_blockdev_pages(); val->totalhigh = totalhigh_pages; val->freehigh = nr_free_highpages(); @@ -4673,11 +4673,11 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) global_node_page_state(NR_SLAB_UNRECLAIMABLE), global_node_page_state(NR_FILE_MAPPED), global_node_page_state(NR_SHMEM), - global_page_state(NR_PAGETABLE), - global_page_state(NR_BOUNCE), - global_page_state(NR_FREE_PAGES), + global_zone_page_state(NR_PAGETABLE), + global_zone_page_state(NR_BOUNCE), + global_zone_page_state(NR_FREE_PAGES), free_pcp, - global_page_state(NR_FREE_CMA_PAGES)); + global_zone_page_state(NR_FREE_CMA_PAGES)); for_each_online_pgdat(pgdat) { if (show_mem_node_skip(filter, pgdat->node_id, nodemask)) diff --git a/mm/util.c b/mm/util.c index 9ecddf568fe3..34e57fae959d 100644 --- a/mm/util.c +++ b/mm/util.c @@ -614,7 +614,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) return 0; if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { - free = global_page_state(NR_FREE_PAGES); + free = global_zone_page_state(NR_FREE_PAGES); free += global_node_page_state(NR_FILE_PAGES); /* diff --git a/mm/vmstat.c b/mm/vmstat.c index e131b51654c7..ba9b202e8500 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1502,7 +1502,7 @@ static void *vmstat_start(struct seq_file *m, loff_t *pos) if (!v) return ERR_PTR(-ENOMEM); for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) - v[i] = global_page_state(i); + v[i] = global_zone_page_state(i); v += NR_VM_ZONE_STAT_ITEMS; for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) @@ -1591,7 +1591,7 @@ int vmstat_refresh(struct ctl_table *table, int write, * which can equally be echo'ed to or cat'ted from (by root), * can be used to update the stats just before reading them. * - * Oh, and since global_page_state() etc. are so careful to hide + * Oh, and since global_zone_page_state() etc. are so careful to hide * transiently negative values, report an error here if any of * the stats is negative, so we know to go looking for imbalance. */ -- cgit v1.2.3 From 2376dd7ceddae67432db055ff3f2b7f4122a919d Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Wed, 6 Sep 2017 16:23:53 -0700 Subject: userfaultfd: call userfaultfd_unmap_prep only if __split_vma succeeds A __split_vma is not a worthy event to report, and it's definitely not a unmap so it would be incorrect to report unmap for the whole region to the userfaultfd manager if a __split_vma fails. So only call userfaultfd_unmap_prep after the __vma_splitting is over and do_munmap cannot fail anymore. Also add unlikely because it's better to optimize for the vast majority of apps that aren't using userfaultfd in a non cooperative way. Ideally we should also find a way to eliminate the branch entirely if CONFIG_USERFAULTFD=n, but it would complicate things so stick to unlikely for now. Link: http://lkml.kernel.org/r/20170802165145.22628-5-aarcange@redhat.com Signed-off-by: Andrea Arcangeli Cc: "Dr. David Alan Gilbert" Cc: Alexey Perevalov Cc: Maxime Coquelin Cc: Mike Kravetz Cc: Mike Rapoport Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index 9800e29763f4..52f6c6b18f40 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2639,13 +2639,6 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, if (vma->vm_start >= end) return 0; - if (uf) { - int error = userfaultfd_unmap_prep(vma, start, end, uf); - - if (error) - return error; - } - /* * If we need to split any vma, do it now to save pain later. * @@ -2679,6 +2672,21 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, } vma = prev ? prev->vm_next : mm->mmap; + if (unlikely(uf)) { + /* + * If userfaultfd_unmap_prep returns an error the vmas + * will remain splitted, but userland will get a + * highly unexpected error anyway. This is no + * different than the case where the first of the two + * __split_vma fails, but we don't undo the first + * split, despite we could. This is unlikely enough + * failure that it's not worth optimizing it for. + */ + int error = userfaultfd_unmap_prep(vma, start, end, uf); + if (error) + return error; + } + /* * unlock any mlock()ed ranges before detaching vmas */ -- cgit v1.2.3 From 79b63f12abcbbd2caf7064b294af648a87de07ff Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 6 Sep 2017 16:24:03 -0700 Subject: mm, hugetlb: do not allocate non-migrateable gigantic pages from movable zones alloc_gigantic_page doesn't consider movability of the gigantic hugetlb when scanning eligible ranges for the allocation. As 1GB hugetlb pages are not movable currently this can break the movable zone assumption that all allocations are migrateable and as such break memory hotplug. Reorganize the code and use the standard zonelist allocations scheme that we use for standard hugetbl pages. htlb_alloc_mask will ensure that only migratable hugetlb pages will ever see a movable zone. Link: http://lkml.kernel.org/r/20170803083549.21407-1-mhocko@kernel.org Fixes: 944d9fec8d7a ("hugetlb: add support for gigantic page allocation at runtime") Signed-off-by: Michal Hocko Reviewed-by: Mike Kravetz Cc: Luiz Capitulino Cc: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 35 ++++++++++++++++++++--------------- 1 file changed, 20 insertions(+), 15 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index cafd60316e68..34625b257128 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1066,11 +1066,11 @@ static void free_gigantic_page(struct page *page, unsigned int order) } static int __alloc_gigantic_page(unsigned long start_pfn, - unsigned long nr_pages) + unsigned long nr_pages, gfp_t gfp_mask) { unsigned long end_pfn = start_pfn + nr_pages; return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE, - GFP_KERNEL); + gfp_mask); } static bool pfn_range_valid_gigantic(struct zone *z, @@ -1108,19 +1108,24 @@ static bool zone_spans_last_pfn(const struct zone *zone, return zone_spans_pfn(zone, last_pfn); } -static struct page *alloc_gigantic_page(int nid, unsigned int order) +static struct page *alloc_gigantic_page(int nid, struct hstate *h) { + unsigned int order = huge_page_order(h); unsigned long nr_pages = 1 << order; unsigned long ret, pfn, flags; - struct zone *z; + struct zonelist *zonelist; + struct zone *zone; + struct zoneref *z; + gfp_t gfp_mask; - z = NODE_DATA(nid)->node_zones; - for (; z - NODE_DATA(nid)->node_zones < MAX_NR_ZONES; z++) { - spin_lock_irqsave(&z->lock, flags); + gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; + zonelist = node_zonelist(nid, gfp_mask); + for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp_mask), NULL) { + spin_lock_irqsave(&zone->lock, flags); - pfn = ALIGN(z->zone_start_pfn, nr_pages); - while (zone_spans_last_pfn(z, pfn, nr_pages)) { - if (pfn_range_valid_gigantic(z, pfn, nr_pages)) { + pfn = ALIGN(zone->zone_start_pfn, nr_pages); + while (zone_spans_last_pfn(zone, pfn, nr_pages)) { + if (pfn_range_valid_gigantic(zone, pfn, nr_pages)) { /* * We release the zone lock here because * alloc_contig_range() will also lock the zone @@ -1128,16 +1133,16 @@ static struct page *alloc_gigantic_page(int nid, unsigned int order) * spinning on this lock, it may win the race * and cause alloc_contig_range() to fail... */ - spin_unlock_irqrestore(&z->lock, flags); - ret = __alloc_gigantic_page(pfn, nr_pages); + spin_unlock_irqrestore(&zone->lock, flags); + ret = __alloc_gigantic_page(pfn, nr_pages, gfp_mask); if (!ret) return pfn_to_page(pfn); - spin_lock_irqsave(&z->lock, flags); + spin_lock_irqsave(&zone->lock, flags); } pfn += nr_pages; } - spin_unlock_irqrestore(&z->lock, flags); + spin_unlock_irqrestore(&zone->lock, flags); } return NULL; @@ -1150,7 +1155,7 @@ static struct page *alloc_fresh_gigantic_page_node(struct hstate *h, int nid) { struct page *page; - page = alloc_gigantic_page(nid, huge_page_order(h)); + page = alloc_gigantic_page(nid, h); if (page) { prep_compound_gigantic_page(page, huge_page_order(h)); prep_new_huge_page(h, page, nid); -- cgit v1.2.3 From 88d6ac40c1c6b2334046d3f89f1911a3c8febd4c Mon Sep 17 00:00:00 2001 From: Wen Yang Date: Wed, 6 Sep 2017 16:24:06 -0700 Subject: mm/vmstat: fix divide error at __fragmentation_index When order is -1 or too big, *1UL << order* will be 0, which will cause a divide error. Although it seems that all callers of __fragmentation_index() will only do so with a valid order, the patch can make it more robust. Should prevent reoccurrences of https://bugzilla.kernel.org/show_bug.cgi?id=196555 Link: http://lkml.kernel.org/r/1501751520-2598-1-git-send-email-wen.yang99@zte.com.cn Signed-off-by: Wen Yang Reviewed-by: Jiang Biao Suggested-by: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmstat.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'mm') diff --git a/mm/vmstat.c b/mm/vmstat.c index ba9b202e8500..05de233a6fca 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -870,6 +870,9 @@ static int __fragmentation_index(unsigned int order, struct contig_page_info *in { unsigned long requested = 1UL << order; + if (WARN_ON_ONCE(order >= MAX_ORDER)) + return 0; + if (!info->free_blocks_total) return 0; -- cgit v1.2.3 From c568da282bbc8f09c4b49201177fa259fe184c47 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Wed, 6 Sep 2017 16:24:09 -0700 Subject: mm/vmalloc.c: halve the number of comparisons performed in pcpu_get_vm_areas() In pcpu_get_vm_areas(), it checks each range is not overlapped. To make sure it is, only (N^2)/2 comparison is necessary, while current code does N^2 times. By starting from the next range, it achieves the goal and the continue could be removed. Also, - the overlap check of two ranges could be done with one clause - one typo in comment is fixed. Link: http://lkml.kernel.org/r/20170803063822.48702-1-richard.weiyang@gmail.com Signed-off-by: Wei Yang Acked-by: Tejun Heo Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index a47e3894c775..fa409c9092be 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2482,7 +2482,7 @@ static unsigned long pvm_determine_end(struct vmap_area **pnext, * matching slot. While scanning, if any of the areas overlaps with * existing vmap_area, the base address is pulled down to fit the * area. Scanning is repeated till all the areas fit and then all - * necessary data structres are inserted and the result is returned. + * necessary data structures are inserted and the result is returned. */ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, const size_t *sizes, int nr_vms, @@ -2510,15 +2510,11 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, if (start > offsets[last_area]) last_area = area; - for (area2 = 0; area2 < nr_vms; area2++) { + for (area2 = area + 1; area2 < nr_vms; area2++) { unsigned long start2 = offsets[area2]; unsigned long end2 = start2 + sizes[area2]; - if (area2 == area) - continue; - - BUG_ON(start2 >= start && start2 < end); - BUG_ON(end2 <= end && end2 > start); + BUG_ON(start2 < end && start < end2); } } last_end = offsets[last_area] + sizes[last_area]; -- cgit v1.2.3 From ab1b597ee0e4208a1db227bb7b2c9512c8234b48 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 6 Sep 2017 16:24:13 -0700 Subject: mm, devm_memremap_pages: use multi-order radix for ZONE_DEVICE lookups devm_memremap_pages() records mapped ranges in pgmap_radix with an entry per section's worth of memory (128MB). The key for each of those entries is a section number. This leads to false positives when devm_memremap_pages() is passed a section-unaligned range as lookups in the misalignment fail to return NULL. We can close this hole by using the pfn as the key for entries in the tree. The number of entries required to describe a remapped range is reduced by leveraging multi-order entries. In practice this approach usually yields just one entry in the tree if the size and starting address are of the same power-of-2 alignment. Previously we always needed nr_entries = mapping_size / 128MB. Link: https://lists.01.org/pipermail/linux-nvdimm/2016-August/006666.html Link: http://lkml.kernel.org/r/150215410565.39310.13767886055248249438.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams Reported-by: Toshi Kani Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/memremap.c | 52 ++++++++++++++++++++++++++++++++++++++-------------- mm/Kconfig | 1 + 2 files changed, 39 insertions(+), 14 deletions(-) (limited to 'mm') diff --git a/kernel/memremap.c b/kernel/memremap.c index 9afdc434fb49..066e73c2fcc9 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -194,18 +194,41 @@ struct page_map { struct vmem_altmap altmap; }; -static void pgmap_radix_release(struct resource *res) +static unsigned long order_at(struct resource *res, unsigned long pgoff) { - resource_size_t key, align_start, align_size, align_end; + unsigned long phys_pgoff = PHYS_PFN(res->start) + pgoff; + unsigned long nr_pages, mask; - align_start = res->start & ~(SECTION_SIZE - 1); - align_size = ALIGN(resource_size(res), SECTION_SIZE); - align_end = align_start + align_size - 1; + nr_pages = PHYS_PFN(resource_size(res)); + if (nr_pages == pgoff) + return ULONG_MAX; + + /* + * What is the largest aligned power-of-2 range available from + * this resource pgoff to the end of the resource range, + * considering the alignment of the current pgoff? + */ + mask = phys_pgoff | rounddown_pow_of_two(nr_pages - pgoff); + if (!mask) + return ULONG_MAX; + + return find_first_bit(&mask, BITS_PER_LONG); +} + +#define foreach_order_pgoff(res, order, pgoff) \ + for (pgoff = 0, order = order_at((res), pgoff); order < ULONG_MAX; \ + pgoff += 1UL << order, order = order_at((res), pgoff)) + +static void pgmap_radix_release(struct resource *res) +{ + unsigned long pgoff, order; mutex_lock(&pgmap_lock); - for (key = res->start; key <= res->end; key += SECTION_SIZE) - radix_tree_delete(&pgmap_radix, key >> PA_SECTION_SHIFT); + foreach_order_pgoff(res, order, pgoff) + radix_tree_delete(&pgmap_radix, PHYS_PFN(res->start) + pgoff); mutex_unlock(&pgmap_lock); + + synchronize_rcu(); } static unsigned long pfn_first(struct page_map *page_map) @@ -268,7 +291,7 @@ struct dev_pagemap *find_dev_pagemap(resource_size_t phys) WARN_ON_ONCE(!rcu_read_lock_held()); - page_map = radix_tree_lookup(&pgmap_radix, phys >> PA_SECTION_SHIFT); + page_map = radix_tree_lookup(&pgmap_radix, PHYS_PFN(phys)); return page_map ? &page_map->pgmap : NULL; } @@ -293,12 +316,12 @@ struct dev_pagemap *find_dev_pagemap(resource_size_t phys) void *devm_memremap_pages(struct device *dev, struct resource *res, struct percpu_ref *ref, struct vmem_altmap *altmap) { - resource_size_t key, align_start, align_size, align_end; + resource_size_t align_start, align_size, align_end; + unsigned long pfn, pgoff, order; pgprot_t pgprot = PAGE_KERNEL; struct dev_pagemap *pgmap; struct page_map *page_map; int error, nid, is_ram; - unsigned long pfn; align_start = res->start & ~(SECTION_SIZE - 1); align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE) @@ -337,11 +360,12 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, mutex_lock(&pgmap_lock); error = 0; align_end = align_start + align_size - 1; - for (key = align_start; key <= align_end; key += SECTION_SIZE) { + + foreach_order_pgoff(res, order, pgoff) { struct dev_pagemap *dup; rcu_read_lock(); - dup = find_dev_pagemap(key); + dup = find_dev_pagemap(res->start + PFN_PHYS(pgoff)); rcu_read_unlock(); if (dup) { dev_err(dev, "%s: %pr collides with mapping for %s\n", @@ -349,8 +373,8 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, error = -EBUSY; break; } - error = radix_tree_insert(&pgmap_radix, key >> PA_SECTION_SHIFT, - page_map); + error = __radix_tree_insert(&pgmap_radix, + PHYS_PFN(res->start) + pgoff, order, page_map); if (error) { dev_err(dev, "%s: failed: %d\n", __func__, error); break; diff --git a/mm/Kconfig b/mm/Kconfig index 48b1af447fa7..0ded10a22639 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -678,6 +678,7 @@ config ZONE_DEVICE depends on MEMORY_HOTREMOVE depends on SPARSEMEM_VMEMMAP depends on ARCH_HAS_ZONE_DEVICE + select RADIX_TREE_MULTIORDER help Device memory hotplug support allows for establishing pmem, -- cgit v1.2.3 From 749df87bd7bee5a79cef073f5d032ddb2b211de8 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 6 Sep 2017 16:24:16 -0700 Subject: mm/shmem: add hugetlbfs support to memfd_create() This patch came out of discussions in this e-mail thread: http://lkml.kernel.org/r/1499357846-7481-1-git-send-email-mike.kravetz%40oracle.com The Oracle JVM team is developing a new garbage collection model. This new model requires multiple mappings of the same anonymous memory. One straight forward way to accomplish this is with memfd_create. They can use the returned fd to create multiple mappings of the same memory. The JVM today has an option to use (static hugetlb) huge pages. If this option is specified, they would like to use the same garbage collection model requiring multiple mappings to the same memory. Using hugetlbfs, it is possible to explicitly mount a filesystem and specify file paths in order to get an fd that can be used for multiple mappings. However, this introduces additional system admin work and coordination. Ideally they would like to get a hugetlbfs fd without requiring explicit mounting of a filesystem. Today, mmap and shmget can make use of hugetlbfs without explicitly mounting a filesystem. The patch adds this functionality to memfd_create. Add a new flag MFD_HUGETLB to memfd_create() that will specify the file to be created resides in the hugetlbfs filesystem. This is the generic hugetlbfs filesystem not associated with any specific mount point. As with other system calls that request hugetlbfs backed pages, there is the ability to encode huge page size in the flag arguments. hugetlbfs does not support sealing operations, therefore specifying MFD_ALLOW_SEALING with MFD_HUGETLB will result in EINVAL. Of course, the memfd_man page would need updating if this type of functionality moves forward. Link: http://lkml.kernel.org/r/1502149672-7759-2-git-send-email-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Acked-by: Michal Hocko Cc: Hugh Dickins Cc: Andrea Arcangeli Cc: "Kirill A . Shutemov" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/uapi/linux/memfd.h | 24 ++++++++++++++++++++++++ mm/shmem.c | 37 +++++++++++++++++++++++++++++++------ 2 files changed, 55 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/include/uapi/linux/memfd.h b/include/uapi/linux/memfd.h index 534e364bda92..7f3a722dbd72 100644 --- a/include/uapi/linux/memfd.h +++ b/include/uapi/linux/memfd.h @@ -1,8 +1,32 @@ #ifndef _UAPI_LINUX_MEMFD_H #define _UAPI_LINUX_MEMFD_H +#include + /* flags for memfd_create(2) (unsigned int) */ #define MFD_CLOEXEC 0x0001U #define MFD_ALLOW_SEALING 0x0002U +#define MFD_HUGETLB 0x0004U + +/* + * Huge page size encoding when MFD_HUGETLB is specified, and a huge page + * size other than the default is desired. See hugetlb_encode.h. + * All known huge page size encodings are provided here. It is the + * responsibility of the application to know which sizes are supported on + * the running system. See mmap(2) man page for details. + */ +#define MFD_HUGE_SHIFT HUGETLB_FLAG_ENCODE_SHIFT +#define MFD_HUGE_MASK HUGETLB_FLAG_ENCODE_MASK + +#define MFD_HUGE_64KB HUGETLB_FLAG_ENCODE_64KB +#define MFD_HUGE_512KB HUGETLB_FLAG_ENCODE_512KB +#define MFD_HUGE_1MB HUGETLB_FLAG_ENCODE_1MB +#define MFD_HUGE_2MB HUGETLB_FLAG_ENCODE_2MB +#define MFD_HUGE_8MB HUGETLB_FLAG_ENCODE_8MB +#define MFD_HUGE_16MB HUGETLB_FLAG_ENCODE_16MB +#define MFD_HUGE_256MB HUGETLB_FLAG_ENCODE_256MB +#define MFD_HUGE_1GB HUGETLB_FLAG_ENCODE_1GB +#define MFD_HUGE_2GB HUGETLB_FLAG_ENCODE_2GB +#define MFD_HUGE_16GB HUGETLB_FLAG_ENCODE_16GB #endif /* _UAPI_LINUX_MEMFD_H */ diff --git a/mm/shmem.c b/mm/shmem.c index 64bdc91187f7..47179bbe9ee7 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -34,6 +34,7 @@ #include #include #include +#include #include /* for arch/microblaze update_mmu_cache() */ @@ -3652,7 +3653,7 @@ static int shmem_show_options(struct seq_file *seq, struct dentry *root) #define MFD_NAME_PREFIX_LEN (sizeof(MFD_NAME_PREFIX) - 1) #define MFD_NAME_MAX_LEN (NAME_MAX - MFD_NAME_PREFIX_LEN) -#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING) +#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB) SYSCALL_DEFINE2(memfd_create, const char __user *, uname, @@ -3664,8 +3665,18 @@ SYSCALL_DEFINE2(memfd_create, char *name; long len; - if (flags & ~(unsigned int)MFD_ALL_FLAGS) - return -EINVAL; + if (!(flags & MFD_HUGETLB)) { + if (flags & ~(unsigned int)MFD_ALL_FLAGS) + return -EINVAL; + } else { + /* Sealing not supported in hugetlbfs (MFD_HUGETLB) */ + if (flags & MFD_ALLOW_SEALING) + return -EINVAL; + /* Allow huge page size encoding in flags. */ + if (flags & ~(unsigned int)(MFD_ALL_FLAGS | + (MFD_HUGE_MASK << MFD_HUGE_SHIFT))) + return -EINVAL; + } /* length includes terminating zero */ len = strnlen_user(uname, MFD_NAME_MAX_LEN + 1); @@ -3696,16 +3707,30 @@ SYSCALL_DEFINE2(memfd_create, goto err_name; } - file = shmem_file_setup(name, 0, VM_NORESERVE); + if (flags & MFD_HUGETLB) { + struct user_struct *user = NULL; + + file = hugetlb_file_setup(name, 0, VM_NORESERVE, &user, + HUGETLB_ANONHUGE_INODE, + (flags >> MFD_HUGE_SHIFT) & + MFD_HUGE_MASK); + } else + file = shmem_file_setup(name, 0, VM_NORESERVE); if (IS_ERR(file)) { error = PTR_ERR(file); goto err_fd; } - info = SHMEM_I(file_inode(file)); file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE; file->f_flags |= O_RDWR | O_LARGEFILE; - if (flags & MFD_ALLOW_SEALING) + + if (flags & MFD_ALLOW_SEALING) { + /* + * flags check at beginning of function ensures + * this is not a hugetlbfs (MFD_HUGETLB) file. + */ + info = SHMEM_I(file_inode(file)); info->seals &= ~F_SEAL_SEAL; + } fd_install(fd, file); kfree(name); -- cgit v1.2.3 From f113e64121ba9f4791332248b315d9f57ee33a6b Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 6 Sep 2017 16:24:23 -0700 Subject: mm/vmstat.c: fix wrong comment Comment for pagetypeinfo_showblockcount() is mistakenly duplicated from pagetypeinfo_show_free()'s comment. This commit fixes it. Link: http://lkml.kernel.org/r/20170809185816.11244-1-sj38.park@gmail.com Fixes: 467c996c1e19 ("Print out statistics in relation to fragmentation avoidance to /proc/pagetypeinfo") Signed-off-by: SeongJae Park Cc: Michal Hocko Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmstat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/vmstat.c b/mm/vmstat.c index 05de233a6fca..85f3a2e04adc 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1255,7 +1255,7 @@ static void pagetypeinfo_showblockcount_print(struct seq_file *m, seq_putc(m, '\n'); } -/* Print out the free pages at each order for each migratetype */ +/* Print out the number of pageblocks for each migratetype */ static int pagetypeinfo_showblockcount(struct seq_file *m, void *arg) { int mtype; -- cgit v1.2.3 From 894e58c1475a03cd8260be7f28444cc298239432 Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Wed, 6 Sep 2017 16:24:26 -0700 Subject: mm/vmalloc.c: don't reinvent the wheel but use existing llist API Although llist provides proper APIs, they are not used. Make them used. Link: http://lkml.kernel.org/r/1502095374-16112-1-git-send-email-byungchul.park@lge.com Signed-off-by: Byungchul Park Cc: zijun_hu Cc: Michal Hocko Cc: Vlastimil Babka Cc: Joel Fernandes Cc: Andrey Ryabinin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index fa409c9092be..8a43db6284eb 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -49,12 +49,10 @@ static void __vunmap(const void *, int); static void free_work(struct work_struct *w) { struct vfree_deferred *p = container_of(w, struct vfree_deferred, wq); - struct llist_node *llnode = llist_del_all(&p->list); - while (llnode) { - void *p = llnode; - llnode = llist_next(llnode); - __vunmap(p, 1); - } + struct llist_node *t, *llnode; + + llist_for_each_safe(llnode, t, llist_del_all(&p->list)) + __vunmap((void *)llnode, 1); } /*** Page table manipulation functions ***/ -- cgit v1.2.3 From cbc65df240c104bf540af1ad58595bf1eaa5ee10 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Wed, 6 Sep 2017 16:24:29 -0700 Subject: mm, swap: add swap readahead hit statistics Patch series "mm, swap: VMA based swap readahead", v4. The swap readahead is an important mechanism to reduce the swap in latency. Although pure sequential memory access pattern isn't very popular for anonymous memory, the space locality is still considered valid. In the original swap readahead implementation, the consecutive blocks in swap device are readahead based on the global space locality estimation. But the consecutive blocks in swap device just reflect the order of page reclaiming, don't necessarily reflect the access pattern in virtual memory space. And the different tasks in the system may have different access patterns, which makes the global space locality estimation incorrect. In this patchset, when page fault occurs, the virtual pages near the fault address will be readahead instead of the swap slots near the fault swap slot in swap device. This avoid to readahead the unrelated swap slots. At the same time, the swap readahead is changed to work on per-VMA from globally. So that the different access patterns of the different VMAs could be distinguished, and the different readahead policy could be applied accordingly. The original core readahead detection and scaling algorithm is reused, because it is an effect algorithm to detect the space locality. In addition to the swap readahead changes, some new sysfs interface is added to show the efficiency of the readahead algorithm and some other swap statistics. This new implementation will incur more small random read, on SSD, the improved correctness of estimation and readahead target should beat the potential increased overhead, this is also illustrated in the test results below. But on HDD, the overhead may beat the benefit, so the original implementation will be used by default. The test and result is as follow, Common test condition ===================== Test Machine: Xeon E5 v3 (2 sockets, 72 threads, 32G RAM) Swap device: NVMe disk Micro-benchmark with combined access pattern ============================================ vm-scalability, sequential swap test case, 4 processes to eat 50G virtual memory space, repeat the sequential memory writing until 300 seconds. The first round writing will trigger swap out, the following rounds will trigger sequential swap in and out. At the same time, run vm-scalability random swap test case in background, 8 processes to eat 30G virtual memory space, repeat the random memory write until 300 seconds. This will trigger random swap-in in the background. This is a combined workload with sequential and random memory accessing at the same time. The result (for sequential workload) is as follow, Base Optimized ---- --------- throughput 345413 KB/s 414029 KB/s (+19.9%) latency.average 97.14 us 61.06 us (-37.1%) latency.50th 2 us 1 us latency.60th 2 us 1 us latency.70th 98 us 2 us latency.80th 160 us 2 us latency.90th 260 us 217 us latency.95th 346 us 369 us latency.99th 1.34 ms 1.09 ms ra_hit% 52.69% 99.98% The original swap readahead algorithm is confused by the background random access workload, so readahead hit rate is lower. The VMA-base readahead algorithm works much better. Linpack ======= The test memory size is bigger than RAM to trigger swapping. Base Optimized ---- --------- elapsed_time 393.49 s 329.88 s (-16.2%) ra_hit% 86.21% 98.82% The score of base and optimized kernel hasn't visible changes. But the elapsed time reduced and readahead hit rate improved, so the optimized kernel runs better for startup and tear down stages. And the absolute value of readahead hit rate is high, shows that the space locality is still valid in some practical workloads. This patch (of 5): The statistics for total readahead pages and total readahead hits are recorded and exported via the following sysfs interface. /sys/kernel/mm/swap/ra_hits /sys/kernel/mm/swap/ra_total With them, the efficiency of the swap readahead could be measured, so that the swap readahead algorithm and parameters could be tuned accordingly. [akpm@linux-foundation.org: don't display swap stats if CONFIG_SWAP=n] Link: http://lkml.kernel.org/r/20170807054038.1843-2-ying.huang@intel.com Signed-off-by: "Huang, Ying" Cc: Johannes Weiner Cc: Minchan Kim Cc: Rik van Riel Cc: Shaohua Li Cc: Hugh Dickins Cc: Fengguang Wu Cc: Tim Chen Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vm_event_item.h | 4 ++++ mm/swap_state.c | 9 +++++++-- mm/vmstat.c | 4 ++++ 3 files changed, 15 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index e02820fc2861..d77bc35278b0 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -105,6 +105,10 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, VMACACHE_FIND_CALLS, VMACACHE_FIND_HITS, VMACACHE_FULL_FLUSHES, +#endif +#ifdef CONFIG_SWAP + SWAP_RA, + SWAP_RA_HIT, #endif NR_VM_EVENT_ITEMS }; diff --git a/mm/swap_state.c b/mm/swap_state.c index b68c93014f50..d1bdb31cab13 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -305,8 +305,10 @@ struct page * lookup_swap_cache(swp_entry_t entry) if (page && likely(!PageTransCompound(page))) { INC_CACHE_INFO(find_success); - if (TestClearPageReadahead(page)) + if (TestClearPageReadahead(page)) { atomic_inc(&swapin_readahead_hits); + count_vm_event(SWAP_RA_HIT); + } } INC_CACHE_INFO(find_total); @@ -516,8 +518,11 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, gfp_mask, vma, addr, false); if (!page) continue; - if (offset != entry_offset && likely(!PageTransCompound(page))) + if (offset != entry_offset && + likely(!PageTransCompound(page))) { SetPageReadahead(page); + count_vm_event(SWAP_RA); + } put_page(page); } blk_finish_plug(&plug); diff --git a/mm/vmstat.c b/mm/vmstat.c index 85f3a2e04adc..c7e4b8458023 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1098,6 +1098,10 @@ const char * const vmstat_text[] = { "vmacache_find_hits", "vmacache_full_flushes", #endif +#ifdef CONFIG_SWAP + "swap_ra", + "swap_ra_hit", +#endif #endif /* CONFIG_VM_EVENTS_COUNTERS */ }; #endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA */ -- cgit v1.2.3 From c4fa63092f216737b60c789968371d9960a598e5 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Wed, 6 Sep 2017 16:24:33 -0700 Subject: mm, swap: fix swap readahead marking In the original implementation, it is possible that the existing pages in the swap cache (not newly readahead) could be marked as the readahead pages. This will cause the statistics of swap readahead be wrong and influence the swap readahead algorithm too. This is fixed via marking a page as the readahead page only if it is newly allocated and read from the disk. When testing with linpack, after the fixing the swap readahead hit rate increased from ~66% to ~86%. Link: http://lkml.kernel.org/r/20170807054038.1843-3-ying.huang@intel.com Signed-off-by: "Huang, Ying" Cc: Johannes Weiner Cc: Minchan Kim Cc: Rik van Riel Cc: Shaohua Li Cc: Hugh Dickins Cc: Fengguang Wu Cc: Tim Chen Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swap_state.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/swap_state.c b/mm/swap_state.c index d1bdb31cab13..a901afe9da61 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -498,7 +498,7 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, unsigned long start_offset, end_offset; unsigned long mask; struct blk_plug plug; - bool do_poll = true; + bool do_poll = true, page_allocated; mask = swapin_nr_pages(offset) - 1; if (!mask) @@ -514,14 +514,18 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, blk_start_plug(&plug); for (offset = start_offset; offset <= end_offset ; offset++) { /* Ok, do the async read-ahead now */ - page = read_swap_cache_async(swp_entry(swp_type(entry), offset), - gfp_mask, vma, addr, false); + page = __read_swap_cache_async( + swp_entry(swp_type(entry), offset), + gfp_mask, vma, addr, &page_allocated); if (!page) continue; - if (offset != entry_offset && - likely(!PageTransCompound(page))) { - SetPageReadahead(page); - count_vm_event(SWAP_RA); + if (page_allocated) { + swap_readpage(page, false); + if (offset != entry_offset && + likely(!PageTransCompound(page))) { + SetPageReadahead(page); + count_vm_event(SWAP_RA); + } } put_page(page); } -- cgit v1.2.3 From ec560175c0b6fce86994bdf036754d48122c5c87 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Wed, 6 Sep 2017 16:24:36 -0700 Subject: mm, swap: VMA based swap readahead The swap readahead is an important mechanism to reduce the swap in latency. Although pure sequential memory access pattern isn't very popular for anonymous memory, the space locality is still considered valid. In the original swap readahead implementation, the consecutive blocks in swap device are readahead based on the global space locality estimation. But the consecutive blocks in swap device just reflect the order of page reclaiming, don't necessarily reflect the access pattern in virtual memory. And the different tasks in the system may have different access patterns, which makes the global space locality estimation incorrect. In this patch, when page fault occurs, the virtual pages near the fault address will be readahead instead of the swap slots near the fault swap slot in swap device. This avoid to readahead the unrelated swap slots. At the same time, the swap readahead is changed to work on per-VMA from globally. So that the different access patterns of the different VMAs could be distinguished, and the different readahead policy could be applied accordingly. The original core readahead detection and scaling algorithm is reused, because it is an effect algorithm to detect the space locality. The test and result is as follow, Common test condition ===================== Test Machine: Xeon E5 v3 (2 sockets, 72 threads, 32G RAM) Swap device: NVMe disk Micro-benchmark with combined access pattern ============================================ vm-scalability, sequential swap test case, 4 processes to eat 50G virtual memory space, repeat the sequential memory writing until 300 seconds. The first round writing will trigger swap out, the following rounds will trigger sequential swap in and out. At the same time, run vm-scalability random swap test case in background, 8 processes to eat 30G virtual memory space, repeat the random memory write until 300 seconds. This will trigger random swap-in in the background. This is a combined workload with sequential and random memory accessing at the same time. The result (for sequential workload) is as follow, Base Optimized ---- --------- throughput 345413 KB/s 414029 KB/s (+19.9%) latency.average 97.14 us 61.06 us (-37.1%) latency.50th 2 us 1 us latency.60th 2 us 1 us latency.70th 98 us 2 us latency.80th 160 us 2 us latency.90th 260 us 217 us latency.95th 346 us 369 us latency.99th 1.34 ms 1.09 ms ra_hit% 52.69% 99.98% The original swap readahead algorithm is confused by the background random access workload, so readahead hit rate is lower. The VMA-base readahead algorithm works much better. Linpack ======= The test memory size is bigger than RAM to trigger swapping. Base Optimized ---- --------- elapsed_time 393.49 s 329.88 s (-16.2%) ra_hit% 86.21% 98.82% The score of base and optimized kernel hasn't visible changes. But the elapsed time reduced and readahead hit rate improved, so the optimized kernel runs better for startup and tear down stages. And the absolute value of readahead hit rate is high, shows that the space locality is still valid in some practical workloads. Link: http://lkml.kernel.org/r/20170807054038.1843-4-ying.huang@intel.com Signed-off-by: "Huang, Ying" Cc: Johannes Weiner Cc: Minchan Kim Cc: Rik van Riel Cc: Shaohua Li Cc: Hugh Dickins Cc: Fengguang Wu Cc: Tim Chen Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 1 + include/linux/swap.h | 57 ++++++++++++- mm/memory.c | 23 +++-- mm/shmem.c | 2 +- mm/swap_state.c | 215 +++++++++++++++++++++++++++++++++++++++++++---- 5 files changed, 273 insertions(+), 25 deletions(-) (limited to 'mm') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 57378c7cb5f8..f45ad815b7d7 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -335,6 +335,7 @@ struct vm_area_struct { struct file * vm_file; /* File we map to (can be NULL). */ void * vm_private_data; /* was vm_pte (shared mem) */ + atomic_long_t swap_readahead_info; #ifndef CONFIG_MMU struct vm_region *vm_region; /* NOMMU mapping region */ #endif diff --git a/include/linux/swap.h b/include/linux/swap.h index 76f1632eea5a..61d63379e956 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -251,6 +251,25 @@ struct swap_info_struct { struct swap_cluster_list discard_clusters; /* discard clusters list */ }; +#ifdef CONFIG_64BIT +#define SWAP_RA_ORDER_CEILING 5 +#else +/* Avoid stack overflow, because we need to save part of page table */ +#define SWAP_RA_ORDER_CEILING 3 +#define SWAP_RA_PTE_CACHE_SIZE (1 << SWAP_RA_ORDER_CEILING) +#endif + +struct vma_swap_readahead { + unsigned short win; + unsigned short offset; + unsigned short nr_pte; +#ifdef CONFIG_64BIT + pte_t *ptes; +#else + pte_t ptes[SWAP_RA_PTE_CACHE_SIZE]; +#endif +}; + /* linux/mm/workingset.c */ void *workingset_eviction(struct address_space *mapping, struct page *page); bool workingset_refault(void *shadow); @@ -350,6 +369,7 @@ int generic_swapfile_activate(struct swap_info_struct *, struct file *, #define SWAP_ADDRESS_SPACE_SHIFT 14 #define SWAP_ADDRESS_SPACE_PAGES (1 << SWAP_ADDRESS_SPACE_SHIFT) extern struct address_space *swapper_spaces[]; +extern bool swap_vma_readahead; #define swap_address_space(entry) \ (&swapper_spaces[swp_type(entry)][swp_offset(entry) \ >> SWAP_ADDRESS_SPACE_SHIFT]) @@ -362,7 +382,9 @@ extern void __delete_from_swap_cache(struct page *); extern void delete_from_swap_cache(struct page *); extern void free_page_and_swap_cache(struct page *); extern void free_pages_and_swap_cache(struct page **, int); -extern struct page *lookup_swap_cache(swp_entry_t); +extern struct page *lookup_swap_cache(swp_entry_t entry, + struct vm_area_struct *vma, + unsigned long addr); extern struct page *read_swap_cache_async(swp_entry_t, gfp_t, struct vm_area_struct *vma, unsigned long addr, bool do_poll); @@ -372,6 +394,17 @@ extern struct page *__read_swap_cache_async(swp_entry_t, gfp_t, extern struct page *swapin_readahead(swp_entry_t, gfp_t, struct vm_area_struct *vma, unsigned long addr); +extern struct page *swap_readahead_detect(struct vm_fault *vmf, + struct vma_swap_readahead *swap_ra); +extern struct page *do_swap_page_readahead(swp_entry_t fentry, gfp_t gfp_mask, + struct vm_fault *vmf, + struct vma_swap_readahead *swap_ra); + +static inline bool swap_use_vma_readahead(void) +{ + return READ_ONCE(swap_vma_readahead); +} + /* linux/mm/swapfile.c */ extern atomic_long_t nr_swap_pages; extern long total_swap_pages; @@ -466,12 +499,32 @@ static inline struct page *swapin_readahead(swp_entry_t swp, gfp_t gfp_mask, return NULL; } +static inline bool swap_use_vma_readahead(void) +{ + return false; +} + +static inline struct page *swap_readahead_detect( + struct vm_fault *vmf, struct vma_swap_readahead *swap_ra) +{ + return NULL; +} + +static inline struct page *do_swap_page_readahead( + swp_entry_t fentry, gfp_t gfp_mask, + struct vm_fault *vmf, struct vma_swap_readahead *swap_ra) +{ + return NULL; +} + static inline int swap_writepage(struct page *p, struct writeback_control *wbc) { return 0; } -static inline struct page *lookup_swap_cache(swp_entry_t swp) +static inline struct page *lookup_swap_cache(swp_entry_t swp, + struct vm_area_struct *vma, + unsigned long addr) { return NULL; } diff --git a/mm/memory.c b/mm/memory.c index 3dd8bb46391b..e87953775e3c 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2752,16 +2752,23 @@ EXPORT_SYMBOL(unmap_mapping_range); int do_swap_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; - struct page *page, *swapcache; + struct page *page = NULL, *swapcache; struct mem_cgroup *memcg; + struct vma_swap_readahead swap_ra; swp_entry_t entry; pte_t pte; int locked; int exclusive = 0; int ret = 0; + bool vma_readahead = swap_use_vma_readahead(); - if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte)) + if (vma_readahead) + page = swap_readahead_detect(vmf, &swap_ra); + if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte)) { + if (page) + put_page(page); goto out; + } entry = pte_to_swp_entry(vmf->orig_pte); if (unlikely(non_swap_entry(entry))) { @@ -2777,10 +2784,16 @@ int do_swap_page(struct vm_fault *vmf) goto out; } delayacct_set_flag(DELAYACCT_PF_SWAPIN); - page = lookup_swap_cache(entry); + if (!page) + page = lookup_swap_cache(entry, vma_readahead ? vma : NULL, + vmf->address); if (!page) { - page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, vma, - vmf->address); + if (vma_readahead) + page = do_swap_page_readahead(entry, + GFP_HIGHUSER_MOVABLE, vmf, &swap_ra); + else + page = swapin_readahead(entry, + GFP_HIGHUSER_MOVABLE, vma, vmf->address); if (!page) { /* * Back out if somebody else faulted in this pte diff --git a/mm/shmem.c b/mm/shmem.c index 47179bbe9ee7..ace53a582be5 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1650,7 +1650,7 @@ repeat: if (swap.val) { /* Look it up and read it in.. */ - page = lookup_swap_cache(swap); + page = lookup_swap_cache(swap, NULL, 0); if (!page) { /* Or update major stats only when swapin succeeds?? */ if (fault_type) { diff --git a/mm/swap_state.c b/mm/swap_state.c index a901afe9da61..3885fef7bdf5 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -37,6 +37,29 @@ static const struct address_space_operations swap_aops = { struct address_space *swapper_spaces[MAX_SWAPFILES]; static unsigned int nr_swapper_spaces[MAX_SWAPFILES]; +bool swap_vma_readahead = true; + +#define SWAP_RA_MAX_ORDER_DEFAULT 3 + +static int swap_ra_max_order = SWAP_RA_MAX_ORDER_DEFAULT; + +#define SWAP_RA_WIN_SHIFT (PAGE_SHIFT / 2) +#define SWAP_RA_HITS_MASK ((1UL << SWAP_RA_WIN_SHIFT) - 1) +#define SWAP_RA_HITS_MAX SWAP_RA_HITS_MASK +#define SWAP_RA_WIN_MASK (~PAGE_MASK & ~SWAP_RA_HITS_MASK) + +#define SWAP_RA_HITS(v) ((v) & SWAP_RA_HITS_MASK) +#define SWAP_RA_WIN(v) (((v) & SWAP_RA_WIN_MASK) >> SWAP_RA_WIN_SHIFT) +#define SWAP_RA_ADDR(v) ((v) & PAGE_MASK) + +#define SWAP_RA_VAL(addr, win, hits) \ + (((addr) & PAGE_MASK) | \ + (((win) << SWAP_RA_WIN_SHIFT) & SWAP_RA_WIN_MASK) | \ + ((hits) & SWAP_RA_HITS_MASK)) + +/* Initial readahead hits is 4 to start up with a small window */ +#define GET_SWAP_RA_VAL(vma) \ + (atomic_long_read(&(vma)->swap_readahead_info) ? : 4) #define INC_CACHE_INFO(x) do { swap_cache_info.x++; } while (0) #define ADD_CACHE_INFO(x, nr) do { swap_cache_info.x += (nr); } while (0) @@ -297,21 +320,36 @@ void free_pages_and_swap_cache(struct page **pages, int nr) * lock getting page table operations atomic even if we drop the page * lock before returning. */ -struct page * lookup_swap_cache(swp_entry_t entry) +struct page *lookup_swap_cache(swp_entry_t entry, struct vm_area_struct *vma, + unsigned long addr) { struct page *page; + unsigned long ra_info; + int win, hits, readahead; page = find_get_page(swap_address_space(entry), swp_offset(entry)); - if (page && likely(!PageTransCompound(page))) { + INC_CACHE_INFO(find_total); + if (page) { INC_CACHE_INFO(find_success); - if (TestClearPageReadahead(page)) { - atomic_inc(&swapin_readahead_hits); + if (unlikely(PageTransCompound(page))) + return page; + readahead = TestClearPageReadahead(page); + if (vma) { + ra_info = GET_SWAP_RA_VAL(vma); + win = SWAP_RA_WIN(ra_info); + hits = SWAP_RA_HITS(ra_info); + if (readahead) + hits = min_t(int, hits + 1, SWAP_RA_HITS_MAX); + atomic_long_set(&vma->swap_readahead_info, + SWAP_RA_VAL(addr, win, hits)); + } + if (readahead) { count_vm_event(SWAP_RA_HIT); + if (!vma) + atomic_inc(&swapin_readahead_hits); } } - - INC_CACHE_INFO(find_total); return page; } @@ -426,22 +464,20 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, return retpage; } -static unsigned long swapin_nr_pages(unsigned long offset) +static unsigned int __swapin_nr_pages(unsigned long prev_offset, + unsigned long offset, + int hits, + int max_pages, + int prev_win) { - static unsigned long prev_offset; - unsigned int pages, max_pages, last_ra; - static atomic_t last_readahead_pages; - - max_pages = 1 << READ_ONCE(page_cluster); - if (max_pages <= 1) - return 1; + unsigned int pages, last_ra; /* * This heuristic has been found to work well on both sequential and * random loads, swapping to hard disk or to SSD: please don't ask * what the "+ 2" means, it just happens to work well, that's all. */ - pages = atomic_xchg(&swapin_readahead_hits, 0) + 2; + pages = hits + 2; if (pages == 2) { /* * We can have no readahead hits to judge by: but must not get @@ -450,7 +486,6 @@ static unsigned long swapin_nr_pages(unsigned long offset) */ if (offset != prev_offset + 1 && offset != prev_offset - 1) pages = 1; - prev_offset = offset; } else { unsigned int roundup = 4; while (roundup < pages) @@ -462,9 +497,28 @@ static unsigned long swapin_nr_pages(unsigned long offset) pages = max_pages; /* Don't shrink readahead too fast */ - last_ra = atomic_read(&last_readahead_pages) / 2; + last_ra = prev_win / 2; if (pages < last_ra) pages = last_ra; + + return pages; +} + +static unsigned long swapin_nr_pages(unsigned long offset) +{ + static unsigned long prev_offset; + unsigned int hits, pages, max_pages; + static atomic_t last_readahead_pages; + + max_pages = 1 << READ_ONCE(page_cluster); + if (max_pages <= 1) + return 1; + + hits = atomic_xchg(&swapin_readahead_hits, 0); + pages = __swapin_nr_pages(prev_offset, offset, hits, max_pages, + atomic_read(&last_readahead_pages)); + if (!hits) + prev_offset = offset; atomic_set(&last_readahead_pages, pages); return pages; @@ -570,3 +624,130 @@ void exit_swap_address_space(unsigned int type) synchronize_rcu(); kvfree(spaces); } + +static inline void swap_ra_clamp_pfn(struct vm_area_struct *vma, + unsigned long faddr, + unsigned long lpfn, + unsigned long rpfn, + unsigned long *start, + unsigned long *end) +{ + *start = max3(lpfn, PFN_DOWN(vma->vm_start), + PFN_DOWN(faddr & PMD_MASK)); + *end = min3(rpfn, PFN_DOWN(vma->vm_end), + PFN_DOWN((faddr & PMD_MASK) + PMD_SIZE)); +} + +struct page *swap_readahead_detect(struct vm_fault *vmf, + struct vma_swap_readahead *swap_ra) +{ + struct vm_area_struct *vma = vmf->vma; + unsigned long swap_ra_info; + struct page *page; + swp_entry_t entry; + unsigned long faddr, pfn, fpfn; + unsigned long start, end; + pte_t *pte; + unsigned int max_win, hits, prev_win, win, left; +#ifndef CONFIG_64BIT + pte_t *tpte; +#endif + + faddr = vmf->address; + entry = pte_to_swp_entry(vmf->orig_pte); + if ((unlikely(non_swap_entry(entry)))) + return NULL; + page = lookup_swap_cache(entry, vma, faddr); + if (page) + return page; + + max_win = 1 << READ_ONCE(swap_ra_max_order); + if (max_win == 1) { + swap_ra->win = 1; + return NULL; + } + + fpfn = PFN_DOWN(faddr); + swap_ra_info = GET_SWAP_RA_VAL(vma); + pfn = PFN_DOWN(SWAP_RA_ADDR(swap_ra_info)); + prev_win = SWAP_RA_WIN(swap_ra_info); + hits = SWAP_RA_HITS(swap_ra_info); + swap_ra->win = win = __swapin_nr_pages(pfn, fpfn, hits, + max_win, prev_win); + atomic_long_set(&vma->swap_readahead_info, + SWAP_RA_VAL(faddr, win, 0)); + + if (win == 1) + return NULL; + + /* Copy the PTEs because the page table may be unmapped */ + if (fpfn == pfn + 1) + swap_ra_clamp_pfn(vma, faddr, fpfn, fpfn + win, &start, &end); + else if (pfn == fpfn + 1) + swap_ra_clamp_pfn(vma, faddr, fpfn - win + 1, fpfn + 1, + &start, &end); + else { + left = (win - 1) / 2; + swap_ra_clamp_pfn(vma, faddr, fpfn - left, fpfn + win - left, + &start, &end); + } + swap_ra->nr_pte = end - start; + swap_ra->offset = fpfn - start; + pte = vmf->pte - swap_ra->offset; +#ifdef CONFIG_64BIT + swap_ra->ptes = pte; +#else + tpte = swap_ra->ptes; + for (pfn = start; pfn != end; pfn++) + *tpte++ = *pte++; +#endif + + return NULL; +} + +struct page *do_swap_page_readahead(swp_entry_t fentry, gfp_t gfp_mask, + struct vm_fault *vmf, + struct vma_swap_readahead *swap_ra) +{ + struct blk_plug plug; + struct vm_area_struct *vma = vmf->vma; + struct page *page; + pte_t *pte, pentry; + swp_entry_t entry; + unsigned int i; + bool page_allocated; + + if (swap_ra->win == 1) + goto skip; + + blk_start_plug(&plug); + for (i = 0, pte = swap_ra->ptes; i < swap_ra->nr_pte; + i++, pte++) { + pentry = *pte; + if (pte_none(pentry)) + continue; + if (pte_present(pentry)) + continue; + entry = pte_to_swp_entry(pentry); + if (unlikely(non_swap_entry(entry))) + continue; + page = __read_swap_cache_async(entry, gfp_mask, vma, + vmf->address, &page_allocated); + if (!page) + continue; + if (page_allocated) { + swap_readpage(page, false); + if (i != swap_ra->offset && + likely(!PageTransCompound(page))) { + SetPageReadahead(page); + count_vm_event(SWAP_RA); + } + } + put_page(page); + } + blk_finish_plug(&plug); + lru_add_drain(); +skip: + return read_swap_cache_async(fentry, gfp_mask, vma, vmf->address, + swap_ra->win == 1); +} -- cgit v1.2.3 From d9bfcfdc41e8e5d80f7591f95a09ccce7cb8ad05 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Wed, 6 Sep 2017 16:24:40 -0700 Subject: mm, swap: add sysfs interface for VMA based swap readahead The sysfs interface to control the VMA based swap readahead is added as follow, /sys/kernel/mm/swap/vma_ra_enabled Enable the VMA based swap readahead algorithm, or use the original global swap readahead algorithm. /sys/kernel/mm/swap/vma_ra_max_order Set the max order of the readahead window size for the VMA based swap readahead algorithm. The corresponding ABI documentation is added too. Link: http://lkml.kernel.org/r/20170807054038.1843-5-ying.huang@intel.com Signed-off-by: "Huang, Ying" Cc: Johannes Weiner Cc: Minchan Kim Cc: Rik van Riel Cc: Shaohua Li Cc: Hugh Dickins Cc: Fengguang Wu Cc: Tim Chen Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/ABI/testing/sysfs-kernel-mm-swap | 26 +++++++++ mm/swap_state.c | 80 ++++++++++++++++++++++++++ 2 files changed, 106 insertions(+) create mode 100644 Documentation/ABI/testing/sysfs-kernel-mm-swap (limited to 'mm') diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-swap b/Documentation/ABI/testing/sysfs-kernel-mm-swap new file mode 100644 index 000000000000..587db52084c7 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-kernel-mm-swap @@ -0,0 +1,26 @@ +What: /sys/kernel/mm/swap/ +Date: August 2017 +Contact: Linux memory management mailing list +Description: Interface for swapping + +What: /sys/kernel/mm/swap/vma_ra_enabled +Date: August 2017 +Contact: Linux memory management mailing list +Description: Enable/disable VMA based swap readahead. + + If set to true, the VMA based swap readahead algorithm + will be used for swappable anonymous pages mapped in a + VMA, and the global swap readahead algorithm will be + still used for tmpfs etc. other users. If set to + false, the global swap readahead algorithm will be + used for all swappable pages. + +What: /sys/kernel/mm/swap/vma_ra_max_order +Date: August 2017 +Contact: Linux memory management mailing list +Description: The max readahead size in order for VMA based swap readahead + + VMA based swap readahead algorithm will readahead at + most 1 << max_order pages for each readahead. The + real readahead size for each readahead will be scaled + according to the estimation algorithm. diff --git a/mm/swap_state.c b/mm/swap_state.c index 3885fef7bdf5..71ce2d1ccbf7 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -751,3 +751,83 @@ skip: return read_swap_cache_async(fentry, gfp_mask, vma, vmf->address, swap_ra->win == 1); } + +#ifdef CONFIG_SYSFS +static ssize_t vma_ra_enabled_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%s\n", swap_vma_readahead ? "true" : "false"); +} +static ssize_t vma_ra_enabled_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + if (!strncmp(buf, "true", 4) || !strncmp(buf, "1", 1)) + swap_vma_readahead = true; + else if (!strncmp(buf, "false", 5) || !strncmp(buf, "0", 1)) + swap_vma_readahead = false; + else + return -EINVAL; + + return count; +} +static struct kobj_attribute vma_ra_enabled_attr = + __ATTR(vma_ra_enabled, 0644, vma_ra_enabled_show, + vma_ra_enabled_store); + +static ssize_t vma_ra_max_order_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%d\n", swap_ra_max_order); +} +static ssize_t vma_ra_max_order_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int err, v; + + err = kstrtoint(buf, 10, &v); + if (err || v > SWAP_RA_ORDER_CEILING || v <= 0) + return -EINVAL; + + swap_ra_max_order = v; + + return count; +} +static struct kobj_attribute vma_ra_max_order_attr = + __ATTR(vma_ra_max_order, 0644, vma_ra_max_order_show, + vma_ra_max_order_store); + +static struct attribute *swap_attrs[] = { + &vma_ra_enabled_attr.attr, + &vma_ra_max_order_attr.attr, + NULL, +}; + +static struct attribute_group swap_attr_group = { + .attrs = swap_attrs, +}; + +static int __init swap_init_sysfs(void) +{ + int err; + struct kobject *swap_kobj; + + swap_kobj = kobject_create_and_add("swap", mm_kobj); + if (!swap_kobj) { + pr_err("failed to create swap kobject\n"); + return -ENOMEM; + } + err = sysfs_create_group(swap_kobj, &swap_attr_group); + if (err) { + pr_err("failed to register swap group\n"); + goto delete_obj; + } + return 0; + +delete_obj: + kobject_put(swap_kobj); + return err; +} +subsys_initcall(swap_init_sysfs); +#endif -- cgit v1.2.3 From 81a0298bdfab0203d360df7c9bf690d1d457f999 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Wed, 6 Sep 2017 16:24:43 -0700 Subject: mm, swap: don't use VMA based swap readahead if HDD is used as swap VMA based swap readahead will readahead the virtual pages that is continuous in the virtual address space. While the original swap readahead will readahead the swap slots that is continuous in the swap device. Although VMA based swap readahead is more correct for the swap slots to be readahead, it will trigger more small random readings, which may cause the performance of HDD (hard disk) to degrade heavily, and may finally exceed the benefit. To avoid the issue, in this patch, if the HDD is used as swap, the VMA based swap readahead will be disabled, and the original swap readahead will be used instead. Link: http://lkml.kernel.org/r/20170807054038.1843-6-ying.huang@intel.com Signed-off-by: "Huang, Ying" Cc: Johannes Weiner Cc: Minchan Kim Cc: Rik van Riel Cc: Shaohua Li Cc: Hugh Dickins Cc: Fengguang Wu Cc: Tim Chen Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 11 ++++++----- mm/swapfile.c | 8 +++++++- 2 files changed, 13 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/include/linux/swap.h b/include/linux/swap.h index 61d63379e956..9c4ae6f14eea 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -400,16 +400,17 @@ extern struct page *do_swap_page_readahead(swp_entry_t fentry, gfp_t gfp_mask, struct vm_fault *vmf, struct vma_swap_readahead *swap_ra); -static inline bool swap_use_vma_readahead(void) -{ - return READ_ONCE(swap_vma_readahead); -} - /* linux/mm/swapfile.c */ extern atomic_long_t nr_swap_pages; extern long total_swap_pages; +extern atomic_t nr_rotate_swap; extern bool has_usable_swap(void); +static inline bool swap_use_vma_readahead(void) +{ + return READ_ONCE(swap_vma_readahead) && !atomic_read(&nr_rotate_swap); +} + /* Swap 50% full? Release swapcache more aggressively.. */ static inline bool vm_swap_full(void) { diff --git a/mm/swapfile.c b/mm/swapfile.c index 42eff9e4e972..4f8b3e08a547 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -96,6 +96,8 @@ static DECLARE_WAIT_QUEUE_HEAD(proc_poll_wait); /* Activity counter to indicate that a swapon or swapoff has occurred */ static atomic_t proc_poll_event = ATOMIC_INIT(0); +atomic_t nr_rotate_swap = ATOMIC_INIT(0); + static inline unsigned char swap_count(unsigned char ent) { return ent & ~SWAP_HAS_CACHE; /* may include SWAP_HAS_CONT flag */ @@ -2569,6 +2571,9 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) if (p->flags & SWP_CONTINUED) free_swap_count_continuations(p); + if (!p->bdev || !blk_queue_nonrot(bdev_get_queue(p->bdev))) + atomic_dec(&nr_rotate_swap); + mutex_lock(&swapon_mutex); spin_lock(&swap_lock); spin_lock(&p->lock); @@ -3145,7 +3150,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) cluster = per_cpu_ptr(p->percpu_cluster, cpu); cluster_set_null(&cluster->index); } - } + } else + atomic_inc(&nr_rotate_swap); error = swap_cgroup_swapon(p->type, maxpages); if (error) -- cgit v1.2.3 From d30561c56f4114f7d6595a40498ba364ffa6e28e Mon Sep 17 00:00:00 2001 From: Vitaly Wool Date: Wed, 6 Sep 2017 16:24:47 -0700 Subject: z3fold: use per-cpu unbuddied lists It's been noted that z3fold doesn't scale well when it's run in a large number of threads on many cores, which can be easily reproduced with fio 'randrw' test with --numjobs=32. E.g. the result for 1 cluster (4 cores) is: Run status group 0 (all jobs): READ: io=244785MB, aggrb=496883KB/s, minb=15527KB/s, ... WRITE: io=246735MB, aggrb=500841KB/s, minb=15651KB/s, ... While for 8 cores (2 clusters) the result is: Run status group 0 (all jobs): READ: io=244785MB, aggrb=265942KB/s, minb=8310KB/s, ... WRITE: io=246735MB, aggrb=268060KB/s, minb=8376KB/s, ... The bottleneck here is the pool lock which many threads become waiting upon. To reduce that spin lock contention, z3fold can operate only on the lists local to the current CPU whenever possible. Due to the nature of z3fold unbuddied list handling (it only takes the first entry off the list on a hot path), if the z3fold pool is big enough and balanced well enough, limiting search to only local unbuddied list doesn't lead to a significant compression ratio degrade (2.57x vs 2.65x in our measurements). This patch also introduces two worker threads: one for async in-page object layout optimization and one for releasing freed pages. This is done to speed up z3fold_free() which is often on a hot path. The fio results for 8-core case are now the following: Run status group 0 (all jobs): READ: io=244785MB, aggrb=1568.3MB/s, minb=50182KB/s, ... WRITE: io=246735MB, aggrb=1580.8MB/s, minb=50582KB/s, ... So we're in for almost 6x performance increase. Link: http://lkml.kernel.org/r/20170806181443.f9b65018f8bde25ef990f9e8@gmail.com Signed-off-by: Vitaly Wool Cc: Dan Streetman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/z3fold.c | 479 +++++++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 344 insertions(+), 135 deletions(-) (limited to 'mm') diff --git a/mm/z3fold.c b/mm/z3fold.c index 54f63c4a809a..486550df32be 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -23,10 +23,13 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include +#include #include #include #include +#include #include +#include #include #include #include @@ -48,11 +51,15 @@ enum buddy { }; /* - * struct z3fold_header - z3fold page metadata occupying the first chunk of each + * struct z3fold_header - z3fold page metadata occupying first chunks of each * z3fold page, except for HEADLESS pages - * @buddy: links the z3fold page into the relevant list in the pool + * @buddy: links the z3fold page into the relevant list in the + * pool * @page_lock: per-page lock - * @refcount: reference cound for the z3fold page + * @refcount: reference count for the z3fold page + * @work: work_struct for page layout optimization + * @pool: pointer to the pool which this page belongs to + * @cpu: CPU which this page "belongs" to * @first_chunks: the size of the first buddy in chunks, 0 if free * @middle_chunks: the size of the middle buddy in chunks, 0 if free * @last_chunks: the size of the last buddy in chunks, 0 if free @@ -62,6 +69,9 @@ struct z3fold_header { struct list_head buddy; spinlock_t page_lock; struct kref refcount; + struct work_struct work; + struct z3fold_pool *pool; + short cpu; unsigned short first_chunks; unsigned short middle_chunks; unsigned short last_chunks; @@ -92,28 +102,39 @@ struct z3fold_header { /** * struct z3fold_pool - stores metadata for each z3fold pool - * @lock: protects all pool fields and first|last_chunk fields of any - * z3fold page in the pool - * @unbuddied: array of lists tracking z3fold pages that contain 2- buddies; - * the lists each z3fold page is added to depends on the size of - * its free region. + * @name: pool name + * @lock: protects pool unbuddied/lru lists + * @stale_lock: protects pool stale page list + * @unbuddied: per-cpu array of lists tracking z3fold pages that contain 2- + * buddies; the list each z3fold page is added to depends on + * the size of its free region. * @lru: list tracking the z3fold pages in LRU order by most recently * added buddy. + * @stale: list of pages marked for freeing * @pages_nr: number of z3fold pages in the pool. * @ops: pointer to a structure of user defined operations specified at * pool creation time. + * @compact_wq: workqueue for page layout background optimization + * @release_wq: workqueue for safe page release + * @work: work_struct for safe page release * * This structure is allocated at pool creation time and maintains metadata * pertaining to a particular z3fold pool. */ struct z3fold_pool { + const char *name; spinlock_t lock; - struct list_head unbuddied[NCHUNKS]; + spinlock_t stale_lock; + struct list_head *unbuddied; struct list_head lru; + struct list_head stale; atomic64_t pages_nr; const struct z3fold_ops *ops; struct zpool *zpool; const struct zpool_ops *zpool_ops; + struct workqueue_struct *compact_wq; + struct workqueue_struct *release_wq; + struct work_struct work; }; /* @@ -122,9 +143,10 @@ struct z3fold_pool { enum z3fold_page_flags { PAGE_HEADLESS = 0, MIDDLE_CHUNK_MAPPED, + NEEDS_COMPACTING, + PAGE_STALE }; - /***************** * Helpers *****************/ @@ -138,14 +160,19 @@ static int size_to_chunks(size_t size) #define for_each_unbuddied_list(_iter, _begin) \ for ((_iter) = (_begin); (_iter) < NCHUNKS; (_iter)++) +static void compact_page_work(struct work_struct *w); + /* Initializes the z3fold header of a newly allocated z3fold page */ -static struct z3fold_header *init_z3fold_page(struct page *page) +static struct z3fold_header *init_z3fold_page(struct page *page, + struct z3fold_pool *pool) { struct z3fold_header *zhdr = page_address(page); INIT_LIST_HEAD(&page->lru); clear_bit(PAGE_HEADLESS, &page->private); clear_bit(MIDDLE_CHUNK_MAPPED, &page->private); + clear_bit(NEEDS_COMPACTING, &page->private); + clear_bit(PAGE_STALE, &page->private); spin_lock_init(&zhdr->page_lock); kref_init(&zhdr->refcount); @@ -154,7 +181,10 @@ static struct z3fold_header *init_z3fold_page(struct page *page) zhdr->last_chunks = 0; zhdr->first_num = 0; zhdr->start_middle = 0; + zhdr->cpu = -1; + zhdr->pool = pool; INIT_LIST_HEAD(&zhdr->buddy); + INIT_WORK(&zhdr->work, compact_page_work); return zhdr; } @@ -164,21 +194,6 @@ static void free_z3fold_page(struct page *page) __free_page(page); } -static void release_z3fold_page(struct kref *ref) -{ - struct z3fold_header *zhdr; - struct page *page; - - zhdr = container_of(ref, struct z3fold_header, refcount); - page = virt_to_page(zhdr); - - if (!list_empty(&zhdr->buddy)) - list_del(&zhdr->buddy); - if (!list_empty(&page->lru)) - list_del(&page->lru); - free_z3fold_page(page); -} - /* Lock a z3fold page */ static inline void z3fold_page_lock(struct z3fold_header *zhdr) { @@ -228,6 +243,76 @@ static enum buddy handle_to_buddy(unsigned long handle) return (handle - zhdr->first_num) & BUDDY_MASK; } +static void __release_z3fold_page(struct z3fold_header *zhdr, bool locked) +{ + struct page *page = virt_to_page(zhdr); + struct z3fold_pool *pool = zhdr->pool; + + WARN_ON(!list_empty(&zhdr->buddy)); + set_bit(PAGE_STALE, &page->private); + spin_lock(&pool->lock); + if (!list_empty(&page->lru)) + list_del(&page->lru); + spin_unlock(&pool->lock); + if (locked) + z3fold_page_unlock(zhdr); + spin_lock(&pool->stale_lock); + list_add(&zhdr->buddy, &pool->stale); + queue_work(pool->release_wq, &pool->work); + spin_unlock(&pool->stale_lock); +} + +static void __attribute__((__unused__)) + release_z3fold_page(struct kref *ref) +{ + struct z3fold_header *zhdr = container_of(ref, struct z3fold_header, + refcount); + __release_z3fold_page(zhdr, false); +} + +static void release_z3fold_page_locked(struct kref *ref) +{ + struct z3fold_header *zhdr = container_of(ref, struct z3fold_header, + refcount); + WARN_ON(z3fold_page_trylock(zhdr)); + __release_z3fold_page(zhdr, true); +} + +static void release_z3fold_page_locked_list(struct kref *ref) +{ + struct z3fold_header *zhdr = container_of(ref, struct z3fold_header, + refcount); + spin_lock(&zhdr->pool->lock); + list_del_init(&zhdr->buddy); + spin_unlock(&zhdr->pool->lock); + + WARN_ON(z3fold_page_trylock(zhdr)); + __release_z3fold_page(zhdr, true); +} + +static void free_pages_work(struct work_struct *w) +{ + struct z3fold_pool *pool = container_of(w, struct z3fold_pool, work); + + spin_lock(&pool->stale_lock); + while (!list_empty(&pool->stale)) { + struct z3fold_header *zhdr = list_first_entry(&pool->stale, + struct z3fold_header, buddy); + struct page *page = virt_to_page(zhdr); + + list_del(&zhdr->buddy); + if (WARN_ON(!test_bit(PAGE_STALE, &page->private))) + continue; + clear_bit(NEEDS_COMPACTING, &page->private); + spin_unlock(&pool->stale_lock); + cancel_work_sync(&zhdr->work); + free_z3fold_page(page); + cond_resched(); + spin_lock(&pool->stale_lock); + } + spin_unlock(&pool->stale_lock); +} + /* * Returns the number of free chunks in a z3fold page. * NB: can't be used with HEADLESS pages. @@ -252,46 +337,6 @@ static int num_free_chunks(struct z3fold_header *zhdr) return nfree; } -/***************** - * API Functions -*****************/ -/** - * z3fold_create_pool() - create a new z3fold pool - * @gfp: gfp flags when allocating the z3fold pool structure - * @ops: user-defined operations for the z3fold pool - * - * Return: pointer to the new z3fold pool or NULL if the metadata allocation - * failed. - */ -static struct z3fold_pool *z3fold_create_pool(gfp_t gfp, - const struct z3fold_ops *ops) -{ - struct z3fold_pool *pool; - int i; - - pool = kzalloc(sizeof(struct z3fold_pool), gfp); - if (!pool) - return NULL; - spin_lock_init(&pool->lock); - for_each_unbuddied_list(i, 0) - INIT_LIST_HEAD(&pool->unbuddied[i]); - INIT_LIST_HEAD(&pool->lru); - atomic64_set(&pool->pages_nr, 0); - pool->ops = ops; - return pool; -} - -/** - * z3fold_destroy_pool() - destroys an existing z3fold pool - * @pool: the z3fold pool to be destroyed - * - * The pool should be emptied before this function is called. - */ -static void z3fold_destroy_pool(struct z3fold_pool *pool) -{ - kfree(pool); -} - static inline void *mchunk_memmove(struct z3fold_header *zhdr, unsigned short dst_chunk) { @@ -347,6 +392,117 @@ static int z3fold_compact_page(struct z3fold_header *zhdr) return 0; } +static void do_compact_page(struct z3fold_header *zhdr, bool locked) +{ + struct z3fold_pool *pool = zhdr->pool; + struct page *page; + struct list_head *unbuddied; + int fchunks; + + page = virt_to_page(zhdr); + if (locked) + WARN_ON(z3fold_page_trylock(zhdr)); + else + z3fold_page_lock(zhdr); + if (test_bit(PAGE_STALE, &page->private) || + !test_and_clear_bit(NEEDS_COMPACTING, &page->private)) { + z3fold_page_unlock(zhdr); + return; + } + spin_lock(&pool->lock); + list_del_init(&zhdr->buddy); + spin_unlock(&pool->lock); + + z3fold_compact_page(zhdr); + unbuddied = get_cpu_ptr(pool->unbuddied); + fchunks = num_free_chunks(zhdr); + if (fchunks < NCHUNKS && + (!zhdr->first_chunks || !zhdr->middle_chunks || + !zhdr->last_chunks)) { + /* the page's not completely free and it's unbuddied */ + spin_lock(&pool->lock); + list_add(&zhdr->buddy, &unbuddied[fchunks]); + spin_unlock(&pool->lock); + zhdr->cpu = smp_processor_id(); + } + put_cpu_ptr(pool->unbuddied); + z3fold_page_unlock(zhdr); +} + +static void compact_page_work(struct work_struct *w) +{ + struct z3fold_header *zhdr = container_of(w, struct z3fold_header, + work); + + do_compact_page(zhdr, false); +} + + +/* + * API Functions + */ + +/** + * z3fold_create_pool() - create a new z3fold pool + * @name: pool name + * @gfp: gfp flags when allocating the z3fold pool structure + * @ops: user-defined operations for the z3fold pool + * + * Return: pointer to the new z3fold pool or NULL if the metadata allocation + * failed. + */ +static struct z3fold_pool *z3fold_create_pool(const char *name, gfp_t gfp, + const struct z3fold_ops *ops) +{ + struct z3fold_pool *pool = NULL; + int i, cpu; + + pool = kzalloc(sizeof(struct z3fold_pool), gfp); + if (!pool) + goto out; + spin_lock_init(&pool->lock); + spin_lock_init(&pool->stale_lock); + pool->unbuddied = __alloc_percpu(sizeof(struct list_head)*NCHUNKS, 2); + for_each_possible_cpu(cpu) { + struct list_head *unbuddied = + per_cpu_ptr(pool->unbuddied, cpu); + for_each_unbuddied_list(i, 0) + INIT_LIST_HEAD(&unbuddied[i]); + } + INIT_LIST_HEAD(&pool->lru); + INIT_LIST_HEAD(&pool->stale); + atomic64_set(&pool->pages_nr, 0); + pool->name = name; + pool->compact_wq = create_singlethread_workqueue(pool->name); + if (!pool->compact_wq) + goto out; + pool->release_wq = create_singlethread_workqueue(pool->name); + if (!pool->release_wq) + goto out_wq; + INIT_WORK(&pool->work, free_pages_work); + pool->ops = ops; + return pool; + +out_wq: + destroy_workqueue(pool->compact_wq); +out: + kfree(pool); + return NULL; +} + +/** + * z3fold_destroy_pool() - destroys an existing z3fold pool + * @pool: the z3fold pool to be destroyed + * + * The pool should be emptied before this function is called. + */ +static void z3fold_destroy_pool(struct z3fold_pool *pool) +{ + destroy_workqueue(pool->release_wq); + destroy_workqueue(pool->compact_wq); + kfree(pool); +} + /** * z3fold_alloc() - allocates a region of a given size * @pool: z3fold pool from which to allocate @@ -371,8 +527,9 @@ static int z3fold_alloc(struct z3fold_pool *pool, size_t size, gfp_t gfp, { int chunks = 0, i, freechunks; struct z3fold_header *zhdr = NULL; + struct page *page = NULL; enum buddy bud; - struct page *page; + bool can_sleep = (gfp & __GFP_RECLAIM) == __GFP_RECLAIM; if (!size || (gfp & __GFP_HIGHMEM)) return -EINVAL; @@ -383,23 +540,57 @@ static int z3fold_alloc(struct z3fold_pool *pool, size_t size, gfp_t gfp, if (size > PAGE_SIZE - ZHDR_SIZE_ALIGNED - CHUNK_SIZE) bud = HEADLESS; else { + struct list_head *unbuddied; chunks = size_to_chunks(size); +lookup: /* First, try to find an unbuddied z3fold page. */ - zhdr = NULL; + unbuddied = get_cpu_ptr(pool->unbuddied); for_each_unbuddied_list(i, chunks) { - spin_lock(&pool->lock); - zhdr = list_first_entry_or_null(&pool->unbuddied[i], + struct list_head *l = &unbuddied[i]; + + zhdr = list_first_entry_or_null(READ_ONCE(l), struct z3fold_header, buddy); - if (!zhdr || !z3fold_page_trylock(zhdr)) { - spin_unlock(&pool->lock); + + if (!zhdr) continue; + + /* Re-check under lock. */ + spin_lock(&pool->lock); + l = &unbuddied[i]; + if (unlikely(zhdr != list_first_entry(READ_ONCE(l), + struct z3fold_header, buddy)) || + !z3fold_page_trylock(zhdr)) { + spin_unlock(&pool->lock); + put_cpu_ptr(pool->unbuddied); + goto lookup; } - kref_get(&zhdr->refcount); list_del_init(&zhdr->buddy); + zhdr->cpu = -1; spin_unlock(&pool->lock); page = virt_to_page(zhdr); + if (test_bit(NEEDS_COMPACTING, &page->private)) { + z3fold_page_unlock(zhdr); + zhdr = NULL; + put_cpu_ptr(pool->unbuddied); + if (can_sleep) + cond_resched(); + goto lookup; + } + + /* + * this page could not be removed from its unbuddied + * list while pool lock was held, and then we've taken + * page lock so kref_put could not be called before + * we got here, so it's safe to just call kref_get() + */ + kref_get(&zhdr->refcount); + break; + } + put_cpu_ptr(pool->unbuddied); + + if (zhdr) { if (zhdr->first_chunks == 0) { if (zhdr->middle_chunks != 0 && chunks >= zhdr->start_middle) @@ -411,32 +602,49 @@ static int z3fold_alloc(struct z3fold_pool *pool, size_t size, gfp_t gfp, else if (zhdr->middle_chunks == 0) bud = MIDDLE; else { - z3fold_page_unlock(zhdr); - spin_lock(&pool->lock); if (kref_put(&zhdr->refcount, - release_z3fold_page)) + release_z3fold_page_locked)) atomic64_dec(&pool->pages_nr); - spin_unlock(&pool->lock); + else + z3fold_page_unlock(zhdr); pr_err("No free chunks in unbuddied\n"); WARN_ON(1); - continue; + goto lookup; } goto found; } bud = FIRST; } - /* Couldn't find unbuddied z3fold page, create new one */ - page = alloc_page(gfp); + spin_lock(&pool->stale_lock); + zhdr = list_first_entry_or_null(&pool->stale, + struct z3fold_header, buddy); + /* + * Before allocating a page, let's see if we can take one from the + * stale pages list. cancel_work_sync() can sleep so we must make + * sure it won't be called in case we're in atomic context. + */ + if (zhdr && (can_sleep || !work_pending(&zhdr->work) || + !unlikely(work_busy(&zhdr->work)))) { + list_del(&zhdr->buddy); + clear_bit(NEEDS_COMPACTING, &page->private); + spin_unlock(&pool->stale_lock); + if (can_sleep) + cancel_work_sync(&zhdr->work); + page = virt_to_page(zhdr); + } else { + spin_unlock(&pool->stale_lock); + page = alloc_page(gfp); + } + if (!page) return -ENOMEM; atomic64_inc(&pool->pages_nr); - zhdr = init_z3fold_page(page); + zhdr = init_z3fold_page(page, pool); if (bud == HEADLESS) { set_bit(PAGE_HEADLESS, &page->private); - spin_lock(&pool->lock); goto headless; } z3fold_page_lock(zhdr); @@ -451,15 +659,21 @@ found: zhdr->start_middle = zhdr->first_chunks + ZHDR_CHUNKS; } - spin_lock(&pool->lock); if (zhdr->first_chunks == 0 || zhdr->last_chunks == 0 || zhdr->middle_chunks == 0) { + struct list_head *unbuddied = get_cpu_ptr(pool->unbuddied); + /* Add to unbuddied list */ freechunks = num_free_chunks(zhdr); - list_add(&zhdr->buddy, &pool->unbuddied[freechunks]); + spin_lock(&pool->lock); + list_add(&zhdr->buddy, &unbuddied[freechunks]); + spin_unlock(&pool->lock); + zhdr->cpu = smp_processor_id(); + put_cpu_ptr(pool->unbuddied); } headless: + spin_lock(&pool->lock); /* Add/move z3fold page to beginning of LRU */ if (!list_empty(&page->lru)) list_del(&page->lru); @@ -487,7 +701,6 @@ headless: static void z3fold_free(struct z3fold_pool *pool, unsigned long handle) { struct z3fold_header *zhdr; - int freechunks; struct page *page; enum buddy bud; @@ -526,25 +739,27 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle) spin_unlock(&pool->lock); free_z3fold_page(page); atomic64_dec(&pool->pages_nr); - } else { - if (zhdr->first_chunks != 0 || zhdr->middle_chunks != 0 || - zhdr->last_chunks != 0) { - z3fold_compact_page(zhdr); - /* Add to the unbuddied list */ - spin_lock(&pool->lock); - if (!list_empty(&zhdr->buddy)) - list_del(&zhdr->buddy); - freechunks = num_free_chunks(zhdr); - list_add(&zhdr->buddy, &pool->unbuddied[freechunks]); - spin_unlock(&pool->lock); - } + return; + } + + if (kref_put(&zhdr->refcount, release_z3fold_page_locked_list)) { + atomic64_dec(&pool->pages_nr); + return; + } + if (test_and_set_bit(NEEDS_COMPACTING, &page->private)) { z3fold_page_unlock(zhdr); + return; + } + if (zhdr->cpu < 0 || !cpu_online(zhdr->cpu)) { spin_lock(&pool->lock); - if (kref_put(&zhdr->refcount, release_z3fold_page)) - atomic64_dec(&pool->pages_nr); + list_del_init(&zhdr->buddy); spin_unlock(&pool->lock); + zhdr->cpu = -1; + do_compact_page(zhdr, true); + return; } - + queue_work_on(zhdr->cpu, pool->compact_wq, &zhdr->work); + z3fold_page_unlock(zhdr); } /** @@ -585,9 +800,10 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle) */ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries) { - int i, ret = 0, freechunks; - struct z3fold_header *zhdr; - struct page *page; + int i, ret = 0; + struct z3fold_header *zhdr = NULL; + struct page *page = NULL; + struct list_head *pos; unsigned long first_handle = 0, middle_handle = 0, last_handle = 0; spin_lock(&pool->lock); @@ -600,16 +816,24 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries) spin_unlock(&pool->lock); return -EINVAL; } - page = list_last_entry(&pool->lru, struct page, lru); + list_for_each_prev(pos, &pool->lru) { + page = list_entry(pos, struct page, lru); + if (test_bit(PAGE_HEADLESS, &page->private)) + /* candidate found */ + break; + + zhdr = page_address(page); + if (!z3fold_page_trylock(zhdr)) + continue; /* can't evict at this point */ + kref_get(&zhdr->refcount); + list_del_init(&zhdr->buddy); + zhdr->cpu = -1; + } + list_del_init(&page->lru); + spin_unlock(&pool->lock); - zhdr = page_address(page); if (!test_bit(PAGE_HEADLESS, &page->private)) { - if (!list_empty(&zhdr->buddy)) - list_del_init(&zhdr->buddy); - kref_get(&zhdr->refcount); - spin_unlock(&pool->lock); - z3fold_page_lock(zhdr); /* * We need encode the handles before unlocking, since * we can race with free that will set @@ -624,11 +848,14 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries) middle_handle = encode_handle(zhdr, MIDDLE); if (zhdr->last_chunks) last_handle = encode_handle(zhdr, LAST); + /* + * it's safe to unlock here because we hold a + * reference to this page + */ z3fold_page_unlock(zhdr); } else { first_handle = encode_handle(zhdr, HEADLESS); last_handle = middle_handle = 0; - spin_unlock(&pool->lock); } /* Issue the eviction callback(s) */ @@ -652,31 +879,12 @@ next: if (ret == 0) { free_z3fold_page(page); return 0; - } else { - spin_lock(&pool->lock); - } - } else { - z3fold_page_lock(zhdr); - if ((zhdr->first_chunks || zhdr->last_chunks || - zhdr->middle_chunks) && - !(zhdr->first_chunks && zhdr->last_chunks && - zhdr->middle_chunks)) { - z3fold_compact_page(zhdr); - /* add to unbuddied list */ - spin_lock(&pool->lock); - freechunks = num_free_chunks(zhdr); - list_add(&zhdr->buddy, - &pool->unbuddied[freechunks]); - spin_unlock(&pool->lock); - } - z3fold_page_unlock(zhdr); - spin_lock(&pool->lock); - if (kref_put(&zhdr->refcount, release_z3fold_page)) { - spin_unlock(&pool->lock); - atomic64_dec(&pool->pages_nr); - return 0; } + } else if (kref_put(&zhdr->refcount, release_z3fold_page)) { + atomic64_dec(&pool->pages_nr); + return 0; } + spin_lock(&pool->lock); /* * Add to the beginning of LRU. @@ -795,7 +1003,8 @@ static void *z3fold_zpool_create(const char *name, gfp_t gfp, { struct z3fold_pool *pool; - pool = z3fold_create_pool(gfp, zpool_ops ? &z3fold_zpool_ops : NULL); + pool = z3fold_create_pool(name, gfp, + zpool_ops ? &z3fold_zpool_ops : NULL); if (pool) { pool->zpool = zpool; pool->zpool_ops = zpool_ops; -- cgit v1.2.3 From cd04ae1e2dc8e3651b8c427ec1b9500c6eed7b90 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 6 Sep 2017 16:24:50 -0700 Subject: mm, oom: do not rely on TIF_MEMDIE for memory reserves access For ages we have been relying on TIF_MEMDIE thread flag to mark OOM victims and then, among other things, to give these threads full access to memory reserves. There are few shortcomings of this implementation, though. First of all and the most serious one is that the full access to memory reserves is quite dangerous because we leave no safety room for the system to operate and potentially do last emergency steps to move on. Secondly this flag is per task_struct while the OOM killer operates on mm_struct granularity so all processes sharing the given mm are killed. Giving the full access to all these task_structs could lead to a quick memory reserves depletion. We have tried to reduce this risk by giving TIF_MEMDIE only to the main thread and the currently allocating task but that doesn't really solve this problem while it surely opens up a room for corner cases - e.g. GFP_NO{FS,IO} requests might loop inside the allocator without access to memory reserves because a particular thread was not the group leader. Now that we have the oom reaper and that all oom victims are reapable after 1b51e65eab64 ("oom, oom_reaper: allow to reap mm shared by the kthreads") we can be more conservative and grant only partial access to memory reserves because there are reasonable chances of the parallel memory freeing. We still want some access to reserves because we do not want other consumers to eat up the victim's freed memory. oom victims will still contend with __GFP_HIGH users but those shouldn't be so aggressive to starve oom victims completely. Introduce ALLOC_OOM flag and give all tsk_is_oom_victim tasks access to the half of the reserves. This makes the access to reserves independent on which task has passed through mark_oom_victim. Also drop any usage of TIF_MEMDIE from the page allocator proper and replace it by tsk_is_oom_victim as well which will make page_alloc.c completely TIF_MEMDIE free finally. CONFIG_MMU=n doesn't have oom reaper so let's stick to the original ALLOC_NO_WATERMARKS approach. There is a demand to make the oom killer memcg aware which will imply many tasks killed at once. This change will allow such a usecase without worrying about complete memory reserves depletion. Link: http://lkml.kernel.org/r/20170810075019.28998-2-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Mel Gorman Cc: Tetsuo Handa Cc: David Rientjes Cc: Johannes Weiner Cc: Roman Gushchin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/internal.h | 11 +++++++++ mm/oom_kill.c | 9 ++++--- mm/page_alloc.c | 76 ++++++++++++++++++++++++++++++++++++++++++--------------- 3 files changed, 73 insertions(+), 23 deletions(-) (limited to 'mm') diff --git a/mm/internal.h b/mm/internal.h index 781c0d54d75a..1df011f62480 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -480,6 +480,17 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone, /* Mask to get the watermark bits */ #define ALLOC_WMARK_MASK (ALLOC_NO_WATERMARKS-1) +/* + * Only MMU archs have async oom victim reclaim - aka oom_reaper so we + * cannot assume a reduced access to memory reserves is sufficient for + * !MMU + */ +#ifdef CONFIG_MMU +#define ALLOC_OOM 0x08 +#else +#define ALLOC_OOM ALLOC_NO_WATERMARKS +#endif + #define ALLOC_HARDER 0x10 /* try to alloc harder */ #define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ #define ALLOC_CPUSET 0x40 /* check for correct cpuset */ diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 9e8b4f030c1c..c9f3569a76c7 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -824,7 +824,8 @@ static void oom_kill_process(struct oom_control *oc, const char *message) /* * If the task is already exiting, don't alarm the sysadmin or kill - * its children or threads, just set TIF_MEMDIE so it can die quickly + * its children or threads, just give it access to memory reserves + * so it can die quickly */ task_lock(p); if (task_will_free_mem(p)) { @@ -889,9 +890,9 @@ static void oom_kill_process(struct oom_control *oc, const char *message) count_memcg_event_mm(mm, OOM_KILL); /* - * We should send SIGKILL before setting TIF_MEMDIE in order to prevent - * the OOM victim from depleting the memory reserves from the user - * space under its control. + * We should send SIGKILL before granting access to memory reserves + * in order to prevent the OOM victim from depleting the memory + * reserves from the user space under its control. */ do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true); mark_oom_victim(victim); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a4562c058ec4..a9add06fe768 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2951,7 +2951,7 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, { long min = mark; int o; - const bool alloc_harder = (alloc_flags & ALLOC_HARDER); + const bool alloc_harder = (alloc_flags & (ALLOC_HARDER|ALLOC_OOM)); /* free_pages may go negative - that's OK */ free_pages -= (1 << order) - 1; @@ -2964,10 +2964,21 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, * the high-atomic reserves. This will over-estimate the size of the * atomic reserve but it avoids a search. */ - if (likely(!alloc_harder)) + if (likely(!alloc_harder)) { free_pages -= z->nr_reserved_highatomic; - else - min -= min / 4; + } else { + /* + * OOM victims can try even harder than normal ALLOC_HARDER + * users on the grounds that it's definitely going to be in + * the exit path shortly and free memory. Any allocation it + * makes during the free path will be small and short-lived. + */ + if (alloc_flags & ALLOC_OOM) + min -= min / 2; + else + min -= min / 4; + } + #ifdef CONFIG_CMA /* If allocation can't use CMA areas don't use free CMA pages */ @@ -3205,7 +3216,7 @@ static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask) * of allowed nodes. */ if (!(gfp_mask & __GFP_NOMEMALLOC)) - if (test_thread_flag(TIF_MEMDIE) || + if (tsk_is_oom_victim(current) || (current->flags & (PF_MEMALLOC | PF_EXITING))) filter &= ~SHOW_MEM_FILTER_NODES; if (in_interrupt() || !(gfp_mask & __GFP_DIRECT_RECLAIM)) @@ -3668,21 +3679,46 @@ gfp_to_alloc_flags(gfp_t gfp_mask) return alloc_flags; } -bool gfp_pfmemalloc_allowed(gfp_t gfp_mask) +static bool oom_reserves_allowed(struct task_struct *tsk) { - if (unlikely(gfp_mask & __GFP_NOMEMALLOC)) + if (!tsk_is_oom_victim(tsk)) + return false; + + /* + * !MMU doesn't have oom reaper so give access to memory reserves + * only to the thread with TIF_MEMDIE set + */ + if (!IS_ENABLED(CONFIG_MMU) && !test_thread_flag(TIF_MEMDIE)) return false; + return true; +} + +/* + * Distinguish requests which really need access to full memory + * reserves from oom victims which can live with a portion of it + */ +static inline int __gfp_pfmemalloc_flags(gfp_t gfp_mask) +{ + if (unlikely(gfp_mask & __GFP_NOMEMALLOC)) + return 0; if (gfp_mask & __GFP_MEMALLOC) - return true; + return ALLOC_NO_WATERMARKS; if (in_serving_softirq() && (current->flags & PF_MEMALLOC)) - return true; - if (!in_interrupt() && - ((current->flags & PF_MEMALLOC) || - unlikely(test_thread_flag(TIF_MEMDIE)))) - return true; + return ALLOC_NO_WATERMARKS; + if (!in_interrupt()) { + if (current->flags & PF_MEMALLOC) + return ALLOC_NO_WATERMARKS; + else if (oom_reserves_allowed(current)) + return ALLOC_OOM; + } - return false; + return 0; +} + +bool gfp_pfmemalloc_allowed(gfp_t gfp_mask) +{ + return !!__gfp_pfmemalloc_flags(gfp_mask); } /* @@ -3835,6 +3871,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, unsigned long alloc_start = jiffies; unsigned int stall_timeout = 10 * HZ; unsigned int cpuset_mems_cookie; + int reserve_flags; /* * In the slowpath, we sanity check order to avoid ever trying to @@ -3940,15 +3977,16 @@ retry: if (gfp_mask & __GFP_KSWAPD_RECLAIM) wake_all_kswapds(order, ac); - if (gfp_pfmemalloc_allowed(gfp_mask)) - alloc_flags = ALLOC_NO_WATERMARKS; + reserve_flags = __gfp_pfmemalloc_flags(gfp_mask); + if (reserve_flags) + alloc_flags = reserve_flags; /* * Reset the zonelist iterators if memory policies can be ignored. * These allocations are high priority and system rather than user * orientated. */ - if (!(alloc_flags & ALLOC_CPUSET) || (alloc_flags & ALLOC_NO_WATERMARKS)) { + if (!(alloc_flags & ALLOC_CPUSET) || reserve_flags) { ac->zonelist = node_zonelist(numa_node_id(), gfp_mask); ac->preferred_zoneref = first_zones_zonelist(ac->zonelist, ac->high_zoneidx, ac->nodemask); @@ -4025,8 +4063,8 @@ retry: goto got_pg; /* Avoid allocations with no watermarks from looping endlessly */ - if (test_thread_flag(TIF_MEMDIE) && - (alloc_flags == ALLOC_NO_WATERMARKS || + if (tsk_is_oom_victim(current) && + (alloc_flags == ALLOC_OOM || (gfp_mask & __GFP_NOMEMALLOC))) goto nopage; -- cgit v1.2.3 From da99ecf117fce6570bd3989263d68ee0007e1249 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 6 Sep 2017 16:24:53 -0700 Subject: mm: replace TIF_MEMDIE checks by tsk_is_oom_victim TIF_MEMDIE is set only to the tasks whick were either directly selected by the OOM killer or passed through mark_oom_victim from the allocator path. tsk_is_oom_victim is more generic and allows to identify all tasks (threads) which share the mm with the oom victim. Please note that the freezer still needs to check TIF_MEMDIE because we cannot thaw tasks which do not participage in oom_victims counting otherwise a !TIF_MEMDIE task could interfere after oom_disbale returns. Link: http://lkml.kernel.org/r/20170810075019.28998-3-mhocko@kernel.org Signed-off-by: Michal Hocko Cc: Mel Gorman Cc: Tetsuo Handa Cc: David Rientjes Cc: Johannes Weiner Cc: Roman Gushchin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup/cpuset.c | 9 +++++---- mm/memcontrol.c | 2 +- 2 files changed, 6 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 2f4039bafebb..e7485786db9b 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -56,6 +56,7 @@ #include #include #include +#include #include #include @@ -2500,12 +2501,12 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) * If we're in interrupt, yes, we can always allocate. If @node is set in * current's mems_allowed, yes. If it's not a __GFP_HARDWALL request and this * node is set in the nearest hardwalled cpuset ancestor to current's cpuset, - * yes. If current has access to memory reserves due to TIF_MEMDIE, yes. + * yes. If current has access to memory reserves as an oom victim, yes. * Otherwise, no. * * GFP_USER allocations are marked with the __GFP_HARDWALL bit, * and do not allow allocations outside the current tasks cpuset - * unless the task has been OOM killed as is marked TIF_MEMDIE. + * unless the task has been OOM killed. * GFP_KERNEL allocations are not so marked, so can escape to the * nearest enclosing hardwalled ancestor cpuset. * @@ -2528,7 +2529,7 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) * affect that: * in_interrupt - any node ok (current task context irrelevant) * GFP_ATOMIC - any node ok - * TIF_MEMDIE - any node ok + * tsk_is_oom_victim - any node ok * GFP_KERNEL - any node in enclosing hardwalled cpuset ok * GFP_USER - only nodes in current tasks mems allowed ok. */ @@ -2546,7 +2547,7 @@ bool __cpuset_node_allowed(int node, gfp_t gfp_mask) * Allow tasks that have access to memory reserves because they have * been OOM killed to get memory anywhere. */ - if (unlikely(test_thread_flag(TIF_MEMDIE))) + if (unlikely(tsk_is_oom_victim(current))) return true; if (gfp_mask & __GFP_HARDWALL) /* If hardwall request, stop here */ return false; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c1f9b79817d7..ad15850ee157 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1917,7 +1917,7 @@ retry: * bypass the last charges so that they can exit quickly and * free their memory. */ - if (unlikely(test_thread_flag(TIF_MEMDIE) || + if (unlikely(tsk_is_oom_victim(current) || fatal_signal_pending(current) || current->flags & PF_EXITING)) goto force; -- cgit v1.2.3 From a2468cc9bfdff6139f59ca896671e5819ff5f94a Mon Sep 17 00:00:00 2001 From: Aaron Lu Date: Wed, 6 Sep 2017 16:24:57 -0700 Subject: swap: choose swap device according to numa node If the system has more than one swap device and swap device has the node information, we can make use of this information to decide which swap device to use in get_swap_pages() to get better performance. The current code uses a priority based list, swap_avail_list, to decide which swap device to use and if multiple swap devices share the same priority, they are used round robin. This patch changes the previous single global swap_avail_list into a per-numa-node list, i.e. for each numa node, it sees its own priority based list of available swap devices. Swap device's priority can be promoted on its matching node's swap_avail_list. The current swap device's priority is set as: user can set a >=0 value, or the system will pick one starting from -1 then downwards. The priority value in the swap_avail_list is the negated value of the swap device's due to plist being sorted from low to high. The new policy doesn't change the semantics for priority >=0 cases, the previous starting from -1 then downwards now becomes starting from -2 then downwards and -1 is reserved as the promoted value. Take 4-node EX machine as an example, suppose 4 swap devices are available, each sit on a different node: swapA on node 0 swapB on node 1 swapC on node 2 swapD on node 3 After they are all swapped on in the sequence of ABCD. Current behaviour: their priorities will be: swapA: -1 swapB: -2 swapC: -3 swapD: -4 And their position in the global swap_avail_list will be: swapA -> swapB -> swapC -> swapD prio:1 prio:2 prio:3 prio:4 New behaviour: their priorities will be(note that -1 is skipped): swapA: -2 swapB: -3 swapC: -4 swapD: -5 And their positions in the 4 swap_avail_lists[nid] will be: swap_avail_lists[0]: /* node 0's available swap device list */ swapA -> swapB -> swapC -> swapD prio:1 prio:3 prio:4 prio:5 swap_avali_lists[1]: /* node 1's available swap device list */ swapB -> swapA -> swapC -> swapD prio:1 prio:2 prio:4 prio:5 swap_avail_lists[2]: /* node 2's available swap device list */ swapC -> swapA -> swapB -> swapD prio:1 prio:2 prio:3 prio:5 swap_avail_lists[3]: /* node 3's available swap device list */ swapD -> swapA -> swapB -> swapC prio:1 prio:2 prio:3 prio:4 To see the effect of the patch, a test that starts N process, each mmap a region of anonymous memory and then continually write to it at random position to trigger both swap in and out is used. On a 2 node Skylake EP machine with 64GiB memory, two 170GB SSD drives are used as swap devices with each attached to a different node, the result is: runtime=30m/processes=32/total test size=128G/each process mmap region=4G kernel throughput vanilla 13306 auto-binding 15169 +14% runtime=30m/processes=64/total test size=128G/each process mmap region=2G kernel throughput vanilla 11885 auto-binding 14879 +25% [aaron.lu@intel.com: v2] Link: http://lkml.kernel.org/r/20170814053130.GD2369@aaronlu.sh.intel.com Link: http://lkml.kernel.org/r/20170816024439.GA10925@aaronlu.sh.intel.com [akpm@linux-foundation.org: use kmalloc_array()] Link: http://lkml.kernel.org/r/20170814053130.GD2369@aaronlu.sh.intel.com Link: http://lkml.kernel.org/r/20170816024439.GA10925@aaronlu.sh.intel.com Signed-off-by: Aaron Lu Cc: "Chen, Tim C" Cc: Huang Ying Cc: Andi Kleen Cc: Michal Hocko Cc: Minchan Kim Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/swap_numa.txt | 69 ++++++++++++++++++++++++ include/linux/swap.h | 2 +- mm/swapfile.c | 120 ++++++++++++++++++++++++++++++++--------- 3 files changed, 164 insertions(+), 27 deletions(-) create mode 100644 Documentation/vm/swap_numa.txt (limited to 'mm') diff --git a/Documentation/vm/swap_numa.txt b/Documentation/vm/swap_numa.txt new file mode 100644 index 000000000000..d5960c9124f5 --- /dev/null +++ b/Documentation/vm/swap_numa.txt @@ -0,0 +1,69 @@ +Automatically bind swap device to numa node +------------------------------------------- + +If the system has more than one swap device and swap device has the node +information, we can make use of this information to decide which swap +device to use in get_swap_pages() to get better performance. + + +How to use this feature +----------------------- + +Swap device has priority and that decides the order of it to be used. To make +use of automatically binding, there is no need to manipulate priority settings +for swap devices. e.g. on a 2 node machine, assume 2 swap devices swapA and +swapB, with swapA attached to node 0 and swapB attached to node 1, are going +to be swapped on. Simply swapping them on by doing: +# swapon /dev/swapA +# swapon /dev/swapB + +Then node 0 will use the two swap devices in the order of swapA then swapB and +node 1 will use the two swap devices in the order of swapB then swapA. Note +that the order of them being swapped on doesn't matter. + +A more complex example on a 4 node machine. Assume 6 swap devices are going to +be swapped on: swapA and swapB are attached to node 0, swapC is attached to +node 1, swapD and swapE are attached to node 2 and swapF is attached to node3. +The way to swap them on is the same as above: +# swapon /dev/swapA +# swapon /dev/swapB +# swapon /dev/swapC +# swapon /dev/swapD +# swapon /dev/swapE +# swapon /dev/swapF + +Then node 0 will use them in the order of: +swapA/swapB -> swapC -> swapD -> swapE -> swapF +swapA and swapB will be used in a round robin mode before any other swap device. + +node 1 will use them in the order of: +swapC -> swapA -> swapB -> swapD -> swapE -> swapF + +node 2 will use them in the order of: +swapD/swapE -> swapA -> swapB -> swapC -> swapF +Similaly, swapD and swapE will be used in a round robin mode before any +other swap devices. + +node 3 will use them in the order of: +swapF -> swapA -> swapB -> swapC -> swapD -> swapE + + +Implementation details +---------------------- + +The current code uses a priority based list, swap_avail_list, to decide +which swap device to use and if multiple swap devices share the same +priority, they are used round robin. This change here replaces the single +global swap_avail_list with a per-numa-node list, i.e. for each numa node, +it sees its own priority based list of available swap devices. Swap +device's priority can be promoted on its matching node's swap_avail_list. + +The current swap device's priority is set as: user can set a >=0 value, +or the system will pick one starting from -1 then downwards. The priority +value in the swap_avail_list is the negated value of the swap device's +due to plist being sorted from low to high. The new policy doesn't change +the semantics for priority >=0 cases, the previous starting from -1 then +downwards now becomes starting from -2 then downwards and -1 is reserved +as the promoted value. So if multiple swap devices are attached to the same +node, they will all be promoted to priority -1 on that node's plist and will +be used round robin before any other swap devices. diff --git a/include/linux/swap.h b/include/linux/swap.h index 9c4ae6f14eea..8bf3487fb204 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -212,7 +212,7 @@ struct swap_info_struct { unsigned long flags; /* SWP_USED etc: see above */ signed short prio; /* swap priority of this type */ struct plist_node list; /* entry in swap_active_head */ - struct plist_node avail_list; /* entry in swap_avail_head */ + struct plist_node avail_lists[MAX_NUMNODES];/* entry in swap_avail_heads */ signed char type; /* strange name for an index */ unsigned int max; /* extent of the swap_map */ unsigned char *swap_map; /* vmalloc'ed array of usage counts */ diff --git a/mm/swapfile.c b/mm/swapfile.c index 4f8b3e08a547..d483278ee35b 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -60,7 +60,7 @@ atomic_long_t nr_swap_pages; EXPORT_SYMBOL_GPL(nr_swap_pages); /* protected with swap_lock. reading in vm_swap_full() doesn't need lock */ long total_swap_pages; -static int least_priority; +static int least_priority = -1; static const char Bad_file[] = "Bad swap file entry "; static const char Unused_file[] = "Unused swap file entry "; @@ -85,7 +85,7 @@ PLIST_HEAD(swap_active_head); * is held and the locking order requires swap_lock to be taken * before any swap_info_struct->lock. */ -static PLIST_HEAD(swap_avail_head); +struct plist_head *swap_avail_heads; static DEFINE_SPINLOCK(swap_avail_lock); struct swap_info_struct *swap_info[MAX_SWAPFILES]; @@ -592,6 +592,21 @@ new_cluster: return found_free; } +static void __del_from_avail_list(struct swap_info_struct *p) +{ + int nid; + + for_each_node(nid) + plist_del(&p->avail_lists[nid], &swap_avail_heads[nid]); +} + +static void del_from_avail_list(struct swap_info_struct *p) +{ + spin_lock(&swap_avail_lock); + __del_from_avail_list(p); + spin_unlock(&swap_avail_lock); +} + static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset, unsigned int nr_entries) { @@ -605,12 +620,22 @@ static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset, if (si->inuse_pages == si->pages) { si->lowest_bit = si->max; si->highest_bit = 0; - spin_lock(&swap_avail_lock); - plist_del(&si->avail_list, &swap_avail_head); - spin_unlock(&swap_avail_lock); + del_from_avail_list(si); } } +static void add_to_avail_list(struct swap_info_struct *p) +{ + int nid; + + spin_lock(&swap_avail_lock); + for_each_node(nid) { + WARN_ON(!plist_node_empty(&p->avail_lists[nid])); + plist_add(&p->avail_lists[nid], &swap_avail_heads[nid]); + } + spin_unlock(&swap_avail_lock); +} + static void swap_range_free(struct swap_info_struct *si, unsigned long offset, unsigned int nr_entries) { @@ -623,13 +648,8 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset, bool was_full = !si->highest_bit; si->highest_bit = end; - if (was_full && (si->flags & SWP_WRITEOK)) { - spin_lock(&swap_avail_lock); - WARN_ON(!plist_node_empty(&si->avail_list)); - if (plist_node_empty(&si->avail_list)) - plist_add(&si->avail_list, &swap_avail_head); - spin_unlock(&swap_avail_lock); - } + if (was_full && (si->flags & SWP_WRITEOK)) + add_to_avail_list(si); } atomic_long_add(nr_entries, &nr_swap_pages); si->inuse_pages -= nr_entries; @@ -910,6 +930,7 @@ int get_swap_pages(int n_goal, bool cluster, swp_entry_t swp_entries[]) struct swap_info_struct *si, *next; long avail_pgs; int n_ret = 0; + int node; /* Only single cluster request supported */ WARN_ON_ONCE(n_goal > 1 && cluster); @@ -929,14 +950,15 @@ int get_swap_pages(int n_goal, bool cluster, swp_entry_t swp_entries[]) spin_lock(&swap_avail_lock); start_over: - plist_for_each_entry_safe(si, next, &swap_avail_head, avail_list) { + node = numa_node_id(); + plist_for_each_entry_safe(si, next, &swap_avail_heads[node], avail_lists[node]) { /* requeue si to after same-priority siblings */ - plist_requeue(&si->avail_list, &swap_avail_head); + plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]); spin_unlock(&swap_avail_lock); spin_lock(&si->lock); if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) { spin_lock(&swap_avail_lock); - if (plist_node_empty(&si->avail_list)) { + if (plist_node_empty(&si->avail_lists[node])) { spin_unlock(&si->lock); goto nextsi; } @@ -946,7 +968,7 @@ start_over: WARN(!(si->flags & SWP_WRITEOK), "swap_info %d in list but !SWP_WRITEOK\n", si->type); - plist_del(&si->avail_list, &swap_avail_head); + __del_from_avail_list(si); spin_unlock(&si->lock); goto nextsi; } @@ -975,7 +997,7 @@ nextsi: * swap_avail_head list then try it, otherwise start over * if we have not gotten any slots. */ - if (plist_node_empty(&next->avail_list)) + if (plist_node_empty(&next->avail_lists[node])) goto start_over; } @@ -2410,10 +2432,24 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span) return generic_swapfile_activate(sis, swap_file, span); } +static int swap_node(struct swap_info_struct *p) +{ + struct block_device *bdev; + + if (p->bdev) + bdev = p->bdev; + else + bdev = p->swap_file->f_inode->i_sb->s_bdev; + + return bdev ? bdev->bd_disk->node_id : NUMA_NO_NODE; +} + static void _enable_swap_info(struct swap_info_struct *p, int prio, unsigned char *swap_map, struct swap_cluster_info *cluster_info) { + int i; + if (prio >= 0) p->prio = prio; else @@ -2423,7 +2459,16 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio, * low-to-high, while swap ordering is high-to-low */ p->list.prio = -p->prio; - p->avail_list.prio = -p->prio; + for_each_node(i) { + if (p->prio >= 0) + p->avail_lists[i].prio = -p->prio; + else { + if (swap_node(p) == i) + p->avail_lists[i].prio = 1; + else + p->avail_lists[i].prio = -p->prio; + } + } p->swap_map = swap_map; p->cluster_info = cluster_info; p->flags |= SWP_WRITEOK; @@ -2442,9 +2487,7 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio, * swap_info_struct. */ plist_add(&p->list, &swap_active_head); - spin_lock(&swap_avail_lock); - plist_add(&p->avail_list, &swap_avail_head); - spin_unlock(&swap_avail_lock); + add_to_avail_list(p); } static void enable_swap_info(struct swap_info_struct *p, int prio, @@ -2529,17 +2572,19 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) spin_unlock(&swap_lock); goto out_dput; } - spin_lock(&swap_avail_lock); - plist_del(&p->avail_list, &swap_avail_head); - spin_unlock(&swap_avail_lock); + del_from_avail_list(p); spin_lock(&p->lock); if (p->prio < 0) { struct swap_info_struct *si = p; + int nid; plist_for_each_entry_continue(si, &swap_active_head, list) { si->prio++; si->list.prio--; - si->avail_list.prio--; + for_each_node(nid) { + if (si->avail_lists[nid].prio != 1) + si->avail_lists[nid].prio--; + } } least_priority++; } @@ -2783,6 +2828,7 @@ static struct swap_info_struct *alloc_swap_info(void) { struct swap_info_struct *p; unsigned int type; + int i; p = kzalloc(sizeof(*p), GFP_KERNEL); if (!p) @@ -2818,7 +2864,8 @@ static struct swap_info_struct *alloc_swap_info(void) } INIT_LIST_HEAD(&p->first_swap_extent.list); plist_node_init(&p->list, 0); - plist_node_init(&p->avail_list, 0); + for_each_node(i) + plist_node_init(&p->avail_lists[i], 0); p->flags = SWP_USED; spin_unlock(&swap_lock); spin_lock_init(&p->lock); @@ -3060,6 +3107,9 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) if (!capable(CAP_SYS_ADMIN)) return -EPERM; + if (!swap_avail_heads) + return -ENOMEM; + p = alloc_swap_info(); if (IS_ERR(p)) return PTR_ERR(p); @@ -3645,3 +3695,21 @@ static void free_swap_count_continuations(struct swap_info_struct *si) } } } + +static int __init swapfile_init(void) +{ + int nid; + + swap_avail_heads = kmalloc_array(nr_node_ids, sizeof(struct plist_head), + GFP_KERNEL); + if (!swap_avail_heads) { + pr_emerg("Not enough memory for swap heads, swap is disabled\n"); + return -ENOMEM; + } + + for_each_node(nid) + plist_head_init(&swap_avail_heads[nid]); + + return 0; +} +subsys_initcall(swapfile_init); -- cgit v1.2.3 From 212925802454672e6cd2949a727f5e2c1377bf06 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Wed, 6 Sep 2017 16:25:00 -0700 Subject: mm: oom: let oom_reap_task and exit_mmap run concurrently This is purely required because exit_aio() may block and exit_mmap() may never start, if the oom_reap_task cannot start running on a mm with mm_users == 0. At the same time if the OOM reaper doesn't wait at all for the memory of the current OOM candidate to be freed by exit_mmap->unmap_vmas, it would generate a spurious OOM kill. If it wasn't because of the exit_aio or similar blocking functions in the last mmput, it would be enough to change the oom_reap_task() in the case it finds mm_users == 0, to wait for a timeout or to wait for __mmput to set MMF_OOM_SKIP itself, but it's not just exit_mmap the problem here so the concurrency of exit_mmap and oom_reap_task is apparently warranted. It's a non standard runtime, exit_mmap() runs without mmap_sem, and oom_reap_task runs with the mmap_sem for reading as usual (kind of MADV_DONTNEED). The race between the two is solved with a combination of tsk_is_oom_victim() (serialized by task_lock) and MMF_OOM_SKIP (serialized by a dummy down_write/up_write cycle on the same lines of the ksm_exit method). If the oom_reap_task() may be running concurrently during exit_mmap, exit_mmap will wait it to finish in down_write (before taking down mm structures that would make the oom_reap_task fail with use after free). If exit_mmap comes first, oom_reap_task() will skip the mm if MMF_OOM_SKIP is already set and in turn all memory is already freed and furthermore the mm data structures may already have been taken down by free_pgtables. [aarcange@redhat.com: incremental one liner] Link: http://lkml.kernel.org/r/20170726164319.GC29716@redhat.com [rientjes@google.com: remove unused mmput_async] Link: http://lkml.kernel.org/r/alpine.DEB.2.10.1708141733130.50317@chino.kir.corp.google.com [aarcange@redhat.com: microoptimization] Link: http://lkml.kernel.org/r/20170817171240.GB5066@redhat.com Link: http://lkml.kernel.org/r/20170726162912.GA29716@redhat.com Fixes: 26db62f179d1 ("oom: keep mm of the killed task available") Signed-off-by: Andrea Arcangeli Signed-off-by: David Rientjes Reported-by: David Rientjes Tested-by: David Rientjes Reviewed-by: Michal Hocko Cc: Tetsuo Handa Cc: Oleg Nesterov Cc: Hugh Dickins Cc: "Kirill A. Shutemov" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched/mm.h | 6 ------ kernel/fork.c | 17 ----------------- mm/mmap.c | 18 ++++++++++++++++++ mm/oom_kill.c | 15 +++++---------- 4 files changed, 23 insertions(+), 33 deletions(-) (limited to 'mm') diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 2b0a281f9d26..3a19c253bdb1 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -84,12 +84,6 @@ static inline bool mmget_not_zero(struct mm_struct *mm) /* mmput gets rid of the mappings and all user-space */ extern void mmput(struct mm_struct *); -#ifdef CONFIG_MMU -/* same as above but performs the slow path from the async context. Can - * be called from the atomic context as well - */ -extern void mmput_async(struct mm_struct *); -#endif /* Grab a reference to a task's mm, if it is not already going away */ extern struct mm_struct *get_task_mm(struct task_struct *task); diff --git a/kernel/fork.c b/kernel/fork.c index 4e5345c07344..7ed64600da6c 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -922,7 +922,6 @@ static inline void __mmput(struct mm_struct *mm) } if (mm->binfmt) module_put(mm->binfmt->module); - set_bit(MMF_OOM_SKIP, &mm->flags); mmdrop(mm); } @@ -938,22 +937,6 @@ void mmput(struct mm_struct *mm) } EXPORT_SYMBOL_GPL(mmput); -#ifdef CONFIG_MMU -static void mmput_async_fn(struct work_struct *work) -{ - struct mm_struct *mm = container_of(work, struct mm_struct, async_put_work); - __mmput(mm); -} - -void mmput_async(struct mm_struct *mm) -{ - if (atomic_dec_and_test(&mm->mm_users)) { - INIT_WORK(&mm->async_put_work, mmput_async_fn); - schedule_work(&mm->async_put_work); - } -} -#endif - /** * set_mm_exe_file - change a reference to the mm's executable file * diff --git a/mm/mmap.c b/mm/mmap.c index 52f6c6b18f40..4c5981651407 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -44,6 +44,7 @@ #include #include #include +#include #include #include @@ -3001,6 +3002,23 @@ void exit_mmap(struct mm_struct *mm) /* Use -1 here to ensure all VMAs in the mm are unmapped */ unmap_vmas(&tlb, vma, 0, -1); + set_bit(MMF_OOM_SKIP, &mm->flags); + if (unlikely(tsk_is_oom_victim(current))) { + /* + * Wait for oom_reap_task() to stop working on this + * mm. Because MMF_OOM_SKIP is already set before + * calling down_read(), oom_reap_task() will not run + * on this "mm" post up_write(). + * + * tsk_is_oom_victim() cannot be set from under us + * either because current->mm is already set to NULL + * under task_lock before calling mmput and oom_mm is + * set not NULL by the OOM killer only if current->mm + * is found not NULL while holding the task_lock. + */ + down_write(&mm->mmap_sem); + up_write(&mm->mmap_sem); + } free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING); tlb_finish_mmu(&tlb, 0, -1); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index c9f3569a76c7..99736e026712 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -495,11 +495,12 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) } /* - * increase mm_users only after we know we will reap something so - * that the mmput_async is called only when we have reaped something - * and delayed __mmput doesn't matter that much + * MMF_OOM_SKIP is set by exit_mmap when the OOM reaper can't + * work on the mm anymore. The check for MMF_OOM_SKIP must run + * under mmap_sem for reading because it serializes against the + * down_write();up_write() cycle in exit_mmap(). */ - if (!mmget_not_zero(mm)) { + if (test_bit(MMF_OOM_SKIP, &mm->flags)) { up_read(&mm->mmap_sem); trace_skip_task_reaping(tsk->pid); goto unlock_oom; @@ -542,12 +543,6 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) K(get_mm_counter(mm, MM_SHMEMPAGES))); up_read(&mm->mmap_sem); - /* - * Drop our reference but make sure the mmput slow path is called from a - * different context because we shouldn't risk we get stuck there and - * put the oom_reaper out of the way. - */ - mmput_async(mm); trace_finish_task_reaping(tsk->pid); unlock_oom: mutex_unlock(&oom_lock); -- cgit v1.2.3 From c79b57e462b5d2f47afa5f175cf1828f16e18612 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Wed, 6 Sep 2017 16:25:04 -0700 Subject: mm: hugetlb: clear target sub-page last when clearing huge page Huge page helps to reduce TLB miss rate, but it has higher cache footprint, sometimes this may cause some issue. For example, when clearing huge page on x86_64 platform, the cache footprint is 2M. But on a Xeon E5 v3 2699 CPU, there are 18 cores, 36 threads, and only 45M LLC (last level cache). That is, in average, there are 2.5M LLC for each core and 1.25M LLC for each thread. If the cache pressure is heavy when clearing the huge page, and we clear the huge page from the begin to the end, it is possible that the begin of huge page is evicted from the cache after we finishing clearing the end of the huge page. And it is possible for the application to access the begin of the huge page after clearing the huge page. To help the above situation, in this patch, when we clear a huge page, the order to clear sub-pages is changed. In quite some situation, we can get the address that the application will access after we clear the huge page, for example, in a page fault handler. Instead of clearing the huge page from begin to end, we will clear the sub-pages farthest from the the sub-page to access firstly, and clear the sub-page to access last. This will make the sub-page to access most cache-hot and sub-pages around it more cache-hot too. If we cannot know the address the application will access, the begin of the huge page is assumed to be the the address the application will access. With this patch, the throughput increases ~28.3% in vm-scalability anon-w-seq test case with 72 processes on a 2 socket Xeon E5 v3 2699 system (36 cores, 72 threads). The test case creates 72 processes, each process mmap a big anonymous memory area and writes to it from the begin to the end. For each process, other processes could be seen as other workload which generates heavy cache pressure. At the same time, the cache miss rate reduced from ~33.4% to ~31.7%, the IPC (instruction per cycle) increased from 0.56 to 0.74, and the time spent in user space is reduced ~7.9% Christopher Lameter suggests to clear bytes inside a sub-page from end to begin too. But tests show no visible performance difference in the tests. May because the size of page is small compared with the cache size. Thanks Andi Kleen to propose to use address to access to determine the order of sub-pages to clear. The hugetlbfs access address could be improved, will do that in another patch. [ying.huang@intel.com: improve readability of clear_huge_page()] Link: http://lkml.kernel.org/r/20170830051842.1397-1-ying.huang@intel.com Link: http://lkml.kernel.org/r/20170815014618.15842-1-ying.huang@intel.com Suggested-by: Andi Kleen Signed-off-by: "Huang, Ying" Acked-by: Jan Kara Reviewed-by: Michal Hocko Cc: Andrea Arcangeli Cc: "Kirill A. Shutemov" Cc: Nadia Yvette Chambers Cc: Matthew Wilcox Cc: Hugh Dickins Cc: Minchan Kim Cc: Shaohua Li Cc: Christopher Lameter Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 +- mm/huge_memory.c | 4 ++-- mm/memory.c | 42 ++++++++++++++++++++++++++++++++++++++---- 3 files changed, 41 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/include/linux/mm.h b/include/linux/mm.h index eb5e4bc946cc..0acea73af839 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2508,7 +2508,7 @@ enum mf_action_page_type { #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS) extern void clear_huge_page(struct page *page, - unsigned long addr, + unsigned long addr_hint, unsigned int pages_per_huge_page); extern void copy_user_huge_page(struct page *dst, struct page *src, unsigned long addr, struct vm_area_struct *vma, diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 772048c233d1..0b51e70e0a8b 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -567,7 +567,7 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page, goto release; } - clear_huge_page(page, haddr, HPAGE_PMD_NR); + clear_huge_page(page, vmf->address, HPAGE_PMD_NR); /* * The memory barrier inside __SetPageUptodate makes sure that * clear_huge_page writes become visible before the set_pmd_at() @@ -1305,7 +1305,7 @@ alloc: count_vm_event(THP_FAULT_ALLOC); if (!page) - clear_huge_page(new_page, haddr, HPAGE_PMD_NR); + clear_huge_page(new_page, vmf->address, HPAGE_PMD_NR); else copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR); __SetPageUptodate(new_page); diff --git a/mm/memory.c b/mm/memory.c index e87953775e3c..13ee83b43878 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4417,19 +4417,53 @@ static void clear_gigantic_page(struct page *page, } } void clear_huge_page(struct page *page, - unsigned long addr, unsigned int pages_per_huge_page) + unsigned long addr_hint, unsigned int pages_per_huge_page) { - int i; + int i, n, base, l; + unsigned long addr = addr_hint & + ~(((unsigned long)pages_per_huge_page << PAGE_SHIFT) - 1); if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) { clear_gigantic_page(page, addr, pages_per_huge_page); return; } + /* Clear sub-page to access last to keep its cache lines hot */ might_sleep(); - for (i = 0; i < pages_per_huge_page; i++) { + n = (addr_hint - addr) / PAGE_SIZE; + if (2 * n <= pages_per_huge_page) { + /* If sub-page to access in first half of huge page */ + base = 0; + l = n; + /* Clear sub-pages at the end of huge page */ + for (i = pages_per_huge_page - 1; i >= 2 * n; i--) { + cond_resched(); + clear_user_highpage(page + i, addr + i * PAGE_SIZE); + } + } else { + /* If sub-page to access in second half of huge page */ + base = pages_per_huge_page - 2 * (pages_per_huge_page - n); + l = pages_per_huge_page - n; + /* Clear sub-pages at the begin of huge page */ + for (i = 0; i < base; i++) { + cond_resched(); + clear_user_highpage(page + i, addr + i * PAGE_SIZE); + } + } + /* + * Clear remaining sub-pages in left-right-left-right pattern + * towards the sub-page to access + */ + for (i = 0; i < l; i++) { + int left_idx = base + i; + int right_idx = base + 2 * l - 1 - i; + + cond_resched(); + clear_user_highpage(page + left_idx, + addr + left_idx * PAGE_SIZE); cond_resched(); - clear_user_highpage(page + i, addr + i * PAGE_SIZE); + clear_user_highpage(page + right_idx, + addr + right_idx * PAGE_SIZE); } } -- cgit v1.2.3 From d2cd9ede6e193dd7d88b6d27399e96229a551b19 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Wed, 6 Sep 2017 16:25:15 -0700 Subject: mm,fork: introduce MADV_WIPEONFORK MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce MADV_WIPEONFORK semantics, which result in a VMA being empty in the child process after fork. This differs from MADV_DONTFORK in one important way. If a child process accesses memory that was MADV_WIPEONFORK, it will get zeroes. The address ranges are still valid, they are just empty. If a child process accesses memory that was MADV_DONTFORK, it will get a segmentation fault, since those address ranges are no longer valid in the child after fork. Since MADV_DONTFORK also seems to be used to allow very large programs to fork in systems with strict memory overcommit restrictions, changing the semantics of MADV_DONTFORK might break existing programs. MADV_WIPEONFORK only works on private, anonymous VMAs. The use case is libraries that store or cache information, and want to know that they need to regenerate it in the child process after fork. Examples of this would be: - systemd/pulseaudio API checks (fail after fork) (replacing a getpid check, which is too slow without a PID cache) - PKCS#11 API reinitialization check (mandated by specification) - glibc's upcoming PRNG (reseed after fork) - OpenSSL PRNG (reseed after fork) The security benefits of a forking server having a re-inialized PRNG in every child process are pretty obvious. However, due to libraries having all kinds of internal state, and programs getting compiled with many different versions of each library, it is unreasonable to expect calling programs to re-initialize everything manually after fork. A further complication is the proliferation of clone flags, programs bypassing glibc's functions to call clone directly, and programs calling unshare, causing the glibc pthread_atfork hook to not get called. It would be better to have the kernel take care of this automatically. The patch also adds MADV_KEEPONFORK, to undo the effects of a prior MADV_WIPEONFORK. This is similar to the OpenBSD minherit syscall with MAP_INHERIT_ZERO: https://man.openbsd.org/minherit.2 [akpm@linux-foundation.org: numerically order arch/parisc/include/uapi/asm/mman.h #defines] Link: http://lkml.kernel.org/r/20170811212829.29186-3-riel@redhat.com Signed-off-by: Rik van Riel Reported-by: Florian Weimer Reported-by: Colm MacCártaigh Reviewed-by: Mike Kravetz Cc: "H. Peter Anvin" Cc: "Kirill A. Shutemov" Cc: Andy Lutomirski Cc: Dave Hansen Cc: Ingo Molnar Cc: Helge Deller Cc: Kees Cook Cc: Matthew Wilcox Cc: Thomas Gleixner Cc: Will Drewry Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/include/uapi/asm/mman.h | 3 +++ arch/mips/include/uapi/asm/mman.h | 3 +++ arch/parisc/include/uapi/asm/mman.h | 3 +++ arch/xtensa/include/uapi/asm/mman.h | 3 +++ fs/proc/task_mmu.c | 1 + include/linux/mm.h | 2 +- include/trace/events/mmflags.h | 8 +------- include/uapi/asm-generic/mman-common.h | 3 +++ kernel/fork.c | 10 ++++++++-- mm/madvise.c | 13 +++++++++++++ 10 files changed, 39 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/arch/alpha/include/uapi/asm/mman.h b/arch/alpha/include/uapi/asm/mman.h index 13b52aad3c43..3b26cc62dadb 100644 --- a/arch/alpha/include/uapi/asm/mman.h +++ b/arch/alpha/include/uapi/asm/mman.h @@ -64,6 +64,9 @@ overrides the coredump filter bits */ #define MADV_DODUMP 17 /* Clear the MADV_NODUMP flag */ +#define MADV_WIPEONFORK 18 /* Zero memory on fork, child only */ +#define MADV_KEEPONFORK 19 /* Undo MADV_WIPEONFORK */ + /* compatibility flags */ #define MAP_FILE 0 diff --git a/arch/mips/include/uapi/asm/mman.h b/arch/mips/include/uapi/asm/mman.h index 398eebcc3541..da3216007fe0 100644 --- a/arch/mips/include/uapi/asm/mman.h +++ b/arch/mips/include/uapi/asm/mman.h @@ -91,6 +91,9 @@ overrides the coredump filter bits */ #define MADV_DODUMP 17 /* Clear the MADV_NODUMP flag */ +#define MADV_WIPEONFORK 18 /* Zero memory on fork, child only */ +#define MADV_KEEPONFORK 19 /* Undo MADV_WIPEONFORK */ + /* compatibility flags */ #define MAP_FILE 0 diff --git a/arch/parisc/include/uapi/asm/mman.h b/arch/parisc/include/uapi/asm/mman.h index b87fbe3f338a..775b5d5e41a1 100644 --- a/arch/parisc/include/uapi/asm/mman.h +++ b/arch/parisc/include/uapi/asm/mman.h @@ -57,6 +57,9 @@ overrides the coredump filter bits */ #define MADV_DODUMP 70 /* Clear the MADV_NODUMP flag */ +#define MADV_WIPEONFORK 71 /* Zero memory on fork, child only */ +#define MADV_KEEPONFORK 72 /* Undo MADV_WIPEONFORK */ + #define MADV_HWPOISON 100 /* poison a page for testing */ #define MADV_SOFT_OFFLINE 101 /* soft offline page for testing */ diff --git a/arch/xtensa/include/uapi/asm/mman.h b/arch/xtensa/include/uapi/asm/mman.h index 8ce77a2e9bab..b15b278aa314 100644 --- a/arch/xtensa/include/uapi/asm/mman.h +++ b/arch/xtensa/include/uapi/asm/mman.h @@ -103,6 +103,9 @@ overrides the coredump filter bits */ #define MADV_DODUMP 17 /* Clear the MADV_NODUMP flag */ +#define MADV_WIPEONFORK 18 /* Zero memory on fork, child only */ +#define MADV_KEEPONFORK 19 /* Undo MADV_WIPEONFORK */ + /* compatibility flags */ #define MAP_FILE 0 diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index b2330aedc63f..a290966f91ec 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -663,6 +663,7 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) [ilog2(VM_NORESERVE)] = "nr", [ilog2(VM_HUGETLB)] = "ht", [ilog2(VM_ARCH_1)] = "ar", + [ilog2(VM_WIPEONFORK)] = "wf", [ilog2(VM_DONTDUMP)] = "dd", #ifdef CONFIG_MEM_SOFT_DIRTY [ilog2(VM_SOFTDIRTY)] = "sd", diff --git a/include/linux/mm.h b/include/linux/mm.h index 9efe62032094..39db8e54c5d5 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -189,7 +189,7 @@ extern unsigned int kobjsize(const void *objp); #define VM_NORESERVE 0x00200000 /* should the VM suppress accounting */ #define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */ #define VM_ARCH_1 0x01000000 /* Architecture-specific flag */ -#define VM_ARCH_2 0x02000000 +#define VM_WIPEONFORK 0x02000000 /* Wipe VMA contents in child. */ #define VM_DONTDUMP 0x04000000 /* Do not include in the core dump */ #ifdef CONFIG_MEM_SOFT_DIRTY diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index 8e50d01c645f..4c2e4737d7bc 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -125,12 +125,6 @@ IF_HAVE_PG_IDLE(PG_idle, "idle" ) #define __VM_ARCH_SPECIFIC_1 {VM_ARCH_1, "arch_1" } #endif -#if defined(CONFIG_X86) -#define __VM_ARCH_SPECIFIC_2 {VM_MPX, "mpx" } -#else -#define __VM_ARCH_SPECIFIC_2 {VM_ARCH_2, "arch_2" } -#endif - #ifdef CONFIG_MEM_SOFT_DIRTY #define IF_HAVE_VM_SOFTDIRTY(flag,name) {flag, name }, #else @@ -162,7 +156,7 @@ IF_HAVE_PG_IDLE(PG_idle, "idle" ) {VM_NORESERVE, "noreserve" }, \ {VM_HUGETLB, "hugetlb" }, \ __VM_ARCH_SPECIFIC_1 , \ - __VM_ARCH_SPECIFIC_2 , \ + {VM_WIPEONFORK, "wipeonfork" }, \ {VM_DONTDUMP, "dontdump" }, \ IF_HAVE_VM_SOFTDIRTY(VM_SOFTDIRTY, "softdirty" ) \ {VM_MIXEDMAP, "mixedmap" }, \ diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h index d248f3c335b5..203268f9231e 100644 --- a/include/uapi/asm-generic/mman-common.h +++ b/include/uapi/asm-generic/mman-common.h @@ -58,6 +58,9 @@ overrides the coredump filter bits */ #define MADV_DODUMP 17 /* Clear the MADV_DONTDUMP flag */ +#define MADV_WIPEONFORK 18 /* Zero memory on fork, child only */ +#define MADV_KEEPONFORK 19 /* Undo MADV_WIPEONFORK */ + /* compatibility flags */ #define MAP_FILE 0 diff --git a/kernel/fork.c b/kernel/fork.c index 7ed64600da6c..24a4c0be80d5 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -657,7 +657,12 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, retval = dup_userfaultfd(tmp, &uf); if (retval) goto fail_nomem_anon_vma_fork; - if (anon_vma_fork(tmp, mpnt)) + if (tmp->vm_flags & VM_WIPEONFORK) { + /* VM_WIPEONFORK gets a clean slate in the child. */ + tmp->anon_vma = NULL; + if (anon_vma_prepare(tmp)) + goto fail_nomem_anon_vma_fork; + } else if (anon_vma_fork(tmp, mpnt)) goto fail_nomem_anon_vma_fork; tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT); tmp->vm_next = tmp->vm_prev = NULL; @@ -701,7 +706,8 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, rb_parent = &tmp->vm_rb; mm->map_count++; - retval = copy_page_range(mm, oldmm, mpnt); + if (!(tmp->vm_flags & VM_WIPEONFORK)) + retval = copy_page_range(mm, oldmm, mpnt); if (tmp->vm_ops && tmp->vm_ops->open) tmp->vm_ops->open(tmp); diff --git a/mm/madvise.c b/mm/madvise.c index 4d7d1e5ddba9..eea1c733286f 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -80,6 +80,17 @@ static long madvise_behavior(struct vm_area_struct *vma, } new_flags &= ~VM_DONTCOPY; break; + case MADV_WIPEONFORK: + /* MADV_WIPEONFORK is only supported on anonymous memory. */ + if (vma->vm_file || vma->vm_flags & VM_SHARED) { + error = -EINVAL; + goto out; + } + new_flags |= VM_WIPEONFORK; + break; + case MADV_KEEPONFORK: + new_flags &= ~VM_WIPEONFORK; + break; case MADV_DONTDUMP: new_flags |= VM_DONTDUMP; break; @@ -696,6 +707,8 @@ madvise_behavior_valid(int behavior) #endif case MADV_DONTDUMP: case MADV_DODUMP: + case MADV_WIPEONFORK: + case MADV_KEEPONFORK: #ifdef CONFIG_MEMORY_FAILURE case MADV_SOFT_OFFLINE: case MADV_HWPOISON: -- cgit v1.2.3