From 545b1b077ca6b359820436af097bc65e3f6f6cc9 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 25 Jun 2020 20:29:21 -0700 Subject: mm: do_swap_page(): fix up the error code do_swap_page() returns error codes from the VM_FAULT* space. try_charge() might return -ENOMEM, though, and then do_swap_page() simply returns 0 which means a success. We almost never return ENOMEM for GFP_KERNEL single page charge. Except for async OOM handling (oom_disabled v1). So this needs translation to VM_FAULT_OOM otherwise the the page fault path will not notify the userspace and wait for an action. Link: http://lkml.kernel.org/r/20200617090238.GL9499@dhcp22.suse.cz Fixes: 4c6355b25e8b ("mm: memcontrol: charge swapin pages on instantiation") Signed-off-by: Michal Hocko Acked-by: Johannes Weiner Cc: Alex Shi Cc: Joonsoo Kim Cc: Shakeel Butt Cc: Hugh Dickins Cc: "Kirill A. Shutemov" Cc: Roman Gushchin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index dc7f3543b1fd..1c632faa2611 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3140,8 +3140,10 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) err = mem_cgroup_charge(page, vma->vm_mm, GFP_KERNEL); ClearPageSwapCache(page); - if (err) + if (err) { + ret = VM_FAULT_OOM; goto out_page; + } lru_cache_add(page); swap_readpage(page, true); -- cgit v1.2.3 From b9e20f0da1f5c9c68689450a8cb436c9486434c8 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 25 Jun 2020 20:29:24 -0700 Subject: mm, compaction: make capture control handling safe wrt interrupts Hugh reports: "While stressing compaction, one run oopsed on NULL capc->cc in __free_one_page()'s task_capc(zone): compact_zone_order() had been interrupted, and a page was being freed in the return from interrupt. Though you would not expect it from the source, both gccs I was using (4.8.1 and 7.5.0) had chosen to compile compact_zone_order() with the ".cc = &cc" implemented by mov %rbx,-0xb0(%rbp) immediately before callq compact_zone - long after the "current->capture_control = &capc". An interrupt in between those finds capc->cc NULL (zeroed by an earlier rep stos). This could presumably be fixed by a barrier() before setting current->capture_control in compact_zone_order(); but would also need more care on return from compact_zone(), in order not to risk leaking a page captured by interrupt just before capture_control is reset. Maybe that is the preferable fix, but I felt safer for task_capc() to exclude the rather surprising possibility of capture at interrupt time" I have checked that gcc10 also behaves the same. The advantage of fix in compact_zone_order() is that we don't add another test in the page freeing hot path, and that it might prevent future problems if we stop exposing pointers to uninitialized structures in current task. So this patch implements the suggestion for compact_zone_order() with barrier() (and WRITE_ONCE() to prevent store tearing) for setting current->capture_control, and prevents page leaking with WRITE_ONCE/READ_ONCE in the proper order. Link: http://lkml.kernel.org/r/20200616082649.27173-1-vbabka@suse.cz Fixes: 5e1f0f098b46 ("mm, compaction: capture a page under direct compaction") Signed-off-by: Vlastimil Babka Reported-by: Hugh Dickins Suggested-by: Hugh Dickins Acked-by: Hugh Dickins Cc: Alex Shi Cc: Li Wang Cc: Mel Gorman Cc: [5.1+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index fd988b7e5f2b..86375605faa9 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -2316,15 +2316,26 @@ static enum compact_result compact_zone_order(struct zone *zone, int order, .page = NULL, }; - current->capture_control = &capc; + /* + * Make sure the structs are really initialized before we expose the + * capture control, in case we are interrupted and the interrupt handler + * frees a page. + */ + barrier(); + WRITE_ONCE(current->capture_control, &capc); ret = compact_zone(&cc, &capc); VM_BUG_ON(!list_empty(&cc.freepages)); VM_BUG_ON(!list_empty(&cc.migratepages)); - *capture = capc.page; - current->capture_control = NULL; + /* + * Make sure we hide capture control first before we read the captured + * page pointer, otherwise an interrupt could free and capture a page + * and we would leak it. + */ + WRITE_ONCE(current->capture_control, NULL); + *capture = READ_ONCE(capc.page); return ret; } -- cgit v1.2.3 From d7670879c5c4aa443d518fb234a9e5f30931efa3 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 25 Jun 2020 20:29:49 -0700 Subject: mm, slab: fix sign conversion problem in memcg_uncharge_slab() It was found that running the LTP test on a PowerPC system could produce erroneous values in /proc/meminfo, like: MemTotal: 531915072 kB MemFree: 507962176 kB MemAvailable: 1100020596352 kB Using bisection, the problem is tracked down to commit 9c315e4d7d8c ("mm: memcg/slab: cache page number in memcg_(un)charge_slab()"). In memcg_uncharge_slab() with a "int order" argument: unsigned int nr_pages = 1 << order; : mod_lruvec_state(lruvec, cache_vmstat_idx(s), -nr_pages); The mod_lruvec_state() function will eventually call the __mod_zone_page_state() which accepts a long argument. Depending on the compiler and how inlining is done, "-nr_pages" may be treated as a negative number or a very large positive number. Apparently, it was treated as a large positive number in that PowerPC system leading to incorrect stat counts. This problem hasn't been seen in x86-64 yet, perhaps the gcc compiler there has some slight difference in behavior. It is fixed by making nr_pages a signed value. For consistency, a similar change is applied to memcg_charge_slab() as well. Link: http://lkml.kernel.org/r/20200620184719.10994-1-longman@redhat.com Fixes: 9c315e4d7d8c ("mm: memcg/slab: cache page number in memcg_(un)charge_slab()"). Signed-off-by: Waiman Long Acked-by: Roman Gushchin Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Shakeel Butt Cc: Johannes Weiner Cc: Michal Hocko Cc: Vladimir Davydov Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/slab.h b/mm/slab.h index 207c83ef6e06..74f7e09a7cfd 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -348,7 +348,7 @@ static __always_inline int memcg_charge_slab(struct page *page, gfp_t gfp, int order, struct kmem_cache *s) { - unsigned int nr_pages = 1 << order; + int nr_pages = 1 << order; struct mem_cgroup *memcg; struct lruvec *lruvec; int ret; @@ -388,7 +388,7 @@ out: static __always_inline void memcg_uncharge_slab(struct page *page, int order, struct kmem_cache *s) { - unsigned int nr_pages = 1 << order; + int nr_pages = 1 << order; struct mem_cgroup *memcg; struct lruvec *lruvec; -- cgit v1.2.3 From 8982ae527fbef170ef298650c15d55a9ccd33973 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 25 Jun 2020 20:29:52 -0700 Subject: mm/slab: use memzero_explicit() in kzfree() The kzfree() function is normally used to clear some sensitive information, like encryption keys, in the buffer before freeing it back to the pool. Memset() is currently used for buffer clearing. However unlikely, there is still a non-zero probability that the compiler may choose to optimize away the memory clearing especially if LTO is being used in the future. To make sure that this optimization will never happen, memzero_explicit(), which is introduced in v3.18, is now used in kzfree() to future-proof it. Link: http://lkml.kernel.org/r/20200616154311.12314-2-longman@redhat.com Fixes: 3ef0e5ba4673 ("slab: introduce kzfree()") Signed-off-by: Waiman Long Acked-by: Michal Hocko Cc: David Howells Cc: Jarkko Sakkinen Cc: James Morris Cc: "Serge E. Hallyn" Cc: Joe Perches Cc: Matthew Wilcox Cc: David Rientjes Cc: Johannes Weiner Cc: Dan Carpenter Cc: "Jason A . Donenfeld" Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab_common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/slab_common.c b/mm/slab_common.c index 9e72ba224175..37d48a56431d 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -1726,7 +1726,7 @@ void kzfree(const void *p) if (unlikely(ZERO_OR_NULL_PTR(mem))) return; ks = ksize(mem); - memset(mem, 0, ks); + memzero_explicit(mem, ks); kfree(mem); } EXPORT_SYMBOL(kzfree); -- cgit v1.2.3 From 55860d96ca59265d35427da0ee7d7f61e404f8e7 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 25 Jun 2020 20:29:55 -0700 Subject: slub: cure list_slab_objects() from double fix According to Christopher Lameter two fixes have been merged for the same problem. As far as I can tell, the code does not acquire the list_lock and invoke kmalloc(). list_slab_objects() misses an unlock (the counterpart to get_map()) and the memory allocated in free_partial() isn't used. Revert the mentioned commit. Link: http://lkml.kernel.org/r/20200618201234.795692-1-bigeasy@linutronix.de Fixes: aa456c7aebb14 ("slub: remove kmalloc under list_lock from list_slab_objects() V2") Link: https://lkml.kernel.org/r/alpine.DEB.2.22.394.2006181501480.12014@www.lameter.com Signed-off-by: Sebastian Andrzej Siewior Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Thomas Gleixner Cc: Yu Zhao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index fe81773fd97e..ef303070d175 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3766,15 +3766,13 @@ error: } static void list_slab_objects(struct kmem_cache *s, struct page *page, - const char *text, unsigned long *map) + const char *text) { #ifdef CONFIG_SLUB_DEBUG void *addr = page_address(page); + unsigned long *map; void *p; - if (!map) - return; - slab_err(s, page, text, s->name); slab_lock(page); @@ -3786,6 +3784,7 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page, print_tracking(s, p); } } + put_map(map); slab_unlock(page); #endif } @@ -3799,11 +3798,6 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) { LIST_HEAD(discard); struct page *page, *h; - unsigned long *map = NULL; - -#ifdef CONFIG_SLUB_DEBUG - map = bitmap_alloc(oo_objects(s->max), GFP_KERNEL); -#endif BUG_ON(irqs_disabled()); spin_lock_irq(&n->list_lock); @@ -3813,16 +3807,11 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) list_add(&page->slab_list, &discard); } else { list_slab_objects(s, page, - "Objects remaining in %s on __kmem_cache_shutdown()", - map); + "Objects remaining in %s on __kmem_cache_shutdown()"); } } spin_unlock_irq(&n->list_lock); -#ifdef CONFIG_SLUB_DEBUG - bitmap_free(map); -#endif - list_for_each_entry_safe(page, h, &discard, slab_list) discard_slab(s, page); } -- cgit v1.2.3 From 243bce09c91b0145aeaedd5afba799d81841c030 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 25 Jun 2020 20:29:59 -0700 Subject: mm: fix swap cache node allocation mask Chris Murphy reports that a slightly overcommitted load, testing swap and zram along with i915, splats and keeps on splatting, when it had better fail less noisily: gnome-shell: page allocation failure: order:0, mode:0x400d0(__GFP_IO|__GFP_FS|__GFP_COMP|__GFP_RECLAIMABLE), nodemask=(null),cpuset=/,mems_allowed=0 CPU: 2 PID: 1155 Comm: gnome-shell Not tainted 5.7.0-1.fc33.x86_64 #1 Call Trace: dump_stack+0x64/0x88 warn_alloc.cold+0x75/0xd9 __alloc_pages_slowpath.constprop.0+0xcfa/0xd30 __alloc_pages_nodemask+0x2df/0x320 alloc_slab_page+0x195/0x310 allocate_slab+0x3c5/0x440 ___slab_alloc+0x40c/0x5f0 __slab_alloc+0x1c/0x30 kmem_cache_alloc+0x20e/0x220 xas_nomem+0x28/0x70 add_to_swap_cache+0x321/0x400 __read_swap_cache_async+0x105/0x240 swap_cluster_readahead+0x22c/0x2e0 shmem_swapin+0x8e/0xc0 shmem_swapin_page+0x196/0x740 shmem_getpage_gfp+0x3a2/0xa60 shmem_read_mapping_page_gfp+0x32/0x60 shmem_get_pages+0x155/0x5e0 [i915] __i915_gem_object_get_pages+0x68/0xa0 [i915] i915_vma_pin+0x3fe/0x6c0 [i915] eb_add_vma+0x10b/0x2c0 [i915] i915_gem_do_execbuffer+0x704/0x3430 [i915] i915_gem_execbuffer2_ioctl+0x1ea/0x3e0 [i915] drm_ioctl_kernel+0x86/0xd0 [drm] drm_ioctl+0x206/0x390 [drm] ksys_ioctl+0x82/0xc0 __x64_sys_ioctl+0x16/0x20 do_syscall_64+0x5b/0xf0 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Reported on 5.7, but it goes back really to 3.1: when shmem_read_mapping_page_gfp() was implemented for use by i915, and allowed for __GFP_NORETRY and __GFP_NOWARN flags in most places, but missed swapin's "& GFP_KERNEL" mask for page tree node allocation in __read_swap_cache_async() - that was to mask off HIGHUSER_MOVABLE bits from what page cache uses, but GFP_RECLAIM_MASK is now what's needed. Link: https://bugzilla.kernel.org/show_bug.cgi?id=208085 Link: http://lkml.kernel.org/r/alpine.LSU.2.11.2006151330070.11064@eggly.anvils Fixes: 68da9f055755 ("tmpfs: pass gfp to shmem_getpage_gfp") Signed-off-by: Hugh Dickins Reviewed-by: Vlastimil Babka Reviewed-by: Matthew Wilcox (Oracle) Reported-by: Chris Murphy Analyzed-by: Vlastimil Babka Analyzed-by: Matthew Wilcox Tested-by: Chris Murphy Cc: [3.1+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swap_state.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/swap_state.c b/mm/swap_state.c index e98ff460e9e9..05889e8e3c97 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -21,7 +21,7 @@ #include #include #include - +#include "internal.h" /* * swapper_space is a fiction, retained to simplify the path through @@ -429,7 +429,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, __SetPageSwapBacked(page); /* May fail (-ENOMEM) if XArray node allocation failed. */ - if (add_to_swap_cache(page, entry, gfp_mask & GFP_KERNEL)) { + if (add_to_swap_cache(page, entry, gfp_mask & GFP_RECLAIM_MASK)) { put_swap_page(page, entry); goto fail_unlock; } -- cgit v1.2.3 From 7f70c2a68a51496289df163f6969d4db7c383f30 Mon Sep 17 00:00:00 2001 From: Arjun Roy Date: Thu, 25 Jun 2020 20:30:01 -0700 Subject: mm/memory.c: properly pte_offset_map_lock/unlock in vm_insert_pages() Calls to pte_offset_map() in vm_insert_pages() are erroneously not matched with a call to pte_unmap(). This would cause problems on architectures where that is not a no-op. This patch does away with the non-traditional locking in the existing code, and instead uses pte_offset_map_lock/unlock() as usual, incrementing PTE as necessary. The PTE pointer is kept within bounds since we clamp it with PTRS_PER_PTE. Link: http://lkml.kernel.org/r/20200618220446.20284-1-arjunroy.kdev@gmail.com Fixes: 8cd3984d81d5 ("mm/memory.c: add vm_insert_pages()") Signed-off-by: Arjun Roy Acked-by: David Rientjes Cc: Eric Dumazet Cc: Hugh Dickins Cc: Soheil Hassas Yeganeh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 1c632faa2611..0e5b25c9b151 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1498,7 +1498,7 @@ out: } #ifdef pte_index -static int insert_page_in_batch_locked(struct mm_struct *mm, pmd_t *pmd, +static int insert_page_in_batch_locked(struct mm_struct *mm, pte_t *pte, unsigned long addr, struct page *page, pgprot_t prot) { int err; @@ -1506,8 +1506,9 @@ static int insert_page_in_batch_locked(struct mm_struct *mm, pmd_t *pmd, if (!page_count(page)) return -EINVAL; err = validate_page_before_insert(page); - return err ? err : insert_page_into_pte_locked( - mm, pte_offset_map(pmd, addr), addr, page, prot); + if (err) + return err; + return insert_page_into_pte_locked(mm, pte, addr, page, prot); } /* insert_pages() amortizes the cost of spinlock operations @@ -1517,7 +1518,8 @@ static int insert_pages(struct vm_area_struct *vma, unsigned long addr, struct page **pages, unsigned long *num, pgprot_t prot) { pmd_t *pmd = NULL; - spinlock_t *pte_lock = NULL; + pte_t *start_pte, *pte; + spinlock_t *pte_lock; struct mm_struct *const mm = vma->vm_mm; unsigned long curr_page_idx = 0; unsigned long remaining_pages_total = *num; @@ -1536,18 +1538,17 @@ more: ret = -ENOMEM; if (pte_alloc(mm, pmd)) goto out; - pte_lock = pte_lockptr(mm, pmd); while (pages_to_write_in_pmd) { int pte_idx = 0; const int batch_size = min_t(int, pages_to_write_in_pmd, 8); - spin_lock(pte_lock); - for (; pte_idx < batch_size; ++pte_idx) { - int err = insert_page_in_batch_locked(mm, pmd, + start_pte = pte_offset_map_lock(mm, pmd, addr, &pte_lock); + for (pte = start_pte; pte_idx < batch_size; ++pte, ++pte_idx) { + int err = insert_page_in_batch_locked(mm, pte, addr, pages[curr_page_idx], prot); if (unlikely(err)) { - spin_unlock(pte_lock); + pte_unmap_unlock(start_pte, pte_lock); ret = err; remaining_pages_total -= pte_idx; goto out; @@ -1555,7 +1556,7 @@ more: addr += PAGE_SIZE; ++curr_page_idx; } - spin_unlock(pte_lock); + pte_unmap_unlock(start_pte, pte_lock); pages_to_write_in_pmd -= batch_size; remaining_pages_total -= batch_size; } -- cgit v1.2.3 From 9449c9cb420b249eb6d7dad3953e4686443d7bd9 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 25 Jun 2020 20:30:04 -0700 Subject: mm/debug_vm_pgtable: fix build failure with powerpc 8xx Since commit 9e343b467c70 ("READ_ONCE: Enforce atomicity for {READ,WRITE}_ONCE() memory accesses"), READ_ONCE() cannot be used anymore to read complex page table entries. This leads to: CC mm/debug_vm_pgtable.o In file included from ./include/asm-generic/bug.h:5, from ./arch/powerpc/include/asm/bug.h:109, from ./include/linux/bug.h:5, from ./include/linux/mmdebug.h:5, from ./include/linux/gfp.h:5, from mm/debug_vm_pgtable.c:13: In function 'pte_clear_tests', inlined from 'debug_vm_pgtable' at mm/debug_vm_pgtable.c:363:2: ./include/linux/compiler.h:392:38: error: Unsupported access size for {READ,WRITE}_ONCE(). mm/debug_vm_pgtable.c:249:14: note: in expansion of macro 'READ_ONCE' 249 | pte_t pte = READ_ONCE(*ptep); | ^~~~~~~~~ make[2]: *** [mm/debug_vm_pgtable.o] Error 1 Fix it by using the recently added ptep_get() helper. Link: http://lkml.kernel.org/r/6ca8c972e6c920dc4ae0d4affbed9703afa4d010.1592490570.git.christophe.leroy@csgroup.eu Fixes: 9e343b467c70 ("READ_ONCE: Enforce atomicity for {READ,WRITE}_ONCE() memory accesses") Signed-off-by: Christophe Leroy Acked-by: Will Deacon Reviewed-by: Anshuman Khandual Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Cc: "Peter Zijlstra (Intel)" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/debug_vm_pgtable.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index e45623016aea..61ab16fb2e36 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -246,13 +246,13 @@ static void __init pgd_populate_tests(struct mm_struct *mm, pgd_t *pgdp, static void __init pte_clear_tests(struct mm_struct *mm, pte_t *ptep, unsigned long vaddr) { - pte_t pte = READ_ONCE(*ptep); + pte_t pte = ptep_get(ptep); pte = __pte(pte_val(pte) | RANDOM_ORVALUE); set_pte_at(mm, vaddr, ptep, pte); barrier(); pte_clear(mm, vaddr, ptep); - pte = READ_ONCE(*ptep); + pte = ptep_get(ptep); WARN_ON(!pte_none(pte)); } -- cgit v1.2.3 From 8eab7035b231aa3ac27b20ec77f85375e4413083 Mon Sep 17 00:00:00 2001 From: Masanari Iida Date: Thu, 25 Jun 2020 20:30:13 -0700 Subject: mm/vmalloc.c: fix a warning while make xmldocs This patch fixes following warning while "make xmldocs" mm/vmalloc.c:1877: warning: Excess function parameter 'prot' description in 'vm_map_ram' This warning started since commit d4efd79a81ab ("mm: remove the prot argument from vm_map_ram"). Link: http://lkml.kernel.org/r/20200622152850.140871-1-standby24x7@gmail.com Fixes: d4efd79a81ab ("mm: remove the prot argument from vm_map_ram") Signed-off-by: Masanari Iida Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 1 - 1 file changed, 1 deletion(-) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 3091c2ca60df..957a0be77270 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1862,7 +1862,6 @@ EXPORT_SYMBOL(vm_unmap_ram); * @pages: an array of pointers to the pages to be mapped * @count: number of pages * @node: prefer to allocate data structures on this node - * @prot: memory protection to use. PAGE_KERNEL for regular RAM * * If you use this function for less than VMAP_MAX_ALLOC pages, it could be * faster than vmap so it's good. But if you mix long-life and short-life -- cgit v1.2.3 From cd324edce598ebddde44162a2aa01321c1261b9e Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 25 Jun 2020 20:30:16 -0700 Subject: mm: memcontrol: handle div0 crash race condition in memory.low Tejun reports seeing rare div0 crashes in memory.low stress testing: RIP: 0010:mem_cgroup_calculate_protection+0xed/0x150 Code: 0f 46 d1 4c 39 d8 72 57 f6 05 16 d6 42 01 40 74 1f 4c 39 d8 76 1a 4c 39 d1 76 15 4c 29 d1 4c 29 d8 4d 29 d9 31 d2 48 0f af c1 <49> f7 f1 49 01 c2 4c 89 96 38 01 00 00 5d c3 48 0f af c7 31 d2 49 RSP: 0018:ffffa14e01d6fcd0 EFLAGS: 00010246 RAX: 000000000243e384 RBX: 0000000000000000 RCX: 0000000000008f4b RDX: 0000000000000000 RSI: ffff8b89bee84000 RDI: 0000000000000000 RBP: ffffa14e01d6fcd0 R08: ffff8b89ca7d40f8 R09: 0000000000000000 R10: 0000000000000000 R11: 00000000006422f7 R12: 0000000000000000 R13: ffff8b89d9617000 R14: ffff8b89bee84000 R15: ffffa14e01d6fdb8 FS: 0000000000000000(0000) GS:ffff8b8a1f1c0000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f93b1fc175b CR3: 000000016100a000 CR4: 0000000000340ea0 Call Trace: shrink_node+0x1e5/0x6c0 balance_pgdat+0x32d/0x5f0 kswapd+0x1d7/0x3d0 kthread+0x11c/0x160 ret_from_fork+0x1f/0x30 This happens when parent_usage == siblings_protected. We check that usage is bigger than protected, which should imply parent_usage being bigger than siblings_protected. However, we don't read (or even update) these values atomically, and they can be out of sync as the memory state changes under us. A bit of fluctuation around the target protection isn't a big deal, but we need to handle the div0 case. Check the parent state explicitly to make sure we have a reasonable positive value for the divisor. Link: http://lkml.kernel.org/r/20200615140658.601684-1-hannes@cmpxchg.org Fixes: 8a931f801340 ("mm: memcontrol: recursive memory.low protection") Signed-off-by: Johannes Weiner Reported-by: Tejun Heo Acked-by: Michal Hocko Acked-by: Chris Down Cc: Roman Gushchin Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 0b38b6ad547d..5de0a9035b5f 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -6360,11 +6360,16 @@ static unsigned long effective_protection(unsigned long usage, * We're using unprotected memory for the weight so that if * some cgroups DO claim explicit protection, we don't protect * the same bytes twice. + * + * Check both usage and parent_usage against the respective + * protected values. One should imply the other, but they + * aren't read atomically - make sure the division is sane. */ if (!(cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT)) return ep; - - if (parent_effective > siblings_protected && usage > protected) { + if (parent_effective > siblings_protected && + parent_usage > siblings_protected && + usage > protected) { unsigned long unclaimed; unclaimed = parent_effective - siblings_protected; -- cgit v1.2.3 From 3a98990ae2150277ed34d3b248c60e68bf2244b2 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 25 Jun 2020 20:30:19 -0700 Subject: mm/memcontrol.c: add missed css_put() We should put the css reference when memory allocation failed. Link: http://lkml.kernel.org/r/20200614122653.98829-1-songmuchun@bytedance.com Fixes: f0a3a24b532d ("mm: memcg/slab: rework non-root kmem_cache lifecycle management") Signed-off-by: Muchun Song Acked-by: Roman Gushchin Acked-by: Michal Hocko Cc: Johannes Weiner Cc: Vladimir Davydov Cc: Qian Cai Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 5de0a9035b5f..da15b686caa8 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2772,8 +2772,10 @@ static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg, return; cw = kmalloc(sizeof(*cw), GFP_NOWAIT | __GFP_NOWARN); - if (!cw) + if (!cw) { + css_put(&memcg->css); return; + } cw->memcg = memcg; cw->cachep = cachep; -- cgit v1.2.3 From 03960e33187ae969187281c3aa6c308d7282c468 Mon Sep 17 00:00:00 2001 From: Chris Down Date: Thu, 25 Jun 2020 20:30:22 -0700 Subject: mm/memcontrol.c: prevent missed memory.low load tears Looks like one of these got missed when massaging in f86b810c2610 ("mm, memcg: prevent memory.low load/store tearing") with other linux-mm changes. Link: http://lkml.kernel.org/r/20200612174437.GA391453@chrisdown.name Signed-off-by: Chris Down Reported-by: Michal Koutny Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index da15b686caa8..19622328e4b5 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -6423,7 +6423,7 @@ enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root, if (parent == root) { memcg->memory.emin = READ_ONCE(memcg->memory.min); - memcg->memory.elow = memcg->memory.low; + memcg->memory.elow = READ_ONCE(memcg->memory.low); goto out; } @@ -6435,7 +6435,8 @@ enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root, atomic_long_read(&parent->memory.children_min_usage))); WRITE_ONCE(memcg->memory.elow, effective_protection(usage, parent_usage, - memcg->memory.low, READ_ONCE(parent->memory.elow), + READ_ONCE(memcg->memory.low), + READ_ONCE(parent->memory.elow), atomic_long_read(&parent->memory.children_low_usage))); out: -- cgit v1.2.3 From 31d8fcac00fcf4007f3921edc69ab4dcb3abcd4d Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 25 Jun 2020 20:30:31 -0700 Subject: mm: workingset: age nonresident information alongside anonymous pages Patch series "fix for "mm: balance LRU lists based on relative thrashing" patchset" This patchset fixes some problems of the patchset, "mm: balance LRU lists based on relative thrashing", which is now merged on the mainline. Patch "mm: workingset: let cache workingset challenge anon fix" is the result of discussion with Johannes. See following link. http://lkml.kernel.org/r/20200520232525.798933-6-hannes@cmpxchg.org And, the other two are minor things which are found when I try to rebase my patchset. This patch (of 3): After ("mm: workingset: let cache workingset challenge anon fix"), we compare refault distances to active_file + anon. But age of the non-resident information is only driven by the file LRU. As a result, we may overestimate the recency of any incoming refaults and activate them too eagerly, causing unnecessary LRU churn in certain situations. Make anon aging drive nonresident age as well to address that. Link: http://lkml.kernel.org/r/1592288204-27734-1-git-send-email-iamjoonsoo.kim@lge.com Link: http://lkml.kernel.org/r/1592288204-27734-2-git-send-email-iamjoonsoo.kim@lge.com Fixes: 34e58cac6d8f2a ("mm: workingset: let cache workingset challenge anon") Reported-by: Joonsoo Kim Signed-off-by: Johannes Weiner Signed-off-by: Joonsoo Kim Cc: Rik van Riel Cc: Minchan Kim Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 4 ++-- include/linux/swap.h | 1 + mm/vmscan.c | 3 +++ mm/workingset.c | 46 +++++++++++++++++++++++++++------------------- 4 files changed, 33 insertions(+), 21 deletions(-) (limited to 'mm') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index c4c37fd12104..f6f884970511 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -257,8 +257,8 @@ struct lruvec { */ unsigned long anon_cost; unsigned long file_cost; - /* Evictions & activations on the inactive file list */ - atomic_long_t inactive_age; + /* Non-resident age, driven by LRU movement */ + atomic_long_t nonresident_age; /* Refaults at the time of last reclaim cycle */ unsigned long refaults; /* Various lruvec state flags (enum lruvec_flags) */ diff --git a/include/linux/swap.h b/include/linux/swap.h index 4c5974bb9ba9..5b3216ba39a9 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -313,6 +313,7 @@ struct vma_swap_readahead { }; /* linux/mm/workingset.c */ +void workingset_age_nonresident(struct lruvec *lruvec, unsigned long nr_pages); void *workingset_eviction(struct page *page, struct mem_cgroup *target_memcg); void workingset_refault(struct page *page, void *shadow); void workingset_activation(struct page *page); diff --git a/mm/vmscan.c b/mm/vmscan.c index b6d84326bdf2..749d239c62b2 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -904,6 +904,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, __delete_from_swap_cache(page, swap); xa_unlock_irqrestore(&mapping->i_pages, flags); put_swap_page(page, swap); + workingset_eviction(page, target_memcg); } else { void (*freepage)(struct page *); void *shadow = NULL; @@ -1884,6 +1885,8 @@ static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec, list_add(&page->lru, &pages_to_free); } else { nr_moved += nr_pages; + if (PageActive(page)) + workingset_age_nonresident(lruvec, nr_pages); } } diff --git a/mm/workingset.c b/mm/workingset.c index d481ea452eeb..50b7937bab32 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -156,8 +156,8 @@ * * Implementation * - * For each node's file LRU lists, a counter for inactive evictions - * and activations is maintained (node->inactive_age). + * For each node's LRU lists, a counter for inactive evictions and + * activations is maintained (node->nonresident_age). * * On eviction, a snapshot of this counter (along with some bits to * identify the node) is stored in the now empty page cache @@ -213,7 +213,17 @@ static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat, *workingsetp = workingset; } -static void advance_inactive_age(struct mem_cgroup *memcg, pg_data_t *pgdat) +/** + * workingset_age_nonresident - age non-resident entries as LRU ages + * @memcg: the lruvec that was aged + * @nr_pages: the number of pages to count + * + * As in-memory pages are aged, non-resident pages need to be aged as + * well, in order for the refault distances later on to be comparable + * to the in-memory dimensions. This function allows reclaim and LRU + * operations to drive the non-resident aging along in parallel. + */ +void workingset_age_nonresident(struct lruvec *lruvec, unsigned long nr_pages) { /* * Reclaiming a cgroup means reclaiming all its children in a @@ -227,11 +237,8 @@ static void advance_inactive_age(struct mem_cgroup *memcg, pg_data_t *pgdat) * the root cgroup's, age as well. */ do { - struct lruvec *lruvec; - - lruvec = mem_cgroup_lruvec(memcg, pgdat); - atomic_long_inc(&lruvec->inactive_age); - } while (memcg && (memcg = parent_mem_cgroup(memcg))); + atomic_long_add(nr_pages, &lruvec->nonresident_age); + } while ((lruvec = parent_lruvec(lruvec))); } /** @@ -254,12 +261,11 @@ void *workingset_eviction(struct page *page, struct mem_cgroup *target_memcg) VM_BUG_ON_PAGE(page_count(page), page); VM_BUG_ON_PAGE(!PageLocked(page), page); - advance_inactive_age(page_memcg(page), pgdat); - lruvec = mem_cgroup_lruvec(target_memcg, pgdat); + workingset_age_nonresident(lruvec, hpage_nr_pages(page)); /* XXX: target_memcg can be NULL, go through lruvec */ memcgid = mem_cgroup_id(lruvec_memcg(lruvec)); - eviction = atomic_long_read(&lruvec->inactive_age); + eviction = atomic_long_read(&lruvec->nonresident_age); return pack_shadow(memcgid, pgdat, eviction, PageWorkingset(page)); } @@ -309,20 +315,20 @@ void workingset_refault(struct page *page, void *shadow) if (!mem_cgroup_disabled() && !eviction_memcg) goto out; eviction_lruvec = mem_cgroup_lruvec(eviction_memcg, pgdat); - refault = atomic_long_read(&eviction_lruvec->inactive_age); + refault = atomic_long_read(&eviction_lruvec->nonresident_age); /* * Calculate the refault distance * * The unsigned subtraction here gives an accurate distance - * across inactive_age overflows in most cases. There is a + * across nonresident_age overflows in most cases. There is a * special case: usually, shadow entries have a short lifetime * and are either refaulted or reclaimed along with the inode * before they get too old. But it is not impossible for the - * inactive_age to lap a shadow entry in the field, which can - * then result in a false small refault distance, leading to a - * false activation should this old entry actually refault - * again. However, earlier kernels used to deactivate + * nonresident_age to lap a shadow entry in the field, which + * can then result in a false small refault distance, leading + * to a false activation should this old entry actually + * refault again. However, earlier kernels used to deactivate * unconditionally with *every* reclaim invocation for the * longest time, so the occasional inappropriate activation * leading to pressure on the active list is not a problem. @@ -359,7 +365,7 @@ void workingset_refault(struct page *page, void *shadow) goto out; SetPageActive(page); - advance_inactive_age(memcg, pgdat); + workingset_age_nonresident(lruvec, hpage_nr_pages(page)); inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE); /* Page was active prior to eviction */ @@ -382,6 +388,7 @@ out: void workingset_activation(struct page *page) { struct mem_cgroup *memcg; + struct lruvec *lruvec; rcu_read_lock(); /* @@ -394,7 +401,8 @@ void workingset_activation(struct page *page) memcg = page_memcg_rcu(page); if (!mem_cgroup_disabled() && !memcg) goto out; - advance_inactive_age(memcg, page_pgdat(page)); + lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page)); + workingset_age_nonresident(lruvec, hpage_nr_pages(page)); out: rcu_read_unlock(); } -- cgit v1.2.3 From cb6868832ede5cd73b346ec11cf89814d26ff7c7 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Thu, 25 Jun 2020 20:30:34 -0700 Subject: mm/swap: fix for "mm: workingset: age nonresident information alongside anonymous pages" Non-file-lru page could also be activated in mark_page_accessed() and we need to count this activation for nonresident_age. Note that it's better for this patch to be squashed into the patch "mm: workingset: age nonresident information alongside anonymous pages". Link: http://lkml.kernel.org/r/1592288204-27734-3-git-send-email-iamjoonsoo.kim@lge.com Signed-off-by: Joonsoo Kim Acked-by: Johannes Weiner Cc: Joonsoo Kim Cc: Michal Hocko Cc: Minchan Kim Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swap.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/swap.c b/mm/swap.c index dbcab84c6fce..a82efc33411f 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -443,8 +443,7 @@ void mark_page_accessed(struct page *page) else __lru_cache_activate_page(page); ClearPageReferenced(page); - if (page_is_file_lru(page)) - workingset_activation(page); + workingset_activation(page); } if (page_is_idle(page)) clear_page_idle(page); -- cgit v1.2.3 From 0076f029cb2906d32baf3bf4401ef09663071d16 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Thu, 25 Jun 2020 20:30:37 -0700 Subject: mm/memory: fix IO cost for anonymous page With synchronous IO swap device, swap-in is directly handled in fault code. Since IO cost notation isn't added there, with synchronous IO swap device, LRU balancing could be wrongly biased. Fix it to count it in fault code. Link: http://lkml.kernel.org/r/1592288204-27734-4-git-send-email-iamjoonsoo.kim@lge.com Fixes: 314b57fb0460001 ("mm: balance LRU lists based on relative thrashing cache sizing") Signed-off-by: Joonsoo Kim Acked-by: Johannes Weiner Cc: Joonsoo Kim Cc: Michal Hocko Cc: Minchan Kim Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 0e5b25c9b151..87ec87cdc1ff 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3146,6 +3146,14 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) goto out_page; } + /* + * XXX: Move to lru_cache_add() when it + * supports new vs putback + */ + spin_lock_irq(&page_pgdat(page)->lru_lock); + lru_note_cost_page(page); + spin_unlock_irq(&page_pgdat(page)->lru_lock); + lru_cache_add(page); swap_readpage(page, true); } -- cgit v1.2.3 From 7a0e27b2a0ce2735e27e21ebc8b777550fe0ed81 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 25 Jun 2020 20:30:47 -0700 Subject: mm: remove vmalloc_exec Merge vmalloc_exec into its only caller. Note that for !CONFIG_MMU __vmalloc_node_range maps to __vmalloc, which directly clears the __GFP_HIGHMEM added by the vmalloc_exec stub anyway. Link: http://lkml.kernel.org/r/20200618064307.32739-4-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: David Hildenbrand Acked-by: Peter Zijlstra (Intel) Cc: Catalin Marinas Cc: Dexuan Cui Cc: Jessica Yu Cc: Vitaly Kuznetsov Cc: Wei Liu Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmalloc.h | 1 - kernel/module.c | 4 +++- mm/nommu.c | 17 ----------------- mm/vmalloc.c | 20 -------------------- 4 files changed, 3 insertions(+), 39 deletions(-) (limited to 'mm') diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 48bb681e6c2a..0221f852a7e1 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -106,7 +106,6 @@ extern void *vzalloc(unsigned long size); extern void *vmalloc_user(unsigned long size); extern void *vmalloc_node(unsigned long size, int node); extern void *vzalloc_node(unsigned long size, int node); -extern void *vmalloc_exec(unsigned long size); extern void *vmalloc_32(unsigned long size); extern void *vmalloc_32_user(unsigned long size); extern void *__vmalloc(unsigned long size, gfp_t gfp_mask); diff --git a/kernel/module.c b/kernel/module.c index e8a198588f26..0c6573b98c36 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2783,7 +2783,9 @@ static void dynamic_debug_remove(struct module *mod, struct _ddebug *debug) void * __weak module_alloc(unsigned long size) { - return vmalloc_exec(size); + return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END, + GFP_KERNEL, PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS, + NUMA_NO_NODE, __func__); } bool __weak module_init_section(const char *name) diff --git a/mm/nommu.c b/mm/nommu.c index cdcad5d61dd1..f32a69095d50 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -290,23 +290,6 @@ void *vzalloc_node(unsigned long size, int node) } EXPORT_SYMBOL(vzalloc_node); -/** - * vmalloc_exec - allocate virtually contiguous, executable memory - * @size: allocation size - * - * Kernel-internal function to allocate enough pages to cover @size - * the page level allocator and map them into contiguous and - * executable kernel virtual space. - * - * For tight control over page level allocator and protection flags - * use __vmalloc() instead. - */ - -void *vmalloc_exec(unsigned long size) -{ - return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM); -} - /** * vmalloc_32 - allocate virtually contiguous memory (32bit addressable) * @size: allocation size diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 957a0be77270..5a2b55c8dd9a 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2695,26 +2695,6 @@ void *vzalloc_node(unsigned long size, int node) } EXPORT_SYMBOL(vzalloc_node); -/** - * vmalloc_exec - allocate virtually contiguous, executable memory - * @size: allocation size - * - * Kernel-internal function to allocate enough pages to cover @size - * the page level allocator and map them into contiguous and - * executable kernel virtual space. - * - * For tight control over page level allocator and protection flags - * use __vmalloc() instead. - * - * Return: pointer to the allocated memory or %NULL on error - */ -void *vmalloc_exec(unsigned long size) -{ - return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END, - GFP_KERNEL, PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS, - NUMA_NO_NODE, __builtin_return_address(0)); -} - #if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32) #define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL) #elif defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA) -- cgit v1.2.3 From b7e3debdd0408c0dca5d4750371afa5003f792dc Mon Sep 17 00:00:00 2001 From: Ben Widawsky Date: Thu, 25 Jun 2020 20:30:51 -0700 Subject: mm/memory_hotplug.c: fix false softlockup during pfn range removal When working with very large nodes, poisoning the struct pages (for which there will be very many) can take a very long time. If the system is using voluntary preemptions, the software watchdog will not be able to detect forward progress. This patch addresses this issue by offering to give up time like __remove_pages() does. This behavior was introduced in v5.6 with: commit d33695b16a9f ("mm/memory_hotplug: poison memmap in remove_pfn_range_from_zone()") Alternately, init_page_poison could do this cond_resched(), but it seems to me that the caller of init_page_poison() is what actually knows whether or not it should relax its own priority. Based on Dan's notes, I think this is perfectly safe: commit f931ab479dd2 ("mm: fix devm_memremap_pages crash, use mem_hotplug_{begin, done}") Aside from fixing the lockup, it is also a friendlier thing to do on lower core systems that might wipe out large chunks of hotplug memory (probably not a very common case). Fixes this kind of splat: watchdog: BUG: soft lockup - CPU#46 stuck for 22s! [daxctl:9922] irq event stamp: 138450 hardirqs last enabled at (138449): [] trace_hardirqs_on_thunk+0x1a/0x1c hardirqs last disabled at (138450): [] trace_hardirqs_off_thunk+0x1a/0x1c softirqs last enabled at (138448): [] __do_softirq+0x347/0x456 softirqs last disabled at (138443): [] irq_exit+0x7d/0xb0 CPU: 46 PID: 9922 Comm: daxctl Not tainted 5.7.0-BEN-14238-g373c6049b336 #30 Hardware name: Intel Corporation PURLEY/PURLEY, BIOS PLYXCRB1.86B.0578.D07.1902280810 02/28/2019 RIP: 0010:memset_erms+0x9/0x10 Code: c1 e9 03 40 0f b6 f6 48 b8 01 01 01 01 01 01 01 01 48 0f af c6 f3 48 ab 89 d1 f3 aa 4c 89 c8 c3 90 49 89 f9 40 88 f0 48 89 d1 aa 4c 89 c8 c3 90 49 89 fa 40 0f b6 ce 48 b8 01 01 01 01 01 01 Call Trace: remove_pfn_range_from_zone+0x3a/0x380 memunmap_pages+0x17f/0x280 release_nodes+0x22a/0x260 __device_release_driver+0x172/0x220 device_driver_detach+0x3e/0xa0 unbind_store+0x113/0x130 kernfs_fop_write+0xdc/0x1c0 vfs_write+0xde/0x1d0 ksys_write+0x58/0xd0 do_syscall_64+0x5a/0x120 entry_SYSCALL_64_after_hwframe+0x49/0xb3 Built 2 zonelists, mobility grouping on. Total pages: 49050381 Policy zone: Normal Built 3 zonelists, mobility grouping on. Total pages: 49312525 Policy zone: Normal David said: "It really only is an issue for devmem. Ordinary hotplugged system memory is not affected (onlined/offlined in memory block granularity)." Link: http://lkml.kernel.org/r/20200619231213.1160351-1-ben.widawsky@intel.com Fixes: commit d33695b16a9f ("mm/memory_hotplug: poison memmap in remove_pfn_range_from_zone()") Signed-off-by: Ben Widawsky Reported-by: "Scargall, Steve" Reported-by: Ben Widawsky Acked-by: David Hildenbrand Cc: Dan Williams Cc: Vishal Verma Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 9b34e03e730a..da374cd3d45b 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -471,11 +471,20 @@ void __ref remove_pfn_range_from_zone(struct zone *zone, unsigned long start_pfn, unsigned long nr_pages) { + const unsigned long end_pfn = start_pfn + nr_pages; struct pglist_data *pgdat = zone->zone_pgdat; - unsigned long flags; + unsigned long pfn, cur_nr_pages, flags; /* Poison struct pages because they are now uninitialized again. */ - page_init_poison(pfn_to_page(start_pfn), sizeof(struct page) * nr_pages); + for (pfn = start_pfn; pfn < end_pfn; pfn += cur_nr_pages) { + cond_resched(); + + /* Select all remaining pages up to the next section boundary */ + cur_nr_pages = + min(end_pfn - pfn, SECTION_ALIGN_UP(pfn + 1) - pfn); + page_init_poison(pfn_to_page(pfn), + sizeof(struct page) * cur_nr_pages); + } #ifdef CONFIG_ZONE_DEVICE /* -- cgit v1.2.3 From 1139d336fff425f9a20374945cdd28eb44d09fa8 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Fri, 3 Jul 2020 15:15:18 -0700 Subject: mm/hugetlb.c: fix pages per hugetlb calculation The routine hpage_nr_pages() was incorrectly used to calculate the number of base pages in a hugetlb page. hpage_nr_pages is designed to be called for THP pages and will return HPAGE_PMD_NR for hugetlb pages of any size. Due to the context in which hpage_nr_pages was called, it is unlikely to produce a user visible error. The routine with the incorrect call is only exercised in the case of hugetlb memory error or migration. In addition, this would need to be on an architecture which supports huge page sizes less than PMD_SIZE. And, the vma containing the huge page would also need to smaller than PMD_SIZE. Fixes: c0d0381ade79 ("hugetlbfs: use i_mmap_rwsem for more pmd sharing synchronization") Reported-by: Matthew Wilcox (Oracle) Signed-off-by: Mike Kravetz Signed-off-by: Andrew Morton Reviewed-by: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: "Kirill A . Shutemov" Cc: Link: http://lkml.kernel.org/r/20200629185003.97202-1-mike.kravetz@oracle.com Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 57ece74e3aae..fab4485b9e52 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1593,7 +1593,7 @@ static struct address_space *_get_hugetlb_page_mapping(struct page *hpage) /* Use first found vma */ pgoff_start = page_to_pgoff(hpage); - pgoff_end = pgoff_start + hpage_nr_pages(hpage) - 1; + pgoff_end = pgoff_start + pages_per_huge_page(page_hstate(hpage)) - 1; anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff_start, pgoff_end) { struct vm_area_struct *vma = avc->vma; -- cgit v1.2.3 From 40366bd70bbbbf822ca224dfc227a8c8e868c44f Mon Sep 17 00:00:00 2001 From: Barry Song Date: Fri, 3 Jul 2020 15:15:24 -0700 Subject: mm/cma.c: use exact_nid true to fix possible per-numa cma leak Calling cma_declare_contiguous_nid() with false exact_nid for per-numa reservation can easily cause cma leak and various confusion. For example, mm/hugetlb.c is trying to reserve per-numa cma for gigantic pages. But it can easily leak cma and make users confused when system has memoryless nodes. In case the system has 4 numa nodes, and only numa node0 has memory. if we set hugetlb_cma=4G in bootargs, mm/hugetlb.c will get 4 cma areas for 4 different numa nodes. since exact_nid=false in current code, all 4 numa nodes will get cma successfully from node0, but hugetlb_cma[1 to 3] will never be available to hugepage will only allocate memory from hugetlb_cma[0]. In case the system has 4 numa nodes, both numa node0&2 has memory, other nodes have no memory. if we set hugetlb_cma=4G in bootargs, mm/hugetlb.c will get 4 cma areas for 4 different numa nodes. since exact_nid=false in current code, all 4 numa nodes will get cma successfully from node0 or 2, but hugetlb_cma[1] and [3] will never be available to hugepage as mm/hugetlb.c will only allocate memory from hugetlb_cma[0] and hugetlb_cma[2]. This causes permanent leak of the cma areas which are supposed to be used by memoryless node. Of cource we can workaround the issue by letting mm/hugetlb.c scan all cma areas in alloc_gigantic_page() even node_mask includes node0 only. that means when node_mask includes node0 only, we can get page from hugetlb_cma[1] to hugetlb_cma[3]. But this will cause kernel crash in free_gigantic_page() while it wants to free page by: cma_release(hugetlb_cma[page_to_nid(page)], page, 1 << order) On the other hand, exact_nid=false won't consider numa distance, it might be not that useful to leverage cma areas on remote nodes. I feel it is much simpler to make exact_nid true to make everything clear. After that, memoryless nodes won't be able to reserve per-numa CMA from other nodes which have memory. Fixes: cf11e85fc08c ("mm: hugetlb: optionally allocate gigantic hugepages using cma") Signed-off-by: Barry Song Signed-off-by: Andrew Morton Acked-by: Roman Gushchin Cc: Jonathan Cameron Cc: Aslan Bakirov Cc: Michal Hocko Cc: Andreas Schaufler Cc: Mike Kravetz Cc: Rik van Riel Cc: Joonsoo Kim Cc: Robin Murphy Cc: Link: http://lkml.kernel.org/r/20200628074345.27228-1-song.bao.hua@hisilicon.com Signed-off-by: Linus Torvalds --- mm/cma.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/cma.c b/mm/cma.c index 0463ad2ce06b..26ecff818881 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -339,13 +339,13 @@ int __init cma_declare_contiguous_nid(phys_addr_t base, */ if (base < highmem_start && limit > highmem_start) { addr = memblock_alloc_range_nid(size, alignment, - highmem_start, limit, nid, false); + highmem_start, limit, nid, true); limit = highmem_start; } if (!addr) { addr = memblock_alloc_range_nid(size, alignment, base, - limit, nid, false); + limit, nid, true); if (!addr) { ret = -ENOMEM; goto err; -- cgit v1.2.3 From 8beeae86b8e19a4714531f8764b68bd828ffba2a Mon Sep 17 00:00:00 2001 From: Joel Savitz Date: Fri, 3 Jul 2020 15:15:30 -0700 Subject: mm/page_alloc: fix documentation error When I increased the upper bound of the min_free_kbytes value in ee8eb9a5fe863 ("mm/page_alloc: increase default min_free_kbytes bound") I forgot to tweak the above comment to reflect the new value. This patch fixes that mistake. Signed-off-by: Joel Savitz Signed-off-by: Andrew Morton Cc: Matthew Wilcox Cc: Rafael Aquini Cc: Fabrizio D'Angelo Link: http://lkml.kernel.org/r/20200624221236.29560-1-jsavitz@redhat.com Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 48eb0f1410d4..e028b87ce294 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7832,7 +7832,7 @@ void setup_per_zone_wmarks(void) * Initialise min_free_kbytes. * * For small machines we want it small (128k min). For large machines - * we want it large (64MB max). But it is not linear, because network + * we want it large (256MB max). But it is not linear, because network * bandwidth does not increase linearly with machine size. We use * * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy: -- cgit v1.2.3 From 41da51bce36f44eefc1e3d0f47d18841cbd065ba Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Thu, 21 Nov 2019 23:25:07 +0000 Subject: fs: Add IOCB_NOIO flag for generic_file_read_iter Add an IOCB_NOIO flag that indicates to generic_file_read_iter that it shouldn't trigger any filesystem I/O for the actual request or for readahead. This allows to do tentative reads out of the page cache as some filesystems allow, and to take the appropriate locks and retry the reads only if the requested pages are not cached. Signed-off-by: Andreas Gruenbacher --- include/linux/fs.h | 1 + mm/filemap.c | 23 +++++++++++++++++++++-- 2 files changed, 22 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/include/linux/fs.h b/include/linux/fs.h index 3f881a892ea7..4b7cb76e5837 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -315,6 +315,7 @@ enum rw_hint { #define IOCB_SYNC (1 << 5) #define IOCB_WRITE (1 << 6) #define IOCB_NOWAIT (1 << 7) +#define IOCB_NOIO (1 << 9) struct kiocb { struct file *ki_filp; diff --git a/mm/filemap.c b/mm/filemap.c index f0ae9a6308cb..385759c4ce4b 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2028,7 +2028,7 @@ find_page: page = find_get_page(mapping, index); if (!page) { - if (iocb->ki_flags & IOCB_NOWAIT) + if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_NOIO)) goto would_block; page_cache_sync_readahead(mapping, ra, filp, @@ -2038,6 +2038,10 @@ find_page: goto no_cached_page; } if (PageReadahead(page)) { + if (iocb->ki_flags & IOCB_NOIO) { + put_page(page); + goto out; + } page_cache_async_readahead(mapping, ra, filp, page, index, last_index - index); @@ -2160,6 +2164,11 @@ page_not_up_to_date_locked: } readpage: + if (iocb->ki_flags & IOCB_NOIO) { + unlock_page(page); + put_page(page); + goto would_block; + } /* * A previous I/O error may have been due to temporary * failures, eg. multipath errors. @@ -2249,9 +2258,19 @@ EXPORT_SYMBOL_GPL(generic_file_buffered_read); * * This is the "read_iter()" routine for all filesystems * that can use the page cache directly. + * + * The IOCB_NOWAIT flag in iocb->ki_flags indicates that -EAGAIN shall + * be returned when no data can be read without waiting for I/O requests + * to complete; it doesn't prevent readahead. + * + * The IOCB_NOIO flag in iocb->ki_flags indicates that no new I/O + * requests shall be made for the read or for readahead. When no data + * can be read, -EAGAIN shall be returned. When readahead would be + * triggered, a partial, possibly empty read shall be returned. + * * Return: * * number of bytes copied, even for partial reads - * * negative error code if nothing was read + * * negative error code (or 0 if IOCB_NOIO) if nothing was read */ ssize_t generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) -- cgit v1.2.3 From 6ec4476ac82512f09c94aff5972654b70f3772b2 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 8 Jul 2020 10:48:35 -0700 Subject: Raise gcc version requirement to 4.9 I realize that we fairly recently raised it to 4.8, but the fact is, 4.9 is a much better minimum version to target. We have a number of workarounds for actual bugs in pre-4.9 gcc versions (including things like internal compiler errors on ARM), but we also have some syntactic workarounds for lacking features. In particular, raising the minimum to 4.9 means that we can now just assume _Generic() exists, which is likely the much better replacement for a lot of very convoluted built-time magic with conditionals on sizeof and/or __builtin_choose_expr() with same_type() etc. Using _Generic also means that you will need to have a very recent version of 'sparse', but thats easy to build yourself, and much less of a hassle than some old gcc version can be. The latest (in a long string) of reasons for minimum compiler version upgrades was commit 5435f73d5c4a ("efi/x86: Fix build with gcc 4"). Ard points out that RHEL 7 uses gcc-4.8, but the people who stay back on old RHEL versions persumably also don't build their own kernels anyway. And maybe they should cross-built or just have a little side affair with a newer compiler? Acked-by: Ard Biesheuvel Acked-by: Peter Zijlstra Signed-off-by: Linus Torvalds --- arch/arm/kernel/asm-offsets.c | 9 --------- arch/mips/include/asm/unroll.h | 7 +++---- include/linux/bits.h | 3 +-- include/linux/compiler-gcc.h | 2 +- include/linux/compiler_types.h | 27 +-------------------------- mm/migrate.c | 13 +------------ tools/include/linux/bits.h | 3 +-- 7 files changed, 8 insertions(+), 56 deletions(-) (limited to 'mm') diff --git a/arch/arm/kernel/asm-offsets.c b/arch/arm/kernel/asm-offsets.c index c036a4a2f8e2..a1570c8bab25 100644 --- a/arch/arm/kernel/asm-offsets.c +++ b/arch/arm/kernel/asm-offsets.c @@ -31,15 +31,6 @@ #if defined(__APCS_26__) #error Sorry, your compiler targets APCS-26 but this kernel requires APCS-32 #endif -/* - * GCC 4.8.0-4.8.2: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58854 - * miscompiles find_get_entry(), and can result in EXT3 and EXT4 - * filesystem corruption (possibly other FS too). - */ -#if defined(GCC_VERSION) && GCC_VERSION >= 40800 && GCC_VERSION < 40803 -#error Your compiler is too buggy; it is known to miscompile kernels -#error and result in filesystem corruption and oopses. -#endif int main(void) { diff --git a/arch/mips/include/asm/unroll.h b/arch/mips/include/asm/unroll.h index c628747d4ecd..8ed660adc84f 100644 --- a/arch/mips/include/asm/unroll.h +++ b/arch/mips/include/asm/unroll.h @@ -19,14 +19,13 @@ \ /* \ * We can't unroll if the number of iterations isn't \ - * compile-time constant. Unfortunately GCC versions \ - * up until 4.6 tend to miss obvious constants & cause \ + * compile-time constant. Unfortunately clang versions \ + * up until 8.0 tend to miss obvious constants & cause \ * this check to fail, even though they go on to \ * generate reasonable code for the switch statement, \ * so we skip the sanity check for those compilers. \ */ \ - BUILD_BUG_ON((CONFIG_GCC_VERSION >= 40700 || \ - CONFIG_CLANG_VERSION >= 80000) && \ + BUILD_BUG_ON((CONFIG_CLANG_VERSION >= 80000) && \ !__builtin_constant_p(times)); \ \ switch (times) { \ diff --git a/include/linux/bits.h b/include/linux/bits.h index 4671fbf28842..7f475d59a097 100644 --- a/include/linux/bits.h +++ b/include/linux/bits.h @@ -18,8 +18,7 @@ * position @h. For example * GENMASK_ULL(39, 21) gives us the 64bit vector 0x000000ffffe00000. */ -#if !defined(__ASSEMBLY__) && \ - (!defined(CONFIG_CC_IS_GCC) || CONFIG_GCC_VERSION >= 49000) +#if !defined(__ASSEMBLY__) #include #define GENMASK_INPUT_CHECK(h, l) \ (BUILD_BUG_ON_ZERO(__builtin_choose_expr( \ diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index 1c74464c80c6..0b1dc61f3955 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -11,7 +11,7 @@ + __GNUC_PATCHLEVEL__) /* https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58145 */ -#if GCC_VERSION < 40800 +#if GCC_VERSION < 40900 # error Sorry, your compiler is too old - please upgrade it. #endif diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index c3bf7710f69a..01dd58c74d80 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -252,32 +252,8 @@ struct ftrace_likely_data { * __unqual_scalar_typeof(x) - Declare an unqualified scalar type, leaving * non-scalar types unchanged. */ -#if (defined(CONFIG_CC_IS_GCC) && CONFIG_GCC_VERSION < 40900) || defined(__CHECKER__) /* - * We build this out of a couple of helper macros in a vain attempt to - * help you keep your lunch down while reading it. - */ -#define __pick_scalar_type(x, type, otherwise) \ - __builtin_choose_expr(__same_type(x, type), (type)0, otherwise) - -/* - * 'char' is not type-compatible with either 'signed char' or 'unsigned char', - * so we include the naked type here as well as the signed/unsigned variants. - */ -#define __pick_integer_type(x, type, otherwise) \ - __pick_scalar_type(x, type, \ - __pick_scalar_type(x, unsigned type, \ - __pick_scalar_type(x, signed type, otherwise))) - -#define __unqual_scalar_typeof(x) typeof( \ - __pick_integer_type(x, char, \ - __pick_integer_type(x, short, \ - __pick_integer_type(x, int, \ - __pick_integer_type(x, long, \ - __pick_integer_type(x, long long, x)))))) -#else -/* - * If supported, prefer C11 _Generic for better compile-times. As above, 'char' + * Prefer C11 _Generic for better compile-times and simpler code. Note: 'char' * is not type-compatible with 'signed char', and we define a separate case. */ #define __scalar_type_to_expr_cases(type) \ @@ -293,7 +269,6 @@ struct ftrace_likely_data { __scalar_type_to_expr_cases(long), \ __scalar_type_to_expr_cases(long long), \ default: (x))) -#endif /* Is this type a native word size -- useful for atomic operations */ #define __native_word(t) \ diff --git a/mm/migrate.c b/mm/migrate.c index f37729673558..40cd7016ae6f 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1160,22 +1160,11 @@ out: return rc; } -/* - * gcc 4.7 and 4.8 on arm get an ICEs when inlining unmap_and_move(). Work - * around it. - */ -#if defined(CONFIG_ARM) && \ - defined(GCC_VERSION) && GCC_VERSION < 40900 && GCC_VERSION >= 40700 -#define ICE_noinline noinline -#else -#define ICE_noinline -#endif - /* * Obtain the lock on page, remove all ptes and migrate the page * to the newly allocated page in newpage. */ -static ICE_noinline int unmap_and_move(new_page_t get_new_page, +static int unmap_and_move(new_page_t get_new_page, free_page_t put_new_page, unsigned long private, struct page *page, int force, enum migrate_mode mode, diff --git a/tools/include/linux/bits.h b/tools/include/linux/bits.h index 4671fbf28842..7f475d59a097 100644 --- a/tools/include/linux/bits.h +++ b/tools/include/linux/bits.h @@ -18,8 +18,7 @@ * position @h. For example * GENMASK_ULL(39, 21) gives us the 64bit vector 0x000000ffffe00000. */ -#if !defined(__ASSEMBLY__) && \ - (!defined(CONFIG_CC_IS_GCC) || CONFIG_GCC_VERSION >= 49000) +#if !defined(__ASSEMBLY__) #include #define GENMASK_INPUT_CHECK(h, l) \ (BUILD_BUG_ON_ZERO(__builtin_choose_expr( \ -- cgit v1.2.3