From 4533d3aed857c558d6aabd00d0cb04100c5a2258 Mon Sep 17 00:00:00 2001 From: Roger Pau Monne Date: Tue, 1 Sep 2020 10:33:25 +0200 Subject: memremap: rename MEMORY_DEVICE_DEVDAX to MEMORY_DEVICE_GENERIC MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is in preparation for the logic behind MEMORY_DEVICE_DEVDAX also being used by non DAX devices. No functional change intended. Signed-off-by: Roger Pau Monné Reviewed-by: Ira Weiny Acked-by: Andrew Morton Reviewed-by: Pankaj Gupta Link: https://lore.kernel.org/r/20200901083326.21264-3-roger.pau@citrix.com Signed-off-by: Juergen Gross --- mm/memremap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memremap.c b/mm/memremap.c index 03e38b7a38f1..006dace60b1a 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -216,7 +216,7 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid) return ERR_PTR(-EINVAL); } break; - case MEMORY_DEVICE_DEVDAX: + case MEMORY_DEVICE_GENERIC: need_devmap_managed = false; break; case MEMORY_DEVICE_PCI_P2PDMA: -- cgit v1.2.3 From f1796544a0ca0f14386a679d3d05fbc69235015e Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 4 Sep 2020 16:35:24 -0700 Subject: memcg: fix use-after-free in uncharge_batch syzbot has reported an use-after-free in the uncharge_batch path BUG: KASAN: use-after-free in instrument_atomic_write include/linux/instrumented.h:71 [inline] BUG: KASAN: use-after-free in atomic64_sub_return include/asm-generic/atomic-instrumented.h:970 [inline] BUG: KASAN: use-after-free in atomic_long_sub_return include/asm-generic/atomic-long.h:113 [inline] BUG: KASAN: use-after-free in page_counter_cancel mm/page_counter.c:54 [inline] BUG: KASAN: use-after-free in page_counter_uncharge+0x3d/0xc0 mm/page_counter.c:155 Write of size 8 at addr ffff8880371c0148 by task syz-executor.0/9304 CPU: 0 PID: 9304 Comm: syz-executor.0 Not tainted 5.8.0-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x1f0/0x31e lib/dump_stack.c:118 print_address_description+0x66/0x620 mm/kasan/report.c:383 __kasan_report mm/kasan/report.c:513 [inline] kasan_report+0x132/0x1d0 mm/kasan/report.c:530 check_memory_region_inline mm/kasan/generic.c:183 [inline] check_memory_region+0x2b5/0x2f0 mm/kasan/generic.c:192 instrument_atomic_write include/linux/instrumented.h:71 [inline] atomic64_sub_return include/asm-generic/atomic-instrumented.h:970 [inline] atomic_long_sub_return include/asm-generic/atomic-long.h:113 [inline] page_counter_cancel mm/page_counter.c:54 [inline] page_counter_uncharge+0x3d/0xc0 mm/page_counter.c:155 uncharge_batch+0x6c/0x350 mm/memcontrol.c:6764 uncharge_page+0x115/0x430 mm/memcontrol.c:6796 uncharge_list mm/memcontrol.c:6835 [inline] mem_cgroup_uncharge_list+0x70/0xe0 mm/memcontrol.c:6877 release_pages+0x13a2/0x1550 mm/swap.c:911 tlb_batch_pages_flush mm/mmu_gather.c:49 [inline] tlb_flush_mmu_free mm/mmu_gather.c:242 [inline] tlb_flush_mmu+0x780/0x910 mm/mmu_gather.c:249 tlb_finish_mmu+0xcb/0x200 mm/mmu_gather.c:328 exit_mmap+0x296/0x550 mm/mmap.c:3185 __mmput+0x113/0x370 kernel/fork.c:1076 exit_mm+0x4cd/0x550 kernel/exit.c:483 do_exit+0x576/0x1f20 kernel/exit.c:793 do_group_exit+0x161/0x2d0 kernel/exit.c:903 get_signal+0x139b/0x1d30 kernel/signal.c:2743 arch_do_signal+0x33/0x610 arch/x86/kernel/signal.c:811 exit_to_user_mode_loop kernel/entry/common.c:135 [inline] exit_to_user_mode_prepare+0x8d/0x1b0 
kernel/entry/common.c:166 syscall_exit_to_user_mode+0x5e/0x1a0 kernel/entry/common.c:241 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Commit 1a3e1f40962c ("mm: memcontrol: decouple reference counting from page accounting") reworked the memcg lifetime to be bound the the struct page rather than charges. It also removed the css_put_many from uncharge_batch and that is causing the above splat. uncharge_batch() is supposed to uncharge accumulated charges for all pages freed from the same memcg. The queuing is done by uncharge_page which however drops the memcg reference after it adds charges to the batch. If the current page happens to be the last one holding the reference for its memcg then the memcg is OK to go and the next page to be freed will trigger batched uncharge which needs to access the memcg which is gone already. Fix the issue by taking a reference for the memcg in the current batch. Fixes: 1a3e1f40962c ("mm: memcontrol: decouple reference counting from page accounting") Reported-by: syzbot+b305848212deec86eabe@syzkaller.appspotmail.com Reported-by: syzbot+b5ea6fb6f139c8b9482b@syzkaller.appspotmail.com Signed-off-by: Michal Hocko Signed-off-by: Andrew Morton Reviewed-by: Shakeel Butt Acked-by: Johannes Weiner Cc: Roman Gushchin Cc: Hugh Dickins Link: https://lkml.kernel.org/r/20200820090341.GC5033@dhcp22.suse.cz Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index b807952b4d43..cfa6cbad21d5 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -6774,6 +6774,9 @@ static void uncharge_batch(const struct uncharge_gather *ug) __this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, ug->nr_pages); memcg_check_events(ug->memcg, ug->dummy_page); local_irq_restore(flags); + + /* drop reference from uncharge_page */ + css_put(&ug->memcg->css); } static void uncharge_page(struct page *page, struct uncharge_gather *ug) @@ -6797,6 +6800,9 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug) uncharge_gather_clear(ug); } ug->memcg = page->mem_cgroup; + + /* pairs with css_put in uncharge_batch */ + css_get(&ug->memcg->css); } nr_pages = compound_nr(page); -- cgit v1.2.3 From e3336cab2579012b1e72b5265adf98e2d6e244ad Mon Sep 17 00:00:00 2001 From: Xunlei Pang Date: Fri, 4 Sep 2020 16:35:27 -0700 Subject: mm: memcg: fix memcg reclaim soft lockup We've met softlockup with "CONFIG_PREEMPT_NONE=y", when the target memcg doesn't have any reclaimable memory. It can be easily reproduced as below: watchdog: BUG: soft lockup - CPU#0 stuck for 111s![memcg_test:2204] CPU: 0 PID: 2204 Comm: memcg_test Not tainted 5.9.0-rc2+ #12 Call Trace: shrink_lruvec+0x49f/0x640 shrink_node+0x2a6/0x6f0 do_try_to_free_pages+0xe9/0x3e0 try_to_free_mem_cgroup_pages+0xef/0x1f0 try_charge+0x2c1/0x750 mem_cgroup_charge+0xd7/0x240 __add_to_page_cache_locked+0x2fd/0x370 add_to_page_cache_lru+0x4a/0xc0 pagecache_get_page+0x10b/0x2f0 filemap_fault+0x661/0xad0 ext4_filemap_fault+0x2c/0x40 __do_fault+0x4d/0xf9 handle_mm_fault+0x1080/0x1790 It only happens on our 1-vcpu instances, because there's no chance for oom reaper to run to reclaim the to-be-killed process. Add a cond_resched() at the upper shrink_node_memcgs() to solve this issue, this will mean that we will get a scheduling point for each memcg in the reclaimed hierarchy without any dependency on the reclaimable memory in that memcg thus making it more predictable. 
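For orientation, the shape of the loop being patched, paraphrased from shrink_node_memcgs() (a sketch only; the real hunk follows below). On a CONFIG_PREEMPT_NONE kernel a long in-kernel walk has to provide its own scheduling points, hence one cond_resched() per memcg visited:

	memcg = mem_cgroup_iter(target_memcg, NULL, NULL);
	do {
		cond_resched();	/* scheduling point even when this memcg yields no pages */

		/* ... mem_cgroup_calculate_protection(), shrink_lruvec(),
		 *     shrink_slab() for this memcg ... */
	} while ((memcg = mem_cgroup_iter(target_memcg, memcg, NULL)));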
Suggested-by: Michal Hocko Signed-off-by: Xunlei Pang Signed-off-by: Andrew Morton Acked-by: Chris Down Acked-by: Michal Hocko Acked-by: Johannes Weiner Link: http://lkml.kernel.org/r/1598495549-67324-1-git-send-email-xlpang@linux.alibaba.com Signed-off-by: Linus Torvalds --- mm/vmscan.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 99e1796eb833..9727dd8e2581 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2615,6 +2615,14 @@ static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc) unsigned long reclaimed; unsigned long scanned; + /* + * This loop can become CPU-bound when target memcgs + * aren't eligible for reclaim - either because they + * don't have any reclaimable pages, or because their + * memory is explicitly protected. Avoid soft lockups. + */ + cond_resched(); + mem_cgroup_calculate_protection(target_memcg, memcg); if (mem_cgroup_below_min(memcg)) { -- cgit v1.2.3 From dc07a728d49cf025f5da2c31add438d839d076c0 Mon Sep 17 00:00:00 2001 From: Eugeniu Rosca Date: Fri, 4 Sep 2020 16:35:30 -0700 Subject: mm: slub: fix conversion of freelist_corrupted() Commit 52f23478081ae0 ("mm/slub.c: fix corrupted freechain in deactivate_slab()") suffered an update when picked up from LKML [1]. Specifically, relocating 'freelist = NULL' into 'freelist_corrupted()' created a no-op statement. Fix it by sticking to the behavior intended in the original patch [1]. In addition, make freelist_corrupted() immune to passing NULL instead of &freelist. The issue has been spotted via static analysis and code review. [1] https://lore.kernel.org/linux-mm/20200331031450.12182-1-dongli.zhang@oracle.com/ Fixes: 52f23478081ae0 ("mm/slub.c: fix corrupted freechain in deactivate_slab()") Signed-off-by: Eugeniu Rosca Signed-off-by: Andrew Morton Cc: Dongli Zhang Cc: Joe Jin Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Link: https://lkml.kernel.org/r/20200824130643.10291-1-erosca@de.adit-jv.com Signed-off-by: Linus Torvalds --- mm/slub.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 68c02b2eecd9..d4177aecedf6 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -672,12 +672,12 @@ static void slab_fix(struct kmem_cache *s, char *fmt, ...) } static bool freelist_corrupted(struct kmem_cache *s, struct page *page, - void *freelist, void *nextfree) + void **freelist, void *nextfree) { if ((s->flags & SLAB_CONSISTENCY_CHECKS) && - !check_valid_pointer(s, page, nextfree)) { - object_err(s, page, freelist, "Freechain corrupt"); - freelist = NULL; + !check_valid_pointer(s, page, nextfree) && freelist) { + object_err(s, page, *freelist, "Freechain corrupt"); + *freelist = NULL; slab_fix(s, "Isolate corrupted freechain"); return true; } @@ -1494,7 +1494,7 @@ static inline void dec_slabs_node(struct kmem_cache *s, int node, int objects) {} static bool freelist_corrupted(struct kmem_cache *s, struct page *page, - void *freelist, void *nextfree) + void **freelist, void *nextfree) { return false; } @@ -2184,7 +2184,7 @@ static void deactivate_slab(struct kmem_cache *s, struct page *page, * 'freelist' is already corrupted. So isolate all objects * starting at 'freelist'. 
*/ - if (freelist_corrupted(s, page, freelist, nextfree)) + if (freelist_corrupted(s, page, &freelist, nextfree)) break; do { -- cgit v1.2.3 From e80d3909be42f7e38cc350c1ba109cf0aa51956a Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 4 Sep 2020 16:35:43 -0700 Subject: mm: track page table modifications in __apply_to_page_range() __apply_to_page_range() is also used to change and/or allocate page-table pages in the vmalloc area of the address space. Make sure these changes get synchronized to other page-tables in the system by calling arch_sync_kernel_mappings() when necessary. The impact appears limited to x86-32, where apply_to_page_range may miss updating the PMD. That leads to explosions in drivers like BUG: unable to handle page fault for address: fe036000 #PF: supervisor write access in kernel mode #PF: error_code(0x0002) - not-present page *pde = 00000000 Oops: 0002 [#1] SMP CPU: 3 PID: 1300 Comm: gem_concurrent_ Not tainted 5.9.0-rc1+ #16 Hardware name: /NUC6i3SYB, BIOS SYSKLi35.86A.0024.2015.1027.2142 10/27/2015 EIP: __execlists_context_alloc+0x132/0x2d0 [i915] Code: 31 d2 89 f0 e8 2f 55 02 00 89 45 e8 3d 00 f0 ff ff 0f 87 11 01 00 00 8b 4d e8 03 4b 30 b8 5a 5a 5a 5a ba 01 00 00 00 8d 79 04 01 5a 5a 5a 5a c7 81 fc 0f 00 00 5a 5a 5a 5a 83 e7 fc 29 f9 81 EAX: 5a5a5a5a EBX: f60ca000 ECX: fe036000 EDX: 00000001 ESI: f43b7340 EDI: fe036004 EBP: f6389cb8 ESP: f6389c9c DS: 007b ES: 007b FS: 00d8 GS: 00e0 SS: 0068 EFLAGS: 00010286 CR0: 80050033 CR2: fe036000 CR3: 2d361000 CR4: 001506d0 DR0: 00000000 DR1: 00000000 DR2: 00000000 DR3: 00000000 DR6: fffe0ff0 DR7: 00000400 Call Trace: execlists_context_alloc+0x10/0x20 [i915] intel_context_alloc_state+0x3f/0x70 [i915] __intel_context_do_pin+0x117/0x170 [i915] i915_gem_do_execbuffer+0xcc7/0x2500 [i915] i915_gem_execbuffer2_ioctl+0xcd/0x1f0 [i915] drm_ioctl_kernel+0x8f/0xd0 drm_ioctl+0x223/0x3d0 __ia32_sys_ioctl+0x1ab/0x760 __do_fast_syscall_32+0x3f/0x70 do_fast_syscall_32+0x29/0x60 do_SYSENTER_32+0x15/0x20 entry_SYSENTER_32+0x9f/0xf2 EIP: 0xb7f28559 Code: 03 74 c0 01 10 05 03 74 b8 01 10 06 03 74 b4 01 10 07 03 74 b0 01 10 08 03 74 d8 01 00 00 00 00 00 51 52 55 89 e5 0f 34 cd 80 <5d> 5a 59 c3 90 90 90 90 8d 76 00 58 b8 77 00 00 00 cd 80 90 8d 76 EAX: ffffffda EBX: 00000005 ECX: c0406469 EDX: bf95556c ESI: b7e68000 EDI: c0406469 EBP: 00000005 ESP: bf9554d8 DS: 007b ES: 007b FS: 0000 GS: 0033 SS: 007b EFLAGS: 00000296 Modules linked in: i915 x86_pkg_temp_thermal intel_powerclamp crc32_pclmul crc32c_intel intel_cstate intel_uncore intel_gtt drm_kms_helper intel_pch_thermal video button autofs4 i2c_i801 i2c_smbus fan CR2: 00000000fe036000 It looks like kasan, xen and i915 are vulnerable. 
Actual impact is "on thinkpad X60 in 5.9-rc1, screen starts blinking after 30-or-so minutes, and machine is unusable" [sfr@canb.auug.org.au: ARCH_PAGE_TABLE_SYNC_MASK needs vmalloc.h] Link: https://lkml.kernel.org/r/20200825172508.16800a4f@canb.auug.org.au [chris@chris-wilson.co.uk: changelog addition] [pavel@ucw.cz: changelog addition] Fixes: 2ba3e6947aed ("mm/vmalloc: track which page-table levels were modified") Fixes: 86cf69f1d893 ("x86/mm/32: implement arch_sync_kernel_mappings()") Signed-off-by: Joerg Roedel Signed-off-by: Stephen Rothwell Signed-off-by: Andrew Morton Tested-by: Chris Wilson [x86-32] Tested-by: Pavel Machek Acked-by: Linus Torvalds Cc: [5.8+] Link: https://lkml.kernel.org/r/20200821123746.16904-1-joro@8bytes.org Signed-off-by: Linus Torvalds --- mm/memory.c | 37 ++++++++++++++++++++++++------------- 1 file changed, 24 insertions(+), 13 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 602f4283122f..547b81a14059 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -73,6 +73,7 @@ #include #include #include +#include #include @@ -83,6 +84,7 @@ #include #include +#include "pgalloc-track.h" #include "internal.h" #if defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS) && !defined(CONFIG_COMPILE_TEST) @@ -2206,7 +2208,8 @@ EXPORT_SYMBOL(vm_iomap_memory); static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr, unsigned long end, - pte_fn_t fn, void *data, bool create) + pte_fn_t fn, void *data, bool create, + pgtbl_mod_mask *mask) { pte_t *pte; int err = 0; @@ -2214,7 +2217,7 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, if (create) { pte = (mm == &init_mm) ? - pte_alloc_kernel(pmd, addr) : + pte_alloc_kernel_track(pmd, addr, mask) : pte_alloc_map_lock(mm, pmd, addr, &ptl); if (!pte) return -ENOMEM; @@ -2235,6 +2238,7 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, break; } } while (addr += PAGE_SIZE, addr != end); + *mask |= PGTBL_PTE_MODIFIED; arch_leave_lazy_mmu_mode(); @@ -2245,7 +2249,8 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud, unsigned long addr, unsigned long end, - pte_fn_t fn, void *data, bool create) + pte_fn_t fn, void *data, bool create, + pgtbl_mod_mask *mask) { pmd_t *pmd; unsigned long next; @@ -2254,7 +2259,7 @@ static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud, BUG_ON(pud_huge(*pud)); if (create) { - pmd = pmd_alloc(mm, pud, addr); + pmd = pmd_alloc_track(mm, pud, addr, mask); if (!pmd) return -ENOMEM; } else { @@ -2264,7 +2269,7 @@ static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud, next = pmd_addr_end(addr, end); if (create || !pmd_none_or_clear_bad(pmd)) { err = apply_to_pte_range(mm, pmd, addr, next, fn, data, - create); + create, mask); if (err) break; } @@ -2274,14 +2279,15 @@ static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud, static int apply_to_pud_range(struct mm_struct *mm, p4d_t *p4d, unsigned long addr, unsigned long end, - pte_fn_t fn, void *data, bool create) + pte_fn_t fn, void *data, bool create, + pgtbl_mod_mask *mask) { pud_t *pud; unsigned long next; int err = 0; if (create) { - pud = pud_alloc(mm, p4d, addr); + pud = pud_alloc_track(mm, p4d, addr, mask); if (!pud) return -ENOMEM; } else { @@ -2291,7 +2297,7 @@ static int apply_to_pud_range(struct mm_struct *mm, p4d_t *p4d, next = pud_addr_end(addr, end); if (create || !pud_none_or_clear_bad(pud)) { err = apply_to_pmd_range(mm, pud, addr, next, fn, data, - create); + create, mask); if (err) 
break; } @@ -2301,14 +2307,15 @@ static int apply_to_pud_range(struct mm_struct *mm, p4d_t *p4d, static int apply_to_p4d_range(struct mm_struct *mm, pgd_t *pgd, unsigned long addr, unsigned long end, - pte_fn_t fn, void *data, bool create) + pte_fn_t fn, void *data, bool create, + pgtbl_mod_mask *mask) { p4d_t *p4d; unsigned long next; int err = 0; if (create) { - p4d = p4d_alloc(mm, pgd, addr); + p4d = p4d_alloc_track(mm, pgd, addr, mask); if (!p4d) return -ENOMEM; } else { @@ -2318,7 +2325,7 @@ static int apply_to_p4d_range(struct mm_struct *mm, pgd_t *pgd, next = p4d_addr_end(addr, end); if (create || !p4d_none_or_clear_bad(p4d)) { err = apply_to_pud_range(mm, p4d, addr, next, fn, data, - create); + create, mask); if (err) break; } @@ -2331,8 +2338,9 @@ static int __apply_to_page_range(struct mm_struct *mm, unsigned long addr, void *data, bool create) { pgd_t *pgd; - unsigned long next; + unsigned long start = addr, next; unsigned long end = addr + size; + pgtbl_mod_mask mask = 0; int err = 0; if (WARN_ON(addr >= end)) @@ -2343,11 +2351,14 @@ static int __apply_to_page_range(struct mm_struct *mm, unsigned long addr, next = pgd_addr_end(addr, end); if (!create && pgd_none_or_clear_bad(pgd)) continue; - err = apply_to_p4d_range(mm, pgd, addr, next, fn, data, create); + err = apply_to_p4d_range(mm, pgd, addr, next, fn, data, create, &mask); if (err) break; } while (pgd++, addr = next, addr != end); + if (mask & ARCH_PAGE_TABLE_SYNC_MASK) + arch_sync_kernel_mappings(start, start + size); + return err; } -- cgit v1.2.3 From 7867fd7cc44e63c6673cd0f8fea155456d34d0de Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Fri, 4 Sep 2020 16:35:55 -0700 Subject: mm: madvise: fix vma user-after-free The syzbot reported the below use-after-free: BUG: KASAN: use-after-free in madvise_willneed mm/madvise.c:293 [inline] BUG: KASAN: use-after-free in madvise_vma mm/madvise.c:942 [inline] BUG: KASAN: use-after-free in do_madvise.part.0+0x1c8b/0x1cf0 mm/madvise.c:1145 Read of size 8 at addr ffff8880a6163eb0 by task syz-executor.0/9996 CPU: 0 PID: 9996 Comm: syz-executor.0 Not tainted 5.9.0-rc1-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x18f/0x20d lib/dump_stack.c:118 print_address_description.constprop.0.cold+0xae/0x497 mm/kasan/report.c:383 __kasan_report mm/kasan/report.c:513 [inline] kasan_report.cold+0x1f/0x37 mm/kasan/report.c:530 madvise_willneed mm/madvise.c:293 [inline] madvise_vma mm/madvise.c:942 [inline] do_madvise.part.0+0x1c8b/0x1cf0 mm/madvise.c:1145 do_madvise mm/madvise.c:1169 [inline] __do_sys_madvise mm/madvise.c:1171 [inline] __se_sys_madvise mm/madvise.c:1169 [inline] __x64_sys_madvise+0xd9/0x110 mm/madvise.c:1169 do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Allocated by task 9992: kmem_cache_alloc+0x138/0x3a0 mm/slab.c:3482 vm_area_alloc+0x1c/0x110 kernel/fork.c:347 mmap_region+0x8e5/0x1780 mm/mmap.c:1743 do_mmap+0xcf9/0x11d0 mm/mmap.c:1545 vm_mmap_pgoff+0x195/0x200 mm/util.c:506 ksys_mmap_pgoff+0x43a/0x560 mm/mmap.c:1596 do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Freed by task 9992: kmem_cache_free.part.0+0x67/0x1f0 mm/slab.c:3693 remove_vma+0x132/0x170 mm/mmap.c:184 remove_vma_list mm/mmap.c:2613 [inline] __do_munmap+0x743/0x1170 mm/mmap.c:2869 do_munmap mm/mmap.c:2877 [inline] mmap_region+0x257/0x1780 mm/mmap.c:1716 do_mmap+0xcf9/0x11d0 mm/mmap.c:1545 
vm_mmap_pgoff+0x195/0x200 mm/util.c:506 ksys_mmap_pgoff+0x43a/0x560 mm/mmap.c:1596 do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46 entry_SYSCALL_64_after_hwframe+0x44/0xa9 It is because vma is accessed after releasing mmap_lock, but someone else acquired the mmap_lock and the vma is gone. Releasing mmap_lock after accessing vma should fix the problem. Fixes: 692fe62433d4c ("mm: Handle MADV_WILLNEED through vfs_fadvise()") Reported-by: syzbot+b90df26038d1d5d85c97@syzkaller.appspotmail.com Signed-off-by: Yang Shi Signed-off-by: Andrew Morton Reviewed-by: Andrew Morton Reviewed-by: Jan Kara Cc: [5.4+] Link: https://lkml.kernel.org/r/20200816141204.162624-1-shy828301@gmail.com Signed-off-by: Linus Torvalds --- mm/madvise.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/madvise.c b/mm/madvise.c index dd1d43cf026d..d4aa5f776543 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -289,9 +289,9 @@ static long madvise_willneed(struct vm_area_struct *vma, */ *prev = NULL; /* tell sys_madvise we drop mmap_lock */ get_file(file); - mmap_read_unlock(current->mm); offset = (loff_t)(start - vma->vm_start) + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); + mmap_read_unlock(current->mm); vfs_fadvise(file, offset, end - start, POSIX_FADV_WILLNEED); fput(file); mmap_read_lock(current->mm); -- cgit v1.2.3 From ebdf8321eeeb623aed60f7ed16f7445363230118 Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Fri, 4 Sep 2020 16:35:58 -0700 Subject: mm/migrate: fixup setting UFFD_WP flag MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit f45ec5ff16a75 ("userfaultfd: wp: support swap and page migration") introduced support for tracking the uffd wp bit during page migration. However the non-swap PTE variant was used to set the flag for zone device private pages which are a type of swap page. This leads to corruption of the swap offset if the original PTE has the uffd_wp flag set. Fixes: f45ec5ff16a75 ("userfaultfd: wp: support swap and page migration") Signed-off-by: Alistair Popple Signed-off-by: Andrew Morton Reviewed-by: Peter Xu Cc: Jérôme Glisse Cc: John Hubbard Cc: Ralph Campbell Link: https://lkml.kernel.org/r/20200825064232.10023-1-alistair@popple.id.au Signed-off-by: Linus Torvalds --- mm/migrate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index 34a842a8eb6a..ddb64253fe3e 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -251,7 +251,7 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma, entry = make_device_private_entry(new, pte_write(pte)); pte = swp_entry_to_pte(entry); if (pte_swp_uffd_wp(*pvmw.pte)) - pte = pte_mkuffd_wp(pte); + pte = pte_swp_mkuffd_wp(pte); } } -- cgit v1.2.3 From ad7df764b7e1c7dc64e016da7ada2e3e1bb90700 Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Fri, 4 Sep 2020 16:36:01 -0700 Subject: mm/rmap: fixup copying of soft dirty and uffd ptes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit During memory migration a pte is temporarily replaced with a migration swap pte. Some pte bits from the existing mapping such as the soft-dirty and uffd write-protect bits are preserved by copying these to the temporary migration swap pte. However these bits are not stored at the same location for swap and non-swap ptes. Therefore testing these bits requires using the appropriate helper function for the given pte type. 
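For orientation, the two helper families and the selection that the fixes below converge on (a minimal sketch; the surrounding migration code is elided):

	if (pte_present(pte)) {		/* ordinary mapping: non-swap helpers */
		if (pte_soft_dirty(pte))
			swp_pte = pte_swp_mksoft_dirty(swp_pte);
		if (pte_uffd_wp(pte))
			swp_pte = pte_swp_mkuffd_wp(swp_pte);
	} else {			/* already a swap-format pte (migration, device private, ...) */
		if (pte_swp_soft_dirty(pte))
			swp_pte = pte_swp_mksoft_dirty(swp_pte);
		if (pte_swp_uffd_wp(pte))
			swp_pte = pte_swp_mkuffd_wp(swp_pte);
	}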
Unfortunately several code locations were found where the wrong helper function is being used to test soft_dirty and uffd_wp bits which leads to them getting incorrectly set or cleared during page-migration. Fix these by using the correct tests based on pte type. Fixes: a5430dda8a3a ("mm/migrate: support un-addressable ZONE_DEVICE page in migration") Fixes: 8c3328f1f36a ("mm/migrate: migrate_vma() unmap page from vma while collecting pages") Fixes: f45ec5ff16a7 ("userfaultfd: wp: support swap and page migration") Signed-off-by: Alistair Popple Signed-off-by: Andrew Morton Reviewed-by: Peter Xu Cc: Jérôme Glisse Cc: John Hubbard Cc: Ralph Campbell Cc: Alistair Popple Cc: Link: https://lkml.kernel.org/r/20200825064232.10023-2-alistair@popple.id.au Signed-off-by: Linus Torvalds --- mm/migrate.c | 15 +++++++++++---- mm/rmap.c | 9 +++++++-- 2 files changed, 18 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index ddb64253fe3e..12f63806d0ac 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -2427,10 +2427,17 @@ again: entry = make_migration_entry(page, mpfn & MIGRATE_PFN_WRITE); swp_pte = swp_entry_to_pte(entry); - if (pte_soft_dirty(pte)) - swp_pte = pte_swp_mksoft_dirty(swp_pte); - if (pte_uffd_wp(pte)) - swp_pte = pte_swp_mkuffd_wp(swp_pte); + if (pte_present(pte)) { + if (pte_soft_dirty(pte)) + swp_pte = pte_swp_mksoft_dirty(swp_pte); + if (pte_uffd_wp(pte)) + swp_pte = pte_swp_mkuffd_wp(swp_pte); + } else { + if (pte_swp_soft_dirty(pte)) + swp_pte = pte_swp_mksoft_dirty(swp_pte); + if (pte_swp_uffd_wp(pte)) + swp_pte = pte_swp_mkuffd_wp(swp_pte); + } set_pte_at(mm, addr, ptep, swp_pte); /* diff --git a/mm/rmap.c b/mm/rmap.c index 83cc459edc40..9425260774a1 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1511,9 +1511,14 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, */ entry = make_migration_entry(page, 0); swp_pte = swp_entry_to_pte(entry); - if (pte_soft_dirty(pteval)) + + /* + * pteval maps a zone device page and is therefore + * a swap pte. + */ + if (pte_swp_soft_dirty(pteval)) swp_pte = pte_swp_mksoft_dirty(swp_pte); - if (pte_uffd_wp(pteval)) + if (pte_swp_uffd_wp(pteval)) swp_pte = pte_swp_mkuffd_wp(swp_pte); set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte); /* -- cgit v1.2.3 From 6128763fc3244d1b4868e5f0aa401f7f987b5c4d Mon Sep 17 00:00:00 2001 From: Ralph Campbell Date: Fri, 4 Sep 2020 16:36:04 -0700 Subject: mm/migrate: remove unnecessary is_zone_device_page() check Patch series "mm/migrate: preserve soft dirty in remove_migration_pte()". I happened to notice this from code inspection after seeing Alistair Popple's patch ("mm/rmap: Fixup copying of soft dirty and uffd ptes"). This patch (of 2): The check for is_zone_device_page() and is_device_private_page() is unnecessary since the latter is sufficient to determine if the page is a device private page. Simplify the code for easier reading. 
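The outer check is redundant because the device-private test already implies the zone-device test. Roughly, and as an approximation rather than text taken from this patch, the helper in include/linux/mm.h of this era reads:

	static inline bool is_device_private_page(const struct page *page)
	{
		return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) &&
			IS_ENABLED(CONFIG_DEVICE_PRIVATE) &&
			is_zone_device_page(page) &&
			page->pgmap->type == MEMORY_DEVICE_PRIVATE;
	}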
Signed-off-by: Ralph Campbell Signed-off-by: Andrew Morton Reviewed-by: Christoph Hellwig Cc: Jerome Glisse Cc: Alistair Popple Cc: Christoph Hellwig Cc: Jason Gunthorpe Cc: Bharata B Rao Link: https://lkml.kernel.org/r/20200831212222.22409-1-rcampbell@nvidia.com Link: https://lkml.kernel.org/r/20200831212222.22409-2-rcampbell@nvidia.com Signed-off-by: Linus Torvalds --- mm/migrate.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index 12f63806d0ac..1d791d420725 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -246,13 +246,11 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma, else if (pte_swp_uffd_wp(*pvmw.pte)) pte = pte_mkuffd_wp(pte); - if (unlikely(is_zone_device_page(new))) { - if (is_device_private_page(new)) { - entry = make_device_private_entry(new, pte_write(pte)); - pte = swp_entry_to_pte(entry); - if (pte_swp_uffd_wp(*pvmw.pte)) - pte = pte_swp_mkuffd_wp(pte); - } + if (unlikely(is_device_private_page(new))) { + entry = make_device_private_entry(new, pte_write(pte)); + pte = swp_entry_to_pte(entry); + if (pte_swp_uffd_wp(*pvmw.pte)) + pte = pte_swp_mkuffd_wp(pte); } #ifdef CONFIG_HUGETLB_PAGE -- cgit v1.2.3 From 3d321bf82c4be8e33261754a5775bc65fc5d2184 Mon Sep 17 00:00:00 2001 From: Ralph Campbell Date: Fri, 4 Sep 2020 16:36:07 -0700 Subject: mm/migrate: preserve soft dirty in remove_migration_pte() The code to remove a migration PTE and replace it with a device private PTE was not copying the soft dirty bit from the migration entry. This could lead to page contents not being marked dirty when faulting the page back from device private memory. Signed-off-by: Ralph Campbell Signed-off-by: Andrew Morton Reviewed-by: Christoph Hellwig Cc: Jerome Glisse Cc: Alistair Popple Cc: Christoph Hellwig Cc: Jason Gunthorpe Cc: Bharata B Rao Link: https://lkml.kernel.org/r/20200831212222.22409-3-rcampbell@nvidia.com Signed-off-by: Linus Torvalds --- mm/migrate.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index 1d791d420725..941b89383cf3 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -249,6 +249,8 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma, if (unlikely(is_device_private_page(new))) { entry = make_device_private_entry(new, pte_write(pte)); pte = swp_entry_to_pte(entry); + if (pte_swp_soft_dirty(*pvmw.pte)) + pte = pte_swp_mksoft_dirty(pte); if (pte_swp_uffd_wp(*pvmw.pte)) pte = pte_swp_mkuffd_wp(pte); } -- cgit v1.2.3 From 953f064aa6b29debcc211869b60bd59f26d19c34 Mon Sep 17 00:00:00 2001 From: Li Xinhai Date: Fri, 4 Sep 2020 16:36:10 -0700 Subject: mm/hugetlb: try preferred node first when alloc gigantic page from cma Since commit cf11e85fc08c ("mm: hugetlb: optionally allocate gigantic hugepages using cma"), the gigantic page would be allocated from node which is not the preferred node, although there are pages available from that node. The reason is that the nid parameter has been ignored in alloc_gigantic_page(). Besides, the __GFP_THISNODE also need be checked if user required to alloc only from the preferred node. After this patch, the preferred node is tried first before other allowed nodes, and don't try to allocate from other nodes if __GFP_THISNODE is specified. If user don't specify the preferred node, the current node will be used as preferred node, which makes sure consistent behavior of allocating gigantic and non-gigantic hugetlb page. 
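Restated as a straight-line sketch (declarations elided, "order" standing in for huge_page_order(h)), the allocation order after this patch is: the local/preferred node first, then, only when __GFP_THISNODE is not set, the remaining nodes of the mask:

	if (nid == NUMA_NO_NODE)
		nid = numa_mem_id();			/* default to the local node */
	page = hugetlb_cma[nid] ?
		cma_alloc(hugetlb_cma[nid], nr_pages, order, true) : NULL;
	if (!page && !(gfp_mask & __GFP_THISNODE)) {	/* spill over only if the caller allows it */
		for_each_node_mask(node, *nodemask) {
			if (node == nid || !hugetlb_cma[node])
				continue;
			page = cma_alloc(hugetlb_cma[node], nr_pages, order, true);
			if (page)
				break;
		}
	}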
Fixes: cf11e85fc08c ("mm: hugetlb: optionally allocate gigantic hugepages using cma") Signed-off-by: Li Xinhai Signed-off-by: Andrew Morton Reviewed-by: Mike Kravetz Acked-by: Michal Hocko Cc: Roman Gushchin Link: https://lkml.kernel.org/r/20200902025016.697260-1-lixinhai.lxh@gmail.com Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index a301c2d672bf..5957dc80ebb1 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1250,21 +1250,32 @@ static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nodemask) { unsigned long nr_pages = 1UL << huge_page_order(h); + if (nid == NUMA_NO_NODE) + nid = numa_mem_id(); #ifdef CONFIG_CMA { struct page *page; int node; - for_each_node_mask(node, *nodemask) { - if (!hugetlb_cma[node]) - continue; - - page = cma_alloc(hugetlb_cma[node], nr_pages, - huge_page_order(h), true); + if (hugetlb_cma[nid]) { + page = cma_alloc(hugetlb_cma[nid], nr_pages, + huge_page_order(h), true); if (page) return page; } + + if (!(gfp_mask & __GFP_THISNODE)) { + for_each_node_mask(node, *nodemask) { + if (node == nid || !hugetlb_cma[node]) + continue; + + page = cma_alloc(hugetlb_cma[node], nr_pages, + huge_page_order(h), true); + if (page) + return page; + } + } } #endif -- cgit v1.2.3 From 17743798d81238ab13050e8e2833699b54e15467 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Fri, 4 Sep 2020 16:36:13 -0700 Subject: mm/hugetlb: fix a race between hugetlb sysctl handlers There is a race between the assignment of `table->data` and write value to the pointer of `table->data` in the __do_proc_doulongvec_minmax() on the other thread. CPU0: CPU1: proc_sys_write hugetlb_sysctl_handler proc_sys_call_handler hugetlb_sysctl_handler_common hugetlb_sysctl_handler table->data = &tmp; hugetlb_sysctl_handler_common table->data = &tmp; proc_doulongvec_minmax do_proc_doulongvec_minmax sysctl_head_finish __do_proc_doulongvec_minmax unuse_table i = table->data; *i = val; // corrupt CPU1's stack Fix this by duplicating the `table`, and only update the duplicate of it. And introduce a helper of proc_hugetlb_doulongvec_minmax() to simplify the code. The following oops was seen: BUG: kernel NULL pointer dereference, address: 0000000000000000 #PF: supervisor instruction fetch in kernel mode #PF: error_code(0x0010) - not-present page Code: Bad RIP value. ... Call Trace: ? set_max_huge_pages+0x3da/0x4f0 ? alloc_pool_huge_page+0x150/0x150 ? proc_doulongvec_minmax+0x46/0x60 ? hugetlb_sysctl_handler_common+0x1c7/0x200 ? nr_hugepages_store+0x20/0x20 ? copy_fd_bitmaps+0x170/0x170 ? hugetlb_sysctl_handler+0x1e/0x20 ? proc_sys_call_handler+0x2f1/0x300 ? unregister_sysctl_table+0xb0/0xb0 ? __fd_install+0x78/0x100 ? proc_sys_write+0x14/0x20 ? __vfs_write+0x4d/0x90 ? vfs_write+0xef/0x240 ? ksys_write+0xc0/0x160 ? __ia32_sys_read+0x50/0x50 ? __close_fd+0x129/0x150 ? __x64_sys_write+0x43/0x50 ? do_syscall_64+0x6c/0x200 ? 
entry_SYSCALL_64_after_hwframe+0x44/0xa9 Fixes: e5ff215941d5 ("hugetlb: multiple hstates for multiple page sizes") Signed-off-by: Muchun Song Signed-off-by: Andrew Morton Reviewed-by: Mike Kravetz Cc: Andi Kleen Link: http://lkml.kernel.org/r/20200828031146.43035-1-songmuchun@bytedance.com Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 5957dc80ebb1..67fc6383995b 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3465,6 +3465,22 @@ static unsigned int allowed_mems_nr(struct hstate *h) } #ifdef CONFIG_SYSCTL +static int proc_hugetlb_doulongvec_minmax(struct ctl_table *table, int write, + void *buffer, size_t *length, + loff_t *ppos, unsigned long *out) +{ + struct ctl_table dup_table; + + /* + * In order to avoid races with __do_proc_doulongvec_minmax(), we + * can duplicate the @table and alter the duplicate of it. + */ + dup_table = *table; + dup_table.data = out; + + return proc_doulongvec_minmax(&dup_table, write, buffer, length, ppos); +} + static int hugetlb_sysctl_handler_common(bool obey_mempolicy, struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos) @@ -3476,9 +3492,8 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy, if (!hugepages_supported()) return -EOPNOTSUPP; - table->data = &tmp; - table->maxlen = sizeof(unsigned long); - ret = proc_doulongvec_minmax(table, write, buffer, length, ppos); + ret = proc_hugetlb_doulongvec_minmax(table, write, buffer, length, ppos, + &tmp); if (ret) goto out; @@ -3521,9 +3536,8 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write, if (write && hstate_is_gigantic(h)) return -EINVAL; - table->data = &tmp; - table->maxlen = sizeof(unsigned long); - ret = proc_doulongvec_minmax(table, write, buffer, length, ppos); + ret = proc_hugetlb_doulongvec_minmax(table, write, buffer, length, ppos, + &tmp); if (ret) goto out; -- cgit v1.2.3 From e5a59d308f52bb0052af5790c22173651b187465 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 4 Sep 2020 16:36:16 -0700 Subject: mm/khugepaged.c: fix khugepaged's request size in collapse_file collapse_file() in khugepaged passes PAGE_SIZE as the number of pages to be read to page_cache_sync_readahead(). The intent was probably to read a single page. Fix it to use the number of pages to the end of the window instead. 
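The confusion is one of units: the last argument of page_cache_sync_readahead() is a number of pages, so passing PAGE_SIZE asked for 4096 pages instead of one. A sketch of the corrected call as used by collapse_file() (the constant assumes x86-64 with 4K pages):

	pgoff_t end = start + HPAGE_PMD_NR;	/* one past the last subpage of the THP window */

	page_cache_sync_readahead(mapping, &file->f_ra, file, index,
				  end - index);	/* a page count: at most HPAGE_PMD_NR (512), not PAGE_SIZE */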
Fixes: 99cb0dbd47a1 ("mm,thp: add read-only THP support for (non-shmem) FS") Signed-off-by: David Howells Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Reviewed-by: Matthew Wilcox (Oracle) Acked-by: Song Liu Acked-by: Yang Shi Acked-by: Pankaj Gupta Cc: Eric Biggers Link: https://lkml.kernel.org/r/20200903140844.14194-2-willy@infradead.org Signed-off-by: Linus Torvalds --- mm/khugepaged.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/khugepaged.c b/mm/khugepaged.c index e749e568e1ea..cfa0dba5fd3b 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1709,7 +1709,7 @@ static void collapse_file(struct mm_struct *mm, xas_unlock_irq(&xas); page_cache_sync_readahead(mapping, &file->f_ra, file, index, - PAGE_SIZE); + end - index); /* drain pagevecs to help isolate_lru_page() */ lru_add_drain(); page = find_lock_page(mapping, index); -- cgit v1.2.3 From 5ef64cc8987a9211d3f3667331ba3411a94ddc79 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 13 Sep 2020 14:05:35 -0700 Subject: mm: allow a controlled amount of unfairness in the page lock Commit 2a9127fcf229 ("mm: rewrite wait_on_page_bit_common() logic") made the page locking entirely fair, in that if a waiter came in while the lock was held, the lock would be transferred to the lockers strictly in order. That was intended to finally get rid of the long-reported watchdog failures that involved the page lock under extreme load, where a process could end up waiting essentially forever, as other page lockers stole the lock from under it. It also improved some benchmarks, but it ended up causing huge performance regressions on others, simply because fair lock behavior doesn't end up giving out the lock as aggressively, causing better worst-case latency, but potentially much worse average latencies and throughput. Instead of reverting that change entirely, this introduces a controlled amount of unfairness, with a sysctl knob to tune it if somebody needs to. But the default value should hopefully be good for any normal load, allowing a few rounds of lock stealing, but enforcing the strict ordering before the lock has been stolen too many times. There is also a hint from Matthieu Baerts that the fair page coloring may end up exposing an ABBA deadlock that is hidden by the usual optimistic lock stealing, and while the unfairness doesn't fix the fundamental issue (and I'm still looking at that), it avoids it in practice. The amount of unfairness can be modified by writing a new value to the 'sysctl_page_lock_unfairness' variable (default value of 5, exposed through /proc/sys/vm/page_lock_unfairness), but that is hopefully something we'd use mainly for debugging rather than being necessary for any deep system tuning. This whole issue has exposed just how critical the page lock can be, and how contended it gets under certain locks. And the main contention doesn't really seem to be anything related to IO (which was the origin of this lock), but for things like just verifying that the page file mapping is stable while faulting in the page into a page table. 
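The policy itself is small. Paraphrasing the core of the wait_on_page_bit_common() change from the diff below: an exclusive waiter re-arms itself a bounded number of times, and once that budget is exhausted it sets WQ_FLAG_CUSTOM to demand that the waker hand the lock over directly:

	int unfairness = sysctl_page_lock_unfairness;	/* default 5, /proc/sys/vm/page_lock_unfairness */
repeat:
	wait->flags = 0;
	if (behavior == EXCLUSIVE) {
		wait->flags = WQ_FLAG_EXCLUSIVE;
		if (--unfairness < 0)			/* stolen too often: insist on a fair handoff */
			wait->flags |= WQ_FLAG_CUSTOM;
	}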
Link: https://lore.kernel.org/linux-fsdevel/ed8442fd-6f54-dd84-cd4a-941e8b7ee603@MichaelLarabel.com/ Link: https://www.phoronix.com/scan.php?page=article&item=linux-50-59&num=1 Link: https://lore.kernel.org/linux-fsdevel/c560a38d-8313-51fb-b1ec-e904bd8836bc@tessares.net/ Reported-and-tested-by: Michael Larabel Tested-by: Matthieu Baerts Cc: Dave Chinner Cc: Matthew Wilcox Cc: Chris Mason Cc: Jan Kara Cc: Amir Goldstein Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 + include/linux/wait.h | 1 + kernel/sysctl.c | 8 +++ mm/filemap.c | 160 +++++++++++++++++++++++++++++++++++++++++---------- 4 files changed, 140 insertions(+), 31 deletions(-) (limited to 'mm') diff --git a/include/linux/mm.h b/include/linux/mm.h index ca6e6a81576b..b2f370f0b420 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -41,6 +41,8 @@ struct writeback_control; struct bdi_writeback; struct pt_regs; +extern int sysctl_page_lock_unfairness; + void init_mm_internals(void); #ifndef CONFIG_NEED_MULTIPLE_NODES /* Don't use mapnrs, do it properly */ diff --git a/include/linux/wait.h b/include/linux/wait.h index 898c890fc153..27fb99cfeb02 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -21,6 +21,7 @@ int default_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int #define WQ_FLAG_WOKEN 0x02 #define WQ_FLAG_BOOKMARK 0x04 #define WQ_FLAG_CUSTOM 0x08 +#define WQ_FLAG_DONE 0x10 /* * A single wait-queue entry structure: diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 09e70ee2332e..afad085960b8 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2912,6 +2912,14 @@ static struct ctl_table vm_table[] = { .proc_handler = percpu_pagelist_fraction_sysctl_handler, .extra1 = SYSCTL_ZERO, }, + { + .procname = "page_lock_unfairness", + .data = &sysctl_page_lock_unfairness, + .maxlen = sizeof(sysctl_page_lock_unfairness), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + }, #ifdef CONFIG_MMU { .procname = "max_map_count", diff --git a/mm/filemap.c b/mm/filemap.c index 1aaea26556cc..6aa08e7714ce 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -988,9 +988,43 @@ void __init pagecache_init(void) page_writeback_init(); } +/* + * The page wait code treats the "wait->flags" somewhat unusually, because + * we have multiple different kinds of waits, not just he usual "exclusive" + * one. + * + * We have: + * + * (a) no special bits set: + * + * We're just waiting for the bit to be released, and when a waker + * calls the wakeup function, we set WQ_FLAG_WOKEN and wake it up, + * and remove it from the wait queue. + * + * Simple and straightforward. + * + * (b) WQ_FLAG_EXCLUSIVE: + * + * The waiter is waiting to get the lock, and only one waiter should + * be woken up to avoid any thundering herd behavior. We'll set the + * WQ_FLAG_WOKEN bit, wake it up, and remove it from the wait queue. + * + * This is the traditional exclusive wait. + * + * (b) WQ_FLAG_EXCLUSIVE | WQ_FLAG_CUSTOM: + * + * The waiter is waiting to get the bit, and additionally wants the + * lock to be transferred to it for fair lock behavior. If the lock + * cannot be taken, we stop walking the wait queue without waking + * the waiter. + * + * This is the "fair lock handoff" case, and in addition to setting + * WQ_FLAG_WOKEN, we set WQ_FLAG_DONE to let the waiter easily see + * that it now has the lock. 
+ */ static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *arg) { - int ret; + unsigned int flags; struct wait_page_key *key = arg; struct wait_page_queue *wait_page = container_of(wait, struct wait_page_queue, wait); @@ -999,35 +1033,44 @@ static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync, return 0; /* - * If it's an exclusive wait, we get the bit for it, and - * stop walking if we can't. - * - * If it's a non-exclusive wait, then the fact that this - * wake function was called means that the bit already - * was cleared, and we don't care if somebody then - * re-took it. + * If it's a lock handoff wait, we get the bit for it, and + * stop walking (and do not wake it up) if we can't. */ - ret = 0; - if (wait->flags & WQ_FLAG_EXCLUSIVE) { - if (test_and_set_bit(key->bit_nr, &key->page->flags)) + flags = wait->flags; + if (flags & WQ_FLAG_EXCLUSIVE) { + if (test_bit(key->bit_nr, &key->page->flags)) return -1; - ret = 1; + if (flags & WQ_FLAG_CUSTOM) { + if (test_and_set_bit(key->bit_nr, &key->page->flags)) + return -1; + flags |= WQ_FLAG_DONE; + } } - wait->flags |= WQ_FLAG_WOKEN; + /* + * We are holding the wait-queue lock, but the waiter that + * is waiting for this will be checking the flags without + * any locking. + * + * So update the flags atomically, and wake up the waiter + * afterwards to avoid any races. This store-release pairs + * with the load-acquire in wait_on_page_bit_common(). + */ + smp_store_release(&wait->flags, flags | WQ_FLAG_WOKEN); wake_up_state(wait->private, mode); /* * Ok, we have successfully done what we're waiting for, * and we can unconditionally remove the wait entry. * - * Note that this has to be the absolute last thing we do, - * since after list_del_init(&wait->entry) the wait entry + * Note that this pairs with the "finish_wait()" in the + * waiter, and has to be the absolute last thing we do. + * After this list_del_init(&wait->entry) the wait entry * might be de-allocated and the process might even have * exited. */ list_del_init_careful(&wait->entry); - return ret; + return (flags & WQ_FLAG_EXCLUSIVE) != 0; } static void wake_up_page_bit(struct page *page, int bit_nr) @@ -1107,8 +1150,8 @@ enum behavior { }; /* - * Attempt to check (or get) the page bit, and mark the - * waiter woken if successful. + * Attempt to check (or get) the page bit, and mark us done + * if successful. */ static inline bool trylock_page_bit_common(struct page *page, int bit_nr, struct wait_queue_entry *wait) @@ -1119,13 +1162,17 @@ static inline bool trylock_page_bit_common(struct page *page, int bit_nr, } else if (test_bit(bit_nr, &page->flags)) return false; - wait->flags |= WQ_FLAG_WOKEN; + wait->flags |= WQ_FLAG_WOKEN | WQ_FLAG_DONE; return true; } +/* How many times do we accept lock stealing from under a waiter? */ +int sysctl_page_lock_unfairness = 5; + static inline int wait_on_page_bit_common(wait_queue_head_t *q, struct page *page, int bit_nr, int state, enum behavior behavior) { + int unfairness = sysctl_page_lock_unfairness; struct wait_page_queue wait_page; wait_queue_entry_t *wait = &wait_page.wait; bool thrashing = false; @@ -1143,11 +1190,18 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q, } init_wait(wait); - wait->flags = behavior == EXCLUSIVE ? 
WQ_FLAG_EXCLUSIVE : 0; wait->func = wake_page_function; wait_page.page = page; wait_page.bit_nr = bit_nr; +repeat: + wait->flags = 0; + if (behavior == EXCLUSIVE) { + wait->flags = WQ_FLAG_EXCLUSIVE; + if (--unfairness < 0) + wait->flags |= WQ_FLAG_CUSTOM; + } + /* * Do one last check whether we can get the * page bit synchronously. @@ -1170,27 +1224,63 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q, /* * From now on, all the logic will be based on - * the WQ_FLAG_WOKEN flag, and the and the page - * bit testing (and setting) will be - or has - * already been - done by the wake function. + * the WQ_FLAG_WOKEN and WQ_FLAG_DONE flag, to + * see whether the page bit testing has already + * been done by the wake function. * * We can drop our reference to the page. */ if (behavior == DROP) put_page(page); + /* + * Note that until the "finish_wait()", or until + * we see the WQ_FLAG_WOKEN flag, we need to + * be very careful with the 'wait->flags', because + * we may race with a waker that sets them. + */ for (;;) { + unsigned int flags; + set_current_state(state); - if (signal_pending_state(state, current)) + /* Loop until we've been woken or interrupted */ + flags = smp_load_acquire(&wait->flags); + if (!(flags & WQ_FLAG_WOKEN)) { + if (signal_pending_state(state, current)) + break; + + io_schedule(); + continue; + } + + /* If we were non-exclusive, we're done */ + if (behavior != EXCLUSIVE) break; - if (wait->flags & WQ_FLAG_WOKEN) + /* If the waker got the lock for us, we're done */ + if (flags & WQ_FLAG_DONE) break; - io_schedule(); + /* + * Otherwise, if we're getting the lock, we need to + * try to get it ourselves. + * + * And if that fails, we'll have to retry this all. + */ + if (unlikely(test_and_set_bit(bit_nr, &page->flags))) + goto repeat; + + wait->flags |= WQ_FLAG_DONE; + break; } + /* + * If a signal happened, this 'finish_wait()' may remove the last + * waiter from the wait-queues, but the PageWaiters bit will remain + * set. That's ok. The next wakeup will take care of it, and trying + * to do it here would be difficult and prone to races. + */ finish_wait(q, wait); if (thrashing) { @@ -1200,12 +1290,20 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q, } /* - * A signal could leave PageWaiters set. Clearing it here if - * !waitqueue_active would be possible (by open-coding finish_wait), - * but still fail to catch it in the case of wait hash collision. We - * already can fail to clear wait hash collision cases, so don't - * bother with signals either. + * NOTE! The wait->flags weren't stable until we've done the + * 'finish_wait()', and we could have exited the loop above due + * to a signal, and had a wakeup event happen after the signal + * test but before the 'finish_wait()'. + * + * So only after the finish_wait() can we reliably determine + * if we got woken up or not, so we can now figure out the final + * return value based on that state without races. + * + * Also note that WQ_FLAG_WOKEN is sufficient for a non-exclusive + * waiter, but an exclusive one requires WQ_FLAG_DONE. */ + if (behavior == EXCLUSIVE) + return wait->flags & WQ_FLAG_DONE ? 0 : -EINTR; return wait->flags & WQ_FLAG_WOKEN ? 0 : -EINTR; } -- cgit v1.2.3 From b3b33d3c43bbe0177d70653f4e889c78cc37f097 Mon Sep 17 00:00:00 2001 From: Sunghyun Jin Date: Thu, 3 Sep 2020 21:41:16 +0900 Subject: percpu: fix first chunk size calculation for populated bitmap Variable populated, which is a member of struct pcpu_chunk, is used as a unit of size of unsigned long. 
However, size of populated is miscounted. So, I fix this minor part. Fixes: 8ab16c43ea79 ("percpu: change the number of pages marked in the first_chunk pop bitmap") Cc: # 4.14+ Signed-off-by: Sunghyun Jin Signed-off-by: Dennis Zhou --- mm/percpu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/percpu.c b/mm/percpu.c index f4709629e6de..1ed1a349eab8 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1316,7 +1316,7 @@ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr, /* allocate chunk */ alloc_size = sizeof(struct pcpu_chunk) + - BITS_TO_LONGS(region_size >> PAGE_SHIFT); + BITS_TO_LONGS(region_size >> PAGE_SHIFT) * sizeof(unsigned long); chunk = memblock_alloc(alloc_size, SMP_CACHE_BYTES); if (!chunk) panic("%s: Failed to allocate %zu bytes\n", __func__, -- cgit v1.2.3 From 62fdb1632bcbed30c40f6bd2b58297617e442658 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Fri, 18 Sep 2020 21:20:03 -0700 Subject: ksm: reinstate memcg charge on copied pages Patch series "mm: fixes to past from future testing". Here's a set of independent fixes against 5.9-rc2: prompted by testing Alex Shi's "warning on !memcg" and lru_lock series, but I think fit for 5.9 - though maybe only the first for stable. This patch (of 5): In 5.8 some instances of memcg charging in do_swap_page() and unuse_pte() were removed, on the understanding that swap cache is now already charged at those points; but a case was missed, when ksm_might_need_to_copy() has decided it must allocate a substitute page: such pages were never charged. Fix it inside ksm_might_need_to_copy(). This was discovered by Alex Shi's prospective commit "mm/memcg: warning on !memcg after readahead page charged". But there is a another surprise: this also fixes some rarer uncharged PageAnon cases, when KSM is configured in, but has never been activated. ksm_might_need_to_copy()'s anon_vma->root and linear_page_index() check sometimes catches a case which would need to have been copied if KSM were turned on. Or that's my optimistic interpretation (of my own old code), but it leaves some doubt as to whether everything is working as intended there - might it hint at rare anon ptes which rmap cannot find? A question not easily answered: put in the fix for missed memcg charges. 
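To show why the charge has to happen inside the helper, a sketch of the caller's side in do_swap_page() (surrounding code elided; the comment is an annotation, not kernel text): since 4c6355b25e8b the swapin path assumes any page it gets back is already charged, including a fresh KSM copy:

	page = ksm_might_need_to_copy(page, vma, vmf->address);
	/* may return a newly allocated copy; after this fix that copy has
	 * already been charged via mem_cgroup_charge(), like the swap-cache
	 * page it replaces */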
Cc; Matthew Wilcox Fixes: 4c6355b25e8b ("mm: memcontrol: charge swapin pages on instantiation") Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Reviewed-by: Shakeel Butt Acked-by: Johannes Weiner Cc: Alex Shi Cc: Michal Hocko Cc: Mike Kravetz Cc: Qian Cai Cc: [5.8] Link: http://lkml.kernel.org/r/alpine.LSU.2.11.2008301343270.5954@eggly.anvils Link: http://lkml.kernel.org/r/alpine.LSU.2.11.2008301358020.5954@eggly.anvils Signed-off-by: Linus Torvalds --- mm/ksm.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'mm') diff --git a/mm/ksm.c b/mm/ksm.c index 235f55d01541..9afccc36dbd2 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -2586,6 +2586,10 @@ struct page *ksm_might_need_to_copy(struct page *page, return page; /* let do_swap_page report the error */ new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); + if (new_page && mem_cgroup_charge(new_page, vma->vm_mm, GFP_KERNEL)) { + put_page(new_page); + new_page = NULL; + } if (new_page) { copy_user_highpage(new_page, page, address, vma); -- cgit v1.2.3 From a333e3e73b6648d3bd3ef6b971a59a6363bfcfc5 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Fri, 18 Sep 2020 21:20:06 -0700 Subject: mm: migration of hugetlbfs page skip memcg hugetlbfs pages do not participate in memcg: so although they do find most of migrate_page_states() useful, it would be better if they did not call into mem_cgroup_migrate() - where Qian Cai reported that LTP's move_pages12 triggers the warning in Alex Shi's prospective commit "mm/memcg: warning on !memcg after readahead page charged". Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Reviewed-by: Shakeel Butt Acked-by: Johannes Weiner Cc: Alex Shi Cc: Michal Hocko Cc: Mike Kravetz Cc: Qian Cai Link: http://lkml.kernel.org/r/alpine.LSU.2.11.2008301359460.5954@eggly.anvils Signed-off-by: Linus Torvalds --- mm/migrate.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index 941b89383cf3..aecb1433cf3c 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -668,7 +668,8 @@ void migrate_page_states(struct page *newpage, struct page *page) copy_page_owner(page, newpage); - mem_cgroup_migrate(page, newpage); + if (!PageHuge(page)) + mem_cgroup_migrate(page, newpage); } EXPORT_SYMBOL(migrate_page_states); -- cgit v1.2.3 From 8d8869ca5d2d9d86db96271ab063fdcfa9baf5b4 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Fri, 18 Sep 2020 21:20:12 -0700 Subject: mm: fix check_move_unevictable_pages() on THP check_move_unevictable_pages() is used in making unevictable shmem pages evictable: by shmem_unlock_mapping(), drm_gem_check_release_pagevec() and i915/gem check_release_pagevec(). Those may pass down subpages of a huge page, when /sys/kernel/mm/transparent_hugepage/shmem_enabled is "force". That does not crash or warn at present, but the accounting of vmstats unevictable_pgs_scanned and unevictable_pgs_rescued is inconsistent: scanned being incremented on each subpage, rescued only on the head (since tails already appear evictable once the head has been updated). 5.8 commit 5d91f31faf8e ("mm: swap: fix vmstats for huge page") has established that vm_events in general (and unevictable_pgs_rescued in particular) should count every subpage: so follow that precedent here. Do this in such a way that if mem_cgroup_page_lruvec() is made stricter (to check page->mem_cgroup is always set), no problem: skip the tails before calling it, and add thp_nr_pages() to vmstats on the head. 
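Restating the counting convention with the constants spelled out (a sketch that mirrors the hunk below):

	if (PageTransTail(page))	/* tails are accounted via their head */
		continue;
	nr_pages = thp_nr_pages(page);	/* 1 for a small page, HPAGE_PMD_NR (512 on x86-64/4K) for a THP head */
	pgscanned += nr_pages;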
Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Reviewed-by: Shakeel Butt Acked-by: Yang Shi Cc: Johannes Weiner Cc: Michal Hocko Cc: Mike Kravetz Cc: Qian Cai Link: http://lkml.kernel.org/r/alpine.LSU.2.11.2008301405000.5954@eggly.anvils Signed-off-by: Linus Torvalds --- mm/vmscan.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 9727dd8e2581..466fc3144fff 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -4268,8 +4268,14 @@ void check_move_unevictable_pages(struct pagevec *pvec) for (i = 0; i < pvec->nr; i++) { struct page *page = pvec->pages[i]; struct pglist_data *pagepgdat = page_pgdat(page); + int nr_pages; + + if (PageTransTail(page)) + continue; + + nr_pages = thp_nr_pages(page); + pgscanned += nr_pages; - pgscanned++; if (pagepgdat != pgdat) { if (pgdat) spin_unlock_irq(&pgdat->lru_lock); @@ -4288,7 +4294,7 @@ void check_move_unevictable_pages(struct pagevec *pvec) ClearPageUnevictable(page); del_page_from_lru_list(page, lruvec, LRU_UNEVICTABLE); add_page_to_lru_list(page, lruvec, lru); - pgrescued++; + pgrescued += nr_pages; } } -- cgit v1.2.3 From 0964730bf46b4e271c5ecad5badbbd95737c087b Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Fri, 18 Sep 2020 21:20:15 -0700 Subject: mlock: fix unevictable_pgs event counts on THP 5.8 commit 5d91f31faf8e ("mm: swap: fix vmstats for huge page") has established that vm_events should count every subpage of a THP, including unevictable_pgs_culled and unevictable_pgs_rescued; but lru_cache_add_inactive_or_unevictable() was not doing so for unevictable_pgs_mlocked, and mm/mlock.c was not doing so for unevictable_pgs mlocked, munlocked, cleared and stranded. Fix them; but THPs don't go the pagevec way in mlock.c, so no fixes needed on that path. Fixes: 5d91f31faf8e ("mm: swap: fix vmstats for huge page") Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Reviewed-by: Shakeel Butt Acked-by: Yang Shi Cc: Alex Shi Cc: Johannes Weiner Cc: Michal Hocko Cc: Mike Kravetz Cc: Qian Cai Link: http://lkml.kernel.org/r/alpine.LSU.2.11.2008301408230.5954@eggly.anvils Signed-off-by: Linus Torvalds --- mm/mlock.c | 24 +++++++++++++++--------- mm/swap.c | 6 +++--- 2 files changed, 18 insertions(+), 12 deletions(-) (limited to 'mm') diff --git a/mm/mlock.c b/mm/mlock.c index 93ca2bf30b4f..884b1216da6a 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -58,11 +58,14 @@ EXPORT_SYMBOL(can_do_mlock); */ void clear_page_mlock(struct page *page) { + int nr_pages; + if (!TestClearPageMlocked(page)) return; - mod_zone_page_state(page_zone(page), NR_MLOCK, -thp_nr_pages(page)); - count_vm_event(UNEVICTABLE_PGCLEARED); + nr_pages = thp_nr_pages(page); + mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages); + count_vm_events(UNEVICTABLE_PGCLEARED, nr_pages); /* * The previous TestClearPageMlocked() corresponds to the smp_mb() * in __pagevec_lru_add_fn(). @@ -76,7 +79,7 @@ void clear_page_mlock(struct page *page) * We lost the race. the page already moved to evictable list. 
*/ if (PageUnevictable(page)) - count_vm_event(UNEVICTABLE_PGSTRANDED); + count_vm_events(UNEVICTABLE_PGSTRANDED, nr_pages); } } @@ -93,9 +96,10 @@ void mlock_vma_page(struct page *page) VM_BUG_ON_PAGE(PageCompound(page) && PageDoubleMap(page), page); if (!TestSetPageMlocked(page)) { - mod_zone_page_state(page_zone(page), NR_MLOCK, - thp_nr_pages(page)); - count_vm_event(UNEVICTABLE_PGMLOCKED); + int nr_pages = thp_nr_pages(page); + + mod_zone_page_state(page_zone(page), NR_MLOCK, nr_pages); + count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages); if (!isolate_lru_page(page)) putback_lru_page(page); } @@ -138,7 +142,7 @@ static void __munlock_isolated_page(struct page *page) /* Did try_to_unlock() succeed or punt? */ if (!PageMlocked(page)) - count_vm_event(UNEVICTABLE_PGMUNLOCKED); + count_vm_events(UNEVICTABLE_PGMUNLOCKED, thp_nr_pages(page)); putback_lru_page(page); } @@ -154,10 +158,12 @@ static void __munlock_isolated_page(struct page *page) */ static void __munlock_isolation_failed(struct page *page) { + int nr_pages = thp_nr_pages(page); + if (PageUnevictable(page)) - __count_vm_event(UNEVICTABLE_PGSTRANDED); + __count_vm_events(UNEVICTABLE_PGSTRANDED, nr_pages); else - __count_vm_event(UNEVICTABLE_PGMUNLOCKED); + __count_vm_events(UNEVICTABLE_PGMUNLOCKED, nr_pages); } /** diff --git a/mm/swap.c b/mm/swap.c index d16d65d9b4e0..e7bdf094f76a 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -494,14 +494,14 @@ void lru_cache_add_inactive_or_unevictable(struct page *page, unevictable = (vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) == VM_LOCKED; if (unlikely(unevictable) && !TestSetPageMlocked(page)) { + int nr_pages = thp_nr_pages(page); /* * We use the irq-unsafe __mod_zone_page_stat because this * counter is not modified from interrupt context, and the pte * lock is held(spinlock), which implies preemption disabled. */ - __mod_zone_page_state(page_zone(page), NR_MLOCK, - thp_nr_pages(page)); - count_vm_event(UNEVICTABLE_PGMLOCKED); + __mod_zone_page_state(page_zone(page), NR_MLOCK, nr_pages); + count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages); } lru_cache_add(page); } -- cgit v1.2.3 From bb3e96d63eb75a2f4ff790b089f6b93614c729a1 Mon Sep 17 00:00:00 2001 From: Byron Stanoszek Date: Fri, 18 Sep 2020 21:20:18 -0700 Subject: tmpfs: restore functionality of nr_inodes=0 Commit e809d5f0b5c9 ("tmpfs: per-superblock i_ino support") made changes to shmem_reserve_inode() in mm/shmem.c, however the original test for (sbinfo->max_inodes) got dropped. This causes mounting tmpfs with option nr_inodes=0 to fail: # mount -ttmpfs -onr_inodes=0 none /ext0 mount: /ext0: mount(2) system call failed: Cannot allocate memory. This patch restores the nr_inodes=0 functionality. 
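The restored semantics are simply that max_inodes == 0 means no limit and no accounting. As a rough userspace model of that guard (struct sb_limits and reserve_inode() are illustrative names, not the kernel's), it behaves as follows:

#include <errno.h>
#include <stdio.h>

struct sb_limits {
        unsigned long max_inodes;  /* 0 means "no limit configured" */
        unsigned long free_inodes; /* only meaningful when max_inodes != 0 */
};

/* Enforce the inode limit only when one was actually set. */
static int reserve_inode(struct sb_limits *s)
{
        if (s->max_inodes) {
                if (!s->free_inodes)
                        return -ENOSPC;
                s->free_inodes--;
        }
        return 0;
}

int main(void)
{
        struct sb_limits unlimited = { .max_inodes = 0 };
        struct sb_limits tiny = { .max_inodes = 1, .free_inodes = 1 };

        /* Unlimited: every reservation succeeds, nothing is accounted. */
        printf("unlimited: %d\n", reserve_inode(&unlimited));
        printf("unlimited: %d\n", reserve_inode(&unlimited));

        /* Limited to one inode: the second reservation fails. */
        printf("tiny:      %d\n", reserve_inode(&tiny));
        printf("tiny:      %d\n", reserve_inode(&tiny));
        return 0;
}

The unlimited case always returns 0, while the one-inode case returns 0 and then -ENOSPC, broadly mirroring how shmem_reserve_inode() behaves after the fix shown in the diff below.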
Fixes: e809d5f0b5c9 ("tmpfs: per-superblock i_ino support") Signed-off-by: Byron Stanoszek Signed-off-by: Andrew Morton Acked-by: Hugh Dickins Acked-by: Chris Down Link: https://lkml.kernel.org/r/20200902035715.16414-1-gandalf@winds.org Signed-off-by: Linus Torvalds --- mm/shmem.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index 271548ca20f3..8e2b35ba93ad 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -279,11 +279,13 @@ static int shmem_reserve_inode(struct super_block *sb, ino_t *inop) if (!(sb->s_flags & SB_KERNMOUNT)) { spin_lock(&sbinfo->stat_lock); - if (!sbinfo->free_inodes) { - spin_unlock(&sbinfo->stat_lock); - return -ENOSPC; + if (sbinfo->max_inodes) { + if (!sbinfo->free_inodes) { + spin_unlock(&sbinfo->stat_lock); + return -ENOSPC; + } + sbinfo->free_inodes--; } - sbinfo->free_inodes--; if (inop) { ino = sbinfo->next_ino++; if (unlikely(is_zero_ino(ino))) -- cgit v1.2.3 From ec0abae6dcdf7ef88607c869bf35a4b63ce1b370 Mon Sep 17 00:00:00 2001 From: Ralph Campbell Date: Fri, 18 Sep 2020 21:20:24 -0700 Subject: mm/thp: fix __split_huge_pmd_locked() for migration PMD A migrating transparent huge page has to already be unmapped. Otherwise, the page could be modified while it is being copied to a new page and data could be lost. The function __split_huge_pmd() checks for a PMD migration entry before calling __split_huge_pmd_locked() leading one to think that __split_huge_pmd_locked() can handle splitting a migrating PMD. However, the code always increments the page->_mapcount and adjusts the memory control group accounting assuming the page is mapped. Also, if the PMD entry is a migration PMD entry, the call to is_huge_zero_pmd(*pmd) is incorrect because it calls pmd_pfn(pmd) instead of migration_entry_to_pfn(pmd_to_swp_entry(pmd)). Fix these problems by checking for a PMD migration entry. Fixes: 84c3fc4e9c56 ("mm: thp: check pmd migration entry in common path") Signed-off-by: Ralph Campbell Signed-off-by: Andrew Morton Reviewed-by: Yang Shi Reviewed-by: Zi Yan Cc: Jerome Glisse Cc: John Hubbard Cc: Alistair Popple Cc: Christoph Hellwig Cc: Jason Gunthorpe Cc: Bharata B Rao Cc: Ben Skeggs Cc: Shuah Khan Cc: [4.14+] Link: https://lkml.kernel.org/r/20200903183140.19055-1-rcampbell@nvidia.com Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 42 +++++++++++++++++++++++------------------- 1 file changed, 23 insertions(+), 19 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 7ff29cc3d55c..faadc449cca5 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2022,7 +2022,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, put_page(page); add_mm_counter(mm, mm_counter_file(page), -HPAGE_PMD_NR); return; - } else if (is_huge_zero_pmd(*pmd)) { + } else if (pmd_trans_huge(*pmd) && is_huge_zero_pmd(*pmd)) { /* * FIXME: Do we want to invalidate secondary mmu by calling * mmu_notifier_invalidate_range() see comments below inside @@ -2116,30 +2116,34 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, pte = pte_offset_map(&_pmd, addr); BUG_ON(!pte_none(*pte)); set_pte_at(mm, addr, pte, entry); - atomic_inc(&page[i]._mapcount); - pte_unmap(pte); - } - - /* - * Set PG_double_map before dropping compound_mapcount to avoid - * false-negative page_mapped(). 
- */ - if (compound_mapcount(page) > 1 && !TestSetPageDoubleMap(page)) { - for (i = 0; i < HPAGE_PMD_NR; i++) + if (!pmd_migration) atomic_inc(&page[i]._mapcount); + pte_unmap(pte); } - lock_page_memcg(page); - if (atomic_add_negative(-1, compound_mapcount_ptr(page))) { - /* Last compound_mapcount is gone. */ - __dec_lruvec_page_state(page, NR_ANON_THPS); - if (TestClearPageDoubleMap(page)) { - /* No need in mapcount reference anymore */ + if (!pmd_migration) { + /* + * Set PG_double_map before dropping compound_mapcount to avoid + * false-negative page_mapped(). + */ + if (compound_mapcount(page) > 1 && + !TestSetPageDoubleMap(page)) { for (i = 0; i < HPAGE_PMD_NR; i++) - atomic_dec(&page[i]._mapcount); + atomic_inc(&page[i]._mapcount); + } + + lock_page_memcg(page); + if (atomic_add_negative(-1, compound_mapcount_ptr(page))) { + /* Last compound_mapcount is gone. */ + __dec_lruvec_page_state(page, NR_ANON_THPS); + if (TestClearPageDoubleMap(page)) { + /* No need in mapcount reference anymore */ + for (i = 0; i < HPAGE_PMD_NR; i++) + atomic_dec(&page[i]._mapcount); + } } + unlock_page_memcg(page); } - unlock_page_memcg(page); smp_wmb(); /* make pte visible before pmd */ pmd_populate(mm, pmd, pgtable); -- cgit v1.2.3 From 9683182612214aa5f5e709fad49444b847cd866a Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Fri, 18 Sep 2020 21:20:31 -0700 Subject: mm/memory_hotplug: drain per-cpu pages again during memory offline There is a race during page offline that can lead to infinite loop: a page never ends up on a buddy list and __offline_pages() keeps retrying infinitely or until a termination signal is received. Thread#1 - a new process: load_elf_binary begin_new_exec exec_mmap mmput exit_mmap tlb_finish_mmu tlb_flush_mmu release_pages free_unref_page_list free_unref_page_prepare set_pcppage_migratetype(page, migratetype); // Set page->index migration type below MIGRATE_PCPTYPES Thread#2 - hot-removes memory __offline_pages start_isolate_page_range set_migratetype_isolate set_pageblock_migratetype(page, MIGRATE_ISOLATE); Set migration type to MIGRATE_ISOLATE-> set drain_all_pages(zone); // drain per-cpu page lists to buddy allocator. Thread#1 - continue free_unref_page_commit migratetype = get_pcppage_migratetype(page); // get old migration type list_add(&page->lru, &pcp->lists[migratetype]); // add new page to already drained pcp list Thread#2 Never drains pcp again, and therefore gets stuck in the loop. The fix is to try to drain per-cpu lists again after check_pages_isolated_cb() fails. 
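Why one extra drain is enough to make the retry loop converge can be shown with a small userspace model. Everything below is illustrative: struct zone_model, drain_pcp() and all_isolated() merely stand in for the zone, drain_all_pages() and check_pages_isolated_cb() respectively.

#include <stdbool.h>
#include <stdio.h>

/* Toy zone: pages parked on per-cpu lists are invisible to the check. */
struct zone_model {
        int pages_on_pcp;   /* raced onto a pcp list after the first drain */
        int pages_isolated; /* already visible as isolated free pages */
};

/* Stands in for what drain_all_pages() achieves: empty the pcp lists. */
static void drain_pcp(struct zone_model *z)
{
        z->pages_isolated += z->pages_on_pcp;
        z->pages_on_pcp = 0;
}

/* Stands in for check_pages_isolated_cb(): fails while pages are parked. */
static bool all_isolated(const struct zone_model *z)
{
        return z->pages_on_pcp == 0;
}

int main(void)
{
        struct zone_model zone = { .pages_on_pcp = 1, .pages_isolated = 7 };
        int attempts = 0;
        bool done;

        do {
                done = all_isolated(&zone);
                attempts++;
                if (!done)      /* the fix: drain again before retrying */
                        drain_pcp(&zone);
        } while (!done);

        printf("converged after %d attempt(s), %d pages isolated\n",
               attempts, zone.pages_isolated);
        return 0;
}

Without the drain_pcp() call inside the loop the parked page would never become visible and the do/while would spin forever, which is the livelock described above; with it, the model converges on the second attempt, matching the diff below.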
Fixes: c52e75935f8d ("mm: remove extra drain pages on pcp list") Signed-off-by: Pavel Tatashin Signed-off-by: Andrew Morton Acked-by: David Rientjes Acked-by: Vlastimil Babka Acked-by: Michal Hocko Acked-by: David Hildenbrand Cc: Oscar Salvador Cc: Wei Yang Cc: Link: https://lkml.kernel.org/r/20200903140032.380431-1-pasha.tatashin@soleen.com Link: https://lkml.kernel.org/r/20200904151448.100489-2-pasha.tatashin@soleen.com Link: http://lkml.kernel.org/r/20200904070235.GA15277@dhcp22.suse.cz Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 14 ++++++++++++++ mm/page_isolation.c | 8 ++++++++ 2 files changed, 22 insertions(+) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index e9d5ab5d3ca0..b11a269e2356 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1575,6 +1575,20 @@ static int __ref __offline_pages(unsigned long start_pfn, /* check again */ ret = walk_system_ram_range(start_pfn, end_pfn - start_pfn, NULL, check_pages_isolated_cb); + /* + * per-cpu pages are drained in start_isolate_page_range, but if + * there are still pages that are not free, make sure that we + * drain again, because when we isolated range we might + * have raced with another thread that was adding pages to pcp + * list. + * + * Forward progress should be still guaranteed because + * pages on the pcp list can only belong to MOVABLE_ZONE + * because has_unmovable_pages explicitly checks for + * PageBuddy on freed pages on other zones. + */ + if (ret) + drain_all_pages(zone); } while (ret); /* Ok, all of our target is isolated. diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 242c03121d73..63a3db10a8c0 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -170,6 +170,14 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages) * pageblocks we may have modified and return -EBUSY to caller. This * prevents two threads from simultaneously working on overlapping ranges. * + * Please note that there is no strong synchronization with the page allocator + * either. Pages might be freed while their page blocks are marked ISOLATED. + * In some cases pages might still end up on pcp lists and that would allow + * for their allocation even when they are in fact isolated already. Depending + * on how strong of a guarantee the caller needs drain_all_pages might be needed + * (e.g. __offline_pages will need to call it after check for isolated range for + * a next retry). + * * Return: the number of isolated pageblocks on success and -EBUSY if any part * of range cannot be isolated. */ -- cgit v1.2.3 From 5868ec267de5eade3ef80bd8716d6b7621a0c4c0 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 20 Sep 2020 10:38:47 -0700 Subject: mm: fix wake_page_function() comment typos Sedat Dilek pointed out some silly comment typo issues. Reported-by: Sedat Dilek Signed-off-by: Linus Torvalds --- mm/filemap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 6aa08e7714ce..5202e38ab79e 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -990,7 +990,7 @@ void __init pagecache_init(void) /* * The page wait code treats the "wait->flags" somewhat unusually, because - * we have multiple different kinds of waits, not just he usual "exclusive" + * we have multiple different kinds of waits, not just the usual "exclusive" * one. * * We have: @@ -1011,7 +1011,7 @@ void __init pagecache_init(void) * * This is the traditional exclusive wait. 
* - * (b) WQ_FLAG_EXCLUSIVE | WQ_FLAG_CUSTOM: + * (c) WQ_FLAG_EXCLUSIVE | WQ_FLAG_CUSTOM: * * The waiter is waiting to get the bit, and additionally wants the * lock to be transferred to it for fair lock behavior. If the lock -- cgit v1.2.3