From 042999388ef3dba43e813fdc6d6133ec9ca405dc Mon Sep 17 00:00:00 2001 From: Yang Li Date: Thu, 2 Jun 2022 14:21:16 +0800 Subject: mm/page_isolation.c: fix one kernel-doc comment Remove one warning found by running scripts/kernel-doc, which is caused by using 'make W=1': mm/page_isolation.c:304: warning: Function parameter or member 'skip_isolation' not described in 'isolate_single_pageblock' Link: https://lkml.kernel.org/r/20220602062116.61199-1-yang.lee@linux.alibaba.com Signed-off-by: Yang Li Reported-by: Abaci Robot Signed-off-by: Andrew Morton --- mm/page_isolation.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm') diff --git a/mm/page_isolation.c b/mm/page_isolation.c index d200d41ad0d3..9d73dc38e3d7 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -286,6 +286,8 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages) * @flags: isolation flags * @gfp_flags: GFP flags used for migrating pages * @isolate_before: isolate the pageblock before the boundary_pfn + * @skip_isolation: the flag to skip the pageblock isolation in second + * isolate_single_pageblock() * * Free and in-use pages can be as big as MAX_ORDER-1 and contain more than one * pageblock. When not all pageblocks within a page are isolated at the same -- cgit v1.2.3 From 31733463372e8d88ea54bfa1e35178aad9b2ffd2 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Mon, 30 May 2022 12:51:56 -0300 Subject: mm: lru_cache_disable: use synchronize_rcu_expedited commit ff042f4a9b050 ("mm: lru_cache_disable: replace work queue synchronization with synchronize_rcu") replaced lru_cache_disable's usage of work queues with synchronize_rcu. Some users reported large performance regressions due to this commit, for example: https://lore.kernel.org/all/20220521234616.GO1790663@paulmck-ThinkPad-P17-Gen-1/T/ Switching to synchronize_rcu_expedited fixes the problem. Link: https://lkml.kernel.org/r/YpToHCmnx/HEcVyR@fuller.cnet Fixes: ff042f4a9b050 ("mm: lru_cache_disable: replace work queue synchronization with synchronize_rcu") Signed-off-by: Marcelo Tosatti Tested-by: Stefan Wahren Tested-by: Michael Larabel Cc: Sebastian Andrzej Siewior Cc: Nicolas Saenz Julienne Cc: Borislav Petkov Cc: Minchan Kim Cc: Matthew Wilcox Cc: Mel Gorman Cc: Juri Lelli Cc: Thomas Gleixner Cc: Paul E. McKenney Cc: Phil Elwell Cc: Signed-off-by: Andrew Morton --- mm/swap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/swap.c b/mm/swap.c index f3922a96b2e9..034bb24879a3 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -881,7 +881,7 @@ void lru_cache_disable(void) * lru_disable_count = 0 will have exited the critical * section when synchronize_rcu() returns. */ - synchronize_rcu(); + synchronize_rcu_expedited(); #ifdef CONFIG_SMP __lru_add_drain_all(true); #else -- cgit v1.2.3 From 2949282938135ab734c3829495ae393523ceb702 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Sat, 4 Jun 2022 19:50:51 +0000 Subject: mm/damon/reclaim: schedule 'damon_reclaim_timer' only after 'system_wq' is initialized Commit 059342d1dd4e ("mm/damon/reclaim: fix the timer always stays active") made DAMON_RECLAIM's 'enabled' parameter store callback, 'enabled_store()', to schedule 'damon_reclaim_timer'. The scheduling uses 'system_wq', which is initialized in 'workqueue_init_early()'. As kernel parameters parsing function ('parse_args()') is called before 'workqueue_init_early()', 'enabled_store()' can be executed before 'workqueue_init_early()' and end up accessing the uninitialized 'system_wq'. As a result, the booting hang[1]. This commit fixes the issue by checking if the initialization is done before scheduling the timer. [1] https://lkml.kernel.org/20220604192222.1488-1-sj@kernel.org/ Link: https://lkml.kernel.org/r/20220604195051.1589-1-sj@kernel.org Fixes: 059342d1dd4e ("mm/damon/reclaim: fix the timer always stays active") Signed-off-by: SeongJae Park Reported-by: Greg White Cc: Hailong Tu Signed-off-by: Andrew Morton --- mm/damon/reclaim.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'mm') diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index 8efbfb24f3a1..4b07c29effe9 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -374,6 +374,8 @@ static void damon_reclaim_timer_fn(struct work_struct *work) } static DECLARE_DELAYED_WORK(damon_reclaim_timer, damon_reclaim_timer_fn); +static bool damon_reclaim_initialized; + static int enabled_store(const char *val, const struct kernel_param *kp) { @@ -382,6 +384,10 @@ static int enabled_store(const char *val, if (rc < 0) return rc; + /* system_wq might not initialized yet */ + if (!damon_reclaim_initialized) + return rc; + if (enabled) schedule_delayed_work(&damon_reclaim_timer, 0); @@ -449,6 +455,8 @@ static int __init damon_reclaim_init(void) damon_add_target(ctx, target); schedule_delayed_work(&damon_reclaim_timer, 0); + + damon_reclaim_initialized = true; return 0; } -- cgit v1.2.3 From 327b18b7aaed5de3b548212e3ab75133bf323759 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Thu, 9 Jun 2022 14:33:19 +0200 Subject: mm/kfence: select random number before taking raw lock The RNG uses vanilla spinlocks, not raw spinlocks, so kfence should pick its random numbers before taking its raw spinlocks. This also has the nice effect of doing less work inside the lock. It should fix a splat that Geert saw with CONFIG_PROVE_RAW_LOCK_NESTING: dump_backtrace.part.0+0x98/0xc0 show_stack+0x14/0x28 dump_stack_lvl+0xac/0xec dump_stack+0x14/0x2c __lock_acquire+0x388/0x10a0 lock_acquire+0x190/0x2c0 _raw_spin_lock_irqsave+0x6c/0x94 crng_make_state+0x148/0x1e4 _get_random_bytes.part.0+0x4c/0xe8 get_random_u32+0x4c/0x140 __kfence_alloc+0x460/0x5c4 kmem_cache_alloc_trace+0x194/0x1dc __kthread_create_on_node+0x5c/0x1a8 kthread_create_on_node+0x58/0x7c printk_start_kthread.part.0+0x34/0xa8 printk_activate_kthreads+0x4c/0x54 do_one_initcall+0xec/0x278 kernel_init_freeable+0x11c/0x214 kernel_init+0x24/0x124 ret_from_fork+0x10/0x20 Link: https://lkml.kernel.org/r/20220609123319.17576-1-Jason@zx2c4.com Fixes: d4150779e60f ("random32: use real rng for non-deterministic randomness") Signed-off-by: Jason A. Donenfeld Reported-by: Geert Uytterhoeven Tested-by: Geert Uytterhoeven Reviewed-by: Marco Elver Reviewed-by: Petr Mladek Cc: John Ogness Cc: Alexander Potapenko Cc: Dmitry Vyukov Signed-off-by: Andrew Morton --- mm/kfence/core.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/kfence/core.c b/mm/kfence/core.c index 4e7cd4c8e687..4b5e5a3d3a63 100644 --- a/mm/kfence/core.c +++ b/mm/kfence/core.c @@ -360,6 +360,9 @@ static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t g unsigned long flags; struct slab *slab; void *addr; + const bool random_right_allocate = prandom_u32_max(2); + const bool random_fault = CONFIG_KFENCE_STRESS_TEST_FAULTS && + !prandom_u32_max(CONFIG_KFENCE_STRESS_TEST_FAULTS); /* Try to obtain a free object. */ raw_spin_lock_irqsave(&kfence_freelist_lock, flags); @@ -404,7 +407,7 @@ static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t g * is that the out-of-bounds accesses detected are deterministic for * such allocations. */ - if (prandom_u32_max(2)) { + if (random_right_allocate) { /* Allocate on the "right" side, re-calculate address. */ meta->addr += PAGE_SIZE - size; meta->addr = ALIGN_DOWN(meta->addr, cache->align); @@ -444,7 +447,7 @@ static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t g if (cache->ctor) cache->ctor(addr); - if (CONFIG_KFENCE_STRESS_TEST_FAULTS && !prandom_u32_max(CONFIG_KFENCE_STRESS_TEST_FAULTS)) + if (random_fault) kfence_protect(meta->addr); /* Random "faults" by protecting the object. */ atomic_long_inc(&counters[KFENCE_COUNTER_ALLOCATED]); -- cgit v1.2.3 From df4ae285a3d5ce99d69efe81b21c4fed9bbc51b9 Mon Sep 17 00:00:00 2001 From: Yang Yang Date: Fri, 10 Jun 2022 02:44:52 +0000 Subject: mm: memcontrol: reference to tools/cgroup/memcg_slabinfo.py There is no slabinfo.py in tools/cgroup, but has memcg_slabinfo.py instead. Link: https://lkml.kernel.org/r/20220610024451.744135-1-yang.yang29@zte.com.cn Signed-off-by: Yang Yang Reviewed-by: Muchun Song Acked-by: Roman Gushchin Signed-off-by: Andrew Morton --- mm/memcontrol.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index abec50f31fe6..618c366a2f07 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4859,7 +4859,7 @@ static int mem_cgroup_slab_show(struct seq_file *m, void *p) { /* * Deprecated. - * Please, take a look at tools/cgroup/slabinfo.py . + * Please, take a look at tools/cgroup/memcg_slabinfo.py . */ return 0; } -- cgit v1.2.3 From 67f22ba7750f940bcd7e1b12720896c505c2d63f Mon Sep 17 00:00:00 2001 From: zhenwei pi Date: Wed, 15 Jun 2022 17:32:09 +0800 Subject: mm/memory-failure: disable unpoison once hw error happens Currently unpoison_memory(unsigned long pfn) is designed for soft poison(hwpoison-inject) only. Since 17fae1294ad9d, the KPTE gets cleared on a x86 platform once hardware memory corrupts. Unpoisoning a hardware corrupted page puts page back buddy only, the kernel has a chance to access the page with *NOT PRESENT* KPTE. This leads BUG during accessing on the corrupted KPTE. Suggested by David&Naoya, disable unpoison mechanism when a real HW error happens to avoid BUG like this: Unpoison: Software-unpoisoned page 0x61234 BUG: unable to handle page fault for address: ffff888061234000 #PF: supervisor write access in kernel mode #PF: error_code(0x0002) - not-present page PGD 2c01067 P4D 2c01067 PUD 107267063 PMD 10382b063 PTE 800fffff9edcb062 Oops: 0002 [#1] PREEMPT SMP NOPTI CPU: 4 PID: 26551 Comm: stress Kdump: loaded Tainted: G M OE 5.18.0.bm.1-amd64 #7 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996) ... RIP: 0010:clear_page_erms+0x7/0x10 Code: ... RSP: 0000:ffffc90001107bc8 EFLAGS: 00010246 RAX: 0000000000000000 RBX: 0000000000000901 RCX: 0000000000001000 RDX: ffffea0001848d00 RSI: ffffea0001848d40 RDI: ffff888061234000 RBP: ffffea0001848d00 R08: 0000000000000901 R09: 0000000000001276 R10: 0000000000000003 R11: 0000000000000000 R12: 0000000000000001 R13: 0000000000000000 R14: 0000000000140dca R15: 0000000000000001 FS: 00007fd8b2333740(0000) GS:ffff88813fd00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: ffff888061234000 CR3: 00000001023d2005 CR4: 0000000000770ee0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 PKRU: 55555554 Call Trace: prep_new_page+0x151/0x170 get_page_from_freelist+0xca0/0xe20 ? sysvec_apic_timer_interrupt+0xab/0xc0 ? asm_sysvec_apic_timer_interrupt+0x1b/0x20 __alloc_pages+0x17e/0x340 __folio_alloc+0x17/0x40 vma_alloc_folio+0x84/0x280 __handle_mm_fault+0x8d4/0xeb0 handle_mm_fault+0xd5/0x2a0 do_user_addr_fault+0x1d0/0x680 ? kvm_read_and_reset_apf_flags+0x3b/0x50 exc_page_fault+0x78/0x170 asm_exc_page_fault+0x27/0x30 Link: https://lkml.kernel.org/r/20220615093209.259374-2-pizhenwei@bytedance.com Fixes: 847ce401df392 ("HWPOISON: Add unpoisoning support") Fixes: 17fae1294ad9d ("x86/{mce,mm}: Unmap the entire page if the whole page is affected and poisoned") Signed-off-by: zhenwei pi Acked-by: David Hildenbrand Acked-by: Naoya Horiguchi Reviewed-by: Miaohe Lin Reviewed-by: Oscar Salvador Cc: Greg Kroah-Hartman Cc: [5.8+] Signed-off-by: Andrew Morton --- Documentation/vm/hwpoison.rst | 3 ++- drivers/base/memory.c | 2 +- include/linux/mm.h | 1 + mm/hwpoison-inject.c | 2 +- mm/madvise.c | 2 +- mm/memory-failure.c | 12 ++++++++++++ 6 files changed, 18 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/Documentation/vm/hwpoison.rst b/Documentation/vm/hwpoison.rst index c742de1769d1..b9d5253c1305 100644 --- a/Documentation/vm/hwpoison.rst +++ b/Documentation/vm/hwpoison.rst @@ -120,7 +120,8 @@ Testing unpoison-pfn Software-unpoison page at PFN echoed into this file. This way a page can be reused again. This only works for Linux - injected failures, not for real memory failures. + injected failures, not for real memory failures. Once any hardware + memory failure happens, this feature is disabled. Note these injection interfaces are not stable and might change between kernel versions diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 084d67fd55cc..bc60c9cd3230 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -558,7 +558,7 @@ static ssize_t hard_offline_page_store(struct device *dev, if (kstrtoull(buf, 0, &pfn) < 0) return -EINVAL; pfn >>= PAGE_SHIFT; - ret = memory_failure(pfn, 0); + ret = memory_failure(pfn, MF_SW_SIMULATED); if (ret == -EOPNOTSUPP) ret = 0; return ret ? ret : count; diff --git a/include/linux/mm.h b/include/linux/mm.h index 781fae17177d..cf3d0d673f6b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3232,6 +3232,7 @@ enum mf_flags { MF_MUST_KILL = 1 << 2, MF_SOFT_OFFLINE = 1 << 3, MF_UNPOISON = 1 << 4, + MF_SW_SIMULATED = 1 << 5, }; extern int memory_failure(unsigned long pfn, int flags); extern void memory_failure_queue(unsigned long pfn, int flags); diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c index 5c0cddd81505..65e242b5a432 100644 --- a/mm/hwpoison-inject.c +++ b/mm/hwpoison-inject.c @@ -48,7 +48,7 @@ static int hwpoison_inject(void *data, u64 val) inject: pr_info("Injecting memory failure at pfn %#lx\n", pfn); - err = memory_failure(pfn, 0); + err = memory_failure(pfn, MF_SW_SIMULATED); return (err == -EOPNOTSUPP) ? 0 : err; } diff --git a/mm/madvise.c b/mm/madvise.c index d7b4f2602949..0316bbc6441b 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -1112,7 +1112,7 @@ static int madvise_inject_error(int behavior, } else { pr_info("Injecting memory failure for pfn %#lx at process virtual address %#lx\n", pfn, start); - ret = memory_failure(pfn, MF_COUNT_INCREASED); + ret = memory_failure(pfn, MF_COUNT_INCREASED | MF_SW_SIMULATED); if (ret == -EOPNOTSUPP) ret = 0; } diff --git a/mm/memory-failure.c b/mm/memory-failure.c index b85661cbdc4a..da39ec8afca8 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -69,6 +69,8 @@ int sysctl_memory_failure_recovery __read_mostly = 1; atomic_long_t num_poisoned_pages __read_mostly = ATOMIC_LONG_INIT(0); +static bool hw_memory_failure __read_mostly = false; + static bool __page_handle_poison(struct page *page) { int ret; @@ -1768,6 +1770,9 @@ int memory_failure(unsigned long pfn, int flags) mutex_lock(&mf_mutex); + if (!(flags & MF_SW_SIMULATED)) + hw_memory_failure = true; + p = pfn_to_online_page(pfn); if (!p) { res = arch_memory_failure(pfn, flags); @@ -2103,6 +2108,13 @@ int unpoison_memory(unsigned long pfn) mutex_lock(&mf_mutex); + if (hw_memory_failure) { + unpoison_pr_info("Unpoison: Disabled after HW memory failure %#lx\n", + pfn, &unpoison_rs); + ret = -EOPNOTSUPP; + goto unlock_mutex; + } + if (!PageHWPoison(p)) { unpoison_pr_info("Unpoison: Page was already unpoisoned %#lx\n", pfn, &unpoison_rs); -- cgit v1.2.3