From 5033091de814ab4b5623faed2755f3064e19e2d2 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Mon, 24 Oct 2022 15:20:12 +0900 Subject: mm/hwpoison: introduce per-memory_block hwpoison counter Currently PageHWPoison flag does not behave well when experiencing memory hotremove/hotplug. Any data field in struct page is unreliable when the associated memory is offlined, and the current mechanism can't tell whether a memory block is onlined because a new memory devices is installed or because previous failed offline operations are undone. Especially if there's a hwpoisoned memory, it's unclear what the best option is. So introduce a new mechanism to make struct memory_block remember that a memory block has hwpoisoned memory inside it. And make any online event fail if the onlining memory block contains hwpoison. struct memory_block is freed and reallocated over ACPI-based hotremove/hotplug, but not over sysfs-based hotremove/hotplug. So the new counter can distinguish these cases. Link: https://lkml.kernel.org/r/20221024062012.1520887-5-naoya.horiguchi@linux.dev Signed-off-by: Naoya Horiguchi Reported-by: kernel test robot Reviewed-by: Miaohe Lin Cc: David Hildenbrand Cc: Jane Chu Cc: Mike Kravetz Cc: Muchun Song Cc: Oscar Salvador Cc: Yang Shi Signed-off-by: Andrew Morton --- drivers/base/memory.c | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) (limited to 'drivers/base') diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 9aa0da991cfb..fe98fb8d94e5 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -175,6 +175,15 @@ int memory_notify(unsigned long val, void *v) return blocking_notifier_call_chain(&memory_chain, val, v); } +#if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_MEMORY_HOTPLUG) +static unsigned long memblk_nr_poison(struct memory_block *mem); +#else +static inline unsigned long memblk_nr_poison(struct memory_block *mem) +{ + return 0; +} +#endif + static int memory_block_online(struct memory_block *mem) { unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr); @@ -183,6 +192,9 @@ static int memory_block_online(struct memory_block *mem) struct zone *zone; int ret; + if (memblk_nr_poison(mem)) + return -EHWPOISON; + zone = zone_for_pfn_range(mem->online_type, mem->nid, mem->group, start_pfn, nr_pages); @@ -864,6 +876,7 @@ void remove_memory_block_devices(unsigned long start, unsigned long size) mem = find_memory_block_by_id(block_id); if (WARN_ON_ONCE(!mem)) continue; + num_poisoned_pages_sub(-1UL, memblk_nr_poison(mem)); unregister_memory_block_under_nodes(mem); remove_memory_block(mem); } @@ -1164,3 +1177,28 @@ int walk_dynamic_memory_groups(int nid, walk_memory_groups_func_t func, } return ret; } + +#if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_MEMORY_HOTPLUG) +void memblk_nr_poison_inc(unsigned long pfn) +{ + const unsigned long block_id = pfn_to_block_id(pfn); + struct memory_block *mem = find_memory_block_by_id(block_id); + + if (mem) + atomic_long_inc(&mem->nr_hwpoison); +} + +void memblk_nr_poison_sub(unsigned long pfn, long i) +{ + const unsigned long block_id = pfn_to_block_id(pfn); + struct memory_block *mem = find_memory_block_by_id(block_id); + + if (mem) + atomic_long_sub(i, &mem->nr_hwpoison); +} + +static unsigned long memblk_nr_poison(struct memory_block *mem) +{ + return atomic_long_read(&mem->nr_hwpoison); +} +#endif -- cgit v1.2.3