summaryrefslogtreecommitdiffstats
path: root/mm/memory-failure.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/memory-failure.c')
-rw-r--r--mm/memory-failure.c351
1 files changed, 183 insertions, 168 deletions
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index d487f8dc6d39..ea5a93659488 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -20,6 +20,14 @@
* this code has to be extremely careful. Generally it tries to use
* normal locking rules, as in get the standard locks, even if that means
* the error handling takes potentially a long time.
+ *
+ * It can be very tempting to add handling for obscure cases here.
+ * In general any code for handling new cases should only be added iff:
+ * - You know how to test it.
+ * - You have a test that can be added to mce-test
+ * https://git.kernel.org/cgit/utils/cpu/mce/mce-test.git/
+ * - The case actually shows up as a frequent (top 10) page state in
+ * tools/vm/page-types when running a real workload.
*
* There are several operations here with exponential complexity because
* of unsuitable VM data structures. For example the operation to map back
@@ -28,13 +36,6 @@
* are rare we hope to get away with this. This avoids impacting the core
* VM.
*/
-
-/*
- * Notebook:
- * - hugetlb needs more code
- * - kcore/oldmem/vmcore/mem/kmem check for hwpoison pages
- * - pass bad pages to kdump next kernel
- */
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/page-flags.h>
@@ -56,6 +57,7 @@
#include <linux/mm_inline.h>
#include <linux/kfifo.h>
#include "internal.h"
+#include "ras/ras_event.h"
int sysctl_memory_failure_early_kill __read_mostly = 0;
@@ -503,22 +505,34 @@ static void collect_procs(struct page *page, struct list_head *tokill,
kfree(tk);
}
-/*
- * Error handlers for various types of pages.
- */
-
-enum outcome {
- IGNORED, /* Error: cannot be handled */
- FAILED, /* Error: handling failed */
- DELAYED, /* Will be handled later */
- RECOVERED, /* Successfully recovered */
+static const char *action_name[] = {
+ [MF_IGNORED] = "Ignored",
+ [MF_FAILED] = "Failed",
+ [MF_DELAYED] = "Delayed",
+ [MF_RECOVERED] = "Recovered",
};
-static const char *action_name[] = {
- [IGNORED] = "Ignored",
- [FAILED] = "Failed",
- [DELAYED] = "Delayed",
- [RECOVERED] = "Recovered",
+static const char * const action_page_types[] = {
+ [MF_MSG_KERNEL] = "reserved kernel page",
+ [MF_MSG_KERNEL_HIGH_ORDER] = "high-order kernel page",
+ [MF_MSG_SLAB] = "kernel slab page",
+ [MF_MSG_DIFFERENT_COMPOUND] = "different compound page after locking",
+ [MF_MSG_POISONED_HUGE] = "huge page already hardware poisoned",
+ [MF_MSG_HUGE] = "huge page",
+ [MF_MSG_FREE_HUGE] = "free huge page",
+ [MF_MSG_UNMAP_FAILED] = "unmapping failed page",
+ [MF_MSG_DIRTY_SWAPCACHE] = "dirty swapcache page",
+ [MF_MSG_CLEAN_SWAPCACHE] = "clean swapcache page",
+ [MF_MSG_DIRTY_MLOCKED_LRU] = "dirty mlocked LRU page",
+ [MF_MSG_CLEAN_MLOCKED_LRU] = "clean mlocked LRU page",
+ [MF_MSG_DIRTY_UNEVICTABLE_LRU] = "dirty unevictable LRU page",
+ [MF_MSG_CLEAN_UNEVICTABLE_LRU] = "clean unevictable LRU page",
+ [MF_MSG_DIRTY_LRU] = "dirty LRU page",
+ [MF_MSG_CLEAN_LRU] = "clean LRU page",
+ [MF_MSG_TRUNCATED_LRU] = "already truncated LRU page",
+ [MF_MSG_BUDDY] = "free buddy page",
+ [MF_MSG_BUDDY_2ND] = "free buddy page (2nd try)",
+ [MF_MSG_UNKNOWN] = "unknown page",
};
/*
@@ -552,7 +566,7 @@ static int delete_from_lru_cache(struct page *p)
*/
static int me_kernel(struct page *p, unsigned long pfn)
{
- return IGNORED;
+ return MF_IGNORED;
}
/*
@@ -561,7 +575,7 @@ static int me_kernel(struct page *p, unsigned long pfn)
static int me_unknown(struct page *p, unsigned long pfn)
{
printk(KERN_ERR "MCE %#lx: Unknown page state\n", pfn);
- return FAILED;
+ return MF_FAILED;
}
/*
@@ -570,7 +584,7 @@ static int me_unknown(struct page *p, unsigned long pfn)
static int me_pagecache_clean(struct page *p, unsigned long pfn)
{
int err;
- int ret = FAILED;
+ int ret = MF_FAILED;
struct address_space *mapping;
delete_from_lru_cache(p);
@@ -580,7 +594,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
* should be the one m_f() holds.
*/
if (PageAnon(p))
- return RECOVERED;
+ return MF_RECOVERED;
/*
* Now truncate the page in the page cache. This is really
@@ -594,7 +608,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
/*
* Page has been teared down in the meanwhile
*/
- return FAILED;
+ return MF_FAILED;
}
/*
@@ -611,7 +625,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
!try_to_release_page(p, GFP_NOIO)) {
pr_info("MCE %#lx: failed to release buffers\n", pfn);
} else {
- ret = RECOVERED;
+ ret = MF_RECOVERED;
}
} else {
/*
@@ -619,7 +633,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
* This fails on dirty or anything with private pages
*/
if (invalidate_inode_page(p))
- ret = RECOVERED;
+ ret = MF_RECOVERED;
else
printk(KERN_INFO "MCE %#lx: Failed to invalidate\n",
pfn);
@@ -705,9 +719,9 @@ static int me_swapcache_dirty(struct page *p, unsigned long pfn)
ClearPageUptodate(p);
if (!delete_from_lru_cache(p))
- return DELAYED;
+ return MF_DELAYED;
else
- return FAILED;
+ return MF_FAILED;
}
static int me_swapcache_clean(struct page *p, unsigned long pfn)
@@ -715,9 +729,9 @@ static int me_swapcache_clean(struct page *p, unsigned long pfn)
delete_from_swap_cache(p);
if (!delete_from_lru_cache(p))
- return RECOVERED;
+ return MF_RECOVERED;
else
- return FAILED;
+ return MF_FAILED;
}
/*
@@ -730,6 +744,10 @@ static int me_huge_page(struct page *p, unsigned long pfn)
{
int res = 0;
struct page *hpage = compound_head(p);
+
+ if (!PageHuge(hpage))
+ return MF_DELAYED;
+
/*
* We can safely recover from error on free or reserved (i.e.
* not in-use) hugepage by dequeuing it from freelist.
@@ -743,9 +761,9 @@ static int me_huge_page(struct page *p, unsigned long pfn)
if (!(page_mapping(hpage) || PageAnon(hpage))) {
res = dequeue_hwpoisoned_huge_page(hpage);
if (!res)
- return RECOVERED;
+ return MF_RECOVERED;
}
- return DELAYED;
+ return MF_DELAYED;
}
/*
@@ -777,10 +795,10 @@ static int me_huge_page(struct page *p, unsigned long pfn)
static struct page_state {
unsigned long mask;
unsigned long res;
- char *msg;
+ enum mf_action_page_type type;
int (*action)(struct page *p, unsigned long pfn);
} error_states[] = {
- { reserved, reserved, "reserved kernel", me_kernel },
+ { reserved, reserved, MF_MSG_KERNEL, me_kernel },
/*
* free pages are specially detected outside this table:
* PG_buddy pages only make a small fraction of all free pages.
@@ -791,31 +809,31 @@ static struct page_state {
* currently unused objects without touching them. But just
* treat it as standard kernel for now.
*/
- { slab, slab, "kernel slab", me_kernel },
+ { slab, slab, MF_MSG_SLAB, me_kernel },
#ifdef CONFIG_PAGEFLAGS_EXTENDED
- { head, head, "huge", me_huge_page },
- { tail, tail, "huge", me_huge_page },
+ { head, head, MF_MSG_HUGE, me_huge_page },
+ { tail, tail, MF_MSG_HUGE, me_huge_page },
#else
- { compound, compound, "huge", me_huge_page },
+ { compound, compound, MF_MSG_HUGE, me_huge_page },
#endif
- { sc|dirty, sc|dirty, "dirty swapcache", me_swapcache_dirty },
- { sc|dirty, sc, "clean swapcache", me_swapcache_clean },
+ { sc|dirty, sc|dirty, MF_MSG_DIRTY_SWAPCACHE, me_swapcache_dirty },
+ { sc|dirty, sc, MF_MSG_CLEAN_SWAPCACHE, me_swapcache_clean },
- { mlock|dirty, mlock|dirty, "dirty mlocked LRU", me_pagecache_dirty },
- { mlock|dirty, mlock, "clean mlocked LRU", me_pagecache_clean },
+ { mlock|dirty, mlock|dirty, MF_MSG_DIRTY_MLOCKED_LRU, me_pagecache_dirty },
+ { mlock|dirty, mlock, MF_MSG_CLEAN_MLOCKED_LRU, me_pagecache_clean },
- { unevict|dirty, unevict|dirty, "dirty unevictable LRU", me_pagecache_dirty },
- { unevict|dirty, unevict, "clean unevictable LRU", me_pagecache_clean },
+ { unevict|dirty, unevict|dirty, MF_MSG_DIRTY_UNEVICTABLE_LRU, me_pagecache_dirty },
+ { unevict|dirty, unevict, MF_MSG_CLEAN_UNEVICTABLE_LRU, me_pagecache_clean },
- { lru|dirty, lru|dirty, "dirty LRU", me_pagecache_dirty },
- { lru|dirty, lru, "clean LRU", me_pagecache_clean },
+ { lru|dirty, lru|dirty, MF_MSG_DIRTY_LRU, me_pagecache_dirty },
+ { lru|dirty, lru, MF_MSG_CLEAN_LRU, me_pagecache_clean },
/*
* Catchall entry: must be at end.
*/
- { 0, 0, "unknown page state", me_unknown },
+ { 0, 0, MF_MSG_UNKNOWN, me_unknown },
};
#undef dirty
@@ -835,10 +853,13 @@ static struct page_state {
* "Dirty/Clean" indication is not 100% accurate due to the possibility of
* setting PG_dirty outside page lock. See also comment above set_page_dirty().
*/
-static void action_result(unsigned long pfn, char *msg, int result)
+static void action_result(unsigned long pfn, enum mf_action_page_type type,
+ enum mf_result result)
{
- pr_err("MCE %#lx: %s page recovery: %s\n",
- pfn, msg, action_name[result]);
+ trace_memory_failure_event(pfn, type, result);
+
+ pr_err("MCE %#lx: recovery action for %s: %s\n",
+ pfn, action_page_types[type], action_name[result]);
}
static int page_action(struct page_state *ps, struct page *p,
@@ -850,23 +871,68 @@ static int page_action(struct page_state *ps, struct page *p,
result = ps->action(p, pfn);
count = page_count(p) - 1;
- if (ps->action == me_swapcache_dirty && result == DELAYED)
+ if (ps->action == me_swapcache_dirty && result == MF_DELAYED)
count--;
if (count != 0) {
printk(KERN_ERR
- "MCE %#lx: %s page still referenced by %d users\n",
- pfn, ps->msg, count);
- result = FAILED;
+ "MCE %#lx: %s still referenced by %d users\n",
+ pfn, action_page_types[ps->type], count);
+ result = MF_FAILED;
}
- action_result(pfn, ps->msg, result);
+ action_result(pfn, ps->type, result);
/* Could do more checks here if page looks ok */
/*
* Could adjust zone counters here to correct for the missing page.
*/
- return (result == RECOVERED || result == DELAYED) ? 0 : -EBUSY;
+ return (result == MF_RECOVERED || result == MF_DELAYED) ? 0 : -EBUSY;
+}
+
+/**
+ * get_hwpoison_page() - Get refcount for memory error handling:
+ * @page: raw error page (hit by memory error)
+ *
+ * Return: return 0 if failed to grab the refcount, otherwise true (some
+ * non-zero value.)
+ */
+int get_hwpoison_page(struct page *page)
+{
+ struct page *head = compound_head(page);
+
+ if (PageHuge(head))
+ return get_page_unless_zero(head);
+
+ /*
+ * Thp tail page has special refcounting rule (refcount of tail pages
+ * is stored in ->_mapcount,) so we can't call get_page_unless_zero()
+ * directly for tail pages.
+ */
+ if (PageTransHuge(head)) {
+ /*
+ * Non anonymous thp exists only in allocation/free time. We
+ * can't handle such a case correctly, so let's give it up.
+ * This should be better than triggering BUG_ON when kernel
+ * tries to touch the "partially handled" page.
+ */
+ if (!PageAnon(head)) {
+ pr_err("MCE: %#lx: non anonymous thp\n",
+ page_to_pfn(page));
+ return 0;
+ }
+
+ if (get_page_unless_zero(head)) {
+ if (PageTail(page))
+ get_page(page);
+ return 1;
+ } else {
+ return 0;
+ }
+ }
+
+ return get_page_unless_zero(page);
}
+EXPORT_SYMBOL_GPL(get_hwpoison_page);
/*
* Do all that is necessary to remove user space mappings. Unmap
@@ -881,7 +947,6 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
int ret;
int kill = 1, forcekill;
struct page *hpage = *hpagep;
- struct page *ppage;
/*
* Here we are interested only in user-mapped pages, so skip any
@@ -931,59 +996,6 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
}
/*
- * ppage: poisoned page
- * if p is regular page(4k page)
- * ppage == real poisoned page;
- * else p is hugetlb or THP, ppage == head page.
- */
- ppage = hpage;
-
- if (PageTransHuge(hpage)) {
- /*
- * Verify that this isn't a hugetlbfs head page, the check for
- * PageAnon is just for avoid tripping a split_huge_page
- * internal debug check, as split_huge_page refuses to deal with
- * anything that isn't an anon page. PageAnon can't go away fro
- * under us because we hold a refcount on the hpage, without a
- * refcount on the hpage. split_huge_page can't be safely called
- * in the first place, having a refcount on the tail isn't
- * enough * to be safe.
- */
- if (!PageHuge(hpage) && PageAnon(hpage)) {
- if (unlikely(split_huge_page(hpage))) {
- /*
- * FIXME: if splitting THP is failed, it is
- * better to stop the following operation rather
- * than causing panic by unmapping. System might
- * survive if the page is freed later.
- */
- printk(KERN_INFO
- "MCE %#lx: failed to split THP\n", pfn);
-
- BUG_ON(!PageHWPoison(p));
- return SWAP_FAIL;
- }
- /*
- * We pinned the head page for hwpoison handling,
- * now we split the thp and we are interested in
- * the hwpoisoned raw page, so move the refcount
- * to it. Similarly, page lock is shifted.
- */
- if (hpage != p) {
- if (!(flags & MF_COUNT_INCREASED)) {
- put_page(hpage);
- get_page(p);
- }
- lock_page(p);
- unlock_page(hpage);
- *hpagep = p;
- }
- /* THP is split, so ppage should be the real poisoned page. */
- ppage = p;
- }
- }
-
- /*
* First collect all the processes that have the page
* mapped in dirty form. This has to be done before try_to_unmap,
* because ttu takes the rmap data structures down.
@@ -992,12 +1004,12 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
* there's nothing that can be done.
*/
if (kill)
- collect_procs(ppage, &tokill, flags & MF_ACTION_REQUIRED);
+ collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED);
- ret = try_to_unmap(ppage, ttu);
+ ret = try_to_unmap(hpage, ttu);
if (ret != SWAP_SUCCESS)
printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n",
- pfn, page_mapcount(ppage));
+ pfn, page_mapcount(hpage));
/*
* Now that the dirty bit has been propagated to the
@@ -1009,7 +1021,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
* use a more force-full uncatchable kill to prevent
* any accesses to the poisoned memory.
*/
- forcekill = PageDirty(ppage) || (flags & MF_MUST_KILL);
+ forcekill = PageDirty(hpage) || (flags & MF_MUST_KILL);
kill_procs(&tokill, forcekill, trapno,
ret != SWAP_SUCCESS, p, pfn, flags);
@@ -1055,6 +1067,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
struct page_state *ps;
struct page *p;
struct page *hpage;
+ struct page *orig_head;
int res;
unsigned int nr_pages;
unsigned long page_flags;
@@ -1070,7 +1083,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
}
p = pfn_to_page(pfn);
- hpage = compound_head(p);
+ orig_head = hpage = compound_head(p);
if (TestSetPageHWPoison(p)) {
printk(KERN_ERR "MCE %#lx: already hardware poisoned\n", pfn);
return 0;
@@ -1103,10 +1116,9 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
* In fact it's dangerous to directly bump up page count from 0,
* that may make page_freeze_refs()/page_unfreeze_refs() mismatch.
*/
- if (!(flags & MF_COUNT_INCREASED) &&
- !get_page_unless_zero(hpage)) {
+ if (!(flags & MF_COUNT_INCREASED) && !get_hwpoison_page(p)) {
if (is_free_buddy_page(p)) {
- action_result(pfn, "free buddy", DELAYED);
+ action_result(pfn, MF_MSG_BUDDY, MF_DELAYED);
return 0;
} else if (PageHuge(hpage)) {
/*
@@ -1123,16 +1135,30 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
}
set_page_hwpoison_huge_page(hpage);
res = dequeue_hwpoisoned_huge_page(hpage);
- action_result(pfn, "free huge",
- res ? IGNORED : DELAYED);
+ action_result(pfn, MF_MSG_FREE_HUGE,
+ res ? MF_IGNORED : MF_DELAYED);
unlock_page(hpage);
return res;
} else {
- action_result(pfn, "high order kernel", IGNORED);
+ action_result(pfn, MF_MSG_KERNEL_HIGH_ORDER, MF_IGNORED);
return -EBUSY;
}
}
+ if (!PageHuge(p) && PageTransHuge(hpage)) {
+ if (unlikely(split_huge_page(hpage))) {
+ pr_err("MCE: %#lx: thp split failed\n", pfn);
+ if (TestClearPageHWPoison(p))
+ atomic_long_sub(nr_pages, &num_poisoned_pages);
+ put_page(p);
+ if (p != hpage)
+ put_page(hpage);
+ return -EBUSY;
+ }
+ VM_BUG_ON_PAGE(!page_count(p), p);
+ hpage = compound_head(p);
+ }
+
/*
* We ignore non-LRU pages for good reasons.
* - PG_locked is only well defined for LRU pages and a few others
@@ -1141,7 +1167,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
* The check (unnecessarily) ignores LRU pages being isolated and
* walked by the page reclaim code, however that's not a big loss.
*/
- if (!PageHuge(p) && !PageTransTail(p)) {
+ if (!PageHuge(p)) {
if (!PageLRU(p))
shake_page(p, 0);
if (!PageLRU(p)) {
@@ -1150,9 +1176,10 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
*/
if (is_free_buddy_page(p)) {
if (flags & MF_COUNT_INCREASED)
- action_result(pfn, "free buddy", DELAYED);
+ action_result(pfn, MF_MSG_BUDDY, MF_DELAYED);
else
- action_result(pfn, "free buddy, 2nd try", DELAYED);
+ action_result(pfn, MF_MSG_BUDDY_2ND,
+ MF_DELAYED);
return 0;
}
}
@@ -1164,8 +1191,8 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
* The page could have changed compound pages during the locking.
* If this happens just bail out.
*/
- if (compound_head(p) != hpage) {
- action_result(pfn, "different compound page after locking", IGNORED);
+ if (PageCompound(p) && compound_head(p) != orig_head) {
+ action_result(pfn, MF_MSG_DIFFERENT_COMPOUND, MF_IGNORED);
res = -EBUSY;
goto out;
}
@@ -1185,9 +1212,9 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
if (!PageHWPoison(p)) {
printk(KERN_ERR "MCE %#lx: just unpoisoned\n", pfn);
atomic_long_sub(nr_pages, &num_poisoned_pages);
+ unlock_page(hpage);
put_page(hpage);
- res = 0;
- goto out;
+ return 0;
}
if (hwpoison_filter(p)) {
if (TestClearPageHWPoison(p))
@@ -1205,8 +1232,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
* on the head page to show that the hugepage is hwpoisoned
*/
if (PageHuge(p) && PageTail(p) && TestSetPageHWPoison(hpage)) {
- action_result(pfn, "hugepage already hardware poisoned",
- IGNORED);
+ action_result(pfn, MF_MSG_POISONED_HUGE, MF_IGNORED);
unlock_page(hpage);
put_page(hpage);
return 0;
@@ -1235,7 +1261,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
*/
if (hwpoison_user_mappings(p, pfn, trapno, flags, &hpage)
!= SWAP_SUCCESS) {
- action_result(pfn, "unmapping failed", IGNORED);
+ action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED);
res = -EBUSY;
goto out;
}
@@ -1244,7 +1270,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
* Torn down by someone else?
*/
if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) {
- action_result(pfn, "already truncated LRU", IGNORED);
+ action_result(pfn, MF_MSG_TRUNCATED_LRU, MF_IGNORED);
res = -EBUSY;
goto out;
}
@@ -1404,12 +1430,12 @@ int unpoison_memory(unsigned long pfn)
*/
if (!PageHuge(page) && PageTransHuge(page)) {
pr_info("MCE: Memory failure is now running on %#lx\n", pfn);
- return 0;
+ return 0;
}
nr_pages = 1 << compound_order(page);
- if (!get_page_unless_zero(page)) {
+ if (!get_hwpoison_page(p)) {
/*
* Since HWPoisoned hugepage should have non-zero refcount,
* race between memory failure and unpoison seems to happen.
@@ -1477,7 +1503,7 @@ static int __get_any_page(struct page *p, unsigned long pfn, int flags)
* When the target page is a free hugepage, just remove it
* from free hugepage list.
*/
- if (!get_page_unless_zero(compound_head(p))) {
+ if (!get_hwpoison_page(p)) {
if (PageHuge(p)) {
pr_info("%s: %#lx free huge page\n", __func__, pfn);
ret = 0;
@@ -1540,8 +1566,18 @@ static int soft_offline_huge_page(struct page *page, int flags)
}
unlock_page(hpage);
- /* Keep page count to indicate a given hugepage is isolated. */
- list_move(&hpage->lru, &pagelist);
+ ret = isolate_huge_page(hpage, &pagelist);
+ if (ret) {
+ /*
+ * get_any_page() and isolate_huge_page() takes a refcount each,
+ * so need to drop one here.
+ */
+ put_page(hpage);
+ } else {
+ pr_info("soft offline: %#lx hugepage failed to isolate\n", pfn);
+ return -EBUSY;
+ }
+
ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
MIGRATE_SYNC, MR_MEMORY_FAILURE);
if (ret) {
@@ -1623,6 +1659,8 @@ static int __soft_offline_page(struct page *page, int flags)
inc_zone_page_state(page, NR_ISOLATED_ANON +
page_is_file_cache(page));
list_add(&page->lru, &pagelist);
+ if (!TestSetPageHWPoison(page))
+ atomic_long_inc(&num_poisoned_pages);
ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
MIGRATE_SYNC, MR_MEMORY_FAILURE);
if (ret) {
@@ -1637,22 +1675,8 @@ static int __soft_offline_page(struct page *page, int flags)
pfn, ret, page->flags);
if (ret > 0)
ret = -EIO;
- } else {
- /*
- * After page migration succeeds, the source page can
- * be trapped in pagevec and actual freeing is delayed.
- * Freeing code works differently based on PG_hwpoison,
- * so there's a race. We need to make sure that the
- * source page should be freed back to buddy before
- * setting PG_hwpoison.
- */
- if (!is_free_buddy_page(page))
- drain_all_pages(page_zone(page));
- SetPageHWPoison(page);
- if (!is_free_buddy_page(page))
- pr_info("soft offline: %#lx: page leaked\n",
- pfn);
- atomic_long_inc(&num_poisoned_pages);
+ if (TestClearPageHWPoison(page))
+ atomic_long_dec(&num_poisoned_pages);
}
} else {
pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n",
@@ -1703,14 +1727,6 @@ int soft_offline_page(struct page *page, int flags)
get_online_mems();
- /*
- * Isolate the page, so that it doesn't get reallocated if it
- * was free. This flag should be kept set until the source page
- * is freed and PG_hwpoison on it is set.
- */
- if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE)
- set_migratetype_isolate(page, true);
-
ret = get_any_page(page, pfn, flags);
put_online_mems();
if (ret > 0) { /* for in-use pages */
@@ -1721,14 +1737,13 @@ int soft_offline_page(struct page *page, int flags)
} else if (ret == 0) { /* for free pages */
if (PageHuge(page)) {
set_page_hwpoison_huge_page(hpage);
- dequeue_hwpoisoned_huge_page(hpage);
- atomic_long_add(1 << compound_order(hpage),
+ if (!dequeue_hwpoisoned_huge_page(hpage))
+ atomic_long_add(1 << compound_order(hpage),
&num_poisoned_pages);
} else {
- SetPageHWPoison(page);
- atomic_long_inc(&num_poisoned_pages);
+ if (!TestSetPageHWPoison(page))
+ atomic_long_inc(&num_poisoned_pages);
}
}
- unset_migratetype_isolate(page, MIGRATE_MOVABLE);
return ret;
}