summaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2018-12-28 16:55:46 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2018-12-28 16:55:46 -0800
commitf346b0becb1bc62e45495f9cdbae3eef35d0b635 (patch)
treeae79f3dfb8e031da51d38f0f095f89d7d23f3643 /mm
parent00d59fde8532b2d42e80909d2e58678755e04da9 (diff)
parent0f4991e8fd48987ae476a92cdee6bfec4aff31b8 (diff)
downloadlinux-f346b0becb1bc62e45495f9cdbae3eef35d0b635.tar.bz2
Merge branch 'akpm' (patches from Andrew)
Merge misc updates from Andrew Morton: - large KASAN update to use arm's "software tag-based mode" - a few misc things - sh updates - ocfs2 updates - just about all of MM * emailed patches from Andrew Morton <akpm@linux-foundation.org>: (167 commits) kernel/fork.c: mark 'stack_vm_area' with __maybe_unused memcg, oom: notify on oom killer invocation from the charge path mm, swap: fix swapoff with KSM pages include/linux/gfp.h: fix typo mm/hmm: fix memremap.h, move dev_page_fault_t callback to hmm hugetlbfs: Use i_mmap_rwsem to fix page fault/truncate race hugetlbfs: use i_mmap_rwsem for more pmd sharing synchronization memory_hotplug: add missing newlines to debugging output mm: remove __hugepage_set_anon_rmap() include/linux/vmstat.h: remove unused page state adjustment macro mm/page_alloc.c: allow error injection mm: migrate: drop unused argument of migrate_page_move_mapping() blkdev: avoid migration stalls for blkdev pages mm: migrate: provide buffer_migrate_page_norefs() mm: migrate: move migrate_page_lock_buffers() mm: migrate: lock buffers before migrate_page_move_mapping() mm: migration: factor out code to compute expected number of page references mm, page_alloc: enable pcpu_drain with zone capability kmemleak: add config to select auto scan mm/page_alloc.c: don't call kasan_free_pages() at deferred mem init ...
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig1
-rw-r--r--mm/cma.c11
-rw-r--r--mm/compaction.c2
-rw-r--r--mm/debug.c27
-rw-r--r--mm/filemap.c96
-rw-r--r--mm/highmem.c5
-rw-r--r--mm/hmm.c331
-rw-r--r--mm/huge_memory.c74
-rw-r--r--mm/hugetlb.c133
-rw-r--r--mm/internal.h24
-rw-r--r--mm/kasan/Makefile15
-rw-r--r--mm/kasan/common.c (renamed from mm/kasan/kasan.c)656
-rw-r--r--mm/kasan/generic.c344
-rw-r--r--mm/kasan/generic_report.c153
-rw-r--r--mm/kasan/init.c (renamed from mm/kasan/kasan_init.c)71
-rw-r--r--mm/kasan/kasan.h59
-rw-r--r--mm/kasan/quarantine.c3
-rw-r--r--mm/kasan/report.c272
-rw-r--r--mm/kasan/tags.c161
-rw-r--r--mm/kasan/tags_report.c58
-rw-r--r--mm/khugepaged.c10
-rw-r--r--mm/kmemleak.c19
-rw-r--r--mm/ksm.c35
-rw-r--r--mm/madvise.c21
-rw-r--r--mm/memblock.c52
-rw-r--r--mm/memcontrol.c53
-rw-r--r--mm/memory-failure.c16
-rw-r--r--mm/memory.c103
-rw-r--r--mm/memory_hotplug.c172
-rw-r--r--mm/migrate.c264
-rw-r--r--mm/mm_init.c2
-rw-r--r--mm/mmap.c16
-rw-r--r--mm/mmu_notifier.c31
-rw-r--r--mm/mprotect.c15
-rw-r--r--mm/mremap.c10
-rw-r--r--mm/oom_kill.c51
-rw-r--r--mm/page-writeback.c35
-rw-r--r--mm/page_alloc.c404
-rw-r--r--mm/page_isolation.c10
-rw-r--r--mm/page_owner.c1
-rw-r--r--mm/readahead.c12
-rw-r--r--mm/rmap.c59
-rw-r--r--mm/shmem.c8
-rw-r--r--mm/slab.c31
-rw-r--r--mm/slab.h2
-rw-r--r--mm/slab_common.c10
-rw-r--r--mm/slub.c82
-rw-r--r--mm/sparse.c26
-rw-r--r--mm/swap.c2
-rw-r--r--mm/swapfile.c6
-rw-r--r--mm/userfaultfd.c11
-rw-r--r--mm/util.c2
-rw-r--r--mm/vmalloc.c4
-rw-r--r--mm/vmscan.c143
-rw-r--r--mm/vmstat.c4
-rw-r--r--mm/workingset.c2
-rw-r--r--mm/zswap.c4
57 files changed, 2454 insertions, 1770 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index d85e39da47ae..25c71eb8a7db 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -291,6 +291,7 @@ config MMU_NOTIFIER
config KSM
bool "Enable KSM for page merging"
depends on MMU
+ select XXHASH
help
Enable Kernel Samepage Merging: KSM periodically scans those areas
of an application's address space that an app has advised may be
diff --git a/mm/cma.c b/mm/cma.c
index 4cb76121a3ab..c7b39dd3b4f6 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -407,6 +407,7 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align,
unsigned long pfn = -1;
unsigned long start = 0;
unsigned long bitmap_maxno, bitmap_no, bitmap_count;
+ size_t i;
struct page *page = NULL;
int ret = -ENOMEM;
@@ -466,6 +467,16 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align,
trace_cma_alloc(pfn, page, count, align);
+ /*
+ * CMA can allocate multiple page blocks, which results in different
+ * blocks being marked with different tags. Reset the tags to ignore
+ * those page blocks.
+ */
+ if (page) {
+ for (i = 0; i < count; i++)
+ page_kasan_tag_reset(page + i);
+ }
+
if (ret && !no_warn) {
pr_err("%s: alloc failed, req-size: %zu pages, ret: %d\n",
__func__, count, ret);
diff --git a/mm/compaction.c b/mm/compaction.c
index 7c607479de4a..ef29490b0f46 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1431,7 +1431,7 @@ static enum compact_result __compaction_suitable(struct zone *zone, int order,
if (is_via_compact_memory(order))
return COMPACT_CONTINUE;
- watermark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
+ watermark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK);
/*
* If watermarks for high-order allocation are already met, there
* should be no need for compaction at all.
diff --git a/mm/debug.c b/mm/debug.c
index cdacba12e09a..0abb987dad9b 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -17,7 +17,7 @@
#include "internal.h"
-char *migrate_reason_names[MR_TYPES] = {
+const char *migrate_reason_names[MR_TYPES] = {
"compaction",
"memory_failure",
"memory_hotplug",
@@ -44,6 +44,7 @@ const struct trace_print_flags vmaflag_names[] = {
void __dump_page(struct page *page, const char *reason)
{
+ struct address_space *mapping = page_mapping(page);
bool page_poisoned = PagePoisoned(page);
int mapcount;
@@ -53,7 +54,7 @@ void __dump_page(struct page *page, const char *reason)
* dump_page() when detected.
*/
if (page_poisoned) {
- pr_emerg("page:%px is uninitialized and poisoned", page);
+ pr_warn("page:%px is uninitialized and poisoned", page);
goto hex_only;
}
@@ -64,27 +65,39 @@ void __dump_page(struct page *page, const char *reason)
*/
mapcount = PageSlab(page) ? 0 : page_mapcount(page);
- pr_emerg("page:%px count:%d mapcount:%d mapping:%px index:%#lx",
+ pr_warn("page:%px count:%d mapcount:%d mapping:%px index:%#lx",
page, page_ref_count(page), mapcount,
page->mapping, page_to_pgoff(page));
if (PageCompound(page))
pr_cont(" compound_mapcount: %d", compound_mapcount(page));
pr_cont("\n");
+ if (PageAnon(page))
+ pr_warn("anon ");
+ else if (PageKsm(page))
+ pr_warn("ksm ");
+ else if (mapping) {
+ pr_warn("%ps ", mapping->a_ops);
+ if (mapping->host->i_dentry.first) {
+ struct dentry *dentry;
+ dentry = container_of(mapping->host->i_dentry.first, struct dentry, d_u.d_alias);
+ pr_warn("name:\"%pd\" ", dentry);
+ }
+ }
BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS + 1);
- pr_emerg("flags: %#lx(%pGp)\n", page->flags, &page->flags);
+ pr_warn("flags: %#lx(%pGp)\n", page->flags, &page->flags);
hex_only:
- print_hex_dump(KERN_ALERT, "raw: ", DUMP_PREFIX_NONE, 32,
+ print_hex_dump(KERN_WARNING, "raw: ", DUMP_PREFIX_NONE, 32,
sizeof(unsigned long), page,
sizeof(struct page), false);
if (reason)
- pr_alert("page dumped because: %s\n", reason);
+ pr_warn("page dumped because: %s\n", reason);
#ifdef CONFIG_MEMCG
if (!page_poisoned && page->mem_cgroup)
- pr_alert("page->mem_cgroup:%px\n", page->mem_cgroup);
+ pr_warn("page->mem_cgroup:%px\n", page->mem_cgroup);
#endif
}
diff --git a/mm/filemap.c b/mm/filemap.c
index 81adec8ee02c..29655fb47a2c 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -981,7 +981,14 @@ static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync,
if (wait_page->bit_nr != key->bit_nr)
return 0;
- /* Stop walking if it's locked */
+ /*
+ * Stop walking if it's locked.
+ * Is this safe if put_and_wait_on_page_locked() is in use?
+ * Yes: the waker must hold a reference to this page, and if PG_locked
+ * has now already been set by another task, that task must also hold
+ * a reference to the *same usage* of this page; so there is no need
+ * to walk on to wake even the put_and_wait_on_page_locked() callers.
+ */
if (test_bit(key->bit_nr, &key->page->flags))
return -1;
@@ -1049,25 +1056,44 @@ static void wake_up_page(struct page *page, int bit)
wake_up_page_bit(page, bit);
}
+/*
+ * A choice of three behaviors for wait_on_page_bit_common():
+ */
+enum behavior {
+ EXCLUSIVE, /* Hold ref to page and take the bit when woken, like
+ * __lock_page() waiting on then setting PG_locked.
+ */
+ SHARED, /* Hold ref to page and check the bit when woken, like
+ * wait_on_page_writeback() waiting on PG_writeback.
+ */
+ DROP, /* Drop ref to page before wait, no check when woken,
+ * like put_and_wait_on_page_locked() on PG_locked.
+ */
+};
+
static inline int wait_on_page_bit_common(wait_queue_head_t *q,
- struct page *page, int bit_nr, int state, bool lock)
+ struct page *page, int bit_nr, int state, enum behavior behavior)
{
struct wait_page_queue wait_page;
wait_queue_entry_t *wait = &wait_page.wait;
+ bool bit_is_set;
bool thrashing = false;
+ bool delayacct = false;
unsigned long pflags;
int ret = 0;
if (bit_nr == PG_locked &&
!PageUptodate(page) && PageWorkingset(page)) {
- if (!PageSwapBacked(page))
+ if (!PageSwapBacked(page)) {
delayacct_thrashing_start();
+ delayacct = true;
+ }
psi_memstall_enter(&pflags);
thrashing = true;
}
init_wait(wait);
- wait->flags = lock ? WQ_FLAG_EXCLUSIVE : 0;
+ wait->flags = behavior == EXCLUSIVE ? WQ_FLAG_EXCLUSIVE : 0;
wait->func = wake_page_function;
wait_page.page = page;
wait_page.bit_nr = bit_nr;
@@ -1084,14 +1110,17 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q,
spin_unlock_irq(&q->lock);
- if (likely(test_bit(bit_nr, &page->flags))) {
+ bit_is_set = test_bit(bit_nr, &page->flags);
+ if (behavior == DROP)
+ put_page(page);
+
+ if (likely(bit_is_set))
io_schedule();
- }
- if (lock) {
+ if (behavior == EXCLUSIVE) {
if (!test_and_set_bit_lock(bit_nr, &page->flags))
break;
- } else {
+ } else if (behavior == SHARED) {
if (!test_bit(bit_nr, &page->flags))
break;
}
@@ -1100,12 +1129,23 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q,
ret = -EINTR;
break;
}
+
+ if (behavior == DROP) {
+ /*
+ * We can no longer safely access page->flags:
+ * even if CONFIG_MEMORY_HOTREMOVE is not enabled,
+ * there is a risk of waiting forever on a page reused
+ * for something that keeps it locked indefinitely.
+ * But best check for -EINTR above before breaking.
+ */
+ break;
+ }
}
finish_wait(q, wait);
if (thrashing) {
- if (!PageSwapBacked(page))
+ if (delayacct)
delayacct_thrashing_end();
psi_memstall_leave(&pflags);
}
@@ -1124,18 +1164,37 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q,
void wait_on_page_bit(struct page *page, int bit_nr)
{
wait_queue_head_t *q = page_waitqueue(page);
- wait_on_page_bit_common(q, page, bit_nr, TASK_UNINTERRUPTIBLE, false);
+ wait_on_page_bit_common(q, page, bit_nr, TASK_UNINTERRUPTIBLE, SHARED);
}
EXPORT_SYMBOL(wait_on_page_bit);
int wait_on_page_bit_killable(struct page *page, int bit_nr)
{
wait_queue_head_t *q = page_waitqueue(page);
- return wait_on_page_bit_common(q, page, bit_nr, TASK_KILLABLE, false);
+ return wait_on_page_bit_common(q, page, bit_nr, TASK_KILLABLE, SHARED);
}
EXPORT_SYMBOL(wait_on_page_bit_killable);
/**
+ * put_and_wait_on_page_locked - Drop a reference and wait for it to be unlocked
+ * @page: The page to wait for.
+ *
+ * The caller should hold a reference on @page. They expect the page to
+ * become unlocked relatively soon, but do not wish to hold up migration
+ * (for example) by holding the reference while waiting for the page to
+ * come unlocked. After this function returns, the caller should not
+ * dereference @page.
+ */
+void put_and_wait_on_page_locked(struct page *page)
+{
+ wait_queue_head_t *q;
+
+ page = compound_head(page);
+ q = page_waitqueue(page);
+ wait_on_page_bit_common(q, page, PG_locked, TASK_UNINTERRUPTIBLE, DROP);
+}
+
+/**
* add_page_wait_queue - Add an arbitrary waiter to a page's wait queue
* @page: Page defining the wait queue of interest
* @waiter: Waiter to add to the queue
@@ -1264,7 +1323,8 @@ void __lock_page(struct page *__page)
{
struct page *page = compound_head(__page);
wait_queue_head_t *q = page_waitqueue(page);
- wait_on_page_bit_common(q, page, PG_locked, TASK_UNINTERRUPTIBLE, true);
+ wait_on_page_bit_common(q, page, PG_locked, TASK_UNINTERRUPTIBLE,
+ EXCLUSIVE);
}
EXPORT_SYMBOL(__lock_page);
@@ -1272,7 +1332,8 @@ int __lock_page_killable(struct page *__page)
{
struct page *page = compound_head(__page);
wait_queue_head_t *q = page_waitqueue(page);
- return wait_on_page_bit_common(q, page, PG_locked, TASK_KILLABLE, true);
+ return wait_on_page_bit_common(q, page, PG_locked, TASK_KILLABLE,
+ EXCLUSIVE);
}
EXPORT_SYMBOL_GPL(__lock_page_killable);
@@ -1540,7 +1601,7 @@ repeat:
VM_BUG_ON_PAGE(page->index != offset, page);
}
- if (page && (fgp_flags & FGP_ACCESSED))
+ if (fgp_flags & FGP_ACCESSED)
mark_page_accessed(page);
no_page:
@@ -2553,6 +2614,13 @@ void filemap_map_pages(struct vm_fault *vmf,
goto next;
head = compound_head(page);
+
+ /*
+ * Check for a locked page first, as a speculative
+ * reference may adversely influence page migration.
+ */
+ if (PageLocked(head))
+ goto next;
if (!page_cache_get_speculative(head))
goto next;
diff --git a/mm/highmem.c b/mm/highmem.c
index 59db3223a5d6..107b10f9878e 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -105,9 +105,8 @@ static inline wait_queue_head_t *get_pkmap_wait_queue_head(unsigned int color)
}
#endif
-unsigned long totalhigh_pages __read_mostly;
-EXPORT_SYMBOL(totalhigh_pages);
-
+atomic_long_t _totalhigh_pages __read_mostly;
+EXPORT_SYMBOL(_totalhigh_pages);
EXPORT_PER_CPU_SYMBOL(__kmap_atomic_idx);
diff --git a/mm/hmm.c b/mm/hmm.c
index 90c34f3d1243..a04e4b810610 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -189,35 +189,30 @@ static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm)
}
static int hmm_invalidate_range_start(struct mmu_notifier *mn,
- struct mm_struct *mm,
- unsigned long start,
- unsigned long end,
- bool blockable)
+ const struct mmu_notifier_range *range)
{
struct hmm_update update;
- struct hmm *hmm = mm->hmm;
+ struct hmm *hmm = range->mm->hmm;
VM_BUG_ON(!hmm);
- update.start = start;
- update.end = end;
+ update.start = range->start;
+ update.end = range->end;
update.event = HMM_UPDATE_INVALIDATE;
- update.blockable = blockable;
+ update.blockable = range->blockable;
return hmm_invalidate_range(hmm, true, &update);
}
static void hmm_invalidate_range_end(struct mmu_notifier *mn,
- struct mm_struct *mm,
- unsigned long start,
- unsigned long end)
+ const struct mmu_notifier_range *range)
{
struct hmm_update update;
- struct hmm *hmm = mm->hmm;
+ struct hmm *hmm = range->mm->hmm;
VM_BUG_ON(!hmm);
- update.start = start;
- update.end = end;
+ update.start = range->start;
+ update.end = range->end;
update.event = HMM_UPDATE_INVALIDATE;
update.blockable = true;
hmm_invalidate_range(hmm, false, &update);
@@ -986,19 +981,13 @@ static void hmm_devmem_ref_exit(void *data)
struct hmm_devmem *devmem;
devmem = container_of(ref, struct hmm_devmem, ref);
+ wait_for_completion(&devmem->completion);
percpu_ref_exit(ref);
- devm_remove_action(devmem->device, &hmm_devmem_ref_exit, data);
}
-static void hmm_devmem_ref_kill(void *data)
+static void hmm_devmem_ref_kill(struct percpu_ref *ref)
{
- struct percpu_ref *ref = data;
- struct hmm_devmem *devmem;
-
- devmem = container_of(ref, struct hmm_devmem, ref);
percpu_ref_kill(ref);
- wait_for_completion(&devmem->completion);
- devm_remove_action(devmem->device, &hmm_devmem_ref_kill, data);
}
static int hmm_devmem_fault(struct vm_area_struct *vma,
@@ -1021,172 +1010,6 @@ static void hmm_devmem_free(struct page *page, void *data)
devmem->ops->free(devmem, page);
}
-static DEFINE_MUTEX(hmm_devmem_lock);
-static RADIX_TREE(hmm_devmem_radix, GFP_KERNEL);
-
-static void hmm_devmem_radix_release(struct resource *resource)
-{
- resource_size_t key;
-
- mutex_lock(&hmm_devmem_lock);
- for (key = resource->start;
- key <= resource->end;
- key += PA_SECTION_SIZE)
- radix_tree_delete(&hmm_devmem_radix, key >> PA_SECTION_SHIFT);
- mutex_unlock(&hmm_devmem_lock);
-}
-
-static void hmm_devmem_release(struct device *dev, void *data)
-{
- struct hmm_devmem *devmem = data;
- struct resource *resource = devmem->resource;
- unsigned long start_pfn, npages;
- struct zone *zone;
- struct page *page;
-
- if (percpu_ref_tryget_live(&devmem->ref)) {
- dev_WARN(dev, "%s: page mapping is still live!\n", __func__);
- percpu_ref_put(&devmem->ref);
- }
-
- /* pages are dead and unused, undo the arch mapping */
- start_pfn = (resource->start & ~(PA_SECTION_SIZE - 1)) >> PAGE_SHIFT;
- npages = ALIGN(resource_size(resource), PA_SECTION_SIZE) >> PAGE_SHIFT;
-
- page = pfn_to_page(start_pfn);
- zone = page_zone(page);
-
- mem_hotplug_begin();
- if (resource->desc == IORES_DESC_DEVICE_PRIVATE_MEMORY)
- __remove_pages(zone, start_pfn, npages, NULL);
- else
- arch_remove_memory(start_pfn << PAGE_SHIFT,
- npages << PAGE_SHIFT, NULL);
- mem_hotplug_done();
-
- hmm_devmem_radix_release(resource);
-}
-
-static int hmm_devmem_pages_create(struct hmm_devmem *devmem)
-{
- resource_size_t key, align_start, align_size, align_end;
- struct device *device = devmem->device;
- int ret, nid, is_ram;
-
- align_start = devmem->resource->start & ~(PA_SECTION_SIZE - 1);
- align_size = ALIGN(devmem->resource->start +
- resource_size(devmem->resource),
- PA_SECTION_SIZE) - align_start;
-
- is_ram = region_intersects(align_start, align_size,
- IORESOURCE_SYSTEM_RAM,
- IORES_DESC_NONE);
- if (is_ram == REGION_MIXED) {
- WARN_ONCE(1, "%s attempted on mixed region %pr\n",
- __func__, devmem->resource);
- return -ENXIO;
- }
- if (is_ram == REGION_INTERSECTS)
- return -ENXIO;
-
- if (devmem->resource->desc == IORES_DESC_DEVICE_PUBLIC_MEMORY)
- devmem->pagemap.type = MEMORY_DEVICE_PUBLIC;
- else
- devmem->pagemap.type = MEMORY_DEVICE_PRIVATE;
-
- devmem->pagemap.res = *devmem->resource;
- devmem->pagemap.page_fault = hmm_devmem_fault;
- devmem->pagemap.page_free = hmm_devmem_free;
- devmem->pagemap.dev = devmem->device;
- devmem->pagemap.ref = &devmem->ref;
- devmem->pagemap.data = devmem;
-
- mutex_lock(&hmm_devmem_lock);
- align_end = align_start + align_size - 1;
- for (key = align_start; key <= align_end; key += PA_SECTION_SIZE) {
- struct hmm_devmem *dup;
-
- dup = radix_tree_lookup(&hmm_devmem_radix,
- key >> PA_SECTION_SHIFT);
- if (dup) {
- dev_err(device, "%s: collides with mapping for %s\n",
- __func__, dev_name(dup->device));
- mutex_unlock(&hmm_devmem_lock);
- ret = -EBUSY;
- goto error;
- }
- ret = radix_tree_insert(&hmm_devmem_radix,
- key >> PA_SECTION_SHIFT,
- devmem);
- if (ret) {
- dev_err(device, "%s: failed: %d\n", __func__, ret);
- mutex_unlock(&hmm_devmem_lock);
- goto error_radix;
- }
- }
- mutex_unlock(&hmm_devmem_lock);
-
- nid = dev_to_node(device);
- if (nid < 0)
- nid = numa_mem_id();
-
- mem_hotplug_begin();
- /*
- * For device private memory we call add_pages() as we only need to
- * allocate and initialize struct page for the device memory. More-
- * over the device memory is un-accessible thus we do not want to
- * create a linear mapping for the memory like arch_add_memory()
- * would do.
- *
- * For device public memory, which is accesible by the CPU, we do
- * want the linear mapping and thus use arch_add_memory().
- */
- if (devmem->pagemap.type == MEMORY_DEVICE_PUBLIC)
- ret = arch_add_memory(nid, align_start, align_size, NULL,
- false);
- else
- ret = add_pages(nid, align_start >> PAGE_SHIFT,
- align_size >> PAGE_SHIFT, NULL, false);
- if (ret) {
- mem_hotplug_done();
- goto error_add_memory;
- }
- move_pfn_range_to_zone(&NODE_DATA(nid)->node_zones[ZONE_DEVICE],
- align_start >> PAGE_SHIFT,
- align_size >> PAGE_SHIFT, NULL);
- mem_hotplug_done();
-
- /*
- * Initialization of the pages has been deferred until now in order
- * to allow us to do the work while not holding the hotplug lock.
- */
- memmap_init_zone_device(&NODE_DATA(nid)->node_zones[ZONE_DEVICE],
- align_start >> PAGE_SHIFT,
- align_size >> PAGE_SHIFT, &devmem->pagemap);
-
- return 0;
-
-error_add_memory:
- untrack_pfn(NULL, PHYS_PFN(align_start), align_size);
-error_radix:
- hmm_devmem_radix_release(devmem->resource);
-error:
- return ret;
-}
-
-static int hmm_devmem_match(struct device *dev, void *data, void *match_data)
-{
- struct hmm_devmem *devmem = data;
-
- return devmem->resource == match_data;
-}
-
-static void hmm_devmem_pages_remove(struct hmm_devmem *devmem)
-{
- devres_release(devmem->device, &hmm_devmem_release,
- &hmm_devmem_match, devmem->resource);
-}
-
/*
* hmm_devmem_add() - hotplug ZONE_DEVICE memory for device memory
*
@@ -1210,12 +1033,12 @@ struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops,
{
struct hmm_devmem *devmem;
resource_size_t addr;
+ void *result;
int ret;
dev_pagemap_get_ops();
- devmem = devres_alloc_node(&hmm_devmem_release, sizeof(*devmem),
- GFP_KERNEL, dev_to_node(device));
+ devmem = devm_kzalloc(device, sizeof(*devmem), GFP_KERNEL);
if (!devmem)
return ERR_PTR(-ENOMEM);
@@ -1229,11 +1052,11 @@ struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops,
ret = percpu_ref_init(&devmem->ref, &hmm_devmem_ref_release,
0, GFP_KERNEL);
if (ret)
- goto error_percpu_ref;
+ return ERR_PTR(ret);
- ret = devm_add_action(device, hmm_devmem_ref_exit, &devmem->ref);
+ ret = devm_add_action_or_reset(device, hmm_devmem_ref_exit, &devmem->ref);
if (ret)
- goto error_devm_add_action;
+ return ERR_PTR(ret);
size = ALIGN(size, PA_SECTION_SIZE);
addr = min((unsigned long)iomem_resource.end,
@@ -1253,54 +1076,40 @@ struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops,
devmem->resource = devm_request_mem_region(device, addr, size,
dev_name(device));
- if (!devmem->resource) {
- ret = -ENOMEM;
- goto error_no_resource;
- }
+ if (!devmem->resource)
+ return ERR_PTR(-ENOMEM);
break;
}
- if (!devmem->resource) {
- ret = -ERANGE;
- goto error_no_resource;
- }
+ if (!devmem->resource)
+ return ERR_PTR(-ERANGE);
devmem->resource->desc = IORES_DESC_DEVICE_PRIVATE_MEMORY;
devmem->pfn_first = devmem->resource->start >> PAGE_SHIFT;
devmem->pfn_last = devmem->pfn_first +
(resource_size(devmem->resource) >> PAGE_SHIFT);
+ devmem->page_fault = hmm_devmem_fault;
- ret = hmm_devmem_pages_create(devmem);
- if (ret)
- goto error_pages;
-
- devres_add(device, devmem);
-
- ret = devm_add_action(device, hmm_devmem_ref_kill, &devmem->ref);
- if (ret) {
- hmm_devmem_remove(devmem);
- return ERR_PTR(ret);
- }
+ devmem->pagemap.type = MEMORY_DEVICE_PRIVATE;
+ devmem->pagemap.res = *devmem->resource;
+ devmem->pagemap.page_free = hmm_devmem_free;
+ devmem->pagemap.altmap_valid = false;
+ devmem->pagemap.ref = &devmem->ref;
+ devmem->pagemap.data = devmem;
+ devmem->pagemap.kill = hmm_devmem_ref_kill;
+ result = devm_memremap_pages(devmem->device, &devmem->pagemap);
+ if (IS_ERR(result))
+ return result;
return devmem;
-
-error_pages:
- devm_release_mem_region(device, devmem->resource->start,
- resource_size(devmem->resource));
-error_no_resource:
-error_devm_add_action:
- hmm_devmem_ref_kill(&devmem->ref);
- hmm_devmem_ref_exit(&devmem->ref);
-error_percpu_ref:
- devres_free(devmem);
- return ERR_PTR(ret);
}
-EXPORT_SYMBOL(hmm_devmem_add);
+EXPORT_SYMBOL_GPL(hmm_devmem_add);
struct hmm_devmem *hmm_devmem_add_resource(const struct hmm_devmem_ops *ops,
struct device *device,
struct resource *res)
{
struct hmm_devmem *devmem;
+ void *result;
int ret;
if (res->desc != IORES_DESC_DEVICE_PUBLIC_MEMORY)
@@ -1308,8 +1117,7 @@ struct hmm_devmem *hmm_devmem_add_resource(const struct hmm_devmem_ops *ops,
dev_pagemap_get_ops();
- devmem = devres_alloc_node(&hmm_devmem_release, sizeof(*devmem),
- GFP_KERNEL, dev_to_node(device));
+ devmem = devm_kzalloc(device, sizeof(*devmem), GFP_KERNEL);
if (!devmem)
return ERR_PTR(-ENOMEM);
@@ -1323,71 +1131,32 @@ struct hmm_devmem *hmm_devmem_add_resource(const struct hmm_devmem_ops *ops,
ret = percpu_ref_init(&devmem->ref, &hmm_devmem_ref_release,
0, GFP_KERNEL);
if (ret)
- goto error_percpu_ref;
+ return ERR_PTR(ret);
- ret = devm_add_action(device, hmm_devmem_ref_exit, &devmem->ref);
+ ret = devm_add_action_or_reset(device, hmm_devmem_ref_exit,
+ &devmem->ref);
if (ret)
- goto error_devm_add_action;
-
+ return ERR_PTR(ret);
devmem->pfn_first = devmem->resource->start >> PAGE_SHIFT;
devmem->pfn_last = devmem->pfn_first +
(resource_size(devmem->resource) >> PAGE_SHIFT);
+ devmem->page_fault = hmm_devmem_fault;
- ret = hmm_devmem_pages_create(devmem);
- if (ret)
- goto error_devm_add_action;
-
- devres_add(device, devmem);
-
- ret = devm_add_action(device, hmm_devmem_ref_kill, &devmem->ref);
- if (ret) {
- hmm_devmem_remove(devmem);
- return ERR_PTR(ret);
- }
+ devmem->pagemap.type = MEMORY_DEVICE_PUBLIC;
+ devmem->pagemap.res = *devmem->resource;
+ devmem->pagemap.page_free = hmm_devmem_free;
+ devmem->pagemap.altmap_valid = false;
+ devmem->pagemap.ref = &devmem->ref;
+ devmem->pagemap.data = devmem;
+ devmem->pagemap.kill = hmm_devmem_ref_kill;
+ result = devm_memremap_pages(devmem->device, &devmem->pagemap);
+ if (IS_ERR(result))
+ return result;
return devmem;
-
-error_devm_add_action:
- hmm_devmem_ref_kill(&devmem->ref);
- hmm_devmem_ref_exit(&devmem->ref);
-error_percpu_ref:
- devres_free(devmem);
- return ERR_PTR(ret);
-}
-EXPORT_SYMBOL(hmm_devmem_add_resource);
-
-/*
- * hmm_devmem_remove() - remove device memory (kill and free ZONE_DEVICE)
- *
- * @devmem: hmm_devmem struct use to track and manage the ZONE_DEVICE memory
- *
- * This will hot-unplug memory that was hotplugged by hmm_devmem_add on behalf
- * of the device driver. It will free struct page and remove the resource that
- * reserved the physical address range for this device memory.
- */
-void hmm_devmem_remove(struct hmm_devmem *devmem)
-{
- resource_size_t start, size;
- struct device *device;
- bool cdm = false;
-
- if (!devmem)
- return;
-
- device = devmem->device;
- start = devmem->resource->start;
- size = resource_size(devmem->resource);
-
- cdm = devmem->resource->desc == IORES_DESC_DEVICE_PUBLIC_MEMORY;
- hmm_devmem_ref_kill(&devmem->ref);
- hmm_devmem_ref_exit(&devmem->ref);
- hmm_devmem_pages_remove(devmem);
-
- if (!cdm)
- devm_release_mem_region(device, start, size);
}
-EXPORT_SYMBOL(hmm_devmem_remove);
+EXPORT_SYMBOL_GPL(hmm_devmem_add_resource);
/*
* A device driver that wants to handle multiple devices memory through a
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index e84a10b0d310..cbd977b1d60d 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -62,6 +62,16 @@ static struct shrinker deferred_split_shrinker;
static atomic_t huge_zero_refcount;
struct page *huge_zero_page __read_mostly;
+bool transparent_hugepage_enabled(struct vm_area_struct *vma)
+{
+ if (vma_is_anonymous(vma))
+ return __transparent_hugepage_enabled(vma);
+ if (vma_is_shmem(vma) && shmem_huge_enabled(vma))
+ return __transparent_hugepage_enabled(vma);
+
+ return false;
+}
+
static struct page *get_huge_zero_page(void)
{
struct page *zero_page;
@@ -420,7 +430,7 @@ static int __init hugepage_init(void)
* where the extra memory used could hurt more than TLB overhead
* is likely to save. The admin can still enable it through /sys.
*/
- if (totalram_pages < (512 << (20 - PAGE_SHIFT))) {
+ if (totalram_pages() < (512 << (20 - PAGE_SHIFT))) {
transparent_hugepage_flags = 0;
return 0;
}
@@ -1134,8 +1144,7 @@ static vm_fault_t do_huge_pmd_wp_page_fallback(struct vm_fault *vmf,
int i;
vm_fault_t ret = 0;
struct page **pages;
- unsigned long mmun_start; /* For mmu_notifiers */
- unsigned long mmun_end; /* For mmu_notifiers */
+ struct mmu_notifier_range range;
pages = kmalloc_array(HPAGE_PMD_NR, sizeof(struct page *),
GFP_KERNEL);
@@ -1173,9 +1182,9 @@ static vm_fault_t do_huge_pmd_wp_page_fallback(struct vm_fault *vmf,
cond_resched();
}
- mmun_start = haddr;
- mmun_end = haddr + HPAGE_PMD_SIZE;
- mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end);
+ mmu_notifier_range_init(&range, vma->vm_mm, haddr,
+ haddr + HPAGE_PMD_SIZE);
+ mmu_notifier_invalidate_range_start(&range);
vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
if (unlikely(!pmd_same(*vmf->pmd, orig_pmd)))
@@ -1220,8 +1229,7 @@ static vm_fault_t do_huge_pmd_wp_page_fallback(struct vm_fault *vmf,
* No need to double call mmu_notifier->invalidate_range() callback as
* the above pmdp_huge_clear_flush_notify() did already call it.
*/
- mmu_notifier_invalidate_range_only_end(vma->vm_mm, mmun_start,
- mmun_end);
+ mmu_notifier_invalidate_range_only_end(&range);
ret |= VM_FAULT_WRITE;
put_page(page);
@@ -1231,7 +1239,7 @@ out:
out_free_pages:
spin_unlock(vmf->ptl);
- mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end);
+ mmu_notifier_invalidate_range_end(&range);
for (i = 0; i < HPAGE_PMD_NR; i++) {
memcg = (void *)page_private(pages[i]);
set_page_private(pages[i], 0);
@@ -1248,8 +1256,7 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
struct page *page = NULL, *new_page;
struct mem_cgroup *memcg;
unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
- unsigned long mmun_start; /* For mmu_notifiers */
- unsigned long mmun_end; /* For mmu_notifiers */
+ struct mmu_notifier_range range;
gfp_t huge_gfp; /* for allocation and charge */
vm_fault_t ret = 0;
@@ -1293,7 +1300,7 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
get_page(page);
spin_unlock(vmf->ptl);
alloc:
- if (transparent_hugepage_enabled(vma) &&
+ if (__transparent_hugepage_enabled(vma) &&
!transparent_hugepage_debug_cow()) {
huge_gfp = alloc_hugepage_direct_gfpmask(vma);
new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PMD_ORDER);
@@ -1338,9 +1345,9 @@ alloc:
vma, HPAGE_PMD_NR);
__SetPageUptodate(new_page);
- mmun_start = haddr;
- mmun_end = haddr + HPAGE_PMD_SIZE;
- mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end);
+ mmu_notifier_range_init(&range, vma->vm_mm, haddr,
+ haddr + HPAGE_PMD_SIZE);
+ mmu_notifier_invalidate_range_start(&range);
spin_lock(vmf->ptl);
if (page)
@@ -1375,8 +1382,7 @@ out_mn:
* No need to double call mmu_notifier->invalidate_range() callback as
* the above pmdp_huge_clear_flush_notify() did already call it.
*/
- mmu_notifier_invalidate_range_only_end(vma->vm_mm, mmun_start,
- mmun_end);
+ mmu_notifier_invalidate_range_only_end(&range);
out:
return ret;
out_unlock:
@@ -1490,8 +1496,7 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd)
if (!get_page_unless_zero(page))
goto out_unlock;
spin_unlock(vmf->ptl);
- wait_on_page_locked(page);
- put_page(page);
+ put_and_wait_on_page_locked(page);
goto out;
}
@@ -1527,8 +1532,7 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd)
if (!get_page_unless_zero(page))
goto out_unlock;
spin_unlock(vmf->ptl);
- wait_on_page_locked(page);
- put_page(page);
+ put_and_wait_on_page_locked(page);
goto out;
}
@@ -2017,14 +2021,15 @@ void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
unsigned long address)
{
spinlock_t *ptl;
- struct mm_struct *mm = vma->vm_mm;
- unsigned long haddr = address & HPAGE_PUD_MASK;
+ struct mmu_notifier_range range;
- mmu_notifier_invalidate_range_start(mm, haddr, haddr + HPAGE_PUD_SIZE);
- ptl = pud_lock(mm, pud);
+ mmu_notifier_range_init(&range, vma->vm_mm, address & HPAGE_PUD_MASK,
+ (address & HPAGE_PUD_MASK) + HPAGE_PUD_SIZE);
+ mmu_notifier_invalidate_range_start(&range);
+ ptl = pud_lock(vma->vm_mm, pud);
if (unlikely(!pud_trans_huge(*pud) && !pud_devmap(*pud)))
goto out;
- __split_huge_pud_locked(vma, pud, haddr);
+ __split_huge_pud_locked(vma, pud, range.start);
out:
spin_unlock(ptl);
@@ -2032,8 +2037,7 @@ out:
* No need to double call mmu_notifier->invalidate_range() callback as
* the above pudp_huge_clear_flush_notify() did already call it.
*/
- mmu_notifier_invalidate_range_only_end(mm, haddr, haddr +
- HPAGE_PUD_SIZE);
+ mmu_notifier_invalidate_range_only_end(&range);
}
#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
@@ -2235,11 +2239,12 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long address, bool freeze, struct page *page)
{
spinlock_t *ptl;
- struct mm_struct *mm = vma->vm_mm;
- unsigned long haddr = address & HPAGE_PMD_MASK;
+ struct mmu_notifier_range range;
- mmu_notifier_invalidate_range_start(mm, haddr, haddr + HPAGE_PMD_SIZE);
- ptl = pmd_lock(mm, pmd);
+ mmu_notifier_range_init(&range, vma->vm_mm, address & HPAGE_PMD_MASK,
+ (address & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE);
+ mmu_notifier_invalidate_range_start(&range);
+ ptl = pmd_lock(vma->vm_mm, pmd);
/*
* If caller asks to setup a migration entries, we need a page to check
@@ -2255,7 +2260,7 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
clear_page_mlock(page);
} else if (!(pmd_devmap(*pmd) || is_pmd_migration_entry(*pmd)))
goto out;
- __split_huge_pmd_locked(vma, pmd, haddr, freeze);
+ __split_huge_pmd_locked(vma, pmd, range.start, freeze);
out:
spin_unlock(ptl);
/*
@@ -2271,8 +2276,7 @@ out:
* any further changes to individual pte will notify. So no need
* to call mmu_notifier->invalidate_range()
*/
- mmu_notifier_invalidate_range_only_end(mm, haddr, haddr +
- HPAGE_PMD_SIZE);
+ mmu_notifier_invalidate_range_only_end(&range);
}
void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address,
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index a80832487981..e37efd5d8318 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3238,24 +3238,35 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
struct page *ptepage;
unsigned long addr;
int cow;
+ struct address_space *mapping = vma->vm_file->f_mapping;
struct hstate *h = hstate_vma(vma);
unsigned long sz = huge_page_size(h);
- unsigned long mmun_start; /* For mmu_notifiers */
- unsigned long mmun_end; /* For mmu_notifiers */
+ struct mmu_notifier_range range;
int ret = 0;
cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
- mmun_start = vma->vm_start;
- mmun_end = vma->vm_end;
- if (cow)
- mmu_notifier_invalidate_range_start(src, mmun_start, mmun_end);
+ if (cow) {
+ mmu_notifier_range_init(&range, src, vma->vm_start,
+ vma->vm_end);
+ mmu_notifier_invalidate_range_start(&range);
+ } else {
+ /*
+ * For shared mappings i_mmap_rwsem must be held to call
+ * huge_pte_alloc, otherwise the returned ptep could go
+ * away if part of a shared pmd and another thread calls
+ * huge_pmd_unshare.
+ */
+ i_mmap_lock_read(mapping);
+ }
for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {
spinlock_t *src_ptl, *dst_ptl;
+
src_pte = huge_pte_offset(src, addr, sz);
if (!src_pte)
continue;
+
dst_pte = huge_pte_alloc(dst, addr, sz);
if (!dst_pte) {
ret = -ENOMEM;
@@ -3325,7 +3336,9 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
}
if (cow)
- mmu_notifier_invalidate_range_end(src, mmun_start, mmun_end);
+ mmu_notifier_invalidate_range_end(&range);
+ else
+ i_mmap_unlock_read(mapping);
return ret;
}
@@ -3342,8 +3355,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
struct page *page;
struct hstate *h = hstate_vma(vma);
unsigned long sz = huge_page_size(h);
- unsigned long mmun_start = start; /* For mmu_notifiers */
- unsigned long mmun_end = end; /* For mmu_notifiers */
+ struct mmu_notifier_range range;
WARN_ON(!is_vm_hugetlb_page(vma));
BUG_ON(start & ~huge_page_mask(h));
@@ -3359,8 +3371,9 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
/*
* If sharing possible, alert mmu notifiers of worst case.
*/
- adjust_range_if_pmd_sharing_possible(vma, &mmun_start, &mmun_end);
- mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
+ mmu_notifier_range_init(&range, mm, start, end);
+ adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
+ mmu_notifier_invalidate_range_start(&range);
address = start;
for (; address < end; address += sz) {
ptep = huge_pte_offset(mm, address, sz);
@@ -3428,7 +3441,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
if (ref_page)
break;
}
- mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+ mmu_notifier_invalidate_range_end(&range);
tlb_end_vma(tlb, vma);
}
@@ -3546,9 +3559,8 @@ static vm_fault_t hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
struct page *old_page, *new_page;
int outside_reserve = 0;
vm_fault_t ret = 0;
- unsigned long mmun_start; /* For mmu_notifiers */
- unsigned long mmun_end; /* For mmu_notifiers */
unsigned long haddr = address & huge_page_mask(h);
+ struct mmu_notifier_range range;
pte = huge_ptep_get(ptep);
old_page = pte_page(pte);
@@ -3627,9 +3639,8 @@ retry_avoidcopy:
__SetPageUptodate(new_page);
set_page_huge_active(new_page);
- mmun_start = haddr;
- mmun_end = mmun_start + huge_page_size(h);
- mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
+ mmu_notifier_range_init(&range, mm, haddr, haddr + huge_page_size(h));
+ mmu_notifier_invalidate_range_start(&range);
/*
* Retake the page table lock to check for racing updates
@@ -3642,7 +3653,7 @@ retry_avoidcopy:
/* Break COW */
huge_ptep_clear_flush(vma, haddr, ptep);
- mmu_notifier_invalidate_range(mm, mmun_start, mmun_end);
+ mmu_notifier_invalidate_range(mm, range.start, range.end);
set_huge_pte_at(mm, haddr, ptep,
make_huge_pte(vma, new_page, 1));
page_remove_rmap(old_page, true);
@@ -3651,7 +3662,7 @@ retry_avoidcopy:
new_page = old_page;
}
spin_unlock(ptl);
- mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+ mmu_notifier_invalidate_range_end(&range);
out_release_all:
restore_reserve_on_error(h, vma, haddr, new_page);
put_page(new_page);
@@ -3744,16 +3755,16 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
}
/*
- * Use page lock to guard against racing truncation
- * before we get page_table_lock.
+ * We can not race with truncation due to holding i_mmap_rwsem.
+ * Check once here for faults beyond end of file.
*/
+ size = i_size_read(mapping->host) >> huge_page_shift(h);
+ if (idx >= size)
+ goto out;
+
retry:
page = find_lock_page(mapping, idx);
if (!page) {
- size = i_size_read(mapping->host) >> huge_page_shift(h);
- if (idx >= size)
- goto out;
-
/*
* Check for page in userfault range
*/
@@ -3773,14 +3784,18 @@ retry:
};
/*
- * hugetlb_fault_mutex must be dropped before
- * handling userfault. Reacquire after handling
- * fault to make calling code simpler.
+ * hugetlb_fault_mutex and i_mmap_rwsem must be
+ * dropped before handling userfault. Reacquire
+ * after handling fault to make calling code simpler.
*/
hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping,
idx, haddr);
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+ i_mmap_unlock_read(mapping);
+
ret = handle_userfault(&vmf, VM_UFFD_MISSING);
+
+ i_mmap_lock_read(mapping);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
goto out;
}
@@ -3839,9 +3854,6 @@ retry:
}
ptl = huge_pte_lock(h, mm, ptep);
- size = i_size_read(mapping->host) >> huge_page_shift(h);
- if (idx >= size)
- goto backout;
ret = 0;
if (!huge_pte_none(huge_ptep_get(ptep)))
@@ -3928,6 +3940,11 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
if (ptep) {
+ /*
+ * Since we hold no locks, ptep could be stale. That is
+ * OK as we are only making decisions based on content and
+ * not actually modifying content here.
+ */
entry = huge_ptep_get(ptep);
if (unlikely(is_hugetlb_entry_migration(entry))) {
migration_entry_wait_huge(vma, mm, ptep);
@@ -3935,20 +3952,33 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
} else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
return VM_FAULT_HWPOISON_LARGE |
VM_FAULT_SET_HINDEX(hstate_index(h));
- } else {
- ptep = huge_pte_alloc(mm, haddr, huge_page_size(h));
- if (!ptep)
- return VM_FAULT_OOM;
}
+ /*
+ * Acquire i_mmap_rwsem before calling huge_pte_alloc and hold
+ * until finished with ptep. This serves two purposes:
+ * 1) It prevents huge_pmd_unshare from being called elsewhere
+ * and making the ptep no longer valid.
+ * 2) It synchronizes us with file truncation.
+ *
+ * ptep could have already be assigned via huge_pte_offset. That
+ * is OK, as huge_pte_alloc will return the same value unless
+ * something changed.
+ */
mapping = vma->vm_file->f_mapping;
- idx = vma_hugecache_offset(h, vma, haddr);
+ i_mmap_lock_read(mapping);
+ ptep = huge_pte_alloc(mm, haddr, huge_page_size(h));
+ if (!ptep) {
+ i_mmap_unlock_read(mapping);
+ return VM_FAULT_OOM;
+ }
/*
* Serialize hugepage allocation and instantiation, so that we don't
* get spurious allocation failures if two CPUs race to instantiate
* the same page in the page cache.
*/
+ idx = vma_hugecache_offset(h, vma, haddr);
hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, idx, haddr);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
@@ -4036,6 +4066,7 @@ out_ptl:
}
out_mutex:
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+ i_mmap_unlock_read(mapping);
/*
* Generally it's safe to hold refcount during waiting page lock. But
* here we just wait to defer the next page fault to avoid busy loop and
@@ -4340,21 +4371,21 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
pte_t pte;
struct hstate *h = hstate_vma(vma);
unsigned long pages = 0;
- unsigned long f_start = start;
- unsigned long f_end = end;
bool shared_pmd = false;
+ struct mmu_notifier_range range;
/*
* In the case of shared PMDs, the area to flush could be beyond
- * start/end. Set f_start/f_end to cover the maximum possible
+ * start/end. Set range.start/range.end to cover the maximum possible
* range if PMD sharing is possible.
*/
- adjust_range_if_pmd_sharing_possible(vma, &f_start, &f_end);
+ mmu_notifier_range_init(&range, mm, start, end);
+ adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
BUG_ON(address >= end);
- flush_cache_range(vma, f_start, f_end);
+ flush_cache_range(vma, range.start, range.end);
- mmu_notifier_invalidate_range_start(mm, f_start, f_end);
+ mmu_notifier_invalidate_range_start(&range);
i_mmap_lock_write(vma->vm_file->f_mapping);
for (; address < end; address += huge_page_size(h)) {
spinlock_t *ptl;
@@ -4405,7 +4436,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
* did unshare a page of pmds, flush the range corresponding to the pud.
*/
if (shared_pmd)
- flush_hugetlb_tlb_range(vma, f_start, f_end);
+ flush_hugetlb_tlb_range(vma, range.start, range.end);
else
flush_hugetlb_tlb_range(vma, start, end);
/*
@@ -4415,7 +4446,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
* See Documentation/vm/mmu_notifier.rst
*/
i_mmap_unlock_write(vma->vm_file->f_mapping);
- mmu_notifier_invalidate_range_end(mm, f_start, f_end);
+ mmu_notifier_invalidate_range_end(&range);
return pages << h->order;
}
@@ -4640,10 +4671,12 @@ void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
* Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc()
* and returns the corresponding pte. While this is not necessary for the
* !shared pmd case because we can allocate the pmd later as well, it makes the
- * code much cleaner. pmd allocation is essential for the shared case because
- * pud has to be populated inside the same i_mmap_rwsem section - otherwise
- * racing tasks could either miss the sharing (see huge_pte_offset) or select a
- * bad pmd for sharing.
+ * code much cleaner.
+ *
+ * This routine must be called with i_mmap_rwsem held in at least read mode.
+ * For hugetlbfs, this prevents removal of any page table entries associated
+ * with the address space. This is important as we are setting up sharing
+ * based on existing page table entries (mappings).
*/
pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
{
@@ -4660,7 +4693,6 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
if (!vma_shareable(vma, addr))
return (pte_t *)pmd_alloc(mm, pud, addr);
- i_mmap_lock_write(mapping);
vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) {
if (svma == vma)
continue;
@@ -4690,7 +4722,6 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
spin_unlock(ptl);
out:
pte = (pte_t *)pmd_alloc(mm, pud, addr);
- i_mmap_unlock_write(mapping);
return pte;
}
@@ -4701,7 +4732,7 @@ out:
* indicated by page_count > 1, unmap is achieved by clearing pud and
* decrementing the ref count. If count == 1, the pte page is not shared.
*
- * called with page table lock held.
+ * Called with page table lock held and i_mmap_rwsem held in write mode.
*
* returns: 1 successfully unmapped a shared pte page
* 0 the underlying pte page is not shared, or it is the last user
diff --git a/mm/internal.h b/mm/internal.h
index 291eb2b6d1d8..f4a7bb02decf 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -444,6 +444,16 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn,
#define NODE_RECLAIM_SOME 0
#define NODE_RECLAIM_SUCCESS 1
+#ifdef CONFIG_NUMA
+extern int node_reclaim(struct pglist_data *, gfp_t, unsigned int);
+#else
+static inline int node_reclaim(struct pglist_data *pgdat, gfp_t mask,
+ unsigned int order)
+{
+ return NODE_RECLAIM_NOSCAN;
+}
+#endif
+
extern int hwpoison_filter(struct page *p);
extern u32 hwpoison_filter_dev_major;
@@ -480,10 +490,16 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
#define ALLOC_OOM ALLOC_NO_WATERMARKS
#endif
-#define ALLOC_HARDER 0x10 /* try to alloc harder */
-#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */
-#define ALLOC_CPUSET 0x40 /* check for correct cpuset */
-#define ALLOC_CMA 0x80 /* allow allocations from CMA areas */
+#define ALLOC_HARDER 0x10 /* try to alloc harder */
+#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */
+#define ALLOC_CPUSET 0x40 /* check for correct cpuset */
+#define ALLOC_CMA 0x80 /* allow allocations from CMA areas */
+#ifdef CONFIG_ZONE_DMA32
+#define ALLOC_NOFRAGMENT 0x100 /* avoid mixing pageblock types */
+#else
+#define ALLOC_NOFRAGMENT 0x0
+#endif
+#define ALLOC_KSWAPD 0x200 /* allow waking of kswapd */
enum ttu_flags;
struct tlbflush_unmap_batch;
diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile
index 3289db38bc87..0a14fcff70ed 100644
--- a/mm/kasan/Makefile
+++ b/mm/kasan/Makefile
@@ -1,11 +1,18 @@
# SPDX-License-Identifier: GPL-2.0
KASAN_SANITIZE := n
-UBSAN_SANITIZE_kasan.o := n
+UBSAN_SANITIZE_common.o := n
+UBSAN_SANITIZE_generic.o := n
+UBSAN_SANITIZE_tags.o := n
KCOV_INSTRUMENT := n
-CFLAGS_REMOVE_kasan.o = -pg
+CFLAGS_REMOVE_generic.o = -pg
# Function splitter causes unnecessary splits in __asan_load1/__asan_store1
# see: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63533
-CFLAGS_kasan.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector)
-obj-y := kasan.o report.o kasan_init.o quarantine.o
+CFLAGS_common.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector)
+CFLAGS_generic.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector)
+CFLAGS_tags.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector)
+
+obj-$(CONFIG_KASAN) := common.o init.o report.o
+obj-$(CONFIG_KASAN_GENERIC) += generic.o generic_report.o quarantine.o
+obj-$(CONFIG_KASAN_SW_TAGS) += tags.o tags_report.o
diff --git a/mm/kasan/kasan.c b/mm/kasan/common.c
index c3bd5209da38..03d5d1374ca7 100644
--- a/mm/kasan/kasan.c
+++ b/mm/kasan/common.c
@@ -1,5 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0
/*
- * This file contains shadow memory manipulation code.
+ * This file contains common generic and tag-based KASAN code.
*
* Copyright (c) 2014 Samsung Electronics Co., Ltd.
* Author: Andrey Ryabinin <ryabinin.a.a@gmail.com>
@@ -13,9 +14,6 @@
*
*/
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-#define DISABLE_BRANCH_PROFILING
-
#include <linux/export.h>
#include <linux/interrupt.h>
#include <linux/init.h>
@@ -40,6 +38,53 @@
#include "kasan.h"
#include "../slab.h"
+static inline int in_irqentry_text(unsigned long ptr)
+{
+ return (ptr >= (unsigned long)&__irqentry_text_start &&
+ ptr < (unsigned long)&__irqentry_text_end) ||
+ (ptr >= (unsigned long)&__softirqentry_text_start &&
+ ptr < (unsigned long)&__softirqentry_text_end);
+}
+
+static inline void filter_irq_stacks(struct stack_trace *trace)
+{
+ int i;
+
+ if (!trace->nr_entries)
+ return;
+ for (i = 0; i < trace->nr_entries; i++)
+ if (in_irqentry_text(trace->entries[i])) {
+ /* Include the irqentry function into the stack. */
+ trace->nr_entries = i + 1;
+ break;
+ }
+}
+
+static inline depot_stack_handle_t save_stack(gfp_t flags)
+{
+ unsigned long entries[KASAN_STACK_DEPTH];
+ struct stack_trace trace = {
+ .nr_entries = 0,
+ .entries = entries,
+ .max_entries = KASAN_STACK_DEPTH,
+ .skip = 0
+ };
+
+ save_stack_trace(&trace);
+ filter_irq_stacks(&trace);
+ if (trace.nr_entries != 0 &&
+ trace.entries[trace.nr_entries-1] == ULONG_MAX)
+ trace.nr_entries--;
+
+ return depot_save_stack(&trace, flags);
+}
+
+static inline void set_track(struct kasan_track *track, gfp_t flags)
+{
+ track->pid = current->pid;
+ track->stack = save_stack(flags);
+}
+
void kasan_enable_current(void)
{
current->kasan_depth++;
@@ -50,27 +95,85 @@ void kasan_disable_current(void)
current->kasan_depth--;
}
+void kasan_check_read(const volatile void *p, unsigned int size)
+{
+ check_memory_region((unsigned long)p, size, false, _RET_IP_);
+}
+EXPORT_SYMBOL(kasan_check_read);
+
+void kasan_check_write(const volatile void *p, unsigned int size)
+{
+ check_memory_region((unsigned long)p, size, true, _RET_IP_);
+}
+EXPORT_SYMBOL(kasan_check_write);
+
+#undef memset
+void *memset(void *addr, int c, size_t len)
+{
+ check_memory_region((unsigned long)addr, len, true, _RET_IP_);
+
+ return __memset(addr, c, len);
+}
+
+#undef memmove
+void *memmove(void *dest, const void *src, size_t len)
+{
+ check_memory_region((unsigned long)src, len, false, _RET_IP_);
+ check_memory_region((unsigned long)dest, len, true, _RET_IP_);
+
+ return __memmove(dest, src, len);
+}
+
+#undef memcpy
+void *memcpy(void *dest, const void *src, size_t len)
+{
+ check_memory_region((unsigned long)src, len, false, _RET_IP_);
+ check_memory_region((unsigned long)dest, len, true, _RET_IP_);
+
+ return __memcpy(dest, src, len);
+}
+
/*
* Poisons the shadow memory for 'size' bytes starting from 'addr'.
* Memory addresses should be aligned to KASAN_SHADOW_SCALE_SIZE.
*/
-static void kasan_poison_shadow(const void *address, size_t size, u8 value)
+void kasan_poison_shadow(const void *address, size_t size, u8 value)
{
void *shadow_start, *shadow_end;
+ /*
+ * Perform shadow offset calculation based on untagged address, as
+ * some of the callers (e.g. kasan_poison_object_data) pass tagged
+ * addresses to this function.
+ */
+ address = reset_tag(address);
+
shadow_start = kasan_mem_to_shadow(address);
shadow_end = kasan_mem_to_shadow(address + size);
- memset(shadow_start, value, shadow_end - shadow_start);
+ __memset(shadow_start, value, shadow_end - shadow_start);
}
void kasan_unpoison_shadow(const void *address, size_t size)
{
- kasan_poison_shadow(address, size, 0);
+ u8 tag = get_tag(address);
+
+ /*
+ * Perform shadow offset calculation based on untagged address, as
+ * some of the callers (e.g. kasan_unpoison_object_data) pass tagged
+ * addresses to this function.
+ */
+ address = reset_tag(address);
+
+ kasan_poison_shadow(address, size, tag);
if (size & KASAN_SHADOW_MASK) {
u8 *shadow = (u8 *)kasan_mem_to_shadow(address + size);
- *shadow = size & KASAN_SHADOW_MASK;
+
+ if (IS_ENABLED(CONFIG_KASAN_SW_TAGS))
+ *shadow = tag;
+ else
+ *shadow = size & KASAN_SHADOW_MASK;
}
}
@@ -116,199 +219,18 @@ void kasan_unpoison_stack_above_sp_to(const void *watermark)
kasan_unpoison_shadow(sp, size);
}
-/*
- * All functions below always inlined so compiler could
- * perform better optimizations in each of __asan_loadX/__assn_storeX
- * depending on memory access size X.
- */
-
-static __always_inline bool memory_is_poisoned_1(unsigned long addr)
-{
- s8 shadow_value = *(s8 *)kasan_mem_to_shadow((void *)addr);
-
- if (unlikely(shadow_value)) {
- s8 last_accessible_byte = addr & KASAN_SHADOW_MASK;
- return unlikely(last_accessible_byte >= shadow_value);
- }
-
- return false;
-}
-
-static __always_inline bool memory_is_poisoned_2_4_8(unsigned long addr,
- unsigned long size)
-{
- u8 *shadow_addr = (u8 *)kasan_mem_to_shadow((void *)addr);
-
- /*
- * Access crosses 8(shadow size)-byte boundary. Such access maps
- * into 2 shadow bytes, so we need to check them both.
- */
- if (unlikely(((addr + size - 1) & KASAN_SHADOW_MASK) < size - 1))
- return *shadow_addr || memory_is_poisoned_1(addr + size - 1);
-
- return memory_is_poisoned_1(addr + size - 1);
-}
-
-static __always_inline bool memory_is_poisoned_16(unsigned long addr)
-{
- u16 *shadow_addr = (u16 *)kasan_mem_to_shadow((void *)addr);
-
- /* Unaligned 16-bytes access maps into 3 shadow bytes. */
- if (unlikely(!IS_ALIGNED(addr, KASAN_SHADOW_SCALE_SIZE)))
- return *shadow_addr || memory_is_poisoned_1(addr + 15);
-
- return *shadow_addr;
-}
-
-static __always_inline unsigned long bytes_is_nonzero(const u8 *start,
- size_t size)
-{
- while (size) {
- if (unlikely(*start))
- return (unsigned long)start;
- start++;
- size--;
- }
-
- return 0;
-}
-
-static __always_inline unsigned long memory_is_nonzero(const void *start,
- const void *end)
-{
- unsigned int words;
- unsigned long ret;
- unsigned int prefix = (unsigned long)start % 8;
-
- if (end - start <= 16)
- return bytes_is_nonzero(start, end - start);
-
- if (prefix) {
- prefix = 8 - prefix;
- ret = bytes_is_nonzero(start, prefix);
- if (unlikely(ret))
- return ret;
- start += prefix;
- }
-
- words = (end - start) / 8;
- while (words) {
- if (unlikely(*(u64 *)start))
- return bytes_is_nonzero(start, 8);
- start += 8;
- words--;
- }
-
- return bytes_is_nonzero(start, (end - start) % 8);
-}
-
-static __always_inline bool memory_is_poisoned_n(unsigned long addr,
- size_t size)
-{
- unsigned long ret;
-
- ret = memory_is_nonzero(kasan_mem_to_shadow((void *)addr),
- kasan_mem_to_shadow((void *)addr + size - 1) + 1);
-
- if (unlikely(ret)) {
- unsigned long last_byte = addr + size - 1;
- s8 *last_shadow = (s8 *)kasan_mem_to_shadow((void *)last_byte);
-
- if (unlikely(ret != (unsigned long)last_shadow ||
- ((long)(last_byte & KASAN_SHADOW_MASK) >= *last_shadow)))
- return true;
- }
- return false;
-}
-
-static __always_inline bool memory_is_poisoned(unsigned long addr, size_t size)
-{
- if (__builtin_constant_p(size)) {
- switch (size) {
- case 1:
- return memory_is_poisoned_1(addr);
- case 2:
- case 4:
- case 8:
- return memory_is_poisoned_2_4_8(addr, size);
- case 16:
- return memory_is_poisoned_16(addr);
- default:
- BUILD_BUG();
- }
- }
-
- return memory_is_poisoned_n(addr, size);
-}
-
-static __always_inline void check_memory_region_inline(unsigned long addr,
- size_t size, bool write,
- unsigned long ret_ip)
+void kasan_alloc_pages(struct page *page, unsigned int order)
{
- if (unlikely(size == 0))
- return;
+ u8 tag;
+ unsigned long i;
- if (unlikely((void *)addr <
- kasan_shadow_to_mem((void *)KASAN_SHADOW_START))) {
- kasan_report(addr, size, write, ret_ip);
+ if (unlikely(PageHighMem(page)))
return;
- }
- if (likely(!memory_is_poisoned(addr, size)))
- return;
-
- kasan_report(addr, size, write, ret_ip);
-}
-
-static void check_memory_region(unsigned long addr,
- size_t size, bool write,
- unsigned long ret_ip)
-{
- check_memory_region_inline(addr, size, write, ret_ip);
-}
-
-void kasan_check_read(const volatile void *p, unsigned int size)
-{
- check_memory_region((unsigned long)p, size, false, _RET_IP_);
-}
-EXPORT_SYMBOL(kasan_check_read);
-
-void kasan_check_write(const volatile void *p, unsigned int size)
-{
- check_memory_region((unsigned long)p, size, true, _RET_IP_);
-}
-EXPORT_SYMBOL(kasan_check_write);
-
-#undef memset
-void *memset(void *addr, int c, size_t len)
-{
- check_memory_region((unsigned long)addr, len, true, _RET_IP_);
-
- return __memset(addr, c, len);
-}
-
-#undef memmove
-void *memmove(void *dest, const void *src, size_t len)
-{
- check_memory_region((unsigned long)src, len, false, _RET_IP_);
- check_memory_region((unsigned long)dest, len, true, _RET_IP_);
-
- return __memmove(dest, src, len);
-}
-
-#undef memcpy
-void *memcpy(void *dest, const void *src, size_t len)
-{
- check_memory_region((unsigned long)src, len, false, _RET_IP_);
- check_memory_region((unsigned long)dest, len, true, _RET_IP_);
-
- return __memcpy(dest, src, len);
-}
-
-void kasan_alloc_pages(struct page *page, unsigned int order)
-{
- if (likely(!PageHighMem(page)))
- kasan_unpoison_shadow(page_address(page), PAGE_SIZE << order);
+ tag = random_tag();
+ for (i = 0; i < (1 << order); i++)
+ page_kasan_tag_set(page + i, tag);
+ kasan_unpoison_shadow(page_address(page), PAGE_SIZE << order);
}
void kasan_free_pages(struct page *page, unsigned int order)
@@ -323,8 +245,11 @@ void kasan_free_pages(struct page *page, unsigned int order)
* Adaptive redzone policy taken from the userspace AddressSanitizer runtime.
* For larger allocations larger redzones are used.
*/
-static unsigned int optimal_redzone(unsigned int object_size)
+static inline unsigned int optimal_redzone(unsigned int object_size)
{
+ if (IS_ENABLED(CONFIG_KASAN_SW_TAGS))
+ return 0;
+
return
object_size <= 64 - 16 ? 16 :
object_size <= 128 - 32 ? 32 :
@@ -339,6 +264,7 @@ void kasan_cache_create(struct kmem_cache *cache, unsigned int *size,
slab_flags_t *flags)
{
unsigned int orig_size = *size;
+ unsigned int redzone_size;
int redzone_adjust;
/* Add alloc meta. */
@@ -346,20 +272,20 @@ void kasan_cache_create(struct kmem_cache *cache, unsigned int *size,
*size += sizeof(struct kasan_alloc_meta);
/* Add free meta. */
- if (cache->flags & SLAB_TYPESAFE_BY_RCU || cache->ctor ||
- cache->object_size < sizeof(struct kasan_free_meta)) {
+ if (IS_ENABLED(CONFIG_KASAN_GENERIC) &&
+ (cache->flags & SLAB_TYPESAFE_BY_RCU || cache->ctor ||
+ cache->object_size < sizeof(struct kasan_free_meta))) {
cache->kasan_info.free_meta_offset = *size;
*size += sizeof(struct kasan_free_meta);
}
- redzone_adjust = optimal_redzone(cache->object_size) -
- (*size - cache->object_size);
+ redzone_size = optimal_redzone(cache->object_size);
+ redzone_adjust = redzone_size - (*size - cache->object_size);
if (redzone_adjust > 0)
*size += redzone_adjust;
*size = min_t(unsigned int, KMALLOC_MAX_SIZE,
- max(*size, cache->object_size +
- optimal_redzone(cache->object_size)));
+ max(*size, cache->object_size + redzone_size));
/*
* If the metadata doesn't fit, don't enable KASAN at all.
@@ -372,30 +298,39 @@ void kasan_cache_create(struct kmem_cache *cache, unsigned int *size,
return;
}
+ cache->align = round_up(cache->align, KASAN_SHADOW_SCALE_SIZE);
+
*flags |= SLAB_KASAN;
}
-void kasan_cache_shrink(struct kmem_cache *cache)
+size_t kasan_metadata_size(struct kmem_cache *cache)
{
- quarantine_remove_cache(cache);
+ return (cache->kasan_info.alloc_meta_offset ?
+ sizeof(struct kasan_alloc_meta) : 0) +
+ (cache->kasan_info.free_meta_offset ?
+ sizeof(struct kasan_free_meta) : 0);
}
-void kasan_cache_shutdown(struct kmem_cache *cache)
+struct kasan_alloc_meta *get_alloc_info(struct kmem_cache *cache,
+ const void *object)
{
- if (!__kmem_cache_empty(cache))
- quarantine_remove_cache(cache);
+ BUILD_BUG_ON(sizeof(struct kasan_alloc_meta) > 32);
+ return (void *)object + cache->kasan_info.alloc_meta_offset;
}
-size_t kasan_metadata_size(struct kmem_cache *cache)
+struct kasan_free_meta *get_free_info(struct kmem_cache *cache,
+ const void *object)
{
- return (cache->kasan_info.alloc_meta_offset ?
- sizeof(struct kasan_alloc_meta) : 0) +
- (cache->kasan_info.free_meta_offset ?
- sizeof(struct kasan_free_meta) : 0);
+ BUILD_BUG_ON(sizeof(struct kasan_free_meta) > 32);
+ return (void *)object + cache->kasan_info.free_meta_offset;
}
void kasan_poison_slab(struct page *page)
{
+ unsigned long i;
+
+ for (i = 0; i < (1 << compound_order(page)); i++)
+ page_kasan_tag_reset(page + i);
kasan_poison_shadow(page_address(page),
PAGE_SIZE << compound_order(page),
KASAN_KMALLOC_REDZONE);
@@ -413,92 +348,79 @@ void kasan_poison_object_data(struct kmem_cache *cache, void *object)
KASAN_KMALLOC_REDZONE);
}
-static inline int in_irqentry_text(unsigned long ptr)
-{
- return (ptr >= (unsigned long)&__irqentry_text_start &&
- ptr < (unsigned long)&__irqentry_text_end) ||
- (ptr >= (unsigned long)&__softirqentry_text_start &&
- ptr < (unsigned long)&__softirqentry_text_end);
-}
-
-static inline void filter_irq_stacks(struct stack_trace *trace)
+/*
+ * Since it's desirable to only call object contructors once during slab
+ * allocation, we preassign tags to all such objects. Also preassign tags for
+ * SLAB_TYPESAFE_BY_RCU slabs to avoid use-after-free reports.
+ * For SLAB allocator we can't preassign tags randomly since the freelist is
+ * stored as an array of indexes instead of a linked list. Assign tags based
+ * on objects indexes, so that objects that are next to each other get
+ * different tags.
+ * After a tag is assigned, the object always gets allocated with the same tag.
+ * The reason is that we can't change tags for objects with constructors on
+ * reallocation (even for non-SLAB_TYPESAFE_BY_RCU), because the constructor
+ * code can save the pointer to the object somewhere (e.g. in the object
+ * itself). Then if we retag it, the old saved pointer will become invalid.
+ */
+static u8 assign_tag(struct kmem_cache *cache, const void *object, bool new)
{
- int i;
+ if (!cache->ctor && !(cache->flags & SLAB_TYPESAFE_BY_RCU))
+ return new ? KASAN_TAG_KERNEL : random_tag();
- if (!trace->nr_entries)
- return;
- for (i = 0; i < trace->nr_entries; i++)
- if (in_irqentry_text(trace->entries[i])) {
- /* Include the irqentry function into the stack. */
- trace->nr_entries = i + 1;
- break;
- }
+#ifdef CONFIG_SLAB
+ return (u8)obj_to_index(cache, virt_to_page(object), (void *)object);
+#else
+ return new ? random_tag() : get_tag(object);
+#endif
}
-static inline depot_stack_handle_t save_stack(gfp_t flags)
+void * __must_check kasan_init_slab_obj(struct kmem_cache *cache,
+ const void *object)
{
- unsigned long entries[KASAN_STACK_DEPTH];
- struct stack_trace trace = {
- .nr_entries = 0,
- .entries = entries,
- .max_entries = KASAN_STACK_DEPTH,
- .skip = 0
- };
-
- save_stack_trace(&trace);
- filter_irq_stacks(&trace);
- if (trace.nr_entries != 0 &&
- trace.entries[trace.nr_entries-1] == ULONG_MAX)
- trace.nr_entries--;
+ struct kasan_alloc_meta *alloc_info;
- return depot_save_stack(&trace, flags);
-}
+ if (!(cache->flags & SLAB_KASAN))
+ return (void *)object;
-static inline void set_track(struct kasan_track *track, gfp_t flags)
-{
- track->pid = current->pid;
- track->stack = save_stack(flags);
-}
+ alloc_info = get_alloc_info(cache, object);
+ __memset(alloc_info, 0, sizeof(*alloc_info));
-struct kasan_alloc_meta *get_alloc_info(struct kmem_cache *cache,
- const void *object)
-{
- BUILD_BUG_ON(sizeof(struct kasan_alloc_meta) > 32);
- return (void *)object + cache->kasan_info.alloc_meta_offset;
-}
+ if (IS_ENABLED(CONFIG_KASAN_SW_TAGS))
+ object = set_tag(object, assign_tag(cache, object, true));
-struct kasan_free_meta *get_free_info(struct kmem_cache *cache,
- const void *object)
-{
- BUILD_BUG_ON(sizeof(struct kasan_free_meta) > 32);
- return (void *)object + cache->kasan_info.free_meta_offset;
+ return (void *)object;
}
-void kasan_init_slab_obj(struct kmem_cache *cache, const void *object)
+void * __must_check kasan_slab_alloc(struct kmem_cache *cache, void *object,
+ gfp_t flags)
{
- struct kasan_alloc_meta *alloc_info;
-
- if (!(cache->flags & SLAB_KASAN))
- return;
-
- alloc_info = get_alloc_info(cache, object);
- __memset(alloc_info, 0, sizeof(*alloc_info));
+ return kasan_kmalloc(cache, object, cache->object_size, flags);
}
-void kasan_slab_alloc(struct kmem_cache *cache, void *object, gfp_t flags)
+static inline bool shadow_invalid(u8 tag, s8 shadow_byte)
{
- kasan_kmalloc(cache, object, cache->object_size, flags);
+ if (IS_ENABLED(CONFIG_KASAN_GENERIC))
+ return shadow_byte < 0 ||
+ shadow_byte >= KASAN_SHADOW_SCALE_SIZE;
+ else
+ return tag != (u8)shadow_byte;
}
static bool __kasan_slab_free(struct kmem_cache *cache, void *object,
unsigned long ip, bool quarantine)
{
s8 shadow_byte;
+ u8 tag;
+ void *tagged_object;
unsigned long rounded_up_size;
+ tag = get_tag(object);
+ tagged_object = object;
+ object = reset_tag(object);
+
if (unlikely(nearest_obj(cache, virt_to_head_page(object), object) !=
object)) {
- kasan_report_invalid_free(object, ip);
+ kasan_report_invalid_free(tagged_object, ip);
return true;
}
@@ -507,20 +429,22 @@ static bool __kasan_slab_free(struct kmem_cache *cache, void *object,
return false;
shadow_byte = READ_ONCE(*(s8 *)kasan_mem_to_shadow(object));
- if (shadow_byte < 0 || shadow_byte >= KASAN_SHADOW_SCALE_SIZE) {
- kasan_report_invalid_free(object, ip);
+ if (shadow_invalid(tag, shadow_byte)) {
+ kasan_report_invalid_free(tagged_object, ip);
return true;
}
rounded_up_size = round_up(cache->object_size, KASAN_SHADOW_SCALE_SIZE);
kasan_poison_shadow(object, rounded_up_size, KASAN_KMALLOC_FREE);
- if (!quarantine || unlikely(!(cache->flags & SLAB_KASAN)))
+ if ((IS_ENABLED(CONFIG_KASAN_GENERIC) && !quarantine) ||
+ unlikely(!(cache->flags & SLAB_KASAN)))
return false;
set_track(&get_alloc_info(cache, object)->free_track, GFP_NOWAIT);
quarantine_put(get_free_info(cache, object), cache);
- return true;
+
+ return IS_ENABLED(CONFIG_KASAN_GENERIC);
}
bool kasan_slab_free(struct kmem_cache *cache, void *object, unsigned long ip)
@@ -528,33 +452,41 @@ bool kasan_slab_free(struct kmem_cache *cache, void *object, unsigned long ip)
return __kasan_slab_free(cache, object, ip, true);
}
-void kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size,
- gfp_t flags)
+void * __must_check kasan_kmalloc(struct kmem_cache *cache, const void *object,
+ size_t size, gfp_t flags)
{
unsigned long redzone_start;
unsigned long redzone_end;
+ u8 tag;
if (gfpflags_allow_blocking(flags))
quarantine_reduce();
if (unlikely(object == NULL))
- return;
+ return NULL;
redzone_start = round_up((unsigned long)(object + size),
KASAN_SHADOW_SCALE_SIZE);
redzone_end = round_up((unsigned long)object + cache->object_size,
KASAN_SHADOW_SCALE_SIZE);
- kasan_unpoison_shadow(object, size);
+ if (IS_ENABLED(CONFIG_KASAN_SW_TAGS))
+ tag = assign_tag(cache, object, false);
+
+ /* Tag is ignored in set_tag without CONFIG_KASAN_SW_TAGS */
+ kasan_unpoison_shadow(set_tag(object, tag), size);
kasan_poison_shadow((void *)redzone_start, redzone_end - redzone_start,
KASAN_KMALLOC_REDZONE);
if (cache->flags & SLAB_KASAN)
set_track(&get_alloc_info(cache, object)->alloc_track, flags);
+
+ return set_tag(object, tag);
}
EXPORT_SYMBOL(kasan_kmalloc);
-void kasan_kmalloc_large(const void *ptr, size_t size, gfp_t flags)
+void * __must_check kasan_kmalloc_large(const void *ptr, size_t size,
+ gfp_t flags)
{
struct page *page;
unsigned long redzone_start;
@@ -564,7 +496,7 @@ void kasan_kmalloc_large(const void *ptr, size_t size, gfp_t flags)
quarantine_reduce();
if (unlikely(ptr == NULL))
- return;
+ return NULL;
page = virt_to_page(ptr);
redzone_start = round_up((unsigned long)(ptr + size),
@@ -574,21 +506,23 @@ void kasan_kmalloc_large(const void *ptr, size_t size, gfp_t flags)
kasan_unpoison_shadow(ptr, size);
kasan_poison_shadow((void *)redzone_start, redzone_end - redzone_start,
KASAN_PAGE_REDZONE);
+
+ return (void *)ptr;
}
-void kasan_krealloc(const void *object, size_t size, gfp_t flags)
+void * __must_check kasan_krealloc(const void *object, size_t size, gfp_t flags)
{
struct page *page;
if (unlikely(object == ZERO_SIZE_PTR))
- return;
+ return (void *)object;
page = virt_to_head_page(object);
if (unlikely(!PageSlab(page)))
- kasan_kmalloc_large(object, size, flags);
+ return kasan_kmalloc_large(object, size, flags);
else
- kasan_kmalloc(page->slab_cache, object, size, flags);
+ return kasan_kmalloc(page->slab_cache, object, size, flags);
}
void kasan_poison_kfree(void *ptr, unsigned long ip)
@@ -632,11 +566,12 @@ int kasan_module_alloc(void *addr, size_t size)
ret = __vmalloc_node_range(shadow_size, 1, shadow_start,
shadow_start + shadow_size,
- GFP_KERNEL | __GFP_ZERO,
+ GFP_KERNEL,
PAGE_KERNEL, VM_NO_GUARD, NUMA_NO_NODE,
__builtin_return_address(0));
if (ret) {
+ __memset(ret, KASAN_SHADOW_INIT, shadow_size);
find_vm_area(addr)->flags |= VM_KASAN;
kmemleak_ignore(ret);
return 0;
@@ -651,147 +586,6 @@ void kasan_free_shadow(const struct vm_struct *vm)
vfree(kasan_mem_to_shadow(vm->addr));
}
-static void register_global(struct kasan_global *global)
-{
- size_t aligned_size = round_up(global->size, KASAN_SHADOW_SCALE_SIZE);
-
- kasan_unpoison_shadow(global->beg, global->size);
-
- kasan_poison_shadow(global->beg + aligned_size,
- global->size_with_redzone - aligned_size,
- KASAN_GLOBAL_REDZONE);
-}
-
-void __asan_register_globals(struct kasan_global *globals, size_t size)
-{
- int i;
-
- for (i = 0; i < size; i++)
- register_global(&globals[i]);
-}
-EXPORT_SYMBOL(__asan_register_globals);
-
-void __asan_unregister_globals(struct kasan_global *globals, size_t size)
-{
-}
-EXPORT_SYMBOL(__asan_unregister_globals);
-
-#define DEFINE_ASAN_LOAD_STORE(size) \
- void __asan_load##size(unsigned long addr) \
- { \
- check_memory_region_inline(addr, size, false, _RET_IP_);\
- } \
- EXPORT_SYMBOL(__asan_load##size); \
- __alias(__asan_load##size) \
- void __asan_load##size##_noabort(unsigned long); \
- EXPORT_SYMBOL(__asan_load##size##_noabort); \
- void __asan_store##size(unsigned long addr) \
- { \
- check_memory_region_inline(addr, size, true, _RET_IP_); \
- } \
- EXPORT_SYMBOL(__asan_store##size); \
- __alias(__asan_store##size) \
- void __asan_store##size##_noabort(unsigned long); \
- EXPORT_SYMBOL(__asan_store##size##_noabort)
-
-DEFINE_ASAN_LOAD_STORE(1);
-DEFINE_ASAN_LOAD_STORE(2);
-DEFINE_ASAN_LOAD_STORE(4);
-DEFINE_ASAN_LOAD_STORE(8);
-DEFINE_ASAN_LOAD_STORE(16);
-
-void __asan_loadN(unsigned long addr, size_t size)
-{
- check_memory_region(addr, size, false, _RET_IP_);
-}
-EXPORT_SYMBOL(__asan_loadN);
-
-__alias(__asan_loadN)
-void __asan_loadN_noabort(unsigned long, size_t);
-EXPORT_SYMBOL(__asan_loadN_noabort);
-
-void __asan_storeN(unsigned long addr, size_t size)
-{
- check_memory_region(addr, size, true, _RET_IP_);
-}
-EXPORT_SYMBOL(__asan_storeN);
-
-__alias(__asan_storeN)
-void __asan_storeN_noabort(unsigned long, size_t);
-EXPORT_SYMBOL(__asan_storeN_noabort);
-
-/* to shut up compiler complaints */
-void __asan_handle_no_return(void) {}
-EXPORT_SYMBOL(__asan_handle_no_return);
-
-/* Emitted by compiler to poison large objects when they go out of scope. */
-void __asan_poison_stack_memory(const void *addr, size_t size)
-{
- /*
- * Addr is KASAN_SHADOW_SCALE_SIZE-aligned and the object is surrounded
- * by redzones, so we simply round up size to simplify logic.
- */
- kasan_poison_shadow(addr, round_up(size, KASAN_SHADOW_SCALE_SIZE),
- KASAN_USE_AFTER_SCOPE);
-}
-EXPORT_SYMBOL(__asan_poison_stack_memory);
-
-/* Emitted by compiler to unpoison large objects when they go into scope. */
-void __asan_unpoison_stack_memory(const void *addr, size_t size)
-{
- kasan_unpoison_shadow(addr, size);
-}
-EXPORT_SYMBOL(__asan_unpoison_stack_memory);
-
-/* Emitted by compiler to poison alloca()ed objects. */
-void __asan_alloca_poison(unsigned long addr, size_t size)
-{
- size_t rounded_up_size = round_up(size, KASAN_SHADOW_SCALE_SIZE);
- size_t padding_size = round_up(size, KASAN_ALLOCA_REDZONE_SIZE) -
- rounded_up_size;
- size_t rounded_down_size = round_down(size, KASAN_SHADOW_SCALE_SIZE);
-
- const void *left_redzone = (const void *)(addr -
- KASAN_ALLOCA_REDZONE_SIZE);
- const void *right_redzone = (const void *)(addr + rounded_up_size);
-
- WARN_ON(!IS_ALIGNED(addr, KASAN_ALLOCA_REDZONE_SIZE));
-
- kasan_unpoison_shadow((const void *)(addr + rounded_down_size),
- size - rounded_down_size);
- kasan_poison_shadow(left_redzone, KASAN_ALLOCA_REDZONE_SIZE,
- KASAN_ALLOCA_LEFT);
- kasan_poison_shadow(right_redzone,
- padding_size + KASAN_ALLOCA_REDZONE_SIZE,
- KASAN_ALLOCA_RIGHT);
-}
-EXPORT_SYMBOL(__asan_alloca_poison);
-
-/* Emitted by compiler to unpoison alloca()ed areas when the stack unwinds. */
-void __asan_allocas_unpoison(const void *stack_top, const void *stack_bottom)
-{
- if (unlikely(!stack_top || stack_top > stack_bottom))
- return;
-
- kasan_unpoison_shadow(stack_top, stack_bottom - stack_top);
-}
-EXPORT_SYMBOL(__asan_allocas_unpoison);
-
-/* Emitted by the compiler to [un]poison local variables. */
-#define DEFINE_ASAN_SET_SHADOW(byte) \
- void __asan_set_shadow_##byte(const void *addr, size_t size) \
- { \
- __memset((void *)addr, 0x##byte, size); \
- } \
- EXPORT_SYMBOL(__asan_set_shadow_##byte)
-
-DEFINE_ASAN_SET_SHADOW(00);
-DEFINE_ASAN_SET_SHADOW(f1);
-DEFINE_ASAN_SET_SHADOW(f2);
-DEFINE_ASAN_SET_SHADOW(f3);
-DEFINE_ASAN_SET_SHADOW(f5);
-DEFINE_ASAN_SET_SHADOW(f8);
-
#ifdef CONFIG_MEMORY_HOTPLUG
static bool shadow_mapped(unsigned long addr)
{
diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c
new file mode 100644
index 000000000000..ccb6207276e3
--- /dev/null
+++ b/mm/kasan/generic.c
@@ -0,0 +1,344 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * This file contains core generic KASAN code.
+ *
+ * Copyright (c) 2014 Samsung Electronics Co., Ltd.
+ * Author: Andrey Ryabinin <ryabinin.a.a@gmail.com>
+ *
+ * Some code borrowed from https://github.com/xairy/kasan-prototype by
+ * Andrey Konovalov <andreyknvl@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#define DISABLE_BRANCH_PROFILING
+
+#include <linux/export.h>
+#include <linux/interrupt.h>
+#include <linux/init.h>
+#include <linux/kasan.h>
+#include <linux/kernel.h>
+#include <linux/kmemleak.h>
+#include <linux/linkage.h>
+#include <linux/memblock.h>
+#include <linux/memory.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/printk.h>
+#include <linux/sched.h>
+#include <linux/sched/task_stack.h>
+#include <linux/slab.h>
+#include <linux/stacktrace.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/vmalloc.h>
+#include <linux/bug.h>
+
+#include "kasan.h"
+#include "../slab.h"
+
+/*
+ * All functions below always inlined so compiler could
+ * perform better optimizations in each of __asan_loadX/__assn_storeX
+ * depending on memory access size X.
+ */
+
+static __always_inline bool memory_is_poisoned_1(unsigned long addr)
+{
+ s8 shadow_value = *(s8 *)kasan_mem_to_shadow((void *)addr);
+
+ if (unlikely(shadow_value)) {
+ s8 last_accessible_byte = addr & KASAN_SHADOW_MASK;
+ return unlikely(last_accessible_byte >= shadow_value);
+ }
+
+ return false;
+}
+
+static __always_inline bool memory_is_poisoned_2_4_8(unsigned long addr,
+ unsigned long size)
+{
+ u8 *shadow_addr = (u8 *)kasan_mem_to_shadow((void *)addr);
+
+ /*
+ * Access crosses 8(shadow size)-byte boundary. Such access maps
+ * into 2 shadow bytes, so we need to check them both.
+ */
+ if (unlikely(((addr + size - 1) & KASAN_SHADOW_MASK) < size - 1))
+ return *shadow_addr || memory_is_poisoned_1(addr + size - 1);
+
+ return memory_is_poisoned_1(addr + size - 1);
+}
+
+static __always_inline bool memory_is_poisoned_16(unsigned long addr)
+{
+ u16 *shadow_addr = (u16 *)kasan_mem_to_shadow((void *)addr);
+
+ /* Unaligned 16-bytes access maps into 3 shadow bytes. */
+ if (unlikely(!IS_ALIGNED(addr, KASAN_SHADOW_SCALE_SIZE)))
+ return *shadow_addr || memory_is_poisoned_1(addr + 15);
+
+ return *shadow_addr;
+}
+
+static __always_inline unsigned long bytes_is_nonzero(const u8 *start,
+ size_t size)
+{
+ while (size) {
+ if (unlikely(*start))
+ return (unsigned long)start;
+ start++;
+ size--;
+ }
+
+ return 0;
+}
+
+static __always_inline unsigned long memory_is_nonzero(const void *start,
+ const void *end)
+{
+ unsigned int words;
+ unsigned long ret;
+ unsigned int prefix = (unsigned long)start % 8;
+
+ if (end - start <= 16)
+ return bytes_is_nonzero(start, end - start);
+
+ if (prefix) {
+ prefix = 8 - prefix;
+ ret = bytes_is_nonzero(start, prefix);
+ if (unlikely(ret))
+ return ret;
+ start += prefix;
+ }
+
+ words = (end - start) / 8;
+ while (words) {
+ if (unlikely(*(u64 *)start))
+ return bytes_is_nonzero(start, 8);
+ start += 8;
+ words--;
+ }
+
+ return bytes_is_nonzero(start, (end - start) % 8);
+}
+
+static __always_inline bool memory_is_poisoned_n(unsigned long addr,
+ size_t size)
+{
+ unsigned long ret;
+
+ ret = memory_is_nonzero(kasan_mem_to_shadow((void *)addr),
+ kasan_mem_to_shadow((void *)addr + size - 1) + 1);
+
+ if (unlikely(ret)) {
+ unsigned long last_byte = addr + size - 1;
+ s8 *last_shadow = (s8 *)kasan_mem_to_shadow((void *)last_byte);
+
+ if (unlikely(ret != (unsigned long)last_shadow ||
+ ((long)(last_byte & KASAN_SHADOW_MASK) >= *last_shadow)))
+ return true;
+ }
+ return false;
+}
+
+static __always_inline bool memory_is_poisoned(unsigned long addr, size_t size)
+{
+ if (__builtin_constant_p(size)) {
+ switch (size) {
+ case 1:
+ return memory_is_poisoned_1(addr);
+ case 2:
+ case 4:
+ case 8:
+ return memory_is_poisoned_2_4_8(addr, size);
+ case 16:
+ return memory_is_poisoned_16(addr);
+ default:
+ BUILD_BUG();
+ }
+ }
+
+ return memory_is_poisoned_n(addr, size);
+}
+
+static __always_inline void check_memory_region_inline(unsigned long addr,
+ size_t size, bool write,
+ unsigned long ret_ip)
+{
+ if (unlikely(size == 0))
+ return;
+
+ if (unlikely((void *)addr <
+ kasan_shadow_to_mem((void *)KASAN_SHADOW_START))) {
+ kasan_report(addr, size, write, ret_ip);
+ return;
+ }
+
+ if (likely(!memory_is_poisoned(addr, size)))
+ return;
+
+ kasan_report(addr, size, write, ret_ip);
+}
+
+void check_memory_region(unsigned long addr, size_t size, bool write,
+ unsigned long ret_ip)
+{
+ check_memory_region_inline(addr, size, write, ret_ip);
+}
+
+void kasan_cache_shrink(struct kmem_cache *cache)
+{
+ quarantine_remove_cache(cache);
+}
+
+void kasan_cache_shutdown(struct kmem_cache *cache)
+{
+ if (!__kmem_cache_empty(cache))
+ quarantine_remove_cache(cache);
+}
+
+static void register_global(struct kasan_global *global)
+{
+ size_t aligned_size = round_up(global->size, KASAN_SHADOW_SCALE_SIZE);
+
+ kasan_unpoison_shadow(global->beg, global->size);
+
+ kasan_poison_shadow(global->beg + aligned_size,
+ global->size_with_redzone - aligned_size,
+ KASAN_GLOBAL_REDZONE);
+}
+
+void __asan_register_globals(struct kasan_global *globals, size_t size)
+{
+ int i;
+
+ for (i = 0; i < size; i++)
+ register_global(&globals[i]);
+}
+EXPORT_SYMBOL(__asan_register_globals);
+
+void __asan_unregister_globals(struct kasan_global *globals, size_t size)
+{
+}
+EXPORT_SYMBOL(__asan_unregister_globals);
+
+#define DEFINE_ASAN_LOAD_STORE(size) \
+ void __asan_load##size(unsigned long addr) \
+ { \
+ check_memory_region_inline(addr, size, false, _RET_IP_);\
+ } \
+ EXPORT_SYMBOL(__asan_load##size); \
+ __alias(__asan_load##size) \
+ void __asan_load##size##_noabort(unsigned long); \
+ EXPORT_SYMBOL(__asan_load##size##_noabort); \
+ void __asan_store##size(unsigned long addr) \
+ { \
+ check_memory_region_inline(addr, size, true, _RET_IP_); \
+ } \
+ EXPORT_SYMBOL(__asan_store##size); \
+ __alias(__asan_store##size) \
+ void __asan_store##size##_noabort(unsigned long); \
+ EXPORT_SYMBOL(__asan_store##size##_noabort)
+
+DEFINE_ASAN_LOAD_STORE(1);
+DEFINE_ASAN_LOAD_STORE(2);
+DEFINE_ASAN_LOAD_STORE(4);
+DEFINE_ASAN_LOAD_STORE(8);
+DEFINE_ASAN_LOAD_STORE(16);
+
+void __asan_loadN(unsigned long addr, size_t size)
+{
+ check_memory_region(addr, size, false, _RET_IP_);
+}
+EXPORT_SYMBOL(__asan_loadN);
+
+__alias(__asan_loadN)
+void __asan_loadN_noabort(unsigned long, size_t);
+EXPORT_SYMBOL(__asan_loadN_noabort);
+
+void __asan_storeN(unsigned long addr, size_t size)
+{
+ check_memory_region(addr, size, true, _RET_IP_);
+}
+EXPORT_SYMBOL(__asan_storeN);
+
+__alias(__asan_storeN)
+void __asan_storeN_noabort(unsigned long, size_t);
+EXPORT_SYMBOL(__asan_storeN_noabort);
+
+/* to shut up compiler complaints */
+void __asan_handle_no_return(void) {}
+EXPORT_SYMBOL(__asan_handle_no_return);
+
+/* Emitted by compiler to poison large objects when they go out of scope. */
+void __asan_poison_stack_memory(const void *addr, size_t size)
+{
+ /*
+ * Addr is KASAN_SHADOW_SCALE_SIZE-aligned and the object is surrounded
+ * by redzones, so we simply round up size to simplify logic.
+ */
+ kasan_poison_shadow(addr, round_up(size, KASAN_SHADOW_SCALE_SIZE),
+ KASAN_USE_AFTER_SCOPE);
+}
+EXPORT_SYMBOL(__asan_poison_stack_memory);
+
+/* Emitted by compiler to unpoison large objects when they go into scope. */
+void __asan_unpoison_stack_memory(const void *addr, size_t size)
+{
+ kasan_unpoison_shadow(addr, size);
+}
+EXPORT_SYMBOL(__asan_unpoison_stack_memory);
+
+/* Emitted by compiler to poison alloca()ed objects. */
+void __asan_alloca_poison(unsigned long addr, size_t size)
+{
+ size_t rounded_up_size = round_up(size, KASAN_SHADOW_SCALE_SIZE);
+ size_t padding_size = round_up(size, KASAN_ALLOCA_REDZONE_SIZE) -
+ rounded_up_size;
+ size_t rounded_down_size = round_down(size, KASAN_SHADOW_SCALE_SIZE);
+
+ const void *left_redzone = (const void *)(addr -
+ KASAN_ALLOCA_REDZONE_SIZE);
+ const void *right_redzone = (const void *)(addr + rounded_up_size);
+
+ WARN_ON(!IS_ALIGNED(addr, KASAN_ALLOCA_REDZONE_SIZE));
+
+ kasan_unpoison_shadow((const void *)(addr + rounded_down_size),
+ size - rounded_down_size);
+ kasan_poison_shadow(left_redzone, KASAN_ALLOCA_REDZONE_SIZE,
+ KASAN_ALLOCA_LEFT);
+ kasan_poison_shadow(right_redzone,
+ padding_size + KASAN_ALLOCA_REDZONE_SIZE,
+ KASAN_ALLOCA_RIGHT);
+}
+EXPORT_SYMBOL(__asan_alloca_poison);
+
+/* Emitted by compiler to unpoison alloca()ed areas when the stack unwinds. */
+void __asan_allocas_unpoison(const void *stack_top, const void *stack_bottom)
+{
+ if (unlikely(!stack_top || stack_top > stack_bottom))
+ return;
+
+ kasan_unpoison_shadow(stack_top, stack_bottom - stack_top);
+}
+EXPORT_SYMBOL(__asan_allocas_unpoison);
+
+/* Emitted by the compiler to [un]poison local variables. */
+#define DEFINE_ASAN_SET_SHADOW(byte) \
+ void __asan_set_shadow_##byte(const void *addr, size_t size) \
+ { \
+ __memset((void *)addr, 0x##byte, size); \
+ } \
+ EXPORT_SYMBOL(__asan_set_shadow_##byte)
+
+DEFINE_ASAN_SET_SHADOW(00);
+DEFINE_ASAN_SET_SHADOW(f1);
+DEFINE_ASAN_SET_SHADOW(f2);
+DEFINE_ASAN_SET_SHADOW(f3);
+DEFINE_ASAN_SET_SHADOW(f5);
+DEFINE_ASAN_SET_SHADOW(f8);
diff --git a/mm/kasan/generic_report.c b/mm/kasan/generic_report.c
new file mode 100644
index 000000000000..5e12035888f2
--- /dev/null
+++ b/mm/kasan/generic_report.c
@@ -0,0 +1,153 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * This file contains generic KASAN specific error reporting code.
+ *
+ * Copyright (c) 2014 Samsung Electronics Co., Ltd.
+ * Author: Andrey Ryabinin <ryabinin.a.a@gmail.com>
+ *
+ * Some code borrowed from https://github.com/xairy/kasan-prototype by
+ * Andrey Konovalov <andreyknvl@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+
+#include <linux/bitops.h>
+#include <linux/ftrace.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/printk.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/stackdepot.h>
+#include <linux/stacktrace.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/kasan.h>
+#include <linux/module.h>
+
+#include <asm/sections.h>
+
+#include "kasan.h"
+#include "../slab.h"
+
+void *find_first_bad_addr(void *addr, size_t size)
+{
+ void *p = addr;
+
+ while (p < addr + size && !(*(u8 *)kasan_mem_to_shadow(p)))
+ p += KASAN_SHADOW_SCALE_SIZE;
+ return p;
+}
+
+static const char *get_shadow_bug_type(struct kasan_access_info *info)
+{
+ const char *bug_type = "unknown-crash";
+ u8 *shadow_addr;
+
+ shadow_addr = (u8 *)kasan_mem_to_shadow(info->first_bad_addr);
+
+ /*
+ * If shadow byte value is in [0, KASAN_SHADOW_SCALE_SIZE) we can look
+ * at the next shadow byte to determine the type of the bad access.
+ */
+ if (*shadow_addr > 0 && *shadow_addr <= KASAN_SHADOW_SCALE_SIZE - 1)
+ shadow_addr++;
+
+ switch (*shadow_addr) {
+ case 0 ... KASAN_SHADOW_SCALE_SIZE - 1:
+ /*
+ * In theory it's still possible to see these shadow values
+ * due to a data race in the kernel code.
+ */
+ bug_type = "out-of-bounds";
+ break;
+ case KASAN_PAGE_REDZONE:
+ case KASAN_KMALLOC_REDZONE:
+ bug_type = "slab-out-of-bounds";
+ break;
+ case KASAN_GLOBAL_REDZONE:
+ bug_type = "global-out-of-bounds";
+ break;
+ case KASAN_STACK_LEFT:
+ case KASAN_STACK_MID:
+ case KASAN_STACK_RIGHT:
+ case KASAN_STACK_PARTIAL:
+ bug_type = "stack-out-of-bounds";
+ break;
+ case KASAN_FREE_PAGE:
+ case KASAN_KMALLOC_FREE:
+ bug_type = "use-after-free";
+ break;
+ case KASAN_USE_AFTER_SCOPE:
+ bug_type = "use-after-scope";
+ break;
+ case KASAN_ALLOCA_LEFT:
+ case KASAN_ALLOCA_RIGHT:
+ bug_type = "alloca-out-of-bounds";
+ break;
+ }
+
+ return bug_type;
+}
+
+static const char *get_wild_bug_type(struct kasan_access_info *info)
+{
+ const char *bug_type = "unknown-crash";
+
+ if ((unsigned long)info->access_addr < PAGE_SIZE)
+ bug_type = "null-ptr-deref";
+ else if ((unsigned long)info->access_addr < TASK_SIZE)
+ bug_type = "user-memory-access";
+ else
+ bug_type = "wild-memory-access";
+
+ return bug_type;
+}
+
+const char *get_bug_type(struct kasan_access_info *info)
+{
+ if (addr_has_shadow(info->access_addr))
+ return get_shadow_bug_type(info);
+ return get_wild_bug_type(info);
+}
+
+#define DEFINE_ASAN_REPORT_LOAD(size) \
+void __asan_report_load##size##_noabort(unsigned long addr) \
+{ \
+ kasan_report(addr, size, false, _RET_IP_); \
+} \
+EXPORT_SYMBOL(__asan_report_load##size##_noabort)
+
+#define DEFINE_ASAN_REPORT_STORE(size) \
+void __asan_report_store##size##_noabort(unsigned long addr) \
+{ \
+ kasan_report(addr, size, true, _RET_IP_); \
+} \
+EXPORT_SYMBOL(__asan_report_store##size##_noabort)
+
+DEFINE_ASAN_REPORT_LOAD(1);
+DEFINE_ASAN_REPORT_LOAD(2);
+DEFINE_ASAN_REPORT_LOAD(4);
+DEFINE_ASAN_REPORT_LOAD(8);
+DEFINE_ASAN_REPORT_LOAD(16);
+DEFINE_ASAN_REPORT_STORE(1);
+DEFINE_ASAN_REPORT_STORE(2);
+DEFINE_ASAN_REPORT_STORE(4);
+DEFINE_ASAN_REPORT_STORE(8);
+DEFINE_ASAN_REPORT_STORE(16);
+
+void __asan_report_load_n_noabort(unsigned long addr, size_t size)
+{
+ kasan_report(addr, size, false, _RET_IP_);
+}
+EXPORT_SYMBOL(__asan_report_load_n_noabort);
+
+void __asan_report_store_n_noabort(unsigned long addr, size_t size)
+{
+ kasan_report(addr, size, true, _RET_IP_);
+}
+EXPORT_SYMBOL(__asan_report_store_n_noabort);
diff --git a/mm/kasan/kasan_init.c b/mm/kasan/init.c
index c7550eb65922..34afad56497b 100644
--- a/mm/kasan/kasan_init.c
+++ b/mm/kasan/init.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* This file contains some kasan initialization code.
*
@@ -30,13 +31,13 @@
* - Latter it reused it as zero shadow to cover large ranges of memory
* that allowed to access, but not handled by kasan (vmalloc/vmemmap ...).
*/
-unsigned char kasan_zero_page[PAGE_SIZE] __page_aligned_bss;
+unsigned char kasan_early_shadow_page[PAGE_SIZE] __page_aligned_bss;
#if CONFIG_PGTABLE_LEVELS > 4
-p4d_t kasan_zero_p4d[MAX_PTRS_PER_P4D] __page_aligned_bss;
+p4d_t kasan_early_shadow_p4d[MAX_PTRS_PER_P4D] __page_aligned_bss;
static inline bool kasan_p4d_table(pgd_t pgd)
{
- return pgd_page(pgd) == virt_to_page(lm_alias(kasan_zero_p4d));
+ return pgd_page(pgd) == virt_to_page(lm_alias(kasan_early_shadow_p4d));
}
#else
static inline bool kasan_p4d_table(pgd_t pgd)
@@ -45,10 +46,10 @@ static inline bool kasan_p4d_table(pgd_t pgd)
}
#endif
#if CONFIG_PGTABLE_LEVELS > 3
-pud_t kasan_zero_pud[PTRS_PER_PUD] __page_aligned_bss;
+pud_t kasan_early_shadow_pud[PTRS_PER_PUD] __page_aligned_bss;
static inline bool kasan_pud_table(p4d_t p4d)
{
- return p4d_page(p4d) == virt_to_page(lm_alias(kasan_zero_pud));
+ return p4d_page(p4d) == virt_to_page(lm_alias(kasan_early_shadow_pud));
}
#else
static inline bool kasan_pud_table(p4d_t p4d)
@@ -57,10 +58,10 @@ static inline bool kasan_pud_table(p4d_t p4d)
}
#endif
#if CONFIG_PGTABLE_LEVELS > 2
-pmd_t kasan_zero_pmd[PTRS_PER_PMD] __page_aligned_bss;
+pmd_t kasan_early_shadow_pmd[PTRS_PER_PMD] __page_aligned_bss;
static inline bool kasan_pmd_table(pud_t pud)
{
- return pud_page(pud) == virt_to_page(lm_alias(kasan_zero_pmd));
+ return pud_page(pud) == virt_to_page(lm_alias(kasan_early_shadow_pmd));
}
#else
static inline bool kasan_pmd_table(pud_t pud)
@@ -68,16 +69,16 @@ static inline bool kasan_pmd_table(pud_t pud)
return 0;
}
#endif
-pte_t kasan_zero_pte[PTRS_PER_PTE] __page_aligned_bss;
+pte_t kasan_early_shadow_pte[PTRS_PER_PTE] __page_aligned_bss;
static inline bool kasan_pte_table(pmd_t pmd)
{
- return pmd_page(pmd) == virt_to_page(lm_alias(kasan_zero_pte));
+ return pmd_page(pmd) == virt_to_page(lm_alias(kasan_early_shadow_pte));
}
-static inline bool kasan_zero_page_entry(pte_t pte)
+static inline bool kasan_early_shadow_page_entry(pte_t pte)
{
- return pte_page(pte) == virt_to_page(lm_alias(kasan_zero_page));
+ return pte_page(pte) == virt_to_page(lm_alias(kasan_early_shadow_page));
}
static __init void *early_alloc(size_t size, int node)
@@ -92,7 +93,8 @@ static void __ref zero_pte_populate(pmd_t *pmd, unsigned long addr,
pte_t *pte = pte_offset_kernel(pmd, addr);
pte_t zero_pte;
- zero_pte = pfn_pte(PFN_DOWN(__pa_symbol(kasan_zero_page)), PAGE_KERNEL);
+ zero_pte = pfn_pte(PFN_DOWN(__pa_symbol(kasan_early_shadow_page)),
+ PAGE_KERNEL);
zero_pte = pte_wrprotect(zero_pte);
while (addr + PAGE_SIZE <= end) {
@@ -112,7 +114,8 @@ static int __ref zero_pmd_populate(pud_t *pud, unsigned long addr,
next = pmd_addr_end(addr, end);
if (IS_ALIGNED(addr, PMD_SIZE) && end - addr >= PMD_SIZE) {
- pmd_populate_kernel(&init_mm, pmd, lm_alias(kasan_zero_pte));
+ pmd_populate_kernel(&init_mm, pmd,
+ lm_alias(kasan_early_shadow_pte));
continue;
}
@@ -145,9 +148,11 @@ static int __ref zero_pud_populate(p4d_t *p4d, unsigned long addr,
if (IS_ALIGNED(addr, PUD_SIZE) && end - addr >= PUD_SIZE) {
pmd_t *pmd;
- pud_populate(&init_mm, pud, lm_alias(kasan_zero_pmd));
+ pud_populate(&init_mm, pud,
+ lm_alias(kasan_early_shadow_pmd));
pmd = pmd_offset(pud, addr);
- pmd_populate_kernel(&init_mm, pmd, lm_alias(kasan_zero_pte));
+ pmd_populate_kernel(&init_mm, pmd,
+ lm_alias(kasan_early_shadow_pte));
continue;
}
@@ -181,12 +186,14 @@ static int __ref zero_p4d_populate(pgd_t *pgd, unsigned long addr,
pud_t *pud;
pmd_t *pmd;
- p4d_populate(&init_mm, p4d, lm_alias(kasan_zero_pud));
+ p4d_populate(&init_mm, p4d,
+ lm_alias(kasan_early_shadow_pud));
pud = pud_offset(p4d, addr);
- pud_populate(&init_mm, pud, lm_alias(kasan_zero_pmd));
+ pud_populate(&init_mm, pud,
+ lm_alias(kasan_early_shadow_pmd));
pmd = pmd_offset(pud, addr);
pmd_populate_kernel(&init_mm, pmd,
- lm_alias(kasan_zero_pte));
+ lm_alias(kasan_early_shadow_pte));
continue;
}
@@ -209,13 +216,13 @@ static int __ref zero_p4d_populate(pgd_t *pgd, unsigned long addr,
}
/**
- * kasan_populate_zero_shadow - populate shadow memory region with
- * kasan_zero_page
+ * kasan_populate_early_shadow - populate shadow memory region with
+ * kasan_early_shadow_page
* @shadow_start - start of the memory range to populate
* @shadow_end - end of the memory range to populate
*/
-int __ref kasan_populate_zero_shadow(const void *shadow_start,
- const void *shadow_end)
+int __ref kasan_populate_early_shadow(const void *shadow_start,
+ const void *shadow_end)
{
unsigned long addr = (unsigned long)shadow_start;
unsigned long end = (unsigned long)shadow_end;
@@ -231,7 +238,7 @@ int __ref kasan_populate_zero_shadow(const void *shadow_start,
pmd_t *pmd;
/*
- * kasan_zero_pud should be populated with pmds
+ * kasan_early_shadow_pud should be populated with pmds
* at this moment.
* [pud,pmd]_populate*() below needed only for
* 3,2 - level page tables where we don't have
@@ -241,21 +248,25 @@ int __ref kasan_populate_zero_shadow(const void *shadow_start,
* The ifndef is required to avoid build breakage.
*
* With 5level-fixup.h, pgd_populate() is not nop and
- * we reference kasan_zero_p4d. It's not defined
+ * we reference kasan_early_shadow_p4d. It's not defined
* unless 5-level paging enabled.
*
* The ifndef can be dropped once all KASAN-enabled
* architectures will switch to pgtable-nop4d.h.
*/
#ifndef __ARCH_HAS_5LEVEL_HACK
- pgd_populate(&init_mm, pgd, lm_alias(kasan_zero_p4d));
+ pgd_populate(&init_mm, pgd,
+ lm_alias(kasan_early_shadow_p4d));
#endif
p4d = p4d_offset(pgd, addr);
- p4d_populate(&init_mm, p4d, lm_alias(kasan_zero_pud));
+ p4d_populate(&init_mm, p4d,
+ lm_alias(kasan_early_shadow_pud));
pud = pud_offset(p4d, addr);
- pud_populate(&init_mm, pud, lm_alias(kasan_zero_pmd));
+ pud_populate(&init_mm, pud,
+ lm_alias(kasan_early_shadow_pmd));
pmd = pmd_offset(pud, addr);
- pmd_populate_kernel(&init_mm, pmd, lm_alias(kasan_zero_pte));
+ pmd_populate_kernel(&init_mm, pmd,
+ lm_alias(kasan_early_shadow_pte));
continue;
}
@@ -350,7 +361,7 @@ static void kasan_remove_pte_table(pte_t *pte, unsigned long addr,
if (!pte_present(*pte))
continue;
- if (WARN_ON(!kasan_zero_page_entry(*pte)))
+ if (WARN_ON(!kasan_early_shadow_page_entry(*pte)))
continue;
pte_clear(&init_mm, addr, pte);
}
@@ -480,7 +491,7 @@ int kasan_add_zero_shadow(void *start, unsigned long size)
WARN_ON(size % (KASAN_SHADOW_SCALE_SIZE * PAGE_SIZE)))
return -EINVAL;
- ret = kasan_populate_zero_shadow(shadow_start, shadow_end);
+ ret = kasan_populate_early_shadow(shadow_start, shadow_end);
if (ret)
kasan_remove_zero_shadow(shadow_start,
size >> KASAN_SHADOW_SCALE_SHIFT);
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
index c12dcfde2ebd..ea51b2d898ec 100644
--- a/mm/kasan/kasan.h
+++ b/mm/kasan/kasan.h
@@ -8,10 +8,22 @@
#define KASAN_SHADOW_SCALE_SIZE (1UL << KASAN_SHADOW_SCALE_SHIFT)
#define KASAN_SHADOW_MASK (KASAN_SHADOW_SCALE_SIZE - 1)
+#define KASAN_TAG_KERNEL 0xFF /* native kernel pointers tag */
+#define KASAN_TAG_INVALID 0xFE /* inaccessible memory tag */
+#define KASAN_TAG_MAX 0xFD /* maximum value for random tags */
+
+#ifdef CONFIG_KASAN_GENERIC
#define KASAN_FREE_PAGE 0xFF /* page was freed */
#define KASAN_PAGE_REDZONE 0xFE /* redzone for kmalloc_large allocations */
#define KASAN_KMALLOC_REDZONE 0xFC /* redzone inside slub object */
#define KASAN_KMALLOC_FREE 0xFB /* object was freed (kmem_cache_free/kfree) */
+#else
+#define KASAN_FREE_PAGE KASAN_TAG_INVALID
+#define KASAN_PAGE_REDZONE KASAN_TAG_INVALID
+#define KASAN_KMALLOC_REDZONE KASAN_TAG_INVALID
+#define KASAN_KMALLOC_FREE KASAN_TAG_INVALID
+#endif
+
#define KASAN_GLOBAL_REDZONE 0xFA /* redzone for global variable */
/*
@@ -105,11 +117,25 @@ static inline const void *kasan_shadow_to_mem(const void *shadow_addr)
<< KASAN_SHADOW_SCALE_SHIFT);
}
+static inline bool addr_has_shadow(const void *addr)
+{
+ return (addr >= kasan_shadow_to_mem((void *)KASAN_SHADOW_START));
+}
+
+void kasan_poison_shadow(const void *address, size_t size, u8 value);
+
+void check_memory_region(unsigned long addr, size_t size, bool write,
+ unsigned long ret_ip);
+
+void *find_first_bad_addr(void *addr, size_t size);
+const char *get_bug_type(struct kasan_access_info *info);
+
void kasan_report(unsigned long addr, size_t size,
bool is_write, unsigned long ip);
void kasan_report_invalid_free(void *object, unsigned long ip);
-#if defined(CONFIG_SLAB) || defined(CONFIG_SLUB)
+#if defined(CONFIG_KASAN_GENERIC) && \
+ (defined(CONFIG_SLAB) || defined(CONFIG_SLUB))
void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache);
void quarantine_reduce(void);
void quarantine_remove_cache(struct kmem_cache *cache);
@@ -120,6 +146,37 @@ static inline void quarantine_reduce(void) { }
static inline void quarantine_remove_cache(struct kmem_cache *cache) { }
#endif
+#ifdef CONFIG_KASAN_SW_TAGS
+
+void print_tags(u8 addr_tag, const void *addr);
+
+u8 random_tag(void);
+
+#else
+
+static inline void print_tags(u8 addr_tag, const void *addr) { }
+
+static inline u8 random_tag(void)
+{
+ return 0;
+}
+
+#endif
+
+#ifndef arch_kasan_set_tag
+#define arch_kasan_set_tag(addr, tag) ((void *)(addr))
+#endif
+#ifndef arch_kasan_reset_tag
+#define arch_kasan_reset_tag(addr) ((void *)(addr))
+#endif
+#ifndef arch_kasan_get_tag
+#define arch_kasan_get_tag(addr) 0
+#endif
+
+#define set_tag(addr, tag) ((void *)arch_kasan_set_tag((addr), (tag)))
+#define reset_tag(addr) ((void *)arch_kasan_reset_tag(addr))
+#define get_tag(addr) arch_kasan_get_tag(addr)
+
/*
* Exported functions for interfaces called from assembly or from generated
* code. Declarations here to avoid warning about missing declarations.
diff --git a/mm/kasan/quarantine.c b/mm/kasan/quarantine.c
index b209dbaefde8..978bc4a3eb51 100644
--- a/mm/kasan/quarantine.c
+++ b/mm/kasan/quarantine.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* KASAN quarantine.
*
@@ -236,7 +237,7 @@ void quarantine_reduce(void)
* Update quarantine size in case of hotplug. Allocate a fraction of
* the installed memory to quarantine minus per-cpu queue limits.
*/
- total_size = (READ_ONCE(totalram_pages) << PAGE_SHIFT) /
+ total_size = (totalram_pages() << PAGE_SHIFT) /
QUARANTINE_FRACTION;
percpu_quarantines = QUARANTINE_PERCPU_SIZE * num_online_cpus();
new_quarantine_size = (total_size < percpu_quarantines) ?
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
index 5c169aa688fd..ca9418fe9232 100644
--- a/mm/kasan/report.c
+++ b/mm/kasan/report.c
@@ -1,5 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0
/*
- * This file contains error reporting code.
+ * This file contains common generic and tag-based KASAN error reporting code.
*
* Copyright (c) 2014 Samsung Electronics Co., Ltd.
* Author: Andrey Ryabinin <ryabinin.a.a@gmail.com>
@@ -39,129 +40,43 @@
#define SHADOW_BYTES_PER_ROW (SHADOW_BLOCKS_PER_ROW * SHADOW_BYTES_PER_BLOCK)
#define SHADOW_ROWS_AROUND_ADDR 2
-static const void *find_first_bad_addr(const void *addr, size_t size)
-{
- u8 shadow_val = *(u8 *)kasan_mem_to_shadow(addr);
- const void *first_bad_addr = addr;
-
- while (!shadow_val && first_bad_addr < addr + size) {
- first_bad_addr += KASAN_SHADOW_SCALE_SIZE;
- shadow_val = *(u8 *)kasan_mem_to_shadow(first_bad_addr);
- }
- return first_bad_addr;
-}
+static unsigned long kasan_flags;
-static bool addr_has_shadow(struct kasan_access_info *info)
-{
- return (info->access_addr >=
- kasan_shadow_to_mem((void *)KASAN_SHADOW_START));
-}
+#define KASAN_BIT_REPORTED 0
+#define KASAN_BIT_MULTI_SHOT 1
-static const char *get_shadow_bug_type(struct kasan_access_info *info)
+bool kasan_save_enable_multi_shot(void)
{
- const char *bug_type = "unknown-crash";
- u8 *shadow_addr;
-
- info->first_bad_addr = find_first_bad_addr(info->access_addr,
- info->access_size);
-
- shadow_addr = (u8 *)kasan_mem_to_shadow(info->first_bad_addr);
-
- /*
- * If shadow byte value is in [0, KASAN_SHADOW_SCALE_SIZE) we can look
- * at the next shadow byte to determine the type of the bad access.
- */
- if (*shadow_addr > 0 && *shadow_addr <= KASAN_SHADOW_SCALE_SIZE - 1)
- shadow_addr++;
-
- switch (*shadow_addr) {
- case 0 ... KASAN_SHADOW_SCALE_SIZE - 1:
- /*
- * In theory it's still possible to see these shadow values
- * due to a data race in the kernel code.
- */
- bug_type = "out-of-bounds";
- break;
- case KASAN_PAGE_REDZONE:
- case KASAN_KMALLOC_REDZONE:
- bug_type = "slab-out-of-bounds";
- break;
- case KASAN_GLOBAL_REDZONE:
- bug_type = "global-out-of-bounds";
- break;
- case KASAN_STACK_LEFT:
- case KASAN_STACK_MID:
- case KASAN_STACK_RIGHT:
- case KASAN_STACK_PARTIAL:
- bug_type = "stack-out-of-bounds";
- break;
- case KASAN_FREE_PAGE:
- case KASAN_KMALLOC_FREE:
- bug_type = "use-after-free";
- break;
- case KASAN_USE_AFTER_SCOPE:
- bug_type = "use-after-scope";
- break;
- case KASAN_ALLOCA_LEFT:
- case KASAN_ALLOCA_RIGHT:
- bug_type = "alloca-out-of-bounds";
- break;
- }
-
- return bug_type;
+ return test_and_set_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags);
}
+EXPORT_SYMBOL_GPL(kasan_save_enable_multi_shot);
-static const char *get_wild_bug_type(struct kasan_access_info *info)
+void kasan_restore_multi_shot(bool enabled)
{
- const char *bug_type = "unknown-crash";
-
- if ((unsigned long)info->access_addr < PAGE_SIZE)
- bug_type = "null-ptr-deref";
- else if ((unsigned long)info->access_addr < TASK_SIZE)
- bug_type = "user-memory-access";
- else
- bug_type = "wild-memory-access";
-
- return bug_type;
+ if (!enabled)
+ clear_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags);
}
+EXPORT_SYMBOL_GPL(kasan_restore_multi_shot);
-static const char *get_bug_type(struct kasan_access_info *info)
+static int __init kasan_set_multi_shot(char *str)
{
- if (addr_has_shadow(info))
- return get_shadow_bug_type(info);
- return get_wild_bug_type(info);
+ set_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags);
+ return 1;
}
+__setup("kasan_multi_shot", kasan_set_multi_shot);
static void print_error_description(struct kasan_access_info *info)
{
- const char *bug_type = get_bug_type(info);
-
pr_err("BUG: KASAN: %s in %pS\n",
- bug_type, (void *)info->ip);
+ get_bug_type(info), (void *)info->ip);
pr_err("%s of size %zu at addr %px by task %s/%d\n",
info->is_write ? "Write" : "Read", info->access_size,
info->access_addr, current->comm, task_pid_nr(current));
}
-static inline bool kernel_or_module_addr(const void *addr)
-{
- if (addr >= (void *)_stext && addr < (void *)_end)
- return true;
- if (is_module_address((unsigned long)addr))
- return true;
- return false;
-}
-
-static inline bool init_task_stack_addr(const void *addr)
-{
- return addr >= (void *)&init_thread_union.stack &&
- (addr <= (void *)&init_thread_union.stack +
- sizeof(init_thread_union.stack));
-}
-
static DEFINE_SPINLOCK(report_lock);
-static void kasan_start_report(unsigned long *flags)
+static void start_report(unsigned long *flags)
{
/*
* Make sure we don't end up in loop.
@@ -171,7 +86,7 @@ static void kasan_start_report(unsigned long *flags)
pr_err("==================================================================\n");
}
-static void kasan_end_report(unsigned long *flags)
+static void end_report(unsigned long *flags)
{
pr_err("==================================================================\n");
add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
@@ -249,6 +164,22 @@ static void describe_object(struct kmem_cache *cache, void *object,
describe_object_addr(cache, object, addr);
}
+static inline bool kernel_or_module_addr(const void *addr)
+{
+ if (addr >= (void *)_stext && addr < (void *)_end)
+ return true;
+ if (is_module_address((unsigned long)addr))
+ return true;
+ return false;
+}
+
+static inline bool init_task_stack_addr(const void *addr)
+{
+ return addr >= (void *)&init_thread_union.stack &&
+ (addr <= (void *)&init_thread_union.stack +
+ sizeof(init_thread_union.stack));
+}
+
static void print_address_description(void *addr)
{
struct page *page = addr_to_page(addr);
@@ -326,126 +257,69 @@ static void print_shadow_for_address(const void *addr)
}
}
+static bool report_enabled(void)
+{
+ if (current->kasan_depth)
+ return false;
+ if (test_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags))
+ return true;
+ return !test_and_set_bit(KASAN_BIT_REPORTED, &kasan_flags);
+}
+
void kasan_report_invalid_free(void *object, unsigned long ip)
{
unsigned long flags;
- kasan_start_report(&flags);
+ start_report(&flags);
pr_err("BUG: KASAN: double-free or invalid-free in %pS\n", (void *)ip);
+ print_tags(get_tag(object), reset_tag(object));
+ object = reset_tag(object);
pr_err("\n");
print_address_description(object);
pr_err("\n");
print_shadow_for_address(object);
- kasan_end_report(&flags);
-}
-
-static void kasan_report_error(struct kasan_access_info *info)
-{
- unsigned long flags;
-
- kasan_start_report(&flags);
-
- print_error_description(info);
- pr_err("\n");
-
- if (!addr_has_shadow(info)) {
- dump_stack();
- } else {
- print_address_description((void *)info->access_addr);
- pr_err("\n");
- print_shadow_for_address(info->first_bad_addr);
- }
-
- kasan_end_report(&flags);
-}
-
-static unsigned long kasan_flags;
-
-#define KASAN_BIT_REPORTED 0
-#define KASAN_BIT_MULTI_SHOT 1
-
-bool kasan_save_enable_multi_shot(void)
-{
- return test_and_set_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags);
-}
-EXPORT_SYMBOL_GPL(kasan_save_enable_multi_shot);
-
-void kasan_restore_multi_shot(bool enabled)
-{
- if (!enabled)
- clear_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags);
-}
-EXPORT_SYMBOL_GPL(kasan_restore_multi_shot);
-
-static int __init kasan_set_multi_shot(char *str)
-{
- set_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags);
- return 1;
-}
-__setup("kasan_multi_shot", kasan_set_multi_shot);
-
-static inline bool kasan_report_enabled(void)
-{
- if (current->kasan_depth)
- return false;
- if (test_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags))
- return true;
- return !test_and_set_bit(KASAN_BIT_REPORTED, &kasan_flags);
+ end_report(&flags);
}
void kasan_report(unsigned long addr, size_t size,
bool is_write, unsigned long ip)
{
struct kasan_access_info info;
+ void *tagged_addr;
+ void *untagged_addr;
+ unsigned long flags;
- if (likely(!kasan_report_enabled()))
+ if (likely(!report_enabled()))
return;
disable_trace_on_warning();
- info.access_addr = (void *)addr;
- info.first_bad_addr = (void *)addr;
+ tagged_addr = (void *)addr;
+ untagged_addr = reset_tag(tagged_addr);
+
+ info.access_addr = tagged_addr;
+ if (addr_has_shadow(untagged_addr))
+ info.first_bad_addr = find_first_bad_addr(tagged_addr, size);
+ else
+ info.first_bad_addr = untagged_addr;
info.access_size = size;
info.is_write = is_write;
info.ip = ip;
- kasan_report_error(&info);
-}
+ start_report(&flags);
+ print_error_description(&info);
+ if (addr_has_shadow(untagged_addr))
+ print_tags(get_tag(tagged_addr), info.first_bad_addr);
+ pr_err("\n");
-#define DEFINE_ASAN_REPORT_LOAD(size) \
-void __asan_report_load##size##_noabort(unsigned long addr) \
-{ \
- kasan_report(addr, size, false, _RET_IP_); \
-} \
-EXPORT_SYMBOL(__asan_report_load##size##_noabort)
-
-#define DEFINE_ASAN_REPORT_STORE(size) \
-void __asan_report_store##size##_noabort(unsigned long addr) \
-{ \
- kasan_report(addr, size, true, _RET_IP_); \
-} \
-EXPORT_SYMBOL(__asan_report_store##size##_noabort)
-
-DEFINE_ASAN_REPORT_LOAD(1);
-DEFINE_ASAN_REPORT_LOAD(2);
-DEFINE_ASAN_REPORT_LOAD(4);
-DEFINE_ASAN_REPORT_LOAD(8);
-DEFINE_ASAN_REPORT_LOAD(16);
-DEFINE_ASAN_REPORT_STORE(1);
-DEFINE_ASAN_REPORT_STORE(2);
-DEFINE_ASAN_REPORT_STORE(4);
-DEFINE_ASAN_REPORT_STORE(8);
-DEFINE_ASAN_REPORT_STORE(16);
-
-void __asan_report_load_n_noabort(unsigned long addr, size_t size)
-{
- kasan_report(addr, size, false, _RET_IP_);
-}
-EXPORT_SYMBOL(__asan_report_load_n_noabort);
+ if (addr_has_shadow(untagged_addr)) {
+ print_address_description(untagged_addr);
+ pr_err("\n");
+ print_shadow_for_address(info.first_bad_addr);
+ } else {
+ dump_stack();
+ }
-void __asan_report_store_n_noabort(unsigned long addr, size_t size)
-{
- kasan_report(addr, size, true, _RET_IP_);
+ end_report(&flags);
}
-EXPORT_SYMBOL(__asan_report_store_n_noabort);
diff --git a/mm/kasan/tags.c b/mm/kasan/tags.c
new file mode 100644
index 000000000000..0777649e07c4
--- /dev/null
+++ b/mm/kasan/tags.c
@@ -0,0 +1,161 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * This file contains core tag-based KASAN code.
+ *
+ * Copyright (c) 2018 Google, Inc.
+ * Author: Andrey Konovalov <andreyknvl@google.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#define DISABLE_BRANCH_PROFILING
+
+#include <linux/export.h>
+#include <linux/interrupt.h>
+#include <linux/init.h>
+#include <linux/kasan.h>
+#include <linux/kernel.h>
+#include <linux/kmemleak.h>
+#include <linux/linkage.h>
+#include <linux/memblock.h>
+#include <linux/memory.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/printk.h>
+#include <linux/random.h>
+#include <linux/sched.h>
+#include <linux/sched/task_stack.h>
+#include <linux/slab.h>
+#include <linux/stacktrace.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/vmalloc.h>
+#include <linux/bug.h>
+
+#include "kasan.h"
+#include "../slab.h"
+
+static DEFINE_PER_CPU(u32, prng_state);
+
+void kasan_init_tags(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ per_cpu(prng_state, cpu) = get_random_u32();
+}
+
+/*
+ * If a preemption happens between this_cpu_read and this_cpu_write, the only
+ * side effect is that we'll give a few allocated in different contexts objects
+ * the same tag. Since tag-based KASAN is meant to be used a probabilistic
+ * bug-detection debug feature, this doesn't have significant negative impact.
+ *
+ * Ideally the tags use strong randomness to prevent any attempts to predict
+ * them during explicit exploit attempts. But strong randomness is expensive,
+ * and we did an intentional trade-off to use a PRNG. This non-atomic RMW
+ * sequence has in fact positive effect, since interrupts that randomly skew
+ * PRNG at unpredictable points do only good.
+ */
+u8 random_tag(void)
+{
+ u32 state = this_cpu_read(prng_state);
+
+ state = 1664525 * state + 1013904223;
+ this_cpu_write(prng_state, state);
+
+ return (u8)(state % (KASAN_TAG_MAX + 1));
+}
+
+void *kasan_reset_tag(const void *addr)
+{
+ return reset_tag(addr);
+}
+
+void check_memory_region(unsigned long addr, size_t size, bool write,
+ unsigned long ret_ip)
+{
+ u8 tag;
+ u8 *shadow_first, *shadow_last, *shadow;
+ void *untagged_addr;
+
+ if (unlikely(size == 0))
+ return;
+
+ tag = get_tag((const void *)addr);
+
+ /*
+ * Ignore accesses for pointers tagged with 0xff (native kernel
+ * pointer tag) to suppress false positives caused by kmap.
+ *
+ * Some kernel code was written to account for archs that don't keep
+ * high memory mapped all the time, but rather map and unmap particular
+ * pages when needed. Instead of storing a pointer to the kernel memory,
+ * this code saves the address of the page structure and offset within
+ * that page for later use. Those pages are then mapped and unmapped
+ * with kmap/kunmap when necessary and virt_to_page is used to get the
+ * virtual address of the page. For arm64 (that keeps the high memory
+ * mapped all the time), kmap is turned into a page_address call.
+
+ * The issue is that with use of the page_address + virt_to_page
+ * sequence the top byte value of the original pointer gets lost (gets
+ * set to KASAN_TAG_KERNEL (0xFF)).
+ */
+ if (tag == KASAN_TAG_KERNEL)
+ return;
+
+ untagged_addr = reset_tag((const void *)addr);
+ if (unlikely(untagged_addr <
+ kasan_shadow_to_mem((void *)KASAN_SHADOW_START))) {
+ kasan_report(addr, size, write, ret_ip);
+ return;
+ }
+ shadow_first = kasan_mem_to_shadow(untagged_addr);
+ shadow_last = kasan_mem_to_shadow(untagged_addr + size - 1);
+ for (shadow = shadow_first; shadow <= shadow_last; shadow++) {
+ if (*shadow != tag) {
+ kasan_report(addr, size, write, ret_ip);
+ return;
+ }
+ }
+}
+
+#define DEFINE_HWASAN_LOAD_STORE(size) \
+ void __hwasan_load##size##_noabort(unsigned long addr) \
+ { \
+ check_memory_region(addr, size, false, _RET_IP_); \
+ } \
+ EXPORT_SYMBOL(__hwasan_load##size##_noabort); \
+ void __hwasan_store##size##_noabort(unsigned long addr) \
+ { \
+ check_memory_region(addr, size, true, _RET_IP_); \
+ } \
+ EXPORT_SYMBOL(__hwasan_store##size##_noabort)
+
+DEFINE_HWASAN_LOAD_STORE(1);
+DEFINE_HWASAN_LOAD_STORE(2);
+DEFINE_HWASAN_LOAD_STORE(4);
+DEFINE_HWASAN_LOAD_STORE(8);
+DEFINE_HWASAN_LOAD_STORE(16);
+
+void __hwasan_loadN_noabort(unsigned long addr, unsigned long size)
+{
+ check_memory_region(addr, size, false, _RET_IP_);
+}
+EXPORT_SYMBOL(__hwasan_loadN_noabort);
+
+void __hwasan_storeN_noabort(unsigned long addr, unsigned long size)
+{
+ check_memory_region(addr, size, true, _RET_IP_);
+}
+EXPORT_SYMBOL(__hwasan_storeN_noabort);
+
+void __hwasan_tag_memory(unsigned long addr, u8 tag, unsigned long size)
+{
+ kasan_poison_shadow((void *)addr, size, tag);
+}
+EXPORT_SYMBOL(__hwasan_tag_memory);
diff --git a/mm/kasan/tags_report.c b/mm/kasan/tags_report.c
new file mode 100644
index 000000000000..8eaf5f722271
--- /dev/null
+++ b/mm/kasan/tags_report.c
@@ -0,0 +1,58 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * This file contains tag-based KASAN specific error reporting code.
+ *
+ * Copyright (c) 2014 Samsung Electronics Co., Ltd.
+ * Author: Andrey Ryabinin <ryabinin.a.a@gmail.com>
+ *
+ * Some code borrowed from https://github.com/xairy/kasan-prototype by
+ * Andrey Konovalov <andreyknvl@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+
+#include <linux/bitops.h>
+#include <linux/ftrace.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/printk.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/stackdepot.h>
+#include <linux/stacktrace.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/kasan.h>
+#include <linux/module.h>
+
+#include <asm/sections.h>
+
+#include "kasan.h"
+#include "../slab.h"
+
+const char *get_bug_type(struct kasan_access_info *info)
+{
+ return "invalid-access";
+}
+
+void *find_first_bad_addr(void *addr, size_t size)
+{
+ u8 tag = get_tag(addr);
+ void *p = reset_tag(addr);
+ void *end = p + size;
+
+ while (p < end && tag == *(u8 *)kasan_mem_to_shadow(p))
+ p += KASAN_SHADOW_SCALE_SIZE;
+ return p;
+}
+
+void print_tags(u8 addr_tag, const void *addr)
+{
+ u8 *shadow = (u8 *)kasan_mem_to_shadow(addr);
+
+ pr_err("Pointer tag: [%02x], memory tag: [%02x]\n", addr_tag, *shadow);
+}
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 43ce2f4d2551..4f017339ddb2 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -944,8 +944,7 @@ static void collapse_huge_page(struct mm_struct *mm,
int isolated = 0, result = 0;
struct mem_cgroup *memcg;
struct vm_area_struct *vma;
- unsigned long mmun_start; /* For mmu_notifiers */
- unsigned long mmun_end; /* For mmu_notifiers */
+ struct mmu_notifier_range range;
gfp_t gfp;
VM_BUG_ON(address & ~HPAGE_PMD_MASK);
@@ -1017,9 +1016,8 @@ static void collapse_huge_page(struct mm_struct *mm,
pte = pte_offset_map(pmd, address);
pte_ptl = pte_lockptr(mm, pmd);
- mmun_start = address;
- mmun_end = address + HPAGE_PMD_SIZE;
- mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
+ mmu_notifier_range_init(&range, mm, address, address + HPAGE_PMD_SIZE);
+ mmu_notifier_invalidate_range_start(&range);
pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */
/*
* After this gup_fast can't run anymore. This also removes
@@ -1029,7 +1027,7 @@ static void collapse_huge_page(struct mm_struct *mm,
*/
_pmd = pmdp_collapse_flush(vma, address, pmd);
spin_unlock(pmd_ptl);
- mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+ mmu_notifier_invalidate_range_end(&range);
spin_lock(pte_ptl);
isolated = __collapse_huge_page_isolate(vma, address, pte);
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 877de4fa0720..f9d9dc250428 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -1547,11 +1547,14 @@ static void kmemleak_scan(void)
unsigned long pfn;
for (pfn = start_pfn; pfn < end_pfn; pfn++) {
- struct page *page;
+ struct page *page = pfn_to_online_page(pfn);
- if (!pfn_valid(pfn))
+ if (!page)
+ continue;
+
+ /* only scan pages belonging to this node */
+ if (page_to_nid(page) != i)
continue;
- page = pfn_to_page(pfn);
/* only scan if page is in use */
if (page_count(page) == 0)
continue;
@@ -1647,7 +1650,7 @@ static void kmemleak_scan(void)
*/
static int kmemleak_scan_thread(void *arg)
{
- static int first_run = 1;
+ static int first_run = IS_ENABLED(CONFIG_DEBUG_KMEMLEAK_AUTO_SCAN);
pr_info("Automatic memory scanning thread started\n");
set_user_nice(current, 10);
@@ -2141,9 +2144,11 @@ static int __init kmemleak_late_init(void)
return -ENOMEM;
}
- mutex_lock(&scan_mutex);
- start_scan_thread();
- mutex_unlock(&scan_mutex);
+ if (IS_ENABLED(CONFIG_DEBUG_KMEMLEAK_AUTO_SCAN)) {
+ mutex_lock(&scan_mutex);
+ start_scan_thread();
+ mutex_unlock(&scan_mutex);
+ }
pr_info("Kernel memory leak detector initialized\n");
diff --git a/mm/ksm.c b/mm/ksm.c
index 5b0894b45ee5..6c48ad13b4c9 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -25,7 +25,7 @@
#include <linux/pagemap.h>
#include <linux/rmap.h>
#include <linux/spinlock.h>
-#include <linux/jhash.h>
+#include <linux/xxhash.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/wait.h>
@@ -296,6 +296,7 @@ static unsigned long ksm_run = KSM_RUN_STOP;
static void wait_while_offlining(void);
static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait);
+static DECLARE_WAIT_QUEUE_HEAD(ksm_iter_wait);
static DEFINE_MUTEX(ksm_thread_mutex);
static DEFINE_SPINLOCK(ksm_mmlist_lock);
@@ -1009,7 +1010,7 @@ static u32 calc_checksum(struct page *page)
{
u32 checksum;
void *addr = kmap_atomic(page);
- checksum = jhash2(addr, PAGE_SIZE / 4, 17);
+ checksum = xxhash(addr, PAGE_SIZE, 0);
kunmap_atomic(addr);
return checksum;
}
@@ -1042,8 +1043,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
};
int swapped;
int err = -EFAULT;
- unsigned long mmun_start; /* For mmu_notifiers */
- unsigned long mmun_end; /* For mmu_notifiers */
+ struct mmu_notifier_range range;
pvmw.address = page_address_in_vma(page, vma);
if (pvmw.address == -EFAULT)
@@ -1051,9 +1051,9 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
BUG_ON(PageTransCompound(page));
- mmun_start = pvmw.address;
- mmun_end = pvmw.address + PAGE_SIZE;
- mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
+ mmu_notifier_range_init(&range, mm, pvmw.address,
+ pvmw.address + PAGE_SIZE);
+ mmu_notifier_invalidate_range_start(&range);
if (!page_vma_mapped_walk(&pvmw))
goto out_mn;
@@ -1105,7 +1105,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
out_unlock:
page_vma_mapped_walk_done(&pvmw);
out_mn:
- mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+ mmu_notifier_invalidate_range_end(&range);
out:
return err;
}
@@ -1129,8 +1129,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
spinlock_t *ptl;
unsigned long addr;
int err = -EFAULT;
- unsigned long mmun_start; /* For mmu_notifiers */
- unsigned long mmun_end; /* For mmu_notifiers */
+ struct mmu_notifier_range range;
addr = page_address_in_vma(page, vma);
if (addr == -EFAULT)
@@ -1140,9 +1139,8 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
if (!pmd)
goto out;
- mmun_start = addr;
- mmun_end = addr + PAGE_SIZE;
- mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
+ mmu_notifier_range_init(&range, mm, addr, addr + PAGE_SIZE);
+ mmu_notifier_invalidate_range_start(&range);
ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
if (!pte_same(*ptep, orig_pte)) {
@@ -1188,7 +1186,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
pte_unmap_unlock(ptep, ptl);
err = 0;
out_mn:
- mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+ mmu_notifier_invalidate_range_end(&range);
out:
return err;
}
@@ -2391,6 +2389,8 @@ static int ksmd_should_run(void)
static int ksm_scan_thread(void *nothing)
{
+ unsigned int sleep_ms;
+
set_freezable();
set_user_nice(current, 5);
@@ -2404,8 +2404,10 @@ static int ksm_scan_thread(void *nothing)
try_to_freeze();
if (ksmd_should_run()) {
- schedule_timeout_interruptible(
- msecs_to_jiffies(ksm_thread_sleep_millisecs));
+ sleep_ms = READ_ONCE(ksm_thread_sleep_millisecs);
+ wait_event_interruptible_timeout(ksm_iter_wait,
+ sleep_ms != READ_ONCE(ksm_thread_sleep_millisecs),
+ msecs_to_jiffies(sleep_ms));
} else {
wait_event_freezable(ksm_thread_wait,
ksmd_should_run() || kthread_should_stop());
@@ -2824,6 +2826,7 @@ static ssize_t sleep_millisecs_store(struct kobject *kobj,
return -EINVAL;
ksm_thread_sleep_millisecs = msecs;
+ wake_up_interruptible(&ksm_iter_wait);
return count;
}
diff --git a/mm/madvise.c b/mm/madvise.c
index 6cb1ca93e290..21a7881a2db4 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -458,29 +458,30 @@ static void madvise_free_page_range(struct mmu_gather *tlb,
static int madvise_free_single_vma(struct vm_area_struct *vma,
unsigned long start_addr, unsigned long end_addr)
{
- unsigned long start, end;
struct mm_struct *mm = vma->vm_mm;
+ struct mmu_notifier_range range;
struct mmu_gather tlb;
/* MADV_FREE works for only anon vma at the moment */
if (!vma_is_anonymous(vma))
return -EINVAL;
- start = max(vma->vm_start, start_addr);
- if (start >= vma->vm_end)
+ range.start = max(vma->vm_start, start_addr);
+ if (range.start >= vma->vm_end)
return -EINVAL;
- end = min(vma->vm_end, end_addr);
- if (end <= vma->vm_start)
+ range.end = min(vma->vm_end, end_addr);
+ if (range.end <= vma->vm_start)
return -EINVAL;
+ mmu_notifier_range_init(&range, mm, range.start, range.end);
lru_add_drain();
- tlb_gather_mmu(&tlb, mm, start, end);
+ tlb_gather_mmu(&tlb, mm, range.start, range.end);
update_hiwater_rss(mm);
- mmu_notifier_invalidate_range_start(mm, start, end);
- madvise_free_page_range(&tlb, vma, start, end);
- mmu_notifier_invalidate_range_end(mm, start, end);
- tlb_finish_mmu(&tlb, start, end);
+ mmu_notifier_invalidate_range_start(&range);
+ madvise_free_page_range(&tlb, vma, range.start, range.end);
+ mmu_notifier_invalidate_range_end(&range);
+ tlb_finish_mmu(&tlb, range.start, range.end);
return 0;
}
diff --git a/mm/memblock.c b/mm/memblock.c
index 81ae63ca78d0..022d4cbb3618 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -262,7 +262,8 @@ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size,
phys_addr_t kernel_end, ret;
/* pump up @end */
- if (end == MEMBLOCK_ALLOC_ACCESSIBLE)
+ if (end == MEMBLOCK_ALLOC_ACCESSIBLE ||
+ end == MEMBLOCK_ALLOC_KASAN)
end = memblock.current_limit;
/* avoid allocating the first page */
@@ -800,7 +801,14 @@ int __init_memblock memblock_remove(phys_addr_t base, phys_addr_t size)
return memblock_remove_range(&memblock.memory, base, size);
}
-
+/**
+ * memblock_free - free boot memory block
+ * @base: phys starting address of the boot memory block
+ * @size: size of the boot memory block in bytes
+ *
+ * Free boot memory block previously allocated by memblock_alloc_xx() API.
+ * The freeing memory will not be released to the buddy allocator.
+ */
int __init_memblock memblock_free(phys_addr_t base, phys_addr_t size)
{
phys_addr_t end = base + size - 1;
@@ -1412,13 +1420,15 @@ again:
done:
ptr = phys_to_virt(alloc);
- /*
- * The min_count is set to 0 so that bootmem allocated blocks
- * are never reported as leaks. This is because many of these blocks
- * are only referred via the physical address which is not
- * looked up by kmemleak.
- */
- kmemleak_alloc(ptr, size, 0, 0);
+ /* Skip kmemleak for kasan_init() due to high volume. */
+ if (max_addr != MEMBLOCK_ALLOC_KASAN)
+ /*
+ * The min_count is set to 0 so that bootmem allocated
+ * blocks are never reported as leaks. This is because many
+ * of these blocks are only referred via the physical
+ * address which is not looked up by kmemleak.
+ */
+ kmemleak_alloc(ptr, size, 0, 0);
return ptr;
}
@@ -1537,24 +1547,6 @@ void * __init memblock_alloc_try_nid(
}
/**
- * __memblock_free_early - free boot memory block
- * @base: phys starting address of the boot memory block
- * @size: size of the boot memory block in bytes
- *
- * Free boot memory block previously allocated by memblock_alloc_xx() API.
- * The freeing memory will not be released to the buddy allocator.
- */
-void __init __memblock_free_early(phys_addr_t base, phys_addr_t size)
-{
- phys_addr_t end = base + size - 1;
-
- memblock_dbg("%s: [%pa-%pa] %pF\n",
- __func__, &base, &end, (void *)_RET_IP_);
- kmemleak_free_part_phys(base, size);
- memblock_remove_range(&memblock.reserved, base, size);
-}
-
-/**
* __memblock_free_late - free bootmem block pages directly to buddy allocator
* @base: phys starting address of the boot memory block
* @size: size of the boot memory block in bytes
@@ -1576,7 +1568,7 @@ void __init __memblock_free_late(phys_addr_t base, phys_addr_t size)
for (; cursor < end; cursor++) {
memblock_free_pages(pfn_to_page(cursor), cursor, 0);
- totalram_pages++;
+ totalram_pages_inc();
}
}
@@ -1950,7 +1942,7 @@ void reset_node_managed_pages(pg_data_t *pgdat)
struct zone *z;
for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++)
- z->managed_pages = 0;
+ atomic_long_set(&z->managed_pages, 0);
}
void __init reset_all_zones_managed_pages(void)
@@ -1978,7 +1970,7 @@ unsigned long __init memblock_free_all(void)
reset_all_zones_managed_pages();
pages = free_low_memory_core_early();
- totalram_pages += pages;
+ totalram_pages_add(pages);
return pages;
}
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 6e1469b80cb7..af7f18b32389 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1293,32 +1293,39 @@ static const char *const memcg1_stat_names[] = {
#define K(x) ((x) << (PAGE_SHIFT-10))
/**
- * mem_cgroup_print_oom_info: Print OOM information relevant to memory controller.
+ * mem_cgroup_print_oom_context: Print OOM information relevant to
+ * memory controller.
* @memcg: The memory cgroup that went over limit
* @p: Task that is going to be killed
*
* NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
* enabled
*/
-void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
+void mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *p)
{
- struct mem_cgroup *iter;
- unsigned int i;
-
rcu_read_lock();
+ if (memcg) {
+ pr_cont(",oom_memcg=");
+ pr_cont_cgroup_path(memcg->css.cgroup);
+ } else
+ pr_cont(",global_oom");
if (p) {
- pr_info("Task in ");
+ pr_cont(",task_memcg=");
pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id));
- pr_cont(" killed as a result of limit of ");
- } else {
- pr_info("Memory limit reached of cgroup ");
}
-
- pr_cont_cgroup_path(memcg->css.cgroup);
- pr_cont("\n");
-
rcu_read_unlock();
+}
+
+/**
+ * mem_cgroup_print_oom_meminfo: Print OOM memory information relevant to
+ * memory controller.
+ * @memcg: The memory cgroup that went over limit
+ */
+void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg)
+{
+ struct mem_cgroup *iter;
+ unsigned int i;
pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n",
K((u64)page_counter_read(&memcg->memory)),
@@ -1666,6 +1673,9 @@ enum oom_status {
static enum oom_status mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
{
+ enum oom_status ret;
+ bool locked;
+
if (order > PAGE_ALLOC_COSTLY_ORDER)
return OOM_SKIPPED;
@@ -1700,10 +1710,23 @@ static enum oom_status mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int
return OOM_ASYNC;
}
+ mem_cgroup_mark_under_oom(memcg);
+
+ locked = mem_cgroup_oom_trylock(memcg);
+
+ if (locked)
+ mem_cgroup_oom_notify(memcg);
+
+ mem_cgroup_unmark_under_oom(memcg);
if (mem_cgroup_out_of_memory(memcg, mask, order))
- return OOM_SUCCESS;
+ ret = OOM_SUCCESS;
+ else
+ ret = OOM_FAILED;
- return OOM_FAILED;
+ if (locked)
+ mem_cgroup_oom_unlock(memcg);
+
+ return ret;
}
/**
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 7c72f2a95785..6379fff1a5ff 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -966,7 +966,7 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
struct address_space *mapping;
LIST_HEAD(tokill);
- bool unmap_success;
+ bool unmap_success = true;
int kill = 1, forcekill;
struct page *hpage = *hpagep;
bool mlocked = PageMlocked(hpage);
@@ -1028,7 +1028,19 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
if (kill)
collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED);
- unmap_success = try_to_unmap(hpage, ttu);
+ if (!PageHuge(hpage)) {
+ unmap_success = try_to_unmap(hpage, ttu);
+ } else if (mapping) {
+ /*
+ * For hugetlb pages, try_to_unmap could potentially call
+ * huge_pmd_unshare. Because of this, take semaphore in
+ * write mode here and set TTU_RMAP_LOCKED to indicate we
+ * have taken the lock at this higer level.
+ */
+ i_mmap_lock_write(mapping);
+ unmap_success = try_to_unmap(hpage, ttu|TTU_RMAP_LOCKED);
+ i_mmap_unlock_write(mapping);
+ }
if (!unmap_success)
pr_err("Memory failure: %#lx: failed to unmap page (mapcount=%d)\n",
pfn, page_mapcount(hpage));
diff --git a/mm/memory.c b/mm/memory.c
index 4ad2d293ddc2..2dd2f9ab57f4 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -973,8 +973,7 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
unsigned long next;
unsigned long addr = vma->vm_start;
unsigned long end = vma->vm_end;
- unsigned long mmun_start; /* For mmu_notifiers */
- unsigned long mmun_end; /* For mmu_notifiers */
+ struct mmu_notifier_range range;
bool is_cow;
int ret;
@@ -1008,11 +1007,11 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
* is_cow_mapping() returns true.
*/
is_cow = is_cow_mapping(vma->vm_flags);
- mmun_start = addr;
- mmun_end = end;
- if (is_cow)
- mmu_notifier_invalidate_range_start(src_mm, mmun_start,
- mmun_end);
+
+ if (is_cow) {
+ mmu_notifier_range_init(&range, src_mm, addr, end);
+ mmu_notifier_invalidate_range_start(&range);
+ }
ret = 0;
dst_pgd = pgd_offset(dst_mm, addr);
@@ -1029,7 +1028,7 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
} while (dst_pgd++, src_pgd++, addr = next, addr != end);
if (is_cow)
- mmu_notifier_invalidate_range_end(src_mm, mmun_start, mmun_end);
+ mmu_notifier_invalidate_range_end(&range);
return ret;
}
@@ -1332,12 +1331,13 @@ void unmap_vmas(struct mmu_gather *tlb,
struct vm_area_struct *vma, unsigned long start_addr,
unsigned long end_addr)
{
- struct mm_struct *mm = vma->vm_mm;
+ struct mmu_notifier_range range;
- mmu_notifier_invalidate_range_start(mm, start_addr, end_addr);
+ mmu_notifier_range_init(&range, vma->vm_mm, start_addr, end_addr);
+ mmu_notifier_invalidate_range_start(&range);
for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next)
unmap_single_vma(tlb, vma, start_addr, end_addr, NULL);
- mmu_notifier_invalidate_range_end(mm, start_addr, end_addr);
+ mmu_notifier_invalidate_range_end(&range);
}
/**
@@ -1351,18 +1351,18 @@ void unmap_vmas(struct mmu_gather *tlb,
void zap_page_range(struct vm_area_struct *vma, unsigned long start,
unsigned long size)
{
- struct mm_struct *mm = vma->vm_mm;
+ struct mmu_notifier_range range;
struct mmu_gather tlb;
- unsigned long end = start + size;
lru_add_drain();
- tlb_gather_mmu(&tlb, mm, start, end);
- update_hiwater_rss(mm);
- mmu_notifier_invalidate_range_start(mm, start, end);
- for ( ; vma && vma->vm_start < end; vma = vma->vm_next)
- unmap_single_vma(&tlb, vma, start, end, NULL);
- mmu_notifier_invalidate_range_end(mm, start, end);
- tlb_finish_mmu(&tlb, start, end);
+ mmu_notifier_range_init(&range, vma->vm_mm, start, start + size);
+ tlb_gather_mmu(&tlb, vma->vm_mm, start, range.end);
+ update_hiwater_rss(vma->vm_mm);
+ mmu_notifier_invalidate_range_start(&range);
+ for ( ; vma && vma->vm_start < range.end; vma = vma->vm_next)
+ unmap_single_vma(&tlb, vma, start, range.end, NULL);
+ mmu_notifier_invalidate_range_end(&range);
+ tlb_finish_mmu(&tlb, start, range.end);
}
/**
@@ -1377,17 +1377,17 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long start,
static void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
unsigned long size, struct zap_details *details)
{
- struct mm_struct *mm = vma->vm_mm;
+ struct mmu_notifier_range range;
struct mmu_gather tlb;
- unsigned long end = address + size;
lru_add_drain();
- tlb_gather_mmu(&tlb, mm, address, end);
- update_hiwater_rss(mm);
- mmu_notifier_invalidate_range_start(mm, address, end);
- unmap_single_vma(&tlb, vma, address, end, details);
- mmu_notifier_invalidate_range_end(mm, address, end);
- tlb_finish_mmu(&tlb, address, end);
+ mmu_notifier_range_init(&range, vma->vm_mm, address, address + size);
+ tlb_gather_mmu(&tlb, vma->vm_mm, address, range.end);
+ update_hiwater_rss(vma->vm_mm);
+ mmu_notifier_invalidate_range_start(&range);
+ unmap_single_vma(&tlb, vma, address, range.end, details);
+ mmu_notifier_invalidate_range_end(&range);
+ tlb_finish_mmu(&tlb, address, range.end);
}
/**
@@ -2247,9 +2247,8 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
struct page *new_page = NULL;
pte_t entry;
int page_copied = 0;
- const unsigned long mmun_start = vmf->address & PAGE_MASK;
- const unsigned long mmun_end = mmun_start + PAGE_SIZE;
struct mem_cgroup *memcg;
+ struct mmu_notifier_range range;
if (unlikely(anon_vma_prepare(vma)))
goto oom;
@@ -2272,7 +2271,9 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
__SetPageUptodate(new_page);
- mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
+ mmu_notifier_range_init(&range, mm, vmf->address & PAGE_MASK,
+ (vmf->address & PAGE_MASK) + PAGE_SIZE);
+ mmu_notifier_invalidate_range_start(&range);
/*
* Re-check the pte - we dropped the lock
@@ -2349,7 +2350,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
* No need to double call mmu_notifier->invalidate_range() callback as
* the above ptep_clear_flush_notify() did already call it.
*/
- mmu_notifier_invalidate_range_only_end(mm, mmun_start, mmun_end);
+ mmu_notifier_invalidate_range_only_end(&range);
if (old_page) {
/*
* Don't let another task, with possibly unlocked vma,
@@ -3830,7 +3831,7 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
vmf.pud = pud_alloc(mm, p4d, address);
if (!vmf.pud)
return VM_FAULT_OOM;
- if (pud_none(*vmf.pud) && transparent_hugepage_enabled(vma)) {
+ if (pud_none(*vmf.pud) && __transparent_hugepage_enabled(vma)) {
ret = create_huge_pud(&vmf);
if (!(ret & VM_FAULT_FALLBACK))
return ret;
@@ -3856,7 +3857,7 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
vmf.pmd = pmd_alloc(mm, vmf.pud, address);
if (!vmf.pmd)
return VM_FAULT_OOM;
- if (pmd_none(*vmf.pmd) && transparent_hugepage_enabled(vma)) {
+ if (pmd_none(*vmf.pmd) && __transparent_hugepage_enabled(vma)) {
ret = create_huge_pmd(&vmf);
if (!(ret & VM_FAULT_FALLBACK))
return ret;
@@ -4030,7 +4031,7 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
#endif /* __PAGETABLE_PMD_FOLDED */
static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address,
- unsigned long *start, unsigned long *end,
+ struct mmu_notifier_range *range,
pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp)
{
pgd_t *pgd;
@@ -4058,10 +4059,10 @@ static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address,
if (!pmdpp)
goto out;
- if (start && end) {
- *start = address & PMD_MASK;
- *end = *start + PMD_SIZE;
- mmu_notifier_invalidate_range_start(mm, *start, *end);
+ if (range) {
+ mmu_notifier_range_init(range, mm, address & PMD_MASK,
+ (address & PMD_MASK) + PMD_SIZE);
+ mmu_notifier_invalidate_range_start(range);
}
*ptlp = pmd_lock(mm, pmd);
if (pmd_huge(*pmd)) {
@@ -4069,17 +4070,17 @@ static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address,
return 0;
}
spin_unlock(*ptlp);
- if (start && end)
- mmu_notifier_invalidate_range_end(mm, *start, *end);
+ if (range)
+ mmu_notifier_invalidate_range_end(range);
}
if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
goto out;
- if (start && end) {
- *start = address & PAGE_MASK;
- *end = *start + PAGE_SIZE;
- mmu_notifier_invalidate_range_start(mm, *start, *end);
+ if (range) {
+ range->start = address & PAGE_MASK;
+ range->end = range->start + PAGE_SIZE;
+ mmu_notifier_invalidate_range_start(range);
}
ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
if (!pte_present(*ptep))
@@ -4088,8 +4089,8 @@ static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address,
return 0;
unlock:
pte_unmap_unlock(ptep, *ptlp);
- if (start && end)
- mmu_notifier_invalidate_range_end(mm, *start, *end);
+ if (range)
+ mmu_notifier_invalidate_range_end(range);
out:
return -EINVAL;
}
@@ -4101,20 +4102,20 @@ static inline int follow_pte(struct mm_struct *mm, unsigned long address,
/* (void) is needed to make gcc happy */
(void) __cond_lock(*ptlp,
- !(res = __follow_pte_pmd(mm, address, NULL, NULL,
+ !(res = __follow_pte_pmd(mm, address, NULL,
ptepp, NULL, ptlp)));
return res;
}
int follow_pte_pmd(struct mm_struct *mm, unsigned long address,
- unsigned long *start, unsigned long *end,
- pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp)
+ struct mmu_notifier_range *range,
+ pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp)
{
int res;
/* (void) is needed to make gcc happy */
(void) __cond_lock(*ptlp,
- !(res = __follow_pte_pmd(mm, address, start, end,
+ !(res = __follow_pte_pmd(mm, address, range,
ptepp, pmdpp, ptlp)));
return res;
}
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 2b2b3ccbbfb5..b9a667d36c55 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -34,6 +34,7 @@
#include <linux/hugetlb.h>
#include <linux/memblock.h>
#include <linux/compaction.h>
+#include <linux/rmap.h>
#include <asm/tlbflush.h>
@@ -253,7 +254,7 @@ static int __meminit __add_section(int nid, unsigned long phys_start_pfn,
if (pfn_valid(phys_start_pfn))
return -EEXIST;
- ret = sparse_add_one_section(NODE_DATA(nid), phys_start_pfn, altmap);
+ ret = sparse_add_one_section(nid, phys_start_pfn, altmap);
if (ret < 0)
return ret;
@@ -743,14 +744,13 @@ void __ref move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
int nid = pgdat->node_id;
unsigned long flags;
- if (zone_is_empty(zone))
- init_currently_empty_zone(zone, start_pfn, nr_pages);
-
clear_zone_contiguous(zone);
/* TODO Huh pgdat is irqsave while zone is not. It used to be like that before */
pgdat_resize_lock(pgdat, &flags);
zone_span_writelock(zone);
+ if (zone_is_empty(zone))
+ init_currently_empty_zone(zone, start_pfn, nr_pages);
resize_zone_range(zone, start_pfn, nr_pages);
zone_span_writeunlock(zone);
resize_pgdat_range(pgdat, start_pfn, nr_pages);
@@ -1078,7 +1078,7 @@ static int online_memory_block(struct memory_block *mem, void *arg)
*
* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG
*/
-int __ref add_memory_resource(int nid, struct resource *res, bool online)
+int __ref add_memory_resource(int nid, struct resource *res)
{
u64 start, size;
bool new_node = false;
@@ -1133,7 +1133,7 @@ int __ref add_memory_resource(int nid, struct resource *res, bool online)
mem_hotplug_done();
/* online pages if requested */
- if (online)
+ if (memhp_auto_online)
walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1),
NULL, online_memory_block);
@@ -1157,7 +1157,7 @@ int __ref __add_memory(int nid, u64 start, u64 size)
if (IS_ERR(res))
return PTR_ERR(res);
- ret = add_memory_resource(nid, res, memhp_auto_online);
+ ret = add_memory_resource(nid, res);
if (ret < 0)
release_memory_resource(res);
return ret;
@@ -1226,7 +1226,7 @@ static bool is_pageblock_removable_nolock(struct page *page)
if (!zone_spans_pfn(zone, pfn))
return false;
- return !has_unmovable_pages(zone, page, 0, MIGRATE_MOVABLE, true);
+ return !has_unmovable_pages(zone, page, 0, MIGRATE_MOVABLE, SKIP_HWPOISON);
}
/* Checks if this range of memory is likely to be hot-removable. */
@@ -1339,18 +1339,16 @@ static struct page *new_node_page(struct page *page, unsigned long private)
return new_page_nodemask(page, nid, &nmask);
}
-#define NR_OFFLINE_AT_ONCE_PAGES (256)
static int
do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
{
unsigned long pfn;
struct page *page;
- int move_pages = NR_OFFLINE_AT_ONCE_PAGES;
int not_managed = 0;
int ret = 0;
LIST_HEAD(source);
- for (pfn = start_pfn; pfn < end_pfn && move_pages > 0; pfn++) {
+ for (pfn = start_pfn; pfn < end_pfn; pfn++) {
if (!pfn_valid(pfn))
continue;
page = pfn_to_page(pfn);
@@ -1362,13 +1360,27 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
ret = -EBUSY;
break;
}
- if (isolate_huge_page(page, &source))
- move_pages -= 1 << compound_order(head);
+ isolate_huge_page(page, &source);
continue;
} else if (PageTransHuge(page))
pfn = page_to_pfn(compound_head(page))
+ hpage_nr_pages(page) - 1;
+ /*
+ * HWPoison pages have elevated reference counts so the migration would
+ * fail on them. It also doesn't make any sense to migrate them in the
+ * first place. Still try to unmap such a page in case it is still mapped
+ * (e.g. current hwpoison implementation doesn't unmap KSM pages but keep
+ * the unmap as the catch all safety net).
+ */
+ if (PageHWPoison(page)) {
+ if (WARN_ON(PageLRU(page)))
+ isolate_lru_page(page);
+ if (page_mapped(page))
+ try_to_unmap(page, TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS);
+ continue;
+ }
+
if (!get_page_unless_zero(page))
continue;
/*
@@ -1382,16 +1394,13 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
if (!ret) { /* Success */
put_page(page);
list_add_tail(&page->lru, &source);
- move_pages--;
if (!__PageMovable(page))
inc_node_page_state(page, NR_ISOLATED_ANON +
page_is_file_cache(page));
} else {
-#ifdef CONFIG_DEBUG_VM
- pr_alert("failed to isolate pfn %lx\n", pfn);
+ pr_warn("failed to isolate pfn %lx\n", pfn);
dump_page(page, "isolation failed");
-#endif
put_page(page);
/* Because we don't have big zone->lock. we should
check this again here. */
@@ -1411,8 +1420,14 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
/* Allocate a new page from the nearest neighbor node */
ret = migrate_pages(&source, new_node_page, NULL, 0,
MIGRATE_SYNC, MR_MEMORY_HOTPLUG);
- if (ret)
+ if (ret) {
+ list_for_each_entry(page, &source, lru) {
+ pr_warn("migrating pfn %lx failed ret:%d ",
+ page_to_pfn(page), ret);
+ dump_page(page, "migration failure");
+ }
putback_movable_pages(&source);
+ }
}
out:
return ret;
@@ -1553,12 +1568,7 @@ static int __ref __offline_pages(unsigned long start_pfn,
unsigned long valid_start, valid_end;
struct zone *zone;
struct memory_notify arg;
-
- /* at least, alignment against pageblock is necessary */
- if (!IS_ALIGNED(start_pfn, pageblock_nr_pages))
- return -EINVAL;
- if (!IS_ALIGNED(end_pfn, pageblock_nr_pages))
- return -EINVAL;
+ char *reason;
mem_hotplug_begin();
@@ -1567,7 +1577,9 @@ static int __ref __offline_pages(unsigned long start_pfn,
if (!test_pages_in_a_zone(start_pfn, end_pfn, &valid_start,
&valid_end)) {
mem_hotplug_done();
- return -EINVAL;
+ ret = -EINVAL;
+ reason = "multizone range";
+ goto failed_removal;
}
zone = page_zone(pfn_to_page(valid_start));
@@ -1576,10 +1588,12 @@ static int __ref __offline_pages(unsigned long start_pfn,
/* set above range as isolated */
ret = start_isolate_page_range(start_pfn, end_pfn,
- MIGRATE_MOVABLE, true);
+ MIGRATE_MOVABLE,
+ SKIP_HWPOISON | REPORT_FAILURE);
if (ret) {
mem_hotplug_done();
- return ret;
+ reason = "failure to isolate range";
+ goto failed_removal;
}
arg.start_pfn = start_pfn;
@@ -1588,37 +1602,47 @@ static int __ref __offline_pages(unsigned long start_pfn,
ret = memory_notify(MEM_GOING_OFFLINE, &arg);
ret = notifier_to_errno(ret);
- if (ret)
- goto failed_removal;
+ if (ret) {
+ reason = "notifier failure";
+ goto failed_removal_isolated;
+ }
- pfn = start_pfn;
-repeat:
- /* start memory hot removal */
- ret = -EINTR;
- if (signal_pending(current))
- goto failed_removal;
+ do {
+ for (pfn = start_pfn; pfn;) {
+ if (signal_pending(current)) {
+ ret = -EINTR;
+ reason = "signal backoff";
+ goto failed_removal_isolated;
+ }
- cond_resched();
- lru_add_drain_all();
- drain_all_pages(zone);
+ cond_resched();
+ lru_add_drain_all();
+ drain_all_pages(zone);
+
+ pfn = scan_movable_pages(pfn, end_pfn);
+ if (pfn) {
+ /*
+ * TODO: fatal migration failures should bail
+ * out
+ */
+ do_migrate_range(pfn, end_pfn);
+ }
+ }
- pfn = scan_movable_pages(start_pfn, end_pfn);
- if (pfn) { /* We have movable pages */
- ret = do_migrate_range(pfn, end_pfn);
- goto repeat;
- }
+ /*
+ * Dissolve free hugepages in the memory block before doing
+ * offlining actually in order to make hugetlbfs's object
+ * counting consistent.
+ */
+ ret = dissolve_free_huge_pages(start_pfn, end_pfn);
+ if (ret) {
+ reason = "failure to dissolve huge pages";
+ goto failed_removal_isolated;
+ }
+ /* check again */
+ offlined_pages = check_pages_isolated(start_pfn, end_pfn);
+ } while (offlined_pages < 0);
- /*
- * dissolve free hugepages in the memory block before doing offlining
- * actually in order to make hugetlbfs's object counting consistent.
- */
- ret = dissolve_free_huge_pages(start_pfn, end_pfn);
- if (ret)
- goto failed_removal;
- /* check again */
- offlined_pages = check_pages_isolated(start_pfn, end_pfn);
- if (offlined_pages < 0)
- goto repeat;
pr_info("Offlined Pages %ld\n", offlined_pages);
/* Ok, all of our target is isolated.
We cannot do rollback at this point. */
@@ -1654,13 +1678,15 @@ repeat:
mem_hotplug_done();
return 0;
+failed_removal_isolated:
+ undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
failed_removal:
- pr_debug("memory offlining [mem %#010llx-%#010llx] failed\n",
+ pr_debug("memory offlining [mem %#010llx-%#010llx] failed due to %s\n",
(unsigned long long) start_pfn << PAGE_SHIFT,
- ((unsigned long long) end_pfn << PAGE_SHIFT) - 1);
+ ((unsigned long long) end_pfn << PAGE_SHIFT) - 1,
+ reason);
memory_notify(MEM_CANCEL_OFFLINE, &arg);
/* pushback to free area */
- undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
mem_hotplug_done();
return ret;
}
@@ -1753,34 +1779,6 @@ static int check_cpu_on_node(pg_data_t *pgdat)
return 0;
}
-static void unmap_cpu_on_node(pg_data_t *pgdat)
-{
-#ifdef CONFIG_ACPI_NUMA
- int cpu;
-
- for_each_possible_cpu(cpu)
- if (cpu_to_node(cpu) == pgdat->node_id)
- numa_clear_node(cpu);
-#endif
-}
-
-static int check_and_unmap_cpu_on_node(pg_data_t *pgdat)
-{
- int ret;
-
- ret = check_cpu_on_node(pgdat);
- if (ret)
- return ret;
-
- /*
- * the node will be offlined when we come here, so we can clear
- * the cpu_to_node() now.
- */
-
- unmap_cpu_on_node(pgdat);
- return 0;
-}
-
/**
* try_offline_node
* @nid: the node ID
@@ -1813,7 +1811,7 @@ void try_offline_node(int nid)
return;
}
- if (check_and_unmap_cpu_on_node(pgdat))
+ if (check_cpu_on_node(pgdat))
return;
/*
@@ -1858,7 +1856,7 @@ void __ref __remove_memory(int nid, u64 start, u64 size)
memblock_free(start, size);
memblock_remove(start, size);
- arch_remove_memory(start, size, NULL);
+ arch_remove_memory(nid, start, size, NULL);
try_offline_node(nid);
diff --git a/mm/migrate.c b/mm/migrate.c
index f7e4bfdc13b7..5d1839a9148d 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -327,16 +327,13 @@ void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
/*
* Once page cache replacement of page migration started, page_count
- * *must* be zero. And, we don't want to call wait_on_page_locked()
- * against a page without get_page().
- * So, we use get_page_unless_zero(), here. Even failed, page fault
- * will occur again.
+ * is zero; but we must not call put_and_wait_on_page_locked() without
+ * a ref. Use get_page_unless_zero(), and just fault again if it fails.
*/
if (!get_page_unless_zero(page))
goto out;
pte_unmap_unlock(ptep, ptl);
- wait_on_page_locked(page);
- put_page(page);
+ put_and_wait_on_page_locked(page);
return;
out:
pte_unmap_unlock(ptep, ptl);
@@ -370,63 +367,28 @@ void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd)
if (!get_page_unless_zero(page))
goto unlock;
spin_unlock(ptl);
- wait_on_page_locked(page);
- put_page(page);
+ put_and_wait_on_page_locked(page);
return;
unlock:
spin_unlock(ptl);
}
#endif
-#ifdef CONFIG_BLOCK
-/* Returns true if all buffers are successfully locked */
-static bool buffer_migrate_lock_buffers(struct buffer_head *head,
- enum migrate_mode mode)
+static int expected_page_refs(struct page *page)
{
- struct buffer_head *bh = head;
-
- /* Simple case, sync compaction */
- if (mode != MIGRATE_ASYNC) {
- do {
- get_bh(bh);
- lock_buffer(bh);
- bh = bh->b_this_page;
-
- } while (bh != head);
+ int expected_count = 1;
- return true;
- }
-
- /* async case, we cannot block on lock_buffer so use trylock_buffer */
- do {
- get_bh(bh);
- if (!trylock_buffer(bh)) {
- /*
- * We failed to lock the buffer and cannot stall in
- * async migration. Release the taken locks
- */
- struct buffer_head *failed_bh = bh;
- put_bh(failed_bh);
- bh = head;
- while (bh != failed_bh) {
- unlock_buffer(bh);
- put_bh(bh);
- bh = bh->b_this_page;
- }
- return false;
- }
+ /*
+ * Device public or private pages have an extra refcount as they are
+ * ZONE_DEVICE pages.
+ */
+ expected_count += is_device_private_page(page);
+ expected_count += is_device_public_page(page);
+ if (page_mapping(page))
+ expected_count += hpage_nr_pages(page) + page_has_private(page);
- bh = bh->b_this_page;
- } while (bh != head);
- return true;
-}
-#else
-static inline bool buffer_migrate_lock_buffers(struct buffer_head *head,
- enum migrate_mode mode)
-{
- return true;
+ return expected_count;
}
-#endif /* CONFIG_BLOCK */
/*
* Replace the page in the mapping.
@@ -437,21 +399,13 @@ static inline bool buffer_migrate_lock_buffers(struct buffer_head *head,
* 3 for pages with a mapping and PagePrivate/PagePrivate2 set.
*/
int migrate_page_move_mapping(struct address_space *mapping,
- struct page *newpage, struct page *page,
- struct buffer_head *head, enum migrate_mode mode,
+ struct page *newpage, struct page *page, enum migrate_mode mode,
int extra_count)
{
XA_STATE(xas, &mapping->i_pages, page_index(page));
struct zone *oldzone, *newzone;
int dirty;
- int expected_count = 1 + extra_count;
-
- /*
- * Device public or private pages have an extra refcount as they are
- * ZONE_DEVICE pages.
- */
- expected_count += is_device_private_page(page);
- expected_count += is_device_public_page(page);
+ int expected_count = expected_page_refs(page) + extra_count;
if (!mapping) {
/* Anonymous page without mapping */
@@ -471,8 +425,6 @@ int migrate_page_move_mapping(struct address_space *mapping,
newzone = page_zone(newpage);
xas_lock_irq(&xas);
-
- expected_count += hpage_nr_pages(page) + page_has_private(page);
if (page_count(page) != expected_count || xas_load(&xas) != page) {
xas_unlock_irq(&xas);
return -EAGAIN;
@@ -484,20 +436,6 @@ int migrate_page_move_mapping(struct address_space *mapping,
}
/*
- * In the async migration case of moving a page with buffers, lock the
- * buffers using trylock before the mapping is moved. If the mapping
- * was moved, we later failed to lock the buffers and could not move
- * the mapping back due to an elevated page count, we would have to
- * block waiting on other references to be dropped.
- */
- if (mode == MIGRATE_ASYNC && head &&
- !buffer_migrate_lock_buffers(head, mode)) {
- page_ref_unfreeze(page, expected_count);
- xas_unlock_irq(&xas);
- return -EAGAIN;
- }
-
- /*
* Now we know that no one else is looking at the page:
* no turning back from here.
*/
@@ -748,7 +686,7 @@ int migrate_page(struct address_space *mapping,
BUG_ON(PageWriteback(page)); /* Writeback must be complete */
- rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode, 0);
+ rc = migrate_page_move_mapping(mapping, newpage, page, mode, 0);
if (rc != MIGRATEPAGE_SUCCESS)
return rc;
@@ -762,34 +700,98 @@ int migrate_page(struct address_space *mapping,
EXPORT_SYMBOL(migrate_page);
#ifdef CONFIG_BLOCK
-/*
- * Migration function for pages with buffers. This function can only be used
- * if the underlying filesystem guarantees that no other references to "page"
- * exist.
- */
-int buffer_migrate_page(struct address_space *mapping,
- struct page *newpage, struct page *page, enum migrate_mode mode)
+/* Returns true if all buffers are successfully locked */
+static bool buffer_migrate_lock_buffers(struct buffer_head *head,
+ enum migrate_mode mode)
+{
+ struct buffer_head *bh = head;
+
+ /* Simple case, sync compaction */
+ if (mode != MIGRATE_ASYNC) {
+ do {
+ get_bh(bh);
+ lock_buffer(bh);
+ bh = bh->b_this_page;
+
+ } while (bh != head);
+
+ return true;
+ }
+
+ /* async case, we cannot block on lock_buffer so use trylock_buffer */
+ do {
+ get_bh(bh);
+ if (!trylock_buffer(bh)) {
+ /*
+ * We failed to lock the buffer and cannot stall in
+ * async migration. Release the taken locks
+ */
+ struct buffer_head *failed_bh = bh;
+ put_bh(failed_bh);
+ bh = head;
+ while (bh != failed_bh) {
+ unlock_buffer(bh);
+ put_bh(bh);
+ bh = bh->b_this_page;
+ }
+ return false;
+ }
+
+ bh = bh->b_this_page;
+ } while (bh != head);
+ return true;
+}
+
+static int __buffer_migrate_page(struct address_space *mapping,
+ struct page *newpage, struct page *page, enum migrate_mode mode,
+ bool check_refs)
{
struct buffer_head *bh, *head;
int rc;
+ int expected_count;
if (!page_has_buffers(page))
return migrate_page(mapping, newpage, page, mode);
+ /* Check whether page does not have extra refs before we do more work */
+ expected_count = expected_page_refs(page);
+ if (page_count(page) != expected_count)
+ return -EAGAIN;
+
head = page_buffers(page);
+ if (!buffer_migrate_lock_buffers(head, mode))
+ return -EAGAIN;
- rc = migrate_page_move_mapping(mapping, newpage, page, head, mode, 0);
+ if (check_refs) {
+ bool busy;
+ bool invalidated = false;
- if (rc != MIGRATEPAGE_SUCCESS)
- return rc;
+recheck_buffers:
+ busy = false;
+ spin_lock(&mapping->private_lock);
+ bh = head;
+ do {
+ if (atomic_read(&bh->b_count)) {
+ busy = true;
+ break;
+ }
+ bh = bh->b_this_page;
+ } while (bh != head);
+ spin_unlock(&mapping->private_lock);
+ if (busy) {
+ if (invalidated) {
+ rc = -EAGAIN;
+ goto unlock_buffers;
+ }
+ invalidate_bh_lrus();
+ invalidated = true;
+ goto recheck_buffers;
+ }
+ }
- /*
- * In the async case, migrate_page_move_mapping locked the buffers
- * with an IRQ-safe spinlock held. In the sync case, the buffers
- * need to be locked now
- */
- if (mode != MIGRATE_ASYNC)
- BUG_ON(!buffer_migrate_lock_buffers(head, mode));
+ rc = migrate_page_move_mapping(mapping, newpage, page, mode, 0);
+ if (rc != MIGRATEPAGE_SUCCESS)
+ goto unlock_buffers;
ClearPagePrivate(page);
set_page_private(newpage, page_private(page));
@@ -811,6 +813,8 @@ int buffer_migrate_page(struct address_space *mapping,
else
migrate_page_states(newpage, page);
+ rc = MIGRATEPAGE_SUCCESS;
+unlock_buffers:
bh = head;
do {
unlock_buffer(bh);
@@ -819,9 +823,32 @@ int buffer_migrate_page(struct address_space *mapping,
} while (bh != head);
- return MIGRATEPAGE_SUCCESS;
+ return rc;
+}
+
+/*
+ * Migration function for pages with buffers. This function can only be used
+ * if the underlying filesystem guarantees that no other references to "page"
+ * exist. For example attached buffer heads are accessed only under page lock.
+ */
+int buffer_migrate_page(struct address_space *mapping,
+ struct page *newpage, struct page *page, enum migrate_mode mode)
+{
+ return __buffer_migrate_page(mapping, newpage, page, mode, false);
}
EXPORT_SYMBOL(buffer_migrate_page);
+
+/*
+ * Same as above except that this variant is more careful and checks that there
+ * are also no buffer head references. This function is the right one for
+ * mappings where buffer heads are directly looked up and referenced (such as
+ * block device mappings).
+ */
+int buffer_migrate_page_norefs(struct address_space *mapping,
+ struct page *newpage, struct page *page, enum migrate_mode mode)
+{
+ return __buffer_migrate_page(mapping, newpage, page, mode, true);
+}
#endif
/*
@@ -1297,8 +1324,19 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
goto put_anon;
if (page_mapped(hpage)) {
+ struct address_space *mapping = page_mapping(hpage);
+
+ /*
+ * try_to_unmap could potentially call huge_pmd_unshare.
+ * Because of this, take semaphore in write mode here and
+ * set TTU_RMAP_LOCKED to let lower levels know we have
+ * taken the lock.
+ */
+ i_mmap_lock_write(mapping);
try_to_unmap(hpage,
- TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
+ TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS|
+ TTU_RMAP_LOCKED);
+ i_mmap_unlock_write(mapping);
page_was_mapped = 1;
}
@@ -2303,6 +2341,7 @@ next:
*/
static void migrate_vma_collect(struct migrate_vma *migrate)
{
+ struct mmu_notifier_range range;
struct mm_walk mm_walk;
mm_walk.pmd_entry = migrate_vma_collect_pmd;
@@ -2314,13 +2353,11 @@ static void migrate_vma_collect(struct migrate_vma *migrate)
mm_walk.mm = migrate->vma->vm_mm;
mm_walk.private = migrate;
- mmu_notifier_invalidate_range_start(mm_walk.mm,
- migrate->start,
- migrate->end);
+ mmu_notifier_range_init(&range, mm_walk.mm, migrate->start,
+ migrate->end);
+ mmu_notifier_invalidate_range_start(&range);
walk_page_range(migrate->start, migrate->end, &mm_walk);
- mmu_notifier_invalidate_range_end(mm_walk.mm,
- migrate->start,
- migrate->end);
+ mmu_notifier_invalidate_range_end(&range);
migrate->end = migrate->start + (migrate->npages << PAGE_SHIFT);
}
@@ -2701,9 +2738,8 @@ static void migrate_vma_pages(struct migrate_vma *migrate)
{
const unsigned long npages = migrate->npages;
const unsigned long start = migrate->start;
- struct vm_area_struct *vma = migrate->vma;
- struct mm_struct *mm = vma->vm_mm;
- unsigned long addr, i, mmu_start;
+ struct mmu_notifier_range range;
+ unsigned long addr, i;
bool notified = false;
for (i = 0, addr = start; i < npages; addr += PAGE_SIZE, i++) {
@@ -2722,11 +2758,12 @@ static void migrate_vma_pages(struct migrate_vma *migrate)
continue;
}
if (!notified) {
- mmu_start = addr;
notified = true;
- mmu_notifier_invalidate_range_start(mm,
- mmu_start,
- migrate->end);
+
+ mmu_notifier_range_init(&range,
+ migrate->vma->vm_mm,
+ addr, migrate->end);
+ mmu_notifier_invalidate_range_start(&range);
}
migrate_vma_insert_page(migrate, addr, newpage,
&migrate->src[i],
@@ -2767,8 +2804,7 @@ static void migrate_vma_pages(struct migrate_vma *migrate)
* did already call it.
*/
if (notified)
- mmu_notifier_invalidate_range_only_end(mm, mmu_start,
- migrate->end);
+ mmu_notifier_invalidate_range_only_end(&range);
}
/*
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 6838a530789b..33917105a3a2 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -146,7 +146,7 @@ static void __meminit mm_compute_batch(void)
s32 batch = max_t(s32, nr*2, 32);
/* batch size set to 0.4% of (total memory/#cpus), or max int32 */
- memsized_batch = min_t(u64, (totalram_pages/nr)/256, 0x7fffffff);
+ memsized_batch = min_t(u64, (totalram_pages()/nr)/256, 0x7fffffff);
vm_committed_as_batch = max_t(s32, memsized_batch, batch);
}
diff --git a/mm/mmap.c b/mm/mmap.c
index 7bb64381e77c..f901065c4c64 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2973,16 +2973,6 @@ out:
return ret;
}
-static inline void verify_mm_writelocked(struct mm_struct *mm)
-{
-#ifdef CONFIG_DEBUG_VM
- if (unlikely(down_read_trylock(&mm->mmap_sem))) {
- WARN_ON(1);
- up_read(&mm->mmap_sem);
- }
-#endif
-}
-
/*
* this is really a simplified "do_mmap". it only handles
* anonymous maps. eventually we may be able to do some
@@ -3010,12 +3000,6 @@ static int do_brk_flags(unsigned long addr, unsigned long len, unsigned long fla
return error;
/*
- * mm->mmap_sem is required to protect against another thread
- * changing the mappings in case we sleep.
- */
- verify_mm_writelocked(mm);
-
- /*
* Clear old maps. this also does some error checking for us
*/
while (find_vma_links(mm, addr, addr + len, &prev, &rb_link,
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 5119ff846769..9c884abc7850 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -35,13 +35,6 @@ void mmu_notifier_call_srcu(struct rcu_head *rcu,
}
EXPORT_SYMBOL_GPL(mmu_notifier_call_srcu);
-void mmu_notifier_synchronize(void)
-{
- /* Wait for any running method to finish. */
- srcu_barrier(&srcu);
-}
-EXPORT_SYMBOL_GPL(mmu_notifier_synchronize);
-
/*
* This function can't run concurrently against mmu_notifier_register
* because mm->mm_users > 0 during mmu_notifier_register and exit_mmap
@@ -174,22 +167,20 @@ void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address,
srcu_read_unlock(&srcu, id);
}
-int __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
- unsigned long start, unsigned long end,
- bool blockable)
+int __mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range)
{
struct mmu_notifier *mn;
int ret = 0;
int id;
id = srcu_read_lock(&srcu);
- hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
+ hlist_for_each_entry_rcu(mn, &range->mm->mmu_notifier_mm->list, hlist) {
if (mn->ops->invalidate_range_start) {
- int _ret = mn->ops->invalidate_range_start(mn, mm, start, end, blockable);
+ int _ret = mn->ops->invalidate_range_start(mn, range);
if (_ret) {
pr_info("%pS callback failed with %d in %sblockable context.\n",
- mn->ops->invalidate_range_start, _ret,
- !blockable ? "non-" : "");
+ mn->ops->invalidate_range_start, _ret,
+ !range->blockable ? "non-" : "");
ret = _ret;
}
}
@@ -200,16 +191,14 @@ int __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
}
EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range_start);
-void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
- unsigned long start,
- unsigned long end,
+void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range,
bool only_end)
{
struct mmu_notifier *mn;
int id;
id = srcu_read_lock(&srcu);
- hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
+ hlist_for_each_entry_rcu(mn, &range->mm->mmu_notifier_mm->list, hlist) {
/*
* Call invalidate_range here too to avoid the need for the
* subsystem of having to register an invalidate_range_end
@@ -224,9 +213,11 @@ void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
* already happen under page table lock.
*/
if (!only_end && mn->ops->invalidate_range)
- mn->ops->invalidate_range(mn, mm, start, end);
+ mn->ops->invalidate_range(mn, range->mm,
+ range->start,
+ range->end);
if (mn->ops->invalidate_range_end)
- mn->ops->invalidate_range_end(mn, mm, start, end);
+ mn->ops->invalidate_range_end(mn, range);
}
srcu_read_unlock(&srcu, id);
}
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 6d331620b9e5..36cb358db170 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -167,11 +167,12 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
pgprot_t newprot, int dirty_accountable, int prot_numa)
{
pmd_t *pmd;
- struct mm_struct *mm = vma->vm_mm;
unsigned long next;
unsigned long pages = 0;
unsigned long nr_huge_updates = 0;
- unsigned long mni_start = 0;
+ struct mmu_notifier_range range;
+
+ range.start = 0;
pmd = pmd_offset(pud, addr);
do {
@@ -183,9 +184,9 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
goto next;
/* invoke the mmu notifier if the pmd is populated */
- if (!mni_start) {
- mni_start = addr;
- mmu_notifier_invalidate_range_start(mm, mni_start, end);
+ if (!range.start) {
+ mmu_notifier_range_init(&range, vma->vm_mm, addr, end);
+ mmu_notifier_invalidate_range_start(&range);
}
if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
@@ -214,8 +215,8 @@ next:
cond_resched();
} while (pmd++, addr = next, addr != end);
- if (mni_start)
- mmu_notifier_invalidate_range_end(mm, mni_start, end);
+ if (range.start)
+ mmu_notifier_invalidate_range_end(&range);
if (nr_huge_updates)
count_vm_numa_events(NUMA_HUGE_PTE_UPDATES, nr_huge_updates);
diff --git a/mm/mremap.c b/mm/mremap.c
index 7f9f9180e401..def01d86e36f 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -197,16 +197,14 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
bool need_rmap_locks)
{
unsigned long extent, next, old_end;
+ struct mmu_notifier_range range;
pmd_t *old_pmd, *new_pmd;
- unsigned long mmun_start; /* For mmu_notifiers */
- unsigned long mmun_end; /* For mmu_notifiers */
old_end = old_addr + len;
flush_cache_range(vma, old_addr, old_end);
- mmun_start = old_addr;
- mmun_end = old_end;
- mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end);
+ mmu_notifier_range_init(&range, vma->vm_mm, old_addr, old_end);
+ mmu_notifier_invalidate_range_start(&range);
for (; old_addr < old_end; old_addr += extent, new_addr += extent) {
cond_resched();
@@ -247,7 +245,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
new_pmd, new_addr, need_rmap_locks);
}
- mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end);
+ mmu_notifier_invalidate_range_end(&range);
return len + old_addr - old_end; /* how much done */
}
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 6589f60d5018..f0e8cd9edb1a 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -245,11 +245,11 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
return points > 0 ? points : 1;
}
-enum oom_constraint {
- CONSTRAINT_NONE,
- CONSTRAINT_CPUSET,
- CONSTRAINT_MEMORY_POLICY,
- CONSTRAINT_MEMCG,
+static const char * const oom_constraint_text[] = {
+ [CONSTRAINT_NONE] = "CONSTRAINT_NONE",
+ [CONSTRAINT_CPUSET] = "CONSTRAINT_CPUSET",
+ [CONSTRAINT_MEMORY_POLICY] = "CONSTRAINT_MEMORY_POLICY",
+ [CONSTRAINT_MEMCG] = "CONSTRAINT_MEMCG",
};
/*
@@ -269,7 +269,7 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc)
}
/* Default to all available memory */
- oc->totalpages = totalram_pages + total_swap_pages;
+ oc->totalpages = totalram_pages() + total_swap_pages;
if (!IS_ENABLED(CONFIG_NUMA))
return CONSTRAINT_NONE;
@@ -428,19 +428,29 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
rcu_read_unlock();
}
+static void dump_oom_summary(struct oom_control *oc, struct task_struct *victim)
+{
+ /* one line summary of the oom killer context. */
+ pr_info("oom-kill:constraint=%s,nodemask=%*pbl",
+ oom_constraint_text[oc->constraint],
+ nodemask_pr_args(oc->nodemask));
+ cpuset_print_current_mems_allowed();
+ mem_cgroup_print_oom_context(oc->memcg, victim);
+ pr_cont(",task=%s,pid=%d,uid=%d\n", victim->comm, victim->pid,
+ from_kuid(&init_user_ns, task_uid(victim)));
+}
+
static void dump_header(struct oom_control *oc, struct task_struct *p)
{
- pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), nodemask=%*pbl, order=%d, oom_score_adj=%hd\n",
- current->comm, oc->gfp_mask, &oc->gfp_mask,
- nodemask_pr_args(oc->nodemask), oc->order,
+ pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, oom_score_adj=%hd\n",
+ current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order,
current->signal->oom_score_adj);
if (!IS_ENABLED(CONFIG_COMPACTION) && oc->order)
pr_warn("COMPACTION is disabled!!!\n");
- cpuset_print_current_mems_allowed();
dump_stack();
if (is_memcg_oom(oc))
- mem_cgroup_print_oom_info(oc->memcg, p);
+ mem_cgroup_print_oom_meminfo(oc->memcg);
else {
show_mem(SHOW_MEM_FILTER_NODES, oc->nodemask);
if (is_dump_unreclaim_slabs())
@@ -448,6 +458,8 @@ static void dump_header(struct oom_control *oc, struct task_struct *p)
}
if (sysctl_oom_dump_tasks)
dump_tasks(oc->memcg, oc->nodemask);
+ if (p)
+ dump_oom_summary(oc, p);
}
/*
@@ -516,19 +528,20 @@ bool __oom_reap_task_mm(struct mm_struct *mm)
* count elevated without a good reason.
*/
if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED)) {
- const unsigned long start = vma->vm_start;
- const unsigned long end = vma->vm_end;
+ struct mmu_notifier_range range;
struct mmu_gather tlb;
- tlb_gather_mmu(&tlb, mm, start, end);
- if (mmu_notifier_invalidate_range_start_nonblock(mm, start, end)) {
- tlb_finish_mmu(&tlb, start, end);
+ mmu_notifier_range_init(&range, mm, vma->vm_start,
+ vma->vm_end);
+ tlb_gather_mmu(&tlb, mm, range.start, range.end);
+ if (mmu_notifier_invalidate_range_start_nonblock(&range)) {
+ tlb_finish_mmu(&tlb, range.start, range.end);
ret = false;
continue;
}
- unmap_page_range(&tlb, vma, start, end, NULL);
- mmu_notifier_invalidate_range_end(mm, start, end);
- tlb_finish_mmu(&tlb, start, end);
+ unmap_page_range(&tlb, vma, range.start, range.end, NULL);
+ mmu_notifier_invalidate_range_end(&range);
+ tlb_finish_mmu(&tlb, range.start, range.end);
}
}
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 3f690bae6b78..7d1010453fb9 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2154,6 +2154,7 @@ int write_cache_pages(struct address_space *mapping,
{
int ret = 0;
int done = 0;
+ int error;
struct pagevec pvec;
int nr_pages;
pgoff_t uninitialized_var(writeback_index);
@@ -2227,25 +2228,31 @@ continue_unlock:
goto continue_unlock;
trace_wbc_writepage(wbc, inode_to_bdi(mapping->host));
- ret = (*writepage)(page, wbc, data);
- if (unlikely(ret)) {
- if (ret == AOP_WRITEPAGE_ACTIVATE) {
+ error = (*writepage)(page, wbc, data);
+ if (unlikely(error)) {
+ /*
+ * Handle errors according to the type of
+ * writeback. There's no need to continue for
+ * background writeback. Just push done_index
+ * past this page so media errors won't choke
+ * writeout for the entire file. For integrity
+ * writeback, we must process the entire dirty
+ * set regardless of errors because the fs may
+ * still have state to clear for each page. In
+ * that case we continue processing and return
+ * the first error.
+ */
+ if (error == AOP_WRITEPAGE_ACTIVATE) {
unlock_page(page);
- ret = 0;
- } else {
- /*
- * done_index is set past this page,
- * so media errors will not choke
- * background writeout for the entire
- * file. This has consequences for
- * range_cyclic semantics (ie. it may
- * not be suitable for data integrity
- * writeout).
- */
+ error = 0;
+ } else if (wbc->sync_mode != WB_SYNC_ALL) {
+ ret = error;
done_index = page->index + 1;
done = 1;
break;
}
+ if (!ret)
+ ret = error;
}
/*
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index e95b5b7c9c3d..cde5dac6229a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -16,6 +16,7 @@
#include <linux/stddef.h>
#include <linux/mm.h>
+#include <linux/highmem.h>
#include <linux/swap.h>
#include <linux/interrupt.h>
#include <linux/pagemap.h>
@@ -96,8 +97,12 @@ int _node_numa_mem_[MAX_NUMNODES];
#endif
/* work_structs for global per-cpu drains */
+struct pcpu_drain {
+ struct zone *zone;
+ struct work_struct work;
+};
DEFINE_MUTEX(pcpu_drain_mutex);
-DEFINE_PER_CPU(struct work_struct, pcpu_drain);
+DEFINE_PER_CPU(struct pcpu_drain, pcpu_drain);
#ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY
volatile unsigned long latent_entropy __latent_entropy;
@@ -121,10 +126,8 @@ nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
};
EXPORT_SYMBOL(node_states);
-/* Protect totalram_pages and zone->managed_pages */
-static DEFINE_SPINLOCK(managed_page_count_lock);
-
-unsigned long totalram_pages __read_mostly;
+atomic_long_t _totalram_pages __read_mostly;
+EXPORT_SYMBOL(_totalram_pages);
unsigned long totalreserve_pages __read_mostly;
unsigned long totalcma_pages __read_mostly;
@@ -237,7 +240,7 @@ static char * const zone_names[MAX_NR_ZONES] = {
#endif
};
-char * const migratetype_names[MIGRATE_TYPES] = {
+const char * const migratetype_names[MIGRATE_TYPES] = {
"Unmovable",
"Movable",
"Reclaimable",
@@ -263,20 +266,21 @@ compound_page_dtor * const compound_page_dtors[] = {
int min_free_kbytes = 1024;
int user_min_free_kbytes = -1;
+int watermark_boost_factor __read_mostly = 15000;
int watermark_scale_factor = 10;
-static unsigned long nr_kernel_pages __meminitdata;
-static unsigned long nr_all_pages __meminitdata;
-static unsigned long dma_reserve __meminitdata;
+static unsigned long nr_kernel_pages __initdata;
+static unsigned long nr_all_pages __initdata;
+static unsigned long dma_reserve __initdata;
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
-static unsigned long arch_zone_lowest_possible_pfn[MAX_NR_ZONES] __meminitdata;
-static unsigned long arch_zone_highest_possible_pfn[MAX_NR_ZONES] __meminitdata;
+static unsigned long arch_zone_lowest_possible_pfn[MAX_NR_ZONES] __initdata;
+static unsigned long arch_zone_highest_possible_pfn[MAX_NR_ZONES] __initdata;
static unsigned long required_kernelcore __initdata;
static unsigned long required_kernelcore_percent __initdata;
static unsigned long required_movablecore __initdata;
static unsigned long required_movablecore_percent __initdata;
-static unsigned long zone_movable_pfn[MAX_NUMNODES] __meminitdata;
+static unsigned long zone_movable_pfn[MAX_NUMNODES] __initdata;
static bool mirrored_kernelcore __meminitdata;
/* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
@@ -294,6 +298,32 @@ EXPORT_SYMBOL(nr_online_nodes);
int page_group_by_mobility_disabled __read_mostly;
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
+/*
+ * During boot we initialize deferred pages on-demand, as needed, but once
+ * page_alloc_init_late() has finished, the deferred pages are all initialized,
+ * and we can permanently disable that path.
+ */
+static DEFINE_STATIC_KEY_TRUE(deferred_pages);
+
+/*
+ * Calling kasan_free_pages() only after deferred memory initialization
+ * has completed. Poisoning pages during deferred memory init will greatly
+ * lengthen the process and cause problem in large memory systems as the
+ * deferred pages initialization is done with interrupt disabled.
+ *
+ * Assuming that there will be no reference to those newly initialized
+ * pages before they are ever allocated, this should have no effect on
+ * KASAN memory tracking as the poison will be properly inserted at page
+ * allocation time. The only corner case is when pages are allocated by
+ * on-demand allocation and then freed again before the deferred pages
+ * initialization is done, but this is not likely to happen.
+ */
+static inline void kasan_free_nondeferred_pages(struct page *page, int order)
+{
+ if (!static_branch_unlikely(&deferred_pages))
+ kasan_free_pages(page, order);
+}
+
/* Returns true if the struct page for the pfn is uninitialised */
static inline bool __meminit early_page_uninitialised(unsigned long pfn)
{
@@ -326,8 +356,13 @@ defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
/* Always populate low zones for address-constrained allocations */
if (end_pfn < pgdat_end_pfn(NODE_DATA(nid)))
return false;
+
+ /*
+ * We start only with one section of pages, more pages are added as
+ * needed until the rest of deferred pages are initialized.
+ */
nr_initialised++;
- if ((nr_initialised > NODE_DATA(nid)->static_init_pgcnt) &&
+ if ((nr_initialised > PAGES_PER_SECTION) &&
(pfn & (PAGES_PER_SECTION - 1)) == 0) {
NODE_DATA(nid)->first_deferred_pfn = pfn;
return true;
@@ -335,6 +370,8 @@ defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
return false;
}
#else
+#define kasan_free_nondeferred_pages(p, o) kasan_free_pages(p, o)
+
static inline bool early_page_uninitialised(unsigned long pfn)
{
return false;
@@ -426,6 +463,7 @@ void set_pfnblock_flags_mask(struct page *page, unsigned long flags,
unsigned long old_word, word;
BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4);
+ BUILD_BUG_ON(MIGRATE_TYPES > (1 << PB_migratetype_bits));
bitmap = get_pageblock_bitmap(page, pfn);
bitidx = pfn_to_bitidx(page, pfn);
@@ -1037,7 +1075,7 @@ static __always_inline bool free_pages_prepare(struct page *page,
arch_free_page(page, order);
kernel_poison_pages(page, 1 << order, 0);
kernel_map_pages(page, 1 << order, 0);
- kasan_free_pages(page, order);
+ kasan_free_nondeferred_pages(page, order);
return true;
}
@@ -1183,6 +1221,7 @@ static void __meminit __init_single_page(struct page *page, unsigned long pfn,
init_page_count(page);
page_mapcount_reset(page);
page_cpupid_reset_last(page);
+ page_kasan_tag_reset(page);
INIT_LIST_HEAD(&page->lru);
#ifdef WANT_PAGE_VIRTUAL
@@ -1279,7 +1318,7 @@ static void __init __free_pages_boot_core(struct page *page, unsigned int order)
__ClearPageReserved(p);
set_page_count(p, 0);
- page_zone(page)->managed_pages += nr_pages;
+ atomic_long_add(nr_pages, &page_zone(page)->managed_pages);
set_page_refcounted(page);
__free_pages(page, order);
}
@@ -1606,13 +1645,6 @@ static int __init deferred_init_memmap(void *data)
}
/*
- * During boot we initialize deferred pages on-demand, as needed, but once
- * page_alloc_init_late() has finished, the deferred pages are all initialized,
- * and we can permanently disable that path.
- */
-static DEFINE_STATIC_KEY_TRUE(deferred_pages);
-
-/*
* If this zone has deferred pages, try to grow it by initializing enough
* deferred pages to satisfy the allocation specified by order, rounded up to
* the nearest PAGES_PER_SECTION boundary. So we're adding memory in increments
@@ -1981,8 +2013,8 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
*/
static int fallbacks[MIGRATE_TYPES][4] = {
[MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_TYPES },
- [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_TYPES },
[MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_TYPES },
+ [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_TYPES },
#ifdef CONFIG_CMA
[MIGRATE_CMA] = { MIGRATE_TYPES }, /* Never used */
#endif
@@ -2129,6 +2161,21 @@ static bool can_steal_fallback(unsigned int order, int start_mt)
return false;
}
+static inline void boost_watermark(struct zone *zone)
+{
+ unsigned long max_boost;
+
+ if (!watermark_boost_factor)
+ return;
+
+ max_boost = mult_frac(zone->_watermark[WMARK_HIGH],
+ watermark_boost_factor, 10000);
+ max_boost = max(pageblock_nr_pages, max_boost);
+
+ zone->watermark_boost = min(zone->watermark_boost + pageblock_nr_pages,
+ max_boost);
+}
+
/*
* This function implements actual steal behaviour. If order is large enough,
* we can steal whole pageblock. If not, we first move freepages in this
@@ -2138,7 +2185,7 @@ static bool can_steal_fallback(unsigned int order, int start_mt)
* itself, so pages freed in the future will be put on the correct free list.
*/
static void steal_suitable_fallback(struct zone *zone, struct page *page,
- int start_type, bool whole_block)
+ unsigned int alloc_flags, int start_type, bool whole_block)
{
unsigned int current_order = page_order(page);
struct free_area *area;
@@ -2160,6 +2207,15 @@ static void steal_suitable_fallback(struct zone *zone, struct page *page,
goto single_page;
}
+ /*
+ * Boost watermarks to increase reclaim pressure to reduce the
+ * likelihood of future fallbacks. Wake kswapd now as the node
+ * may be balanced overall and kswapd will not wake naturally.
+ */
+ boost_watermark(zone);
+ if (alloc_flags & ALLOC_KSWAPD)
+ wakeup_kswapd(zone, 0, 0, zone_idx(zone));
+
/* We are not allowed to try stealing from the whole block */
if (!whole_block)
goto single_page;
@@ -2258,7 +2314,7 @@ static void reserve_highatomic_pageblock(struct page *page, struct zone *zone,
* Limit the number reserved to 1 pageblock or roughly 1% of a zone.
* Check is race-prone but harmless.
*/
- max_managed = (zone->managed_pages / 100) + pageblock_nr_pages;
+ max_managed = (zone_managed_pages(zone) / 100) + pageblock_nr_pages;
if (zone->nr_reserved_highatomic >= max_managed)
return;
@@ -2375,20 +2431,30 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
* condition simpler.
*/
static __always_inline bool
-__rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
+__rmqueue_fallback(struct zone *zone, int order, int start_migratetype,
+ unsigned int alloc_flags)
{
struct free_area *area;
int current_order;
+ int min_order = order;
struct page *page;
int fallback_mt;
bool can_steal;
/*
+ * Do not steal pages from freelists belonging to other pageblocks
+ * i.e. orders < pageblock_order. If there are no local zones free,
+ * the zonelists will be reiterated without ALLOC_NOFRAGMENT.
+ */
+ if (alloc_flags & ALLOC_NOFRAGMENT)
+ min_order = pageblock_order;
+
+ /*
* Find the largest available free page in the other list. This roughly
* approximates finding the pageblock with the most free pages, which
* would be too costly to do exactly.
*/
- for (current_order = MAX_ORDER - 1; current_order >= order;
+ for (current_order = MAX_ORDER - 1; current_order >= min_order;
--current_order) {
area = &(zone->free_area[current_order]);
fallback_mt = find_suitable_fallback(area, current_order,
@@ -2433,7 +2499,8 @@ do_steal:
page = list_first_entry(&area->free_list[fallback_mt],
struct page, lru);
- steal_suitable_fallback(zone, page, start_migratetype, can_steal);
+ steal_suitable_fallback(zone, page, alloc_flags, start_migratetype,
+ can_steal);
trace_mm_page_alloc_extfrag(page, order, current_order,
start_migratetype, fallback_mt);
@@ -2447,7 +2514,8 @@ do_steal:
* Call me with the zone->lock already held.
*/
static __always_inline struct page *
-__rmqueue(struct zone *zone, unsigned int order, int migratetype)
+__rmqueue(struct zone *zone, unsigned int order, int migratetype,
+ unsigned int alloc_flags)
{
struct page *page;
@@ -2457,7 +2525,8 @@ retry:
if (migratetype == MIGRATE_MOVABLE)
page = __rmqueue_cma_fallback(zone, order);
- if (!page && __rmqueue_fallback(zone, order, migratetype))
+ if (!page && __rmqueue_fallback(zone, order, migratetype,
+ alloc_flags))
goto retry;
}
@@ -2472,13 +2541,14 @@ retry:
*/
static int rmqueue_bulk(struct zone *zone, unsigned int order,
unsigned long count, struct list_head *list,
- int migratetype)
+ int migratetype, unsigned int alloc_flags)
{
int i, alloced = 0;
spin_lock(&zone->lock);
for (i = 0; i < count; ++i) {
- struct page *page = __rmqueue(zone, order, migratetype);
+ struct page *page = __rmqueue(zone, order, migratetype,
+ alloc_flags);
if (unlikely(page == NULL))
break;
@@ -2592,6 +2662,10 @@ void drain_local_pages(struct zone *zone)
static void drain_local_pages_wq(struct work_struct *work)
{
+ struct pcpu_drain *drain;
+
+ drain = container_of(work, struct pcpu_drain, work);
+
/*
* drain_all_pages doesn't use proper cpu hotplug protection so
* we can race with cpu offline when the WQ can move this from
@@ -2600,7 +2674,7 @@ static void drain_local_pages_wq(struct work_struct *work)
* a different one.
*/
preempt_disable();
- drain_local_pages(NULL);
+ drain_local_pages(drain->zone);
preempt_enable();
}
@@ -2671,12 +2745,14 @@ void drain_all_pages(struct zone *zone)
}
for_each_cpu(cpu, &cpus_with_pcps) {
- struct work_struct *work = per_cpu_ptr(&pcpu_drain, cpu);
- INIT_WORK(work, drain_local_pages_wq);
- queue_work_on(cpu, mm_percpu_wq, work);
+ struct pcpu_drain *drain = per_cpu_ptr(&pcpu_drain, cpu);
+
+ drain->zone = zone;
+ INIT_WORK(&drain->work, drain_local_pages_wq);
+ queue_work_on(cpu, mm_percpu_wq, &drain->work);
}
for_each_cpu(cpu, &cpus_with_pcps)
- flush_work(per_cpu_ptr(&pcpu_drain, cpu));
+ flush_work(&per_cpu_ptr(&pcpu_drain, cpu)->work);
mutex_unlock(&pcpu_drain_mutex);
}
@@ -2934,6 +3010,7 @@ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z)
/* Remove page from the per-cpu list, caller must protect the list */
static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,
+ unsigned int alloc_flags,
struct per_cpu_pages *pcp,
struct list_head *list)
{
@@ -2943,7 +3020,7 @@ static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,
if (list_empty(list)) {
pcp->count += rmqueue_bulk(zone, 0,
pcp->batch, list,
- migratetype);
+ migratetype, alloc_flags);
if (unlikely(list_empty(list)))
return NULL;
}
@@ -2959,7 +3036,8 @@ static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,
/* Lock and remove page from the per-cpu list */
static struct page *rmqueue_pcplist(struct zone *preferred_zone,
struct zone *zone, unsigned int order,
- gfp_t gfp_flags, int migratetype)
+ gfp_t gfp_flags, int migratetype,
+ unsigned int alloc_flags)
{
struct per_cpu_pages *pcp;
struct list_head *list;
@@ -2969,7 +3047,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone,
local_irq_save(flags);
pcp = &this_cpu_ptr(zone->pageset)->pcp;
list = &pcp->lists[migratetype];
- page = __rmqueue_pcplist(zone, migratetype, pcp, list);
+ page = __rmqueue_pcplist(zone, migratetype, alloc_flags, pcp, list);
if (page) {
__count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
zone_statistics(preferred_zone, zone);
@@ -2992,7 +3070,7 @@ struct page *rmqueue(struct zone *preferred_zone,
if (likely(order == 0)) {
page = rmqueue_pcplist(preferred_zone, zone, order,
- gfp_flags, migratetype);
+ gfp_flags, migratetype, alloc_flags);
goto out;
}
@@ -3011,7 +3089,7 @@ struct page *rmqueue(struct zone *preferred_zone,
trace_mm_page_alloc_zone_locked(page, order, migratetype);
}
if (!page)
- page = __rmqueue(zone, order, migratetype);
+ page = __rmqueue(zone, order, migratetype, alloc_flags);
} while (page && check_new_pages(page, order));
spin_unlock(&zone->lock);
if (!page)
@@ -3053,7 +3131,7 @@ static int __init setup_fail_page_alloc(char *str)
}
__setup("fail_page_alloc=", setup_fail_page_alloc);
-static bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
+static bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
{
if (order < fail_page_alloc.min_order)
return false;
@@ -3103,13 +3181,19 @@ late_initcall(fail_page_alloc_debugfs);
#else /* CONFIG_FAIL_PAGE_ALLOC */
-static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
+static inline bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
{
return false;
}
#endif /* CONFIG_FAIL_PAGE_ALLOC */
+static noinline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
+{
+ return __should_fail_alloc_page(gfp_mask, order);
+}
+ALLOW_ERROR_INJECTION(should_fail_alloc_page, TRUE);
+
/*
* Return true if free base pages are above 'mark'. For high-order checks it
* will return true of the order-0 watermark is reached and there is at least
@@ -3254,6 +3338,40 @@ static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
#endif /* CONFIG_NUMA */
/*
+ * The restriction on ZONE_DMA32 as being a suitable zone to use to avoid
+ * fragmentation is subtle. If the preferred zone was HIGHMEM then
+ * premature use of a lower zone may cause lowmem pressure problems that
+ * are worse than fragmentation. If the next zone is ZONE_DMA then it is
+ * probably too small. It only makes sense to spread allocations to avoid
+ * fragmentation between the Normal and DMA32 zones.
+ */
+static inline unsigned int
+alloc_flags_nofragment(struct zone *zone, gfp_t gfp_mask)
+{
+ unsigned int alloc_flags = 0;
+
+ if (gfp_mask & __GFP_KSWAPD_RECLAIM)
+ alloc_flags |= ALLOC_KSWAPD;
+
+#ifdef CONFIG_ZONE_DMA32
+ if (zone_idx(zone) != ZONE_NORMAL)
+ goto out;
+
+ /*
+ * If ZONE_DMA32 exists, assume it is the one after ZONE_NORMAL and
+ * the pointer is within zone->zone_pgdat->node_zones[]. Also assume
+ * on UMA that if Normal is populated then so is DMA32.
+ */
+ BUILD_BUG_ON(ZONE_NORMAL - ZONE_DMA32 != 1);
+ if (nr_online_nodes > 1 && !populated_zone(--zone))
+ goto out;
+
+out:
+#endif /* CONFIG_ZONE_DMA32 */
+ return alloc_flags;
+}
+
+/*
* get_page_from_freelist goes through the zonelist trying to allocate
* a page.
*/
@@ -3261,14 +3379,18 @@ static struct page *
get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
const struct alloc_context *ac)
{
- struct zoneref *z = ac->preferred_zoneref;
+ struct zoneref *z;
struct zone *zone;
struct pglist_data *last_pgdat_dirty_limit = NULL;
+ bool no_fallback;
+retry:
/*
* Scan zonelist, looking for a zone with enough free.
* See also __cpuset_node_allowed() comment in kernel/cpuset.c.
*/
+ no_fallback = alloc_flags & ALLOC_NOFRAGMENT;
+ z = ac->preferred_zoneref;
for_next_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
ac->nodemask) {
struct page *page;
@@ -3307,7 +3429,23 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
}
}
- mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
+ if (no_fallback && nr_online_nodes > 1 &&
+ zone != ac->preferred_zoneref->zone) {
+ int local_nid;
+
+ /*
+ * If moving to a remote node, retry but allow
+ * fragmenting fallbacks. Locality is more important
+ * than fragmentation avoidance.
+ */
+ local_nid = zone_to_nid(ac->preferred_zoneref->zone);
+ if (zone_to_nid(zone) != local_nid) {
+ alloc_flags &= ~ALLOC_NOFRAGMENT;
+ goto retry;
+ }
+ }
+
+ mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK);
if (!zone_watermark_fast(zone, order, mark,
ac_classzone_idx(ac), alloc_flags)) {
int ret;
@@ -3374,6 +3512,15 @@ try_this_zone:
}
}
+ /*
+ * It's possible on a UMA machine to get through all zones that are
+ * fragmented. If avoiding fragmentation, reset and try again.
+ */
+ if (no_fallback) {
+ alloc_flags &= ~ALLOC_NOFRAGMENT;
+ goto retry;
+ }
+
return NULL;
}
@@ -3413,13 +3560,13 @@ void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...)
va_start(args, fmt);
vaf.fmt = fmt;
vaf.va = &args;
- pr_warn("%s: %pV, mode:%#x(%pGg), nodemask=%*pbl\n",
+ pr_warn("%s: %pV, mode:%#x(%pGg), nodemask=%*pbl",
current->comm, &vaf, gfp_mask, &gfp_mask,
nodemask_pr_args(nodemask));
va_end(args);
cpuset_print_current_mems_allowed();
-
+ pr_cont("\n");
dump_stack();
warn_alloc_show_mem(gfp_mask, nodemask);
}
@@ -3861,6 +4008,9 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
} else if (unlikely(rt_task(current)) && !in_interrupt())
alloc_flags |= ALLOC_HARDER;
+ if (gfp_mask & __GFP_KSWAPD_RECLAIM)
+ alloc_flags |= ALLOC_KSWAPD;
+
#ifdef CONFIG_CMA
if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
alloc_flags |= ALLOC_CMA;
@@ -4092,7 +4242,7 @@ retry_cpuset:
if (!ac->preferred_zoneref->zone)
goto nopage;
- if (gfp_mask & __GFP_KSWAPD_RECLAIM)
+ if (alloc_flags & ALLOC_KSWAPD)
wake_all_kswapds(order, gfp_mask, ac);
/*
@@ -4150,7 +4300,7 @@ retry_cpuset:
retry:
/* Ensure kswapd doesn't accidentally go to sleep as long as we loop */
- if (gfp_mask & __GFP_KSWAPD_RECLAIM)
+ if (alloc_flags & ALLOC_KSWAPD)
wake_all_kswapds(order, gfp_mask, ac);
reserve_flags = __gfp_pfmemalloc_flags(gfp_mask);
@@ -4369,6 +4519,12 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid,
finalise_ac(gfp_mask, &ac);
+ /*
+ * Forbid the first pass from falling back to types that fragment
+ * memory until all local zones are considered.
+ */
+ alloc_flags |= alloc_flags_nofragment(ac.preferred_zoneref->zone, gfp_mask);
+
/* First allocation attempt */
page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac);
if (likely(page))
@@ -4427,16 +4583,19 @@ unsigned long get_zeroed_page(gfp_t gfp_mask)
}
EXPORT_SYMBOL(get_zeroed_page);
-void __free_pages(struct page *page, unsigned int order)
+static inline void free_the_page(struct page *page, unsigned int order)
{
- if (put_page_testzero(page)) {
- if (order == 0)
- free_unref_page(page);
- else
- __free_pages_ok(page, order);
- }
+ if (order == 0) /* Via pcp? */
+ free_unref_page(page);
+ else
+ __free_pages_ok(page, order);
}
+void __free_pages(struct page *page, unsigned int order)
+{
+ if (put_page_testzero(page))
+ free_the_page(page, order);
+}
EXPORT_SYMBOL(__free_pages);
void free_pages(unsigned long addr, unsigned int order)
@@ -4485,14 +4644,8 @@ void __page_frag_cache_drain(struct page *page, unsigned int count)
{
VM_BUG_ON_PAGE(page_ref_count(page) == 0, page);
- if (page_ref_sub_and_test(page, count)) {
- unsigned int order = compound_order(page);
-
- if (order == 0)
- free_unref_page(page);
- else
- __free_pages_ok(page, order);
- }
+ if (page_ref_sub_and_test(page, count))
+ free_the_page(page, compound_order(page));
}
EXPORT_SYMBOL(__page_frag_cache_drain);
@@ -4558,7 +4711,7 @@ void page_frag_free(void *addr)
struct page *page = virt_to_head_page(addr);
if (unlikely(put_page_testzero(page)))
- __free_pages_ok(page, compound_order(page));
+ free_the_page(page, compound_order(page));
}
EXPORT_SYMBOL(page_frag_free);
@@ -4660,7 +4813,7 @@ static unsigned long nr_free_zone_pages(int offset)
struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL);
for_each_zone_zonelist(zone, z, zonelist, offset) {
- unsigned long size = zone->managed_pages;
+ unsigned long size = zone_managed_pages(zone);
unsigned long high = high_wmark_pages(zone);
if (size > high)
sum += size - high;
@@ -4712,7 +4865,7 @@ long si_mem_available(void)
pages[lru] = global_node_page_state(NR_LRU_BASE + lru);
for_each_zone(zone)
- wmark_low += zone->watermark[WMARK_LOW];
+ wmark_low += low_wmark_pages(zone);
/*
* Estimate the amount of memory available for userspace allocations,
@@ -4746,11 +4899,11 @@ EXPORT_SYMBOL_GPL(si_mem_available);
void si_meminfo(struct sysinfo *val)
{
- val->totalram = totalram_pages;
+ val->totalram = totalram_pages();
val->sharedram = global_node_page_state(NR_SHMEM);
val->freeram = global_zone_page_state(NR_FREE_PAGES);
val->bufferram = nr_blockdev_pages();
- val->totalhigh = totalhigh_pages;
+ val->totalhigh = totalhigh_pages();
val->freehigh = nr_free_highpages();
val->mem_unit = PAGE_SIZE;
}
@@ -4767,7 +4920,7 @@ void si_meminfo_node(struct sysinfo *val, int nid)
pg_data_t *pgdat = NODE_DATA(nid);
for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++)
- managed_pages += pgdat->node_zones[zone_type].managed_pages;
+ managed_pages += zone_managed_pages(&pgdat->node_zones[zone_type]);
val->totalram = managed_pages;
val->sharedram = node_page_state(pgdat, NR_SHMEM);
val->freeram = sum_zone_node_page_state(nid, NR_FREE_PAGES);
@@ -4776,7 +4929,7 @@ void si_meminfo_node(struct sysinfo *val, int nid)
struct zone *zone = &pgdat->node_zones[zone_type];
if (is_highmem(zone)) {
- managed_highpages += zone->managed_pages;
+ managed_highpages += zone_managed_pages(zone);
free_highpages += zone_page_state(zone, NR_FREE_PAGES);
}
}
@@ -4983,7 +5136,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
K(zone_page_state(zone, NR_ZONE_UNEVICTABLE)),
K(zone_page_state(zone, NR_ZONE_WRITE_PENDING)),
K(zone->present_pages),
- K(zone->managed_pages),
+ K(zone_managed_pages(zone)),
K(zone_page_state(zone, NR_MLOCK)),
zone_page_state(zone, NR_KERNEL_STACK_KB),
K(zone_page_state(zone, NR_PAGETABLE)),
@@ -5655,7 +5808,7 @@ static int zone_batchsize(struct zone *zone)
* The per-cpu-pages pools are set to around 1000th of the
* size of the zone.
*/
- batch = zone->managed_pages / 1024;
+ batch = zone_managed_pages(zone) / 1024;
/* But no more than a meg. */
if (batch * PAGE_SIZE > 1024 * 1024)
batch = (1024 * 1024) / PAGE_SIZE;
@@ -5736,7 +5889,6 @@ static void pageset_init(struct per_cpu_pageset *p)
memset(p, 0, sizeof(*p));
pcp = &p->pcp;
- pcp->count = 0;
for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++)
INIT_LIST_HEAD(&pcp->lists[migratetype]);
}
@@ -5766,7 +5918,7 @@ static void pageset_set_high_and_batch(struct zone *zone,
{
if (percpu_pagelist_fraction)
pageset_set_high(pcp,
- (zone->managed_pages /
+ (zone_managed_pages(zone) /
percpu_pagelist_fraction));
else
pageset_set_batch(pcp, zone_batchsize(zone));
@@ -5920,7 +6072,7 @@ void __init sparse_memory_present_with_active_regions(int nid)
* with no available memory, a warning is printed and the start and end
* PFNs will be 0.
*/
-void __meminit get_pfn_range_for_nid(unsigned int nid,
+void __init get_pfn_range_for_nid(unsigned int nid,
unsigned long *start_pfn, unsigned long *end_pfn)
{
unsigned long this_start_pfn, this_end_pfn;
@@ -5969,7 +6121,7 @@ static void __init find_usable_zone_for_movable(void)
* highest usable zone for ZONE_MOVABLE. This preserves the assumption that
* zones within a node are in order of monotonic increases memory addresses
*/
-static void __meminit adjust_zone_range_for_zone_movable(int nid,
+static void __init adjust_zone_range_for_zone_movable(int nid,
unsigned long zone_type,
unsigned long node_start_pfn,
unsigned long node_end_pfn,
@@ -6000,7 +6152,7 @@ static void __meminit adjust_zone_range_for_zone_movable(int nid,
* Return the number of pages a zone spans in a node, including holes
* present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node()
*/
-static unsigned long __meminit zone_spanned_pages_in_node(int nid,
+static unsigned long __init zone_spanned_pages_in_node(int nid,
unsigned long zone_type,
unsigned long node_start_pfn,
unsigned long node_end_pfn,
@@ -6035,7 +6187,7 @@ static unsigned long __meminit zone_spanned_pages_in_node(int nid,
* Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
* then all holes in the requested range will be accounted for.
*/
-unsigned long __meminit __absent_pages_in_range(int nid,
+unsigned long __init __absent_pages_in_range(int nid,
unsigned long range_start_pfn,
unsigned long range_end_pfn)
{
@@ -6065,7 +6217,7 @@ unsigned long __init absent_pages_in_range(unsigned long start_pfn,
}
/* Return the number of page frames in holes in a zone on a node */
-static unsigned long __meminit zone_absent_pages_in_node(int nid,
+static unsigned long __init zone_absent_pages_in_node(int nid,
unsigned long zone_type,
unsigned long node_start_pfn,
unsigned long node_end_pfn,
@@ -6117,7 +6269,7 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid,
}
#else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
-static inline unsigned long __meminit zone_spanned_pages_in_node(int nid,
+static inline unsigned long __init zone_spanned_pages_in_node(int nid,
unsigned long zone_type,
unsigned long node_start_pfn,
unsigned long node_end_pfn,
@@ -6136,7 +6288,7 @@ static inline unsigned long __meminit zone_spanned_pages_in_node(int nid,
return zones_size[zone_type];
}
-static inline unsigned long __meminit zone_absent_pages_in_node(int nid,
+static inline unsigned long __init zone_absent_pages_in_node(int nid,
unsigned long zone_type,
unsigned long node_start_pfn,
unsigned long node_end_pfn,
@@ -6150,7 +6302,7 @@ static inline unsigned long __meminit zone_absent_pages_in_node(int nid,
#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
-static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,
+static void __init calculate_node_totalpages(struct pglist_data *pgdat,
unsigned long node_start_pfn,
unsigned long node_end_pfn,
unsigned long *zones_size,
@@ -6323,7 +6475,7 @@ static void __meminit pgdat_init_internals(struct pglist_data *pgdat)
static void __meminit zone_init_internals(struct zone *zone, enum zone_type idx, int nid,
unsigned long remaining_pages)
{
- zone->managed_pages = remaining_pages;
+ atomic_long_set(&zone->managed_pages, remaining_pages);
zone_set_nid(zone, nid);
zone->name = zone_names[idx];
zone->zone_pgdat = NODE_DATA(nid);
@@ -6476,12 +6628,6 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat) { }
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
static inline void pgdat_set_deferred_range(pg_data_t *pgdat)
{
- /*
- * We start only with one section of pages, more pages are added as
- * needed until the rest of deferred pages are initialized.
- */
- pgdat->static_init_pgcnt = min_t(unsigned long, PAGES_PER_SECTION,
- pgdat->node_spanned_pages);
pgdat->first_deferred_pfn = ULONG_MAX;
}
#else
@@ -7075,18 +7221,16 @@ early_param("movablecore", cmdline_parse_movablecore);
void adjust_managed_page_count(struct page *page, long count)
{
- spin_lock(&managed_page_count_lock);
- page_zone(page)->managed_pages += count;
- totalram_pages += count;
+ atomic_long_add(count, &page_zone(page)->managed_pages);
+ totalram_pages_add(count);
#ifdef CONFIG_HIGHMEM
if (PageHighMem(page))
- totalhigh_pages += count;
+ totalhigh_pages_add(count);
#endif
- spin_unlock(&managed_page_count_lock);
}
EXPORT_SYMBOL(adjust_managed_page_count);
-unsigned long free_reserved_area(void *start, void *end, int poison, char *s)
+unsigned long free_reserved_area(void *start, void *end, int poison, const char *s)
{
void *pos;
unsigned long pages = 0;
@@ -7123,9 +7267,9 @@ EXPORT_SYMBOL(free_reserved_area);
void free_highmem_page(struct page *page)
{
__free_reserved_page(page);
- totalram_pages++;
- page_zone(page)->managed_pages++;
- totalhigh_pages++;
+ totalram_pages_inc();
+ atomic_long_inc(&page_zone(page)->managed_pages);
+ totalhigh_pages_inc();
}
#endif
@@ -7174,10 +7318,10 @@ void __init mem_init_print_info(const char *str)
physpages << (PAGE_SHIFT - 10),
codesize >> 10, datasize >> 10, rosize >> 10,
(init_data_size + init_code_size) >> 10, bss_size >> 10,
- (physpages - totalram_pages - totalcma_pages) << (PAGE_SHIFT - 10),
+ (physpages - totalram_pages() - totalcma_pages) << (PAGE_SHIFT - 10),
totalcma_pages << (PAGE_SHIFT - 10),
#ifdef CONFIG_HIGHMEM
- totalhigh_pages << (PAGE_SHIFT - 10),
+ totalhigh_pages() << (PAGE_SHIFT - 10),
#endif
str ? ", " : "", str ? str : "");
}
@@ -7257,6 +7401,7 @@ static void calculate_totalreserve_pages(void)
for (i = 0; i < MAX_NR_ZONES; i++) {
struct zone *zone = pgdat->node_zones + i;
long max = 0;
+ unsigned long managed_pages = zone_managed_pages(zone);
/* Find valid and maximum lowmem_reserve in the zone */
for (j = i; j < MAX_NR_ZONES; j++) {
@@ -7267,8 +7412,8 @@ static void calculate_totalreserve_pages(void)
/* we treat the high watermark as reserved pages. */
max += high_wmark_pages(zone);
- if (max > zone->managed_pages)
- max = zone->managed_pages;
+ if (max > managed_pages)
+ max = managed_pages;
pgdat->totalreserve_pages += max;
@@ -7292,7 +7437,7 @@ static void setup_per_zone_lowmem_reserve(void)
for_each_online_pgdat(pgdat) {
for (j = 0; j < MAX_NR_ZONES; j++) {
struct zone *zone = pgdat->node_zones + j;
- unsigned long managed_pages = zone->managed_pages;
+ unsigned long managed_pages = zone_managed_pages(zone);
zone->lowmem_reserve[j] = 0;
@@ -7310,7 +7455,7 @@ static void setup_per_zone_lowmem_reserve(void)
lower_zone->lowmem_reserve[j] =
managed_pages / sysctl_lowmem_reserve_ratio[idx];
}
- managed_pages += lower_zone->managed_pages;
+ managed_pages += zone_managed_pages(lower_zone);
}
}
}
@@ -7329,14 +7474,14 @@ static void __setup_per_zone_wmarks(void)
/* Calculate total number of !ZONE_HIGHMEM pages */
for_each_zone(zone) {
if (!is_highmem(zone))
- lowmem_pages += zone->managed_pages;
+ lowmem_pages += zone_managed_pages(zone);
}
for_each_zone(zone) {
u64 tmp;
spin_lock_irqsave(&zone->lock, flags);
- tmp = (u64)pages_min * zone->managed_pages;
+ tmp = (u64)pages_min * zone_managed_pages(zone);
do_div(tmp, lowmem_pages);
if (is_highmem(zone)) {
/*
@@ -7350,15 +7495,15 @@ static void __setup_per_zone_wmarks(void)
*/
unsigned long min_pages;
- min_pages = zone->managed_pages / 1024;
+ min_pages = zone_managed_pages(zone) / 1024;
min_pages = clamp(min_pages, SWAP_CLUSTER_MAX, 128UL);
- zone->watermark[WMARK_MIN] = min_pages;
+ zone->_watermark[WMARK_MIN] = min_pages;
} else {
/*
* If it's a lowmem zone, reserve a number of pages
* proportionate to the zone's size.
*/
- zone->watermark[WMARK_MIN] = tmp;
+ zone->_watermark[WMARK_MIN] = tmp;
}
/*
@@ -7367,11 +7512,12 @@ static void __setup_per_zone_wmarks(void)
* ensure a minimum size on small systems.
*/
tmp = max_t(u64, tmp >> 2,
- mult_frac(zone->managed_pages,
+ mult_frac(zone_managed_pages(zone),
watermark_scale_factor, 10000));
- zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + tmp;
- zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + tmp * 2;
+ zone->_watermark[WMARK_LOW] = min_wmark_pages(zone) + tmp;
+ zone->_watermark[WMARK_HIGH] = min_wmark_pages(zone) + tmp * 2;
+ zone->watermark_boost = 0;
spin_unlock_irqrestore(&zone->lock, flags);
}
@@ -7472,6 +7618,18 @@ int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write,
return 0;
}
+int watermark_boost_factor_sysctl_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *length, loff_t *ppos)
+{
+ int rc;
+
+ rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
+ if (rc)
+ return rc;
+
+ return 0;
+}
+
int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *length, loff_t *ppos)
{
@@ -7497,8 +7655,8 @@ static void setup_min_unmapped_ratio(void)
pgdat->min_unmapped_pages = 0;
for_each_zone(zone)
- zone->zone_pgdat->min_unmapped_pages += (zone->managed_pages *
- sysctl_min_unmapped_ratio) / 100;
+ zone->zone_pgdat->min_unmapped_pages += (zone_managed_pages(zone) *
+ sysctl_min_unmapped_ratio) / 100;
}
@@ -7525,8 +7683,8 @@ static void setup_min_slab_ratio(void)
pgdat->min_slab_pages = 0;
for_each_zone(zone)
- zone->zone_pgdat->min_slab_pages += (zone->managed_pages *
- sysctl_min_slab_ratio) / 100;
+ zone->zone_pgdat->min_slab_pages += (zone_managed_pages(zone) *
+ sysctl_min_slab_ratio) / 100;
}
int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write,
@@ -7766,8 +7924,7 @@ void *__init alloc_large_system_hash(const char *tablename,
* race condition. So you can't expect this function should be exact.
*/
bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
- int migratetype,
- bool skip_hwpoisoned_pages)
+ int migratetype, int flags)
{
unsigned long pfn, iter, found;
@@ -7841,7 +7998,7 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
* The HWPoisoned page may be not in buddy system, and
* page_count() is not 0.
*/
- if (skip_hwpoisoned_pages && PageHWPoison(page))
+ if ((flags & SKIP_HWPOISON) && PageHWPoison(page))
continue;
if (__PageMovable(page))
@@ -7868,6 +8025,8 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
return false;
unmovable:
WARN_ON_ONCE(zone_idx(zone) == ZONE_MOVABLE);
+ if (flags & REPORT_FAILURE)
+ dump_page(pfn_to_page(pfn+iter), "unmovable page");
return true;
}
@@ -7994,8 +8153,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
*/
ret = start_isolate_page_range(pfn_max_align_down(start),
- pfn_max_align_up(end), migratetype,
- false);
+ pfn_max_align_up(end), migratetype, 0);
if (ret)
return ret;
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 43e085608846..ce323e56b34d 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -15,8 +15,7 @@
#define CREATE_TRACE_POINTS
#include <trace/events/page_isolation.h>
-static int set_migratetype_isolate(struct page *page, int migratetype,
- bool skip_hwpoisoned_pages)
+static int set_migratetype_isolate(struct page *page, int migratetype, int isol_flags)
{
struct zone *zone;
unsigned long flags, pfn;
@@ -60,8 +59,7 @@ static int set_migratetype_isolate(struct page *page, int migratetype,
* FIXME: Now, memory hotplug doesn't call shrink_slab() by itself.
* We just check MOVABLE pages.
*/
- if (!has_unmovable_pages(zone, page, arg.pages_found, migratetype,
- skip_hwpoisoned_pages))
+ if (!has_unmovable_pages(zone, page, arg.pages_found, migratetype, flags))
ret = 0;
/*
@@ -185,7 +183,7 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
* prevents two threads from simultaneously working on overlapping ranges.
*/
int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
- unsigned migratetype, bool skip_hwpoisoned_pages)
+ unsigned migratetype, int flags)
{
unsigned long pfn;
unsigned long undo_pfn;
@@ -199,7 +197,7 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
pfn += pageblock_nr_pages) {
page = __first_valid_page(pfn, pageblock_nr_pages);
if (page &&
- set_migratetype_isolate(page, migratetype, skip_hwpoisoned_pages)) {
+ set_migratetype_isolate(page, migratetype, flags)) {
undo_pfn = pfn;
goto undo;
}
diff --git a/mm/page_owner.c b/mm/page_owner.c
index 87bc0dfdb52b..28b06524939f 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -351,6 +351,7 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn,
.skip = 0
};
+ count = min_t(size_t, count, PAGE_SIZE);
kbuf = kmalloc(count, GFP_KERNEL);
if (!kbuf)
return -ENOMEM;
diff --git a/mm/readahead.c b/mm/readahead.c
index f3d6f9656a3c..1ae16522412a 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -270,17 +270,15 @@ static unsigned long get_init_ra_size(unsigned long size, unsigned long max)
* return it as the new window size.
*/
static unsigned long get_next_ra_size(struct file_ra_state *ra,
- unsigned long max)
+ unsigned long max)
{
unsigned long cur = ra->size;
- unsigned long newsize;
if (cur < max / 16)
- newsize = 4 * cur;
- else
- newsize = 2 * cur;
-
- return min(newsize, max);
+ return 4 * cur;
+ if (cur <= max / 2)
+ return 2 * cur;
+ return max;
}
/*
diff --git a/mm/rmap.c b/mm/rmap.c
index 85b7f9423352..21a26cf51114 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -25,6 +25,7 @@
* page->flags PG_locked (lock_page)
* hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share)
* mapping->i_mmap_rwsem
+ * hugetlb_fault_mutex (hugetlbfs specific page fault mutex)
* anon_vma->rwsem
* mm->page_table_lock or pte_lock
* zone_lru_lock (in mark_page_accessed, isolate_lru_page)
@@ -889,15 +890,17 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma,
.address = address,
.flags = PVMW_SYNC,
};
- unsigned long start = address, end;
+ struct mmu_notifier_range range;
int *cleaned = arg;
/*
* We have to assume the worse case ie pmd for invalidation. Note that
* the page can not be free from this function.
*/
- end = min(vma->vm_end, start + (PAGE_SIZE << compound_order(page)));
- mmu_notifier_invalidate_range_start(vma->vm_mm, start, end);
+ mmu_notifier_range_init(&range, vma->vm_mm, address,
+ min(vma->vm_end, address +
+ (PAGE_SIZE << compound_order(page))));
+ mmu_notifier_invalidate_range_start(&range);
while (page_vma_mapped_walk(&pvmw)) {
unsigned long cstart;
@@ -949,7 +952,7 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma,
(*cleaned)++;
}
- mmu_notifier_invalidate_range_end(vma->vm_mm, start, end);
+ mmu_notifier_invalidate_range_end(&range);
return true;
}
@@ -1017,7 +1020,7 @@ void page_move_anon_rmap(struct page *page, struct vm_area_struct *vma)
/**
* __page_set_anon_rmap - set up new anonymous rmap
- * @page: Page to add to rmap
+ * @page: Page or Hugepage to add to rmap
* @vma: VM area to add page to.
* @address: User virtual address of the mapping
* @exclusive: the page is exclusively owned by the current process
@@ -1345,7 +1348,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
pte_t pteval;
struct page *subpage;
bool ret = true;
- unsigned long start = address, end;
+ struct mmu_notifier_range range;
enum ttu_flags flags = (enum ttu_flags)arg;
/* munlock has nothing to gain from examining un-locked vmas */
@@ -1369,15 +1372,21 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
* Note that the page can not be free in this function as call of
* try_to_unmap() must hold a reference on the page.
*/
- end = min(vma->vm_end, start + (PAGE_SIZE << compound_order(page)));
+ mmu_notifier_range_init(&range, vma->vm_mm, vma->vm_start,
+ min(vma->vm_end, vma->vm_start +
+ (PAGE_SIZE << compound_order(page))));
if (PageHuge(page)) {
/*
* If sharing is possible, start and end will be adjusted
* accordingly.
+ *
+ * If called for a huge page, caller must hold i_mmap_rwsem
+ * in write mode as it is possible to call huge_pmd_unshare.
*/
- adjust_range_if_pmd_sharing_possible(vma, &start, &end);
+ adjust_range_if_pmd_sharing_possible(vma, &range.start,
+ &range.end);
}
- mmu_notifier_invalidate_range_start(vma->vm_mm, start, end);
+ mmu_notifier_invalidate_range_start(&range);
while (page_vma_mapped_walk(&pvmw)) {
#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
@@ -1428,9 +1437,10 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
* we must flush them all. start/end were
* already adjusted above to cover this range.
*/
- flush_cache_range(vma, start, end);
- flush_tlb_range(vma, start, end);
- mmu_notifier_invalidate_range(mm, start, end);
+ flush_cache_range(vma, range.start, range.end);
+ flush_tlb_range(vma, range.start, range.end);
+ mmu_notifier_invalidate_range(mm, range.start,
+ range.end);
/*
* The ref count of the PMD page was dropped
@@ -1650,7 +1660,7 @@ discard:
put_page(page);
}
- mmu_notifier_invalidate_range_end(vma->vm_mm, start, end);
+ mmu_notifier_invalidate_range_end(&range);
return ret;
}
@@ -1910,27 +1920,10 @@ void rmap_walk_locked(struct page *page, struct rmap_walk_control *rwc)
#ifdef CONFIG_HUGETLB_PAGE
/*
- * The following three functions are for anonymous (private mapped) hugepages.
+ * The following two functions are for anonymous (private mapped) hugepages.
* Unlike common anonymous pages, anonymous hugepages have no accounting code
* and no lru code, because we handle hugepages differently from common pages.
*/
-static void __hugepage_set_anon_rmap(struct page *page,
- struct vm_area_struct *vma, unsigned long address, int exclusive)
-{
- struct anon_vma *anon_vma = vma->anon_vma;
-
- BUG_ON(!anon_vma);
-
- if (PageAnon(page))
- return;
- if (!exclusive)
- anon_vma = anon_vma->root;
-
- anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
- page->mapping = (struct address_space *) anon_vma;
- page->index = linear_page_index(vma, address);
-}
-
void hugepage_add_anon_rmap(struct page *page,
struct vm_area_struct *vma, unsigned long address)
{
@@ -1942,7 +1935,7 @@ void hugepage_add_anon_rmap(struct page *page,
/* address might be in next vma when migration races vma_adjust */
first = atomic_inc_and_test(compound_mapcount_ptr(page));
if (first)
- __hugepage_set_anon_rmap(page, vma, address, 0);
+ __page_set_anon_rmap(page, vma, address, 0);
}
void hugepage_add_new_anon_rmap(struct page *page,
@@ -1950,6 +1943,6 @@ void hugepage_add_new_anon_rmap(struct page *page,
{
BUG_ON(address < vma->vm_start || address >= vma->vm_end);
atomic_set(compound_mapcount_ptr(page), 0);
- __hugepage_set_anon_rmap(page, vma, address, 1);
+ __page_set_anon_rmap(page, vma, address, 1);
}
#endif /* CONFIG_HUGETLB_PAGE */
diff --git a/mm/shmem.c b/mm/shmem.c
index 375f3ac19bb8..6ece1e2fe76e 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -109,12 +109,14 @@ struct shmem_falloc {
#ifdef CONFIG_TMPFS
static unsigned long shmem_default_max_blocks(void)
{
- return totalram_pages / 2;
+ return totalram_pages() / 2;
}
static unsigned long shmem_default_max_inodes(void)
{
- return min(totalram_pages - totalhigh_pages, totalram_pages / 2);
+ unsigned long nr_pages = totalram_pages();
+
+ return min(nr_pages - totalhigh_pages(), nr_pages / 2);
}
#endif
@@ -3301,7 +3303,7 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo,
size = memparse(value,&rest);
if (*rest == '%') {
size <<= PAGE_SHIFT;
- size *= totalram_pages;
+ size *= totalram_pages();
do_div(size, 100);
rest++;
}
diff --git a/mm/slab.c b/mm/slab.c
index 3abb9feb3818..73fe23e649c9 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -406,19 +406,6 @@ static inline void *index_to_obj(struct kmem_cache *cache, struct page *page,
return page->s_mem + cache->size * idx;
}
-/*
- * We want to avoid an expensive divide : (offset / cache->size)
- * Using the fact that size is a constant for a particular cache,
- * we can replace (offset / cache->size) by
- * reciprocal_divide(offset, cache->reciprocal_buffer_size)
- */
-static inline unsigned int obj_to_index(const struct kmem_cache *cache,
- const struct page *page, void *obj)
-{
- u32 offset = (obj - page->s_mem);
- return reciprocal_divide(offset, cache->reciprocal_buffer_size);
-}
-
#define BOOT_CPUCACHE_ENTRIES 1
/* internal cache of cache description objs */
static struct kmem_cache kmem_cache_boot = {
@@ -1248,7 +1235,7 @@ void __init kmem_cache_init(void)
* page orders on machines with more than 32MB of memory if
* not overridden on the command line.
*/
- if (!slab_max_order_set && totalram_pages > (32 << 20) >> PAGE_SHIFT)
+ if (!slab_max_order_set && totalram_pages() > (32 << 20) >> PAGE_SHIFT)
slab_max_order = SLAB_MAX_ORDER_HI;
/* Bootstrap is tricky, because several objects are allocated
@@ -2370,7 +2357,7 @@ static void *alloc_slabmgmt(struct kmem_cache *cachep,
void *freelist;
void *addr = page_address(page);
- page->s_mem = addr + colour_off;
+ page->s_mem = kasan_reset_tag(addr) + colour_off;
page->active = 0;
if (OBJFREELIST_SLAB(cachep))
@@ -2574,7 +2561,7 @@ static void cache_init_objs(struct kmem_cache *cachep,
for (i = 0; i < cachep->num; i++) {
objp = index_to_obj(cachep, page, i);
- kasan_init_slab_obj(cachep, objp);
+ objp = kasan_init_slab_obj(cachep, objp);
/* constructor could break poison info */
if (DEBUG == 0 && cachep->ctor) {
@@ -3551,7 +3538,7 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
{
void *ret = slab_alloc(cachep, flags, _RET_IP_);
- kasan_slab_alloc(cachep, ret, flags);
+ ret = kasan_slab_alloc(cachep, ret, flags);
trace_kmem_cache_alloc(_RET_IP_, ret,
cachep->object_size, cachep->size, flags);
@@ -3617,7 +3604,7 @@ kmem_cache_alloc_trace(struct kmem_cache *cachep, gfp_t flags, size_t size)
ret = slab_alloc(cachep, flags, _RET_IP_);
- kasan_kmalloc(cachep, ret, size, flags);
+ ret = kasan_kmalloc(cachep, ret, size, flags);
trace_kmalloc(_RET_IP_, ret,
size, cachep->size, flags);
return ret;
@@ -3641,7 +3628,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
{
void *ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_);
- kasan_slab_alloc(cachep, ret, flags);
+ ret = kasan_slab_alloc(cachep, ret, flags);
trace_kmem_cache_alloc_node(_RET_IP_, ret,
cachep->object_size, cachep->size,
flags, nodeid);
@@ -3660,7 +3647,7 @@ void *kmem_cache_alloc_node_trace(struct kmem_cache *cachep,
ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_);
- kasan_kmalloc(cachep, ret, size, flags);
+ ret = kasan_kmalloc(cachep, ret, size, flags);
trace_kmalloc_node(_RET_IP_, ret,
size, cachep->size,
flags, nodeid);
@@ -3681,7 +3668,7 @@ __do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller)
if (unlikely(ZERO_OR_NULL_PTR(cachep)))
return cachep;
ret = kmem_cache_alloc_node_trace(cachep, flags, node, size);
- kasan_kmalloc(cachep, ret, size, flags);
+ ret = kasan_kmalloc(cachep, ret, size, flags);
return ret;
}
@@ -3719,7 +3706,7 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
return cachep;
ret = slab_alloc(cachep, flags, caller);
- kasan_kmalloc(cachep, ret, size, flags);
+ ret = kasan_kmalloc(cachep, ret, size, flags);
trace_kmalloc(caller, ret,
size, cachep->size, flags);
diff --git a/mm/slab.h b/mm/slab.h
index 58c6c1c2a78e..4190c24ef0e9 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -441,7 +441,7 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags,
kmemleak_alloc_recursive(object, s->object_size, 1,
s->flags, flags);
- kasan_slab_alloc(s, object, flags);
+ p[i] = kasan_slab_alloc(s, object, flags);
}
if (memcg_kmem_enabled())
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 9c11e8a937d2..70b0cc85db67 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -1029,10 +1029,8 @@ struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags)
index = size_index[size_index_elem(size)];
} else {
- if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) {
- WARN_ON(1);
+ if (WARN_ON_ONCE(size > KMALLOC_MAX_CACHE_SIZE))
return NULL;
- }
index = fls(size - 1);
}
@@ -1204,7 +1202,7 @@ void *kmalloc_order(size_t size, gfp_t flags, unsigned int order)
page = alloc_pages(flags, order);
ret = page ? page_address(page) : NULL;
kmemleak_alloc(ret, size, 1, flags);
- kasan_kmalloc_large(ret, size, flags);
+ ret = kasan_kmalloc_large(ret, size, flags);
return ret;
}
EXPORT_SYMBOL(kmalloc_order);
@@ -1482,7 +1480,7 @@ static __always_inline void *__do_krealloc(const void *p, size_t new_size,
ks = ksize(p);
if (ks >= new_size) {
- kasan_krealloc((void *)p, new_size, flags);
+ p = kasan_krealloc((void *)p, new_size, flags);
return (void *)p;
}
@@ -1534,7 +1532,7 @@ void *krealloc(const void *p, size_t new_size, gfp_t flags)
}
ret = __do_krealloc(p, new_size, flags);
- if (ret && p != ret)
+ if (ret && kasan_reset_tag(p) != kasan_reset_tag(ret))
kfree(p);
return ret;
diff --git a/mm/slub.c b/mm/slub.c
index e3629cd7aff1..36c0befeebd8 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1372,10 +1372,10 @@ static inline void dec_slabs_node(struct kmem_cache *s, int node,
* Hooks for other subsystems that check memory allocations. In a typical
* production configuration these hooks all should produce no code at all.
*/
-static inline void kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags)
+static inline void *kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags)
{
kmemleak_alloc(ptr, size, 1, flags);
- kasan_kmalloc_large(ptr, size, flags);
+ return kasan_kmalloc_large(ptr, size, flags);
}
static __always_inline void kfree_hook(void *x)
@@ -1451,16 +1451,17 @@ static inline bool slab_free_freelist_hook(struct kmem_cache *s,
#endif
}
-static void setup_object(struct kmem_cache *s, struct page *page,
+static void *setup_object(struct kmem_cache *s, struct page *page,
void *object)
{
setup_object_debug(s, page, object);
- kasan_init_slab_obj(s, object);
+ object = kasan_init_slab_obj(s, object);
if (unlikely(s->ctor)) {
kasan_unpoison_object_data(s, object);
s->ctor(object);
kasan_poison_object_data(s, object);
}
+ return object;
}
/*
@@ -1568,16 +1569,16 @@ static bool shuffle_freelist(struct kmem_cache *s, struct page *page)
/* First entry is used as the base of the freelist */
cur = next_freelist_entry(s, page, &pos, start, page_limit,
freelist_count);
+ cur = setup_object(s, page, cur);
page->freelist = cur;
for (idx = 1; idx < page->objects; idx++) {
- setup_object(s, page, cur);
next = next_freelist_entry(s, page, &pos, start, page_limit,
freelist_count);
+ next = setup_object(s, page, next);
set_freepointer(s, cur, next);
cur = next;
}
- setup_object(s, page, cur);
set_freepointer(s, cur, NULL);
return true;
@@ -1599,7 +1600,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
struct page *page;
struct kmem_cache_order_objects oo = s->oo;
gfp_t alloc_gfp;
- void *start, *p;
+ void *start, *p, *next;
int idx, order;
bool shuffle;
@@ -1651,13 +1652,16 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
if (!shuffle) {
for_each_object_idx(p, idx, s, start, page->objects) {
- setup_object(s, page, p);
- if (likely(idx < page->objects))
- set_freepointer(s, p, p + s->size);
- else
+ if (likely(idx < page->objects)) {
+ next = p + s->size;
+ next = setup_object(s, page, next);
+ set_freepointer(s, p, next);
+ } else
set_freepointer(s, p, NULL);
}
- page->freelist = fixup_red_left(s, start);
+ start = fixup_red_left(s, start);
+ start = setup_object(s, page, start);
+ page->freelist = start;
}
page->inuse = page->objects;
@@ -2127,26 +2131,15 @@ redo:
}
if (l != m) {
-
if (l == M_PARTIAL)
-
remove_partial(n, page);
-
else if (l == M_FULL)
-
remove_full(s, n, page);
- if (m == M_PARTIAL) {
-
+ if (m == M_PARTIAL)
add_partial(n, page, tail);
- stat(s, tail);
-
- } else if (m == M_FULL) {
-
- stat(s, DEACTIVATE_FULL);
+ else if (m == M_FULL)
add_full(s, n, page);
-
- }
}
l = m;
@@ -2159,7 +2152,11 @@ redo:
if (lock)
spin_unlock(&n->list_lock);
- if (m == M_FREE) {
+ if (m == M_PARTIAL)
+ stat(s, tail);
+ else if (m == M_FULL)
+ stat(s, DEACTIVATE_FULL);
+ else if (m == M_FREE) {
stat(s, DEACTIVATE_EMPTY);
discard_slab(s, page);
stat(s, FREE_SLAB);
@@ -2313,12 +2310,10 @@ static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu)
{
struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
- if (likely(c)) {
- if (c->page)
- flush_slab(s, c);
+ if (c->page)
+ flush_slab(s, c);
- unfreeze_partials(s, c);
- }
+ unfreeze_partials(s, c);
}
static void flush_cpu_slab(void *d)
@@ -2367,7 +2362,7 @@ static int slub_cpu_dead(unsigned int cpu)
static inline int node_match(struct page *page, int node)
{
#ifdef CONFIG_NUMA
- if (!page || (node != NUMA_NO_NODE && page_to_nid(page) != node))
+ if (node != NUMA_NO_NODE && page_to_nid(page) != node)
return 0;
#endif
return 1;
@@ -2768,7 +2763,7 @@ void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size)
{
void *ret = slab_alloc(s, gfpflags, _RET_IP_);
trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags);
- kasan_kmalloc(s, ret, size, gfpflags);
+ ret = kasan_kmalloc(s, ret, size, gfpflags);
return ret;
}
EXPORT_SYMBOL(kmem_cache_alloc_trace);
@@ -2796,7 +2791,7 @@ void *kmem_cache_alloc_node_trace(struct kmem_cache *s,
trace_kmalloc_node(_RET_IP_, ret,
size, s->size, gfpflags, node);
- kasan_kmalloc(s, ret, size, gfpflags);
+ ret = kasan_kmalloc(s, ret, size, gfpflags);
return ret;
}
EXPORT_SYMBOL(kmem_cache_alloc_node_trace);
@@ -2992,7 +2987,7 @@ static __always_inline void slab_free(struct kmem_cache *s, struct page *page,
do_slab_free(s, page, head, tail, cnt, addr);
}
-#ifdef CONFIG_KASAN
+#ifdef CONFIG_KASAN_GENERIC
void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr)
{
do_slab_free(cache, virt_to_head_page(x), x, NULL, 1, addr);
@@ -3364,16 +3359,16 @@ static void early_kmem_cache_node_alloc(int node)
n = page->freelist;
BUG_ON(!n);
- page->freelist = get_freepointer(kmem_cache_node, n);
- page->inuse = 1;
- page->frozen = 0;
- kmem_cache_node->node[node] = n;
#ifdef CONFIG_SLUB_DEBUG
init_object(kmem_cache_node, n, SLUB_RED_ACTIVE);
init_tracking(kmem_cache_node, n);
#endif
- kasan_kmalloc(kmem_cache_node, n, sizeof(struct kmem_cache_node),
+ n = kasan_kmalloc(kmem_cache_node, n, sizeof(struct kmem_cache_node),
GFP_KERNEL);
+ page->freelist = get_freepointer(kmem_cache_node, n);
+ page->inuse = 1;
+ page->frozen = 0;
+ kmem_cache_node->node[node] = n;
init_kmem_cache_node(n);
inc_slabs_node(kmem_cache_node, node, page->objects);
@@ -3784,7 +3779,7 @@ void *__kmalloc(size_t size, gfp_t flags)
trace_kmalloc(_RET_IP_, ret, size, s->size, flags);
- kasan_kmalloc(s, ret, size, flags);
+ ret = kasan_kmalloc(s, ret, size, flags);
return ret;
}
@@ -3801,8 +3796,7 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node)
if (page)
ptr = page_address(page);
- kmalloc_large_node_hook(ptr, size, flags);
- return ptr;
+ return kmalloc_large_node_hook(ptr, size, flags);
}
void *__kmalloc_node(size_t size, gfp_t flags, int node)
@@ -3829,7 +3823,7 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node)
trace_kmalloc_node(_RET_IP_, ret, size, s->size, flags, node);
- kasan_kmalloc(s, ret, size, flags);
+ ret = kasan_kmalloc(s, ret, size, flags);
return ret;
}
diff --git a/mm/sparse.c b/mm/sparse.c
index 3abc8cc50201..7ea5dc6c6b19 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -678,25 +678,24 @@ static void free_map_bootmem(struct page *memmap)
* set. If this is <=0, then that means that the passed-in
* map was not consumed and must be freed.
*/
-int __meminit sparse_add_one_section(struct pglist_data *pgdat,
- unsigned long start_pfn, struct vmem_altmap *altmap)
+int __meminit sparse_add_one_section(int nid, unsigned long start_pfn,
+ struct vmem_altmap *altmap)
{
unsigned long section_nr = pfn_to_section_nr(start_pfn);
struct mem_section *ms;
struct page *memmap;
unsigned long *usemap;
- unsigned long flags;
int ret;
/*
* no locking for this, because it does its own
* plus, it does a kmalloc
*/
- ret = sparse_index_init(section_nr, pgdat->node_id);
+ ret = sparse_index_init(section_nr, nid);
if (ret < 0 && ret != -EEXIST)
return ret;
ret = 0;
- memmap = kmalloc_section_memmap(section_nr, pgdat->node_id, altmap);
+ memmap = kmalloc_section_memmap(section_nr, nid, altmap);
if (!memmap)
return -ENOMEM;
usemap = __kmalloc_section_usemap();
@@ -705,8 +704,6 @@ int __meminit sparse_add_one_section(struct pglist_data *pgdat,
return -ENOMEM;
}
- pgdat_resize_lock(pgdat, &flags);
-
ms = __pfn_to_section(start_pfn);
if (ms->section_mem_map & SECTION_MARKED_PRESENT) {
ret = -EEXIST;
@@ -723,7 +720,6 @@ int __meminit sparse_add_one_section(struct pglist_data *pgdat,
sparse_init_one_section(ms, section_nr, memmap, usemap);
out:
- pgdat_resize_unlock(pgdat, &flags);
if (ret < 0) {
kfree(usemap);
__kfree_section_memmap(memmap, altmap);
@@ -740,6 +736,15 @@ static void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
if (!memmap)
return;
+ /*
+ * A further optimization is to have per section refcounted
+ * num_poisoned_pages. But that would need more space per memmap, so
+ * for now just do a quick global check to speed up this routine in the
+ * absence of bad pages.
+ */
+ if (atomic_long_read(&num_poisoned_pages) == 0)
+ return;
+
for (i = 0; i < nr_pages; i++) {
if (PageHWPoison(&memmap[i])) {
atomic_long_sub(1, &num_poisoned_pages);
@@ -785,10 +790,8 @@ void sparse_remove_one_section(struct zone *zone, struct mem_section *ms,
unsigned long map_offset, struct vmem_altmap *altmap)
{
struct page *memmap = NULL;
- unsigned long *usemap = NULL, flags;
- struct pglist_data *pgdat = zone->zone_pgdat;
+ unsigned long *usemap = NULL;
- pgdat_resize_lock(pgdat, &flags);
if (ms->section_mem_map) {
usemap = ms->pageblock_flags;
memmap = sparse_decode_mem_map(ms->section_mem_map,
@@ -796,7 +799,6 @@ void sparse_remove_one_section(struct zone *zone, struct mem_section *ms,
ms->section_mem_map = 0;
ms->pageblock_flags = NULL;
}
- pgdat_resize_unlock(pgdat, &flags);
clear_hwpoisoned_pages(memmap + map_offset,
PAGES_PER_SECTION - map_offset);
diff --git a/mm/swap.c b/mm/swap.c
index 5d786019eab9..4d8a1f1afaab 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -1022,7 +1022,7 @@ EXPORT_SYMBOL(pagevec_lookup_range_nr_tag);
*/
void __init swap_setup(void)
{
- unsigned long megs = totalram_pages >> (20 - PAGE_SHIFT);
+ unsigned long megs = totalram_pages() >> (20 - PAGE_SHIFT);
/* Use a smaller cluster for small-memory machines */
if (megs < 16)
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 8688ae65ef58..dbac1d49469d 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -2197,7 +2197,8 @@ int try_to_unuse(unsigned int type, bool frontswap,
*/
if (PageSwapCache(page) &&
likely(page_private(page) == entry.val) &&
- !page_swapped(page))
+ (!PageTransCompound(page) ||
+ !swap_page_trans_huge_swapped(si, entry)))
delete_from_swap_cache(compound_head(page));
/*
@@ -2812,8 +2813,9 @@ static struct swap_info_struct *alloc_swap_info(void)
struct swap_info_struct *p;
unsigned int type;
int i;
+ int size = sizeof(*p) + nr_node_ids * sizeof(struct plist_node);
- p = kvzalloc(sizeof(*p), GFP_KERNEL);
+ p = kvzalloc(size, GFP_KERNEL);
if (!p)
return ERR_PTR(-ENOMEM);
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 458acda96f20..48368589f519 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -267,10 +267,14 @@ retry:
VM_BUG_ON(dst_addr & ~huge_page_mask(h));
/*
- * Serialize via hugetlb_fault_mutex
+ * Serialize via i_mmap_rwsem and hugetlb_fault_mutex.
+ * i_mmap_rwsem ensures the dst_pte remains valid even
+ * in the case of shared pmds. fault mutex prevents
+ * races with other faulting threads.
*/
- idx = linear_page_index(dst_vma, dst_addr);
mapping = dst_vma->vm_file->f_mapping;
+ i_mmap_lock_read(mapping);
+ idx = linear_page_index(dst_vma, dst_addr);
hash = hugetlb_fault_mutex_hash(h, dst_mm, dst_vma, mapping,
idx, dst_addr);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
@@ -279,6 +283,7 @@ retry:
dst_pte = huge_pte_alloc(dst_mm, dst_addr, huge_page_size(h));
if (!dst_pte) {
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+ i_mmap_unlock_read(mapping);
goto out_unlock;
}
@@ -286,6 +291,7 @@ retry:
dst_pteval = huge_ptep_get(dst_pte);
if (!huge_pte_none(dst_pteval)) {
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+ i_mmap_unlock_read(mapping);
goto out_unlock;
}
@@ -293,6 +299,7 @@ retry:
dst_addr, src_addr, &page);
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+ i_mmap_unlock_read(mapping);
vm_alloc_shared = vm_shared;
cond_resched();
diff --git a/mm/util.c b/mm/util.c
index 8bf08b5b5760..4df23d64aac7 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -593,7 +593,7 @@ unsigned long vm_commit_limit(void)
if (sysctl_overcommit_kbytes)
allowed = sysctl_overcommit_kbytes >> (PAGE_SHIFT - 10);
else
- allowed = ((totalram_pages - hugetlb_total_pages())
+ allowed = ((totalram_pages() - hugetlb_total_pages())
* sysctl_overcommit_ratio / 100);
allowed += total_swap_pages;
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 97d4b25d0373..871e41c55e23 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1634,7 +1634,7 @@ void *vmap(struct page **pages, unsigned int count,
might_sleep();
- if (count > totalram_pages)
+ if (count > totalram_pages())
return NULL;
size = (unsigned long)count << PAGE_SHIFT;
@@ -1739,7 +1739,7 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
unsigned long real_size = size;
size = PAGE_ALIGN(size);
- if (!size || (size >> PAGE_SHIFT) > totalram_pages)
+ if (!size || (size >> PAGE_SHIFT) > totalram_pages())
goto fail;
area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNINITIALIZED |
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 24ab1f7394ab..a714c4f800e9 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -88,6 +88,9 @@ struct scan_control {
/* Can pages be swapped as part of reclaim? */
unsigned int may_swap:1;
+ /* e.g. boosted watermark reclaim leaves slabs alone */
+ unsigned int may_shrinkslab:1;
+
/*
* Cgroups are not reclaimed below their configured memory.low,
* unless we threaten to OOM. If any cgroups are skipped due to
@@ -1457,14 +1460,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
count_memcg_page_event(page, PGLAZYFREED);
} else if (!mapping || !__remove_mapping(mapping, page, true))
goto keep_locked;
- /*
- * At this point, we have no other references and there is
- * no way to pick any more up (removed from LRU, removed
- * from pagecache). Can use non-atomic bitops now (and
- * we obviously don't have to worry about waking up a process
- * waiting on the page lock, because there are no references.
- */
- __ClearPageLocked(page);
+
+ unlock_page(page);
free_it:
nr_reclaimed++;
@@ -2756,8 +2753,10 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
shrink_node_memcg(pgdat, memcg, sc, &lru_pages);
node_lru_pages += lru_pages;
- shrink_slab(sc->gfp_mask, pgdat->node_id,
+ if (sc->may_shrinkslab) {
+ shrink_slab(sc->gfp_mask, pgdat->node_id,
memcg, sc->priority);
+ }
/* Record the group's reclaim efficiency */
vmpressure(sc->gfp_mask, memcg, false,
@@ -3239,6 +3238,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
.may_writepage = !laptop_mode,
.may_unmap = 1,
.may_swap = 1,
+ .may_shrinkslab = 1,
};
/*
@@ -3283,6 +3283,7 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
.may_unmap = 1,
.reclaim_idx = MAX_NR_ZONES - 1,
.may_swap = !noswap,
+ .may_shrinkslab = 1,
};
unsigned long lru_pages;
@@ -3329,6 +3330,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
.may_writepage = !laptop_mode,
.may_unmap = 1,
.may_swap = may_swap,
+ .may_shrinkslab = 1,
};
/*
@@ -3379,6 +3381,30 @@ static void age_active_anon(struct pglist_data *pgdat,
} while (memcg);
}
+static bool pgdat_watermark_boosted(pg_data_t *pgdat, int classzone_idx)
+{
+ int i;
+ struct zone *zone;
+
+ /*
+ * Check for watermark boosts top-down as the higher zones
+ * are more likely to be boosted. Both watermarks and boosts
+ * should not be checked at the time time as reclaim would
+ * start prematurely when there is no boosting and a lower
+ * zone is balanced.
+ */
+ for (i = classzone_idx; i >= 0; i--) {
+ zone = pgdat->node_zones + i;
+ if (!managed_zone(zone))
+ continue;
+
+ if (zone->watermark_boost)
+ return true;
+ }
+
+ return false;
+}
+
/*
* Returns true if there is an eligible zone balanced for the request order
* and classzone_idx
@@ -3389,6 +3415,10 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
unsigned long mark = -1;
struct zone *zone;
+ /*
+ * Check watermarks bottom-up as lower zones are more likely to
+ * meet watermarks.
+ */
for (i = 0; i <= classzone_idx; i++) {
zone = pgdat->node_zones + i;
@@ -3517,14 +3547,14 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
unsigned long nr_soft_reclaimed;
unsigned long nr_soft_scanned;
unsigned long pflags;
+ unsigned long nr_boost_reclaim;
+ unsigned long zone_boosts[MAX_NR_ZONES] = { 0, };
+ bool boosted;
struct zone *zone;
struct scan_control sc = {
.gfp_mask = GFP_KERNEL,
.order = order,
- .priority = DEF_PRIORITY,
- .may_writepage = !laptop_mode,
.may_unmap = 1,
- .may_swap = 1,
};
psi_memstall_enter(&pflags);
@@ -3532,9 +3562,28 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
count_vm_event(PAGEOUTRUN);
+ /*
+ * Account for the reclaim boost. Note that the zone boost is left in
+ * place so that parallel allocations that are near the watermark will
+ * stall or direct reclaim until kswapd is finished.
+ */
+ nr_boost_reclaim = 0;
+ for (i = 0; i <= classzone_idx; i++) {
+ zone = pgdat->node_zones + i;
+ if (!managed_zone(zone))
+ continue;
+
+ nr_boost_reclaim += zone->watermark_boost;
+ zone_boosts[i] = zone->watermark_boost;
+ }
+ boosted = nr_boost_reclaim;
+
+restart:
+ sc.priority = DEF_PRIORITY;
do {
unsigned long nr_reclaimed = sc.nr_reclaimed;
bool raise_priority = true;
+ bool balanced;
bool ret;
sc.reclaim_idx = classzone_idx;
@@ -3561,13 +3610,40 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
}
/*
- * Only reclaim if there are no eligible zones. Note that
- * sc.reclaim_idx is not used as buffer_heads_over_limit may
- * have adjusted it.
+ * If the pgdat is imbalanced then ignore boosting and preserve
+ * the watermarks for a later time and restart. Note that the
+ * zone watermarks will be still reset at the end of balancing
+ * on the grounds that the normal reclaim should be enough to
+ * re-evaluate if boosting is required when kswapd next wakes.
*/
- if (pgdat_balanced(pgdat, sc.order, classzone_idx))
+ balanced = pgdat_balanced(pgdat, sc.order, classzone_idx);
+ if (!balanced && nr_boost_reclaim) {
+ nr_boost_reclaim = 0;
+ goto restart;
+ }
+
+ /*
+ * If boosting is not active then only reclaim if there are no
+ * eligible zones. Note that sc.reclaim_idx is not used as
+ * buffer_heads_over_limit may have adjusted it.
+ */
+ if (!nr_boost_reclaim && balanced)
goto out;
+ /* Limit the priority of boosting to avoid reclaim writeback */
+ if (nr_boost_reclaim && sc.priority == DEF_PRIORITY - 2)
+ raise_priority = false;
+
+ /*
+ * Do not writeback or swap pages for boosted reclaim. The
+ * intent is to relieve pressure not issue sub-optimal IO
+ * from reclaim context. If no pages are reclaimed, the
+ * reclaim will be aborted.
+ */
+ sc.may_writepage = !laptop_mode && !nr_boost_reclaim;
+ sc.may_swap = !nr_boost_reclaim;
+ sc.may_shrinkslab = !nr_boost_reclaim;
+
/*
* Do some background aging of the anon list, to give
* pages a chance to be referenced before reclaiming. All
@@ -3619,6 +3695,16 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
* progress in reclaiming pages
*/
nr_reclaimed = sc.nr_reclaimed - nr_reclaimed;
+ nr_boost_reclaim -= min(nr_boost_reclaim, nr_reclaimed);
+
+ /*
+ * If reclaim made no progress for a boost, stop reclaim as
+ * IO cannot be queued and it could be an infinite loop in
+ * extreme circumstances.
+ */
+ if (nr_boost_reclaim && !nr_reclaimed)
+ break;
+
if (raise_priority || !nr_reclaimed)
sc.priority--;
} while (sc.priority >= 1);
@@ -3627,6 +3713,28 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
pgdat->kswapd_failures++;
out:
+ /* If reclaim was boosted, account for the reclaim done in this pass */
+ if (boosted) {
+ unsigned long flags;
+
+ for (i = 0; i <= classzone_idx; i++) {
+ if (!zone_boosts[i])
+ continue;
+
+ /* Increments are under the zone lock */
+ zone = pgdat->node_zones + i;
+ spin_lock_irqsave(&zone->lock, flags);
+ zone->watermark_boost -= min(zone->watermark_boost, zone_boosts[i]);
+ spin_unlock_irqrestore(&zone->lock, flags);
+ }
+
+ /*
+ * As there is now likely space, wakeup kcompact to defragment
+ * pageblocks.
+ */
+ wakeup_kcompactd(pgdat, pageblock_order, classzone_idx);
+ }
+
snapshot_refaults(NULL, pgdat);
__fs_reclaim_release();
psi_memstall_leave(&pflags);
@@ -3855,7 +3963,8 @@ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order,
/* Hopeless node, leave it to direct reclaim if possible */
if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ||
- pgdat_balanced(pgdat, order, classzone_idx)) {
+ (pgdat_balanced(pgdat, order, classzone_idx) &&
+ !pgdat_watermark_boosted(pgdat, classzone_idx))) {
/*
* There may be plenty of free memory available, but it's too
* fragmented for high-order allocations. Wake up kcompactd
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 9c624595e904..83b30edc2f7f 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -227,7 +227,7 @@ int calculate_normal_threshold(struct zone *zone)
* 125 1024 10 16-32 GB 9
*/
- mem = zone->managed_pages >> (27 - PAGE_SHIFT);
+ mem = zone_managed_pages(zone) >> (27 - PAGE_SHIFT);
threshold = 2 * fls(num_online_cpus()) * (1 + fls(mem));
@@ -1569,7 +1569,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
high_wmark_pages(zone),
zone->spanned_pages,
zone->present_pages,
- zone->managed_pages);
+ zone_managed_pages(zone));
seq_printf(m,
"\n protection: (%ld",
diff --git a/mm/workingset.c b/mm/workingset.c
index d46f8c92aa2f..dcb994f2acc2 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -549,7 +549,7 @@ static int __init workingset_init(void)
* double the initial memory by using totalram_pages as-is.
*/
timestamp_bits = BITS_PER_LONG - EVICTION_SHIFT;
- max_order = fls_long(totalram_pages - 1);
+ max_order = fls_long(totalram_pages() - 1);
if (max_order > timestamp_bits)
bucket_order = max_order - timestamp_bits;
pr_info("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n",
diff --git a/mm/zswap.c b/mm/zswap.c
index cd91fd9d96b8..a4e4d36ec085 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -219,8 +219,8 @@ static const struct zpool_ops zswap_zpool_ops = {
static bool zswap_is_full(void)
{
- return totalram_pages * zswap_max_pool_percent / 100 <
- DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE);
+ return totalram_pages() * zswap_max_pool_percent / 100 <
+ DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE);
}
static void zswap_update_total_size(void)