summaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/cma.h2
-rw-r--r--mm/cma_debug.c11
-rw-r--r--mm/huge_memory.c7
-rw-r--r--mm/kasan/Makefile2
-rw-r--r--mm/kasan/kasan.c2
-rw-r--r--mm/kasan/kasan_init.c152
-rw-r--r--mm/kasan/report.c2
-rw-r--r--mm/memory-failure.c54
-rw-r--r--mm/memory.c20
-rw-r--r--mm/memory_hotplug.c13
-rw-r--r--mm/migrate.c8
-rw-r--r--mm/mmap.c4
-rw-r--r--mm/nommu.c12
-rw-r--r--mm/page-writeback.c4
-rw-r--r--mm/page_alloc.c76
-rw-r--r--mm/page_owner.c7
-rw-r--r--mm/shmem.c4
-rw-r--r--mm/slab.c4
-rw-r--r--mm/slab_common.c3
-rw-r--r--mm/slub.c2
-rw-r--r--mm/vmscan.c16
21 files changed, 306 insertions, 99 deletions
diff --git a/mm/cma.h b/mm/cma.h
index 1132d733556d..17c75a4246c8 100644
--- a/mm/cma.h
+++ b/mm/cma.h
@@ -16,7 +16,7 @@ struct cma {
extern struct cma cma_areas[MAX_CMA_AREAS];
extern unsigned cma_area_count;
-static unsigned long cma_bitmap_maxno(struct cma *cma)
+static inline unsigned long cma_bitmap_maxno(struct cma *cma)
{
return cma->count >> cma->order_per_bit;
}
diff --git a/mm/cma_debug.c b/mm/cma_debug.c
index 7621ee34daa0..f8e4b60db167 100644
--- a/mm/cma_debug.c
+++ b/mm/cma_debug.c
@@ -39,7 +39,7 @@ static int cma_used_get(void *data, u64 *val)
mutex_lock(&cma->lock);
/* pages counter is smaller than sizeof(int) */
- used = bitmap_weight(cma->bitmap, (int)cma->count);
+ used = bitmap_weight(cma->bitmap, (int)cma_bitmap_maxno(cma));
mutex_unlock(&cma->lock);
*val = (u64)used << cma->order_per_bit;
@@ -52,13 +52,14 @@ static int cma_maxchunk_get(void *data, u64 *val)
struct cma *cma = data;
unsigned long maxchunk = 0;
unsigned long start, end = 0;
+ unsigned long bitmap_maxno = cma_bitmap_maxno(cma);
mutex_lock(&cma->lock);
for (;;) {
- start = find_next_zero_bit(cma->bitmap, cma->count, end);
+ start = find_next_zero_bit(cma->bitmap, bitmap_maxno, end);
if (start >= cma->count)
break;
- end = find_next_bit(cma->bitmap, cma->count, start);
+ end = find_next_bit(cma->bitmap, bitmap_maxno, start);
maxchunk = max(end - start, maxchunk);
}
mutex_unlock(&cma->lock);
@@ -170,10 +171,10 @@ static void cma_debugfs_add_one(struct cma *cma, int idx)
tmp = debugfs_create_dir(name, cma_debugfs_root);
- debugfs_create_file("alloc", S_IWUSR, cma_debugfs_root, cma,
+ debugfs_create_file("alloc", S_IWUSR, tmp, cma,
&cma_alloc_fops);
- debugfs_create_file("free", S_IWUSR, cma_debugfs_root, cma,
+ debugfs_create_file("free", S_IWUSR, tmp, cma,
&cma_free_fops);
debugfs_create_file("base_pfn", S_IRUGO, tmp,
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index c107094f79ba..097c7a4bfbd9 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1676,12 +1676,7 @@ static void __split_huge_page_refcount(struct page *page,
/* after clearing PageTail the gup refcount can be released */
smp_mb__after_atomic();
- /*
- * retain hwpoison flag of the poisoned tail page:
- * fix for the unsuitable process killed on Guest Machine(KVM)
- * by the memory-failure.
- */
- page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP | __PG_HWPOISON;
+ page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
page_tail->flags |= (page->flags &
((1L << PG_referenced) |
(1L << PG_swapbacked) |
diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile
index bd837b8c2f41..64710148941e 100644
--- a/mm/kasan/Makefile
+++ b/mm/kasan/Makefile
@@ -5,4 +5,4 @@ CFLAGS_REMOVE_kasan.o = -pg
# see: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63533
CFLAGS_kasan.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector)
-obj-y := kasan.o report.o
+obj-y := kasan.o report.o kasan_init.o
diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c
index 6c513a63ea84..7b28e9cdf1c7 100644
--- a/mm/kasan/kasan.c
+++ b/mm/kasan/kasan.c
@@ -2,7 +2,7 @@
* This file contains shadow memory manipulation code.
*
* Copyright (c) 2014 Samsung Electronics Co., Ltd.
- * Author: Andrey Ryabinin <a.ryabinin@samsung.com>
+ * Author: Andrey Ryabinin <ryabinin.a.a@gmail.com>
*
* Some of code borrowed from https://github.com/xairy/linux by
* Andrey Konovalov <adech.fo@gmail.com>
diff --git a/mm/kasan/kasan_init.c b/mm/kasan/kasan_init.c
new file mode 100644
index 000000000000..3f9a41cf0ac6
--- /dev/null
+++ b/mm/kasan/kasan_init.c
@@ -0,0 +1,152 @@
+/*
+ * This file contains some kasan initialization code.
+ *
+ * Copyright (c) 2015 Samsung Electronics Co., Ltd.
+ * Author: Andrey Ryabinin <ryabinin.a.a@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+
+#include <linux/bootmem.h>
+#include <linux/init.h>
+#include <linux/kasan.h>
+#include <linux/kernel.h>
+#include <linux/memblock.h>
+#include <linux/pfn.h>
+
+#include <asm/page.h>
+#include <asm/pgalloc.h>
+
+/*
+ * This page serves two purposes:
+ * - It used as early shadow memory. The entire shadow region populated
+ * with this page, before we will be able to setup normal shadow memory.
+ * - Latter it reused it as zero shadow to cover large ranges of memory
+ * that allowed to access, but not handled by kasan (vmalloc/vmemmap ...).
+ */
+unsigned char kasan_zero_page[PAGE_SIZE] __page_aligned_bss;
+
+#if CONFIG_PGTABLE_LEVELS > 3
+pud_t kasan_zero_pud[PTRS_PER_PUD] __page_aligned_bss;
+#endif
+#if CONFIG_PGTABLE_LEVELS > 2
+pmd_t kasan_zero_pmd[PTRS_PER_PMD] __page_aligned_bss;
+#endif
+pte_t kasan_zero_pte[PTRS_PER_PTE] __page_aligned_bss;
+
+static __init void *early_alloc(size_t size, int node)
+{
+ return memblock_virt_alloc_try_nid(size, size, __pa(MAX_DMA_ADDRESS),
+ BOOTMEM_ALLOC_ACCESSIBLE, node);
+}
+
+static void __init zero_pte_populate(pmd_t *pmd, unsigned long addr,
+ unsigned long end)
+{
+ pte_t *pte = pte_offset_kernel(pmd, addr);
+ pte_t zero_pte;
+
+ zero_pte = pfn_pte(PFN_DOWN(__pa(kasan_zero_page)), PAGE_KERNEL);
+ zero_pte = pte_wrprotect(zero_pte);
+
+ while (addr + PAGE_SIZE <= end) {
+ set_pte_at(&init_mm, addr, pte, zero_pte);
+ addr += PAGE_SIZE;
+ pte = pte_offset_kernel(pmd, addr);
+ }
+}
+
+static void __init zero_pmd_populate(pud_t *pud, unsigned long addr,
+ unsigned long end)
+{
+ pmd_t *pmd = pmd_offset(pud, addr);
+ unsigned long next;
+
+ do {
+ next = pmd_addr_end(addr, end);
+
+ if (IS_ALIGNED(addr, PMD_SIZE) && end - addr >= PMD_SIZE) {
+ pmd_populate_kernel(&init_mm, pmd, kasan_zero_pte);
+ continue;
+ }
+
+ if (pmd_none(*pmd)) {
+ pmd_populate_kernel(&init_mm, pmd,
+ early_alloc(PAGE_SIZE, NUMA_NO_NODE));
+ }
+ zero_pte_populate(pmd, addr, next);
+ } while (pmd++, addr = next, addr != end);
+}
+
+static void __init zero_pud_populate(pgd_t *pgd, unsigned long addr,
+ unsigned long end)
+{
+ pud_t *pud = pud_offset(pgd, addr);
+ unsigned long next;
+
+ do {
+ next = pud_addr_end(addr, end);
+ if (IS_ALIGNED(addr, PUD_SIZE) && end - addr >= PUD_SIZE) {
+ pmd_t *pmd;
+
+ pud_populate(&init_mm, pud, kasan_zero_pmd);
+ pmd = pmd_offset(pud, addr);
+ pmd_populate_kernel(&init_mm, pmd, kasan_zero_pte);
+ continue;
+ }
+
+ if (pud_none(*pud)) {
+ pud_populate(&init_mm, pud,
+ early_alloc(PAGE_SIZE, NUMA_NO_NODE));
+ }
+ zero_pmd_populate(pud, addr, next);
+ } while (pud++, addr = next, addr != end);
+}
+
+/**
+ * kasan_populate_zero_shadow - populate shadow memory region with
+ * kasan_zero_page
+ * @shadow_start - start of the memory range to populate
+ * @shadow_end - end of the memory range to populate
+ */
+void __init kasan_populate_zero_shadow(const void *shadow_start,
+ const void *shadow_end)
+{
+ unsigned long addr = (unsigned long)shadow_start;
+ unsigned long end = (unsigned long)shadow_end;
+ pgd_t *pgd = pgd_offset_k(addr);
+ unsigned long next;
+
+ do {
+ next = pgd_addr_end(addr, end);
+
+ if (IS_ALIGNED(addr, PGDIR_SIZE) && end - addr >= PGDIR_SIZE) {
+ pud_t *pud;
+ pmd_t *pmd;
+
+ /*
+ * kasan_zero_pud should be populated with pmds
+ * at this moment.
+ * [pud,pmd]_populate*() below needed only for
+ * 3,2 - level page tables where we don't have
+ * puds,pmds, so pgd_populate(), pud_populate()
+ * is noops.
+ */
+ pgd_populate(&init_mm, pgd, kasan_zero_pud);
+ pud = pud_offset(pgd, addr);
+ pud_populate(&init_mm, pud, kasan_zero_pmd);
+ pmd = pmd_offset(pud, addr);
+ pmd_populate_kernel(&init_mm, pmd, kasan_zero_pte);
+ continue;
+ }
+
+ if (pgd_none(*pgd)) {
+ pgd_populate(&init_mm, pgd,
+ early_alloc(PAGE_SIZE, NUMA_NO_NODE));
+ }
+ zero_pud_populate(pgd, addr, next);
+ } while (pgd++, addr = next, addr != end);
+}
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
index 680ceedf810a..e07c94fbd0ac 100644
--- a/mm/kasan/report.c
+++ b/mm/kasan/report.c
@@ -2,7 +2,7 @@
* This file contains error reporting code.
*
* Copyright (c) 2014 Samsung Electronics Co., Ltd.
- * Author: Andrey Ryabinin <a.ryabinin@samsung.com>
+ * Author: Andrey Ryabinin <ryabinin.a.a@gmail.com>
*
* Some of code borrowed from https://github.com/xairy/linux by
* Andrey Konovalov <adech.fo@gmail.com>
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index c53543d89282..1f4446a90cef 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -909,6 +909,18 @@ int get_hwpoison_page(struct page *page)
* directly for tail pages.
*/
if (PageTransHuge(head)) {
+ /*
+ * Non anonymous thp exists only in allocation/free time. We
+ * can't handle such a case correctly, so let's give it up.
+ * This should be better than triggering BUG_ON when kernel
+ * tries to touch the "partially handled" page.
+ */
+ if (!PageAnon(head)) {
+ pr_err("MCE: %#lx: non anonymous thp\n",
+ page_to_pfn(page));
+ return 0;
+ }
+
if (get_page_unless_zero(head)) {
if (PageTail(page))
get_page(page);
@@ -1134,17 +1146,11 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
}
if (!PageHuge(p) && PageTransHuge(hpage)) {
- if (!PageAnon(hpage)) {
- pr_err("MCE: %#lx: non anonymous thp\n", pfn);
- if (TestClearPageHWPoison(p))
- atomic_long_sub(nr_pages, &num_poisoned_pages);
- put_page(p);
- if (p != hpage)
- put_page(hpage);
- return -EBUSY;
- }
- if (unlikely(split_huge_page(hpage))) {
- pr_err("MCE: %#lx: thp split failed\n", pfn);
+ if (!PageAnon(hpage) || unlikely(split_huge_page(hpage))) {
+ if (!PageAnon(hpage))
+ pr_err("MCE: %#lx: non anonymous thp\n", pfn);
+ else
+ pr_err("MCE: %#lx: thp split failed\n", pfn);
if (TestClearPageHWPoison(p))
atomic_long_sub(nr_pages, &num_poisoned_pages);
put_page(p);
@@ -1209,9 +1215,9 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
if (!PageHWPoison(p)) {
printk(KERN_ERR "MCE %#lx: just unpoisoned\n", pfn);
atomic_long_sub(nr_pages, &num_poisoned_pages);
+ unlock_page(hpage);
put_page(hpage);
- res = 0;
- goto out;
+ return 0;
}
if (hwpoison_filter(p)) {
if (TestClearPageHWPoison(p))
@@ -1535,6 +1541,8 @@ static int get_any_page(struct page *page, unsigned long pfn, int flags)
*/
ret = __get_any_page(page, pfn, 0);
if (!PageLRU(page)) {
+ /* Drop page reference which is from __get_any_page() */
+ put_page(page);
pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n",
pfn, page->flags);
return -EIO;
@@ -1564,13 +1572,12 @@ static int soft_offline_huge_page(struct page *page, int flags)
unlock_page(hpage);
ret = isolate_huge_page(hpage, &pagelist);
- if (ret) {
- /*
- * get_any_page() and isolate_huge_page() takes a refcount each,
- * so need to drop one here.
- */
- put_page(hpage);
- } else {
+ /*
+ * get_any_page() and isolate_huge_page() takes a refcount each,
+ * so need to drop one here.
+ */
+ put_page(hpage);
+ if (!ret) {
pr_info("soft offline: %#lx hugepage failed to isolate\n", pfn);
return -EBUSY;
}
@@ -1656,6 +1663,8 @@ static int __soft_offline_page(struct page *page, int flags)
inc_zone_page_state(page, NR_ISOLATED_ANON +
page_is_file_cache(page));
list_add(&page->lru, &pagelist);
+ if (!TestSetPageHWPoison(page))
+ atomic_long_inc(&num_poisoned_pages);
ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
MIGRATE_SYNC, MR_MEMORY_FAILURE);
if (ret) {
@@ -1670,9 +1679,8 @@ static int __soft_offline_page(struct page *page, int flags)
pfn, ret, page->flags);
if (ret > 0)
ret = -EIO;
- } else {
- SetPageHWPoison(page);
- atomic_long_inc(&num_poisoned_pages);
+ if (TestClearPageHWPoison(page))
+ atomic_long_dec(&num_poisoned_pages);
}
} else {
pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n",
diff --git a/mm/memory.c b/mm/memory.c
index a84fbb772034..388dcf9aa283 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2670,6 +2670,10 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
pte_unmap(page_table);
+ /* File mapping without ->vm_ops ? */
+ if (vma->vm_flags & VM_SHARED)
+ return VM_FAULT_SIGBUS;
+
/* Check if we need to add a guard page to the stack */
if (check_stack_guard_page(vma, address) < 0)
return VM_FAULT_SIGSEGV;
@@ -3099,6 +3103,9 @@ static int do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
- vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
pte_unmap(page_table);
+ /* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */
+ if (!vma->vm_ops->fault)
+ return VM_FAULT_SIGBUS;
if (!(flags & FAULT_FLAG_WRITE))
return do_read_fault(mm, vma, address, pmd, pgoff, flags,
orig_pte);
@@ -3244,13 +3251,12 @@ static int handle_pte_fault(struct mm_struct *mm,
barrier();
if (!pte_present(entry)) {
if (pte_none(entry)) {
- if (vma->vm_ops) {
- if (likely(vma->vm_ops->fault))
- return do_fault(mm, vma, address, pte,
- pmd, flags, entry);
- }
- return do_anonymous_page(mm, vma, address,
- pte, pmd, flags);
+ if (vma->vm_ops)
+ return do_fault(mm, vma, address, pte, pmd,
+ flags, entry);
+
+ return do_anonymous_page(mm, vma, address, pte, pmd,
+ flags);
}
return do_swap_page(mm, vma, address,
pte, pmd, flags, entry);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 26fbba7d888f..6da82bcb0a8b 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -446,7 +446,7 @@ static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn)
int nr_pages = PAGES_PER_SECTION;
int nid = pgdat->node_id;
int zone_type;
- unsigned long flags;
+ unsigned long flags, pfn;
int ret;
zone_type = zone - pgdat->node_zones;
@@ -461,6 +461,14 @@ static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn)
pgdat_resize_unlock(zone->zone_pgdat, &flags);
memmap_init_zone(nr_pages, nid, zone_type,
phys_start_pfn, MEMMAP_HOTPLUG);
+
+ /* online_page_range is called later and expects pages reserved */
+ for (pfn = phys_start_pfn; pfn < phys_start_pfn + nr_pages; pfn++) {
+ if (!pfn_valid(pfn))
+ continue;
+
+ SetPageReserved(pfn_to_page(pfn));
+ }
return 0;
}
@@ -1269,6 +1277,7 @@ int __ref add_memory(int nid, u64 start, u64 size)
/* create new memmap entry */
firmware_map_add_hotplug(start, start + size, "System RAM");
+ memblock_add_node(start, size, nid);
goto out;
@@ -2005,6 +2014,8 @@ void __ref remove_memory(int nid, u64 start, u64 size)
/* remove memmap entry */
firmware_map_remove(start, start + size, "System RAM");
+ memblock_free(start, size);
+ memblock_remove(start, size);
arch_remove_memory(start, size);
diff --git a/mm/migrate.c b/mm/migrate.c
index ee401e4e5ef1..eb4267107d1f 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -880,7 +880,8 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
/* Establish migration ptes or remove ptes */
if (page_mapped(page)) {
try_to_unmap(page,
- TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
+ TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS|
+ TTU_IGNORE_HWPOISON);
page_was_mapped = 1;
}
@@ -950,7 +951,10 @@ out:
list_del(&page->lru);
dec_zone_page_state(page, NR_ISOLATED_ANON +
page_is_file_cache(page));
- if (reason != MR_MEMORY_FAILURE)
+ /* Soft-offlined page shouldn't go through lru cache list */
+ if (reason == MR_MEMORY_FAILURE)
+ put_page(page);
+ else
putback_lru_page(page);
}
diff --git a/mm/mmap.c b/mm/mmap.c
index aa632ade2be7..f126923ce683 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1268,7 +1268,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
* mounted, in which case we dont add PROT_EXEC.)
*/
if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC))
- if (!(file && (file->f_path.mnt->mnt_flags & MNT_NOEXEC)))
+ if (!(file && path_noexec(&file->f_path)))
prot |= PROT_EXEC;
if (!(flags & MAP_FIXED))
@@ -1337,7 +1337,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
case MAP_PRIVATE:
if (!(file->f_mode & FMODE_READ))
return -EACCES;
- if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) {
+ if (path_noexec(&file->f_path)) {
if (vm_flags & VM_EXEC)
return -EPERM;
vm_flags &= ~VM_MAYEXEC;
diff --git a/mm/nommu.c b/mm/nommu.c
index 58ea3643b9e9..1cc0709fcaa5 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -324,12 +324,12 @@ long vwrite(char *buf, char *addr, unsigned long count)
}
/*
- * vmalloc - allocate virtually continguos memory
+ * vmalloc - allocate virtually contiguous memory
*
* @size: allocation size
*
* Allocate enough pages to cover @size from the page level
- * allocator and map them into continguos kernel virtual space.
+ * allocator and map them into contiguous kernel virtual space.
*
* For tight control over page level allocator and protection flags
* use __vmalloc() instead.
@@ -341,12 +341,12 @@ void *vmalloc(unsigned long size)
EXPORT_SYMBOL(vmalloc);
/*
- * vzalloc - allocate virtually continguos memory with zero fill
+ * vzalloc - allocate virtually contiguous memory with zero fill
*
* @size: allocation size
*
* Allocate enough pages to cover @size from the page level
- * allocator and map them into continguos kernel virtual space.
+ * allocator and map them into contiguous kernel virtual space.
* The memory allocated is set to zero.
*
* For tight control over page level allocator and protection flags
@@ -420,7 +420,7 @@ void *vmalloc_exec(unsigned long size)
* @size: allocation size
*
* Allocate enough 32bit PA addressable pages to cover @size from the
- * page level allocator and map them into continguos kernel virtual space.
+ * page level allocator and map them into contiguous kernel virtual space.
*/
void *vmalloc_32(unsigned long size)
{
@@ -1035,7 +1035,7 @@ static int validate_mmap_request(struct file *file,
/* handle executable mappings and implied executable
* mappings */
- if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) {
+ if (path_noexec(&file->f_path)) {
if (prot & PROT_EXEC)
return -EPERM;
} else if ((prot & PROT_READ) && !(prot & PROT_EXEC)) {
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 22cddd3e5de8..5cccc127ef81 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2063,10 +2063,10 @@ static struct notifier_block ratelimit_nb = {
*/
void __init page_writeback_init(void)
{
+ BUG_ON(wb_domain_init(&global_wb_domain, GFP_KERNEL));
+
writeback_set_ratelimit();
register_cpu_notifier(&ratelimit_nb);
-
- BUG_ON(wb_domain_init(&global_wb_domain, GFP_KERNEL));
}
/**
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 506eac8b38af..5b5240b7f642 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -18,7 +18,6 @@
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/interrupt.h>
-#include <linux/rwsem.h>
#include <linux/pagemap.h>
#include <linux/jiffies.h>
#include <linux/bootmem.h>
@@ -246,9 +245,7 @@ static inline void reset_deferred_meminit(pg_data_t *pgdat)
/* Returns true if the struct page for the pfn is uninitialised */
static inline bool __meminit early_page_uninitialised(unsigned long pfn)
{
- int nid = early_pfn_to_nid(pfn);
-
- if (pfn >= NODE_DATA(nid)->first_deferred_pfn)
+ if (pfn >= NODE_DATA(early_pfn_to_nid(pfn))->first_deferred_pfn)
return true;
return false;
@@ -983,21 +980,21 @@ static void __init __free_pages_boot_core(struct page *page,
#if defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID) || \
defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP)
-/* Only safe to use early in boot when initialisation is single-threaded */
+
static struct mminit_pfnnid_cache early_pfnnid_cache __meminitdata;
int __meminit early_pfn_to_nid(unsigned long pfn)
{
+ static DEFINE_SPINLOCK(early_pfn_lock);
int nid;
- /* The system will behave unpredictably otherwise */
- BUG_ON(system_state != SYSTEM_BOOTING);
-
+ spin_lock(&early_pfn_lock);
nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache);
- if (nid >= 0)
- return nid;
- /* just returns 0 */
- return 0;
+ if (nid < 0)
+ nid = 0;
+ spin_unlock(&early_pfn_lock);
+
+ return nid;
}
#endif
@@ -1062,7 +1059,15 @@ static void __init deferred_free_range(struct page *page,
__free_pages_boot_core(page, pfn, 0);
}
-static __initdata DECLARE_RWSEM(pgdat_init_rwsem);
+/* Completion tracking for deferred_init_memmap() threads */
+static atomic_t pgdat_init_n_undone __initdata;
+static __initdata DECLARE_COMPLETION(pgdat_init_all_done_comp);
+
+static inline void __init pgdat_init_report_one_done(void)
+{
+ if (atomic_dec_and_test(&pgdat_init_n_undone))
+ complete(&pgdat_init_all_done_comp);
+}
/* Initialise remaining memory on a node */
static int __init deferred_init_memmap(void *data)
@@ -1079,7 +1084,7 @@ static int __init deferred_init_memmap(void *data)
const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
if (first_init_pfn == ULONG_MAX) {
- up_read(&pgdat_init_rwsem);
+ pgdat_init_report_one_done();
return 0;
}
@@ -1179,7 +1184,8 @@ free_range:
pr_info("node %d initialised, %lu pages in %ums\n", nid, nr_pages,
jiffies_to_msecs(jiffies - start));
- up_read(&pgdat_init_rwsem);
+
+ pgdat_init_report_one_done();
return 0;
}
@@ -1187,14 +1193,17 @@ void __init page_alloc_init_late(void)
{
int nid;
+ /* There will be num_node_state(N_MEMORY) threads */
+ atomic_set(&pgdat_init_n_undone, num_node_state(N_MEMORY));
for_each_node_state(nid, N_MEMORY) {
- down_read(&pgdat_init_rwsem);
kthread_run(deferred_init_memmap, NODE_DATA(nid), "pgdatinit%d", nid);
}
/* Block until all are initialised */
- down_write(&pgdat_init_rwsem);
- up_write(&pgdat_init_rwsem);
+ wait_for_completion(&pgdat_init_all_done_comp);
+
+ /* Reinit limits that are based on free pages after the kernel is up */
+ files_maxfiles_init();
}
#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
@@ -1287,6 +1296,10 @@ static inline int check_new_page(struct page *page)
bad_reason = "non-NULL mapping";
if (unlikely(atomic_read(&page->_count) != 0))
bad_reason = "nonzero _count";
+ if (unlikely(page->flags & __PG_HWPOISON)) {
+ bad_reason = "HWPoisoned (hardware-corrupted)";
+ bad_flags = __PG_HWPOISON;
+ }
if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_PREP)) {
bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag set";
bad_flags = PAGE_FLAGS_CHECK_AT_PREP;
@@ -1330,12 +1343,15 @@ static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
set_page_owner(page, order, gfp_flags);
/*
- * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was necessary to
+ * page is set pfmemalloc when ALLOC_NO_WATERMARKS was necessary to
* allocate the page. The expectation is that the caller is taking
* steps that will free more memory. The caller should avoid the page
* being used for !PFMEMALLOC purposes.
*/
- page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS);
+ if (alloc_flags & ALLOC_NO_WATERMARKS)
+ set_page_pfmemalloc(page);
+ else
+ clear_page_pfmemalloc(page);
return 0;
}
@@ -1950,6 +1966,7 @@ void free_hot_cold_page_list(struct list_head *list, bool cold)
void split_page(struct page *page, unsigned int order)
{
int i;
+ gfp_t gfp_mask;
VM_BUG_ON_PAGE(PageCompound(page), page);
VM_BUG_ON_PAGE(!page_count(page), page);
@@ -1963,10 +1980,11 @@ void split_page(struct page *page, unsigned int order)
split_page(virt_to_page(page[0].shadow), order);
#endif
- set_page_owner(page, 0, 0);
+ gfp_mask = get_page_owner_gfp(page);
+ set_page_owner(page, 0, gfp_mask);
for (i = 1; i < (1 << order); i++) {
set_page_refcounted(page + i);
- set_page_owner(page + i, 0, 0);
+ set_page_owner(page + i, 0, gfp_mask);
}
}
EXPORT_SYMBOL_GPL(split_page);
@@ -1996,6 +2014,8 @@ int __isolate_free_page(struct page *page, unsigned int order)
zone->free_area[order].nr_free--;
rmv_page_order(page);
+ set_page_owner(page, order, __GFP_MOVABLE);
+
/* Set the pageblock if the isolated page is at least a pageblock */
if (order >= pageblock_order - 1) {
struct page *endpage = page + (1 << order) - 1;
@@ -2007,7 +2027,7 @@ int __isolate_free_page(struct page *page, unsigned int order)
}
}
- set_page_owner(page, order, 0);
+
return 1UL << order;
}
@@ -3328,7 +3348,7 @@ refill:
atomic_add(size - 1, &page->_count);
/* reset page count bias and offset to start of new frag */
- nc->pfmemalloc = page->pfmemalloc;
+ nc->pfmemalloc = page_is_pfmemalloc(page);
nc->pagecnt_bias = size;
nc->offset = size;
}
@@ -5043,6 +5063,10 @@ static unsigned long __meminit zone_spanned_pages_in_node(int nid,
{
unsigned long zone_start_pfn, zone_end_pfn;
+ /* When hotadd a new node, the node should be empty */
+ if (!node_start_pfn && !node_end_pfn)
+ return 0;
+
/* Get the start and end of the zone */
zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
@@ -5106,6 +5130,10 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid,
unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
unsigned long zone_start_pfn, zone_end_pfn;
+ /* When hotadd a new node, the node should be empty */
+ if (!node_start_pfn && !node_end_pfn)
+ return 0;
+
zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high);
zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high);
diff --git a/mm/page_owner.c b/mm/page_owner.c
index bd5f842b56d2..983c3a10fa07 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -76,6 +76,13 @@ void __set_page_owner(struct page *page, unsigned int order, gfp_t gfp_mask)
__set_bit(PAGE_EXT_OWNER, &page_ext->flags);
}
+gfp_t __get_page_owner_gfp(struct page *page)
+{
+ struct page_ext *page_ext = lookup_page_ext(page);
+
+ return page_ext->gfp_mask;
+}
+
static ssize_t
print_page_owner(char __user *buf, size_t count, unsigned long pfn,
struct page *page, struct page_ext *page_ext)
diff --git a/mm/shmem.c b/mm/shmem.c
index 4caf8ed24d65..dbe0c1e8349c 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -3363,8 +3363,8 @@ put_path:
* shmem_kernel_file_setup - get an unlinked file living in tmpfs which must be
* kernel internal. There will be NO LSM permission checks against the
* underlying inode. So users of this interface must do LSM checks at a
- * higher layer. The one user is the big_key implementation. LSM checks
- * are provided at the key level rather than the inode level.
+ * higher layer. The users are the big_key and shm implementations. LSM
+ * checks are provided at the key or shm level rather than the inode.
* @name: name for dentry (to be seen in /proc/<pid>/maps
* @size: size to be set for the file
* @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
diff --git a/mm/slab.c b/mm/slab.c
index 200e22412a16..bbd0b47dc6a9 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1603,7 +1603,7 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags,
}
/* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */
- if (unlikely(page->pfmemalloc))
+ if (page_is_pfmemalloc(page))
pfmemalloc_active = true;
nr_pages = (1 << cachep->gfporder);
@@ -1614,7 +1614,7 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags,
add_zone_page_state(page_zone(page),
NR_SLAB_UNRECLAIMABLE, nr_pages);
__SetPageSlab(page);
- if (page->pfmemalloc)
+ if (page_is_pfmemalloc(page))
SetPageSlabPfmemalloc(page);
if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) {
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 3e5f8f29c286..86831105a09f 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -37,8 +37,7 @@ struct kmem_cache *kmem_cache;
SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE | \
SLAB_FAILSLAB)
-#define SLAB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \
- SLAB_CACHE_DMA | SLAB_NOTRACK)
+#define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | SLAB_NOTRACK)
/*
* Merge control. If this is set then no merging of slab caches will occur.
diff --git a/mm/slub.c b/mm/slub.c
index 816df0016555..f68c0e50f3c0 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1427,7 +1427,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
inc_slabs_node(s, page_to_nid(page), page->objects);
page->slab_cache = s;
__SetPageSlab(page);
- if (page->pfmemalloc)
+ if (page_is_pfmemalloc(page))
SetPageSlabPfmemalloc(page);
start = page_address(page);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index e61445dce04e..8286938c70de 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -973,22 +973,18 @@ static unsigned long shrink_page_list(struct list_head *page_list,
* caller can stall after page list has been processed.
*
* 2) Global or new memcg reclaim encounters a page that is
- * not marked for immediate reclaim or the caller does not
- * have __GFP_IO. In this case mark the page for immediate
+ * not marked for immediate reclaim, or the caller does not
+ * have __GFP_FS (or __GFP_IO if it's simply going to swap,
+ * not to fs). In this case mark the page for immediate
* reclaim and continue scanning.
*
- * __GFP_IO is checked because a loop driver thread might
+ * Require may_enter_fs because we would wait on fs, which
+ * may not have submitted IO yet. And the loop driver might
* enter reclaim, and deadlock if it waits on a page for
* which it is needed to do the write (loop masks off
* __GFP_IO|__GFP_FS for this reason); but more thought
* would probably show more reasons.
*
- * Don't require __GFP_FS, since we're not going into the
- * FS, just waiting on its writeback completion. Worryingly,
- * ext4 gfs2 and xfs allocate pages with
- * grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so testing
- * may_enter_fs here is liable to OOM on them.
- *
* 3) Legacy memcg encounters a page that is not already marked
* PageReclaim. memcg does not have any dirty pages
* throttling so we could easily OOM just because too many
@@ -1005,7 +1001,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
/* Case 2 above */
} else if (sane_reclaim(sc) ||
- !PageReclaim(page) || !(sc->gfp_mask & __GFP_IO)) {
+ !PageReclaim(page) || !may_enter_fs) {
/*
* This is slightly racy - end_page_writeback()
* might have just cleared PageReclaim, then