summaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/huge_memory.c4
-rw-r--r--mm/hugetlb.c2
-rw-r--r--mm/kasan/Makefile9
-rw-r--r--mm/kasan/common.c43
-rw-r--r--mm/kasan/report.c10
-rw-r--r--mm/kmemleak.c24
-rw-r--r--mm/madvise.c2
-rw-r--r--mm/memblock.c12
-rw-r--r--mm/memory.c8
-rw-r--r--mm/memory_hotplug.c1
-rw-r--r--mm/mmu_gather.c129
-rw-r--r--mm/page_alloc.c34
-rw-r--r--mm/page_owner.c82
-rw-r--r--mm/percpu-internal.h15
-rw-r--r--mm/percpu-km.c2
-rw-r--r--mm/percpu-stats.c5
-rw-r--r--mm/percpu.c549
-rw-r--r--mm/shmem.c5
-rw-r--r--mm/slab.c48
-rw-r--r--mm/slub.c21
-rw-r--r--mm/vmalloc.c113
-rw-r--r--mm/vmscan.c2
22 files changed, 681 insertions, 439 deletions
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 165ea46bf149..b6a34b32d8ac 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1677,7 +1677,7 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
struct mm_struct *mm = tlb->mm;
bool ret = false;
- tlb_remove_check_page_size_change(tlb, HPAGE_PMD_SIZE);
+ tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
ptl = pmd_trans_huge_lock(pmd, vma);
if (!ptl)
@@ -1753,7 +1753,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
pmd_t orig_pmd;
spinlock_t *ptl;
- tlb_remove_check_page_size_change(tlb, HPAGE_PMD_SIZE);
+ tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
ptl = __pmd_trans_huge_lock(pmd, vma);
if (!ptl)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 6cdc7b2d9100..641cedfc8c0f 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3353,7 +3353,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
* This is a hugetlb vma, all the pte entries should point
* to huge page.
*/
- tlb_remove_check_page_size_change(tlb, sz);
+ tlb_change_page_size(tlb, sz);
tlb_start_vma(tlb, vma);
/*
diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile
index 5d1065efbd47..08b43de2383b 100644
--- a/mm/kasan/Makefile
+++ b/mm/kasan/Makefile
@@ -2,18 +2,21 @@
KASAN_SANITIZE := n
UBSAN_SANITIZE_common.o := n
UBSAN_SANITIZE_generic.o := n
+UBSAN_SANITIZE_generic_report.o := n
UBSAN_SANITIZE_tags.o := n
KCOV_INSTRUMENT := n
-CFLAGS_REMOVE_common.o = -pg
-CFLAGS_REMOVE_generic.o = -pg
-CFLAGS_REMOVE_tags.o = -pg
+CFLAGS_REMOVE_common.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_generic.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_generic_report.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_tags.o = $(CC_FLAGS_FTRACE)
# Function splitter causes unnecessary splits in __asan_load1/__asan_store1
# see: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63533
CFLAGS_common.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector)
CFLAGS_generic.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector)
+CFLAGS_generic_report.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector)
CFLAGS_tags.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector)
obj-$(CONFIG_KASAN) := common.o init.o report.o
diff --git a/mm/kasan/common.c b/mm/kasan/common.c
index 80bbe62b16cd..36afcf64e016 100644
--- a/mm/kasan/common.c
+++ b/mm/kasan/common.c
@@ -36,6 +36,7 @@
#include <linux/types.h>
#include <linux/vmalloc.h>
#include <linux/bug.h>
+#include <linux/uaccess.h>
#include "kasan.h"
#include "../slab.h"
@@ -48,37 +49,28 @@ static inline int in_irqentry_text(unsigned long ptr)
ptr < (unsigned long)&__softirqentry_text_end);
}
-static inline void filter_irq_stacks(struct stack_trace *trace)
+static inline unsigned int filter_irq_stacks(unsigned long *entries,
+ unsigned int nr_entries)
{
- int i;
+ unsigned int i;
- if (!trace->nr_entries)
- return;
- for (i = 0; i < trace->nr_entries; i++)
- if (in_irqentry_text(trace->entries[i])) {
+ for (i = 0; i < nr_entries; i++) {
+ if (in_irqentry_text(entries[i])) {
/* Include the irqentry function into the stack. */
- trace->nr_entries = i + 1;
- break;
+ return i + 1;
}
+ }
+ return nr_entries;
}
static inline depot_stack_handle_t save_stack(gfp_t flags)
{
unsigned long entries[KASAN_STACK_DEPTH];
- struct stack_trace trace = {
- .nr_entries = 0,
- .entries = entries,
- .max_entries = KASAN_STACK_DEPTH,
- .skip = 0
- };
+ unsigned int nr_entries;
- save_stack_trace(&trace);
- filter_irq_stacks(&trace);
- if (trace.nr_entries != 0 &&
- trace.entries[trace.nr_entries-1] == ULONG_MAX)
- trace.nr_entries--;
-
- return depot_save_stack(&trace, flags);
+ nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 0);
+ nr_entries = filter_irq_stacks(entries, nr_entries);
+ return stack_depot_save(entries, nr_entries, flags);
}
static inline void set_track(struct kasan_track *track, gfp_t flags)
@@ -614,6 +606,15 @@ void kasan_free_shadow(const struct vm_struct *vm)
vfree(kasan_mem_to_shadow(vm->addr));
}
+extern void __kasan_report(unsigned long addr, size_t size, bool is_write, unsigned long ip);
+
+void kasan_report(unsigned long addr, size_t size, bool is_write, unsigned long ip)
+{
+ unsigned long flags = user_access_save();
+ __kasan_report(addr, size, is_write, ip);
+ user_access_restore(flags);
+}
+
#ifdef CONFIG_MEMORY_HOTPLUG
static bool shadow_mapped(unsigned long addr)
{
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
index ca9418fe9232..03a443579386 100644
--- a/mm/kasan/report.c
+++ b/mm/kasan/report.c
@@ -100,10 +100,11 @@ static void print_track(struct kasan_track *track, const char *prefix)
{
pr_err("%s by task %u:\n", prefix, track->pid);
if (track->stack) {
- struct stack_trace trace;
+ unsigned long *entries;
+ unsigned int nr_entries;
- depot_fetch_stack(track->stack, &trace);
- print_stack_trace(&trace, 0);
+ nr_entries = stack_depot_fetch(track->stack, &entries);
+ stack_trace_print(entries, nr_entries, 0);
} else {
pr_err("(stack is not available)\n");
}
@@ -281,8 +282,7 @@ void kasan_report_invalid_free(void *object, unsigned long ip)
end_report(&flags);
}
-void kasan_report(unsigned long addr, size_t size,
- bool is_write, unsigned long ip)
+void __kasan_report(unsigned long addr, size_t size, bool is_write, unsigned long ip)
{
struct kasan_access_info info;
void *tagged_addr;
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 2e435b8142e5..e57bf810f798 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -410,11 +410,6 @@ static void print_unreferenced(struct seq_file *seq,
*/
static void dump_object_info(struct kmemleak_object *object)
{
- struct stack_trace trace;
-
- trace.nr_entries = object->trace_len;
- trace.entries = object->trace;
-
pr_notice("Object 0x%08lx (size %zu):\n",
object->pointer, object->size);
pr_notice(" comm \"%s\", pid %d, jiffies %lu\n",
@@ -424,7 +419,7 @@ static void dump_object_info(struct kmemleak_object *object)
pr_notice(" flags = 0x%x\n", object->flags);
pr_notice(" checksum = %u\n", object->checksum);
pr_notice(" backtrace:\n");
- print_stack_trace(&trace, 4);
+ stack_trace_print(object->trace, object->trace_len, 4);
}
/*
@@ -553,15 +548,7 @@ static struct kmemleak_object *find_and_remove_object(unsigned long ptr, int ali
*/
static int __save_stack_trace(unsigned long *trace)
{
- struct stack_trace stack_trace;
-
- stack_trace.max_entries = MAX_TRACE;
- stack_trace.nr_entries = 0;
- stack_trace.entries = trace;
- stack_trace.skip = 2;
- save_stack_trace(&stack_trace);
-
- return stack_trace.nr_entries;
+ return stack_trace_save(trace, MAX_TRACE, 2);
}
/*
@@ -2021,13 +2008,8 @@ early_param("kmemleak", kmemleak_boot_config);
static void __init print_log_trace(struct early_log *log)
{
- struct stack_trace trace;
-
- trace.nr_entries = log->trace_len;
- trace.entries = log->trace;
-
pr_notice("Early log backtrace:\n");
- print_stack_trace(&trace, 2);
+ stack_trace_print(log->trace, log->trace_len, 2);
}
/*
diff --git a/mm/madvise.c b/mm/madvise.c
index 21a7881a2db4..bb3a4554d5d5 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -328,7 +328,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
if (pmd_trans_unstable(pmd))
return 0;
- tlb_remove_check_page_size_change(tlb, PAGE_SIZE);
+ tlb_change_page_size(tlb, PAGE_SIZE);
orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
flush_tlb_batched_pending(mm);
arch_enter_lazy_mmu_mode();
diff --git a/mm/memblock.c b/mm/memblock.c
index e7665cf914b1..a48f520c2d01 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -702,7 +702,7 @@ int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size)
{
phys_addr_t end = base + size - 1;
- memblock_dbg("memblock_add: [%pa-%pa] %pF\n",
+ memblock_dbg("memblock_add: [%pa-%pa] %pS\n",
&base, &end, (void *)_RET_IP_);
return memblock_add_range(&memblock.memory, base, size, MAX_NUMNODES, 0);
@@ -821,7 +821,7 @@ int __init_memblock memblock_free(phys_addr_t base, phys_addr_t size)
{
phys_addr_t end = base + size - 1;
- memblock_dbg(" memblock_free: [%pa-%pa] %pF\n",
+ memblock_dbg(" memblock_free: [%pa-%pa] %pS\n",
&base, &end, (void *)_RET_IP_);
kmemleak_free_part_phys(base, size);
@@ -832,7 +832,7 @@ int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size)
{
phys_addr_t end = base + size - 1;
- memblock_dbg("memblock_reserve: [%pa-%pa] %pF\n",
+ memblock_dbg("memblock_reserve: [%pa-%pa] %pS\n",
&base, &end, (void *)_RET_IP_);
return memblock_add_range(&memblock.reserved, base, size, MAX_NUMNODES, 0);
@@ -1447,7 +1447,7 @@ void * __init memblock_alloc_try_nid_raw(
{
void *ptr;
- memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=%pa max_addr=%pa %pF\n",
+ memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=%pa max_addr=%pa %pS\n",
__func__, (u64)size, (u64)align, nid, &min_addr,
&max_addr, (void *)_RET_IP_);
@@ -1483,7 +1483,7 @@ void * __init memblock_alloc_try_nid(
{
void *ptr;
- memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=%pa max_addr=%pa %pF\n",
+ memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=%pa max_addr=%pa %pS\n",
__func__, (u64)size, (u64)align, nid, &min_addr,
&max_addr, (void *)_RET_IP_);
ptr = memblock_alloc_internal(size, align,
@@ -1508,7 +1508,7 @@ void __init __memblock_free_late(phys_addr_t base, phys_addr_t size)
phys_addr_t cursor, end;
end = base + size - 1;
- memblock_dbg("%s: [%pa-%pa] %pF\n",
+ memblock_dbg("%s: [%pa-%pa] %pS\n",
__func__, &base, &end, (void *)_RET_IP_);
kmemleak_free_part_phys(base, size);
cursor = PFN_UP(base);
diff --git a/mm/memory.c b/mm/memory.c
index ab650c21bccd..f7d962d7de19 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -356,7 +356,7 @@ void free_pgd_range(struct mmu_gather *tlb,
* We add page table cache pages with PAGE_SIZE,
* (see pte_free_tlb()), flush the tlb if we need
*/
- tlb_remove_check_page_size_change(tlb, PAGE_SIZE);
+ tlb_change_page_size(tlb, PAGE_SIZE);
pgd = pgd_offset(tlb->mm, addr);
do {
next = pgd_addr_end(addr, end);
@@ -519,7 +519,7 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
dump_page(page, "bad pte");
pr_alert("addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n",
(void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
- pr_alert("file:%pD fault:%pf mmap:%pf readpage:%pf\n",
+ pr_alert("file:%pD fault:%ps mmap:%ps readpage:%ps\n",
vma->vm_file,
vma->vm_ops ? vma->vm_ops->fault : NULL,
vma->vm_file ? vma->vm_file->f_op->mmap : NULL,
@@ -1046,7 +1046,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
pte_t *pte;
swp_entry_t entry;
- tlb_remove_check_page_size_change(tlb, PAGE_SIZE);
+ tlb_change_page_size(tlb, PAGE_SIZE);
again:
init_rss_vec(rss);
start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
@@ -1155,7 +1155,7 @@ again:
*/
if (force_flush) {
force_flush = 0;
- tlb_flush_mmu_free(tlb);
+ tlb_flush_mmu(tlb);
if (addr != end)
goto again;
}
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 0082d699be94..b236069ff0d8 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -874,6 +874,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
*/
mem = find_memory_block(__pfn_to_section(pfn));
nid = mem->nid;
+ put_device(&mem->dev);
/* associate pfn range with the zone */
zone = move_pfn_range(online_type, nid, pfn, nr_pages);
diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
index f2f03c655807..99740e1dd273 100644
--- a/mm/mmu_gather.c
+++ b/mm/mmu_gather.c
@@ -11,7 +11,7 @@
#include <asm/pgalloc.h>
#include <asm/tlb.h>
-#ifdef HAVE_GENERIC_MMU_GATHER
+#ifndef CONFIG_HAVE_MMU_GATHER_NO_GATHER
static bool tlb_next_batch(struct mmu_gather *tlb)
{
@@ -41,35 +41,10 @@ static bool tlb_next_batch(struct mmu_gather *tlb)
return true;
}
-void arch_tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
- unsigned long start, unsigned long end)
-{
- tlb->mm = mm;
-
- /* Is it from 0 to ~0? */
- tlb->fullmm = !(start | (end+1));
- tlb->need_flush_all = 0;
- tlb->local.next = NULL;
- tlb->local.nr = 0;
- tlb->local.max = ARRAY_SIZE(tlb->__pages);
- tlb->active = &tlb->local;
- tlb->batch_count = 0;
-
-#ifdef CONFIG_HAVE_RCU_TABLE_FREE
- tlb->batch = NULL;
-#endif
- tlb->page_size = 0;
-
- __tlb_reset_range(tlb);
-}
-
-void tlb_flush_mmu_free(struct mmu_gather *tlb)
+static void tlb_batch_pages_flush(struct mmu_gather *tlb)
{
struct mmu_gather_batch *batch;
-#ifdef CONFIG_HAVE_RCU_TABLE_FREE
- tlb_table_flush(tlb);
-#endif
for (batch = &tlb->local; batch && batch->nr; batch = batch->next) {
free_pages_and_swap_cache(batch->pages, batch->nr);
batch->nr = 0;
@@ -77,31 +52,10 @@ void tlb_flush_mmu_free(struct mmu_gather *tlb)
tlb->active = &tlb->local;
}
-void tlb_flush_mmu(struct mmu_gather *tlb)
-{
- tlb_flush_mmu_tlbonly(tlb);
- tlb_flush_mmu_free(tlb);
-}
-
-/* tlb_finish_mmu
- * Called at the end of the shootdown operation to free up any resources
- * that were required.
- */
-void arch_tlb_finish_mmu(struct mmu_gather *tlb,
- unsigned long start, unsigned long end, bool force)
+static void tlb_batch_list_free(struct mmu_gather *tlb)
{
struct mmu_gather_batch *batch, *next;
- if (force) {
- __tlb_reset_range(tlb);
- __tlb_adjust_range(tlb, start, end - start);
- }
-
- tlb_flush_mmu(tlb);
-
- /* keep the page table cache within bounds */
- check_pgt_cache();
-
for (batch = tlb->local.next; batch; batch = next) {
next = batch->next;
free_pages((unsigned long)batch, 0);
@@ -109,19 +63,15 @@ void arch_tlb_finish_mmu(struct mmu_gather *tlb,
tlb->local.next = NULL;
}
-/* __tlb_remove_page
- * Must perform the equivalent to __free_pte(pte_get_and_clear(ptep)), while
- * handling the additional races in SMP caused by other CPUs caching valid
- * mappings in their TLBs. Returns the number of free page slots left.
- * When out of page slots we must call tlb_flush_mmu().
- *returns true if the caller should flush.
- */
bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size)
{
struct mmu_gather_batch *batch;
VM_BUG_ON(!tlb->end);
+
+#ifdef CONFIG_HAVE_MMU_GATHER_PAGE_SIZE
VM_WARN_ON(tlb->page_size != page_size);
+#endif
batch = tlb->active;
/*
@@ -139,7 +89,7 @@ bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_
return false;
}
-#endif /* HAVE_GENERIC_MMU_GATHER */
+#endif /* HAVE_MMU_GATHER_NO_GATHER */
#ifdef CONFIG_HAVE_RCU_TABLE_FREE
@@ -152,7 +102,7 @@ bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_
*/
static inline void tlb_table_invalidate(struct mmu_gather *tlb)
{
-#ifdef CONFIG_HAVE_RCU_TABLE_INVALIDATE
+#ifndef CONFIG_HAVE_RCU_TABLE_NO_INVALIDATE
/*
* Invalidate page-table caches used by hardware walkers. Then we still
* need to RCU-sched wait while freeing the pages because software
@@ -193,7 +143,7 @@ static void tlb_remove_table_rcu(struct rcu_head *head)
free_page((unsigned long)batch);
}
-void tlb_table_flush(struct mmu_gather *tlb)
+static void tlb_table_flush(struct mmu_gather *tlb)
{
struct mmu_table_batch **batch = &tlb->batch;
@@ -225,6 +175,22 @@ void tlb_remove_table(struct mmu_gather *tlb, void *table)
#endif /* CONFIG_HAVE_RCU_TABLE_FREE */
+static void tlb_flush_mmu_free(struct mmu_gather *tlb)
+{
+#ifdef CONFIG_HAVE_RCU_TABLE_FREE
+ tlb_table_flush(tlb);
+#endif
+#ifndef CONFIG_HAVE_MMU_GATHER_NO_GATHER
+ tlb_batch_pages_flush(tlb);
+#endif
+}
+
+void tlb_flush_mmu(struct mmu_gather *tlb)
+{
+ tlb_flush_mmu_tlbonly(tlb);
+ tlb_flush_mmu_free(tlb);
+}
+
/**
* tlb_gather_mmu - initialize an mmu_gather structure for page-table tear-down
* @tlb: the mmu_gather structure to initialize
@@ -240,10 +206,40 @@ void tlb_remove_table(struct mmu_gather *tlb, void *table)
void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
unsigned long start, unsigned long end)
{
- arch_tlb_gather_mmu(tlb, mm, start, end);
+ tlb->mm = mm;
+
+ /* Is it from 0 to ~0? */
+ tlb->fullmm = !(start | (end+1));
+
+#ifndef CONFIG_HAVE_MMU_GATHER_NO_GATHER
+ tlb->need_flush_all = 0;
+ tlb->local.next = NULL;
+ tlb->local.nr = 0;
+ tlb->local.max = ARRAY_SIZE(tlb->__pages);
+ tlb->active = &tlb->local;
+ tlb->batch_count = 0;
+#endif
+
+#ifdef CONFIG_HAVE_RCU_TABLE_FREE
+ tlb->batch = NULL;
+#endif
+#ifdef CONFIG_HAVE_MMU_GATHER_PAGE_SIZE
+ tlb->page_size = 0;
+#endif
+
+ __tlb_reset_range(tlb);
inc_tlb_flush_pending(tlb->mm);
}
+/**
+ * tlb_finish_mmu - finish an mmu_gather structure
+ * @tlb: the mmu_gather structure to finish
+ * @start: start of the region that will be removed from the page-table
+ * @end: end of the region that will be removed from the page-table
+ *
+ * Called at the end of the shootdown operation to free up any resources that
+ * were required.
+ */
void tlb_finish_mmu(struct mmu_gather *tlb,
unsigned long start, unsigned long end)
{
@@ -254,8 +250,17 @@ void tlb_finish_mmu(struct mmu_gather *tlb,
* the TLB by observing pte_none|!pte_dirty, for example so flush TLB
* forcefully if we detect parallel PTE batching threads.
*/
- bool force = mm_tlb_flush_nested(tlb->mm);
+ if (mm_tlb_flush_nested(tlb->mm)) {
+ __tlb_reset_range(tlb);
+ __tlb_adjust_range(tlb, start, end - start);
+ }
- arch_tlb_finish_mmu(tlb, start, end, force);
+ tlb_flush_mmu(tlb);
+
+ /* keep the page table cache within bounds */
+ check_pgt_cache();
+#ifndef CONFIG_HAVE_MMU_GATHER_NO_GATHER
+ tlb_batch_list_free(tlb);
+#endif
dec_tlb_flush_pending(tlb->mm);
}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index c6ce20aaf80b..59661106da16 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -266,7 +266,20 @@ compound_page_dtor * const compound_page_dtors[] = {
int min_free_kbytes = 1024;
int user_min_free_kbytes = -1;
+#ifdef CONFIG_DISCONTIGMEM
+/*
+ * DiscontigMem defines memory ranges as separate pg_data_t even if the ranges
+ * are not on separate NUMA nodes. Functionally this works but with
+ * watermark_boost_factor, it can reclaim prematurely as the ranges can be
+ * quite small. By default, do not boost watermarks on discontigmem as in
+ * many cases very high-order allocations like THP are likely to be
+ * unsupported and the premature reclaim offsets the advantage of long-term
+ * fragmentation avoidance.
+ */
+int watermark_boost_factor __read_mostly;
+#else
int watermark_boost_factor __read_mostly = 15000;
+#endif
int watermark_scale_factor = 10;
static unsigned long nr_kernel_pages __initdata;
@@ -1131,7 +1144,9 @@ static __always_inline bool free_pages_prepare(struct page *page,
}
arch_free_page(page, order);
kernel_poison_pages(page, 1 << order, 0);
- kernel_map_pages(page, 1 << order, 0);
+ if (debug_pagealloc_enabled())
+ kernel_map_pages(page, 1 << order, 0);
+
kasan_free_nondeferred_pages(page, order);
return true;
@@ -2001,7 +2016,8 @@ inline void post_alloc_hook(struct page *page, unsigned int order,
set_page_refcounted(page);
arch_alloc_page(page, order);
- kernel_map_pages(page, 1 << order, 1);
+ if (debug_pagealloc_enabled())
+ kernel_map_pages(page, 1 << order, 1);
kasan_alloc_pages(page, order);
kernel_poison_pages(page, 1 << order, 1);
set_page_owner(page, order, gfp_flags);
@@ -3419,8 +3435,11 @@ alloc_flags_nofragment(struct zone *zone, gfp_t gfp_mask)
alloc_flags |= ALLOC_KSWAPD;
#ifdef CONFIG_ZONE_DMA32
+ if (!zone)
+ return alloc_flags;
+
if (zone_idx(zone) != ZONE_NORMAL)
- goto out;
+ return alloc_flags;
/*
* If ZONE_DMA32 exists, assume it is the one after ZONE_NORMAL and
@@ -3429,9 +3448,9 @@ alloc_flags_nofragment(struct zone *zone, gfp_t gfp_mask)
*/
BUILD_BUG_ON(ZONE_NORMAL - ZONE_DMA32 != 1);
if (nr_online_nodes > 1 && !populated_zone(--zone))
- goto out;
+ return alloc_flags;
-out:
+ alloc_flags |= ALLOC_NOFRAGMENT;
#endif /* CONFIG_ZONE_DMA32 */
return alloc_flags;
}
@@ -3773,11 +3792,6 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
memalloc_noreclaim_restore(noreclaim_flag);
psi_memstall_leave(&pflags);
- if (*compact_result <= COMPACT_INACTIVE) {
- WARN_ON_ONCE(page);
- return NULL;
- }
-
/*
* At least in one zone compaction wasn't deferred or skipped, so let's
* count a compaction stall
diff --git a/mm/page_owner.c b/mm/page_owner.c
index 925b6f44a444..addcbb2ae4e4 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -58,15 +58,10 @@ static bool need_page_owner(void)
static __always_inline depot_stack_handle_t create_dummy_stack(void)
{
unsigned long entries[4];
- struct stack_trace dummy;
+ unsigned int nr_entries;
- dummy.nr_entries = 0;
- dummy.max_entries = ARRAY_SIZE(entries);
- dummy.entries = &entries[0];
- dummy.skip = 0;
-
- save_stack_trace(&dummy);
- return depot_save_stack(&dummy, GFP_KERNEL);
+ nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 0);
+ return stack_depot_save(entries, nr_entries, GFP_KERNEL);
}
static noinline void register_dummy_stack(void)
@@ -120,49 +115,39 @@ void __reset_page_owner(struct page *page, unsigned int order)
}
}
-static inline bool check_recursive_alloc(struct stack_trace *trace,
- unsigned long ip)
+static inline bool check_recursive_alloc(unsigned long *entries,
+ unsigned int nr_entries,
+ unsigned long ip)
{
- int i;
-
- if (!trace->nr_entries)
- return false;
+ unsigned int i;
- for (i = 0; i < trace->nr_entries; i++) {
- if (trace->entries[i] == ip)
+ for (i = 0; i < nr_entries; i++) {
+ if (entries[i] == ip)
return true;
}
-
return false;
}
static noinline depot_stack_handle_t save_stack(gfp_t flags)
{
unsigned long entries[PAGE_OWNER_STACK_DEPTH];
- struct stack_trace trace = {
- .nr_entries = 0,
- .entries = entries,
- .max_entries = PAGE_OWNER_STACK_DEPTH,
- .skip = 2
- };
depot_stack_handle_t handle;
+ unsigned int nr_entries;
- save_stack_trace(&trace);
- if (trace.nr_entries != 0 &&
- trace.entries[trace.nr_entries-1] == ULONG_MAX)
- trace.nr_entries--;
+ nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 2);
/*
- * We need to check recursion here because our request to stackdepot
- * could trigger memory allocation to save new entry. New memory
- * allocation would reach here and call depot_save_stack() again
- * if we don't catch it. There is still not enough memory in stackdepot
- * so it would try to allocate memory again and loop forever.
+ * We need to check recursion here because our request to
+ * stackdepot could trigger memory allocation to save new
+ * entry. New memory allocation would reach here and call
+ * stack_depot_save_entries() again if we don't catch it. There is
+ * still not enough memory in stackdepot so it would try to
+ * allocate memory again and loop forever.
*/
- if (check_recursive_alloc(&trace, _RET_IP_))
+ if (check_recursive_alloc(entries, nr_entries, _RET_IP_))
return dummy_handle;
- handle = depot_save_stack(&trace, flags);
+ handle = stack_depot_save(entries, nr_entries, flags);
if (!handle)
handle = failure_handle;
@@ -340,16 +325,10 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn,
struct page *page, struct page_owner *page_owner,
depot_stack_handle_t handle)
{
- int ret;
- int pageblock_mt, page_mt;
+ int ret, pageblock_mt, page_mt;
+ unsigned long *entries;
+ unsigned int nr_entries;
char *kbuf;
- unsigned long entries[PAGE_OWNER_STACK_DEPTH];
- struct stack_trace trace = {
- .nr_entries = 0,
- .entries = entries,
- .max_entries = PAGE_OWNER_STACK_DEPTH,
- .skip = 0
- };
count = min_t(size_t, count, PAGE_SIZE);
kbuf = kmalloc(count, GFP_KERNEL);
@@ -378,8 +357,8 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn,
if (ret >= count)
goto err;
- depot_fetch_stack(handle, &trace);
- ret += snprint_stack_trace(kbuf + ret, count - ret, &trace, 0);
+ nr_entries = stack_depot_fetch(handle, &entries);
+ ret += stack_trace_snprint(kbuf + ret, count - ret, entries, nr_entries, 0);
if (ret >= count)
goto err;
@@ -410,14 +389,9 @@ void __dump_page_owner(struct page *page)
{
struct page_ext *page_ext = lookup_page_ext(page);
struct page_owner *page_owner;
- unsigned long entries[PAGE_OWNER_STACK_DEPTH];
- struct stack_trace trace = {
- .nr_entries = 0,
- .entries = entries,
- .max_entries = PAGE_OWNER_STACK_DEPTH,
- .skip = 0
- };
depot_stack_handle_t handle;
+ unsigned long *entries;
+ unsigned int nr_entries;
gfp_t gfp_mask;
int mt;
@@ -441,10 +415,10 @@ void __dump_page_owner(struct page *page)
return;
}
- depot_fetch_stack(handle, &trace);
+ nr_entries = stack_depot_fetch(handle, &entries);
pr_alert("page allocated via order %u, migratetype %s, gfp_mask %#x(%pGg)\n",
page_owner->order, migratetype_names[mt], gfp_mask, &gfp_mask);
- print_stack_trace(&trace, 0);
+ stack_trace_print(entries, nr_entries, 0);
if (page_owner->last_migrate_reason != -1)
pr_alert("page has been migrated, last migrate reason: %s\n",
diff --git a/mm/percpu-internal.h b/mm/percpu-internal.h
index b1739dc06b73..0468ba500bd4 100644
--- a/mm/percpu-internal.h
+++ b/mm/percpu-internal.h
@@ -9,8 +9,17 @@
* pcpu_block_md is the metadata block struct.
* Each chunk's bitmap is split into a number of full blocks.
* All units are in terms of bits.
+ *
+ * The scan hint is the largest known contiguous area before the contig hint.
+ * It is not necessarily the actual largest contig hint though. There is an
+ * invariant that the scan_hint_start > contig_hint_start iff
+ * scan_hint == contig_hint. This is necessary because when scanning forward,
+ * we don't know if a new contig hint would be better than the current one.
*/
struct pcpu_block_md {
+ int scan_hint; /* scan hint for block */
+ int scan_hint_start; /* block relative starting
+ position of the scan hint */
int contig_hint; /* contig hint for block */
int contig_hint_start; /* block relative starting
position of the contig hint */
@@ -19,6 +28,7 @@ struct pcpu_block_md {
int right_free; /* size of free space along
the right side of the block */
int first_free; /* block position of first free */
+ int nr_bits; /* total bits responsible for */
};
struct pcpu_chunk {
@@ -29,9 +39,7 @@ struct pcpu_chunk {
struct list_head list; /* linked to pcpu_slot lists */
int free_bytes; /* free bytes in the chunk */
- int contig_bits; /* max contiguous size hint */
- int contig_bits_start; /* contig_bits starting
- offset */
+ struct pcpu_block_md chunk_md;
void *base_addr; /* base address of this chunk */
unsigned long *alloc_map; /* allocation map */
@@ -39,7 +47,6 @@ struct pcpu_chunk {
struct pcpu_block_md *md_blocks; /* metadata blocks */
void *data; /* chunk data */
- int first_bit; /* no free below this */
bool immutable; /* no [de]population allowed */
int start_offset; /* the overlap with the previous
region to have a page aligned
diff --git a/mm/percpu-km.c b/mm/percpu-km.c
index b68d5df14731..3a2ff5c9192c 100644
--- a/mm/percpu-km.c
+++ b/mm/percpu-km.c
@@ -70,7 +70,7 @@ static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp)
chunk->base_addr = page_address(pages);
spin_lock_irqsave(&pcpu_lock, flags);
- pcpu_chunk_populated(chunk, 0, nr_pages, false);
+ pcpu_chunk_populated(chunk, 0, nr_pages);
spin_unlock_irqrestore(&pcpu_lock, flags);
pcpu_stats_chunk_alloc();
diff --git a/mm/percpu-stats.c b/mm/percpu-stats.c
index b5fdd43b60c9..ef5034a0464e 100644
--- a/mm/percpu-stats.c
+++ b/mm/percpu-stats.c
@@ -53,6 +53,7 @@ static int find_max_nr_alloc(void)
static void chunk_map_stats(struct seq_file *m, struct pcpu_chunk *chunk,
int *buffer)
{
+ struct pcpu_block_md *chunk_md = &chunk->chunk_md;
int i, last_alloc, as_len, start, end;
int *alloc_sizes, *p;
/* statistics */
@@ -121,9 +122,9 @@ static void chunk_map_stats(struct seq_file *m, struct pcpu_chunk *chunk,
P("nr_alloc", chunk->nr_alloc);
P("max_alloc_size", chunk->max_alloc_size);
P("empty_pop_pages", chunk->nr_empty_pop_pages);
- P("first_bit", chunk->first_bit);
+ P("first_bit", chunk_md->first_free);
P("free_bytes", chunk->free_bytes);
- P("contig_bytes", chunk->contig_bits * PCPU_MIN_ALLOC_SIZE);
+ P("contig_bytes", chunk_md->contig_hint * PCPU_MIN_ALLOC_SIZE);
P("sum_frag", sum_frag);
P("max_frag", max_frag);
P("cur_min_alloc", cur_min_alloc);
diff --git a/mm/percpu.c b/mm/percpu.c
index 68dd2e7e73b5..2df0ee680ea6 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -94,6 +94,8 @@
/* the slots are sorted by free bytes left, 1-31 bytes share the same slot */
#define PCPU_SLOT_BASE_SHIFT 5
+/* chunks in slots below this are subject to being sidelined on failed alloc */
+#define PCPU_SLOT_FAIL_THRESHOLD 3
#define PCPU_EMPTY_POP_PAGES_LOW 2
#define PCPU_EMPTY_POP_PAGES_HIGH 4
@@ -231,10 +233,13 @@ static int pcpu_size_to_slot(int size)
static int pcpu_chunk_slot(const struct pcpu_chunk *chunk)
{
- if (chunk->free_bytes < PCPU_MIN_ALLOC_SIZE || chunk->contig_bits == 0)
+ const struct pcpu_block_md *chunk_md = &chunk->chunk_md;
+
+ if (chunk->free_bytes < PCPU_MIN_ALLOC_SIZE ||
+ chunk_md->contig_hint == 0)
return 0;
- return pcpu_size_to_slot(chunk->free_bytes);
+ return pcpu_size_to_slot(chunk_md->contig_hint * PCPU_MIN_ALLOC_SIZE);
}
/* set the pointer to a chunk in a page struct */
@@ -318,6 +323,34 @@ static unsigned long pcpu_block_off_to_off(int index, int off)
return index * PCPU_BITMAP_BLOCK_BITS + off;
}
+/*
+ * pcpu_next_hint - determine which hint to use
+ * @block: block of interest
+ * @alloc_bits: size of allocation
+ *
+ * This determines if we should scan based on the scan_hint or first_free.
+ * In general, we want to scan from first_free to fulfill allocations by
+ * first fit. However, if we know a scan_hint at position scan_hint_start
+ * cannot fulfill an allocation, we can begin scanning from there knowing
+ * the contig_hint will be our fallback.
+ */
+static int pcpu_next_hint(struct pcpu_block_md *block, int alloc_bits)
+{
+ /*
+ * The three conditions below determine if we can skip past the
+ * scan_hint. First, does the scan hint exist. Second, is the
+ * contig_hint after the scan_hint (possibly not true iff
+ * contig_hint == scan_hint). Third, is the allocation request
+ * larger than the scan_hint.
+ */
+ if (block->scan_hint &&
+ block->contig_hint_start > block->scan_hint_start &&
+ alloc_bits > block->scan_hint)
+ return block->scan_hint_start + block->scan_hint;
+
+ return block->first_free;
+}
+
/**
* pcpu_next_md_free_region - finds the next hint free area
* @chunk: chunk of interest
@@ -413,9 +446,11 @@ static void pcpu_next_fit_region(struct pcpu_chunk *chunk, int alloc_bits,
if (block->contig_hint &&
block->contig_hint_start >= block_off &&
block->contig_hint >= *bits + alloc_bits) {
+ int start = pcpu_next_hint(block, alloc_bits);
+
*bits += alloc_bits + block->contig_hint_start -
- block->first_free;
- *bit_off = pcpu_block_off_to_off(i, block->first_free);
+ start;
+ *bit_off = pcpu_block_off_to_off(i, start);
return;
}
/* reset to satisfy the second predicate above */
@@ -488,6 +523,22 @@ static void pcpu_mem_free(void *ptr)
kvfree(ptr);
}
+static void __pcpu_chunk_move(struct pcpu_chunk *chunk, int slot,
+ bool move_front)
+{
+ if (chunk != pcpu_reserved_chunk) {
+ if (move_front)
+ list_move(&chunk->list, &pcpu_slot[slot]);
+ else
+ list_move_tail(&chunk->list, &pcpu_slot[slot]);
+ }
+}
+
+static void pcpu_chunk_move(struct pcpu_chunk *chunk, int slot)
+{
+ __pcpu_chunk_move(chunk, slot, true);
+}
+
/**
* pcpu_chunk_relocate - put chunk in the appropriate chunk slot
* @chunk: chunk of interest
@@ -505,110 +556,39 @@ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
{
int nslot = pcpu_chunk_slot(chunk);
- if (chunk != pcpu_reserved_chunk && oslot != nslot) {
- if (oslot < nslot)
- list_move(&chunk->list, &pcpu_slot[nslot]);
- else
- list_move_tail(&chunk->list, &pcpu_slot[nslot]);
- }
+ if (oslot != nslot)
+ __pcpu_chunk_move(chunk, nslot, oslot < nslot);
}
-/**
- * pcpu_cnt_pop_pages- counts populated backing pages in range
+/*
+ * pcpu_update_empty_pages - update empty page counters
* @chunk: chunk of interest
- * @bit_off: start offset
- * @bits: size of area to check
- *
- * Calculates the number of populated pages in the region
- * [page_start, page_end). This keeps track of how many empty populated
- * pages are available and decide if async work should be scheduled.
+ * @nr: nr of empty pages
*
- * RETURNS:
- * The nr of populated pages.
+ * This is used to keep track of the empty pages now based on the premise
+ * a md_block covers a page. The hint update functions recognize if a block
+ * is made full or broken to calculate deltas for keeping track of free pages.
*/
-static inline int pcpu_cnt_pop_pages(struct pcpu_chunk *chunk, int bit_off,
- int bits)
+static inline void pcpu_update_empty_pages(struct pcpu_chunk *chunk, int nr)
{
- int page_start = PFN_UP(bit_off * PCPU_MIN_ALLOC_SIZE);
- int page_end = PFN_DOWN((bit_off + bits) * PCPU_MIN_ALLOC_SIZE);
-
- if (page_start >= page_end)
- return 0;
-
- /*
- * bitmap_weight counts the number of bits set in a bitmap up to
- * the specified number of bits. This is counting the populated
- * pages up to page_end and then subtracting the populated pages
- * up to page_start to count the populated pages in
- * [page_start, page_end).
- */
- return bitmap_weight(chunk->populated, page_end) -
- bitmap_weight(chunk->populated, page_start);
-}
-
-/**
- * pcpu_chunk_update - updates the chunk metadata given a free area
- * @chunk: chunk of interest
- * @bit_off: chunk offset
- * @bits: size of free area
- *
- * This updates the chunk's contig hint and starting offset given a free area.
- * Choose the best starting offset if the contig hint is equal.
- */
-static void pcpu_chunk_update(struct pcpu_chunk *chunk, int bit_off, int bits)
-{
- if (bits > chunk->contig_bits) {
- chunk->contig_bits_start = bit_off;
- chunk->contig_bits = bits;
- } else if (bits == chunk->contig_bits && chunk->contig_bits_start &&
- (!bit_off ||
- __ffs(bit_off) > __ffs(chunk->contig_bits_start))) {
- /* use the start with the best alignment */
- chunk->contig_bits_start = bit_off;
- }
+ chunk->nr_empty_pop_pages += nr;
+ if (chunk != pcpu_reserved_chunk)
+ pcpu_nr_empty_pop_pages += nr;
}
-/**
- * pcpu_chunk_refresh_hint - updates metadata about a chunk
- * @chunk: chunk of interest
- *
- * Iterates over the metadata blocks to find the largest contig area.
- * It also counts the populated pages and uses the delta to update the
- * global count.
- *
- * Updates:
- * chunk->contig_bits
- * chunk->contig_bits_start
- * nr_empty_pop_pages (chunk and global)
+/*
+ * pcpu_region_overlap - determines if two regions overlap
+ * @a: start of first region, inclusive
+ * @b: end of first region, exclusive
+ * @x: start of second region, inclusive
+ * @y: end of second region, exclusive
+ *
+ * This is used to determine if the hint region [a, b) overlaps with the
+ * allocated region [x, y).
*/
-static void pcpu_chunk_refresh_hint(struct pcpu_chunk *chunk)
+static inline bool pcpu_region_overlap(int a, int b, int x, int y)
{
- int bit_off, bits, nr_empty_pop_pages;
-
- /* clear metadata */
- chunk->contig_bits = 0;
-
- bit_off = chunk->first_bit;
- bits = nr_empty_pop_pages = 0;
- pcpu_for_each_md_free_region(chunk, bit_off, bits) {
- pcpu_chunk_update(chunk, bit_off, bits);
-
- nr_empty_pop_pages += pcpu_cnt_pop_pages(chunk, bit_off, bits);
- }
-
- /*
- * Keep track of nr_empty_pop_pages.
- *
- * The chunk maintains the previous number of free pages it held,
- * so the delta is used to update the global counter. The reserved
- * chunk is not part of the free page count as they are populated
- * at init and are special to serving reserved allocations.
- */
- if (chunk != pcpu_reserved_chunk)
- pcpu_nr_empty_pop_pages +=
- (nr_empty_pop_pages - chunk->nr_empty_pop_pages);
-
- chunk->nr_empty_pop_pages = nr_empty_pop_pages;
+ return (a < y) && (x < b);
}
/**
@@ -629,16 +609,132 @@ static void pcpu_block_update(struct pcpu_block_md *block, int start, int end)
if (start == 0)
block->left_free = contig;
- if (end == PCPU_BITMAP_BLOCK_BITS)
+ if (end == block->nr_bits)
block->right_free = contig;
if (contig > block->contig_hint) {
+ /* promote the old contig_hint to be the new scan_hint */
+ if (start > block->contig_hint_start) {
+ if (block->contig_hint > block->scan_hint) {
+ block->scan_hint_start =
+ block->contig_hint_start;
+ block->scan_hint = block->contig_hint;
+ } else if (start < block->scan_hint_start) {
+ /*
+ * The old contig_hint == scan_hint. But, the
+ * new contig is larger so hold the invariant
+ * scan_hint_start < contig_hint_start.
+ */
+ block->scan_hint = 0;
+ }
+ } else {
+ block->scan_hint = 0;
+ }
block->contig_hint_start = start;
block->contig_hint = contig;
- } else if (block->contig_hint_start && contig == block->contig_hint &&
- (!start || __ffs(start) > __ffs(block->contig_hint_start))) {
- /* use the start with the best alignment */
- block->contig_hint_start = start;
+ } else if (contig == block->contig_hint) {
+ if (block->contig_hint_start &&
+ (!start ||
+ __ffs(start) > __ffs(block->contig_hint_start))) {
+ /* start has a better alignment so use it */
+ block->contig_hint_start = start;
+ if (start < block->scan_hint_start &&
+ block->contig_hint > block->scan_hint)
+ block->scan_hint = 0;
+ } else if (start > block->scan_hint_start ||
+ block->contig_hint > block->scan_hint) {
+ /*
+ * Knowing contig == contig_hint, update the scan_hint
+ * if it is farther than or larger than the current
+ * scan_hint.
+ */
+ block->scan_hint_start = start;
+ block->scan_hint = contig;
+ }
+ } else {
+ /*
+ * The region is smaller than the contig_hint. So only update
+ * the scan_hint if it is larger than or equal and farther than
+ * the current scan_hint.
+ */
+ if ((start < block->contig_hint_start &&
+ (contig > block->scan_hint ||
+ (contig == block->scan_hint &&
+ start > block->scan_hint_start)))) {
+ block->scan_hint_start = start;
+ block->scan_hint = contig;
+ }
+ }
+}
+
+/*
+ * pcpu_block_update_scan - update a block given a free area from a scan
+ * @chunk: chunk of interest
+ * @bit_off: chunk offset
+ * @bits: size of free area
+ *
+ * Finding the final allocation spot first goes through pcpu_find_block_fit()
+ * to find a block that can hold the allocation and then pcpu_alloc_area()
+ * where a scan is used. When allocations require specific alignments,
+ * we can inadvertently create holes which will not be seen in the alloc
+ * or free paths.
+ *
+ * This takes a given free area hole and updates a block as it may change the
+ * scan_hint. We need to scan backwards to ensure we don't miss free bits
+ * from alignment.
+ */
+static void pcpu_block_update_scan(struct pcpu_chunk *chunk, int bit_off,
+ int bits)
+{
+ int s_off = pcpu_off_to_block_off(bit_off);
+ int e_off = s_off + bits;
+ int s_index, l_bit;
+ struct pcpu_block_md *block;
+
+ if (e_off > PCPU_BITMAP_BLOCK_BITS)
+ return;
+
+ s_index = pcpu_off_to_block_index(bit_off);
+ block = chunk->md_blocks + s_index;
+
+ /* scan backwards in case of alignment skipping free bits */
+ l_bit = find_last_bit(pcpu_index_alloc_map(chunk, s_index), s_off);
+ s_off = (s_off == l_bit) ? 0 : l_bit + 1;
+
+ pcpu_block_update(block, s_off, e_off);
+}
+
+/**
+ * pcpu_chunk_refresh_hint - updates metadata about a chunk
+ * @chunk: chunk of interest
+ * @full_scan: if we should scan from the beginning
+ *
+ * Iterates over the metadata blocks to find the largest contig area.
+ * A full scan can be avoided on the allocation path as this is triggered
+ * if we broke the contig_hint. In doing so, the scan_hint will be before
+ * the contig_hint or after if the scan_hint == contig_hint. This cannot
+ * be prevented on freeing as we want to find the largest area possibly
+ * spanning blocks.
+ */
+static void pcpu_chunk_refresh_hint(struct pcpu_chunk *chunk, bool full_scan)
+{
+ struct pcpu_block_md *chunk_md = &chunk->chunk_md;
+ int bit_off, bits;
+
+ /* promote scan_hint to contig_hint */
+ if (!full_scan && chunk_md->scan_hint) {
+ bit_off = chunk_md->scan_hint_start + chunk_md->scan_hint;
+ chunk_md->contig_hint_start = chunk_md->scan_hint_start;
+ chunk_md->contig_hint = chunk_md->scan_hint;
+ chunk_md->scan_hint = 0;
+ } else {
+ bit_off = chunk_md->first_free;
+ chunk_md->contig_hint = 0;
+ }
+
+ bits = 0;
+ pcpu_for_each_md_free_region(chunk, bit_off, bits) {
+ pcpu_block_update(chunk_md, bit_off, bit_off + bits);
}
}
@@ -654,14 +750,23 @@ static void pcpu_block_refresh_hint(struct pcpu_chunk *chunk, int index)
{
struct pcpu_block_md *block = chunk->md_blocks + index;
unsigned long *alloc_map = pcpu_index_alloc_map(chunk, index);
- int rs, re; /* region start, region end */
+ int rs, re, start; /* region start, region end */
+
+ /* promote scan_hint to contig_hint */
+ if (block->scan_hint) {
+ start = block->scan_hint_start + block->scan_hint;
+ block->contig_hint_start = block->scan_hint_start;
+ block->contig_hint = block->scan_hint;
+ block->scan_hint = 0;
+ } else {
+ start = block->first_free;
+ block->contig_hint = 0;
+ }
- /* clear hints */
- block->contig_hint = 0;
- block->left_free = block->right_free = 0;
+ block->right_free = 0;
/* iterate over free areas and update the contig hints */
- pcpu_for_each_unpop_region(alloc_map, rs, re, block->first_free,
+ pcpu_for_each_unpop_region(alloc_map, rs, re, start,
PCPU_BITMAP_BLOCK_BITS) {
pcpu_block_update(block, rs, re);
}
@@ -680,6 +785,8 @@ static void pcpu_block_refresh_hint(struct pcpu_chunk *chunk, int index)
static void pcpu_block_update_hint_alloc(struct pcpu_chunk *chunk, int bit_off,
int bits)
{
+ struct pcpu_block_md *chunk_md = &chunk->chunk_md;
+ int nr_empty_pages = 0;
struct pcpu_block_md *s_block, *e_block, *block;
int s_index, e_index; /* block indexes of the freed allocation */
int s_off, e_off; /* block offsets of the freed allocation */
@@ -704,15 +811,29 @@ static void pcpu_block_update_hint_alloc(struct pcpu_chunk *chunk, int bit_off,
* If the allocation breaks the contig_hint, a scan is required to
* restore this hint.
*/
+ if (s_block->contig_hint == PCPU_BITMAP_BLOCK_BITS)
+ nr_empty_pages++;
+
if (s_off == s_block->first_free)
s_block->first_free = find_next_zero_bit(
pcpu_index_alloc_map(chunk, s_index),
PCPU_BITMAP_BLOCK_BITS,
s_off + bits);
- if (s_off >= s_block->contig_hint_start &&
- s_off < s_block->contig_hint_start + s_block->contig_hint) {
+ if (pcpu_region_overlap(s_block->scan_hint_start,
+ s_block->scan_hint_start + s_block->scan_hint,
+ s_off,
+ s_off + bits))
+ s_block->scan_hint = 0;
+
+ if (pcpu_region_overlap(s_block->contig_hint_start,
+ s_block->contig_hint_start +
+ s_block->contig_hint,
+ s_off,
+ s_off + bits)) {
/* block contig hint is broken - scan to fix it */
+ if (!s_off)
+ s_block->left_free = 0;
pcpu_block_refresh_hint(chunk, s_index);
} else {
/* update left and right contig manually */
@@ -728,6 +849,9 @@ static void pcpu_block_update_hint_alloc(struct pcpu_chunk *chunk, int bit_off,
* Update e_block.
*/
if (s_index != e_index) {
+ if (e_block->contig_hint == PCPU_BITMAP_BLOCK_BITS)
+ nr_empty_pages++;
+
/*
* When the allocation is across blocks, the end is along
* the left part of the e_block.
@@ -740,11 +864,14 @@ static void pcpu_block_update_hint_alloc(struct pcpu_chunk *chunk, int bit_off,
/* reset the block */
e_block++;
} else {
+ if (e_off > e_block->scan_hint_start)
+ e_block->scan_hint = 0;
+
+ e_block->left_free = 0;
if (e_off > e_block->contig_hint_start) {
/* contig hint is broken - scan to fix it */
pcpu_block_refresh_hint(chunk, e_index);
} else {
- e_block->left_free = 0;
e_block->right_free =
min_t(int, e_block->right_free,
PCPU_BITMAP_BLOCK_BITS - e_off);
@@ -752,21 +879,36 @@ static void pcpu_block_update_hint_alloc(struct pcpu_chunk *chunk, int bit_off,
}
/* update in-between md_blocks */
+ nr_empty_pages += (e_index - s_index - 1);
for (block = s_block + 1; block < e_block; block++) {
+ block->scan_hint = 0;
block->contig_hint = 0;
block->left_free = 0;
block->right_free = 0;
}
}
+ if (nr_empty_pages)
+ pcpu_update_empty_pages(chunk, -nr_empty_pages);
+
+ if (pcpu_region_overlap(chunk_md->scan_hint_start,
+ chunk_md->scan_hint_start +
+ chunk_md->scan_hint,
+ bit_off,
+ bit_off + bits))
+ chunk_md->scan_hint = 0;
+
/*
* The only time a full chunk scan is required is if the chunk
* contig hint is broken. Otherwise, it means a smaller space
* was used and therefore the chunk contig hint is still correct.
*/
- if (bit_off >= chunk->contig_bits_start &&
- bit_off < chunk->contig_bits_start + chunk->contig_bits)
- pcpu_chunk_refresh_hint(chunk);
+ if (pcpu_region_overlap(chunk_md->contig_hint_start,
+ chunk_md->contig_hint_start +
+ chunk_md->contig_hint,
+ bit_off,
+ bit_off + bits))
+ pcpu_chunk_refresh_hint(chunk, false);
}
/**
@@ -782,13 +924,15 @@ static void pcpu_block_update_hint_alloc(struct pcpu_chunk *chunk, int bit_off,
*
* A chunk update is triggered if a page becomes free, a block becomes free,
* or the free spans across blocks. This tradeoff is to minimize iterating
- * over the block metadata to update chunk->contig_bits. chunk->contig_bits
- * may be off by up to a page, but it will never be more than the available
- * space. If the contig hint is contained in one block, it will be accurate.
+ * over the block metadata to update chunk_md->contig_hint.
+ * chunk_md->contig_hint may be off by up to a page, but it will never be more
+ * than the available space. If the contig hint is contained in one block, it
+ * will be accurate.
*/
static void pcpu_block_update_hint_free(struct pcpu_chunk *chunk, int bit_off,
int bits)
{
+ int nr_empty_pages = 0;
struct pcpu_block_md *s_block, *e_block, *block;
int s_index, e_index; /* block indexes of the freed allocation */
int s_off, e_off; /* block offsets of the freed allocation */
@@ -842,16 +986,22 @@ static void pcpu_block_update_hint_free(struct pcpu_chunk *chunk, int bit_off,
/* update s_block */
e_off = (s_index == e_index) ? end : PCPU_BITMAP_BLOCK_BITS;
+ if (!start && e_off == PCPU_BITMAP_BLOCK_BITS)
+ nr_empty_pages++;
pcpu_block_update(s_block, start, e_off);
/* freeing in the same block */
if (s_index != e_index) {
/* update e_block */
+ if (end == PCPU_BITMAP_BLOCK_BITS)
+ nr_empty_pages++;
pcpu_block_update(e_block, 0, end);
/* reset md_blocks in the middle */
+ nr_empty_pages += (e_index - s_index - 1);
for (block = s_block + 1; block < e_block; block++) {
block->first_free = 0;
+ block->scan_hint = 0;
block->contig_hint_start = 0;
block->contig_hint = PCPU_BITMAP_BLOCK_BITS;
block->left_free = PCPU_BITMAP_BLOCK_BITS;
@@ -859,19 +1009,21 @@ static void pcpu_block_update_hint_free(struct pcpu_chunk *chunk, int bit_off,
}
}
+ if (nr_empty_pages)
+ pcpu_update_empty_pages(chunk, nr_empty_pages);
+
/*
- * Refresh chunk metadata when the free makes a page free, a block
- * free, or spans across blocks. The contig hint may be off by up to
- * a page, but if the hint is contained in a block, it will be accurate
- * with the else condition below.
+ * Refresh chunk metadata when the free makes a block free or spans
+ * across blocks. The contig_hint may be off by up to a page, but if
+ * the contig_hint is contained in a block, it will be accurate with
+ * the else condition below.
*/
- if ((ALIGN_DOWN(end, min(PCPU_BITS_PER_PAGE, PCPU_BITMAP_BLOCK_BITS)) >
- ALIGN(start, min(PCPU_BITS_PER_PAGE, PCPU_BITMAP_BLOCK_BITS))) ||
- s_index != e_index)
- pcpu_chunk_refresh_hint(chunk);
+ if (((end - start) >= PCPU_BITMAP_BLOCK_BITS) || s_index != e_index)
+ pcpu_chunk_refresh_hint(chunk, true);
else
- pcpu_chunk_update(chunk, pcpu_block_off_to_off(s_index, start),
- s_block->contig_hint);
+ pcpu_block_update(&chunk->chunk_md,
+ pcpu_block_off_to_off(s_index, start),
+ end);
}
/**
@@ -926,6 +1078,7 @@ static bool pcpu_is_populated(struct pcpu_chunk *chunk, int bit_off, int bits,
static int pcpu_find_block_fit(struct pcpu_chunk *chunk, int alloc_bits,
size_t align, bool pop_only)
{
+ struct pcpu_block_md *chunk_md = &chunk->chunk_md;
int bit_off, bits, next_off;
/*
@@ -934,12 +1087,12 @@ static int pcpu_find_block_fit(struct pcpu_chunk *chunk, int alloc_bits,
* cannot fit in the global hint, there is memory pressure and creating
* a new chunk would happen soon.
*/
- bit_off = ALIGN(chunk->contig_bits_start, align) -
- chunk->contig_bits_start;
- if (bit_off + alloc_bits > chunk->contig_bits)
+ bit_off = ALIGN(chunk_md->contig_hint_start, align) -
+ chunk_md->contig_hint_start;
+ if (bit_off + alloc_bits > chunk_md->contig_hint)
return -1;
- bit_off = chunk->first_bit;
+ bit_off = pcpu_next_hint(chunk_md, alloc_bits);
bits = 0;
pcpu_for_each_fit_region(chunk, alloc_bits, align, bit_off, bits) {
if (!pop_only || pcpu_is_populated(chunk, bit_off, bits,
@@ -956,6 +1109,62 @@ static int pcpu_find_block_fit(struct pcpu_chunk *chunk, int alloc_bits,
return bit_off;
}
+/*
+ * pcpu_find_zero_area - modified from bitmap_find_next_zero_area_off()
+ * @map: the address to base the search on
+ * @size: the bitmap size in bits
+ * @start: the bitnumber to start searching at
+ * @nr: the number of zeroed bits we're looking for
+ * @align_mask: alignment mask for zero area
+ * @largest_off: offset of the largest area skipped
+ * @largest_bits: size of the largest area skipped
+ *
+ * The @align_mask should be one less than a power of 2.
+ *
+ * This is a modified version of bitmap_find_next_zero_area_off() to remember
+ * the largest area that was skipped. This is imperfect, but in general is
+ * good enough. The largest remembered region is the largest failed region
+ * seen. This does not include anything we possibly skipped due to alignment.
+ * pcpu_block_update_scan() does scan backwards to try and recover what was
+ * lost to alignment. While this can cause scanning to miss earlier possible
+ * free areas, smaller allocations will eventually fill those holes.
+ */
+static unsigned long pcpu_find_zero_area(unsigned long *map,
+ unsigned long size,
+ unsigned long start,
+ unsigned long nr,
+ unsigned long align_mask,
+ unsigned long *largest_off,
+ unsigned long *largest_bits)
+{
+ unsigned long index, end, i, area_off, area_bits;
+again:
+ index = find_next_zero_bit(map, size, start);
+
+ /* Align allocation */
+ index = __ALIGN_MASK(index, align_mask);
+ area_off = index;
+
+ end = index + nr;
+ if (end > size)
+ return end;
+ i = find_next_bit(map, end, index);
+ if (i < end) {
+ area_bits = i - area_off;
+ /* remember largest unused area with best alignment */
+ if (area_bits > *largest_bits ||
+ (area_bits == *largest_bits && *largest_off &&
+ (!area_off || __ffs(area_off) > __ffs(*largest_off)))) {
+ *largest_off = area_off;
+ *largest_bits = area_bits;
+ }
+
+ start = i + 1;
+ goto again;
+ }
+ return index;
+}
+
/**
* pcpu_alloc_area - allocates an area from a pcpu_chunk
* @chunk: chunk of interest
@@ -978,7 +1187,9 @@ static int pcpu_find_block_fit(struct pcpu_chunk *chunk, int alloc_bits,
static int pcpu_alloc_area(struct pcpu_chunk *chunk, int alloc_bits,
size_t align, int start)
{
+ struct pcpu_block_md *chunk_md = &chunk->chunk_md;
size_t align_mask = (align) ? (align - 1) : 0;
+ unsigned long area_off = 0, area_bits = 0;
int bit_off, end, oslot;
lockdep_assert_held(&pcpu_lock);
@@ -988,12 +1199,16 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int alloc_bits,
/*
* Search to find a fit.
*/
- end = start + alloc_bits + PCPU_BITMAP_BLOCK_BITS;
- bit_off = bitmap_find_next_zero_area(chunk->alloc_map, end, start,
- alloc_bits, align_mask);
+ end = min_t(int, start + alloc_bits + PCPU_BITMAP_BLOCK_BITS,
+ pcpu_chunk_map_bits(chunk));
+ bit_off = pcpu_find_zero_area(chunk->alloc_map, end, start, alloc_bits,
+ align_mask, &area_off, &area_bits);
if (bit_off >= end)
return -1;
+ if (area_bits)
+ pcpu_block_update_scan(chunk, area_off, area_bits);
+
/* update alloc map */
bitmap_set(chunk->alloc_map, bit_off, alloc_bits);
@@ -1005,8 +1220,8 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int alloc_bits,
chunk->free_bytes -= alloc_bits * PCPU_MIN_ALLOC_SIZE;
/* update first free bit */
- if (bit_off == chunk->first_bit)
- chunk->first_bit = find_next_zero_bit(
+ if (bit_off == chunk_md->first_free)
+ chunk_md->first_free = find_next_zero_bit(
chunk->alloc_map,
pcpu_chunk_map_bits(chunk),
bit_off + alloc_bits);
@@ -1028,6 +1243,7 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int alloc_bits,
*/
static void pcpu_free_area(struct pcpu_chunk *chunk, int off)
{
+ struct pcpu_block_md *chunk_md = &chunk->chunk_md;
int bit_off, bits, end, oslot;
lockdep_assert_held(&pcpu_lock);
@@ -1047,24 +1263,34 @@ static void pcpu_free_area(struct pcpu_chunk *chunk, int off)
chunk->free_bytes += bits * PCPU_MIN_ALLOC_SIZE;
/* update first free bit */
- chunk->first_bit = min(chunk->first_bit, bit_off);
+ chunk_md->first_free = min(chunk_md->first_free, bit_off);
pcpu_block_update_hint_free(chunk, bit_off, bits);
pcpu_chunk_relocate(chunk, oslot);
}
+static void pcpu_init_md_block(struct pcpu_block_md *block, int nr_bits)
+{
+ block->scan_hint = 0;
+ block->contig_hint = nr_bits;
+ block->left_free = nr_bits;
+ block->right_free = nr_bits;
+ block->first_free = 0;
+ block->nr_bits = nr_bits;
+}
+
static void pcpu_init_md_blocks(struct pcpu_chunk *chunk)
{
struct pcpu_block_md *md_block;
+ /* init the chunk's block */
+ pcpu_init_md_block(&chunk->chunk_md, pcpu_chunk_map_bits(chunk));
+
for (md_block = chunk->md_blocks;
md_block != chunk->md_blocks + pcpu_chunk_nr_blocks(chunk);
- md_block++) {
- md_block->contig_hint = PCPU_BITMAP_BLOCK_BITS;
- md_block->left_free = PCPU_BITMAP_BLOCK_BITS;
- md_block->right_free = PCPU_BITMAP_BLOCK_BITS;
- }
+ md_block++)
+ pcpu_init_md_block(md_block, PCPU_BITMAP_BLOCK_BITS);
}
/**
@@ -1143,11 +1369,8 @@ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr,
chunk->immutable = true;
bitmap_fill(chunk->populated, chunk->nr_pages);
chunk->nr_populated = chunk->nr_pages;
- chunk->nr_empty_pop_pages =
- pcpu_cnt_pop_pages(chunk, start_offset / PCPU_MIN_ALLOC_SIZE,
- map_size / PCPU_MIN_ALLOC_SIZE);
+ chunk->nr_empty_pop_pages = chunk->nr_pages;
- chunk->contig_bits = map_size / PCPU_MIN_ALLOC_SIZE;
chunk->free_bytes = map_size;
if (chunk->start_offset) {
@@ -1157,7 +1380,7 @@ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr,
set_bit(0, chunk->bound_map);
set_bit(offset_bits, chunk->bound_map);
- chunk->first_bit = offset_bits;
+ chunk->chunk_md.first_free = offset_bits;
pcpu_block_update_hint_alloc(chunk, 0, offset_bits);
}
@@ -1210,7 +1433,6 @@ static struct pcpu_chunk *pcpu_alloc_chunk(gfp_t gfp)
pcpu_init_md_blocks(chunk);
/* init metadata */
- chunk->contig_bits = region_bits;
chunk->free_bytes = chunk->nr_pages * PAGE_SIZE;
return chunk;
@@ -1240,7 +1462,6 @@ static void pcpu_free_chunk(struct pcpu_chunk *chunk)
* @chunk: pcpu_chunk which got populated
* @page_start: the start page
* @page_end: the end page
- * @for_alloc: if this is to populate for allocation
*
* Pages in [@page_start,@page_end) have been populated to @chunk. Update
* the bookkeeping information accordingly. Must be called after each
@@ -1250,7 +1471,7 @@ static void pcpu_free_chunk(struct pcpu_chunk *chunk)
* is to serve an allocation in that area.
*/
static void pcpu_chunk_populated(struct pcpu_chunk *chunk, int page_start,
- int page_end, bool for_alloc)
+ int page_end)
{
int nr = page_end - page_start;
@@ -1260,10 +1481,7 @@ static void pcpu_chunk_populated(struct pcpu_chunk *chunk, int page_start,
chunk->nr_populated += nr;
pcpu_nr_populated += nr;
- if (!for_alloc) {
- chunk->nr_empty_pop_pages += nr;
- pcpu_nr_empty_pop_pages += nr;
- }
+ pcpu_update_empty_pages(chunk, nr);
}
/**
@@ -1285,9 +1503,9 @@ static void pcpu_chunk_depopulated(struct pcpu_chunk *chunk,
bitmap_clear(chunk->populated, page_start, nr);
chunk->nr_populated -= nr;
- chunk->nr_empty_pop_pages -= nr;
- pcpu_nr_empty_pop_pages -= nr;
pcpu_nr_populated -= nr;
+
+ pcpu_update_empty_pages(chunk, -nr);
}
/*
@@ -1374,7 +1592,7 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
bool is_atomic = (gfp & GFP_KERNEL) != GFP_KERNEL;
bool do_warn = !(gfp & __GFP_NOWARN);
static int warn_limit = 10;
- struct pcpu_chunk *chunk;
+ struct pcpu_chunk *chunk, *next;
const char *err;
int slot, off, cpu, ret;
unsigned long flags;
@@ -1436,11 +1654,14 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
restart:
/* search through normal chunks */
for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) {
- list_for_each_entry(chunk, &pcpu_slot[slot], list) {
+ list_for_each_entry_safe(chunk, next, &pcpu_slot[slot], list) {
off = pcpu_find_block_fit(chunk, bits, bit_align,
is_atomic);
- if (off < 0)
+ if (off < 0) {
+ if (slot < PCPU_SLOT_FAIL_THRESHOLD)
+ pcpu_chunk_move(chunk, 0);
continue;
+ }
off = pcpu_alloc_area(chunk, bits, bit_align, off);
if (off >= 0)
@@ -1499,7 +1720,7 @@ area_found:
err = "failed to populate";
goto fail_unlock;
}
- pcpu_chunk_populated(chunk, rs, re, true);
+ pcpu_chunk_populated(chunk, rs, re);
spin_unlock_irqrestore(&pcpu_lock, flags);
}
@@ -1698,7 +1919,7 @@ retry_pop:
if (!ret) {
nr_to_pop -= nr;
spin_lock_irq(&pcpu_lock);
- pcpu_chunk_populated(chunk, rs, rs + nr, false);
+ pcpu_chunk_populated(chunk, rs, rs + nr);
spin_unlock_irq(&pcpu_lock);
} else {
nr_to_pop = 0;
@@ -1738,6 +1959,7 @@ void free_percpu(void __percpu *ptr)
struct pcpu_chunk *chunk;
unsigned long flags;
int off;
+ bool need_balance = false;
if (!ptr)
return;
@@ -1759,7 +1981,7 @@ void free_percpu(void __percpu *ptr)
list_for_each_entry(pos, &pcpu_slot[pcpu_nr_slots - 1], list)
if (pos != chunk) {
- pcpu_schedule_balance_work();
+ need_balance = true;
break;
}
}
@@ -1767,6 +1989,9 @@ void free_percpu(void __percpu *ptr)
trace_percpu_free_percpu(chunk->base_addr, off, ptr);
spin_unlock_irqrestore(&pcpu_lock, flags);
+
+ if (need_balance)
+ pcpu_schedule_balance_work();
}
EXPORT_SYMBOL_GPL(free_percpu);
diff --git a/mm/shmem.c b/mm/shmem.c
index 2275a0ff7c30..f4dce9c8670d 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -3631,9 +3631,8 @@ static struct inode *shmem_alloc_inode(struct super_block *sb)
return &info->vfs_inode;
}
-static void shmem_destroy_callback(struct rcu_head *head)
+static void shmem_free_in_core_inode(struct inode *inode)
{
- struct inode *inode = container_of(head, struct inode, i_rcu);
if (S_ISLNK(inode->i_mode))
kfree(inode->i_link);
kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode));
@@ -3643,7 +3642,6 @@ static void shmem_destroy_inode(struct inode *inode)
{
if (S_ISREG(inode->i_mode))
mpol_free_shared_policy(&SHMEM_I(inode)->policy);
- call_rcu(&inode->i_rcu, shmem_destroy_callback);
}
static void shmem_init_inode(void *foo)
@@ -3734,6 +3732,7 @@ static const struct inode_operations shmem_special_inode_operations = {
static const struct super_operations shmem_ops = {
.alloc_inode = shmem_alloc_inode,
+ .free_inode = shmem_free_in_core_inode,
.destroy_inode = shmem_destroy_inode,
#ifdef CONFIG_TMPFS
.statfs = shmem_statfs,
diff --git a/mm/slab.c b/mm/slab.c
index 9142ee992493..284ab737faee 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1467,53 +1467,17 @@ static bool is_debug_pagealloc_cache(struct kmem_cache *cachep)
}
#ifdef CONFIG_DEBUG_PAGEALLOC
-static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr,
- unsigned long caller)
-{
- int size = cachep->object_size;
-
- addr = (unsigned long *)&((char *)addr)[obj_offset(cachep)];
-
- if (size < 5 * sizeof(unsigned long))
- return;
-
- *addr++ = 0x12345678;
- *addr++ = caller;
- *addr++ = smp_processor_id();
- size -= 3 * sizeof(unsigned long);
- {
- unsigned long *sptr = &caller;
- unsigned long svalue;
-
- while (!kstack_end(sptr)) {
- svalue = *sptr++;
- if (kernel_text_address(svalue)) {
- *addr++ = svalue;
- size -= sizeof(unsigned long);
- if (size <= sizeof(unsigned long))
- break;
- }
- }
-
- }
- *addr++ = 0x87654321;
-}
-
-static void slab_kernel_map(struct kmem_cache *cachep, void *objp,
- int map, unsigned long caller)
+static void slab_kernel_map(struct kmem_cache *cachep, void *objp, int map)
{
if (!is_debug_pagealloc_cache(cachep))
return;
- if (caller)
- store_stackinfo(cachep, objp, caller);
-
kernel_map_pages(virt_to_page(objp), cachep->size / PAGE_SIZE, map);
}
#else
static inline void slab_kernel_map(struct kmem_cache *cachep, void *objp,
- int map, unsigned long caller) {}
+ int map) {}
#endif
@@ -1661,7 +1625,7 @@ static void slab_destroy_debugcheck(struct kmem_cache *cachep,
if (cachep->flags & SLAB_POISON) {
check_poison_obj(cachep, objp);
- slab_kernel_map(cachep, objp, 1, 0);
+ slab_kernel_map(cachep, objp, 1);
}
if (cachep->flags & SLAB_RED_ZONE) {
if (*dbg_redzone1(cachep, objp) != RED_INACTIVE)
@@ -2433,7 +2397,7 @@ static void cache_init_objs_debug(struct kmem_cache *cachep, struct page *page)
/* need to poison the objs? */
if (cachep->flags & SLAB_POISON) {
poison_obj(cachep, objp, POISON_FREE);
- slab_kernel_map(cachep, objp, 0, 0);
+ slab_kernel_map(cachep, objp, 0);
}
}
#endif
@@ -2812,7 +2776,7 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
if (cachep->flags & SLAB_POISON) {
poison_obj(cachep, objp, POISON_FREE);
- slab_kernel_map(cachep, objp, 0, caller);
+ slab_kernel_map(cachep, objp, 0);
}
return objp;
}
@@ -3076,7 +3040,7 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
return objp;
if (cachep->flags & SLAB_POISON) {
check_poison_obj(cachep, objp);
- slab_kernel_map(cachep, objp, 1, 0);
+ slab_kernel_map(cachep, objp, 1);
poison_obj(cachep, objp, POISON_INUSE);
}
if (cachep->flags & SLAB_STORE_USER)
diff --git a/mm/slub.c b/mm/slub.c
index d30ede89f4a6..6b28cd2b5a58 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -552,31 +552,22 @@ static void set_track(struct kmem_cache *s, void *object,
if (addr) {
#ifdef CONFIG_STACKTRACE
- struct stack_trace trace;
- int i;
+ unsigned int nr_entries;
- trace.nr_entries = 0;
- trace.max_entries = TRACK_ADDRS_COUNT;
- trace.entries = p->addrs;
- trace.skip = 3;
metadata_access_enable();
- save_stack_trace(&trace);
+ nr_entries = stack_trace_save(p->addrs, TRACK_ADDRS_COUNT, 3);
metadata_access_disable();
- /* See rant in lockdep.c */
- if (trace.nr_entries != 0 &&
- trace.entries[trace.nr_entries - 1] == ULONG_MAX)
- trace.nr_entries--;
-
- for (i = trace.nr_entries; i < TRACK_ADDRS_COUNT; i++)
- p->addrs[i] = 0;
+ if (nr_entries < TRACK_ADDRS_COUNT)
+ p->addrs[nr_entries] = 0;
#endif
p->addr = addr;
p->cpu = smp_processor_id();
p->pid = current->pid;
p->when = jiffies;
- } else
+ } else {
memset(p, 0, sizeof(struct track));
+ }
}
static void init_tracking(struct kmem_cache *s, void *object)
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index e86ba6e74b50..e5e9e1fcac01 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -18,6 +18,7 @@
#include <linux/interrupt.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
+#include <linux/set_memory.h>
#include <linux/debugobjects.h>
#include <linux/kallsyms.h>
#include <linux/list.h>
@@ -1059,24 +1060,9 @@ static void vb_free(const void *addr, unsigned long size)
spin_unlock(&vb->lock);
}
-/**
- * vm_unmap_aliases - unmap outstanding lazy aliases in the vmap layer
- *
- * The vmap/vmalloc layer lazily flushes kernel virtual mappings primarily
- * to amortize TLB flushing overheads. What this means is that any page you
- * have now, may, in a former life, have been mapped into kernel virtual
- * address by the vmap layer and so there might be some CPUs with TLB entries
- * still referencing that page (additional to the regular 1:1 kernel mapping).
- *
- * vm_unmap_aliases flushes all such lazy mappings. After it returns, we can
- * be sure that none of the pages we have control over will have any aliases
- * from the vmap layer.
- */
-void vm_unmap_aliases(void)
+static void _vm_unmap_aliases(unsigned long start, unsigned long end, int flush)
{
- unsigned long start = ULONG_MAX, end = 0;
int cpu;
- int flush = 0;
if (unlikely(!vmap_initialized))
return;
@@ -1113,6 +1099,27 @@ void vm_unmap_aliases(void)
flush_tlb_kernel_range(start, end);
mutex_unlock(&vmap_purge_lock);
}
+
+/**
+ * vm_unmap_aliases - unmap outstanding lazy aliases in the vmap layer
+ *
+ * The vmap/vmalloc layer lazily flushes kernel virtual mappings primarily
+ * to amortize TLB flushing overheads. What this means is that any page you
+ * have now, may, in a former life, have been mapped into kernel virtual
+ * address by the vmap layer and so there might be some CPUs with TLB entries
+ * still referencing that page (additional to the regular 1:1 kernel mapping).
+ *
+ * vm_unmap_aliases flushes all such lazy mappings. After it returns, we can
+ * be sure that none of the pages we have control over will have any aliases
+ * from the vmap layer.
+ */
+void vm_unmap_aliases(void)
+{
+ unsigned long start = ULONG_MAX, end = 0;
+ int flush = 0;
+
+ _vm_unmap_aliases(start, end, flush);
+}
EXPORT_SYMBOL_GPL(vm_unmap_aliases);
/**
@@ -1505,6 +1512,72 @@ struct vm_struct *remove_vm_area(const void *addr)
return NULL;
}
+static inline void set_area_direct_map(const struct vm_struct *area,
+ int (*set_direct_map)(struct page *page))
+{
+ int i;
+
+ for (i = 0; i < area->nr_pages; i++)
+ if (page_address(area->pages[i]))
+ set_direct_map(area->pages[i]);
+}
+
+/* Handle removing and resetting vm mappings related to the vm_struct. */
+static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages)
+{
+ unsigned long addr = (unsigned long)area->addr;
+ unsigned long start = ULONG_MAX, end = 0;
+ int flush_reset = area->flags & VM_FLUSH_RESET_PERMS;
+ int i;
+
+ /*
+ * The below block can be removed when all architectures that have
+ * direct map permissions also have set_direct_map_() implementations.
+ * This is concerned with resetting the direct map any an vm alias with
+ * execute permissions, without leaving a RW+X window.
+ */
+ if (flush_reset && !IS_ENABLED(CONFIG_ARCH_HAS_SET_DIRECT_MAP)) {
+ set_memory_nx(addr, area->nr_pages);
+ set_memory_rw(addr, area->nr_pages);
+ }
+
+ remove_vm_area(area->addr);
+
+ /* If this is not VM_FLUSH_RESET_PERMS memory, no need for the below. */
+ if (!flush_reset)
+ return;
+
+ /*
+ * If not deallocating pages, just do the flush of the VM area and
+ * return.
+ */
+ if (!deallocate_pages) {
+ vm_unmap_aliases();
+ return;
+ }
+
+ /*
+ * If execution gets here, flush the vm mapping and reset the direct
+ * map. Find the start and end range of the direct mappings to make sure
+ * the vm_unmap_aliases() flush includes the direct map.
+ */
+ for (i = 0; i < area->nr_pages; i++) {
+ if (page_address(area->pages[i])) {
+ start = min(addr, start);
+ end = max(addr, end);
+ }
+ }
+
+ /*
+ * Set direct map to something invalid so that it won't be cached if
+ * there are any accesses after the TLB flush, then flush the TLB and
+ * reset the direct map permissions to the default.
+ */
+ set_area_direct_map(area, set_direct_map_invalid_noflush);
+ _vm_unmap_aliases(start, end, 1);
+ set_area_direct_map(area, set_direct_map_default_noflush);
+}
+
static void __vunmap(const void *addr, int deallocate_pages)
{
struct vm_struct *area;
@@ -1526,7 +1599,8 @@ static void __vunmap(const void *addr, int deallocate_pages)
debug_check_no_locks_freed(area->addr, get_vm_area_size(area));
debug_check_no_obj_freed(area->addr, get_vm_area_size(area));
- remove_vm_area(addr);
+ vm_remove_mappings(area, deallocate_pages);
+
if (deallocate_pages) {
int i;
@@ -1961,8 +2035,9 @@ EXPORT_SYMBOL(vzalloc_node);
*/
void *vmalloc_exec(unsigned long size)
{
- return __vmalloc_node(size, 1, GFP_KERNEL, PAGE_KERNEL_EXEC,
- NUMA_NO_NODE, __builtin_return_address(0));
+ return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
+ GFP_KERNEL, PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS,
+ NUMA_NO_NODE, __builtin_return_address(0));
}
#if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index a815f73ee4d5..fd9de504e516 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -493,7 +493,7 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
total_scan += delta;
if (total_scan < 0) {
- pr_err("shrink_slab: %pF negative objects to delete nr=%ld\n",
+ pr_err("shrink_slab: %pS negative objects to delete nr=%ld\n",
shrinker->scan_objects, total_scan);
total_scan = freeable;
next_deferred = nr;