summaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig3
-rw-r--r--mm/Makefile2
-rw-r--r--mm/backing-dev.c17
-rw-r--r--mm/bootmem.c24
-rw-r--r--mm/memblock.c541
-rw-r--r--mm/memcontrol.c4
-rw-r--r--mm/memory.c16
-rw-r--r--mm/mempolicy.c9
-rw-r--r--mm/page-writeback.c8
-rw-r--r--mm/page_alloc.c8
-rw-r--r--mm/page_cgroup.c7
-rw-r--r--mm/percpu.c36
-rw-r--r--mm/slab.c2
-rw-r--r--mm/slob.c9
-rw-r--r--mm/slub.c84
-rw-r--r--mm/vmscan.c10
16 files changed, 696 insertions, 84 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 527136b22384..f4e516e9c37c 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -128,6 +128,9 @@ config SPARSEMEM_VMEMMAP
pfn_to_page and page_to_pfn operations. This is the most
efficient option when sufficient kernel resources are available.
+config HAVE_MEMBLOCK
+ boolean
+
# eventually, we can have this option just 'select SPARSEMEM'
config MEMORY_HOTPLUG
bool "Allow for memory hot-add"
diff --git a/mm/Makefile b/mm/Makefile
index 8982504bd03b..34b2546a9e37 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -15,6 +15,8 @@ obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
$(mmu-y)
obj-y += init-mm.o
+obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o
+
obj-$(CONFIG_BOUNCE) += bounce.o
obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o
obj-$(CONFIG_HAS_DMA) += dmapool.o
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 660a87a22511..123bcef13e51 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -104,15 +104,13 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
"b_more_io: %8lu\n"
"bdi_list: %8u\n"
"state: %8lx\n"
- "wb_mask: %8lx\n"
- "wb_list: %8u\n"
- "wb_cnt: %8u\n",
+ "wb_list: %8u\n",
(unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)),
(unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)),
K(bdi_thresh), K(dirty_thresh),
K(background_thresh), nr_wb, nr_dirty, nr_io, nr_more_io,
- !list_empty(&bdi->bdi_list), bdi->state, bdi->wb_mask,
- !list_empty(&bdi->wb_list), bdi->wb_cnt);
+ !list_empty(&bdi->bdi_list), bdi->state,
+ !list_empty(&bdi->wb_list));
#undef K
return 0;
@@ -340,14 +338,13 @@ int bdi_has_dirty_io(struct backing_dev_info *bdi)
static void bdi_flush_io(struct backing_dev_info *bdi)
{
struct writeback_control wbc = {
- .bdi = bdi,
.sync_mode = WB_SYNC_NONE,
.older_than_this = NULL,
.range_cyclic = 1,
.nr_to_write = 1024,
};
- writeback_inodes_wbc(&wbc);
+ writeback_inodes_wb(&bdi->wb, &wbc);
}
/*
@@ -675,12 +672,6 @@ int bdi_init(struct backing_dev_info *bdi)
bdi_wb_init(&bdi->wb, bdi);
- /*
- * Just one thread support for now, hard code mask and count
- */
- bdi->wb_mask = 1;
- bdi->wb_cnt = 1;
-
for (i = 0; i < NR_BDI_STAT_ITEMS; i++) {
err = percpu_counter_init(&bdi->bdi_stat[i], 0);
if (err)
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 58c66cc5056a..142c84a54993 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -833,15 +833,24 @@ static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata,
void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
unsigned long align, unsigned long goal)
{
+ void *ptr;
+
if (WARN_ON_ONCE(slab_is_available()))
return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
#ifdef CONFIG_NO_BOOTMEM
- return __alloc_memory_core_early(pgdat->node_id, size, align,
+ ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
+ goal, -1ULL);
+ if (ptr)
+ return ptr;
+
+ ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align,
goal, -1ULL);
#else
- return ___alloc_bootmem_node(pgdat->bdata, size, align, goal, 0);
+ ptr = ___alloc_bootmem_node(pgdat->bdata, size, align, goal, 0);
#endif
+
+ return ptr;
}
void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,
@@ -977,14 +986,21 @@ void * __init __alloc_bootmem_low(unsigned long size, unsigned long align,
void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size,
unsigned long align, unsigned long goal)
{
+ void *ptr;
+
if (WARN_ON_ONCE(slab_is_available()))
return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
#ifdef CONFIG_NO_BOOTMEM
- return __alloc_memory_core_early(pgdat->node_id, size, align,
+ ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
+ goal, ARCH_LOW_ADDRESS_LIMIT);
+ if (ptr)
+ return ptr;
+ ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align,
goal, ARCH_LOW_ADDRESS_LIMIT);
#else
- return ___alloc_bootmem_node(pgdat->bdata, size, align,
+ ptr = ___alloc_bootmem_node(pgdat->bdata, size, align,
goal, ARCH_LOW_ADDRESS_LIMIT);
#endif
+ return ptr;
}
diff --git a/mm/memblock.c b/mm/memblock.c
new file mode 100644
index 000000000000..3024eb30fc27
--- /dev/null
+++ b/mm/memblock.c
@@ -0,0 +1,541 @@
+/*
+ * Procedures for maintaining information about logical memory blocks.
+ *
+ * Peter Bergner, IBM Corp. June 2001.
+ * Copyright (C) 2001 Peter Bergner.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/bitops.h>
+#include <linux/memblock.h>
+
+#define MEMBLOCK_ALLOC_ANYWHERE 0
+
+struct memblock memblock;
+
+static int memblock_debug;
+
+static int __init early_memblock(char *p)
+{
+ if (p && strstr(p, "debug"))
+ memblock_debug = 1;
+ return 0;
+}
+early_param("memblock", early_memblock);
+
+static void memblock_dump(struct memblock_region *region, char *name)
+{
+ unsigned long long base, size;
+ int i;
+
+ pr_info(" %s.cnt = 0x%lx\n", name, region->cnt);
+
+ for (i = 0; i < region->cnt; i++) {
+ base = region->region[i].base;
+ size = region->region[i].size;
+
+ pr_info(" %s[0x%x]\t0x%016llx - 0x%016llx, 0x%llx bytes\n",
+ name, i, base, base + size - 1, size);
+ }
+}
+
+void memblock_dump_all(void)
+{
+ if (!memblock_debug)
+ return;
+
+ pr_info("MEMBLOCK configuration:\n");
+ pr_info(" rmo_size = 0x%llx\n", (unsigned long long)memblock.rmo_size);
+ pr_info(" memory.size = 0x%llx\n", (unsigned long long)memblock.memory.size);
+
+ memblock_dump(&memblock.memory, "memory");
+ memblock_dump(&memblock.reserved, "reserved");
+}
+
+static unsigned long memblock_addrs_overlap(u64 base1, u64 size1, u64 base2,
+ u64 size2)
+{
+ return ((base1 < (base2 + size2)) && (base2 < (base1 + size1)));
+}
+
+static long memblock_addrs_adjacent(u64 base1, u64 size1, u64 base2, u64 size2)
+{
+ if (base2 == base1 + size1)
+ return 1;
+ else if (base1 == base2 + size2)
+ return -1;
+
+ return 0;
+}
+
+static long memblock_regions_adjacent(struct memblock_region *rgn,
+ unsigned long r1, unsigned long r2)
+{
+ u64 base1 = rgn->region[r1].base;
+ u64 size1 = rgn->region[r1].size;
+ u64 base2 = rgn->region[r2].base;
+ u64 size2 = rgn->region[r2].size;
+
+ return memblock_addrs_adjacent(base1, size1, base2, size2);
+}
+
+static void memblock_remove_region(struct memblock_region *rgn, unsigned long r)
+{
+ unsigned long i;
+
+ for (i = r; i < rgn->cnt - 1; i++) {
+ rgn->region[i].base = rgn->region[i + 1].base;
+ rgn->region[i].size = rgn->region[i + 1].size;
+ }
+ rgn->cnt--;
+}
+
+/* Assumption: base addr of region 1 < base addr of region 2 */
+static void memblock_coalesce_regions(struct memblock_region *rgn,
+ unsigned long r1, unsigned long r2)
+{
+ rgn->region[r1].size += rgn->region[r2].size;
+ memblock_remove_region(rgn, r2);
+}
+
+void __init memblock_init(void)
+{
+ /* Create a dummy zero size MEMBLOCK which will get coalesced away later.
+ * This simplifies the memblock_add() code below...
+ */
+ memblock.memory.region[0].base = 0;
+ memblock.memory.region[0].size = 0;
+ memblock.memory.cnt = 1;
+
+ /* Ditto. */
+ memblock.reserved.region[0].base = 0;
+ memblock.reserved.region[0].size = 0;
+ memblock.reserved.cnt = 1;
+}
+
+void __init memblock_analyze(void)
+{
+ int i;
+
+ memblock.memory.size = 0;
+
+ for (i = 0; i < memblock.memory.cnt; i++)
+ memblock.memory.size += memblock.memory.region[i].size;
+}
+
+static long memblock_add_region(struct memblock_region *rgn, u64 base, u64 size)
+{
+ unsigned long coalesced = 0;
+ long adjacent, i;
+
+ if ((rgn->cnt == 1) && (rgn->region[0].size == 0)) {
+ rgn->region[0].base = base;
+ rgn->region[0].size = size;
+ return 0;
+ }
+
+ /* First try and coalesce this MEMBLOCK with another. */
+ for (i = 0; i < rgn->cnt; i++) {
+ u64 rgnbase = rgn->region[i].base;
+ u64 rgnsize = rgn->region[i].size;
+
+ if ((rgnbase == base) && (rgnsize == size))
+ /* Already have this region, so we're done */
+ return 0;
+
+ adjacent = memblock_addrs_adjacent(base, size, rgnbase, rgnsize);
+ if (adjacent > 0) {
+ rgn->region[i].base -= size;
+ rgn->region[i].size += size;
+ coalesced++;
+ break;
+ } else if (adjacent < 0) {
+ rgn->region[i].size += size;
+ coalesced++;
+ break;
+ }
+ }
+
+ if ((i < rgn->cnt - 1) && memblock_regions_adjacent(rgn, i, i+1)) {
+ memblock_coalesce_regions(rgn, i, i+1);
+ coalesced++;
+ }
+
+ if (coalesced)
+ return coalesced;
+ if (rgn->cnt >= MAX_MEMBLOCK_REGIONS)
+ return -1;
+
+ /* Couldn't coalesce the MEMBLOCK, so add it to the sorted table. */
+ for (i = rgn->cnt - 1; i >= 0; i--) {
+ if (base < rgn->region[i].base) {
+ rgn->region[i+1].base = rgn->region[i].base;
+ rgn->region[i+1].size = rgn->region[i].size;
+ } else {
+ rgn->region[i+1].base = base;
+ rgn->region[i+1].size = size;
+ break;
+ }
+ }
+
+ if (base < rgn->region[0].base) {
+ rgn->region[0].base = base;
+ rgn->region[0].size = size;
+ }
+ rgn->cnt++;
+
+ return 0;
+}
+
+long memblock_add(u64 base, u64 size)
+{
+ struct memblock_region *_rgn = &memblock.memory;
+
+ /* On pSeries LPAR systems, the first MEMBLOCK is our RMO region. */
+ if (base == 0)
+ memblock.rmo_size = size;
+
+ return memblock_add_region(_rgn, base, size);
+
+}
+
+static long __memblock_remove(struct memblock_region *rgn, u64 base, u64 size)
+{
+ u64 rgnbegin, rgnend;
+ u64 end = base + size;
+ int i;
+
+ rgnbegin = rgnend = 0; /* supress gcc warnings */
+
+ /* Find the region where (base, size) belongs to */
+ for (i=0; i < rgn->cnt; i++) {
+ rgnbegin = rgn->region[i].base;
+ rgnend = rgnbegin + rgn->region[i].size;
+
+ if ((rgnbegin <= base) && (end <= rgnend))
+ break;
+ }
+
+ /* Didn't find the region */
+ if (i == rgn->cnt)
+ return -1;
+
+ /* Check to see if we are removing entire region */
+ if ((rgnbegin == base) && (rgnend == end)) {
+ memblock_remove_region(rgn, i);
+ return 0;
+ }
+
+ /* Check to see if region is matching at the front */
+ if (rgnbegin == base) {
+ rgn->region[i].base = end;
+ rgn->region[i].size -= size;
+ return 0;
+ }
+
+ /* Check to see if the region is matching at the end */
+ if (rgnend == end) {
+ rgn->region[i].size -= size;
+ return 0;
+ }
+
+ /*
+ * We need to split the entry - adjust the current one to the
+ * beginging of the hole and add the region after hole.
+ */
+ rgn->region[i].size = base - rgn->region[i].base;
+ return memblock_add_region(rgn, end, rgnend - end);
+}
+
+long memblock_remove(u64 base, u64 size)
+{
+ return __memblock_remove(&memblock.memory, base, size);
+}
+
+long __init memblock_free(u64 base, u64 size)
+{
+ return __memblock_remove(&memblock.reserved, base, size);
+}
+
+long __init memblock_reserve(u64 base, u64 size)
+{
+ struct memblock_region *_rgn = &memblock.reserved;
+
+ BUG_ON(0 == size);
+
+ return memblock_add_region(_rgn, base, size);
+}
+
+long memblock_overlaps_region(struct memblock_region *rgn, u64 base, u64 size)
+{
+ unsigned long i;
+
+ for (i = 0; i < rgn->cnt; i++) {
+ u64 rgnbase = rgn->region[i].base;
+ u64 rgnsize = rgn->region[i].size;
+ if (memblock_addrs_overlap(base, size, rgnbase, rgnsize))
+ break;
+ }
+
+ return (i < rgn->cnt) ? i : -1;
+}
+
+static u64 memblock_align_down(u64 addr, u64 size)
+{
+ return addr & ~(size - 1);
+}
+
+static u64 memblock_align_up(u64 addr, u64 size)
+{
+ return (addr + (size - 1)) & ~(size - 1);
+}
+
+static u64 __init memblock_alloc_nid_unreserved(u64 start, u64 end,
+ u64 size, u64 align)
+{
+ u64 base, res_base;
+ long j;
+
+ base = memblock_align_down((end - size), align);
+ while (start <= base) {
+ j = memblock_overlaps_region(&memblock.reserved, base, size);
+ if (j < 0) {
+ /* this area isn't reserved, take it */
+ if (memblock_add_region(&memblock.reserved, base, size) < 0)
+ base = ~(u64)0;
+ return base;
+ }
+ res_base = memblock.reserved.region[j].base;
+ if (res_base < size)
+ break;
+ base = memblock_align_down(res_base - size, align);
+ }
+
+ return ~(u64)0;
+}
+
+static u64 __init memblock_alloc_nid_region(struct memblock_property *mp,
+ u64 (*nid_range)(u64, u64, int *),
+ u64 size, u64 align, int nid)
+{
+ u64 start, end;
+
+ start = mp->base;
+ end = start + mp->size;
+
+ start = memblock_align_up(start, align);
+ while (start < end) {
+ u64 this_end;
+ int this_nid;
+
+ this_end = nid_range(start, end, &this_nid);
+ if (this_nid == nid) {
+ u64 ret = memblock_alloc_nid_unreserved(start, this_end,
+ size, align);
+ if (ret != ~(u64)0)
+ return ret;
+ }
+ start = this_end;
+ }
+
+ return ~(u64)0;
+}
+
+u64 __init memblock_alloc_nid(u64 size, u64 align, int nid,
+ u64 (*nid_range)(u64 start, u64 end, int *nid))
+{
+ struct memblock_region *mem = &memblock.memory;
+ int i;
+
+ BUG_ON(0 == size);
+
+ size = memblock_align_up(size, align);
+
+ for (i = 0; i < mem->cnt; i++) {
+ u64 ret = memblock_alloc_nid_region(&mem->region[i],
+ nid_range,
+ size, align, nid);
+ if (ret != ~(u64)0)
+ return ret;
+ }
+
+ return memblock_alloc(size, align);
+}
+
+u64 __init memblock_alloc(u64 size, u64 align)
+{
+ return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ANYWHERE);
+}
+
+u64 __init memblock_alloc_base(u64 size, u64 align, u64 max_addr)
+{
+ u64 alloc;
+
+ alloc = __memblock_alloc_base(size, align, max_addr);
+
+ if (alloc == 0)
+ panic("ERROR: Failed to allocate 0x%llx bytes below 0x%llx.\n",
+ (unsigned long long) size, (unsigned long long) max_addr);
+
+ return alloc;
+}
+
+u64 __init __memblock_alloc_base(u64 size, u64 align, u64 max_addr)
+{
+ long i, j;
+ u64 base = 0;
+ u64 res_base;
+
+ BUG_ON(0 == size);
+
+ size = memblock_align_up(size, align);
+
+ /* On some platforms, make sure we allocate lowmem */
+ /* Note that MEMBLOCK_REAL_LIMIT may be MEMBLOCK_ALLOC_ANYWHERE */
+ if (max_addr == MEMBLOCK_ALLOC_ANYWHERE)
+ max_addr = MEMBLOCK_REAL_LIMIT;
+
+ for (i = memblock.memory.cnt - 1; i >= 0; i--) {
+ u64 memblockbase = memblock.memory.region[i].base;
+ u64 memblocksize = memblock.memory.region[i].size;
+
+ if (memblocksize < size)
+ continue;
+ if (max_addr == MEMBLOCK_ALLOC_ANYWHERE)
+ base = memblock_align_down(memblockbase + memblocksize - size, align);
+ else if (memblockbase < max_addr) {
+ base = min(memblockbase + memblocksize, max_addr);
+ base = memblock_align_down(base - size, align);
+ } else
+ continue;
+
+ while (base && memblockbase <= base) {
+ j = memblock_overlaps_region(&memblock.reserved, base, size);
+ if (j < 0) {
+ /* this area isn't reserved, take it */
+ if (memblock_add_region(&memblock.reserved, base, size) < 0)
+ return 0;
+ return base;
+ }
+ res_base = memblock.reserved.region[j].base;
+ if (res_base < size)
+ break;
+ base = memblock_align_down(res_base - size, align);
+ }
+ }
+ return 0;
+}
+
+/* You must call memblock_analyze() before this. */
+u64 __init memblock_phys_mem_size(void)
+{
+ return memblock.memory.size;
+}
+
+u64 memblock_end_of_DRAM(void)
+{
+ int idx = memblock.memory.cnt - 1;
+
+ return (memblock.memory.region[idx].base + memblock.memory.region[idx].size);
+}
+
+/* You must call memblock_analyze() after this. */
+void __init memblock_enforce_memory_limit(u64 memory_limit)
+{
+ unsigned long i;
+ u64 limit;
+ struct memblock_property *p;
+
+ if (!memory_limit)
+ return;
+
+ /* Truncate the memblock regions to satisfy the memory limit. */
+ limit = memory_limit;
+ for (i = 0; i < memblock.memory.cnt; i++) {
+ if (limit > memblock.memory.region[i].size) {
+ limit -= memblock.memory.region[i].size;
+ continue;
+ }
+
+ memblock.memory.region[i].size = limit;
+ memblock.memory.cnt = i + 1;
+ break;
+ }
+
+ if (memblock.memory.region[0].size < memblock.rmo_size)
+ memblock.rmo_size = memblock.memory.region[0].size;
+
+ memory_limit = memblock_end_of_DRAM();
+
+ /* And truncate any reserves above the limit also. */
+ for (i = 0; i < memblock.reserved.cnt; i++) {
+ p = &memblock.reserved.region[i];
+
+ if (p->base > memory_limit)
+ p->size = 0;
+ else if ((p->base + p->size) > memory_limit)
+ p->size = memory_limit - p->base;
+
+ if (p->size == 0) {
+ memblock_remove_region(&memblock.reserved, i);
+ i--;
+ }
+ }
+}
+
+int __init memblock_is_reserved(u64 addr)
+{
+ int i;
+
+ for (i = 0; i < memblock.reserved.cnt; i++) {
+ u64 upper = memblock.reserved.region[i].base +
+ memblock.reserved.region[i].size - 1;
+ if ((addr >= memblock.reserved.region[i].base) && (addr <= upper))
+ return 1;
+ }
+ return 0;
+}
+
+int memblock_is_region_reserved(u64 base, u64 size)
+{
+ return memblock_overlaps_region(&memblock.reserved, base, size);
+}
+
+/*
+ * Given a <base, len>, find which memory regions belong to this range.
+ * Adjust the request and return a contiguous chunk.
+ */
+int memblock_find(struct memblock_property *res)
+{
+ int i;
+ u64 rstart, rend;
+
+ rstart = res->base;
+ rend = rstart + res->size - 1;
+
+ for (i = 0; i < memblock.memory.cnt; i++) {
+ u64 start = memblock.memory.region[i].base;
+ u64 end = start + memblock.memory.region[i].size - 1;
+
+ if (start > rend)
+ return -1;
+
+ if ((end >= rstart) && (start < rend)) {
+ /* adjust the request */
+ if (rstart < start)
+ rstart = start;
+ if (rend > end)
+ rend = end;
+ res->base = rstart;
+ res->size = rend - rstart + 1;
+ return 0;
+ }
+ }
+ return -1;
+}
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index c6ece0a57595..20a8193a7af8 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1370,7 +1370,7 @@ static void memcg_wakeup_oom(struct mem_cgroup *mem)
static void memcg_oom_recover(struct mem_cgroup *mem)
{
- if (mem->oom_kill_disable && atomic_read(&mem->oom_lock))
+ if (atomic_read(&mem->oom_lock))
memcg_wakeup_oom(mem);
}
@@ -3781,6 +3781,8 @@ static int mem_cgroup_oom_control_write(struct cgroup *cgrp,
return -EINVAL;
}
mem->oom_kill_disable = val;
+ if (!val)
+ memcg_oom_recover(mem);
cgroup_unlock();
return 0;
}
diff --git a/mm/memory.c b/mm/memory.c
index 119b7ccdf39b..bde42c6d3633 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1394,10 +1394,20 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
return i ? : -EFAULT;
}
if (pages) {
- struct page *page = vm_normal_page(gate_vma, start, *pte);
+ struct page *page;
+
+ page = vm_normal_page(gate_vma, start, *pte);
+ if (!page) {
+ if (!(gup_flags & FOLL_DUMP) &&
+ is_zero_pfn(pte_pfn(*pte)))
+ page = pte_page(*pte);
+ else {
+ pte_unmap(pte);
+ return i ? : -EFAULT;
+ }
+ }
pages[i] = page;
- if (page)
- get_page(page);
+ get_page(page);
}
pte_unmap(pte);
if (vmas)
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 5d6fb339de03..5bc0a96beb51 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2094,7 +2094,7 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
NODEMASK_SCRATCH(scratch);
if (!scratch)
- return;
+ goto put_mpol;
/* contextualize the tmpfs mount point mempolicy */
new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
if (IS_ERR(new))
@@ -2103,19 +2103,20 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
task_lock(current);
ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
task_unlock(current);
- mpol_put(mpol); /* drop our ref on sb mpol */
if (ret)
- goto put_free;
+ goto put_new;
/* Create pseudo-vma that contains just the policy */
memset(&pvma, 0, sizeof(struct vm_area_struct));
pvma.vm_end = TASK_SIZE; /* policy covers entire file */
mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
-put_free:
+put_new:
mpol_put(new); /* drop initial ref */
free_scratch:
NODEMASK_SCRATCH_FREE(scratch);
+put_mpol:
+ mpol_put(mpol); /* drop our incoming ref on sb mpol */
}
}
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index bbd396ac9546..37498ef61548 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -495,7 +495,6 @@ static void balance_dirty_pages(struct address_space *mapping,
for (;;) {
struct writeback_control wbc = {
- .bdi = bdi,
.sync_mode = WB_SYNC_NONE,
.older_than_this = NULL,
.nr_to_write = write_chunk,
@@ -537,7 +536,7 @@ static void balance_dirty_pages(struct address_space *mapping,
* up.
*/
if (bdi_nr_reclaimable > bdi_thresh) {
- writeback_inodes_wbc(&wbc);
+ writeback_inodes_wb(&bdi->wb, &wbc);
pages_written += write_chunk - wbc.nr_to_write;
get_dirty_limits(&background_thresh, &dirty_thresh,
&bdi_thresh, bdi);
@@ -597,7 +596,7 @@ static void balance_dirty_pages(struct address_space *mapping,
(!laptop_mode && ((global_page_state(NR_FILE_DIRTY)
+ global_page_state(NR_UNSTABLE_NFS))
> background_thresh)))
- bdi_start_writeback(bdi, NULL, 0);
+ bdi_start_background_writeback(bdi);
}
void set_page_dirty_balance(struct page *page, int page_mkwrite)
@@ -705,9 +704,8 @@ void laptop_mode_timer_fn(unsigned long data)
* We want to write everything out, not just down to the dirty
* threshold
*/
-
if (bdi_has_dirty_io(&q->backing_dev_info))
- bdi_start_writeback(&q->backing_dev_info, NULL, nr_pages);
+ bdi_start_writeback(&q->backing_dev_info, nr_pages);
}
/*
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 431214b941ac..9bd339eb04c6 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3634,6 +3634,9 @@ void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
int i;
void *ptr;
+ if (limit > get_max_mapped())
+ limit = get_max_mapped();
+
/* need to go over early_node_map to find out good range for node */
for_each_active_range_index_in_nid(i, nid) {
u64 addr;
@@ -3659,6 +3662,11 @@ void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
ptr = phys_to_virt(addr);
memset(ptr, 0, size);
reserve_early_without_check(addr, addr + size, "BOOTMEM");
+ /*
+ * The min_count is set to 0 so that bootmem allocated blocks
+ * are never reported as leaks.
+ */
+ kmemleak_alloc(ptr, size, 0, 0);
return ptr;
}
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index 6c0081441a32..5bffada7cde1 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -9,6 +9,7 @@
#include <linux/vmalloc.h>
#include <linux/cgroup.h>
#include <linux/swapops.h>
+#include <linux/kmemleak.h>
static void __meminit
__init_page_cgroup(struct page_cgroup *pc, unsigned long pfn)
@@ -126,6 +127,12 @@ static int __init_refok init_section_page_cgroup(unsigned long pfn)
if (!base)
base = vmalloc(table_size);
}
+ /*
+ * The value stored in section->page_cgroup is (base - pfn)
+ * and it does not point to the memory block allocated above,
+ * causing kmemleak false positives.
+ */
+ kmemleak_not_leak(base);
} else {
/*
* We don't have to allocate page_cgroup again, but
diff --git a/mm/percpu.c b/mm/percpu.c
index 39f7dfd59585..6470e7710231 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -229,8 +229,8 @@ static int __maybe_unused pcpu_page_idx(unsigned int cpu, int page_idx)
return pcpu_unit_map[cpu] * pcpu_unit_pages + page_idx;
}
-static unsigned long __maybe_unused pcpu_chunk_addr(struct pcpu_chunk *chunk,
- unsigned int cpu, int page_idx)
+static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk,
+ unsigned int cpu, int page_idx)
{
return (unsigned long)chunk->base_addr + pcpu_unit_offsets[cpu] +
(page_idx << PAGE_SHIFT);
@@ -978,7 +978,32 @@ bool is_kernel_percpu_address(unsigned long addr)
*/
phys_addr_t per_cpu_ptr_to_phys(void *addr)
{
- if (pcpu_addr_in_first_chunk(addr)) {
+ void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr);
+ bool in_first_chunk = false;
+ unsigned long first_start, first_end;
+ unsigned int cpu;
+
+ /*
+ * The following test on first_start/end isn't strictly
+ * necessary but will speed up lookups of addresses which
+ * aren't in the first chunk.
+ */
+ first_start = pcpu_chunk_addr(pcpu_first_chunk, pcpu_first_unit_cpu, 0);
+ first_end = pcpu_chunk_addr(pcpu_first_chunk, pcpu_last_unit_cpu,
+ pcpu_unit_pages);
+ if ((unsigned long)addr >= first_start &&
+ (unsigned long)addr < first_end) {
+ for_each_possible_cpu(cpu) {
+ void *start = per_cpu_ptr(base, cpu);
+
+ if (addr >= start && addr < start + pcpu_unit_size) {
+ in_first_chunk = true;
+ break;
+ }
+ }
+ }
+
+ if (in_first_chunk) {
if ((unsigned long)addr < VMALLOC_START ||
(unsigned long)addr >= VMALLOC_END)
return __pa(addr);
@@ -1086,7 +1111,7 @@ struct pcpu_alloc_info * __init pcpu_build_alloc_info(
static int group_map[NR_CPUS] __initdata;
static int group_cnt[NR_CPUS] __initdata;
const size_t static_size = __per_cpu_end - __per_cpu_start;
- int group_cnt_max = 0, nr_groups = 1, nr_units = 0;
+ int nr_groups = 1, nr_units = 0;
size_t size_sum, min_unit_size, alloc_size;
int upa, max_upa, uninitialized_var(best_upa); /* units_per_alloc */
int last_allocs, group, unit;
@@ -1096,7 +1121,7 @@ struct pcpu_alloc_info * __init pcpu_build_alloc_info(
/* this function may be called multiple times */
memset(group_map, 0, sizeof(group_map));
- memset(group_cnt, 0, sizeof(group_map));
+ memset(group_cnt, 0, sizeof(group_cnt));
/*
* Determine min_unit_size, alloc_size and max_upa such that
@@ -1130,7 +1155,6 @@ struct pcpu_alloc_info * __init pcpu_build_alloc_info(
}
group_map[cpu] = group;
group_cnt[group]++;
- group_cnt_max = max(group_cnt_max, group_cnt[group]);
}
/*
diff --git a/mm/slab.c b/mm/slab.c
index e49f8f46f46d..29aad44a55c2 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -861,7 +861,7 @@ static void __cpuinit start_cpu_timer(int cpu)
*/
if (keventd_up() && reap_work->work.func == NULL) {
init_reap_node(cpu);
- INIT_DELAYED_WORK(reap_work, cache_reap);
+ INIT_DELAYED_WORK_DEFERRABLE(reap_work, cache_reap);
schedule_delayed_work_on(cpu, reap_work,
__round_jiffies_relative(HZ, cpu));
}
diff --git a/mm/slob.c b/mm/slob.c
index 23631e2bb57a..6a208f81888a 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -394,6 +394,7 @@ static void slob_free(void *block, int size)
slob_t *prev, *next, *b = (slob_t *)block;
slobidx_t units;
unsigned long flags;
+ struct list_head *slob_list;
if (unlikely(ZERO_OR_NULL_PTR(block)))
return;
@@ -422,7 +423,13 @@ static void slob_free(void *block, int size)
set_slob(b, units,
(void *)((unsigned long)(b +
SLOB_UNITS(PAGE_SIZE)) & PAGE_MASK));
- set_slob_page_free(sp, &free_slob_small);
+ if (size < SLOB_BREAK1)
+ slob_list = &free_slob_small;
+ else if (size < SLOB_BREAK2)
+ slob_list = &free_slob_medium;
+ else
+ slob_list = &free_slob_large;
+ set_slob_page_free(sp, slob_list);
goto out;
}
diff --git a/mm/slub.c b/mm/slub.c
index 39d39653239b..fba51d6d4cc4 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -107,11 +107,17 @@
* the fast path and disables lockless freelists.
*/
+#define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
+ SLAB_TRACE | SLAB_DEBUG_FREE)
+
+static inline int kmem_cache_debug(struct kmem_cache *s)
+{
#ifdef CONFIG_SLUB_DEBUG
-#define SLABDEBUG 1
+ return unlikely(s->flags & SLAB_DEBUG_FLAGS);
#else
-#define SLABDEBUG 0
+ return 0;
#endif
+}
/*
* Issues still to be resolved:
@@ -162,8 +168,8 @@
#define MAX_OBJS_PER_PAGE 65535 /* since page.objects is u16 */
/* Internal SLUB flags */
-#define __OBJECT_POISON 0x80000000 /* Poison object */
-#define __SYSFS_ADD_DEFERRED 0x40000000 /* Not yet visible via sysfs */
+#define __OBJECT_POISON 0x80000000UL /* Poison object */
+#define __SYSFS_ADD_DEFERRED 0x40000000UL /* Not yet visible via sysfs */
static int kmem_size = sizeof(struct kmem_cache);
@@ -1073,7 +1079,7 @@ static inline struct page *alloc_slab_page(gfp_t flags, int node,
flags |= __GFP_NOTRACK;
- if (node == -1)
+ if (node == NUMA_NO_NODE)
return alloc_pages(flags, order);
else
return alloc_pages_exact_node(node, flags, order);
@@ -1157,9 +1163,6 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
inc_slabs_node(s, page_to_nid(page), page->objects);
page->slab = s;
page->flags |= 1 << PG_slab;
- if (s->flags & (SLAB_DEBUG_FREE | SLAB_RED_ZONE | SLAB_POISON |
- SLAB_STORE_USER | SLAB_TRACE))
- __SetPageSlubDebug(page);
start = page_address(page);
@@ -1186,14 +1189,13 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
int order = compound_order(page);
int pages = 1 << order;
- if (unlikely(SLABDEBUG && PageSlubDebug(page))) {
+ if (kmem_cache_debug(s)) {
void *p;
slab_pad_check(s, page);
for_each_object(p, s, page_address(page),
page->objects)
check_object(s, page, p, 0);
- __ClearPageSlubDebug(page);
}
kmemcheck_free_shadow(page, compound_order(page));
@@ -1387,7 +1389,7 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags)
static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node)
{
struct page *page;
- int searchnode = (node == -1) ? numa_node_id() : node;
+ int searchnode = (node == NUMA_NO_NODE) ? numa_node_id() : node;
page = get_partial_node(get_node(s, searchnode));
if (page || node != -1)
@@ -1415,8 +1417,7 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail)
stat(s, tail ? DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD);
} else {
stat(s, DEACTIVATE_FULL);
- if (SLABDEBUG && PageSlubDebug(page) &&
- (s->flags & SLAB_STORE_USER))
+ if (kmem_cache_debug(s) && (s->flags & SLAB_STORE_USER))
add_full(n, page);
}
slab_unlock(page);
@@ -1515,7 +1516,7 @@ static void flush_all(struct kmem_cache *s)
static inline int node_match(struct kmem_cache_cpu *c, int node)
{
#ifdef CONFIG_NUMA
- if (node != -1 && c->node != node)
+ if (node != NUMA_NO_NODE && c->node != node)
return 0;
#endif
return 1;
@@ -1624,7 +1625,7 @@ load_freelist:
object = c->page->freelist;
if (unlikely(!object))
goto another_slab;
- if (unlikely(SLABDEBUG && PageSlubDebug(c->page)))
+ if (kmem_cache_debug(s))
goto debug;
c->freelist = get_freepointer(s, object);
@@ -1727,7 +1728,7 @@ static __always_inline void *slab_alloc(struct kmem_cache *s,
void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
{
- void *ret = slab_alloc(s, gfpflags, -1, _RET_IP_);
+ void *ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, _RET_IP_);
trace_kmem_cache_alloc(_RET_IP_, ret, s->objsize, s->size, gfpflags);
@@ -1738,7 +1739,7 @@ EXPORT_SYMBOL(kmem_cache_alloc);
#ifdef CONFIG_TRACING
void *kmem_cache_alloc_notrace(struct kmem_cache *s, gfp_t gfpflags)
{
- return slab_alloc(s, gfpflags, -1, _RET_IP_);
+ return slab_alloc(s, gfpflags, NUMA_NO_NODE, _RET_IP_);
}
EXPORT_SYMBOL(kmem_cache_alloc_notrace);
#endif
@@ -1783,7 +1784,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
stat(s, FREE_SLOWPATH);
slab_lock(page);
- if (unlikely(SLABDEBUG && PageSlubDebug(page)))
+ if (kmem_cache_debug(s))
goto debug;
checks_ok:
@@ -2490,7 +2491,6 @@ void kmem_cache_destroy(struct kmem_cache *s)
s->refcount--;
if (!s->refcount) {
list_del(&s->list);
- up_write(&slub_lock);
if (kmem_cache_close(s)) {
printk(KERN_ERR "SLUB %s: %s called for cache that "
"still has objects.\n", s->name, __func__);
@@ -2499,8 +2499,8 @@ void kmem_cache_destroy(struct kmem_cache *s)
if (s->flags & SLAB_DESTROY_BY_RCU)
rcu_barrier();
sysfs_slab_remove(s);
- } else
- up_write(&slub_lock);
+ }
+ up_write(&slub_lock);
}
EXPORT_SYMBOL(kmem_cache_destroy);
@@ -2728,7 +2728,7 @@ void *__kmalloc(size_t size, gfp_t flags)
if (unlikely(ZERO_OR_NULL_PTR(s)))
return s;
- ret = slab_alloc(s, flags, -1, _RET_IP_);
+ ret = slab_alloc(s, flags, NUMA_NO_NODE, _RET_IP_);
trace_kmalloc(_RET_IP_, ret, size, s->size, flags);
@@ -3118,9 +3118,12 @@ void __init kmem_cache_init(void)
slab_state = UP;
/* Provide the correct kmalloc names now that the caches are up */
- for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++)
- kmalloc_caches[i]. name =
- kasprintf(GFP_NOWAIT, "kmalloc-%d", 1 << i);
+ for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) {
+ char *s = kasprintf(GFP_NOWAIT, "kmalloc-%d", 1 << i);
+
+ BUG_ON(!s);
+ kmalloc_caches[i].name = s;
+ }
#ifdef CONFIG_SMP
register_cpu_notifier(&slab_notifier);
@@ -3223,14 +3226,12 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size,
*/
s->objsize = max(s->objsize, (int)size);
s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *)));
- up_write(&slub_lock);
if (sysfs_slab_alias(s, name)) {
- down_write(&slub_lock);
s->refcount--;
- up_write(&slub_lock);
goto err;
}
+ up_write(&slub_lock);
return s;
}
@@ -3239,14 +3240,12 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size,
if (kmem_cache_open(s, GFP_KERNEL, name,
size, align, flags, ctor)) {
list_add(&s->list, &slab_caches);
- up_write(&slub_lock);
if (sysfs_slab_add(s)) {
- down_write(&slub_lock);
list_del(&s->list);
- up_write(&slub_lock);
kfree(s);
goto err;
}
+ up_write(&slub_lock);
return s;
}
kfree(s);
@@ -3312,7 +3311,7 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller)
if (unlikely(ZERO_OR_NULL_PTR(s)))
return s;
- ret = slab_alloc(s, gfpflags, -1, caller);
+ ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, caller);
/* Honor the call site pointer we recieved. */
trace_kmalloc(caller, ret, size, s->size, gfpflags);
@@ -3395,16 +3394,6 @@ static void validate_slab_slab(struct kmem_cache *s, struct page *page,
} else
printk(KERN_INFO "SLUB %s: Skipped busy slab 0x%p\n",
s->name, page);
-
- if (s->flags & DEBUG_DEFAULT_FLAGS) {
- if (!PageSlubDebug(page))
- printk(KERN_ERR "SLUB %s: SlubDebug not set "
- "on slab 0x%p\n", s->name, page);
- } else {
- if (PageSlubDebug(page))
- printk(KERN_ERR "SLUB %s: SlubDebug set on "
- "slab 0x%p\n", s->name, page);
- }
}
static int validate_slab_node(struct kmem_cache *s,
@@ -4504,6 +4493,13 @@ static int sysfs_slab_add(struct kmem_cache *s)
static void sysfs_slab_remove(struct kmem_cache *s)
{
+ if (slab_state < SYSFS)
+ /*
+ * Sysfs has not been setup yet so no need to remove the
+ * cache from sysfs.
+ */
+ return;
+
kobject_uevent(&s->kobj, KOBJ_REMOVE);
kobject_del(&s->kobj);
kobject_put(&s->kobj);
@@ -4549,8 +4545,11 @@ static int __init slab_sysfs_init(void)
struct kmem_cache *s;
int err;
+ down_write(&slub_lock);
+
slab_kset = kset_create_and_add("slab", &slab_uevent_ops, kernel_kobj);
if (!slab_kset) {
+ up_write(&slub_lock);
printk(KERN_ERR "Cannot register slab subsystem.\n");
return -ENOSYS;
}
@@ -4575,6 +4574,7 @@ static int __init slab_sysfs_init(void)
kfree(al);
}
+ up_write(&slub_lock);
resiliency_test();
return 0;
}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 9c7e57cc63a3..b94fe1b3da43 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -213,8 +213,9 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
list_for_each_entry(shrinker, &shrinker_list, list) {
unsigned long long delta;
unsigned long total_scan;
- unsigned long max_pass = (*shrinker->shrink)(0, gfp_mask);
+ unsigned long max_pass;
+ max_pass = (*shrinker->shrink)(shrinker, 0, gfp_mask);
delta = (4 * scanned) / shrinker->seeks;
delta *= max_pass;
do_div(delta, lru_pages + 1);
@@ -242,8 +243,9 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
int shrink_ret;
int nr_before;
- nr_before = (*shrinker->shrink)(0, gfp_mask);
- shrink_ret = (*shrinker->shrink)(this_scan, gfp_mask);
+ nr_before = (*shrinker->shrink)(shrinker, 0, gfp_mask);
+ shrink_ret = (*shrinker->shrink)(shrinker, this_scan,
+ gfp_mask);
if (shrink_ret == -1)
break;
if (shrink_ret < nr_before)
@@ -296,7 +298,7 @@ static int may_write_to_queue(struct backing_dev_info *bdi)
static void handle_write_error(struct address_space *mapping,
struct page *page, int error)
{
- lock_page(page);
+ lock_page_nosync(page);
if (page_mapping(page) == mapping)
mapping_set_error(mapping, error);
unlock_page(page);