author     Stefan Richter <stefanr@s5r6.in-berlin.de>  2011-05-10 20:52:07 +0200
committer  Stefan Richter <stefanr@s5r6.in-berlin.de>  2011-05-10 22:50:41 +0200
commit     020abf03cd659388f94cb328e1e1df0656e0d7ff (patch)
tree       40d05011708ad1b4a05928d167eb120420581aa6 /mm
parent     0ff8fbc61727c926883eec381fbd3d32d1fab504 (diff)
parent     693d92a1bbc9e42681c42ed190bd42b636ca876f (diff)
Merge tag 'v2.6.39-rc7'
in order to pull in changes in drivers/media/dvb/firewire/ and sound/firewire/.
Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig  40
-rw-r--r--  mm/Kconfig.debug  25
-rw-r--r--  mm/Makefile  11
-rw-r--r--  mm/backing-dev.c  18
-rw-r--r--  mm/bootmem.c  188
-rw-r--r--  mm/compaction.c  209
-rw-r--r--  mm/dmapool.c  16
-rw-r--r--  mm/filemap.c  236
-rw-r--r--  mm/huge_memory.c  2393
-rw-r--r--  mm/hugetlb.c  127
-rw-r--r--  mm/hwpoison-inject.c  2
-rw-r--r--  mm/internal.h  10
-rw-r--r--  mm/kmemleak-test.c  6
-rw-r--r--  mm/kmemleak.c  19
-rw-r--r--  mm/ksm.c  106
-rw-r--r--  mm/madvise.c  10
-rw-r--r--  mm/memblock.c  251
-rw-r--r--  mm/memcontrol.c  928
-rw-r--r--  mm/memory-failure.c  156
-rw-r--r--  mm/memory.c  530
-rw-r--r--  mm/memory_hotplug.c  25
-rw-r--r--  mm/mempolicy.c  42
-rw-r--r--  mm/migrate.c  192
-rw-r--r--  mm/mincore.c  7
-rw-r--r--  mm/mlock.c  180
-rw-r--r--  mm/mmap.c  57
-rw-r--r--  mm/mmu_notifier.c  20
-rw-r--r--  mm/mmzone.c  21
-rw-r--r--  mm/mprotect.c  20
-rw-r--r--  mm/mremap.c  24
-rw-r--r--  mm/nobootmem.c  427
-rw-r--r--  mm/nommu.c  92
-rw-r--r--  mm/oom_kill.c  98
-rw-r--r--  mm/page-writeback.c  36
-rw-r--r--  mm/page_alloc.c  356
-rw-r--r--  mm/page_cgroup.c  140
-rw-r--r--  mm/page_io.c  2
-rw-r--r--  mm/pagewalk.c  23
-rw-r--r--  mm/percpu-vm.c  2
-rw-r--r--  mm/percpu.c  25
-rw-r--r--  mm/pgtable-generic.c  121
-rw-r--r--  mm/readahead.c  18
-rw-r--r--  mm/rmap.c  200
-rw-r--r--  mm/shmem.c  35
-rw-r--r--  mm/slab.c  143
-rw-r--r--  mm/slob.c  11
-rw-r--r--  mm/slub.c  461
-rw-r--r--  mm/sparse-vmemmap.c  2
-rw-r--r--  mm/sparse.c  6
-rw-r--r--  mm/swap.c  320
-rw-r--r--  mm/swap_state.c  11
-rw-r--r--  mm/swapfile.c  416
-rw-r--r--  mm/truncate.c  39
-rw-r--r--  mm/util.c  23
-rw-r--r--  mm/vmalloc.c  248
-rw-r--r--  mm/vmscan.c  477
-rw-r--r--  mm/vmstat.c  223
57 files changed, 7274 insertions, 2550 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index c2c8a4a11898..e9c0c61f2ddd 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -179,7 +179,7 @@ config SPLIT_PTLOCK_CPUS
config COMPACTION
bool "Allow for memory compaction"
select MIGRATION
- depends on EXPERIMENTAL && HUGETLB_PAGE && MMU
+ depends on MMU
help
Allows the compaction of memory for the allocation of huge pages.
@@ -302,6 +302,44 @@ config NOMMU_INITIAL_TRIM_EXCESS
See Documentation/nommu-mmap.txt for more information.
+config TRANSPARENT_HUGEPAGE
+ bool "Transparent Hugepage Support"
+ depends on X86 && MMU
+ select COMPACTION
+ help
+ Transparent Hugepages allows the kernel to use huge pages and
+ huge tlb transparently to the applications whenever possible.
+ This feature can improve computing performance to certain
+ applications by speeding up page faults during memory
+ allocation, by reducing the number of tlb misses and by speeding
+ up the pagetable walking.
+
+ If memory constrained on embedded, you may want to say N.
+
+choice
+ prompt "Transparent Hugepage Support sysfs defaults"
+ depends on TRANSPARENT_HUGEPAGE
+ default TRANSPARENT_HUGEPAGE_ALWAYS
+ help
+ Selects the sysfs defaults for Transparent Hugepage Support.
+
+ config TRANSPARENT_HUGEPAGE_ALWAYS
+ bool "always"
+ help
+ Enabling Transparent Hugepage always, can increase the
+ memory footprint of applications without a guaranteed
+ benefit but it will work automatically for all applications.
+
+ config TRANSPARENT_HUGEPAGE_MADVISE
+ bool "madvise"
+ help
+ Enabling Transparent Hugepage madvise, will only provide a
+ performance improvement benefit to the applications using
+ madvise(MADV_HUGEPAGE) but it won't risk to increase the
+ memory footprint of applications without a guaranteed
+ benefit.
+endchoice
+
#
# UP and nommu archs use km based percpu allocator
#
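The TRANSPARENT_HUGEPAGE_MADVISE default above matters only to applications that opt in per region with madvise(MADV_HUGEPAGE). A minimal user-space sketch of such an opt-in, assuming the x86 2 MiB huge page size and defining MADV_HUGEPAGE locally in case the libc headers predate it (illustrative, not part of this patch):

#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>

#ifndef MADV_HUGEPAGE
#define MADV_HUGEPAGE 14		/* value from asm-generic/mman-common.h */
#endif

#define LEN (2UL * 1024 * 1024)		/* one 2 MiB huge page on x86 */

int main(void)
{
	void *buf;

	/* huge-page alignment makes the region eligible for a single PMD mapping */
	if (posix_memalign(&buf, LEN, LEN))
		return 1;

	/* under the "madvise" sysfs default, only regions marked like this
	 * may be backed by transparent huge pages */
	if (madvise(buf, LEN, MADV_HUGEPAGE))
		perror("madvise(MADV_HUGEPAGE)");

	((char *)buf)[0] = 1;		/* first touch can fault in a huge page */

	free(buf);
	return 0;
}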
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index af7cfb43d2f0..8b1a477162dc 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -1,27 +1,24 @@
config DEBUG_PAGEALLOC
bool "Debug page memory allocations"
- depends on DEBUG_KERNEL && ARCH_SUPPORTS_DEBUG_PAGEALLOC
- depends on !HIBERNATION || !PPC && !SPARC
+ depends on DEBUG_KERNEL
+ depends on !HIBERNATION || ARCH_SUPPORTS_DEBUG_PAGEALLOC && !PPC && !SPARC
depends on !KMEMCHECK
+ select PAGE_POISONING if !ARCH_SUPPORTS_DEBUG_PAGEALLOC
---help---
Unmap pages from the kernel linear mapping after free_pages().
This results in a large slowdown, but helps to find certain types
of memory corruption.
+ For architectures which don't enable ARCH_SUPPORTS_DEBUG_PAGEALLOC,
+ fill the pages with poison patterns after free_pages() and verify
+ the patterns before alloc_pages(). Additionally,
+ this option cannot be enabled in combination with hibernation as
+ that would result in incorrect warnings of memory corruption after
+ a resume because free pages are not saved to the suspend image.
+
config WANT_PAGE_DEBUG_FLAGS
bool
config PAGE_POISONING
- bool "Debug page memory allocations"
- depends on DEBUG_KERNEL && !ARCH_SUPPORTS_DEBUG_PAGEALLOC
- depends on !HIBERNATION
- select DEBUG_PAGEALLOC
+ bool
select WANT_PAGE_DEBUG_FLAGS
- ---help---
- Fill the pages with poison patterns after free_pages() and verify
- the patterns before alloc_pages(). This results in a large slowdown,
- but helps to find certain types of memory corruption.
-
- This option cannot be enabled in combination with hibernation as
- that would result in incorrect warnings of memory corruption after
- a resume because free pages are not saved to the suspend image.
diff --git a/mm/Makefile b/mm/Makefile
index f73f75a29f82..42a8326c3e3d 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -5,9 +5,9 @@
mmu-y := nommu.o
mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \
mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \
- vmalloc.o pagewalk.o
+ vmalloc.o pagewalk.o pgtable-generic.o
-obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
+obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
maccess.o page_alloc.o page-writeback.o \
readahead.o swap.o truncate.o vmscan.o shmem.o \
prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
@@ -15,6 +15,12 @@ obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
$(mmu-y)
obj-y += init-mm.o
+ifdef CONFIG_NO_BOOTMEM
+ obj-y += nobootmem.o
+else
+ obj-y += bootmem.o
+endif
+
obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o
obj-$(CONFIG_BOUNCE) += bounce.o
@@ -37,6 +43,7 @@ obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
obj-$(CONFIG_FS_XIP) += filemap_xip.o
obj-$(CONFIG_MIGRATION) += migrate.o
obj-$(CONFIG_QUICKLIST) += quicklist.o
+obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o
obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o
obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 027100d30227..befc87531e4f 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -14,17 +14,11 @@
static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);
-void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
-{
-}
-EXPORT_SYMBOL(default_unplug_io_fn);
-
struct backing_dev_info default_backing_dev_info = {
.name = "default",
.ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE,
.state = 0,
.capabilities = BDI_CAP_MAP_COPY,
- .unplug_io_fn = default_unplug_io_fn,
};
EXPORT_SYMBOL_GPL(default_backing_dev_info);
@@ -73,14 +67,14 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
struct inode *inode;
nr_wb = nr_dirty = nr_io = nr_more_io = 0;
- spin_lock(&inode_lock);
+ spin_lock(&inode_wb_list_lock);
list_for_each_entry(inode, &wb->b_dirty, i_wb_list)
nr_dirty++;
list_for_each_entry(inode, &wb->b_io, i_wb_list)
nr_io++;
list_for_each_entry(inode, &wb->b_more_io, i_wb_list)
nr_more_io++;
- spin_unlock(&inode_lock);
+ spin_unlock(&inode_wb_list_lock);
global_dirty_limits(&background_thresh, &dirty_thresh);
bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
@@ -604,7 +598,7 @@ static void bdi_prune_sb(struct backing_dev_info *bdi)
spin_lock(&sb_lock);
list_for_each_entry(sb, &super_blocks, s_list) {
if (sb->s_bdi == bdi)
- sb->s_bdi = NULL;
+ sb->s_bdi = &default_backing_dev_info;
}
spin_unlock(&sb_lock);
}
@@ -682,11 +676,11 @@ void bdi_destroy(struct backing_dev_info *bdi)
if (bdi_has_dirty_io(bdi)) {
struct bdi_writeback *dst = &default_backing_dev_info.wb;
- spin_lock(&inode_lock);
+ spin_lock(&inode_wb_list_lock);
list_splice(&bdi->wb.b_dirty, &dst->b_dirty);
list_splice(&bdi->wb.b_io, &dst->b_io);
list_splice(&bdi->wb.b_more_io, &dst->b_more_io);
- spin_unlock(&inode_lock);
+ spin_unlock(&inode_wb_list_lock);
}
bdi_unregister(bdi);
@@ -793,7 +787,7 @@ EXPORT_SYMBOL(congestion_wait);
* jiffies for either a BDI to exit congestion of the given @sync queue
* or a write to complete.
*
- * In the absense of zone congestion, cond_resched() is called to yield
+ * In the absence of zone congestion, cond_resched() is called to yield
* the processor if necessary but otherwise does not sleep.
*
* The return value is 0 if the sleep is for the full timeout. Otherwise,
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 13b0caa9793c..01d5a4b3dd0c 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -23,19 +23,17 @@
#include "internal.h"
+#ifndef CONFIG_NEED_MULTIPLE_NODES
+struct pglist_data __refdata contig_page_data = {
+ .bdata = &bootmem_node_data[0]
+};
+EXPORT_SYMBOL(contig_page_data);
+#endif
+
unsigned long max_low_pfn;
unsigned long min_low_pfn;
unsigned long max_pfn;
-#ifdef CONFIG_CRASH_DUMP
-/*
- * If we have booted due to a crash, max_pfn will be a very low value. We need
- * to know the amount of memory that the previous kernel used.
- */
-unsigned long saved_max_pfn;
-#endif
-
-#ifndef CONFIG_NO_BOOTMEM
bootmem_data_t bootmem_node_data[MAX_NUMNODES] __initdata;
static struct list_head bdata_list __initdata = LIST_HEAD_INIT(bdata_list);
@@ -146,7 +144,7 @@ unsigned long __init init_bootmem(unsigned long start, unsigned long pages)
min_low_pfn = start;
return init_bootmem_core(NODE_DATA(0)->bdata, start, 0, pages);
}
-#endif
+
/*
* free_bootmem_late - free bootmem pages directly to page allocator
* @addr: starting address of the range
@@ -171,53 +169,6 @@ void __init free_bootmem_late(unsigned long addr, unsigned long size)
}
}
-#ifdef CONFIG_NO_BOOTMEM
-static void __init __free_pages_memory(unsigned long start, unsigned long end)
-{
- int i;
- unsigned long start_aligned, end_aligned;
- int order = ilog2(BITS_PER_LONG);
-
- start_aligned = (start + (BITS_PER_LONG - 1)) & ~(BITS_PER_LONG - 1);
- end_aligned = end & ~(BITS_PER_LONG - 1);
-
- if (end_aligned <= start_aligned) {
- for (i = start; i < end; i++)
- __free_pages_bootmem(pfn_to_page(i), 0);
-
- return;
- }
-
- for (i = start; i < start_aligned; i++)
- __free_pages_bootmem(pfn_to_page(i), 0);
-
- for (i = start_aligned; i < end_aligned; i += BITS_PER_LONG)
- __free_pages_bootmem(pfn_to_page(i), order);
-
- for (i = end_aligned; i < end; i++)
- __free_pages_bootmem(pfn_to_page(i), 0);
-}
-
-unsigned long __init free_all_memory_core_early(int nodeid)
-{
- int i;
- u64 start, end;
- unsigned long count = 0;
- struct range *range = NULL;
- int nr_range;
-
- nr_range = get_free_all_memory_range(&range, nodeid);
-
- for (i = 0; i < nr_range; i++) {
- start = range[i].start;
- end = range[i].end;
- count += end - start;
- __free_pages_memory(start, end);
- }
-
- return count;
-}
-#else
static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
{
int aligned;
@@ -278,7 +229,6 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
return count;
}
-#endif
/**
* free_all_bootmem_node - release a node's free pages to the buddy allocator
@@ -289,12 +239,7 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
{
register_page_bootmem_info_node(pgdat);
-#ifdef CONFIG_NO_BOOTMEM
- /* free_all_memory_core_early(MAX_NUMNODES) will be called later */
- return 0;
-#else
return free_all_bootmem_core(pgdat->bdata);
-#endif
}
/**
@@ -304,16 +249,6 @@ unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
*/
unsigned long __init free_all_bootmem(void)
{
-#ifdef CONFIG_NO_BOOTMEM
- /*
- * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id
- * because in some case like Node0 doesnt have RAM installed
- * low ram will be on Node1
- * Use MAX_NUMNODES will make sure all ranges in early_node_map[]
- * will be used instead of only Node0 related
- */
- return free_all_memory_core_early(MAX_NUMNODES);
-#else
unsigned long total_pages = 0;
bootmem_data_t *bdata;
@@ -321,10 +256,8 @@ unsigned long __init free_all_bootmem(void)
total_pages += free_all_bootmem_core(bdata);
return total_pages;
-#endif
}
-#ifndef CONFIG_NO_BOOTMEM
static void __init __free(bootmem_data_t *bdata,
unsigned long sidx, unsigned long eidx)
{
@@ -419,7 +352,6 @@ static int __init mark_bootmem(unsigned long start, unsigned long end,
}
BUG();
}
-#endif
/**
* free_bootmem_node - mark a page range as usable
@@ -434,10 +366,6 @@ static int __init mark_bootmem(unsigned long start, unsigned long end,
void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
unsigned long size)
{
-#ifdef CONFIG_NO_BOOTMEM
- kmemleak_free_part(__va(physaddr), size);
- memblock_x86_free_range(physaddr, physaddr + size);
-#else
unsigned long start, end;
kmemleak_free_part(__va(physaddr), size);
@@ -446,7 +374,6 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
end = PFN_DOWN(physaddr + size);
mark_bootmem_node(pgdat->bdata, start, end, 0, 0);
-#endif
}
/**
@@ -460,10 +387,6 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
*/
void __init free_bootmem(unsigned long addr, unsigned long size)
{
-#ifdef CONFIG_NO_BOOTMEM
- kmemleak_free_part(__va(addr), size);
- memblock_x86_free_range(addr, addr + size);
-#else
unsigned long start, end;
kmemleak_free_part(__va(addr), size);
@@ -472,7 +395,6 @@ void __init free_bootmem(unsigned long addr, unsigned long size)
end = PFN_DOWN(addr + size);
mark_bootmem(start, end, 0, 0);
-#endif
}
/**
@@ -489,17 +411,12 @@ void __init free_bootmem(unsigned long addr, unsigned long size)
int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
unsigned long size, int flags)
{
-#ifdef CONFIG_NO_BOOTMEM
- panic("no bootmem");
- return 0;
-#else
unsigned long start, end;
start = PFN_DOWN(physaddr);
end = PFN_UP(physaddr + size);
return mark_bootmem_node(pgdat->bdata, start, end, 1, flags);
-#endif
}
/**
@@ -515,20 +432,14 @@ int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
int __init reserve_bootmem(unsigned long addr, unsigned long size,
int flags)
{
-#ifdef CONFIG_NO_BOOTMEM
- panic("no bootmem");
- return 0;
-#else
unsigned long start, end;
start = PFN_DOWN(addr);
end = PFN_UP(addr + size);
return mark_bootmem(start, end, 1, flags);
-#endif
}
-#ifndef CONFIG_NO_BOOTMEM
int __weak __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
int flags)
{
@@ -685,33 +596,12 @@ static void * __init alloc_arch_preferred_bootmem(bootmem_data_t *bdata,
#endif
return NULL;
}
-#endif
static void * __init ___alloc_bootmem_nopanic(unsigned long size,
unsigned long align,
unsigned long goal,
unsigned long limit)
{
-#ifdef CONFIG_NO_BOOTMEM
- void *ptr;
-
- if (WARN_ON_ONCE(slab_is_available()))
- return kzalloc(size, GFP_NOWAIT);
-
-restart:
-
- ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, goal, limit);
-
- if (ptr)
- return ptr;
-
- if (goal != 0) {
- goal = 0;
- goto restart;
- }
-
- return NULL;
-#else
bootmem_data_t *bdata;
void *region;
@@ -737,7 +627,6 @@ restart:
}
return NULL;
-#endif
}
/**
@@ -758,10 +647,6 @@ void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align,
{
unsigned long limit = 0;
-#ifdef CONFIG_NO_BOOTMEM
- limit = -1UL;
-#endif
-
return ___alloc_bootmem_nopanic(size, align, goal, limit);
}
@@ -798,14 +683,9 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align,
{
unsigned long limit = 0;
-#ifdef CONFIG_NO_BOOTMEM
- limit = -1UL;
-#endif
-
return ___alloc_bootmem(size, align, goal, limit);
}
-#ifndef CONFIG_NO_BOOTMEM
static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata,
unsigned long size, unsigned long align,
unsigned long goal, unsigned long limit)
@@ -822,7 +702,6 @@ static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata,
return ___alloc_bootmem(size, align, goal, limit);
}
-#endif
/**
* __alloc_bootmem_node - allocate boot memory from a specific node
@@ -842,24 +721,10 @@ static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata,
void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
unsigned long align, unsigned long goal)
{
- void *ptr;
-
if (WARN_ON_ONCE(slab_is_available()))
return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
-#ifdef CONFIG_NO_BOOTMEM
- ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
- goal, -1ULL);
- if (ptr)
- return ptr;
-
- ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align,
- goal, -1ULL);
-#else
- ptr = ___alloc_bootmem_node(pgdat->bdata, size, align, goal, 0);
-#endif
-
- return ptr;
+ return ___alloc_bootmem_node(pgdat->bdata, size, align, goal, 0);
}
void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,
@@ -880,13 +745,8 @@ void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,
unsigned long new_goal;
new_goal = MAX_DMA32_PFN << PAGE_SHIFT;
-#ifdef CONFIG_NO_BOOTMEM
- ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
- new_goal, -1ULL);
-#else
ptr = alloc_bootmem_core(pgdat->bdata, size, align,
new_goal, 0);
-#endif
if (ptr)
return ptr;
}
@@ -907,16 +767,6 @@ void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,
void * __init alloc_bootmem_section(unsigned long size,
unsigned long section_nr)
{
-#ifdef CONFIG_NO_BOOTMEM
- unsigned long pfn, goal, limit;
-
- pfn = section_nr_to_pfn(section_nr);
- goal = pfn << PAGE_SHIFT;
- limit = section_nr_to_pfn(section_nr + 1) << PAGE_SHIFT;
-
- return __alloc_memory_core_early(early_pfn_to_nid(pfn), size,
- SMP_CACHE_BYTES, goal, limit);
-#else
bootmem_data_t *bdata;
unsigned long pfn, goal, limit;
@@ -926,7 +776,6 @@ void * __init alloc_bootmem_section(unsigned long size,
bdata = &bootmem_node_data[early_pfn_to_nid(pfn)];
return alloc_bootmem_core(bdata, size, SMP_CACHE_BYTES, goal, limit);
-#endif
}
#endif
@@ -938,16 +787,11 @@ void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
if (WARN_ON_ONCE(slab_is_available()))
return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
-#ifdef CONFIG_NO_BOOTMEM
- ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
- goal, -1ULL);
-#else
ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size, align, goal, 0);
if (ptr)
return ptr;
ptr = alloc_bootmem_core(pgdat->bdata, size, align, goal, 0);
-#endif
if (ptr)
return ptr;
@@ -995,21 +839,9 @@ void * __init __alloc_bootmem_low(unsigned long size, unsigned long align,
void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size,
unsigned long align, unsigned long goal)
{
- void *ptr;
-
if (WARN_ON_ONCE(slab_is_available()))
return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
-#ifdef CONFIG_NO_BOOTMEM
- ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
+ return ___alloc_bootmem_node(pgdat->bdata, size, align,
goal, ARCH_LOW_ADDRESS_LIMIT);
- if (ptr)
- return ptr;
- ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align,
- goal, ARCH_LOW_ADDRESS_LIMIT);
-#else
- ptr = ___alloc_bootmem_node(pgdat->bdata, size, align,
- goal, ARCH_LOW_ADDRESS_LIMIT);
-#endif
- return ptr;
}
diff --git a/mm/compaction.c b/mm/compaction.c
index 4d709ee59013..021a2960ef9e 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -16,6 +16,9 @@
#include <linux/sysfs.h>
#include "internal.h"
+#define CREATE_TRACE_POINTS
+#include <trace/events/compaction.h>
+
/*
* compact_control is used to track pages being migrated and the free pages
* they are being migrated to during memory compaction. The free_pfn starts
@@ -30,6 +33,7 @@ struct compact_control {
unsigned long nr_migratepages; /* Number of pages to migrate */
unsigned long free_pfn; /* isolate_freepages search base */
unsigned long migrate_pfn; /* isolate_migratepages search base */
+ bool sync; /* Synchronous migration */
/* Account for isolated anon and file pages */
unsigned long nr_anon;
@@ -60,7 +64,7 @@ static unsigned long isolate_freepages_block(struct zone *zone,
struct list_head *freelist)
{
unsigned long zone_end_pfn, end_pfn;
- int total_isolated = 0;
+ int nr_scanned = 0, total_isolated = 0;
struct page *cursor;
/* Get the last PFN we should scan for free pages at */
@@ -81,6 +85,7 @@ static unsigned long isolate_freepages_block(struct zone *zone,
if (!pfn_valid_within(blockpfn))
continue;
+ nr_scanned++;
if (!PageBuddy(page))
continue;
@@ -100,6 +105,7 @@ static unsigned long isolate_freepages_block(struct zone *zone,
}
}
+ trace_mm_compaction_isolate_freepages(nr_scanned, total_isolated);
return total_isolated;
}
@@ -147,7 +153,6 @@ static void isolate_freepages(struct zone *zone,
* pages on cc->migratepages. We stop searching if the migrate
* and free page scanners meet or enough free pages are isolated.
*/
- spin_lock_irqsave(&zone->lock, flags);
for (; pfn > low_pfn && cc->nr_migratepages > nr_freepages;
pfn -= pageblock_nr_pages) {
unsigned long isolated;
@@ -170,9 +175,19 @@ static void isolate_freepages(struct zone *zone,
if (!suitable_migration_target(page))
continue;
- /* Found a block suitable for isolating free pages from */
- isolated = isolate_freepages_block(zone, pfn, freelist);
- nr_freepages += isolated;
+ /*
+ * Found a block suitable for isolating free pages from. Now
+ * we disabled interrupts, double check things are ok and
+ * isolate the pages. This is to minimise the time IRQs
+ * are disabled
+ */
+ isolated = 0;
+ spin_lock_irqsave(&zone->lock, flags);
+ if (suitable_migration_target(page)) {
+ isolated = isolate_freepages_block(zone, pfn, freelist);
+ nr_freepages += isolated;
+ }
+ spin_unlock_irqrestore(&zone->lock, flags);
/*
* Record the highest PFN we isolated pages from. When next
@@ -182,7 +197,6 @@ static void isolate_freepages(struct zone *zone,
if (isolated)
high_pfn = max(high_pfn, pfn);
}
- spin_unlock_irqrestore(&zone->lock, flags);
/* split_free_page does not map the pages */
list_for_each_entry(page, freelist, lru) {
@@ -234,6 +248,8 @@ static unsigned long isolate_migratepages(struct zone *zone,
struct compact_control *cc)
{
unsigned long low_pfn, end_pfn;
+ unsigned long last_pageblock_nr = 0, pageblock_nr;
+ unsigned long nr_scanned = 0, nr_isolated = 0;
struct list_head *migratelist = &cc->migratepages;
/* Do not scan outside zone boundaries */
@@ -261,26 +277,74 @@ static unsigned long isolate_migratepages(struct zone *zone,
}
/* Time to isolate some pages for migration */
+ cond_resched();
spin_lock_irq(&zone->lru_lock);
for (; low_pfn < end_pfn; low_pfn++) {
struct page *page;
+ bool locked = true;
+
+ /* give a chance to irqs before checking need_resched() */
+ if (!((low_pfn+1) % SWAP_CLUSTER_MAX)) {
+ spin_unlock_irq(&zone->lru_lock);
+ locked = false;
+ }
+ if (need_resched() || spin_is_contended(&zone->lru_lock)) {
+ if (locked)
+ spin_unlock_irq(&zone->lru_lock);
+ cond_resched();
+ spin_lock_irq(&zone->lru_lock);
+ if (fatal_signal_pending(current))
+ break;
+ } else if (!locked)
+ spin_lock_irq(&zone->lru_lock);
+
if (!pfn_valid_within(low_pfn))
continue;
+ nr_scanned++;
/* Get the page and skip if free */
page = pfn_to_page(low_pfn);
if (PageBuddy(page))
continue;
+ /*
+ * For async migration, also only scan in MOVABLE blocks. Async
+ * migration is optimistic to see if the minimum amount of work
+ * satisfies the allocation
+ */
+ pageblock_nr = low_pfn >> pageblock_order;
+ if (!cc->sync && last_pageblock_nr != pageblock_nr &&
+ get_pageblock_migratetype(page) != MIGRATE_MOVABLE) {
+ low_pfn += pageblock_nr_pages;
+ low_pfn = ALIGN(low_pfn, pageblock_nr_pages) - 1;
+ last_pageblock_nr = pageblock_nr;
+ continue;
+ }
+
+ if (!PageLRU(page))
+ continue;
+
+ /*
+ * PageLRU is set, and lru_lock excludes isolation,
+ * splitting and collapsing (collapsing has already
+ * happened if PageLRU is set).
+ */
+ if (PageTransHuge(page)) {
+ low_pfn += (1 << compound_order(page)) - 1;
+ continue;
+ }
+
/* Try isolate the page */
if (__isolate_lru_page(page, ISOLATE_BOTH, 0) != 0)
continue;
+ VM_BUG_ON(PageTransCompound(page));
+
/* Successfully isolated */
del_page_from_lru_list(zone, page, page_lru(page));
list_add(&page->lru, migratelist);
- mem_cgroup_del_lru(page);
cc->nr_migratepages++;
+ nr_isolated++;
/* Avoid isolating too much */
if (cc->nr_migratepages == COMPACT_CLUSTER_MAX)
@@ -292,6 +356,8 @@ static unsigned long isolate_migratepages(struct zone *zone,
spin_unlock_irq(&zone->lru_lock);
cc->migrate_pfn = low_pfn;
+ trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated);
+
return cc->nr_migratepages;
}
@@ -342,10 +408,10 @@ static void update_nr_listpages(struct compact_control *cc)
}
static int compact_finished(struct zone *zone,
- struct compact_control *cc)
+ struct compact_control *cc)
{
unsigned int order;
- unsigned long watermark = low_wmark_pages(zone) + (1 << cc->order);
+ unsigned long watermark;
if (fatal_signal_pending(current))
return COMPACT_PARTIAL;
@@ -355,9 +421,16 @@ static int compact_finished(struct zone *zone,
return COMPACT_COMPLETE;
/* Compaction run is not finished if the watermark is not met */
+ watermark = low_wmark_pages(zone);
+ watermark += (1 << cc->order);
+
if (!zone_watermark_ok(zone, cc->order, watermark, 0, 0))
return COMPACT_CONTINUE;
+ /*
+ * order == -1 is expected when compacting via
+ * /proc/sys/vm/compact_memory
+ */
if (cc->order == -1)
return COMPACT_CONTINUE;
@@ -375,10 +448,69 @@ static int compact_finished(struct zone *zone,
return COMPACT_CONTINUE;
}
+/*
+ * compaction_suitable: Is this suitable to run compaction on this zone now?
+ * Returns
+ * COMPACT_SKIPPED - If there are too few free pages for compaction
+ * COMPACT_PARTIAL - If the allocation would succeed without compaction
+ * COMPACT_CONTINUE - If compaction should run now
+ */
+unsigned long compaction_suitable(struct zone *zone, int order)
+{
+ int fragindex;
+ unsigned long watermark;
+
+ /*
+ * Watermarks for order-0 must be met for compaction. Note the 2UL.
+ * This is because during migration, copies of pages need to be
+ * allocated and for a short time, the footprint is higher
+ */
+ watermark = low_wmark_pages(zone) + (2UL << order);
+ if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
+ return COMPACT_SKIPPED;
+
+ /*
+ * order == -1 is expected when compacting via
+ * /proc/sys/vm/compact_memory
+ */
+ if (order == -1)
+ return COMPACT_CONTINUE;
+
+ /*
+ * fragmentation index determines if allocation failures are due to
+ * low memory or external fragmentation
+ *
+ * index of -1 implies allocations might succeed depending on watermarks
+ * index towards 0 implies failure is due to lack of memory
+ * index towards 1000 implies failure is due to fragmentation
+ *
+ * Only compact if a failure would be due to fragmentation.
+ */
+ fragindex = fragmentation_index(zone, order);
+ if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold)
+ return COMPACT_SKIPPED;
+
+ if (fragindex == -1 && zone_watermark_ok(zone, order, watermark, 0, 0))
+ return COMPACT_PARTIAL;
+
+ return COMPACT_CONTINUE;
+}
+
static int compact_zone(struct zone *zone, struct compact_control *cc)
{
int ret;
+ ret = compaction_suitable(zone, cc->order);
+ switch (ret) {
+ case COMPACT_PARTIAL:
+ case COMPACT_SKIPPED:
+ /* Compaction is likely to fail */
+ return ret;
+ case COMPACT_CONTINUE:
+ /* Fall through to compaction */
+ ;
+ }
+
/* Setup to move all movable pages to the end of the zone */
cc->migrate_pfn = zone->zone_start_pfn;
cc->free_pfn = cc->migrate_pfn + zone->spanned_pages;
@@ -388,13 +520,15 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) {
unsigned long nr_migrate, nr_remaining;
+ int err;
if (!isolate_migratepages(zone, cc))
continue;
nr_migrate = cc->nr_migratepages;
- migrate_pages(&cc->migratepages, compaction_alloc,
- (unsigned long)cc, 0);
+ err = migrate_pages(&cc->migratepages, compaction_alloc,
+ (unsigned long)cc, false,
+ cc->sync);
update_nr_listpages(cc);
nr_remaining = cc->nr_migratepages;
@@ -402,9 +536,11 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
count_vm_events(COMPACTPAGES, nr_migrate - nr_remaining);
if (nr_remaining)
count_vm_events(COMPACTPAGEFAILED, nr_remaining);
+ trace_mm_compaction_migratepages(nr_migrate - nr_remaining,
+ nr_remaining);
/* Release LRU pages not migrated */
- if (!list_empty(&cc->migratepages)) {
+ if (err) {
putback_lru_pages(&cc->migratepages);
cc->nr_migratepages = 0;
}
@@ -418,8 +554,9 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
return ret;
}
-static unsigned long compact_zone_order(struct zone *zone,
- int order, gfp_t gfp_mask)
+unsigned long compact_zone_order(struct zone *zone,
+ int order, gfp_t gfp_mask,
+ bool sync)
{
struct compact_control cc = {
.nr_freepages = 0,
@@ -427,6 +564,7 @@ static unsigned long compact_zone_order(struct zone *zone,
.order = order,
.migratetype = allocflags_to_migratetype(gfp_mask),
.zone = zone,
+ .sync = sync,
};
INIT_LIST_HEAD(&cc.freepages);
INIT_LIST_HEAD(&cc.migratepages);
@@ -442,16 +580,17 @@ int sysctl_extfrag_threshold = 500;
* @order: The order of the current allocation
* @gfp_mask: The GFP mask of the current allocation
* @nodemask: The allowed nodes to allocate from
+ * @sync: Whether migration is synchronous or not
*
* This is the main entry point for direct page compaction.
*/
unsigned long try_to_compact_pages(struct zonelist *zonelist,
- int order, gfp_t gfp_mask, nodemask_t *nodemask)
+ int order, gfp_t gfp_mask, nodemask_t *nodemask,
+ bool sync)
{
enum zone_type high_zoneidx = gfp_zone(gfp_mask);
int may_enter_fs = gfp_mask & __GFP_FS;
int may_perform_io = gfp_mask & __GFP_IO;
- unsigned long watermark;
struct zoneref *z;
struct zone *zone;
int rc = COMPACT_SKIPPED;
@@ -461,7 +600,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
* made because an assumption is made that the page allocator can satisfy
* the "cheaper" orders without taking special steps
*/
- if (order <= PAGE_ALLOC_COSTLY_ORDER || !may_enter_fs || !may_perform_io)
+ if (!order || !may_enter_fs || !may_perform_io)
return rc;
count_vm_event(COMPACTSTALL);
@@ -469,43 +608,13 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
/* Compact each zone in the list */
for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx,
nodemask) {
- int fragindex;
int status;
- /*
- * Watermarks for order-0 must be met for compaction. Note
- * the 2UL. This is because during migration, copies of
- * pages need to be allocated and for a short time, the
- * footprint is higher
- */
- watermark = low_wmark_pages(zone) + (2UL << order);
- if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
- continue;
-
- /*
- * fragmentation index determines if allocation failures are
- * due to low memory or external fragmentation
- *
- * index of -1 implies allocations might succeed depending
- * on watermarks
- * index towards 0 implies failure is due to lack of memory
- * index towards 1000 implies failure is due to fragmentation
- *
- * Only compact if a failure would be due to fragmentation.
- */
- fragindex = fragmentation_index(zone, order);
- if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold)
- continue;
-
- if (fragindex == -1 && zone_watermark_ok(zone, order, watermark, 0, 0)) {
- rc = COMPACT_PARTIAL;
- break;
- }
-
- status = compact_zone_order(zone, order, gfp_mask);
+ status = compact_zone_order(zone, order, gfp_mask, sync);
rc = max(status, rc);
- if (zone_watermark_ok(zone, order, watermark, 0, 0))
+ /* If a normal allocation would succeed, stop compacting */
+ if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0))
break;
}
diff --git a/mm/dmapool.c b/mm/dmapool.c
index 4df2de77e069..03bf3bb4519a 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -324,7 +324,7 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
if (mem_flags & __GFP_WAIT) {
DECLARE_WAITQUEUE(wait, current);
- __set_current_state(TASK_INTERRUPTIBLE);
+ __set_current_state(TASK_UNINTERRUPTIBLE);
__add_wait_queue(&pool->waitq, &wait);
spin_unlock_irqrestore(&pool->lock, flags);
@@ -355,20 +355,15 @@ EXPORT_SYMBOL(dma_pool_alloc);
static struct dma_page *pool_find_page(struct dma_pool *pool, dma_addr_t dma)
{
- unsigned long flags;
struct dma_page *page;
- spin_lock_irqsave(&pool->lock, flags);
list_for_each_entry(page, &pool->page_list, page_list) {
if (dma < page->dma)
continue;
if (dma < (page->dma + pool->allocation))
- goto done;
+ return page;
}
- page = NULL;
- done:
- spin_unlock_irqrestore(&pool->lock, flags);
- return page;
+ return NULL;
}
/**
@@ -386,8 +381,10 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma)
unsigned long flags;
unsigned int offset;
+ spin_lock_irqsave(&pool->lock, flags);
page = pool_find_page(pool, dma);
if (!page) {
+ spin_unlock_irqrestore(&pool->lock, flags);
if (pool->dev)
dev_err(pool->dev,
"dma_pool_free %s, %p/%lx (bad dma)\n",
@@ -401,6 +398,7 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma)
offset = vaddr - page->vaddr;
#ifdef DMAPOOL_DEBUG
if ((dma - page->dma) != offset) {
+ spin_unlock_irqrestore(&pool->lock, flags);
if (pool->dev)
dev_err(pool->dev,
"dma_pool_free %s, %p (bad vaddr)/%Lx\n",
@@ -418,6 +416,7 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma)
chain = *(int *)(page->vaddr + chain);
continue;
}
+ spin_unlock_irqrestore(&pool->lock, flags);
if (pool->dev)
dev_err(pool->dev, "dma_pool_free %s, dma %Lx "
"already free\n", pool->name,
@@ -432,7 +431,6 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma)
memset(vaddr, POOL_POISON_FREED, pool->size);
#endif
- spin_lock_irqsave(&pool->lock, flags);
page->in_use--;
*(int *)vaddr = page->offset;
page->offset = offset;
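Taking pool->lock in dma_pool_free() rather than inside pool_find_page() keeps the page lookup and the error paths consistent under concurrent frees. A short driver-side sketch of the dmapool API involved, with a hypothetical device pointer and buffer size that are not part of this patch:

#include <linux/dmapool.h>
#include <linux/device.h>
#include <linux/errno.h>
#include <linux/gfp.h>

/* illustrative only: allocate and free one 64-byte, 64-byte-aligned
 * DMA buffer from a pool tied to "dev" */
static int example_dma_pool_use(struct device *dev)
{
	struct dma_pool *pool;
	dma_addr_t handle;
	void *vaddr;

	pool = dma_pool_create("example", dev, 64, 64, 0);
	if (!pool)
		return -ENOMEM;

	vaddr = dma_pool_alloc(pool, GFP_KERNEL, &handle);
	if (!vaddr) {
		dma_pool_destroy(pool);
		return -ENOMEM;
	}

	/* ... program the device with "handle", access the buffer via "vaddr" ... */

	dma_pool_free(pool, vaddr, handle);	/* page lookup now happens under pool->lock */
	dma_pool_destroy(pool);
	return 0;
}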
diff --git a/mm/filemap.c b/mm/filemap.c
index ea89840fc65f..c641edf553a9 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -80,8 +80,8 @@
* ->i_mutex
* ->i_alloc_sem (various)
*
- * ->inode_lock
- * ->sb_lock (fs/fs-writeback.c)
+ * inode_wb_list_lock
+ * sb_lock (fs/fs-writeback.c)
* ->mapping->tree_lock (__sync_single_inode)
*
* ->i_mmap_lock
@@ -98,24 +98,23 @@
* ->zone.lru_lock (check_pte_range->isolate_lru_page)
* ->private_lock (page_remove_rmap->set_page_dirty)
* ->tree_lock (page_remove_rmap->set_page_dirty)
- * ->inode_lock (page_remove_rmap->set_page_dirty)
- * ->inode_lock (zap_pte_range->set_page_dirty)
+ * inode_wb_list_lock (page_remove_rmap->set_page_dirty)
+ * ->inode->i_lock (page_remove_rmap->set_page_dirty)
+ * inode_wb_list_lock (zap_pte_range->set_page_dirty)
+ * ->inode->i_lock (zap_pte_range->set_page_dirty)
* ->private_lock (zap_pte_range->__set_page_dirty_buffers)
*
- * ->task->proc_lock
- * ->dcache_lock (proc_pid_lookup)
- *
* (code doesn't rely on that order, so you could switch it around)
* ->tasklist_lock (memory_failure, collect_procs_ao)
* ->i_mmap_lock
*/
/*
- * Remove a page from the page cache and free it. Caller has to make
+ * Delete a page from the page cache and free it. Caller has to make
* sure the page is locked and that nobody else uses it - or that usage
* is safe. The caller must hold the mapping's tree_lock.
*/
-void __remove_from_page_cache(struct page *page)
+void __delete_from_page_cache(struct page *page)
{
struct address_space *mapping = page->mapping;
@@ -140,58 +139,42 @@ void __remove_from_page_cache(struct page *page)
}
}
-void remove_from_page_cache(struct page *page)
+/**
+ * delete_from_page_cache - delete page from page cache
+ * @page: the page which the kernel is trying to remove from page cache
+ *
+ * This must be called only on pages that have been verified to be in the page
+ * cache and locked. It will never put the page into the free list, the caller
+ * has a reference on the page.
+ */
+void delete_from_page_cache(struct page *page)
{
struct address_space *mapping = page->mapping;
+ void (*freepage)(struct page *);
BUG_ON(!PageLocked(page));
+ freepage = mapping->a_ops->freepage;
spin_lock_irq(&mapping->tree_lock);
- __remove_from_page_cache(page);
+ __delete_from_page_cache(page);
spin_unlock_irq(&mapping->tree_lock);
mem_cgroup_uncharge_cache_page(page);
+
+ if (freepage)
+ freepage(page);
+ page_cache_release(page);
}
-EXPORT_SYMBOL(remove_from_page_cache);
+EXPORT_SYMBOL(delete_from_page_cache);
-static int sync_page(void *word)
+static int sleep_on_page(void *word)
{
- struct address_space *mapping;
- struct page *page;
-
- page = container_of((unsigned long *)word, struct page, flags);
-
- /*
- * page_mapping() is being called without PG_locked held.
- * Some knowledge of the state and use of the page is used to
- * reduce the requirements down to a memory barrier.
- * The danger here is of a stale page_mapping() return value
- * indicating a struct address_space different from the one it's
- * associated with when it is associated with one.
- * After smp_mb(), it's either the correct page_mapping() for
- * the page, or an old page_mapping() and the page's own
- * page_mapping() has gone NULL.
- * The ->sync_page() address_space operation must tolerate
- * page_mapping() going NULL. By an amazing coincidence,
- * this comes about because none of the users of the page
- * in the ->sync_page() methods make essential use of the
- * page_mapping(), merely passing the page down to the backing
- * device's unplug functions when it's non-NULL, which in turn
- * ignore it for all cases but swap, where only page_private(page) is
- * of interest. When page_mapping() does go NULL, the entire
- * call stack gracefully ignores the page and returns.
- * -- wli
- */
- smp_mb();
- mapping = page_mapping(page);
- if (mapping && mapping->a_ops && mapping->a_ops->sync_page)
- mapping->a_ops->sync_page(page);
io_schedule();
return 0;
}
-static int sync_page_killable(void *word)
+static int sleep_on_page_killable(void *word)
{
- sync_page(word);
+ sleep_on_page(word);
return fatal_signal_pending(current) ? -EINTR : 0;
}
@@ -296,7 +279,7 @@ int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte,
continue;
wait_on_page_writeback(page);
- if (PageError(page))
+ if (TestClearPageError(page))
ret = -EIO;
}
pagevec_release(&pvec);
@@ -385,6 +368,76 @@ int filemap_write_and_wait_range(struct address_space *mapping,
EXPORT_SYMBOL(filemap_write_and_wait_range);
/**
+ * replace_page_cache_page - replace a pagecache page with a new one
+ * @old: page to be replaced
+ * @new: page to replace with
+ * @gfp_mask: allocation mode
+ *
+ * This function replaces a page in the pagecache with a new one. On
+ * success it acquires the pagecache reference for the new page and
+ * drops it for the old page. Both the old and new pages must be
+ * locked. This function does not add the new page to the LRU, the
+ * caller must do that.
+ *
+ * The remove + add is atomic. The only way this function can fail is
+ * memory allocation failure.
+ */
+int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
+{
+ int error;
+ struct mem_cgroup *memcg = NULL;
+
+ VM_BUG_ON(!PageLocked(old));
+ VM_BUG_ON(!PageLocked(new));
+ VM_BUG_ON(new->mapping);
+
+ /*
+ * This is not page migration, but prepare_migration and
+ * end_migration does enough work for charge replacement.
+ *
+ * In the longer term we probably want a specialized function
+ * for moving the charge from old to new in a more efficient
+ * manner.
+ */
+ error = mem_cgroup_prepare_migration(old, new, &memcg, gfp_mask);
+ if (error)
+ return error;
+
+ error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
+ if (!error) {
+ struct address_space *mapping = old->mapping;
+ void (*freepage)(struct page *);
+
+ pgoff_t offset = old->index;
+ freepage = mapping->a_ops->freepage;
+
+ page_cache_get(new);
+ new->mapping = mapping;
+ new->index = offset;
+
+ spin_lock_irq(&mapping->tree_lock);
+ __delete_from_page_cache(old);
+ error = radix_tree_insert(&mapping->page_tree, offset, new);
+ BUG_ON(error);
+ mapping->nrpages++;
+ __inc_zone_page_state(new, NR_FILE_PAGES);
+ if (PageSwapBacked(new))
+ __inc_zone_page_state(new, NR_SHMEM);
+ spin_unlock_irq(&mapping->tree_lock);
+ radix_tree_preload_end();
+ if (freepage)
+ freepage(old);
+ page_cache_release(old);
+ mem_cgroup_end_migration(memcg, old, new, true);
+ } else {
+ mem_cgroup_end_migration(memcg, old, new, false);
+ }
+
+ return error;
+}
+EXPORT_SYMBOL_GPL(replace_page_cache_page);
+
+/**
* add_to_page_cache_locked - add a locked page to the pagecache
* @page: page to add
* @mapping: the page's address_space
@@ -477,12 +530,6 @@ struct page *__page_cache_alloc(gfp_t gfp)
EXPORT_SYMBOL(__page_cache_alloc);
#endif
-static int __sleep_on_page_lock(void *word)
-{
- io_schedule();
- return 0;
-}
-
/*
* In order to wait for pages to become available there must be
* waitqueues associated with pages. By using a hash table of
@@ -510,7 +557,7 @@ void wait_on_page_bit(struct page *page, int bit_nr)
DEFINE_WAIT_BIT(wait, &page->flags, bit_nr);
if (test_bit(bit_nr, &page->flags))
- __wait_on_bit(page_waitqueue(page), &wait, sync_page,
+ __wait_on_bit(page_waitqueue(page), &wait, sleep_on_page,
TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(wait_on_page_bit);
@@ -574,17 +621,12 @@ EXPORT_SYMBOL(end_page_writeback);
/**
* __lock_page - get a lock on the page, assuming we need to sleep to get it
* @page: the page to lock
- *
- * Ugly. Running sync_page() in state TASK_UNINTERRUPTIBLE is scary. If some
- * random driver's requestfn sets TASK_RUNNING, we could busywait. However
- * chances are that on the second loop, the block layer's plug list is empty,
- * so sync_page() will then return in state TASK_UNINTERRUPTIBLE.
*/
void __lock_page(struct page *page)
{
DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
- __wait_on_bit_lock(page_waitqueue(page), &wait, sync_page,
+ __wait_on_bit_lock(page_waitqueue(page), &wait, sleep_on_page,
TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(__lock_page);
@@ -594,24 +636,10 @@ int __lock_page_killable(struct page *page)
DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
return __wait_on_bit_lock(page_waitqueue(page), &wait,
- sync_page_killable, TASK_KILLABLE);
+ sleep_on_page_killable, TASK_KILLABLE);
}
EXPORT_SYMBOL_GPL(__lock_page_killable);
-/**
- * __lock_page_nosync - get a lock on the page, without calling sync_page()
- * @page: the page to lock
- *
- * Variant of lock_page that does not require the caller to hold a reference
- * on the page's mapping.
- */
-void __lock_page_nosync(struct page *page)
-{
- DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
- __wait_on_bit_lock(page_waitqueue(page), &wait, __sleep_on_page_lock,
- TASK_UNINTERRUPTIBLE);
-}
-
int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
unsigned int flags)
{
@@ -619,8 +647,10 @@ int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
__lock_page(page);
return 1;
} else {
- up_read(&mm->mmap_sem);
- wait_on_page_locked(page);
+ if (!(flags & FAULT_FLAG_RETRY_NOWAIT)) {
+ up_read(&mm->mmap_sem);
+ wait_on_page_locked(page);
+ }
return 0;
}
}
@@ -780,9 +810,13 @@ repeat:
page = radix_tree_deref_slot((void **)pages[i]);
if (unlikely(!page))
continue;
+
+ /*
+ * This can only trigger when the entry at index 0 moves out
+ * of or back to the root: none yet gotten, safe to restart.
+ */
if (radix_tree_deref_retry(page)) {
- if (ret)
- start = pages[ret-1]->index;
+ WARN_ON(start | i);
goto restart;
}
@@ -798,6 +832,13 @@ repeat:
pages[ret] = page;
ret++;
}
+
+ /*
+ * If all entries were removed before we could secure them,
+ * try again, because callers stop trying once 0 is returned.
+ */
+ if (unlikely(!ret && nr_found))
+ goto restart;
rcu_read_unlock();
return ret;
}
@@ -832,12 +873,14 @@ repeat:
page = radix_tree_deref_slot((void **)pages[i]);
if (unlikely(!page))
continue;
+
+ /*
+ * This can only trigger when the entry at index 0 moves out
+ * of or back to the root: none yet gotten, safe to restart.
+ */
if (radix_tree_deref_retry(page))
goto restart;
- if (page->mapping == NULL || page->index != index)
- break;
-
if (!page_cache_get_speculative(page))
goto repeat;
@@ -847,6 +890,16 @@ repeat:
goto repeat;
}
+ /*
+ * must check mapping and index after taking the ref.
+ * otherwise we can get both false positives and false
+ * negatives, which is just confusing to the caller.
+ */
+ if (page->mapping == NULL || page->index != index) {
+ page_cache_release(page);
+ break;
+ }
+
pages[ret] = page;
ret++;
index++;
@@ -885,6 +938,11 @@ repeat:
page = radix_tree_deref_slot((void **)pages[i]);
if (unlikely(!page))
continue;
+
+ /*
+ * This can only trigger when the entry at index 0 moves out
+ * of or back to the root: none yet gotten, safe to restart.
+ */
if (radix_tree_deref_retry(page))
goto restart;
@@ -900,6 +958,13 @@ repeat:
pages[ret] = page;
ret++;
}
+
+ /*
+ * If all entries were removed before we could secure them,
+ * try again, because callers stop trying once 0 is returned.
+ */
+ if (unlikely(!ret && nr_found))
+ goto restart;
rcu_read_unlock();
if (ret)
@@ -1289,12 +1354,15 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
unsigned long seg = 0;
size_t count;
loff_t *ppos = &iocb->ki_pos;
+ struct blk_plug plug;
count = 0;
retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE);
if (retval)
return retval;
+ blk_start_plug(&plug);
+
/* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
if (filp->f_flags & O_DIRECT) {
loff_t size;
@@ -1367,6 +1435,7 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
break;
}
out:
+ blk_finish_plug(&plug);
return retval;
}
EXPORT_SYMBOL(generic_file_aio_read);
@@ -2218,7 +2287,7 @@ struct page *grab_cache_page_write_begin(struct address_space *mapping,
gfp_notmask = __GFP_FS;
repeat:
page = find_lock_page(mapping, index);
- if (likely(page))
+ if (page)
return page;
page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~gfp_notmask);
@@ -2478,11 +2547,13 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
+ struct blk_plug plug;
ssize_t ret;
BUG_ON(iocb->ki_pos != pos);
mutex_lock(&inode->i_mutex);
+ blk_start_plug(&plug);
ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
mutex_unlock(&inode->i_mutex);
@@ -2493,6 +2564,7 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
if (err < 0 && ret > 0)
ret = err;
}
+ blk_finish_plug(&plug);
return ret;
}
EXPORT_SYMBOL(generic_file_aio_write);
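replace_page_cache_page() above leaves LRU handling to its caller. A hedged sketch of how a filesystem-side caller might use it, with a hypothetical helper name and GFP_KERNEL assumed rather than taken from this patch:

#include <linux/pagemap.h>
#include <linux/swap.h>

/* illustrative only: swap "oldpage" (already in the page cache) for a
 * freshly allocated "newpage" while keeping the same mapping slot */
static int example_replace_page(struct page *oldpage, struct page *newpage)
{
	int err;

	/* both pages must be locked across the replacement */
	lock_page(oldpage);
	lock_page(newpage);

	err = replace_page_cache_page(oldpage, newpage, GFP_KERNEL);
	if (!err)
		lru_cache_add_file(newpage);	/* the caller must add the new page to the LRU */

	unlock_page(newpage);
	unlock_page(oldpage);
	return err;
}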
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
new file mode 100644
index 000000000000..83326ad66d9b
--- /dev/null
+++ b/mm/huge_memory.c
@@ -0,0 +1,2393 @@
+/*
+ * Copyright (C) 2009 Red Hat, Inc.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ */
+
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/highmem.h>
+#include <linux/hugetlb.h>
+#include <linux/mmu_notifier.h>
+#include <linux/rmap.h>
+#include <linux/swap.h>
+#include <linux/mm_inline.h>
+#include <linux/kthread.h>
+#include <linux/khugepaged.h>
+#include <linux/freezer.h>
+#include <linux/mman.h>
+#include <asm/tlb.h>
+#include <asm/pgalloc.h>
+#include "internal.h"
+
+/*
+ * By default transparent hugepage support is enabled for all mappings
+ * and khugepaged scans all mappings. Defrag is only invoked by
+ * khugepaged hugepage allocations and by page faults inside
+ * MADV_HUGEPAGE regions to avoid the risk of slowing down short lived
+ * allocations.
+ */
+unsigned long transparent_hugepage_flags __read_mostly =
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS
+ (1<<TRANSPARENT_HUGEPAGE_FLAG)|
+#endif
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE
+ (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)|
+#endif
+ (1<<TRANSPARENT_HUGEPAGE_DEFRAG_FLAG)|
+ (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
+
+/* default scan 8*512 pte (or vmas) every 30 second */
+static unsigned int khugepaged_pages_to_scan __read_mostly = HPAGE_PMD_NR*8;
+static unsigned int khugepaged_pages_collapsed;
+static unsigned int khugepaged_full_scans;
+static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000;
+/* during fragmentation poll the hugepage allocator once every minute */
+static unsigned int khugepaged_alloc_sleep_millisecs __read_mostly = 60000;
+static struct task_struct *khugepaged_thread __read_mostly;
+static DEFINE_MUTEX(khugepaged_mutex);
+static DEFINE_SPINLOCK(khugepaged_mm_lock);
+static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
+/*
+ * default collapse hugepages if there is at least one pte mapped like
+ * it would have happened if the vma was large enough during page
+ * fault.
+ */
+static unsigned int khugepaged_max_ptes_none __read_mostly = HPAGE_PMD_NR-1;
+
+static int khugepaged(void *none);
+static int mm_slots_hash_init(void);
+static int khugepaged_slab_init(void);
+static void khugepaged_slab_free(void);
+
+#define MM_SLOTS_HASH_HEADS 1024
+static struct hlist_head *mm_slots_hash __read_mostly;
+static struct kmem_cache *mm_slot_cache __read_mostly;
+
+/**
+ * struct mm_slot - hash lookup from mm to mm_slot
+ * @hash: hash collision list
+ * @mm_node: khugepaged scan list headed in khugepaged_scan.mm_head
+ * @mm: the mm that this information is valid for
+ */
+struct mm_slot {
+ struct hlist_node hash;
+ struct list_head mm_node;
+ struct mm_struct *mm;
+};
+
+/**
+ * struct khugepaged_scan - cursor for scanning
+ * @mm_head: the head of the mm list to scan
+ * @mm_slot: the current mm_slot we are scanning
+ * @address: the next address inside that to be scanned
+ *
+ * There is only the one khugepaged_scan instance of this cursor structure.
+ */
+struct khugepaged_scan {
+ struct list_head mm_head;
+ struct mm_slot *mm_slot;
+ unsigned long address;
+} khugepaged_scan = {
+ .mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head),
+};
+
+
+static int set_recommended_min_free_kbytes(void)
+{
+ struct zone *zone;
+ int nr_zones = 0;
+ unsigned long recommended_min;
+ extern int min_free_kbytes;
+
+ if (!test_bit(TRANSPARENT_HUGEPAGE_FLAG,
+ &transparent_hugepage_flags) &&
+ !test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
+ &transparent_hugepage_flags))
+ return 0;
+
+ for_each_populated_zone(zone)
+ nr_zones++;
+
+ /* Make sure at least 2 hugepages are free for MIGRATE_RESERVE */
+ recommended_min = pageblock_nr_pages * nr_zones * 2;
+
+ /*
+ * Make sure that on average at least two pageblocks are almost free
+ * of another type, one for a migratetype to fall back to and a
+ * second to avoid subsequent fallbacks of other types There are 3
+ * MIGRATE_TYPES we care about.
+ */
+ recommended_min += pageblock_nr_pages * nr_zones *
+ MIGRATE_PCPTYPES * MIGRATE_PCPTYPES;
+
+ /* don't ever allow to reserve more than 5% of the lowmem */
+ recommended_min = min(recommended_min,
+ (unsigned long) nr_free_buffer_pages() / 20);
+ recommended_min <<= (PAGE_SHIFT-10);
+
+ if (recommended_min > min_free_kbytes)
+ min_free_kbytes = recommended_min;
+ setup_per_zone_wmarks();
+ return 0;
+}
+late_initcall(set_recommended_min_free_kbytes);
+
+static int start_khugepaged(void)
+{
+ int err = 0;
+ if (khugepaged_enabled()) {
+ int wakeup;
+ if (unlikely(!mm_slot_cache || !mm_slots_hash)) {
+ err = -ENOMEM;
+ goto out;
+ }
+ mutex_lock(&khugepaged_mutex);
+ if (!khugepaged_thread)
+ khugepaged_thread = kthread_run(khugepaged, NULL,
+ "khugepaged");
+ if (unlikely(IS_ERR(khugepaged_thread))) {
+ printk(KERN_ERR
+ "khugepaged: kthread_run(khugepaged) failed\n");
+ err = PTR_ERR(khugepaged_thread);
+ khugepaged_thread = NULL;
+ }
+ wakeup = !list_empty(&khugepaged_scan.mm_head);
+ mutex_unlock(&khugepaged_mutex);
+ if (wakeup)
+ wake_up_interruptible(&khugepaged_wait);
+
+ set_recommended_min_free_kbytes();
+ } else
+ /* wakeup to exit */
+ wake_up_interruptible(&khugepaged_wait);
+out:
+ return err;
+}
+
+#ifdef CONFIG_SYSFS
+
+static ssize_t double_flag_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf,
+ enum transparent_hugepage_flag enabled,
+ enum transparent_hugepage_flag req_madv)
+{
+ if (test_bit(enabled, &transparent_hugepage_flags)) {
+ VM_BUG_ON(test_bit(req_madv, &transparent_hugepage_flags));
+ return sprintf(buf, "[always] madvise never\n");
+ } else if (test_bit(req_madv, &transparent_hugepage_flags))
+ return sprintf(buf, "always [madvise] never\n");
+ else
+ return sprintf(buf, "always madvise [never]\n");
+}
+static ssize_t double_flag_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count,
+ enum transparent_hugepage_flag enabled,
+ enum transparent_hugepage_flag req_madv)
+{
+ if (!memcmp("always", buf,
+ min(sizeof("always")-1, count))) {
+ set_bit(enabled, &transparent_hugepage_flags);
+ clear_bit(req_madv, &transparent_hugepage_flags);
+ } else if (!memcmp("madvise", buf,
+ min(sizeof("madvise")-1, count))) {
+ clear_bit(enabled, &transparent_hugepage_flags);
+ set_bit(req_madv, &transparent_hugepage_flags);
+ } else if (!memcmp("never", buf,
+ min(sizeof("never")-1, count))) {
+ clear_bit(enabled, &transparent_hugepage_flags);
+ clear_bit(req_madv, &transparent_hugepage_flags);
+ } else
+ return -EINVAL;
+
+ return count;
+}
+
+static ssize_t enabled_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return double_flag_show(kobj, attr, buf,
+ TRANSPARENT_HUGEPAGE_FLAG,
+ TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG);
+}
+static ssize_t enabled_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ ssize_t ret;
+
+ ret = double_flag_store(kobj, attr, buf, count,
+ TRANSPARENT_HUGEPAGE_FLAG,
+ TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG);
+
+ if (ret > 0) {
+ int err = start_khugepaged();
+ if (err)
+ ret = err;
+ }
+
+ if (ret > 0 &&
+ (test_bit(TRANSPARENT_HUGEPAGE_FLAG,
+ &transparent_hugepage_flags) ||
+ test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
+ &transparent_hugepage_flags)))
+ set_recommended_min_free_kbytes();
+
+ return ret;
+}
+static struct kobj_attribute enabled_attr =
+ __ATTR(enabled, 0644, enabled_show, enabled_store);
+
+static ssize_t single_flag_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf,
+ enum transparent_hugepage_flag flag)
+{
+ return sprintf(buf, "%d\n",
+ !!test_bit(flag, &transparent_hugepage_flags));
+}
+
+static ssize_t single_flag_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count,
+ enum transparent_hugepage_flag flag)
+{
+ unsigned long value;
+ int ret;
+
+ ret = kstrtoul(buf, 10, &value);
+ if (ret < 0)
+ return ret;
+ if (value > 1)
+ return -EINVAL;
+
+ if (value)
+ set_bit(flag, &transparent_hugepage_flags);
+ else
+ clear_bit(flag, &transparent_hugepage_flags);
+
+ return count;
+}
+
+/*
+ * Currently defrag only disables __GFP_NOWAIT for allocation. A blind
+ * __GFP_REPEAT is too aggressive, it's never worth swapping tons of
+ * memory just to allocate one more hugepage.
+ */
+static ssize_t defrag_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return double_flag_show(kobj, attr, buf,
+ TRANSPARENT_HUGEPAGE_DEFRAG_FLAG,
+ TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG);
+}
+static ssize_t defrag_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ return double_flag_store(kobj, attr, buf, count,
+ TRANSPARENT_HUGEPAGE_DEFRAG_FLAG,
+ TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG);
+}
+static struct kobj_attribute defrag_attr =
+ __ATTR(defrag, 0644, defrag_show, defrag_store);
+
+#ifdef CONFIG_DEBUG_VM
+static ssize_t debug_cow_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return single_flag_show(kobj, attr, buf,
+ TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG);
+}
+static ssize_t debug_cow_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ return single_flag_store(kobj, attr, buf, count,
+ TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG);
+}
+static struct kobj_attribute debug_cow_attr =
+ __ATTR(debug_cow, 0644, debug_cow_show, debug_cow_store);
+#endif /* CONFIG_DEBUG_VM */
+
+static struct attribute *hugepage_attr[] = {
+ &enabled_attr.attr,
+ &defrag_attr.attr,
+#ifdef CONFIG_DEBUG_VM
+ &debug_cow_attr.attr,
+#endif
+ NULL,
+};
+
+static struct attribute_group hugepage_attr_group = {
+ .attrs = hugepage_attr,
+};
+
+static ssize_t scan_sleep_millisecs_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf, "%u\n", khugepaged_scan_sleep_millisecs);
+}
+
+static ssize_t scan_sleep_millisecs_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ unsigned long msecs;
+ int err;
+
+ err = strict_strtoul(buf, 10, &msecs);
+ if (err || msecs > UINT_MAX)
+ return -EINVAL;
+
+ khugepaged_scan_sleep_millisecs = msecs;
+ wake_up_interruptible(&khugepaged_wait);
+
+ return count;
+}
+static struct kobj_attribute scan_sleep_millisecs_attr =
+ __ATTR(scan_sleep_millisecs, 0644, scan_sleep_millisecs_show,
+ scan_sleep_millisecs_store);
+
+static ssize_t alloc_sleep_millisecs_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf, "%u\n", khugepaged_alloc_sleep_millisecs);
+}
+
+static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ unsigned long msecs;
+ int err;
+
+ err = strict_strtoul(buf, 10, &msecs);
+ if (err || msecs > UINT_MAX)
+ return -EINVAL;
+
+ khugepaged_alloc_sleep_millisecs = msecs;
+ wake_up_interruptible(&khugepaged_wait);
+
+ return count;
+}
+static struct kobj_attribute alloc_sleep_millisecs_attr =
+ __ATTR(alloc_sleep_millisecs, 0644, alloc_sleep_millisecs_show,
+ alloc_sleep_millisecs_store);
+
+static ssize_t pages_to_scan_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf, "%u\n", khugepaged_pages_to_scan);
+}
+static ssize_t pages_to_scan_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ int err;
+ unsigned long pages;
+
+ err = strict_strtoul(buf, 10, &pages);
+ if (err || !pages || pages > UINT_MAX)
+ return -EINVAL;
+
+ khugepaged_pages_to_scan = pages;
+
+ return count;
+}
+static struct kobj_attribute pages_to_scan_attr =
+ __ATTR(pages_to_scan, 0644, pages_to_scan_show,
+ pages_to_scan_store);
+
+static ssize_t pages_collapsed_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf, "%u\n", khugepaged_pages_collapsed);
+}
+static struct kobj_attribute pages_collapsed_attr =
+ __ATTR_RO(pages_collapsed);
+
+static ssize_t full_scans_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf, "%u\n", khugepaged_full_scans);
+}
+static struct kobj_attribute full_scans_attr =
+ __ATTR_RO(full_scans);
+
+static ssize_t khugepaged_defrag_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return single_flag_show(kobj, attr, buf,
+ TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
+}
+static ssize_t khugepaged_defrag_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ return single_flag_store(kobj, attr, buf, count,
+ TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
+}
+static struct kobj_attribute khugepaged_defrag_attr =
+ __ATTR(defrag, 0644, khugepaged_defrag_show,
+ khugepaged_defrag_store);
+
+/*
+ * max_ptes_none controls whether khugepaged may collapse hugepages over
+ * ranges that still contain unmapped ptes, which potentially increases
+ * the memory footprint of the vmas. When max_ptes_none is 0, khugepaged
+ * will not reduce the free memory in the system as it runs. Increasing
+ * max_ptes_none instead potentially reduces the free memory in the
+ * system during the khugepaged scan.
+ */
+static ssize_t khugepaged_max_ptes_none_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf, "%u\n", khugepaged_max_ptes_none);
+}
+static ssize_t khugepaged_max_ptes_none_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ int err;
+ unsigned long max_ptes_none;
+
+ err = strict_strtoul(buf, 10, &max_ptes_none);
+ if (err || max_ptes_none > HPAGE_PMD_NR-1)
+ return -EINVAL;
+
+ khugepaged_max_ptes_none = max_ptes_none;
+
+ return count;
+}
+static struct kobj_attribute khugepaged_max_ptes_none_attr =
+ __ATTR(max_ptes_none, 0644, khugepaged_max_ptes_none_show,
+ khugepaged_max_ptes_none_store);
+
+static struct attribute *khugepaged_attr[] = {
+ &khugepaged_defrag_attr.attr,
+ &khugepaged_max_ptes_none_attr.attr,
+ &pages_to_scan_attr.attr,
+ &pages_collapsed_attr.attr,
+ &full_scans_attr.attr,
+ &scan_sleep_millisecs_attr.attr,
+ &alloc_sleep_millisecs_attr.attr,
+ NULL,
+};
+
+static struct attribute_group khugepaged_attr_group = {
+ .attrs = khugepaged_attr,
+ .name = "khugepaged",
+};
+#endif /* CONFIG_SYSFS */
+
+static int __init hugepage_init(void)
+{
+ int err;
+#ifdef CONFIG_SYSFS
+ static struct kobject *hugepage_kobj;
+#endif
+
+ err = -EINVAL;
+ if (!has_transparent_hugepage()) {
+ transparent_hugepage_flags = 0;
+ goto out;
+ }
+
+#ifdef CONFIG_SYSFS
+ err = -ENOMEM;
+ hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj);
+ if (unlikely(!hugepage_kobj)) {
+ printk(KERN_ERR "hugepage: failed kobject create\n");
+ goto out;
+ }
+
+ err = sysfs_create_group(hugepage_kobj, &hugepage_attr_group);
+ if (err) {
+ printk(KERN_ERR "hugepage: failed register hugeage group\n");
+ goto out;
+ }
+
+ err = sysfs_create_group(hugepage_kobj, &khugepaged_attr_group);
+ if (err) {
+ printk(KERN_ERR "hugepage: failed register hugeage group\n");
+ goto out;
+ }
+#endif
+
+ err = khugepaged_slab_init();
+ if (err)
+ goto out;
+
+ err = mm_slots_hash_init();
+ if (err) {
+ khugepaged_slab_free();
+ goto out;
+ }
+
+ /*
+ * By default disable transparent hugepages on smaller systems,
+ * where the extra memory used could hurt more than TLB overhead
+ * is likely to save. The admin can still enable it through /sys.
+ */
+ if (totalram_pages < (512 << (20 - PAGE_SHIFT)))
+ transparent_hugepage_flags = 0;
+
+ start_khugepaged();
+
+ set_recommended_min_free_kbytes();
+
+out:
+ return err;
+}
+module_init(hugepage_init)
+
+static int __init setup_transparent_hugepage(char *str)
+{
+ int ret = 0;
+ if (!str)
+ goto out;
+ if (!strcmp(str, "always")) {
+ set_bit(TRANSPARENT_HUGEPAGE_FLAG,
+ &transparent_hugepage_flags);
+ clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
+ &transparent_hugepage_flags);
+ ret = 1;
+ } else if (!strcmp(str, "madvise")) {
+ clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
+ &transparent_hugepage_flags);
+ set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
+ &transparent_hugepage_flags);
+ ret = 1;
+ } else if (!strcmp(str, "never")) {
+ clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
+ &transparent_hugepage_flags);
+ clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
+ &transparent_hugepage_flags);
+ ret = 1;
+ }
+out:
+ if (!ret)
+ printk(KERN_WARNING
+ "transparent_hugepage= cannot parse, ignored\n");
+ return ret;
+}
+__setup("transparent_hugepage=", setup_transparent_hugepage);
+
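+/*
+ * Queue a preallocated pte page table on mm->pmd_huge_pte (FIFO); it is
+ * consumed later by get_pmd_huge_pte() when the huge pmd is split back
+ * into regular ptes or zapped.
+ */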
+static void prepare_pmd_huge_pte(pgtable_t pgtable,
+ struct mm_struct *mm)
+{
+ assert_spin_locked(&mm->page_table_lock);
+
+ /* FIFO */
+ if (!mm->pmd_huge_pte)
+ INIT_LIST_HEAD(&pgtable->lru);
+ else
+ list_add(&pgtable->lru, &mm->pmd_huge_pte->lru);
+ mm->pmd_huge_pte = pgtable;
+}
+
+static inline pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
+{
+ if (likely(vma->vm_flags & VM_WRITE))
+ pmd = pmd_mkwrite(pmd);
+ return pmd;
+}
+
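+/*
+ * Map a freshly allocated (and already charged) hugepage at haddr:
+ * preallocate a pte page table for a later split, clear the hugepage
+ * and install the huge pmd under mm->page_table_lock.
+ */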
+static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
+ struct vm_area_struct *vma,
+ unsigned long haddr, pmd_t *pmd,
+ struct page *page)
+{
+ int ret = 0;
+ pgtable_t pgtable;
+
+ VM_BUG_ON(!PageCompound(page));
+ pgtable = pte_alloc_one(mm, haddr);
+ if (unlikely(!pgtable)) {
+ mem_cgroup_uncharge_page(page);
+ put_page(page);
+ return VM_FAULT_OOM;
+ }
+
+ clear_huge_page(page, haddr, HPAGE_PMD_NR);
+ __SetPageUptodate(page);
+
+ spin_lock(&mm->page_table_lock);
+ if (unlikely(!pmd_none(*pmd))) {
+ spin_unlock(&mm->page_table_lock);
+ mem_cgroup_uncharge_page(page);
+ put_page(page);
+ pte_free(mm, pgtable);
+ } else {
+ pmd_t entry;
+ entry = mk_pmd(page, vma->vm_page_prot);
+ entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
+ entry = pmd_mkhuge(entry);
+ /*
+ * The spinlocking to take the lru_lock inside
+ * page_add_new_anon_rmap() acts as a full memory
+ * barrier to make sure the clear_huge_page writes
+ * become visible before the set_pmd_at() write.
+ */
+ page_add_new_anon_rmap(page, vma, haddr);
+ set_pmd_at(mm, haddr, pmd, entry);
+ prepare_pmd_huge_pte(pgtable, mm);
+ add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
+ spin_unlock(&mm->page_table_lock);
+ }
+
+ return ret;
+}
+
+static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp)
+{
+ return (GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT)) | extra_gfp;
+}
+
+static inline struct page *alloc_hugepage_vma(int defrag,
+ struct vm_area_struct *vma,
+ unsigned long haddr, int nd,
+ gfp_t extra_gfp)
+{
+ return alloc_pages_vma(alloc_hugepage_gfpmask(defrag, extra_gfp),
+ HPAGE_PMD_ORDER, vma, haddr, nd);
+}
+
+#ifndef CONFIG_NUMA
+static inline struct page *alloc_hugepage(int defrag)
+{
+ return alloc_pages(alloc_hugepage_gfpmask(defrag, 0),
+ HPAGE_PMD_ORDER);
+}
+#endif
+
+int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmd,
+ unsigned int flags)
+{
+ struct page *page;
+ unsigned long haddr = address & HPAGE_PMD_MASK;
+ pte_t *pte;
+
+ if (haddr >= vma->vm_start && haddr + HPAGE_PMD_SIZE <= vma->vm_end) {
+ if (unlikely(anon_vma_prepare(vma)))
+ return VM_FAULT_OOM;
+ if (unlikely(khugepaged_enter(vma)))
+ return VM_FAULT_OOM;
+ page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
+ vma, haddr, numa_node_id(), 0);
+ if (unlikely(!page)) {
+ count_vm_event(THP_FAULT_FALLBACK);
+ goto out;
+ }
+ count_vm_event(THP_FAULT_ALLOC);
+ if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) {
+ put_page(page);
+ goto out;
+ }
+
+ return __do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page);
+ }
+out:
+ /*
+ * Use __pte_alloc instead of pte_alloc_map, because we can't
+ * run pte_offset_map on the pmd, if a huge pmd could
+ * materialize from under us from a different thread.
+ */
+ if (unlikely(__pte_alloc(mm, vma, pmd, address)))
+ return VM_FAULT_OOM;
+ /* if a huge pmd materialized from under us just retry later */
+ if (unlikely(pmd_trans_huge(*pmd)))
+ return 0;
+ /*
+ * A regular pmd is established and it can't morph into a huge pmd
+ * from under us anymore at this point because we hold the mmap_sem
+ * read mode and khugepaged takes it in write mode. So now it's
+ * safe to run pte_offset_map().
+ */
+ pte = pte_offset_map(pmd, address);
+ return handle_pte_fault(mm, vma, address, pte, pmd, flags);
+}
+
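+/*
+ * Called at fork() time to copy a huge pmd from src_mm to dst_mm: the
+ * parent pmd is write protected and the hugepage is shared, or we wait
+ * for a concurrent split_huge_page and return -EAGAIN.
+ */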
+int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
+ pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
+ struct vm_area_struct *vma)
+{
+ struct page *src_page;
+ pmd_t pmd;
+ pgtable_t pgtable;
+ int ret;
+
+ ret = -ENOMEM;
+ pgtable = pte_alloc_one(dst_mm, addr);
+ if (unlikely(!pgtable))
+ goto out;
+
+ spin_lock(&dst_mm->page_table_lock);
+ spin_lock_nested(&src_mm->page_table_lock, SINGLE_DEPTH_NESTING);
+
+ ret = -EAGAIN;
+ pmd = *src_pmd;
+ if (unlikely(!pmd_trans_huge(pmd))) {
+ pte_free(dst_mm, pgtable);
+ goto out_unlock;
+ }
+ if (unlikely(pmd_trans_splitting(pmd))) {
+ /* split huge page running from under us */
+ spin_unlock(&src_mm->page_table_lock);
+ spin_unlock(&dst_mm->page_table_lock);
+ pte_free(dst_mm, pgtable);
+
+ wait_split_huge_page(vma->anon_vma, src_pmd); /* src_vma */
+ goto out;
+ }
+ src_page = pmd_page(pmd);
+ VM_BUG_ON(!PageHead(src_page));
+ get_page(src_page);
+ page_dup_rmap(src_page);
+ add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
+
+ pmdp_set_wrprotect(src_mm, addr, src_pmd);
+ pmd = pmd_mkold(pmd_wrprotect(pmd));
+ set_pmd_at(dst_mm, addr, dst_pmd, pmd);
+ prepare_pmd_huge_pte(pgtable, dst_mm);
+
+ ret = 0;
+out_unlock:
+ spin_unlock(&src_mm->page_table_lock);
+ spin_unlock(&dst_mm->page_table_lock);
+out:
+ return ret;
+}
+
+/* no "address" argument, so this destroys the page coloring of some archs */
+pgtable_t get_pmd_huge_pte(struct mm_struct *mm)
+{
+ pgtable_t pgtable;
+
+ assert_spin_locked(&mm->page_table_lock);
+
+ /* FIFO */
+ pgtable = mm->pmd_huge_pte;
+ if (list_empty(&pgtable->lru))
+ mm->pmd_huge_pte = NULL;
+ else {
+ mm->pmd_huge_pte = list_entry(pgtable->lru.next,
+ struct page, lru);
+ list_del(&pgtable->lru);
+ }
+ return pgtable;
+}
+
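+/*
+ * COW fallback when a new hugepage cannot be allocated: copy the data
+ * into HPAGE_PMD_NR regular pages and replace the huge pmd with a page
+ * table mapping them.
+ */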
+static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
+ struct vm_area_struct *vma,
+ unsigned long address,
+ pmd_t *pmd, pmd_t orig_pmd,
+ struct page *page,
+ unsigned long haddr)
+{
+ pgtable_t pgtable;
+ pmd_t _pmd;
+ int ret = 0, i;
+ struct page **pages;
+
+ pages = kmalloc(sizeof(struct page *) * HPAGE_PMD_NR,
+ GFP_KERNEL);
+ if (unlikely(!pages)) {
+ ret |= VM_FAULT_OOM;
+ goto out;
+ }
+
+ for (i = 0; i < HPAGE_PMD_NR; i++) {
+ pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE |
+ __GFP_OTHER_NODE,
+ vma, address, page_to_nid(page));
+ if (unlikely(!pages[i] ||
+ mem_cgroup_newpage_charge(pages[i], mm,
+ GFP_KERNEL))) {
+ if (pages[i])
+ put_page(pages[i]);
+ mem_cgroup_uncharge_start();
+ while (--i >= 0) {
+ mem_cgroup_uncharge_page(pages[i]);
+ put_page(pages[i]);
+ }
+ mem_cgroup_uncharge_end();
+ kfree(pages);
+ ret |= VM_FAULT_OOM;
+ goto out;
+ }
+ }
+
+ for (i = 0; i < HPAGE_PMD_NR; i++) {
+ copy_user_highpage(pages[i], page + i,
+ haddr + PAGE_SHIFT*i, vma);
+ __SetPageUptodate(pages[i]);
+ cond_resched();
+ }
+
+ spin_lock(&mm->page_table_lock);
+ if (unlikely(!pmd_same(*pmd, orig_pmd)))
+ goto out_free_pages;
+ VM_BUG_ON(!PageHead(page));
+
+ pmdp_clear_flush_notify(vma, haddr, pmd);
+ /* leave pmd empty until pte is filled */
+
+ pgtable = get_pmd_huge_pte(mm);
+ pmd_populate(mm, &_pmd, pgtable);
+
+ for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
+ pte_t *pte, entry;
+ entry = mk_pte(pages[i], vma->vm_page_prot);
+ entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+ page_add_new_anon_rmap(pages[i], vma, haddr);
+ pte = pte_offset_map(&_pmd, haddr);
+ VM_BUG_ON(!pte_none(*pte));
+ set_pte_at(mm, haddr, pte, entry);
+ pte_unmap(pte);
+ }
+ kfree(pages);
+
+ mm->nr_ptes++;
+ smp_wmb(); /* make pte visible before pmd */
+ pmd_populate(mm, pmd, pgtable);
+ page_remove_rmap(page);
+ spin_unlock(&mm->page_table_lock);
+
+ ret |= VM_FAULT_WRITE;
+ put_page(page);
+
+out:
+ return ret;
+
+out_free_pages:
+ spin_unlock(&mm->page_table_lock);
+ mem_cgroup_uncharge_start();
+ for (i = 0; i < HPAGE_PMD_NR; i++) {
+ mem_cgroup_uncharge_page(pages[i]);
+ put_page(pages[i]);
+ }
+ mem_cgroup_uncharge_end();
+ kfree(pages);
+ goto out;
+}
+
+int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmd, pmd_t orig_pmd)
+{
+ int ret = 0;
+ struct page *page, *new_page;
+ unsigned long haddr;
+
+ VM_BUG_ON(!vma->anon_vma);
+ spin_lock(&mm->page_table_lock);
+ if (unlikely(!pmd_same(*pmd, orig_pmd)))
+ goto out_unlock;
+
+ page = pmd_page(orig_pmd);
+ VM_BUG_ON(!PageCompound(page) || !PageHead(page));
+ haddr = address & HPAGE_PMD_MASK;
+ if (page_mapcount(page) == 1) {
+ pmd_t entry;
+ entry = pmd_mkyoung(orig_pmd);
+ entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
+ if (pmdp_set_access_flags(vma, haddr, pmd, entry, 1))
+ update_mmu_cache(vma, address, entry);
+ ret |= VM_FAULT_WRITE;
+ goto out_unlock;
+ }
+ get_page(page);
+ spin_unlock(&mm->page_table_lock);
+
+ if (transparent_hugepage_enabled(vma) &&
+ !transparent_hugepage_debug_cow())
+ new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
+ vma, haddr, numa_node_id(), 0);
+ else
+ new_page = NULL;
+
+ if (unlikely(!new_page)) {
+ count_vm_event(THP_FAULT_FALLBACK);
+ ret = do_huge_pmd_wp_page_fallback(mm, vma, address,
+ pmd, orig_pmd, page, haddr);
+ put_page(page);
+ goto out;
+ }
+ count_vm_event(THP_FAULT_ALLOC);
+
+ if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
+ put_page(new_page);
+ put_page(page);
+ ret |= VM_FAULT_OOM;
+ goto out;
+ }
+
+ copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR);
+ __SetPageUptodate(new_page);
+
+ spin_lock(&mm->page_table_lock);
+ put_page(page);
+ if (unlikely(!pmd_same(*pmd, orig_pmd))) {
+ mem_cgroup_uncharge_page(new_page);
+ put_page(new_page);
+ } else {
+ pmd_t entry;
+ VM_BUG_ON(!PageHead(page));
+ entry = mk_pmd(new_page, vma->vm_page_prot);
+ entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
+ entry = pmd_mkhuge(entry);
+ pmdp_clear_flush_notify(vma, haddr, pmd);
+ page_add_new_anon_rmap(new_page, vma, haddr);
+ set_pmd_at(mm, haddr, pmd, entry);
+ update_mmu_cache(vma, address, entry);
+ page_remove_rmap(page);
+ put_page(page);
+ ret |= VM_FAULT_WRITE;
+ }
+out_unlock:
+ spin_unlock(&mm->page_table_lock);
+out:
+ return ret;
+}
+
+struct page *follow_trans_huge_pmd(struct mm_struct *mm,
+ unsigned long addr,
+ pmd_t *pmd,
+ unsigned int flags)
+{
+ struct page *page = NULL;
+
+ assert_spin_locked(&mm->page_table_lock);
+
+ if (flags & FOLL_WRITE && !pmd_write(*pmd))
+ goto out;
+
+ page = pmd_page(*pmd);
+ VM_BUG_ON(!PageHead(page));
+ if (flags & FOLL_TOUCH) {
+ pmd_t _pmd;
+ /*
+ * We should set the dirty bit only for FOLL_WRITE, but
+ * for now the dirty bit in the pmd is meaningless.
+ * If the dirty bit ever becomes meaningful and we only
+ * set it with FOLL_WRITE, an atomic set_bit will be
+ * required on the pmd to set the young bit, instead of
+ * the current set_pmd_at.
+ */
+ _pmd = pmd_mkyoung(pmd_mkdirty(*pmd));
+ set_pmd_at(mm, addr & HPAGE_PMD_MASK, pmd, _pmd);
+ }
+ page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
+ VM_BUG_ON(!PageCompound(page));
+ if (flags & FOLL_GET)
+ get_page(page);
+
+out:
+ return page;
+}
+
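+/*
+ * Tear down a huge pmd during zap/unmap. Returns 1 if the huge pmd was
+ * cleared, 0 if the caller has to fall back to the regular pte path
+ * (possibly after waiting for a concurrent split to complete).
+ */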
+int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
+ pmd_t *pmd)
+{
+ int ret = 0;
+
+ spin_lock(&tlb->mm->page_table_lock);
+ if (likely(pmd_trans_huge(*pmd))) {
+ if (unlikely(pmd_trans_splitting(*pmd))) {
+ spin_unlock(&tlb->mm->page_table_lock);
+ wait_split_huge_page(vma->anon_vma, pmd);
+ } else {
+ struct page *page;
+ pgtable_t pgtable;
+ pgtable = get_pmd_huge_pte(tlb->mm);
+ page = pmd_page(*pmd);
+ pmd_clear(pmd);
+ page_remove_rmap(page);
+ VM_BUG_ON(page_mapcount(page) < 0);
+ add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
+ VM_BUG_ON(!PageHead(page));
+ spin_unlock(&tlb->mm->page_table_lock);
+ tlb_remove_page(tlb, page);
+ pte_free(tlb->mm, pgtable);
+ ret = 1;
+ }
+ } else
+ spin_unlock(&tlb->mm->page_table_lock);
+
+ return ret;
+}
+
+int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
+ unsigned long addr, unsigned long end,
+ unsigned char *vec)
+{
+ int ret = 0;
+
+ spin_lock(&vma->vm_mm->page_table_lock);
+ if (likely(pmd_trans_huge(*pmd))) {
+ ret = !pmd_trans_splitting(*pmd);
+ spin_unlock(&vma->vm_mm->page_table_lock);
+ if (unlikely(!ret))
+ wait_split_huge_page(vma->anon_vma, pmd);
+ else {
+ /*
+ * All logical pages in the range are present
+ * if backed by a huge page.
+ */
+ memset(vec, 1, (end - addr) >> PAGE_SHIFT);
+ }
+ } else
+ spin_unlock(&vma->vm_mm->page_table_lock);
+
+ return ret;
+}
+
+int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
+ unsigned long addr, pgprot_t newprot)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ int ret = 0;
+
+ spin_lock(&mm->page_table_lock);
+ if (likely(pmd_trans_huge(*pmd))) {
+ if (unlikely(pmd_trans_splitting(*pmd))) {
+ spin_unlock(&mm->page_table_lock);
+ wait_split_huge_page(vma->anon_vma, pmd);
+ } else {
+ pmd_t entry;
+
+ entry = pmdp_get_and_clear(mm, addr, pmd);
+ entry = pmd_modify(entry, newprot);
+ set_pmd_at(mm, addr, pmd, entry);
+ spin_unlock(&vma->vm_mm->page_table_lock);
+ flush_tlb_range(vma, addr, addr + HPAGE_PMD_SIZE);
+ ret = 1;
+ }
+ } else
+ spin_unlock(&vma->vm_mm->page_table_lock);
+
+ return ret;
+}
+
+pmd_t *page_check_address_pmd(struct page *page,
+ struct mm_struct *mm,
+ unsigned long address,
+ enum page_check_address_pmd_flag flag)
+{
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd, *ret = NULL;
+
+ if (address & ~HPAGE_PMD_MASK)
+ goto out;
+
+ pgd = pgd_offset(mm, address);
+ if (!pgd_present(*pgd))
+ goto out;
+
+ pud = pud_offset(pgd, address);
+ if (!pud_present(*pud))
+ goto out;
+
+ pmd = pmd_offset(pud, address);
+ if (pmd_none(*pmd))
+ goto out;
+ if (pmd_page(*pmd) != page)
+ goto out;
+ /*
+ * split_vma() may create temporary aliased mappings. There is
+ * no risk as long as all huge pmd are found and have their
+ * splitting bit set before __split_huge_page_refcount
+ * runs. Finding the same huge pmd more than once during the
+ * same rmap walk is not a problem.
+ */
+ if (flag == PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG &&
+ pmd_trans_splitting(*pmd))
+ goto out;
+ if (pmd_trans_huge(*pmd)) {
+ VM_BUG_ON(flag == PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG &&
+ !pmd_trans_splitting(*pmd));
+ ret = pmd;
+ }
+out:
+ return ret;
+}
+
+static int __split_huge_page_splitting(struct page *page,
+ struct vm_area_struct *vma,
+ unsigned long address)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ pmd_t *pmd;
+ int ret = 0;
+
+ spin_lock(&mm->page_table_lock);
+ pmd = page_check_address_pmd(page, mm, address,
+ PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG);
+ if (pmd) {
+ /*
+ * We can't temporarily set the pmd to null in order
+ * to split it, the pmd must remain marked huge at all
+ * times or the VM won't take the pmd_trans_huge paths
+ * and it won't wait on the anon_vma->root->lock to
+ * serialize against split_huge_page*.
+ */
+ pmdp_splitting_flush_notify(vma, address, pmd);
+ ret = 1;
+ }
+ spin_unlock(&mm->page_table_lock);
+
+ return ret;
+}
+
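+/*
+ * Second split phase: distribute the head page's refcounts, mapcount
+ * and flags to the tail pages and clear PageCompound, turning the
+ * hugepage into HPAGE_PMD_NR independent pages on the LRU.
+ */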
+static void __split_huge_page_refcount(struct page *page)
+{
+ int i;
+ unsigned long head_index = page->index;
+ struct zone *zone = page_zone(page);
+ int zonestat;
+
+ /* prevent PageLRU to go away from under us, and freeze lru stats */
+ spin_lock_irq(&zone->lru_lock);
+ compound_lock(page);
+
+ for (i = 1; i < HPAGE_PMD_NR; i++) {
+ struct page *page_tail = page + i;
+
+ /* tail_page->_count cannot change */
+ atomic_sub(atomic_read(&page_tail->_count), &page->_count);
+ BUG_ON(page_count(page) <= 0);
+ atomic_add(page_mapcount(page) + 1, &page_tail->_count);
+ BUG_ON(atomic_read(&page_tail->_count) <= 0);
+
+ /* after clearing PageTail the gup refcount can be released */
+ smp_mb();
+
+ /*
+ * Retain the hwpoison flag of the poisoned tail page:
+ * otherwise memory-failure could end up killing the
+ * wrong process in a KVM guest.
+ */
+ page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP | __PG_HWPOISON;
+ page_tail->flags |= (page->flags &
+ ((1L << PG_referenced) |
+ (1L << PG_swapbacked) |
+ (1L << PG_mlocked) |
+ (1L << PG_uptodate)));
+ page_tail->flags |= (1L << PG_dirty);
+
+ /*
+ * 1) clear PageTail before overwriting first_page
+ * 2) clear PageTail before clearing PageHead for VM_BUG_ON
+ */
+ smp_wmb();
+
+ /*
+ * __split_huge_page_splitting() already set the
+ * splitting bit in all pmds that could map this
+ * hugepage, which ensures no CPU can alter the
+ * mapcount on the head page. The mapcount is only
+ * accounted in the head page and it has to be
+ * transferred to all tail pages in the code below, so
+ * for this code to be safe the mapcount can't change
+ * during the split. But that doesn't mean userland
+ * can't keep changing and reading the page contents
+ * while we transfer the mapcount, which is why the pmd
+ * splitting status is achieved by setting a reserved
+ * bit in the pmd, not by clearing the present bit.
+ */
+ BUG_ON(page_mapcount(page_tail));
+ page_tail->_mapcount = page->_mapcount;
+
+ BUG_ON(page_tail->mapping);
+ page_tail->mapping = page->mapping;
+
+ page_tail->index = ++head_index;
+
+ BUG_ON(!PageAnon(page_tail));
+ BUG_ON(!PageUptodate(page_tail));
+ BUG_ON(!PageDirty(page_tail));
+ BUG_ON(!PageSwapBacked(page_tail));
+
+ mem_cgroup_split_huge_fixup(page, page_tail);
+
+ lru_add_page_tail(zone, page, page_tail);
+ }
+
+ __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
+ __mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR);
+
+ /*
+ * A hugepage counts for HPAGE_PMD_NR pages on the LRU statistics,
+ * so adjust those appropriately if this page is on the LRU.
+ */
+ if (PageLRU(page)) {
+ zonestat = NR_LRU_BASE + page_lru(page);
+ __mod_zone_page_state(zone, zonestat, -(HPAGE_PMD_NR-1));
+ }
+
+ ClearPageCompound(page);
+ compound_unlock(page);
+ spin_unlock_irq(&zone->lru_lock);
+
+ for (i = 1; i < HPAGE_PMD_NR; i++) {
+ struct page *page_tail = page + i;
+ BUG_ON(page_count(page_tail) <= 0);
+ /*
+ * Tail pages may be freed if there wasn't any mapping
+ * like if add_to_swap() is running on a lru page that
+ * had its mapping zapped. And freeing these pages
+ * requires taking the lru_lock so we do the put_page
+ * of the tail pages after the split is complete.
+ */
+ put_page(page_tail);
+ }
+
+ /*
+ * Only the head page (now become a regular page) is required
+ * to be pinned by the caller.
+ */
+ BUG_ON(page_count(page) <= 0);
+}
+
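+/*
+ * Final split phase for one mapping: replace the splitting huge pmd
+ * with a page table of regular ptes pointing at the (now independent)
+ * subpages.
+ */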
+static int __split_huge_page_map(struct page *page,
+ struct vm_area_struct *vma,
+ unsigned long address)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ pmd_t *pmd, _pmd;
+ int ret = 0, i;
+ pgtable_t pgtable;
+ unsigned long haddr;
+
+ spin_lock(&mm->page_table_lock);
+ pmd = page_check_address_pmd(page, mm, address,
+ PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG);
+ if (pmd) {
+ pgtable = get_pmd_huge_pte(mm);
+ pmd_populate(mm, &_pmd, pgtable);
+
+ for (i = 0, haddr = address; i < HPAGE_PMD_NR;
+ i++, haddr += PAGE_SIZE) {
+ pte_t *pte, entry;
+ BUG_ON(PageCompound(page+i));
+ entry = mk_pte(page + i, vma->vm_page_prot);
+ entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+ if (!pmd_write(*pmd))
+ entry = pte_wrprotect(entry);
+ else
+ BUG_ON(page_mapcount(page) != 1);
+ if (!pmd_young(*pmd))
+ entry = pte_mkold(entry);
+ pte = pte_offset_map(&_pmd, haddr);
+ BUG_ON(!pte_none(*pte));
+ set_pte_at(mm, haddr, pte, entry);
+ pte_unmap(pte);
+ }
+
+ mm->nr_ptes++;
+ smp_wmb(); /* make pte visible before pmd */
+ /*
+ * Up to this point the pmd is present and huge and
+ * userland has full access to the hugepage during
+ * the split (which happens in place). If we
+ * overwrote the pmd with the not-huge version
+ * pointing to the pte here (which of course we could
+ * if all CPUs were bug free), userland could trigger
+ * a small page size TLB miss on the small sized TLB
+ * while the hugepage TLB entry is still established
+ * in the huge TLB. Some CPUs don't like that. See
+ * http://support.amd.com/us/Processor_TechDocs/41322.pdf,
+ * Erratum 383 on page 93. Intel should be safe but
+ * also warns that it's only safe if the permission
+ * and cache attributes of the two entries loaded in
+ * the two TLBs are identical (which should be the
+ * case here). But it is generally safer to never
+ * allow small and huge TLB entries for the same
+ * virtual address to be loaded simultaneously. So
+ * instead of doing "pmd_populate(); flush_tlb_range();"
+ * we first mark the current pmd notpresent (atomically,
+ * because the pmd_trans_huge and pmd_trans_splitting
+ * bits must remain set on the pmd at all times until
+ * the split is complete), then we flush the SMP TLB
+ * and finally we write the non-huge version of the pmd
+ * entry with pmd_populate.
+ */
+ set_pmd_at(mm, address, pmd, pmd_mknotpresent(*pmd));
+ flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+ pmd_populate(mm, pmd, pgtable);
+ ret = 1;
+ }
+ spin_unlock(&mm->page_table_lock);
+
+ return ret;
+}
+
+/* must be called with anon_vma->root->lock held */
+static void __split_huge_page(struct page *page,
+ struct anon_vma *anon_vma)
+{
+ int mapcount, mapcount2;
+ struct anon_vma_chain *avc;
+
+ BUG_ON(!PageHead(page));
+ BUG_ON(PageTail(page));
+
+ mapcount = 0;
+ list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
+ struct vm_area_struct *vma = avc->vma;
+ unsigned long addr = vma_address(page, vma);
+ BUG_ON(is_vma_temporary_stack(vma));
+ if (addr == -EFAULT)
+ continue;
+ mapcount += __split_huge_page_splitting(page, vma, addr);
+ }
+ /*
+ * It is critical that new vmas are added to the tail of the
+ * anon_vma list. This guarantees that if copy_huge_pmd() runs
+ * and establishes a child pmd before
+ * __split_huge_page_splitting() freezes the parent pmd (so if
+ * we fail to prevent copy_huge_pmd() from running until the
+ * whole __split_huge_page() is complete), we will still see
+ * the newly established pmd of the child later during the
+ * walk, to be able to set it as pmd_trans_splitting too.
+ */
+ if (mapcount != page_mapcount(page))
+ printk(KERN_ERR "mapcount %d page_mapcount %d\n",
+ mapcount, page_mapcount(page));
+ BUG_ON(mapcount != page_mapcount(page));
+
+ __split_huge_page_refcount(page);
+
+ mapcount2 = 0;
+ list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
+ struct vm_area_struct *vma = avc->vma;
+ unsigned long addr = vma_address(page, vma);
+ BUG_ON(is_vma_temporary_stack(vma));
+ if (addr == -EFAULT)
+ continue;
+ mapcount2 += __split_huge_page_map(page, vma, addr);
+ }
+ if (mapcount != mapcount2)
+ printk(KERN_ERR "mapcount %d mapcount2 %d page_mapcount %d\n",
+ mapcount, mapcount2, page_mapcount(page));
+ BUG_ON(mapcount != mapcount2);
+}
+
+int split_huge_page(struct page *page)
+{
+ struct anon_vma *anon_vma;
+ int ret = 1;
+
+ BUG_ON(!PageAnon(page));
+ anon_vma = page_lock_anon_vma(page);
+ if (!anon_vma)
+ goto out;
+ ret = 0;
+ if (!PageCompound(page))
+ goto out_unlock;
+
+ BUG_ON(!PageSwapBacked(page));
+ __split_huge_page(page, anon_vma);
+ count_vm_event(THP_SPLIT);
+
+ BUG_ON(PageCompound(page));
+out_unlock:
+ page_unlock_anon_vma(anon_vma);
+out:
+ return ret;
+}
+
+#define VM_NO_THP (VM_SPECIAL|VM_INSERTPAGE|VM_MIXEDMAP|VM_SAO| \
+ VM_HUGETLB|VM_SHARED|VM_MAYSHARE)
+
+int hugepage_madvise(struct vm_area_struct *vma,
+ unsigned long *vm_flags, int advice)
+{
+ switch (advice) {
+ case MADV_HUGEPAGE:
+ /*
+ * Be somewhat over-protective like KSM for now!
+ */
+ if (*vm_flags & (VM_HUGEPAGE | VM_NO_THP))
+ return -EINVAL;
+ *vm_flags &= ~VM_NOHUGEPAGE;
+ *vm_flags |= VM_HUGEPAGE;
+ /*
+ * If the vma becomes good for khugepaged to scan,
+ * register it here without waiting for a page fault
+ * that may not happen any time soon.
+ */
+ if (unlikely(khugepaged_enter_vma_merge(vma)))
+ return -ENOMEM;
+ break;
+ case MADV_NOHUGEPAGE:
+ /*
+ * Be somewhat over-protective like KSM for now!
+ */
+ if (*vm_flags & (VM_NOHUGEPAGE | VM_NO_THP))
+ return -EINVAL;
+ *vm_flags &= ~VM_HUGEPAGE;
+ *vm_flags |= VM_NOHUGEPAGE;
+ /*
+ * Setting VM_NOHUGEPAGE will prevent khugepaged from scanning
+ * this vma even if we leave the mm registered in khugepaged if
+ * it got registered before VM_NOHUGEPAGE was set.
+ */
+ break;
+ }
+
+ return 0;
+}
+
+static int __init khugepaged_slab_init(void)
+{
+ mm_slot_cache = kmem_cache_create("khugepaged_mm_slot",
+ sizeof(struct mm_slot),
+ __alignof__(struct mm_slot), 0, NULL);
+ if (!mm_slot_cache)
+ return -ENOMEM;
+
+ return 0;
+}
+
+static void __init khugepaged_slab_free(void)
+{
+ kmem_cache_destroy(mm_slot_cache);
+ mm_slot_cache = NULL;
+}
+
+static inline struct mm_slot *alloc_mm_slot(void)
+{
+ if (!mm_slot_cache) /* initialization failed */
+ return NULL;
+ return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL);
+}
+
+static inline void free_mm_slot(struct mm_slot *mm_slot)
+{
+ kmem_cache_free(mm_slot_cache, mm_slot);
+}
+
+static int __init mm_slots_hash_init(void)
+{
+ mm_slots_hash = kzalloc(MM_SLOTS_HASH_HEADS * sizeof(struct hlist_head),
+ GFP_KERNEL);
+ if (!mm_slots_hash)
+ return -ENOMEM;
+ return 0;
+}
+
+#if 0
+static void __init mm_slots_hash_free(void)
+{
+ kfree(mm_slots_hash);
+ mm_slots_hash = NULL;
+}
+#endif
+
+static struct mm_slot *get_mm_slot(struct mm_struct *mm)
+{
+ struct mm_slot *mm_slot;
+ struct hlist_head *bucket;
+ struct hlist_node *node;
+
+ bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct))
+ % MM_SLOTS_HASH_HEADS];
+ hlist_for_each_entry(mm_slot, node, bucket, hash) {
+ if (mm == mm_slot->mm)
+ return mm_slot;
+ }
+ return NULL;
+}
+
+static void insert_to_mm_slots_hash(struct mm_struct *mm,
+ struct mm_slot *mm_slot)
+{
+ struct hlist_head *bucket;
+
+ bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct))
+ % MM_SLOTS_HASH_HEADS];
+ mm_slot->mm = mm;
+ hlist_add_head(&mm_slot->hash, bucket);
+}
+
+static inline int khugepaged_test_exit(struct mm_struct *mm)
+{
+ return atomic_read(&mm->mm_users) == 0;
+}
+
+int __khugepaged_enter(struct mm_struct *mm)
+{
+ struct mm_slot *mm_slot;
+ int wakeup;
+
+ mm_slot = alloc_mm_slot();
+ if (!mm_slot)
+ return -ENOMEM;
+
+ /* __khugepaged_exit() must not run from under us */
+ VM_BUG_ON(khugepaged_test_exit(mm));
+ if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) {
+ free_mm_slot(mm_slot);
+ return 0;
+ }
+
+ spin_lock(&khugepaged_mm_lock);
+ insert_to_mm_slots_hash(mm, mm_slot);
+ /*
+ * Insert just behind the scanning cursor, to let the area settle
+ * down a little.
+ */
+ wakeup = list_empty(&khugepaged_scan.mm_head);
+ list_add_tail(&mm_slot->mm_node, &khugepaged_scan.mm_head);
+ spin_unlock(&khugepaged_mm_lock);
+
+ atomic_inc(&mm->mm_count);
+ if (wakeup)
+ wake_up_interruptible(&khugepaged_wait);
+
+ return 0;
+}
+
+int khugepaged_enter_vma_merge(struct vm_area_struct *vma)
+{
+ unsigned long hstart, hend;
+ if (!vma->anon_vma)
+ /*
+ * Not yet faulted in so we will register later in the
+ * page fault if needed.
+ */
+ return 0;
+ if (vma->vm_ops)
+ /* khugepaged not yet working on file or special mappings */
+ return 0;
+ /*
+ * If is_pfn_mapping() is true is_linear_pfn_mapping() must be
+ * true too, verify it here.
+ */
+ VM_BUG_ON(is_linear_pfn_mapping(vma) || vma->vm_flags & VM_NO_THP);
+ hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
+ hend = vma->vm_end & HPAGE_PMD_MASK;
+ if (hstart < hend)
+ return khugepaged_enter(vma);
+ return 0;
+}
+
+void __khugepaged_exit(struct mm_struct *mm)
+{
+ struct mm_slot *mm_slot;
+ int free = 0;
+
+ spin_lock(&khugepaged_mm_lock);
+ mm_slot = get_mm_slot(mm);
+ if (mm_slot && khugepaged_scan.mm_slot != mm_slot) {
+ hlist_del(&mm_slot->hash);
+ list_del(&mm_slot->mm_node);
+ free = 1;
+ }
+
+ if (free) {
+ spin_unlock(&khugepaged_mm_lock);
+ clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
+ free_mm_slot(mm_slot);
+ mmdrop(mm);
+ } else if (mm_slot) {
+ spin_unlock(&khugepaged_mm_lock);
+ /*
+ * This is required to serialize against
+ * khugepaged_test_exit() (which is guaranteed to run
+ * under mmap sem read mode). Stop here (after we
+ * return all pagetables will be destroyed) until
+ * khugepaged has finished working on the pagetables
+ * under the mmap_sem.
+ */
+ down_write(&mm->mmap_sem);
+ up_write(&mm->mmap_sem);
+ } else
+ spin_unlock(&khugepaged_mm_lock);
+}
+
+static void release_pte_page(struct page *page)
+{
+ /* 0 stands for page_is_file_cache(page) == false */
+ dec_zone_page_state(page, NR_ISOLATED_ANON + 0);
+ unlock_page(page);
+ putback_lru_page(page);
+}
+
+static void release_pte_pages(pte_t *pte, pte_t *_pte)
+{
+ while (--_pte >= pte) {
+ pte_t pteval = *_pte;
+ if (!pte_none(pteval))
+ release_pte_page(pte_page(pteval));
+ }
+}
+
+static void release_all_pte_pages(pte_t *pte)
+{
+ release_pte_pages(pte, pte + HPAGE_PMD_NR);
+}
+
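+/*
+ * Lock and isolate from the LRU every page mapped by the pte range
+ * khugepaged wants to collapse; bail out (releasing everything) if any
+ * page is unsuitable (not anonymous, shared, pinned by gup, ...) or if
+ * none of the ptes is young.
+ */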
+static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
+ unsigned long address,
+ pte_t *pte)
+{
+ struct page *page;
+ pte_t *_pte;
+ int referenced = 0, isolated = 0, none = 0;
+ for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
+ _pte++, address += PAGE_SIZE) {
+ pte_t pteval = *_pte;
+ if (pte_none(pteval)) {
+ if (++none <= khugepaged_max_ptes_none)
+ continue;
+ else {
+ release_pte_pages(pte, _pte);
+ goto out;
+ }
+ }
+ if (!pte_present(pteval) || !pte_write(pteval)) {
+ release_pte_pages(pte, _pte);
+ goto out;
+ }
+ page = vm_normal_page(vma, address, pteval);
+ if (unlikely(!page)) {
+ release_pte_pages(pte, _pte);
+ goto out;
+ }
+ VM_BUG_ON(PageCompound(page));
+ BUG_ON(!PageAnon(page));
+ VM_BUG_ON(!PageSwapBacked(page));
+
+ /* cannot use mapcount: can't collapse if there's a gup pin */
+ if (page_count(page) != 1) {
+ release_pte_pages(pte, _pte);
+ goto out;
+ }
+ /*
+ * We can do it before isolate_lru_page because the
+ * page can't be freed from under us. NOTE: PG_lock
+ * is needed to serialize against split_huge_page
+ * when invoked from the VM.
+ */
+ if (!trylock_page(page)) {
+ release_pte_pages(pte, _pte);
+ goto out;
+ }
+ /*
+ * Isolate the page to avoid collapsing a hugepage
+ * currently in use by the VM.
+ */
+ if (isolate_lru_page(page)) {
+ unlock_page(page);
+ release_pte_pages(pte, _pte);
+ goto out;
+ }
+ /* 0 stands for page_is_file_cache(page) == false */
+ inc_zone_page_state(page, NR_ISOLATED_ANON + 0);
+ VM_BUG_ON(!PageLocked(page));
+ VM_BUG_ON(PageLRU(page));
+
+ /* If there is no mapped pte young don't collapse the page */
+ if (pte_young(pteval) || PageReferenced(page) ||
+ mmu_notifier_test_young(vma->vm_mm, address))
+ referenced = 1;
+ }
+ if (unlikely(!referenced))
+ release_all_pte_pages(pte);
+ else
+ isolated = 1;
+out:
+ return isolated;
+}
+
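+/*
+ * Copy the content of the isolated small pages into the new hugepage
+ * and release the old pages; pte_none holes are backed by zeroed
+ * subpages of the hugepage.
+ */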
+static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
+ struct vm_area_struct *vma,
+ unsigned long address,
+ spinlock_t *ptl)
+{
+ pte_t *_pte;
+ for (_pte = pte; _pte < pte+HPAGE_PMD_NR; _pte++) {
+ pte_t pteval = *_pte;
+ struct page *src_page;
+
+ if (pte_none(pteval)) {
+ clear_user_highpage(page, address);
+ add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1);
+ } else {
+ src_page = pte_page(pteval);
+ copy_user_highpage(page, src_page, address, vma);
+ VM_BUG_ON(page_mapcount(src_page) != 1);
+ VM_BUG_ON(page_count(src_page) != 2);
+ release_pte_page(src_page);
+ /*
+ * ptl mostly unnecessary, but preempt has to
+ * be disabled to update the per-cpu stats
+ * inside page_remove_rmap().
+ */
+ spin_lock(ptl);
+ /*
+ * paravirt calls inside pte_clear here are
+ * superfluous.
+ */
+ pte_clear(vma->vm_mm, address, _pte);
+ page_remove_rmap(src_page);
+ spin_unlock(ptl);
+ free_page_and_swap_cache(src_page);
+ }
+
+ address += PAGE_SIZE;
+ page++;
+ }
+}
+
+static void collapse_huge_page(struct mm_struct *mm,
+ unsigned long address,
+ struct page **hpage,
+ struct vm_area_struct *vma,
+ int node)
+{
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd, _pmd;
+ pte_t *pte;
+ pgtable_t pgtable;
+ struct page *new_page;
+ spinlock_t *ptl;
+ int isolated;
+ unsigned long hstart, hend;
+
+ VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+#ifndef CONFIG_NUMA
+ VM_BUG_ON(!*hpage);
+ new_page = *hpage;
+ if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
+ up_read(&mm->mmap_sem);
+ return;
+ }
+#else
+ VM_BUG_ON(*hpage);
+ /*
+ * Allocate the page while the vma is still valid and under
+ * the mmap_sem read mode, so there is no memory allocation
+ * later when we take the mmap_sem in write mode. This is
+ * friendlier behavior (OTOH it may actually hide bugs)
+ * towards filesystems in userland with daemons allocating
+ * memory in the userland I/O paths. Allocating memory with
+ * the mmap_sem in read mode is also a good idea to allow
+ * greater scalability.
+ */
+ new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address,
+ node, __GFP_OTHER_NODE);
+ if (unlikely(!new_page)) {
+ up_read(&mm->mmap_sem);
+ count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
+ *hpage = ERR_PTR(-ENOMEM);
+ return;
+ }
+ count_vm_event(THP_COLLAPSE_ALLOC);
+ if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
+ up_read(&mm->mmap_sem);
+ put_page(new_page);
+ return;
+ }
+#endif
+
+ /* after allocating the hugepage upgrade to mmap_sem write mode */
+ up_read(&mm->mmap_sem);
+
+ /*
+ * Prevent all access to pagetables with the exception of
+ * gup_fast later handled by the ptep_clear_flush and the VM
+ * handled by the anon_vma lock + PG_lock.
+ */
+ down_write(&mm->mmap_sem);
+ if (unlikely(khugepaged_test_exit(mm)))
+ goto out;
+
+ vma = find_vma(mm, address);
+ hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
+ hend = vma->vm_end & HPAGE_PMD_MASK;
+ if (address < hstart || address + HPAGE_PMD_SIZE > hend)
+ goto out;
+
+ if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) ||
+ (vma->vm_flags & VM_NOHUGEPAGE))
+ goto out;
+
+ if (!vma->anon_vma || vma->vm_ops)
+ goto out;
+ if (is_vma_temporary_stack(vma))
+ goto out;
+ /*
+ * If is_pfn_mapping() is true is_linear_pfn_mapping() must be
+ * true too, verify it here.
+ */
+ VM_BUG_ON(is_linear_pfn_mapping(vma) || vma->vm_flags & VM_NO_THP);
+
+ pgd = pgd_offset(mm, address);
+ if (!pgd_present(*pgd))
+ goto out;
+
+ pud = pud_offset(pgd, address);
+ if (!pud_present(*pud))
+ goto out;
+
+ pmd = pmd_offset(pud, address);
+ /* pmd can't go away or become huge under us */
+ if (!pmd_present(*pmd) || pmd_trans_huge(*pmd))
+ goto out;
+
+ anon_vma_lock(vma->anon_vma);
+
+ pte = pte_offset_map(pmd, address);
+ ptl = pte_lockptr(mm, pmd);
+
+ spin_lock(&mm->page_table_lock); /* probably unnecessary */
+ /*
+ * After this gup_fast can't run anymore. This also removes
+ * any huge TLB entry from the CPU so we won't allow
+ * huge and small TLB entries for the same virtual address
+ * to avoid the risk of CPU bugs in that area.
+ */
+ _pmd = pmdp_clear_flush_notify(vma, address, pmd);
+ spin_unlock(&mm->page_table_lock);
+
+ spin_lock(ptl);
+ isolated = __collapse_huge_page_isolate(vma, address, pte);
+ spin_unlock(ptl);
+
+ if (unlikely(!isolated)) {
+ pte_unmap(pte);
+ spin_lock(&mm->page_table_lock);
+ BUG_ON(!pmd_none(*pmd));
+ set_pmd_at(mm, address, pmd, _pmd);
+ spin_unlock(&mm->page_table_lock);
+ anon_vma_unlock(vma->anon_vma);
+ goto out;
+ }
+
+ /*
+ * All pages are isolated and locked so anon_vma rmap
+ * can't run anymore.
+ */
+ anon_vma_unlock(vma->anon_vma);
+
+ __collapse_huge_page_copy(pte, new_page, vma, address, ptl);
+ pte_unmap(pte);
+ __SetPageUptodate(new_page);
+ pgtable = pmd_pgtable(_pmd);
+ VM_BUG_ON(page_count(pgtable) != 1);
+ VM_BUG_ON(page_mapcount(pgtable) != 0);
+
+ _pmd = mk_pmd(new_page, vma->vm_page_prot);
+ _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);
+ _pmd = pmd_mkhuge(_pmd);
+
+ /*
+ * spin_lock() below is not the equivalent of smp_wmb(), so
+ * this is needed to make sure the copy_huge_page writes
+ * become visible before the set_pmd_at() write.
+ */
+ smp_wmb();
+
+ spin_lock(&mm->page_table_lock);
+ BUG_ON(!pmd_none(*pmd));
+ page_add_new_anon_rmap(new_page, vma, address);
+ set_pmd_at(mm, address, pmd, _pmd);
+ update_mmu_cache(vma, address, entry);
+ prepare_pmd_huge_pte(pgtable, mm);
+ mm->nr_ptes--;
+ spin_unlock(&mm->page_table_lock);
+
+#ifndef CONFIG_NUMA
+ *hpage = NULL;
+#endif
+ khugepaged_pages_collapsed++;
+out_up_write:
+ up_write(&mm->mmap_sem);
+ return;
+
+out:
+ mem_cgroup_uncharge_page(new_page);
+#ifdef CONFIG_NUMA
+ put_page(new_page);
+#endif
+ goto out_up_write;
+}
+
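+/*
+ * Scan one pmd-sized range: if the ptes look collapsible (mostly
+ * present, writable, young anonymous pages, not pinned by gup) call
+ * collapse_huge_page(), which returns with the mmap_sem released.
+ */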
+static int khugepaged_scan_pmd(struct mm_struct *mm,
+ struct vm_area_struct *vma,
+ unsigned long address,
+ struct page **hpage)
+{
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte, *_pte;
+ int ret = 0, referenced = 0, none = 0;
+ struct page *page;
+ unsigned long _address;
+ spinlock_t *ptl;
+ int node = -1;
+
+ VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+
+ pgd = pgd_offset(mm, address);
+ if (!pgd_present(*pgd))
+ goto out;
+
+ pud = pud_offset(pgd, address);
+ if (!pud_present(*pud))
+ goto out;
+
+ pmd = pmd_offset(pud, address);
+ if (!pmd_present(*pmd) || pmd_trans_huge(*pmd))
+ goto out;
+
+ pte = pte_offset_map_lock(mm, pmd, address, &ptl);
+ for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR;
+ _pte++, _address += PAGE_SIZE) {
+ pte_t pteval = *_pte;
+ if (pte_none(pteval)) {
+ if (++none <= khugepaged_max_ptes_none)
+ continue;
+ else
+ goto out_unmap;
+ }
+ if (!pte_present(pteval) || !pte_write(pteval))
+ goto out_unmap;
+ page = vm_normal_page(vma, _address, pteval);
+ if (unlikely(!page))
+ goto out_unmap;
+ /*
+ * Choose the node of the first page. This could
+ * be more sophisticated and look at more pages,
+ * but isn't for now.
+ */
+ if (node == -1)
+ node = page_to_nid(page);
+ VM_BUG_ON(PageCompound(page));
+ if (!PageLRU(page) || PageLocked(page) || !PageAnon(page))
+ goto out_unmap;
+ /* cannot use mapcount: can't collapse if there's a gup pin */
+ if (page_count(page) != 1)
+ goto out_unmap;
+ if (pte_young(pteval) || PageReferenced(page) ||
+ mmu_notifier_test_young(vma->vm_mm, address))
+ referenced = 1;
+ }
+ if (referenced)
+ ret = 1;
+out_unmap:
+ pte_unmap_unlock(pte, ptl);
+ if (ret)
+ /* collapse_huge_page will return with the mmap_sem released */
+ collapse_huge_page(mm, address, hpage, vma, node);
+out:
+ return ret;
+}
+
+static void collect_mm_slot(struct mm_slot *mm_slot)
+{
+ struct mm_struct *mm = mm_slot->mm;
+
+ VM_BUG_ON(!spin_is_locked(&khugepaged_mm_lock));
+
+ if (khugepaged_test_exit(mm)) {
+ /* free mm_slot */
+ hlist_del(&mm_slot->hash);
+ list_del(&mm_slot->mm_node);
+
+ /*
+ * Not strictly needed because the mm exited already.
+ *
+ * clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
+ */
+
+ /* khugepaged_mm_lock actually not necessary for the below */
+ free_mm_slot(mm_slot);
+ mmdrop(mm);
+ }
+}
+
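+/*
+ * Walk the vmas of the mm at the scan cursor, scanning up to "pages"
+ * pte entries before returning. Called with khugepaged_mm_lock held;
+ * the lock is dropped around the actual scan and retaken before
+ * returning the progress made.
+ */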
+static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
+ struct page **hpage)
+{
+ struct mm_slot *mm_slot;
+ struct mm_struct *mm;
+ struct vm_area_struct *vma;
+ int progress = 0;
+
+ VM_BUG_ON(!pages);
+ VM_BUG_ON(!spin_is_locked(&khugepaged_mm_lock));
+
+ if (khugepaged_scan.mm_slot)
+ mm_slot = khugepaged_scan.mm_slot;
+ else {
+ mm_slot = list_entry(khugepaged_scan.mm_head.next,
+ struct mm_slot, mm_node);
+ khugepaged_scan.address = 0;
+ khugepaged_scan.mm_slot = mm_slot;
+ }
+ spin_unlock(&khugepaged_mm_lock);
+
+ mm = mm_slot->mm;
+ down_read(&mm->mmap_sem);
+ if (unlikely(khugepaged_test_exit(mm)))
+ vma = NULL;
+ else
+ vma = find_vma(mm, khugepaged_scan.address);
+
+ progress++;
+ for (; vma; vma = vma->vm_next) {
+ unsigned long hstart, hend;
+
+ cond_resched();
+ if (unlikely(khugepaged_test_exit(mm))) {
+ progress++;
+ break;
+ }
+
+ if ((!(vma->vm_flags & VM_HUGEPAGE) &&
+ !khugepaged_always()) ||
+ (vma->vm_flags & VM_NOHUGEPAGE)) {
+ skip:
+ progress++;
+ continue;
+ }
+ if (!vma->anon_vma || vma->vm_ops)
+ goto skip;
+ if (is_vma_temporary_stack(vma))
+ goto skip;
+ /*
+ * If is_pfn_mapping() is true is_linear_pfn_mapping()
+ * must be true too, verify it here.
+ */
+ VM_BUG_ON(is_linear_pfn_mapping(vma) ||
+ vma->vm_flags & VM_NO_THP);
+
+ hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
+ hend = vma->vm_end & HPAGE_PMD_MASK;
+ if (hstart >= hend)
+ goto skip;
+ if (khugepaged_scan.address > hend)
+ goto skip;
+ if (khugepaged_scan.address < hstart)
+ khugepaged_scan.address = hstart;
+ VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK);
+
+ while (khugepaged_scan.address < hend) {
+ int ret;
+ cond_resched();
+ if (unlikely(khugepaged_test_exit(mm)))
+ goto breakouterloop;
+
+ VM_BUG_ON(khugepaged_scan.address < hstart ||
+ khugepaged_scan.address + HPAGE_PMD_SIZE >
+ hend);
+ ret = khugepaged_scan_pmd(mm, vma,
+ khugepaged_scan.address,
+ hpage);
+ /* move to next address */
+ khugepaged_scan.address += HPAGE_PMD_SIZE;
+ progress += HPAGE_PMD_NR;
+ if (ret)
+ /* we released mmap_sem so break loop */
+ goto breakouterloop_mmap_sem;
+ if (progress >= pages)
+ goto breakouterloop;
+ }
+ }
+breakouterloop:
+ up_read(&mm->mmap_sem); /* exit_mmap will destroy ptes after this */
+breakouterloop_mmap_sem:
+
+ spin_lock(&khugepaged_mm_lock);
+ VM_BUG_ON(khugepaged_scan.mm_slot != mm_slot);
+ /*
+ * Release the current mm_slot if this mm is about to die, or
+ * if we scanned all vmas of this mm.
+ */
+ if (khugepaged_test_exit(mm) || !vma) {
+ /*
+ * Make sure that if mm_users is reaching zero while
+ * khugepaged runs here, khugepaged_exit will find
+ * mm_slot not pointing to the exiting mm.
+ */
+ if (mm_slot->mm_node.next != &khugepaged_scan.mm_head) {
+ khugepaged_scan.mm_slot = list_entry(
+ mm_slot->mm_node.next,
+ struct mm_slot, mm_node);
+ khugepaged_scan.address = 0;
+ } else {
+ khugepaged_scan.mm_slot = NULL;
+ khugepaged_full_scans++;
+ }
+
+ collect_mm_slot(mm_slot);
+ }
+
+ return progress;
+}
+
+static int khugepaged_has_work(void)
+{
+ return !list_empty(&khugepaged_scan.mm_head) &&
+ khugepaged_enabled();
+}
+
+static int khugepaged_wait_event(void)
+{
+ return !list_empty(&khugepaged_scan.mm_head) ||
+ !khugepaged_enabled();
+}
+
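+/*
+ * One khugepaged pass: scan up to khugepaged_pages_to_scan pte entries
+ * across the registered mms; on !NUMA the hugepage is preallocated here
+ * and passed down through *hpage.
+ */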
+static void khugepaged_do_scan(struct page **hpage)
+{
+ unsigned int progress = 0, pass_through_head = 0;
+ unsigned int pages = khugepaged_pages_to_scan;
+
+ barrier(); /* write khugepaged_pages_to_scan to local stack */
+
+ while (progress < pages) {
+ cond_resched();
+
+#ifndef CONFIG_NUMA
+ if (!*hpage) {
+ *hpage = alloc_hugepage(khugepaged_defrag());
+ if (unlikely(!*hpage)) {
+ count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
+ break;
+ }
+ count_vm_event(THP_COLLAPSE_ALLOC);
+ }
+#else
+ if (IS_ERR(*hpage))
+ break;
+#endif
+
+ if (unlikely(kthread_should_stop() || freezing(current)))
+ break;
+
+ spin_lock(&khugepaged_mm_lock);
+ if (!khugepaged_scan.mm_slot)
+ pass_through_head++;
+ if (khugepaged_has_work() &&
+ pass_through_head < 2)
+ progress += khugepaged_scan_mm_slot(pages - progress,
+ hpage);
+ else
+ progress = pages;
+ spin_unlock(&khugepaged_mm_lock);
+ }
+}
+
+static void khugepaged_alloc_sleep(void)
+{
+ DEFINE_WAIT(wait);
+ add_wait_queue(&khugepaged_wait, &wait);
+ schedule_timeout_interruptible(
+ msecs_to_jiffies(
+ khugepaged_alloc_sleep_millisecs));
+ remove_wait_queue(&khugepaged_wait, &wait);
+}
+
+#ifndef CONFIG_NUMA
+static struct page *khugepaged_alloc_hugepage(void)
+{
+ struct page *hpage;
+
+ do {
+ hpage = alloc_hugepage(khugepaged_defrag());
+ if (!hpage) {
+ count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
+ khugepaged_alloc_sleep();
+ } else
+ count_vm_event(THP_COLLAPSE_ALLOC);
+ } while (unlikely(!hpage) &&
+ likely(khugepaged_enabled()));
+ return hpage;
+}
+#endif
+
+static void khugepaged_loop(void)
+{
+ struct page *hpage;
+
+#ifdef CONFIG_NUMA
+ hpage = NULL;
+#endif
+ while (likely(khugepaged_enabled())) {
+#ifndef CONFIG_NUMA
+ hpage = khugepaged_alloc_hugepage();
+ if (unlikely(!hpage)) {
+ count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
+ break;
+ }
+ count_vm_event(THP_COLLAPSE_ALLOC);
+#else
+ if (IS_ERR(hpage)) {
+ khugepaged_alloc_sleep();
+ hpage = NULL;
+ }
+#endif
+
+ khugepaged_do_scan(&hpage);
+#ifndef CONFIG_NUMA
+ if (hpage)
+ put_page(hpage);
+#endif
+ try_to_freeze();
+ if (unlikely(kthread_should_stop()))
+ break;
+ if (khugepaged_has_work()) {
+ DEFINE_WAIT(wait);
+ if (!khugepaged_scan_sleep_millisecs)
+ continue;
+ add_wait_queue(&khugepaged_wait, &wait);
+ schedule_timeout_interruptible(
+ msecs_to_jiffies(
+ khugepaged_scan_sleep_millisecs));
+ remove_wait_queue(&khugepaged_wait, &wait);
+ } else if (khugepaged_enabled())
+ wait_event_freezable(khugepaged_wait,
+ khugepaged_wait_event());
+ }
+}
+
+static int khugepaged(void *none)
+{
+ struct mm_slot *mm_slot;
+
+ set_freezable();
+ set_user_nice(current, 19);
+
+ /* serialize with start_khugepaged() */
+ mutex_lock(&khugepaged_mutex);
+
+ for (;;) {
+ mutex_unlock(&khugepaged_mutex);
+ VM_BUG_ON(khugepaged_thread != current);
+ khugepaged_loop();
+ VM_BUG_ON(khugepaged_thread != current);
+
+ mutex_lock(&khugepaged_mutex);
+ if (!khugepaged_enabled())
+ break;
+ if (unlikely(kthread_should_stop()))
+ break;
+ }
+
+ spin_lock(&khugepaged_mm_lock);
+ mm_slot = khugepaged_scan.mm_slot;
+ khugepaged_scan.mm_slot = NULL;
+ if (mm_slot)
+ collect_mm_slot(mm_slot);
+ spin_unlock(&khugepaged_mm_lock);
+
+ khugepaged_thread = NULL;
+ mutex_unlock(&khugepaged_mutex);
+
+ return 0;
+}
+
+void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd)
+{
+ struct page *page;
+
+ spin_lock(&mm->page_table_lock);
+ if (unlikely(!pmd_trans_huge(*pmd))) {
+ spin_unlock(&mm->page_table_lock);
+ return;
+ }
+ page = pmd_page(*pmd);
+ VM_BUG_ON(!page_count(page));
+ get_page(page);
+ spin_unlock(&mm->page_table_lock);
+
+ split_huge_page(page);
+
+ put_page(page);
+ BUG_ON(pmd_trans_huge(*pmd));
+}
+
+static void split_huge_page_address(struct mm_struct *mm,
+ unsigned long address)
+{
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+
+ VM_BUG_ON(!(address & ~HPAGE_PMD_MASK));
+
+ pgd = pgd_offset(mm, address);
+ if (!pgd_present(*pgd))
+ return;
+
+ pud = pud_offset(pgd, address);
+ if (!pud_present(*pud))
+ return;
+
+ pmd = pmd_offset(pud, address);
+ if (!pmd_present(*pmd))
+ return;
+ /*
+ * Caller holds the mmap_sem write mode, so a huge pmd cannot
+ * materialize from under us.
+ */
+ split_huge_page_pmd(mm, pmd);
+}
+
+void __vma_adjust_trans_huge(struct vm_area_struct *vma,
+ unsigned long start,
+ unsigned long end,
+ long adjust_next)
+{
+ /*
+ * If the new start address isn't hpage aligned and it could
+ * previously contain a hugepage: check if we need to split
+ * a huge pmd.
+ */
+ if (start & ~HPAGE_PMD_MASK &&
+ (start & HPAGE_PMD_MASK) >= vma->vm_start &&
+ (start & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
+ split_huge_page_address(vma->vm_mm, start);
+
+ /*
+ * If the new end address isn't hpage aligned and it could
+ * previously contain a hugepage: check if we need to split
+ * a huge pmd.
+ */
+ if (end & ~HPAGE_PMD_MASK &&
+ (end & HPAGE_PMD_MASK) >= vma->vm_start &&
+ (end & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
+ split_huge_page_address(vma->vm_mm, end);
+
+ /*
+ * If we're also updating the vma->vm_next->vm_start, if the new
+ * vm_next->vm_start isn't page aligned and it could previously
+ * contain a hugepage: check if we need to split a huge pmd.
+ */
+ if (adjust_next > 0) {
+ struct vm_area_struct *next = vma->vm_next;
+ unsigned long nstart = next->vm_start;
+ nstart += adjust_next << PAGE_SHIFT;
+ if (nstart & ~HPAGE_PMD_MASK &&
+ (nstart & HPAGE_PMD_MASK) >= next->vm_start &&
+ (nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end)
+ split_huge_page_address(next->vm_mm, nstart);
+ }
+}
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 85855240933d..8ee3bd8ec5b5 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -146,7 +146,7 @@ static long region_chg(struct list_head *head, long f, long t)
if (rg->from > t)
return chg;
- /* We overlap with this area, if it extends futher than
+ /* We overlap with this area, if it extends further than
* us then we must extend ourselves. Account for its
* existing reservation. */
if (rg->to > t) {
@@ -394,71 +394,6 @@ static int vma_has_reserves(struct vm_area_struct *vma)
return 0;
}
-static void clear_gigantic_page(struct page *page,
- unsigned long addr, unsigned long sz)
-{
- int i;
- struct page *p = page;
-
- might_sleep();
- for (i = 0; i < sz/PAGE_SIZE; i++, p = mem_map_next(p, page, i)) {
- cond_resched();
- clear_user_highpage(p, addr + i * PAGE_SIZE);
- }
-}
-static void clear_huge_page(struct page *page,
- unsigned long addr, unsigned long sz)
-{
- int i;
-
- if (unlikely(sz/PAGE_SIZE > MAX_ORDER_NR_PAGES)) {
- clear_gigantic_page(page, addr, sz);
- return;
- }
-
- might_sleep();
- for (i = 0; i < sz/PAGE_SIZE; i++) {
- cond_resched();
- clear_user_highpage(page + i, addr + i * PAGE_SIZE);
- }
-}
-
-static void copy_user_gigantic_page(struct page *dst, struct page *src,
- unsigned long addr, struct vm_area_struct *vma)
-{
- int i;
- struct hstate *h = hstate_vma(vma);
- struct page *dst_base = dst;
- struct page *src_base = src;
-
- for (i = 0; i < pages_per_huge_page(h); ) {
- cond_resched();
- copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma);
-
- i++;
- dst = mem_map_next(dst, dst_base, i);
- src = mem_map_next(src, src_base, i);
- }
-}
-
-static void copy_user_huge_page(struct page *dst, struct page *src,
- unsigned long addr, struct vm_area_struct *vma)
-{
- int i;
- struct hstate *h = hstate_vma(vma);
-
- if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) {
- copy_user_gigantic_page(dst, src, addr, vma);
- return;
- }
-
- might_sleep();
- for (i = 0; i < pages_per_huge_page(h); i++) {
- cond_resched();
- copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma);
- }
-}
-
static void copy_gigantic_page(struct page *dst, struct page *src)
{
int i;
@@ -907,7 +842,7 @@ struct page *alloc_huge_page_node(struct hstate *h, int nid)
}
/*
- * Increase the hugetlb pool such that it can accomodate a reservation
+ * Increase the hugetlb pool such that it can accommodate a reservation
* of size 'delta'.
*/
static int gather_surplus_pages(struct hstate *h, int delta)
@@ -955,7 +890,7 @@ retry:
/*
* The surplus_list now contains _at_least_ the number of extra pages
- * needed to accomodate the reservation. Add the appropriate number
+ * needed to accommodate the reservation. Add the appropriate number
* of pages to the hugetlb pool and free the extras back to the buddy
* allocator. Commit the entire reservation here to prevent another
* process from stealing the pages as they are added to the pool but
@@ -1428,6 +1363,7 @@ static ssize_t nr_hugepages_show_common(struct kobject *kobj,
return sprintf(buf, "%lu\n", nr_huge_pages);
}
+
static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
struct kobject *kobj, struct kobj_attribute *attr,
const char *buf, size_t len)
@@ -1440,9 +1376,14 @@ static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
err = strict_strtoul(buf, 10, &count);
if (err)
- return 0;
+ goto out;
h = kobj_to_hstate(kobj, &nid);
+ if (h->order >= MAX_ORDER) {
+ err = -EINVAL;
+ goto out;
+ }
+
if (nid == NUMA_NO_NODE) {
/*
* global hstate attribute
@@ -1468,6 +1409,9 @@ static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
NODEMASK_FREE(nodes_allowed);
return len;
+out:
+ NODEMASK_FREE(nodes_allowed);
+ return err;
}
static ssize_t nr_hugepages_show(struct kobject *kobj,
@@ -1510,6 +1454,7 @@ static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj,
struct hstate *h = kobj_to_hstate(kobj, NULL);
return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages);
}
+
static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
struct kobj_attribute *attr, const char *buf, size_t count)
{
@@ -1517,9 +1462,12 @@ static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
unsigned long input;
struct hstate *h = kobj_to_hstate(kobj, NULL);
+ if (h->order >= MAX_ORDER)
+ return -EINVAL;
+
err = strict_strtoul(buf, 10, &input);
if (err)
- return 0;
+ return err;
spin_lock(&hugetlb_lock);
h->nr_overcommit_huge_pages = input;
@@ -1922,13 +1870,18 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
{
struct hstate *h = &default_hstate;
unsigned long tmp;
+ int ret;
+
+ tmp = h->max_huge_pages;
- if (!write)
- tmp = h->max_huge_pages;
+ if (write && h->order >= MAX_ORDER)
+ return -EINVAL;
table->data = &tmp;
table->maxlen = sizeof(unsigned long);
- proc_doulongvec_minmax(table, write, buffer, length, ppos);
+ ret = proc_doulongvec_minmax(table, write, buffer, length, ppos);
+ if (ret)
+ goto out;
if (write) {
NODEMASK_ALLOC(nodemask_t, nodes_allowed,
@@ -1943,8 +1896,8 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
if (nodes_allowed != &node_states[N_HIGH_MEMORY])
NODEMASK_FREE(nodes_allowed);
}
-
- return 0;
+out:
+ return ret;
}
int hugetlb_sysctl_handler(struct ctl_table *table, int write,
@@ -1982,21 +1935,26 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write,
{
struct hstate *h = &default_hstate;
unsigned long tmp;
+ int ret;
- if (!write)
- tmp = h->nr_overcommit_huge_pages;
+ tmp = h->nr_overcommit_huge_pages;
+
+ if (write && h->order >= MAX_ORDER)
+ return -EINVAL;
table->data = &tmp;
table->maxlen = sizeof(unsigned long);
- proc_doulongvec_minmax(table, write, buffer, length, ppos);
+ ret = proc_doulongvec_minmax(table, write, buffer, length, ppos);
+ if (ret)
+ goto out;
if (write) {
spin_lock(&hugetlb_lock);
h->nr_overcommit_huge_pages = tmp;
spin_unlock(&hugetlb_lock);
}
-
- return 0;
+out:
+ return ret;
}
#endif /* CONFIG_SYSCTL */
@@ -2085,7 +2043,7 @@ static void hugetlb_vm_op_open(struct vm_area_struct *vma)
* This new VMA should share its siblings reservation map if present.
* The VMA will only ever have a valid reservation map pointer where
* it is being copied for another still existing VMA. As that VMA
- * has a reference to the reservation map it cannot dissappear until
+ * has a reference to the reservation map it cannot disappear until
* after this open call completes. It is therefore safe to take a
* new reference here without additional locking.
*/
@@ -2454,7 +2412,8 @@ retry_avoidcopy:
return VM_FAULT_OOM;
}
- copy_user_huge_page(new_page, old_page, address, vma);
+ copy_user_huge_page(new_page, old_page, address, vma,
+ pages_per_huge_page(h));
__SetPageUptodate(new_page);
/*
@@ -2531,7 +2490,7 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
/*
* Currently, we are forced to kill the process in the event the
* original mapper has unmapped pages from the child due to a failed
- * COW. Warn that such a situation has occured as it may not be obvious
+ * COW. Warn that such a situation has occurred as it may not be obvious
*/
if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) {
printk(KERN_WARNING
@@ -2558,7 +2517,7 @@ retry:
ret = -PTR_ERR(page);
goto out;
}
- clear_huge_page(page, address, huge_page_size(h));
+ clear_huge_page(page, address, pages_per_huge_page(h));
__SetPageUptodate(page);
if (vma->vm_flags & VM_MAYSHARE) {
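
Two behavioural changes ride along with the cleanups in the hugetlb hunks above: pools backed by gigantic hstates (order >= MAX_ORDER) now refuse runtime resizing with -EINVAL, and parse errors from strict_strtoul()/proc_doulongvec_minmax() are returned to the writer instead of being dropped. A rough user-space probe of the error propagation, assuming root privileges and a hugetlb-enabled kernel (the procfs path is the standard knob; the exact errno is illustrative):

#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	/* /proc/sys/vm/nr_hugepages resizes the default hstate's pool */
	int fd = open("/proc/sys/vm/nr_hugepages", O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* With the change above, a malformed value is reported back to the
	 * caller (typically EINVAL) rather than silently "succeeding". */
	if (write(fd, "not-a-number", 12) < 0)
		printf("write rejected: %s\n", strerror(errno));
	else
		printf("write unexpectedly succeeded\n");

	close(fd);
	return 0;
}
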
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c
index 0948f1072d6b..c7fc7fd00e32 100644
--- a/mm/hwpoison-inject.c
+++ b/mm/hwpoison-inject.c
@@ -1,4 +1,4 @@
-/* Inject a hwpoison memory failure on a arbitary pfn */
+/* Inject a hwpoison memory failure on a arbitrary pfn */
#include <linux/module.h>
#include <linux/debugfs.h>
#include <linux/kernel.h>
diff --git a/mm/internal.h b/mm/internal.h
index dedb0aff673f..9d0ced8e505e 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -134,6 +134,10 @@ static inline void mlock_migrate_page(struct page *newpage, struct page *page)
}
}
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+extern unsigned long vma_address(struct page *page,
+ struct vm_area_struct *vma);
+#endif
#else /* !CONFIG_MMU */
static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p)
{
@@ -158,7 +162,7 @@ static inline struct page *mem_map_offset(struct page *base, int offset)
}
/*
- * Iterator over all subpages withing the maximally aligned gigantic
+ * Iterator over all subpages within the maximally aligned gigantic
* page 'base'. Handle any discontiguity in the mem_map.
*/
static inline struct page *mem_map_next(struct page *iter,
@@ -241,10 +245,6 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn,
}
#endif /* CONFIG_SPARSEMEM */
-int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
- unsigned long start, int len, unsigned int foll_flags,
- struct page **pages, struct vm_area_struct **vmas);
-
#define ZONE_RECLAIM_NOSCAN -2
#define ZONE_RECLAIM_FULL -1
#define ZONE_RECLAIM_SOME 0
diff --git a/mm/kmemleak-test.c b/mm/kmemleak-test.c
index 177a5169bbde..ff0d9779cec8 100644
--- a/mm/kmemleak-test.c
+++ b/mm/kmemleak-test.c
@@ -75,13 +75,11 @@ static int __init kmemleak_test_init(void)
* after the module is removed.
*/
for (i = 0; i < 10; i++) {
- elem = kmalloc(sizeof(*elem), GFP_KERNEL);
- pr_info("kmemleak: kmalloc(sizeof(*elem)) = %p\n", elem);
+ elem = kzalloc(sizeof(*elem), GFP_KERNEL);
+ pr_info("kmemleak: kzalloc(sizeof(*elem)) = %p\n", elem);
if (!elem)
return -ENOMEM;
- memset(elem, 0, sizeof(*elem));
INIT_LIST_HEAD(&elem->list);
-
list_add_tail(&elem->list, &test_list);
}
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index bd9bc214091b..c1d5867543e4 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -113,7 +113,9 @@
#define BYTES_PER_POINTER sizeof(void *)
/* GFP bitmask for kmemleak internal allocations */
-#define GFP_KMEMLEAK_MASK (GFP_KERNEL | GFP_ATOMIC)
+#define gfp_kmemleak_mask(gfp) (((gfp) & (GFP_KERNEL | GFP_ATOMIC)) | \
+ __GFP_NORETRY | __GFP_NOMEMALLOC | \
+ __GFP_NOWARN)
/* scanning area inside a memory block */
struct kmemleak_scan_area {
@@ -263,7 +265,7 @@ static void kmemleak_disable(void);
} while (0)
/*
- * Macro invoked when a serious kmemleak condition occured and cannot be
+ * Macro invoked when a serious kmemleak condition occurred and cannot be
* recovered from. Kmemleak will be disabled and further allocation/freeing
* tracing no longer available.
*/
@@ -511,9 +513,10 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
struct kmemleak_object *object;
struct prio_tree_node *node;
- object = kmem_cache_alloc(object_cache, gfp & GFP_KMEMLEAK_MASK);
+ object = kmem_cache_alloc(object_cache, gfp_kmemleak_mask(gfp));
if (!object) {
- kmemleak_stop("Cannot allocate a kmemleak_object structure\n");
+ pr_warning("Cannot allocate a kmemleak_object structure\n");
+ kmemleak_disable();
return NULL;
}
@@ -734,9 +737,9 @@ static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp)
return;
}
- area = kmem_cache_alloc(scan_area_cache, gfp & GFP_KMEMLEAK_MASK);
+ area = kmem_cache_alloc(scan_area_cache, gfp_kmemleak_mask(gfp));
if (!area) {
- kmemleak_warn("Cannot allocate a scan area\n");
+ pr_warning("Cannot allocate a scan area\n");
goto out;
}
@@ -1003,7 +1006,7 @@ static bool update_checksum(struct kmemleak_object *object)
/*
* Memory scanning is a long process and it needs to be interruptable. This
- * function checks whether such interrupt condition occured.
+ * function checks whether such interrupt condition occurred.
*/
static int scan_should_stop(void)
{
@@ -1730,7 +1733,7 @@ static int __init kmemleak_late_init(void)
if (atomic_read(&kmemleak_error)) {
/*
- * Some error occured and kmemleak was disabled. There is a
+ * Some error occurred and kmemleak was disabled. There is a
* small chance that kmemleak_disable() was called immediately
* after setting kmemleak_initialized and we may end up with
* two clean-up threads but serialized by scan_mutex.
diff --git a/mm/ksm.c b/mm/ksm.c
index 43bc893470b4..942dfc73a2ff 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -34,6 +34,7 @@
#include <linux/swap.h>
#include <linux/ksm.h>
#include <linux/hash.h>
+#include <linux/freezer.h>
#include <asm/tlbflush.h>
#include "internal.h"
@@ -300,20 +301,6 @@ static inline int in_stable_tree(struct rmap_item *rmap_item)
return rmap_item->address & STABLE_FLAG;
}
-static void hold_anon_vma(struct rmap_item *rmap_item,
- struct anon_vma *anon_vma)
-{
- rmap_item->anon_vma = anon_vma;
- get_anon_vma(anon_vma);
-}
-
-static void ksm_drop_anon_vma(struct rmap_item *rmap_item)
-{
- struct anon_vma *anon_vma = rmap_item->anon_vma;
-
- drop_anon_vma(anon_vma);
-}
-
/*
* ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's
* page tables after it has passed through ksm_exit() - which, if necessary,
@@ -396,7 +383,7 @@ static void break_cow(struct rmap_item *rmap_item)
* It is not an accident that whenever we want to break COW
* to undo, we also need to drop a reference to the anon_vma.
*/
- ksm_drop_anon_vma(rmap_item);
+ put_anon_vma(rmap_item->anon_vma);
down_read(&mm->mmap_sem);
if (ksm_test_exit(mm))
@@ -411,6 +398,20 @@ out:
up_read(&mm->mmap_sem);
}
+static struct page *page_trans_compound_anon(struct page *page)
+{
+ if (PageTransCompound(page)) {
+ struct page *head = compound_trans_head(page);
+ /*
+ * head may actually be split and freed from under

+ * us but it's ok here.
+ */
+ if (PageAnon(head))
+ return head;
+ }
+ return NULL;
+}
+
static struct page *get_mergeable_page(struct rmap_item *rmap_item)
{
struct mm_struct *mm = rmap_item->mm;
@@ -430,7 +431,7 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item)
page = follow_page(vma, addr, FOLL_GET);
if (IS_ERR_OR_NULL(page))
goto out;
- if (PageAnon(page)) {
+ if (PageAnon(page) || page_trans_compound_anon(page)) {
flush_anon_page(vma, page, addr);
flush_dcache_page(page);
} else {
@@ -451,7 +452,7 @@ static void remove_node_from_stable_tree(struct stable_node *stable_node)
ksm_pages_sharing--;
else
ksm_pages_shared--;
- ksm_drop_anon_vma(rmap_item);
+ put_anon_vma(rmap_item->anon_vma);
rmap_item->address &= PAGE_MASK;
cond_resched();
}
@@ -539,7 +540,7 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
else
ksm_pages_shared--;
- ksm_drop_anon_vma(rmap_item);
+ put_anon_vma(rmap_item->anon_vma);
rmap_item->address &= PAGE_MASK;
} else if (rmap_item->address & UNSTABLE_FLAG) {
@@ -708,6 +709,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
if (addr == -EFAULT)
goto out;
+ BUG_ON(PageTransCompound(page));
ptep = page_check_address(page, mm, addr, &ptl, 0);
if (!ptep)
goto out;
@@ -718,7 +720,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
swapped = PageSwapCache(page);
flush_cache_page(vma, addr, page_to_pfn(page));
/*
- * Ok this is tricky, when get_user_pages_fast() run it doesnt
+ * Ok this is tricky, when get_user_pages_fast() run it doesn't
* take any lock, therefore the check that we are going to make
* with the pagecount against the mapcount is racey and
* O_DIRECT can happen right after the check.
@@ -783,6 +785,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
goto out;
pmd = pmd_offset(pud, addr);
+ BUG_ON(pmd_trans_huge(*pmd));
if (!pmd_present(*pmd))
goto out;
@@ -800,6 +803,8 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot));
page_remove_rmap(page);
+ if (!page_mapped(page))
+ try_to_free_swap(page);
put_page(page);
pte_unmap_unlock(ptep, ptl);
@@ -808,6 +813,33 @@ out:
return err;
}
+static int page_trans_compound_anon_split(struct page *page)
+{
+ int ret = 0;
+ struct page *transhuge_head = page_trans_compound_anon(page);
+ if (transhuge_head) {
+ /* Get the reference on the head to split it. */
+ if (get_page_unless_zero(transhuge_head)) {
+ /*
+ * Recheck we got the reference while the head
+ * was still anonymous.
+ */
+ if (PageAnon(transhuge_head))
+ ret = split_huge_page(transhuge_head);
+ else
+ /*
+ * Retry later if split_huge_page run
+ * from under us.
+ */
+ ret = 1;
+ put_page(transhuge_head);
+ } else
+ /* Retry later if split_huge_page run from under us. */
+ ret = 1;
+ }
+ return ret;
+}
+
/*
* try_to_merge_one_page - take two pages and merge them into one
* @vma: the vma that holds the pte pointing to page
@@ -828,6 +860,9 @@ static int try_to_merge_one_page(struct vm_area_struct *vma,
if (!(vma->vm_flags & VM_MERGEABLE))
goto out;
+ if (PageTransCompound(page) && page_trans_compound_anon_split(page))
+ goto out;
+ BUG_ON(PageTransCompound(page));
if (!PageAnon(page))
goto out;
@@ -900,7 +935,8 @@ static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item,
goto out;
/* Must get reference to anon_vma while still holding mmap_sem */
- hold_anon_vma(rmap_item, vma->anon_vma);
+ rmap_item->anon_vma = vma->anon_vma;
+ get_anon_vma(vma->anon_vma);
out:
up_read(&mm->mmap_sem);
return err;
@@ -1247,6 +1283,18 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page)
slot = ksm_scan.mm_slot;
if (slot == &ksm_mm_head) {
+ /*
+ * A number of pages can hang around indefinitely on per-cpu
+ * pagevecs, raised page count preventing write_protect_page
+ * from merging them. Though it doesn't really matter much,
+ * it is puzzling to see some stuck in pages_volatile until
+ * other activity jostles them out, and they also prevented
+ * LTP's KSM test from succeeding deterministically; so drain
+ * them here (here rather than on entry to ksm_do_scan(),
+ * so we don't IPI too often when pages_to_scan is set low).
+ */
+ lru_add_drain_all();
+
root_unstable_tree = RB_ROOT;
spin_lock(&ksm_mmlist_lock);
@@ -1277,7 +1325,13 @@ next_mm:
if (ksm_test_exit(mm))
break;
*page = follow_page(vma, ksm_scan.address, FOLL_GET);
- if (!IS_ERR_OR_NULL(*page) && PageAnon(*page)) {
+ if (IS_ERR_OR_NULL(*page)) {
+ ksm_scan.address += PAGE_SIZE;
+ cond_resched();
+ continue;
+ }
+ if (PageAnon(*page) ||
+ page_trans_compound_anon(*page)) {
flush_anon_page(vma, *page, ksm_scan.address);
flush_dcache_page(*page);
rmap_item = get_next_rmap_item(slot,
@@ -1291,8 +1345,7 @@ next_mm:
up_read(&mm->mmap_sem);
return rmap_item;
}
- if (!IS_ERR_OR_NULL(*page))
- put_page(*page);
+ put_page(*page);
ksm_scan.address += PAGE_SIZE;
cond_resched();
}
@@ -1352,7 +1405,7 @@ static void ksm_do_scan(unsigned int scan_npages)
struct rmap_item *rmap_item;
struct page *uninitialized_var(page);
- while (scan_npages--) {
+ while (scan_npages-- && likely(!freezing(current))) {
cond_resched();
rmap_item = scan_get_next_rmap_item(&page);
if (!rmap_item)
@@ -1370,6 +1423,7 @@ static int ksmd_should_run(void)
static int ksm_scan_thread(void *nothing)
{
+ set_freezable();
set_user_nice(current, 5);
while (!kthread_should_stop()) {
@@ -1378,11 +1432,13 @@ static int ksm_scan_thread(void *nothing)
ksm_do_scan(ksm_thread_pages_to_scan);
mutex_unlock(&ksm_thread_mutex);
+ try_to_freeze();
+
if (ksmd_should_run()) {
schedule_timeout_interruptible(
msecs_to_jiffies(ksm_thread_sleep_millisecs));
} else {
- wait_event_interruptible(ksm_thread_wait,
+ wait_event_freezable(ksm_thread_wait,
ksmd_should_run() || kthread_should_stop());
}
}
diff --git a/mm/madvise.c b/mm/madvise.c
index 319528b8db74..2221491ed503 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -71,6 +71,12 @@ static long madvise_behavior(struct vm_area_struct * vma,
if (error)
goto out;
break;
+ case MADV_HUGEPAGE:
+ case MADV_NOHUGEPAGE:
+ error = hugepage_madvise(vma, &new_flags, behavior);
+ if (error)
+ goto out;
+ break;
}
if (new_flags == vma->vm_flags) {
@@ -283,6 +289,10 @@ madvise_behavior_valid(int behavior)
case MADV_MERGEABLE:
case MADV_UNMERGEABLE:
#endif
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ case MADV_HUGEPAGE:
+ case MADV_NOHUGEPAGE:
+#endif
return 1;
default:
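
The new MADV_HUGEPAGE/MADV_NOHUGEPAGE advice values are routed through hugepage_madvise() into the VMA flags. A minimal sketch of how a program opts a mapping into transparent hugepages, assuming a kernel built with CONFIG_TRANSPARENT_HUGEPAGE and headers that expose the madvise constants:

#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>

#ifndef MADV_HUGEPAGE
#define MADV_HUGEPAGE 14	/* value from asm-generic/mman-common.h */
#endif

int main(void)
{
	size_t len = 32UL << 20;			/* 32 MiB, i.e. several 2 MiB PMDs */
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/* Ask the fault path / khugepaged to back this range with huge pages. */
	if (madvise(p, len, MADV_HUGEPAGE))
		perror("madvise(MADV_HUGEPAGE)");	/* EINVAL if THP is not built in */

	return 0;
}

MADV_NOHUGEPAGE works the same way in the opposite direction, clearing the per-VMA opt-in.
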
diff --git a/mm/memblock.c b/mm/memblock.c
index 400dc62697d7..a0562d1a6ad4 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -58,28 +58,6 @@ static unsigned long __init_memblock memblock_addrs_overlap(phys_addr_t base1, p
return ((base1 < (base2 + size2)) && (base2 < (base1 + size1)));
}
-static long __init_memblock memblock_addrs_adjacent(phys_addr_t base1, phys_addr_t size1,
- phys_addr_t base2, phys_addr_t size2)
-{
- if (base2 == base1 + size1)
- return 1;
- else if (base1 == base2 + size2)
- return -1;
-
- return 0;
-}
-
-static long __init_memblock memblock_regions_adjacent(struct memblock_type *type,
- unsigned long r1, unsigned long r2)
-{
- phys_addr_t base1 = type->regions[r1].base;
- phys_addr_t size1 = type->regions[r1].size;
- phys_addr_t base2 = type->regions[r2].base;
- phys_addr_t size2 = type->regions[r2].size;
-
- return memblock_addrs_adjacent(base1, size1, base2, size2);
-}
-
long __init_memblock memblock_overlaps_region(struct memblock_type *type, phys_addr_t base, phys_addr_t size)
{
unsigned long i;
@@ -137,8 +115,6 @@ static phys_addr_t __init_memblock memblock_find_base(phys_addr_t size,
BUG_ON(0 == size);
- size = memblock_align_up(size, align);
-
/* Pump up max_addr */
if (end == MEMBLOCK_ALLOC_ACCESSIBLE)
end = memblock.current_limit;
@@ -208,14 +184,13 @@ static void __init_memblock memblock_remove_region(struct memblock_type *type, u
type->regions[i].size = type->regions[i + 1].size;
}
type->cnt--;
-}
-/* Assumption: base addr of region 1 < base addr of region 2 */
-static void __init_memblock memblock_coalesce_regions(struct memblock_type *type,
- unsigned long r1, unsigned long r2)
-{
- type->regions[r1].size += type->regions[r2].size;
- memblock_remove_region(type, r2);
+ /* Special case for empty arrays */
+ if (type->cnt == 0) {
+ type->cnt = 1;
+ type->regions[0].base = 0;
+ type->regions[0].size = 0;
+ }
}
/* Defined below but needed now */
@@ -278,7 +253,7 @@ static int __init_memblock memblock_double_array(struct memblock_type *type)
return 0;
/* Add the new reserved region now. Should not fail ! */
- BUG_ON(memblock_add_region(&memblock.reserved, addr, new_size) < 0);
+ BUG_ON(memblock_add_region(&memblock.reserved, addr, new_size));
/* If the array wasn't our static init one, then free it. We only do
* that before SLAB is available as later on, we don't know whether
@@ -298,58 +273,99 @@ extern int __init_memblock __weak memblock_memory_can_coalesce(phys_addr_t addr1
return 1;
}
-static long __init_memblock memblock_add_region(struct memblock_type *type, phys_addr_t base, phys_addr_t size)
+static long __init_memblock memblock_add_region(struct memblock_type *type,
+ phys_addr_t base, phys_addr_t size)
{
- unsigned long coalesced = 0;
- long adjacent, i;
-
- if ((type->cnt == 1) && (type->regions[0].size == 0)) {
- type->regions[0].base = base;
- type->regions[0].size = size;
- return 0;
- }
+ phys_addr_t end = base + size;
+ int i, slot = -1;
- /* First try and coalesce this MEMBLOCK with another. */
+ /* First try and coalesce this MEMBLOCK with others */
for (i = 0; i < type->cnt; i++) {
- phys_addr_t rgnbase = type->regions[i].base;
- phys_addr_t rgnsize = type->regions[i].size;
+ struct memblock_region *rgn = &type->regions[i];
+ phys_addr_t rend = rgn->base + rgn->size;
- if ((rgnbase == base) && (rgnsize == size))
- /* Already have this region, so we're done */
+ /* Exit if there are no possible hits */
+ if (rgn->base > end || rgn->size == 0)
+ break;
+
+ /* Check if we are fully enclosed within an existing
+ * block
+ */
+ if (rgn->base <= base && rend >= end)
return 0;
- adjacent = memblock_addrs_adjacent(base, size, rgnbase, rgnsize);
- /* Check if arch allows coalescing */
- if (adjacent != 0 && type == &memblock.memory &&
- !memblock_memory_can_coalesce(base, size, rgnbase, rgnsize))
- break;
- if (adjacent > 0) {
- type->regions[i].base -= size;
- type->regions[i].size += size;
- coalesced++;
- break;
- } else if (adjacent < 0) {
- type->regions[i].size += size;
- coalesced++;
- break;
+ /* Check if we overlap or are adjacent with the bottom
+ * of a block.
+ */
+ if (base < rgn->base && end >= rgn->base) {
+ /* If we can't coalesce, create a new block */
+ if (!memblock_memory_can_coalesce(base, size,
+ rgn->base,
+ rgn->size)) {
+ /* Overlap & can't coalesce are mutually
+ * exclusive, if you do that, be prepared
+ * for trouble
+ */
+ WARN_ON(end != rgn->base);
+ goto new_block;
+ }
+ /* We extend the bottom of the block down to our
+ * base
+ */
+ rgn->base = base;
+ rgn->size = rend - base;
+
+ /* Return if we have nothing else to allocate
+ * (fully coalesced)
+ */
+ if (rend >= end)
+ return 0;
+
+ /* We continue processing from the end of the
+ * coalesced block.
+ */
+ base = rend;
+ size = end - base;
+ }
+
+ /* Now check if we overlap or are adjacent with the
+ * top of a block
+ */
+ if (base <= rend && end >= rend) {
+ /* If we can't coalesce, create a new block */
+ if (!memblock_memory_can_coalesce(rgn->base,
+ rgn->size,
+ base, size)) {
+ /* Overlap & can't coalesce are mutually
+ * exclusive, if you do that, be prepared
+ * for trouble
+ */
+ WARN_ON(rend != base);
+ goto new_block;
+ }
+ /* We adjust our base down to enclose the
+ * original block and destroy it. It will be
+ * part of our new allocation. Since we've
+ * freed an entry, we know we won't fail
+ * to allocate one later, so we won't risk
+ * losing the original block allocation.
+ */
+ size += (base - rgn->base);
+ base = rgn->base;
+ memblock_remove_region(type, i--);
}
}
- /* If we plugged a hole, we may want to also coalesce with the
- * next region
+ /* If the array is empty, special case, replace the fake
+ * filler region and return
*/
- if ((i < type->cnt - 1) && memblock_regions_adjacent(type, i, i+1) &&
- ((type != &memblock.memory || memblock_memory_can_coalesce(type->regions[i].base,
- type->regions[i].size,
- type->regions[i+1].base,
- type->regions[i+1].size)))) {
- memblock_coalesce_regions(type, i, i+1);
- coalesced++;
+ if ((type->cnt == 1) && (type->regions[0].size == 0)) {
+ type->regions[0].base = base;
+ type->regions[0].size = size;
+ return 0;
}
- if (coalesced)
- return coalesced;
-
+ new_block:
/* If we are out of space, we fail. It's too late to resize the array
* but then this shouldn't have happened in the first place.
*/
@@ -364,13 +380,14 @@ static long __init_memblock memblock_add_region(struct memblock_type *type, phys
} else {
type->regions[i+1].base = base;
type->regions[i+1].size = size;
+ slot = i + 1;
break;
}
}
-
if (base < type->regions[0].base) {
type->regions[0].base = base;
type->regions[0].size = size;
+ slot = 0;
}
type->cnt++;
@@ -378,7 +395,8 @@ static long __init_memblock memblock_add_region(struct memblock_type *type, phys
* our allocation and return an error
*/
if (type->cnt == type->max && memblock_double_array(type)) {
- type->cnt--;
+ BUG_ON(slot < 0);
+ memblock_remove_region(type, slot);
return -1;
}
@@ -391,52 +409,55 @@ long __init_memblock memblock_add(phys_addr_t base, phys_addr_t size)
}
-static long __init_memblock __memblock_remove(struct memblock_type *type, phys_addr_t base, phys_addr_t size)
+static long __init_memblock __memblock_remove(struct memblock_type *type,
+ phys_addr_t base, phys_addr_t size)
{
- phys_addr_t rgnbegin, rgnend;
phys_addr_t end = base + size;
int i;
- rgnbegin = rgnend = 0; /* supress gcc warnings */
-
- /* Find the region where (base, size) belongs to */
- for (i=0; i < type->cnt; i++) {
- rgnbegin = type->regions[i].base;
- rgnend = rgnbegin + type->regions[i].size;
+ /* Walk through the array for collisions */
+ for (i = 0; i < type->cnt; i++) {
+ struct memblock_region *rgn = &type->regions[i];
+ phys_addr_t rend = rgn->base + rgn->size;
- if ((rgnbegin <= base) && (end <= rgnend))
+ /* Nothing more to do, exit */
+ if (rgn->base > end || rgn->size == 0)
break;
- }
- /* Didn't find the region */
- if (i == type->cnt)
- return -1;
+ /* If we fully enclose the block, drop it */
+ if (base <= rgn->base && end >= rend) {
+ memblock_remove_region(type, i--);
+ continue;
+ }
- /* Check to see if we are removing entire region */
- if ((rgnbegin == base) && (rgnend == end)) {
- memblock_remove_region(type, i);
- return 0;
- }
+ /* If we are fully enclosed within a block
+ * then we need to split it and we are done
+ */
+ if (base > rgn->base && end < rend) {
+ rgn->size = base - rgn->base;
+ if (!memblock_add_region(type, end, rend - end))
+ return 0;
+ /* Failure to split is bad, we at least
+ * restore the block before erroring
+ */
+ rgn->size = rend - rgn->base;
+ WARN_ON(1);
+ return -1;
+ }
- /* Check to see if region is matching at the front */
- if (rgnbegin == base) {
- type->regions[i].base = end;
- type->regions[i].size -= size;
- return 0;
- }
+ /* Check if we need to trim the bottom of a block */
+ if (rgn->base < end && rend > end) {
+ rgn->size -= end - rgn->base;
+ rgn->base = end;
+ break;
+ }
- /* Check to see if the region is matching at the end */
- if (rgnend == end) {
- type->regions[i].size -= size;
- return 0;
- }
+ /* And check if we need to trim the top of a block */
+ if (base < rend)
+ rgn->size -= rend - base;
- /*
- * We need to split the entry - adjust the current one to the
- * beginging of the hole and add the region after hole.
- */
- type->regions[i].size = base - type->regions[i].base;
- return memblock_add_region(type, end, rgnend - end);
+ }
+ return 0;
}
long __init_memblock memblock_remove(phys_addr_t base, phys_addr_t size)
@@ -469,7 +490,7 @@ phys_addr_t __init __memblock_alloc_base(phys_addr_t size, phys_addr_t align, ph
found = memblock_find_base(size, align, 0, max_addr);
if (found != MEMBLOCK_ERROR &&
- memblock_add_region(&memblock.reserved, found, size) >= 0)
+ !memblock_add_region(&memblock.reserved, found, size))
return found;
return 0;
@@ -550,7 +571,7 @@ static phys_addr_t __init memblock_alloc_nid_region(struct memblock_region *mp,
if (this_nid == nid) {
phys_addr_t ret = memblock_find_region(start, this_end, size, align);
if (ret != MEMBLOCK_ERROR &&
- memblock_add_region(&memblock.reserved, ret, size) >= 0)
+ !memblock_add_region(&memblock.reserved, ret, size))
return ret;
}
start = this_end;
@@ -683,13 +704,13 @@ int __init_memblock memblock_is_memory(phys_addr_t addr)
int __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size)
{
- int idx = memblock_search(&memblock.reserved, base);
+ int idx = memblock_search(&memblock.memory, base);
if (idx == -1)
return 0;
- return memblock.reserved.regions[idx].base <= base &&
- (memblock.reserved.regions[idx].base +
- memblock.reserved.regions[idx].size) >= (base + size);
+ return memblock.memory.regions[idx].base <= base &&
+ (memblock.memory.regions[idx].base +
+ memblock.memory.regions[idx].size) >= (base + size);
}
int __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t size)
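
The rewritten memblock_add_region() above treats the region array as a sorted list of [base, base+size) intervals and folds the incoming range into any block it overlaps or abuts, rather than only handling exact adjacency as the removed memblock_addrs_adjacent() path did. A toy user-space version of the same interval-merge idea, assuming a fixed-size array and ignoring the can-coalesce, overflow and array-doubling corner cases:

#include <stdio.h>
#include <string.h>

struct region { unsigned long base, size; };

#define MAX_REGIONS 8
static struct region regions[MAX_REGIONS];
static int nr_regions;

/* Merge [base, base + size) into the sorted array, coalescing any overlap. */
static void add_region(unsigned long base, unsigned long size)
{
	unsigned long end = base + size;
	int i;

	for (i = 0; i < nr_regions; i++) {
		unsigned long rend = regions[i].base + regions[i].size;

		if (regions[i].base > end)		/* past any possible overlap */
			break;
		if (regions[i].base <= base && rend >= end)
			return;				/* fully enclosed, nothing to do */
		if (end >= regions[i].base && base <= rend) {
			/* absorb this block into the incoming range */
			if (regions[i].base < base)
				base = regions[i].base;
			if (rend > end)
				end = rend;
			memmove(&regions[i], &regions[i + 1],
				(nr_regions - i - 1) * sizeof(regions[0]));
			nr_regions--;
			i--;
		}
	}
	/* insert the (possibly grown) range at its sorted position */
	memmove(&regions[i + 1], &regions[i],
		(nr_regions - i) * sizeof(regions[0]));
	regions[i].base = base;
	regions[i].size = end - base;
	nr_regions++;
}

int main(void)
{
	add_region(0x1000, 0x1000);
	add_region(0x3000, 0x1000);
	add_region(0x1800, 0x2000);	/* bridges and swallows both blocks */

	for (int i = 0; i < nr_regions; i++)
		printf("[%#lx, %#lx)\n", regions[i].base,
		       regions[i].base + regions[i].size);
	return 0;			/* prints a single [0x1000, 0x4000) */
}
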
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 7a22b4129211..010f9166fa6e 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -73,15 +73,6 @@ static int really_do_swap_account __initdata = 0;
#define do_swap_account (0)
#endif
-/*
- * Per memcg event counter is incremented at every pagein/pageout. This counter
- * is used for trigger some periodic events. This is straightforward and better
- * than using jiffies etc. to handle periodic memcg event.
- *
- * These values will be used as !((event) & ((1 <<(thresh)) - 1))
- */
-#define THRESHOLDS_EVENTS_THRESH (7) /* once in 128 */
-#define SOFTLIMIT_EVENTS_THRESH (10) /* once in 1024 */
/*
* Statistics for memory cgroup.
@@ -93,19 +84,36 @@ enum mem_cgroup_stat_index {
MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */
MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */
MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */
- MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */
- MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */
MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
MEM_CGROUP_STAT_DATA, /* end of data requires synchronization */
- /* incremented at every pagein/pageout */
- MEM_CGROUP_EVENTS = MEM_CGROUP_STAT_DATA,
MEM_CGROUP_ON_MOVE, /* someone is moving account between groups */
-
MEM_CGROUP_STAT_NSTATS,
};
+enum mem_cgroup_events_index {
+ MEM_CGROUP_EVENTS_PGPGIN, /* # of pages paged in */
+ MEM_CGROUP_EVENTS_PGPGOUT, /* # of pages paged out */
+ MEM_CGROUP_EVENTS_COUNT, /* # of pages paged in/out */
+ MEM_CGROUP_EVENTS_NSTATS,
+};
+/*
+ * Per memcg event counter is incremented at every pagein/pageout. With THP,
+ * it will be incremated by the number of pages. This counter is used for
+ * for trigger some periodic events. This is straightforward and better
+ * than using jiffies etc. to handle periodic memcg event.
+ */
+enum mem_cgroup_events_target {
+ MEM_CGROUP_TARGET_THRESH,
+ MEM_CGROUP_TARGET_SOFTLIMIT,
+ MEM_CGROUP_NTARGETS,
+};
+#define THRESHOLDS_EVENTS_TARGET (128)
+#define SOFTLIMIT_EVENTS_TARGET (1024)
+
struct mem_cgroup_stat_cpu {
- s64 count[MEM_CGROUP_STAT_NSTATS];
+ long count[MEM_CGROUP_STAT_NSTATS];
+ unsigned long events[MEM_CGROUP_EVENTS_NSTATS];
+ unsigned long targets[MEM_CGROUP_NTARGETS];
};
/*
@@ -218,12 +226,6 @@ struct mem_cgroup {
* per zone LRU lists.
*/
struct mem_cgroup_lru_info info;
-
- /*
- protect against reclaim related member.
- */
- spinlock_t reclaim_param_lock;
-
/*
* While reclaiming in a hierarchy, we cache the last child we
* reclaimed from.
@@ -292,7 +294,6 @@ static struct move_charge_struct {
unsigned long moved_charge;
unsigned long moved_swap;
struct task_struct *moving_task; /* a task moving charges */
- struct mm_struct *mm;
wait_queue_head_t waitq; /* a waitq for other context */
} mc = {
.lock = __SPIN_LOCK_UNLOCKED(mc.lock),
@@ -328,13 +329,6 @@ enum charge_type {
NR_CHARGE_TYPE,
};
-/* only for here (for easy reading.) */
-#define PCGF_CACHE (1UL << PCG_CACHE)
-#define PCGF_USED (1UL << PCG_USED)
-#define PCGF_LOCK (1UL << PCG_LOCK)
-/* Not used, but added here for completeness */
-#define PCGF_ACCT (1UL << PCG_ACCT)
-
/* for encoding cft->private value on file */
#define _MEM (0)
#define _MEMSWAP (1)
@@ -372,14 +366,10 @@ struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *mem)
}
static struct mem_cgroup_per_zone *
-page_cgroup_zoneinfo(struct page_cgroup *pc)
+page_cgroup_zoneinfo(struct mem_cgroup *mem, struct page *page)
{
- struct mem_cgroup *mem = pc->mem_cgroup;
- int nid = page_cgroup_nid(pc);
- int zid = page_cgroup_zid(pc);
-
- if (!mem)
- return NULL;
+ int nid = page_to_nid(page);
+ int zid = page_zonenum(page);
return mem_cgroup_zoneinfo(mem, nid, zid);
}
@@ -505,11 +495,6 @@ static void mem_cgroup_remove_from_trees(struct mem_cgroup *mem)
}
}
-static inline unsigned long mem_cgroup_get_excess(struct mem_cgroup *mem)
-{
- return res_counter_soft_limit_excess(&mem->res) >> PAGE_SHIFT;
-}
-
static struct mem_cgroup_per_zone *
__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
{
@@ -566,11 +551,11 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
* common workload, threashold and synchonization as vmstat[] should be
* implemented.
*/
-static s64 mem_cgroup_read_stat(struct mem_cgroup *mem,
- enum mem_cgroup_stat_index idx)
+static long mem_cgroup_read_stat(struct mem_cgroup *mem,
+ enum mem_cgroup_stat_index idx)
{
+ long val = 0;
int cpu;
- s64 val = 0;
get_online_cpus();
for_each_online_cpu(cpu)
@@ -584,9 +569,9 @@ static s64 mem_cgroup_read_stat(struct mem_cgroup *mem,
return val;
}
-static s64 mem_cgroup_local_usage(struct mem_cgroup *mem)
+static long mem_cgroup_local_usage(struct mem_cgroup *mem)
{
- s64 ret;
+ long ret;
ret = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS);
ret += mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE);
@@ -600,24 +585,41 @@ static void mem_cgroup_swap_statistics(struct mem_cgroup *mem,
this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_SWAPOUT], val);
}
-static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
- struct page_cgroup *pc,
- bool charge)
+static unsigned long mem_cgroup_read_events(struct mem_cgroup *mem,
+ enum mem_cgroup_events_index idx)
{
- int val = (charge) ? 1 : -1;
+ unsigned long val = 0;
+ int cpu;
+ for_each_online_cpu(cpu)
+ val += per_cpu(mem->stat->events[idx], cpu);
+#ifdef CONFIG_HOTPLUG_CPU
+ spin_lock(&mem->pcp_counter_lock);
+ val += mem->nocpu_base.events[idx];
+ spin_unlock(&mem->pcp_counter_lock);
+#endif
+ return val;
+}
+
+static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
+ bool file, int nr_pages)
+{
preempt_disable();
- if (PageCgroupCache(pc))
- __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_CACHE], val);
+ if (file)
+ __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_CACHE], nr_pages);
else
- __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_RSS], val);
+ __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_RSS], nr_pages);
+
+ /* pagein of a big page is an event. So, ignore page size */
+ if (nr_pages > 0)
+ __this_cpu_inc(mem->stat->events[MEM_CGROUP_EVENTS_PGPGIN]);
+ else {
+ __this_cpu_inc(mem->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]);
+ nr_pages = -nr_pages; /* for event */
+ }
- if (charge)
- __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGIN_COUNT]);
- else
- __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGOUT_COUNT]);
- __this_cpu_inc(mem->stat->count[MEM_CGROUP_EVENTS]);
+ __this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_COUNT], nr_pages);
preempt_enable();
}
@@ -637,13 +639,34 @@ static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem,
return total;
}
-static bool __memcg_event_check(struct mem_cgroup *mem, int event_mask_shift)
+static bool __memcg_event_check(struct mem_cgroup *mem, int target)
{
- s64 val;
+ unsigned long val, next;
+
+ val = this_cpu_read(mem->stat->events[MEM_CGROUP_EVENTS_COUNT]);
+ next = this_cpu_read(mem->stat->targets[target]);
+ /* from time_after() in jiffies.h */
+ return ((long)next - (long)val < 0);
+}
- val = this_cpu_read(mem->stat->count[MEM_CGROUP_EVENTS]);
+static void __mem_cgroup_target_update(struct mem_cgroup *mem, int target)
+{
+ unsigned long val, next;
+
+ val = this_cpu_read(mem->stat->events[MEM_CGROUP_EVENTS_COUNT]);
- return !(val & ((1 << event_mask_shift) - 1));
+ switch (target) {
+ case MEM_CGROUP_TARGET_THRESH:
+ next = val + THRESHOLDS_EVENTS_TARGET;
+ break;
+ case MEM_CGROUP_TARGET_SOFTLIMIT:
+ next = val + SOFTLIMIT_EVENTS_TARGET;
+ break;
+ default:
+ return;
+ }
+
+ this_cpu_write(mem->stat->targets[target], next);
}
/*
@@ -653,10 +676,15 @@ static bool __memcg_event_check(struct mem_cgroup *mem, int event_mask_shift)
static void memcg_check_events(struct mem_cgroup *mem, struct page *page)
{
/* threshold event is triggered in finer grain than soft limit */
- if (unlikely(__memcg_event_check(mem, THRESHOLDS_EVENTS_THRESH))) {
+ if (unlikely(__memcg_event_check(mem, MEM_CGROUP_TARGET_THRESH))) {
mem_cgroup_threshold(mem);
- if (unlikely(__memcg_event_check(mem, SOFTLIMIT_EVENTS_THRESH)))
+ __mem_cgroup_target_update(mem, MEM_CGROUP_TARGET_THRESH);
+ if (unlikely(__memcg_event_check(mem,
+ MEM_CGROUP_TARGET_SOFTLIMIT))){
mem_cgroup_update_tree(mem, page);
+ __mem_cgroup_target_update(mem,
+ MEM_CGROUP_TARGET_SOFTLIMIT);
+ }
}
}
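
__memcg_event_check() above compares the running event counter against a per-cpu target with the same signed-difference idiom as time_after(), so the check stays correct when the unsigned counter wraps around. A small stand-alone illustration with made-up counter values:

#include <stdio.h>

/* True once the running counter has passed its target, even across wrap. */
static int event_check(unsigned long val, unsigned long next)
{
	return (long)next - (long)val < 0;	/* same idiom as time_after() */
}

int main(void)
{
	printf("%d\n", event_check(100, 228));	/* 0: 128-event target not reached yet */
	printf("%d\n", event_check(230, 228));	/* 1: target passed */

	/* target set shortly before the counter wrapped: still behaves correctly */
	printf("%d\n", event_check((unsigned long)-10, 100));	/* 0: not reached yet  */
	printf("%d\n", event_check(110, 100));			/* 1: passed after wrap */
	return 0;
}

Once the check fires, __mem_cgroup_target_update() simply advances the target by THRESHOLDS_EVENTS_TARGET (128) or SOFTLIMIT_EVENTS_TARGET (1024) events.
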
@@ -815,13 +843,13 @@ void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru)
* We don't check PCG_USED bit. It's cleared when the "page" is finally
* removed from global LRU.
*/
- mz = page_cgroup_zoneinfo(pc);
- MEM_CGROUP_ZSTAT(mz, lru) -= 1;
+ mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
+ /* huge page split is done under lru_lock. so, we have no races. */
+ MEM_CGROUP_ZSTAT(mz, lru) -= 1 << compound_order(page);
if (mem_cgroup_is_root(pc->mem_cgroup))
return;
VM_BUG_ON(list_empty(&pc->lru));
list_del_init(&pc->lru);
- return;
}
void mem_cgroup_del_lru(struct page *page)
@@ -829,24 +857,49 @@ void mem_cgroup_del_lru(struct page *page)
mem_cgroup_del_lru_list(page, page_lru(page));
}
-void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru)
+/*
+ * Writeback is about to end against a page which has been marked for immediate
+ * reclaim. If it still appears to be reclaimable, move it to the tail of the
+ * inactive list.
+ */
+void mem_cgroup_rotate_reclaimable_page(struct page *page)
{
struct mem_cgroup_per_zone *mz;
struct page_cgroup *pc;
+ enum lru_list lru = page_lru(page);
if (mem_cgroup_disabled())
return;
pc = lookup_page_cgroup(page);
- /*
- * Used bit is set without atomic ops but after smp_wmb().
- * For making pc->mem_cgroup visible, insert smp_rmb() here.
- */
+ /* unused or root page is not rotated. */
+ if (!PageCgroupUsed(pc))
+ return;
+ /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
smp_rmb();
+ if (mem_cgroup_is_root(pc->mem_cgroup))
+ return;
+ mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
+ list_move_tail(&pc->lru, &mz->lists[lru]);
+}
+
+void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru)
+{
+ struct mem_cgroup_per_zone *mz;
+ struct page_cgroup *pc;
+
+ if (mem_cgroup_disabled())
+ return;
+
+ pc = lookup_page_cgroup(page);
/* unused or root page is not rotated. */
- if (!PageCgroupUsed(pc) || mem_cgroup_is_root(pc->mem_cgroup))
+ if (!PageCgroupUsed(pc))
return;
- mz = page_cgroup_zoneinfo(pc);
+ /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
+ smp_rmb();
+ if (mem_cgroup_is_root(pc->mem_cgroup))
+ return;
+ mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
list_move(&pc->lru, &mz->lists[lru]);
}
@@ -859,16 +912,13 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
return;
pc = lookup_page_cgroup(page);
VM_BUG_ON(PageCgroupAcctLRU(pc));
- /*
- * Used bit is set without atomic ops but after smp_wmb().
- * For making pc->mem_cgroup visible, insert smp_rmb() here.
- */
- smp_rmb();
if (!PageCgroupUsed(pc))
return;
-
- mz = page_cgroup_zoneinfo(pc);
- MEM_CGROUP_ZSTAT(mz, lru) += 1;
+ /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
+ smp_rmb();
+ mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
+ /* huge page split is done under lru_lock. so, we have no races. */
+ MEM_CGROUP_ZSTAT(mz, lru) += 1 << compound_order(page);
SetPageCgroupAcctLRU(pc);
if (mem_cgroup_is_root(pc->mem_cgroup))
return;
@@ -876,18 +926,28 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
}
/*
- * At handling SwapCache, pc->mem_cgroup may be changed while it's linked to
- * lru because the page may.be reused after it's fully uncharged (because of
- * SwapCache behavior).To handle that, unlink page_cgroup from LRU when charge
- * it again. This function is only used to charge SwapCache. It's done under
- * lock_page and expected that zone->lru_lock is never held.
+ * At handling SwapCache and other FUSE stuff, pc->mem_cgroup may be changed
+ * while it's linked to lru because the page may be reused after it's fully
+ * uncharged. To handle that, unlink page_cgroup from LRU when charge it again.
+ * It's done under lock_page and expected that zone->lru_lock is never held.
*/
-static void mem_cgroup_lru_del_before_commit_swapcache(struct page *page)
+static void mem_cgroup_lru_del_before_commit(struct page *page)
{
unsigned long flags;
struct zone *zone = page_zone(page);
struct page_cgroup *pc = lookup_page_cgroup(page);
+ /*
+ * Doing this check without taking ->lru_lock seems wrong but this
+ * is safe. Because if page_cgroup's USED bit is unset, the page
+ * will not be added to any memcg's LRU. If page_cgroup's USED bit is
+ * set, the commit after this will fail, anyway.
+ * All this charge/uncharge is done under some mutual exclusion.
+ * So, we don't need to take care of changes in the USED bit.
+ */
+ if (likely(!PageLRU(page)))
+ return;
+
spin_lock_irqsave(&zone->lru_lock, flags);
/*
* Forget old LRU when this page_cgroup is *not* used. This Used bit
@@ -898,12 +958,15 @@ static void mem_cgroup_lru_del_before_commit_swapcache(struct page *page)
spin_unlock_irqrestore(&zone->lru_lock, flags);
}
-static void mem_cgroup_lru_add_after_commit_swapcache(struct page *page)
+static void mem_cgroup_lru_add_after_commit(struct page *page)
{
unsigned long flags;
struct zone *zone = page_zone(page);
struct page_cgroup *pc = lookup_page_cgroup(page);
+ /* taking care of that the page is added to LRU while we commit it */
+ if (likely(!PageLRU(page)))
+ return;
spin_lock_irqsave(&zone->lru_lock, flags);
/* link when the page is linked to LRU but page_cgroup isn't */
if (PageLRU(page) && !PageCgroupAcctLRU(pc))
@@ -1032,18 +1095,11 @@ mem_cgroup_get_reclaim_stat_from_page(struct page *page)
return NULL;
pc = lookup_page_cgroup(page);
- /*
- * Used bit is set without atomic ops but after smp_wmb().
- * For making pc->mem_cgroup visible, insert smp_rmb() here.
- */
- smp_rmb();
if (!PageCgroupUsed(pc))
return NULL;
-
- mz = page_cgroup_zoneinfo(pc);
- if (!mz)
- return NULL;
-
+ /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
+ smp_rmb();
+ mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
return &mz->reclaim_stat;
}
@@ -1075,9 +1131,11 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
if (scan >= nr_to_scan)
break;
- page = pc->page;
if (unlikely(!PageCgroupUsed(pc)))
continue;
+
+ page = lookup_cgroup_page(pc);
+
if (unlikely(!PageLRU(page)))
continue;
@@ -1087,7 +1145,7 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
case 0:
list_move(&page->lru, dst);
mem_cgroup_del_lru(page);
- nr_taken++;
+ nr_taken += hpage_nr_pages(page);
break;
case -EBUSY:
/* we don't affect global LRU but rotate in our LRU */
@@ -1109,32 +1167,32 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
#define mem_cgroup_from_res_counter(counter, member) \
container_of(counter, struct mem_cgroup, member)
-static bool mem_cgroup_check_under_limit(struct mem_cgroup *mem)
+/**
+ * mem_cgroup_margin - calculate chargeable space of a memory cgroup
+ * @mem: the memory cgroup
+ *
+ * Returns the maximum amount of memory @mem can be charged with, in
+ * pages.
+ */
+static unsigned long mem_cgroup_margin(struct mem_cgroup *mem)
{
- if (do_swap_account) {
- if (res_counter_check_under_limit(&mem->res) &&
- res_counter_check_under_limit(&mem->memsw))
- return true;
- } else
- if (res_counter_check_under_limit(&mem->res))
- return true;
- return false;
+ unsigned long long margin;
+
+ margin = res_counter_margin(&mem->res);
+ if (do_swap_account)
+ margin = min(margin, res_counter_margin(&mem->memsw));
+ return margin >> PAGE_SHIFT;
}
static unsigned int get_swappiness(struct mem_cgroup *memcg)
{
struct cgroup *cgrp = memcg->css.cgroup;
- unsigned int swappiness;
/* root ? */
if (cgrp->parent == NULL)
return vm_swappiness;
- spin_lock(&memcg->reclaim_param_lock);
- swappiness = memcg->swappiness;
- spin_unlock(&memcg->reclaim_param_lock);
-
- return swappiness;
+ return memcg->swappiness;
}
static void mem_cgroup_start_move(struct mem_cgroup *mem)
@@ -1312,8 +1370,9 @@ u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
u64 limit;
u64 memsw;
- limit = res_counter_read_u64(&memcg->res, RES_LIMIT) +
- total_swap_pages;
+ limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
+ limit += total_swap_pages << PAGE_SHIFT;
+
memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
/*
* If memsw is finite and limits the amount of swap space available
@@ -1349,13 +1408,11 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem)
rcu_read_unlock();
/* Updates scanning parameter */
- spin_lock(&root_mem->reclaim_param_lock);
if (!css) {
/* this means start scan from ID:1 */
root_mem->last_scanned_child = 0;
} else
root_mem->last_scanned_child = found;
- spin_unlock(&root_mem->reclaim_param_lock);
}
return ret;
@@ -1384,7 +1441,9 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP;
bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK;
bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT;
- unsigned long excess = mem_cgroup_get_excess(root_mem);
+ unsigned long excess;
+
+ excess = res_counter_soft_limit_excess(&root_mem->res) >> PAGE_SHIFT;
/* If memsw_is_minimum==1, swap-out is of-no-use. */
if (root_mem->memsw_is_minimum)
@@ -1407,7 +1466,7 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
break;
}
/*
- * We want to do more targetted reclaim.
+ * We want to do more targeted reclaim.
* excess >> 2 is not to excessive so as to
* reclaim too much, nor too less that we keep
* coming back to reclaim from this cgroup
@@ -1441,9 +1500,9 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
return ret;
total += ret;
if (check_soft) {
- if (res_counter_check_under_soft_limit(&root_mem->res))
+ if (!res_counter_soft_limit_excess(&root_mem->res))
return total;
- } else if (mem_cgroup_check_under_limit(root_mem))
+ } else if (mem_cgroup_margin(root_mem))
return 1 + total;
}
return total;
@@ -1600,11 +1659,13 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
* possibility of race condition. If there is, we take a lock.
*/
-static void mem_cgroup_update_file_stat(struct page *page, int idx, int val)
+void mem_cgroup_update_page_stat(struct page *page,
+ enum mem_cgroup_page_stat_item idx, int val)
{
struct mem_cgroup *mem;
struct page_cgroup *pc = lookup_page_cgroup(page);
bool need_unlock = false;
+ unsigned long uninitialized_var(flags);
if (unlikely(!pc))
return;
@@ -1614,55 +1675,52 @@ static void mem_cgroup_update_file_stat(struct page *page, int idx, int val)
if (unlikely(!mem || !PageCgroupUsed(pc)))
goto out;
/* pc->mem_cgroup is unstable ? */
- if (unlikely(mem_cgroup_stealed(mem))) {
+ if (unlikely(mem_cgroup_stealed(mem)) || PageTransHuge(page)) {
/* take a lock against to access pc->mem_cgroup */
- lock_page_cgroup(pc);
+ move_lock_page_cgroup(pc, &flags);
need_unlock = true;
mem = pc->mem_cgroup;
if (!mem || !PageCgroupUsed(pc))
goto out;
}
- this_cpu_add(mem->stat->count[idx], val);
-
switch (idx) {
- case MEM_CGROUP_STAT_FILE_MAPPED:
+ case MEMCG_NR_FILE_MAPPED:
if (val > 0)
SetPageCgroupFileMapped(pc);
else if (!page_mapped(page))
ClearPageCgroupFileMapped(pc);
+ idx = MEM_CGROUP_STAT_FILE_MAPPED;
break;
default:
BUG();
}
+ this_cpu_add(mem->stat->count[idx], val);
+
out:
if (unlikely(need_unlock))
- unlock_page_cgroup(pc);
+ move_unlock_page_cgroup(pc, &flags);
rcu_read_unlock();
return;
}
-
-void mem_cgroup_update_file_mapped(struct page *page, int val)
-{
- mem_cgroup_update_file_stat(page, MEM_CGROUP_STAT_FILE_MAPPED, val);
-}
+EXPORT_SYMBOL(mem_cgroup_update_page_stat);
/*
* size of first charge trial. "32" comes from vmscan.c's magic value.
* TODO: maybe necessary to use big numbers in big irons.
*/
-#define CHARGE_SIZE (32 * PAGE_SIZE)
+#define CHARGE_BATCH 32U
struct memcg_stock_pcp {
struct mem_cgroup *cached; /* this never be root cgroup */
- int charge;
+ unsigned int nr_pages;
struct work_struct work;
};
static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
static atomic_t memcg_drain_count;
/*
- * Try to consume stocked charge on this cpu. If success, PAGE_SIZE is consumed
+ * Try to consume stocked charge on this cpu. If success, one page is consumed
* from local stock and true is returned. If the stock is 0 or charges from a
* cgroup which is not current target, returns false. This stock will be
* refilled.
@@ -1673,8 +1731,8 @@ static bool consume_stock(struct mem_cgroup *mem)
bool ret = true;
stock = &get_cpu_var(memcg_stock);
- if (mem == stock->cached && stock->charge)
- stock->charge -= PAGE_SIZE;
+ if (mem == stock->cached && stock->nr_pages)
+ stock->nr_pages--;
else /* need to call res_counter_charge */
ret = false;
put_cpu_var(memcg_stock);
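
consume_stock()/refill_stock() above keep a small per-cpu stock of pre-charged pages so that most single-page charges never touch the shared res_counter; only when the stock is empty or cached for a different cgroup does the slow path charge a full batch and keep the surplus locally. A toy, single-threaded sketch of that batching effect, with a plain counter standing in for the res_counter (names and numbers are made up):

#include <stdio.h>
#include <stdbool.h>

#define CHARGE_BATCH 32U

static unsigned long global_charged;	/* stands in for the shared res_counter */
static unsigned int stock_nr_pages;	/* stands in for this CPU's stock */

static bool consume_stock(void)
{
	if (stock_nr_pages) {
		stock_nr_pages--;
		return true;		/* fast path: no shared-state update */
	}
	return false;
}

static void charge_one_page(void)
{
	if (consume_stock())
		return;
	/* slow path: charge a whole batch, keep the surplus as local stock */
	global_charged += CHARGE_BATCH;
	stock_nr_pages = CHARGE_BATCH - 1;
}

int main(void)
{
	for (int i = 0; i < 100; i++)
		charge_one_page();

	/* 100 pages charged, but the shared counter was only touched 4 times */
	printf("charged globally: %lu, left in stock: %u\n",
	       global_charged, stock_nr_pages);
	return 0;
}
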
@@ -1688,13 +1746,15 @@ static void drain_stock(struct memcg_stock_pcp *stock)
{
struct mem_cgroup *old = stock->cached;
- if (stock->charge) {
- res_counter_uncharge(&old->res, stock->charge);
+ if (stock->nr_pages) {
+ unsigned long bytes = stock->nr_pages * PAGE_SIZE;
+
+ res_counter_uncharge(&old->res, bytes);
if (do_swap_account)
- res_counter_uncharge(&old->memsw, stock->charge);
+ res_counter_uncharge(&old->memsw, bytes);
+ stock->nr_pages = 0;
}
stock->cached = NULL;
- stock->charge = 0;
}
/*
@@ -1711,7 +1771,7 @@ static void drain_local_stock(struct work_struct *dummy)
* Cache charges(val) which is from res_counter, to local per_cpu area.
* This will be consumed by consume_stock() function, later.
*/
-static void refill_stock(struct mem_cgroup *mem, int val)
+static void refill_stock(struct mem_cgroup *mem, unsigned int nr_pages)
{
struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock);
@@ -1719,7 +1779,7 @@ static void refill_stock(struct mem_cgroup *mem, int val)
drain_stock(stock);
stock->cached = mem;
}
- stock->charge += val;
+ stock->nr_pages += nr_pages;
put_cpu_var(memcg_stock);
}
@@ -1771,11 +1831,17 @@ static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *mem, int cpu)
spin_lock(&mem->pcp_counter_lock);
for (i = 0; i < MEM_CGROUP_STAT_DATA; i++) {
- s64 x = per_cpu(mem->stat->count[i], cpu);
+ long x = per_cpu(mem->stat->count[i], cpu);
per_cpu(mem->stat->count[i], cpu) = 0;
mem->nocpu_base.count[i] += x;
}
+ for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
+ unsigned long x = per_cpu(mem->stat->events[i], cpu);
+
+ per_cpu(mem->stat->events[i], cpu) = 0;
+ mem->nocpu_base.events[i] += x;
+ }
/* need to clear ON_MOVE value, works as a kind of lock. */
per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) = 0;
spin_unlock(&mem->pcp_counter_lock);
@@ -1825,9 +1891,10 @@ enum {
CHARGE_OOM_DIE, /* the current is killed because of OOM */
};
-static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
- int csize, bool oom_check)
+static int mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
+ unsigned int nr_pages, bool oom_check)
{
+ unsigned long csize = nr_pages * PAGE_SIZE;
struct mem_cgroup *mem_over_limit;
struct res_counter *fail_res;
unsigned long flags = 0;
@@ -1842,27 +1909,38 @@ static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
if (likely(!ret))
return CHARGE_OK;
+ res_counter_uncharge(&mem->res, csize);
mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
flags |= MEM_CGROUP_RECLAIM_NOSWAP;
} else
mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
-
- if (csize > PAGE_SIZE) /* change csize and retry */
+ /*
+ * nr_pages can be either a huge page (HPAGE_PMD_NR), a batch
+ * of regular pages (CHARGE_BATCH), or a single regular page (1).
+ *
+ * Never reclaim on behalf of optional batching, retry with a
+ * single page instead.
+ */
+ if (nr_pages == CHARGE_BATCH)
return CHARGE_RETRY;
if (!(gfp_mask & __GFP_WAIT))
return CHARGE_WOULDBLOCK;
ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
- gfp_mask, flags);
+ gfp_mask, flags);
+ if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
+ return CHARGE_RETRY;
/*
- * try_to_free_mem_cgroup_pages() might not give us a full
- * picture of reclaim. Some pages are reclaimed and might be
- * moved to swap cache or just unmapped from the cgroup.
- * Check the limit again to see if the reclaim reduced the
- * current usage of the cgroup before giving up
+ * Even though the limit is exceeded at this point, reclaim
+ * may have been able to free some pages. Retry the charge
+ * before killing the task.
+ *
+ * Only for regular pages, though: huge pages are rather
+ * unlikely to succeed so close to the limit, and we fall back
+ * to regular pages anyway in case of failure.
*/
- if (ret || mem_cgroup_check_under_limit(mem_over_limit))
+ if (nr_pages == 1 && ret)
return CHARGE_RETRY;
/*
@@ -1887,12 +1965,15 @@ static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
* oom-killer can be invoked.
*/
static int __mem_cgroup_try_charge(struct mm_struct *mm,
- gfp_t gfp_mask, struct mem_cgroup **memcg, bool oom)
+ gfp_t gfp_mask,
+ unsigned int nr_pages,
+ struct mem_cgroup **memcg,
+ bool oom)
{
+ unsigned int batch = max(CHARGE_BATCH, nr_pages);
int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
struct mem_cgroup *mem = NULL;
int ret;
- int csize = CHARGE_SIZE;
/*
* Unlike gloval-vm's OOM-kill, we're not in memory shortage
@@ -1917,7 +1998,7 @@ again:
VM_BUG_ON(css_is_removed(&mem->css));
if (mem_cgroup_is_root(mem))
goto done;
- if (consume_stock(mem))
+ if (nr_pages == 1 && consume_stock(mem))
goto done;
css_get(&mem->css);
} else {
@@ -1925,23 +2006,22 @@ again:
rcu_read_lock();
p = rcu_dereference(mm->owner);
- VM_BUG_ON(!p);
/*
- * because we don't have task_lock(), "p" can exit while
- * we're here. In that case, "mem" can point to root
- * cgroup but never be NULL. (and task_struct itself is freed
- * by RCU, cgroup itself is RCU safe.) Then, we have small
- * risk here to get wrong cgroup. But such kind of mis-account
- * by race always happens because we don't have cgroup_mutex().
- * It's overkill and we allow that small race, here.
+ * Because we don't have task_lock(), "p" can exit.
+ * In that case, "mem" can point to root or p can be NULL with
+ * race with swapoff. Then, we have a small risk of mis-accounting.
+ * But that kind of mis-accounting due to a race always happens because
+ * we don't have cgroup_mutex(). It's overkill and we allow that
+ * small race, here.
+ * (*) swapoff et al will charge against mm-struct not against
+ * task-struct. So, mm->owner can be NULL.
*/
mem = mem_cgroup_from_task(p);
- VM_BUG_ON(!mem);
- if (mem_cgroup_is_root(mem)) {
+ if (!mem || mem_cgroup_is_root(mem)) {
rcu_read_unlock();
goto done;
}
- if (consume_stock(mem)) {
+ if (nr_pages == 1 && consume_stock(mem)) {
/*
* It seems dagerous to access memcg without css_get().
* But considering how consume_stok works, it's not
@@ -1976,13 +2056,12 @@ again:
nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
}
- ret = __mem_cgroup_do_charge(mem, gfp_mask, csize, oom_check);
-
+ ret = mem_cgroup_do_charge(mem, gfp_mask, batch, oom_check);
switch (ret) {
case CHARGE_OK:
break;
case CHARGE_RETRY: /* not in OOM situation but retry */
- csize = PAGE_SIZE;
+ batch = nr_pages;
css_put(&mem->css);
mem = NULL;
goto again;
@@ -2003,8 +2082,8 @@ again:
}
} while (ret != CHARGE_OK);
- if (csize > PAGE_SIZE)
- refill_stock(mem, csize - PAGE_SIZE);
+ if (batch > nr_pages)
+ refill_stock(mem, batch - nr_pages);
css_put(&mem->css);
done:
*memcg = mem;
@@ -2023,20 +2102,17 @@ bypass:
* gotten by try_charge().
*/
static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem,
- unsigned long count)
+ unsigned int nr_pages)
{
if (!mem_cgroup_is_root(mem)) {
- res_counter_uncharge(&mem->res, PAGE_SIZE * count);
+ unsigned long bytes = nr_pages * PAGE_SIZE;
+
+ res_counter_uncharge(&mem->res, bytes);
if (do_swap_account)
- res_counter_uncharge(&mem->memsw, PAGE_SIZE * count);
+ res_counter_uncharge(&mem->memsw, bytes);
}
}
-static void mem_cgroup_cancel_charge(struct mem_cgroup *mem)
-{
- __mem_cgroup_cancel_charge(mem, 1);
-}
-
/*
* A helper function to get mem_cgroup from ID. must be called under
* rcu_read_lock(). The caller must check css_is_removed() or some if
@@ -2084,26 +2160,22 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
return mem;
}
-/*
- * commit a charge got by __mem_cgroup_try_charge() and makes page_cgroup to be
- * USED state. If already USED, uncharge and return.
- */
-
static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
- struct page_cgroup *pc,
- enum charge_type ctype)
+ struct page *page,
+ unsigned int nr_pages,
+ struct page_cgroup *pc,
+ enum charge_type ctype)
{
- /* try_charge() can return NULL to *memcg, taking care of it. */
- if (!mem)
- return;
-
lock_page_cgroup(pc);
if (unlikely(PageCgroupUsed(pc))) {
unlock_page_cgroup(pc);
- mem_cgroup_cancel_charge(mem);
+ __mem_cgroup_cancel_charge(mem, nr_pages);
return;
}
-
+ /*
+ * we don't need page_cgroup_lock for tail pages, because they are not
+ * accessed by any other context at this point.
+ */
pc->mem_cgroup = mem;
/*
* We access a page_cgroup asynchronously without lock_page_cgroup().
@@ -2127,19 +2199,62 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
break;
}
- mem_cgroup_charge_statistics(mem, pc, true);
-
+ mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), nr_pages);
unlock_page_cgroup(pc);
/*
* "charge_statistics" updated event counter. Then, check it.
* Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
* if they exceeds softlimit.
*/
- memcg_check_events(mem, pc->page);
+ memcg_check_events(mem, page);
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+
+#define PCGF_NOCOPY_AT_SPLIT ((1 << PCG_LOCK) | (1 << PCG_MOVE_LOCK) |\
+ (1 << PCG_ACCT_LRU) | (1 << PCG_MIGRATION))
+/*
+ * Because tail pages are not marked as "used", set that flag here. We're
+ * under zone->lru_lock, 'splitting on pmd' and compound_lock.
+ */
+void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail)
+{
+ struct page_cgroup *head_pc = lookup_page_cgroup(head);
+ struct page_cgroup *tail_pc = lookup_page_cgroup(tail);
+ unsigned long flags;
+
+ if (mem_cgroup_disabled())
+ return;
+ /*
+ * We have no races with charge/uncharge but will have races with
+ * page state accounting.
+ */
+ move_lock_page_cgroup(head_pc, &flags);
+
+ tail_pc->mem_cgroup = head_pc->mem_cgroup;
+ smp_wmb(); /* see __commit_charge() */
+ if (PageCgroupAcctLRU(head_pc)) {
+ enum lru_list lru;
+ struct mem_cgroup_per_zone *mz;
+
+ /*
+	 * LRU flags cannot be copied because we need to add the tail
+	 * page to the LRU by a generic call, and our hook will be called.
+	 * We hold lru_lock, so reduce the counter directly.
+ */
+ lru = page_lru(head);
+ mz = page_cgroup_zoneinfo(head_pc->mem_cgroup, head);
+ MEM_CGROUP_ZSTAT(mz, lru) -= 1;
+ }
+ tail_pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT;
+ move_unlock_page_cgroup(head_pc, &flags);
}
+#endif
/**
- * __mem_cgroup_move_account - move account of the page
+ * mem_cgroup_move_account - move account of the page
+ * @page: the page
+ * @nr_pages: number of regular pages (>1 for huge pages)
* @pc: page_cgroup of the page.
* @from: mem_cgroup which the page is moved from.
* @to: mem_cgroup which the page is moved to. @from != @to.
@@ -2147,22 +2262,42 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
*
* The caller must confirm following.
* - page is not on LRU (isolate_page() is useful.)
- * - the pc is locked, used, and ->mem_cgroup points to @from.
+ * - compound_lock is held when nr_pages > 1
*
* This function doesn't do "charge" nor css_get to new cgroup. It should be
- * done by a caller(__mem_cgroup_try_charge would be usefull). If @uncharge is
+ * done by a caller(__mem_cgroup_try_charge would be useful). If @uncharge is
* true, this function does "uncharge" from old cgroup, but it doesn't if
* @uncharge is false, so a caller should do "uncharge".
*/
-
-static void __mem_cgroup_move_account(struct page_cgroup *pc,
- struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge)
+static int mem_cgroup_move_account(struct page *page,
+ unsigned int nr_pages,
+ struct page_cgroup *pc,
+ struct mem_cgroup *from,
+ struct mem_cgroup *to,
+ bool uncharge)
{
+ unsigned long flags;
+ int ret;
+
VM_BUG_ON(from == to);
- VM_BUG_ON(PageLRU(pc->page));
- VM_BUG_ON(!page_is_cgroup_locked(pc));
- VM_BUG_ON(!PageCgroupUsed(pc));
- VM_BUG_ON(pc->mem_cgroup != from);
+ VM_BUG_ON(PageLRU(page));
+ /*
+ * The page is isolated from LRU. So, collapse function
+ * will not handle this page. But page splitting can happen.
+ * Do this check under compound_page_lock(). The caller should
+ * hold it.
+ */
+ ret = -EBUSY;
+ if (nr_pages > 1 && !PageTransHuge(page))
+ goto out;
+
+ lock_page_cgroup(pc);
+
+ ret = -EINVAL;
+ if (!PageCgroupUsed(pc) || pc->mem_cgroup != from)
+ goto unlock;
+
+ move_lock_page_cgroup(pc, &flags);
if (PageCgroupFileMapped(pc)) {
/* Update mapped_file data for mem_cgroup */
@@ -2171,42 +2306,31 @@ static void __mem_cgroup_move_account(struct page_cgroup *pc,
__this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
preempt_enable();
}
- mem_cgroup_charge_statistics(from, pc, false);
+ mem_cgroup_charge_statistics(from, PageCgroupCache(pc), -nr_pages);
if (uncharge)
/* This is not "cancel", but cancel_charge does all we need. */
- mem_cgroup_cancel_charge(from);
+ __mem_cgroup_cancel_charge(from, nr_pages);
/* caller should have done css_get */
pc->mem_cgroup = to;
- mem_cgroup_charge_statistics(to, pc, true);
+ mem_cgroup_charge_statistics(to, PageCgroupCache(pc), nr_pages);
/*
	 * We charge against "to" which may not have any tasks. Then, "to"
* can be under rmdir(). But in current implementation, caller of
* this function is just force_empty() and move charge, so it's
- * garanteed that "to" is never removed. So, we don't check rmdir
+ * guaranteed that "to" is never removed. So, we don't check rmdir
* status here.
*/
-}
-
-/*
- * check whether the @pc is valid for moving account and call
- * __mem_cgroup_move_account()
- */
-static int mem_cgroup_move_account(struct page_cgroup *pc,
- struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge)
-{
- int ret = -EINVAL;
- lock_page_cgroup(pc);
- if (PageCgroupUsed(pc) && pc->mem_cgroup == from) {
- __mem_cgroup_move_account(pc, from, to, uncharge);
- ret = 0;
- }
+ move_unlock_page_cgroup(pc, &flags);
+ ret = 0;
+unlock:
unlock_page_cgroup(pc);
/*
* check events
*/
- memcg_check_events(to, pc->page);
- memcg_check_events(from, pc->page);
+ memcg_check_events(to, page);
+ memcg_check_events(from, page);
+out:
return ret;
}
@@ -2214,14 +2338,16 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
* move charges to its parent.
*/
-static int mem_cgroup_move_parent(struct page_cgroup *pc,
+static int mem_cgroup_move_parent(struct page *page,
+ struct page_cgroup *pc,
struct mem_cgroup *child,
gfp_t gfp_mask)
{
- struct page *page = pc->page;
struct cgroup *cg = child->css.cgroup;
struct cgroup *pcg = cg->parent;
struct mem_cgroup *parent;
+ unsigned int nr_pages;
+ unsigned long uninitialized_var(flags);
int ret;
/* Is ROOT ? */
@@ -2234,14 +2360,22 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc,
if (isolate_lru_page(page))
goto put;
+ nr_pages = hpage_nr_pages(page);
+
parent = mem_cgroup_from_cont(pcg);
- ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false);
+ ret = __mem_cgroup_try_charge(NULL, gfp_mask, nr_pages, &parent, false);
if (ret || !parent)
goto put_back;
- ret = mem_cgroup_move_account(pc, child, parent, true);
+ if (nr_pages > 1)
+ flags = compound_lock_irqsave(page);
+
+ ret = mem_cgroup_move_account(page, nr_pages, pc, child, parent, true);
if (ret)
- mem_cgroup_cancel_charge(parent);
+ __mem_cgroup_cancel_charge(parent, nr_pages);
+
+ if (nr_pages > 1)
+ compound_unlock_irqrestore(page, flags);
put_back:
putback_lru_page(page);
put:
@@ -2260,20 +2394,29 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
gfp_t gfp_mask, enum charge_type ctype)
{
struct mem_cgroup *mem = NULL;
+ unsigned int nr_pages = 1;
struct page_cgroup *pc;
+ bool oom = true;
int ret;
+ if (PageTransHuge(page)) {
+ nr_pages <<= compound_order(page);
+ VM_BUG_ON(!PageTransHuge(page));
+ /*
+ * Never OOM-kill a process for a huge page. The
+ * fault handler will fall back to regular pages.
+ */
+ oom = false;
+ }
+
pc = lookup_page_cgroup(page);
- /* can happen at boot */
- if (unlikely(!pc))
- return 0;
- prefetchw(pc);
+ BUG_ON(!pc); /* XXX: remove this and move pc lookup into commit */
- ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true);
+ ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &mem, oom);
if (ret || !mem)
return ret;
- __mem_cgroup_commit_charge(mem, pc, ctype);
+ __mem_cgroup_commit_charge(mem, page, nr_pages, pc, ctype);
return 0;
}
@@ -2282,8 +2425,6 @@ int mem_cgroup_newpage_charge(struct page *page,
{
if (mem_cgroup_disabled())
return 0;
- if (PageCompound(page))
- return 0;
/*
* If already mapped, we don't have to account.
* If page cache, page->mapping has address_space.
@@ -2303,9 +2444,26 @@ static void
__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
enum charge_type ctype);
+static void
+__mem_cgroup_commit_charge_lrucare(struct page *page, struct mem_cgroup *mem,
+ enum charge_type ctype)
+{
+ struct page_cgroup *pc = lookup_page_cgroup(page);
+ /*
+	 * In some cases (SwapCache, FUSE's splice_buf->radixtree), the page
+	 * is already on the LRU. It means the page may be on some other
+	 * page_cgroup's LRU. Take care of it.
+ */
+ mem_cgroup_lru_del_before_commit(page);
+ __mem_cgroup_commit_charge(mem, page, 1, pc, ctype);
+ mem_cgroup_lru_add_after_commit(page);
+ return;
+}
+
int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
gfp_t gfp_mask)
{
+ struct mem_cgroup *mem = NULL;
int ret;
if (mem_cgroup_disabled())
@@ -2340,14 +2498,22 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
if (unlikely(!mm))
mm = &init_mm;
- if (page_is_file_cache(page))
- return mem_cgroup_charge_common(page, mm, gfp_mask,
- MEM_CGROUP_CHARGE_TYPE_CACHE);
+ if (page_is_file_cache(page)) {
+ ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, &mem, true);
+ if (ret || !mem)
+ return ret;
+ /*
+ * FUSE reuses pages without going through the final
+	 * put that would remove them from the LRU list; make
+	 * sure that they get relinked properly.
+ */
+ __mem_cgroup_commit_charge_lrucare(page, mem,
+ MEM_CGROUP_CHARGE_TYPE_CACHE);
+ return ret;
+ }
/* shmem */
if (PageSwapCache(page)) {
- struct mem_cgroup *mem = NULL;
-
ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem);
if (!ret)
__mem_cgroup_commit_charge_swapin(page, mem,
@@ -2372,6 +2538,8 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
struct mem_cgroup *mem;
int ret;
+ *ptr = NULL;
+
if (mem_cgroup_disabled())
return 0;
@@ -2389,30 +2557,26 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
if (!mem)
goto charge_cur_mm;
*ptr = mem;
- ret = __mem_cgroup_try_charge(NULL, mask, ptr, true);
+ ret = __mem_cgroup_try_charge(NULL, mask, 1, ptr, true);
css_put(&mem->css);
return ret;
charge_cur_mm:
if (unlikely(!mm))
mm = &init_mm;
- return __mem_cgroup_try_charge(mm, mask, ptr, true);
+ return __mem_cgroup_try_charge(mm, mask, 1, ptr, true);
}
static void
__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
enum charge_type ctype)
{
- struct page_cgroup *pc;
-
if (mem_cgroup_disabled())
return;
if (!ptr)
return;
cgroup_exclude_rmdir(&ptr->css);
- pc = lookup_page_cgroup(page);
- mem_cgroup_lru_del_before_commit_swapcache(page);
- __mem_cgroup_commit_charge(ptr, pc, ctype);
- mem_cgroup_lru_add_after_commit_swapcache(page);
+
+ __mem_cgroup_commit_charge_lrucare(page, ptr, ctype);
/*
* Now swap is on-memory. This means this page may be
* counted both as mem and swap....double count.
@@ -2460,14 +2624,16 @@ void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
return;
if (!mem)
return;
- mem_cgroup_cancel_charge(mem);
+ __mem_cgroup_cancel_charge(mem, 1);
}
-static void
-__do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype)
+static void mem_cgroup_do_uncharge(struct mem_cgroup *mem,
+ unsigned int nr_pages,
+ const enum charge_type ctype)
{
struct memcg_batch_info *batch = NULL;
bool uncharge_memsw = true;
+
/* If swapout, usage of swap doesn't decrease */
if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
uncharge_memsw = false;
@@ -2482,7 +2648,7 @@ __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype)
batch->memcg = mem;
/*
* do_batch > 0 when unmapping pages or inode invalidate/truncate.
- * In those cases, all pages freed continously can be expected to be in
+ * In those cases, all pages freed continuously can be expected to be in
* the same cgroup and we have chance to coalesce uncharges.
* But we do uncharge one by one if this is killed by OOM(TIF_MEMDIE)
* because we want to do uncharge as soon as possible.
@@ -2491,6 +2657,9 @@ __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype)
if (!batch->do_batch || test_thread_flag(TIF_MEMDIE))
goto direct_uncharge;
+ if (nr_pages > 1)
+ goto direct_uncharge;
+
/*
* In typical case, batch->memcg == mem. This means we can
* merge a series of uncharges to an uncharge of res_counter.
@@ -2499,14 +2668,14 @@ __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype)
if (batch->memcg != mem)
goto direct_uncharge;
/* remember freed charge and uncharge it later */
- batch->bytes += PAGE_SIZE;
+ batch->nr_pages++;
if (uncharge_memsw)
- batch->memsw_bytes += PAGE_SIZE;
+ batch->memsw_nr_pages++;
return;
direct_uncharge:
- res_counter_uncharge(&mem->res, PAGE_SIZE);
+ res_counter_uncharge(&mem->res, nr_pages * PAGE_SIZE);
if (uncharge_memsw)
- res_counter_uncharge(&mem->memsw, PAGE_SIZE);
+ res_counter_uncharge(&mem->memsw, nr_pages * PAGE_SIZE);
if (unlikely(batch->memcg != mem))
memcg_oom_recover(mem);
return;
@@ -2518,8 +2687,9 @@ direct_uncharge:
static struct mem_cgroup *
__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
{
- struct page_cgroup *pc;
struct mem_cgroup *mem = NULL;
+ unsigned int nr_pages = 1;
+ struct page_cgroup *pc;
if (mem_cgroup_disabled())
return NULL;
@@ -2527,6 +2697,10 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
if (PageSwapCache(page))
return NULL;
+ if (PageTransHuge(page)) {
+ nr_pages <<= compound_order(page);
+ VM_BUG_ON(!PageTransHuge(page));
+ }
/*
* Check if our page_cgroup is valid
*/
@@ -2559,7 +2733,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
break;
}
- mem_cgroup_charge_statistics(mem, pc, false);
+ mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), -nr_pages);
ClearPageCgroupUsed(pc);
/*
@@ -2580,7 +2754,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
mem_cgroup_get(mem);
}
if (!mem_cgroup_is_root(mem))
- __do_uncharge(mem, ctype);
+ mem_cgroup_do_uncharge(mem, nr_pages, ctype);
return mem;
@@ -2620,8 +2794,8 @@ void mem_cgroup_uncharge_start(void)
/* We can do nest. */
if (current->memcg_batch.do_batch == 1) {
current->memcg_batch.memcg = NULL;
- current->memcg_batch.bytes = 0;
- current->memcg_batch.memsw_bytes = 0;
+ current->memcg_batch.nr_pages = 0;
+ current->memcg_batch.memsw_nr_pages = 0;
}
}
@@ -2642,10 +2816,12 @@ void mem_cgroup_uncharge_end(void)
* This "batch->memcg" is valid without any css_get/put etc...
	 * because we hide charges behind us.
*/
- if (batch->bytes)
- res_counter_uncharge(&batch->memcg->res, batch->bytes);
- if (batch->memsw_bytes)
- res_counter_uncharge(&batch->memcg->memsw, batch->memsw_bytes);
+ if (batch->nr_pages)
+ res_counter_uncharge(&batch->memcg->res,
+ batch->nr_pages * PAGE_SIZE);
+ if (batch->memsw_nr_pages)
+ res_counter_uncharge(&batch->memcg->memsw,
+ batch->memsw_nr_pages * PAGE_SIZE);
memcg_oom_recover(batch->memcg);
/* forget this pointer (for sanity check) */
batch->memcg = NULL;
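
As a rough usage sketch (not from this patch; pages_to_free is a hypothetical list), callers bracket a series of uncharges like this, so that with this change the batch is accumulated in pages rather than bytes:

	struct page *page, *next;

	mem_cgroup_uncharge_start();
	list_for_each_entry_safe(page, next, &pages_to_free, lru)
		mem_cgroup_uncharge_page(page);	/* typically just bumps batch->nr_pages */
	mem_cgroup_uncharge_end();		/* one res_counter_uncharge() per batch */
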
@@ -2768,13 +2944,16 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
* page belongs to.
*/
int mem_cgroup_prepare_migration(struct page *page,
- struct page *newpage, struct mem_cgroup **ptr)
+ struct page *newpage, struct mem_cgroup **ptr, gfp_t gfp_mask)
{
- struct page_cgroup *pc;
struct mem_cgroup *mem = NULL;
+ struct page_cgroup *pc;
enum charge_type ctype;
int ret = 0;
+ *ptr = NULL;
+
+ VM_BUG_ON(PageTransHuge(page));
if (mem_cgroup_disabled())
return 0;
@@ -2824,7 +3003,7 @@ int mem_cgroup_prepare_migration(struct page *page,
return 0;
*ptr = mem;
- ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, ptr, false);
+ ret = __mem_cgroup_try_charge(NULL, gfp_mask, 1, ptr, false);
css_put(&mem->css);/* drop extra refcnt */
if (ret || *ptr == NULL) {
if (PageAnon(page)) {
@@ -2851,13 +3030,13 @@ int mem_cgroup_prepare_migration(struct page *page,
ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
else
ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
- __mem_cgroup_commit_charge(mem, pc, ctype);
+ __mem_cgroup_commit_charge(mem, page, 1, pc, ctype);
return ret;
}
/* remove redundant charge if migration failed*/
void mem_cgroup_end_migration(struct mem_cgroup *mem,
- struct page *oldpage, struct page *newpage)
+ struct page *oldpage, struct page *newpage, bool migration_ok)
{
struct page *used, *unused;
struct page_cgroup *pc;
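
A hedged sketch of a caller of the reworked migration hooks above (illustrative only; the real call sites live in mm/migrate.c, and migration_ok here simply stands for whether the move succeeded):

	struct mem_cgroup *mem;
	int rc;

	rc = mem_cgroup_prepare_migration(page, newpage, &mem, GFP_KERNEL);
	if (rc)
		return rc;	/* e.g. -ENOMEM from the precharge */
	/* ... copy contents and switch the ptes over to newpage ... */
	mem_cgroup_end_migration(mem, page, newpage, migration_ok);
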
@@ -2866,8 +3045,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *mem,
return;
/* blocks rmdir() */
cgroup_exclude_rmdir(&mem->css);
- /* at migration success, oldpage->mapping is NULL. */
- if (oldpage->mapping) {
+ if (!migration_ok) {
used = oldpage;
unused = newpage;
} else {
@@ -2917,7 +3095,7 @@ int mem_cgroup_shmem_charge_fallback(struct page *page,
struct mm_struct *mm,
gfp_t gfp_mask)
{
- struct mem_cgroup *mem = NULL;
+ struct mem_cgroup *mem;
int ret;
if (mem_cgroup_disabled())
@@ -2930,6 +3108,52 @@ int mem_cgroup_shmem_charge_fallback(struct page *page,
return ret;
}
+#ifdef CONFIG_DEBUG_VM
+static struct page_cgroup *lookup_page_cgroup_used(struct page *page)
+{
+ struct page_cgroup *pc;
+
+ pc = lookup_page_cgroup(page);
+ if (likely(pc) && PageCgroupUsed(pc))
+ return pc;
+ return NULL;
+}
+
+bool mem_cgroup_bad_page_check(struct page *page)
+{
+ if (mem_cgroup_disabled())
+ return false;
+
+ return lookup_page_cgroup_used(page) != NULL;
+}
+
+void mem_cgroup_print_bad_page(struct page *page)
+{
+ struct page_cgroup *pc;
+
+ pc = lookup_page_cgroup_used(page);
+ if (pc) {
+ int ret = -1;
+ char *path;
+
+ printk(KERN_ALERT "pc:%p pc->flags:%lx pc->mem_cgroup:%p",
+ pc, pc->flags, pc->mem_cgroup);
+
+ path = kmalloc(PATH_MAX, GFP_KERNEL);
+ if (path) {
+ rcu_read_lock();
+ ret = cgroup_path(pc->mem_cgroup->css.cgroup,
+ path, PATH_MAX);
+ rcu_read_unlock();
+ }
+
+ printk(KERN_CONT "(%s)\n",
+ (ret < 0) ? "cannot get the path" : path);
+ kfree(path);
+ }
+}
+#endif
+
static DEFINE_MUTEX(set_limit_mutex);
static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
@@ -3173,6 +3397,8 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *mem,
loop += 256;
busy = NULL;
while (loop--) {
+ struct page *page;
+
ret = 0;
spin_lock_irqsave(&zone->lru_lock, flags);
if (list_empty(list)) {
@@ -3188,7 +3414,9 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *mem,
}
spin_unlock_irqrestore(&zone->lru_lock, flags);
- ret = mem_cgroup_move_parent(pc, mem, GFP_KERNEL);
+ page = lookup_cgroup_page(pc);
+
+ ret = mem_cgroup_move_parent(page, pc, mem, GFP_KERNEL);
if (ret == -ENOMEM)
break;
@@ -3336,13 +3564,13 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
}
-static u64 mem_cgroup_get_recursive_idx_stat(struct mem_cgroup *mem,
- enum mem_cgroup_stat_index idx)
+static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *mem,
+ enum mem_cgroup_stat_index idx)
{
struct mem_cgroup *iter;
- s64 val = 0;
+ long val = 0;
- /* each per cpu's value can be minus.Then, use s64 */
+ /* Per-cpu values can be negative, use a signed accumulator */
for_each_mem_cgroup_tree(iter, mem)
val += mem_cgroup_read_stat(iter, idx);
@@ -3362,12 +3590,11 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *mem, bool swap)
return res_counter_read_u64(&mem->memsw, RES_USAGE);
}
- val = mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_CACHE);
- val += mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_RSS);
+ val = mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_CACHE);
+ val += mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_RSS);
if (swap)
- val += mem_cgroup_get_recursive_idx_stat(mem,
- MEM_CGROUP_STAT_SWAPOUT);
+ val += mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_SWAPOUT);
return val << PAGE_SHIFT;
}
@@ -3587,9 +3814,9 @@ mem_cgroup_get_local_stat(struct mem_cgroup *mem, struct mcs_total_stat *s)
s->stat[MCS_RSS] += val * PAGE_SIZE;
val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_MAPPED);
s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE;
- val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_PGPGIN_COUNT);
+ val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGPGIN);
s->stat[MCS_PGPGIN] += val;
- val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_PGPGOUT_COUNT);
+ val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGPGOUT);
s->stat[MCS_PGPGOUT] += val;
if (do_swap_account) {
val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT);
@@ -3713,9 +3940,7 @@ static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft,
return -EINVAL;
}
- spin_lock(&memcg->reclaim_param_lock);
memcg->swappiness = val;
- spin_unlock(&memcg->reclaim_param_lock);
cgroup_unlock();
@@ -4177,13 +4402,11 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
*/
if (!node_state(node, N_NORMAL_MEMORY))
tmp = -1;
- pn = kmalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
+ pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
if (!pn)
return 1;
mem->info.nodeinfo[node] = pn;
- memset(pn, 0, sizeof(*pn));
-
for (zone = 0; zone < MAX_NR_ZONES; zone++) {
mz = &pn->zoneinfo[zone];
for_each_lru(l)
@@ -4207,14 +4430,13 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
/* Can be very big if MAX_NUMNODES is very big */
if (size < PAGE_SIZE)
- mem = kmalloc(size, GFP_KERNEL);
+ mem = kzalloc(size, GFP_KERNEL);
else
- mem = vmalloc(size);
+ mem = vzalloc(size);
if (!mem)
return NULL;
- memset(mem, 0, size);
mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
if (!mem->stat)
goto out_free;
@@ -4374,7 +4596,6 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
res_counter_init(&mem->memsw, NULL);
}
mem->last_scanned_child = 0;
- spin_lock_init(&mem->reclaim_param_lock);
INIT_LIST_HEAD(&mem->oom_notify);
if (parent)
@@ -4462,7 +4683,7 @@ one_by_one:
batch_count = PRECHARGE_COUNT_AT_ONCE;
cond_resched();
}
- ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false);
+ ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, 1, &mem, false);
if (ret || !mem)
/* mem_cgroup_clear_mc() will do uncharge later */
return -ENOMEM;
@@ -4624,6 +4845,8 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
pte_t *pte;
spinlock_t *ptl;
+ split_huge_page_pmd(walk->mm, pmd);
+
pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
for (; addr != end; pte++, addr += PAGE_SIZE)
if (is_target_pte_for_mc(vma, addr, *pte, NULL))
@@ -4639,7 +4862,7 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
unsigned long precharge;
struct vm_area_struct *vma;
- /* We've already held the mmap_sem */
+ down_read(&mm->mmap_sem);
for (vma = mm->mmap; vma; vma = vma->vm_next) {
struct mm_walk mem_cgroup_count_precharge_walk = {
.pmd_entry = mem_cgroup_count_precharge_pte_range,
@@ -4651,6 +4874,7 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
walk_page_range(vma->vm_start, vma->vm_end,
&mem_cgroup_count_precharge_walk);
}
+ up_read(&mm->mmap_sem);
precharge = mc.precharge;
mc.precharge = 0;
@@ -4660,10 +4884,15 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
static int mem_cgroup_precharge_mc(struct mm_struct *mm)
{
- return mem_cgroup_do_precharge(mem_cgroup_count_precharge(mm));
+ unsigned long precharge = mem_cgroup_count_precharge(mm);
+
+ VM_BUG_ON(mc.moving_task);
+ mc.moving_task = current;
+ return mem_cgroup_do_precharge(precharge);
}
-static void mem_cgroup_clear_mc(void)
+/* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */
+static void __mem_cgroup_clear_mc(void)
{
struct mem_cgroup *from = mc.from;
struct mem_cgroup *to = mc.to;
@@ -4698,23 +4927,28 @@ static void mem_cgroup_clear_mc(void)
PAGE_SIZE * mc.moved_swap);
}
/* we've already done mem_cgroup_get(mc.to) */
-
mc.moved_swap = 0;
}
- if (mc.mm) {
- up_read(&mc.mm->mmap_sem);
- mmput(mc.mm);
- }
+ memcg_oom_recover(from);
+ memcg_oom_recover(to);
+ wake_up_all(&mc.waitq);
+}
+
+static void mem_cgroup_clear_mc(void)
+{
+ struct mem_cgroup *from = mc.from;
+
+ /*
+ * we must clear moving_task before waking up waiters at the end of
+ * task migration.
+ */
+ mc.moving_task = NULL;
+ __mem_cgroup_clear_mc();
spin_lock(&mc.lock);
mc.from = NULL;
mc.to = NULL;
spin_unlock(&mc.lock);
- mc.moving_task = NULL;
- mc.mm = NULL;
mem_cgroup_end_move(from);
- memcg_oom_recover(from);
- memcg_oom_recover(to);
- wake_up_all(&mc.waitq);
}
static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
@@ -4736,38 +4970,23 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
return 0;
/* We move charges only when we move a owner of the mm */
if (mm->owner == p) {
- /*
- * We do all the move charge works under one mmap_sem to
- * avoid deadlock with down_write(&mmap_sem)
- * -> try_charge() -> if (mc.moving_task) -> sleep.
- */
- down_read(&mm->mmap_sem);
-
VM_BUG_ON(mc.from);
VM_BUG_ON(mc.to);
VM_BUG_ON(mc.precharge);
VM_BUG_ON(mc.moved_charge);
VM_BUG_ON(mc.moved_swap);
- VM_BUG_ON(mc.moving_task);
- VM_BUG_ON(mc.mm);
-
mem_cgroup_start_move(from);
spin_lock(&mc.lock);
mc.from = from;
mc.to = mem;
- mc.precharge = 0;
- mc.moved_charge = 0;
- mc.moved_swap = 0;
spin_unlock(&mc.lock);
- mc.moving_task = current;
- mc.mm = mm;
+ /* We set mc.moving_task later */
ret = mem_cgroup_precharge_mc(mm);
if (ret)
mem_cgroup_clear_mc();
- /* We call up_read() and mmput() in clear_mc(). */
- } else
- mmput(mm);
+ }
+ mmput(mm);
}
return ret;
}
@@ -4789,6 +5008,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
pte_t *pte;
spinlock_t *ptl;
+ split_huge_page_pmd(walk->mm, pmd);
retry:
pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
for (; addr != end; addr += PAGE_SIZE) {
@@ -4809,8 +5029,8 @@ retry:
if (isolate_lru_page(page))
goto put;
pc = lookup_page_cgroup(page);
- if (!mem_cgroup_move_account(pc,
- mc.from, mc.to, false)) {
+ if (!mem_cgroup_move_account(page, 1, pc,
+ mc.from, mc.to, false)) {
mc.precharge--;
/* we uncharge from mc.from later. */
mc.moved_charge++;
@@ -4855,7 +5075,19 @@ static void mem_cgroup_move_charge(struct mm_struct *mm)
struct vm_area_struct *vma;
lru_add_drain_all();
- /* We've already held the mmap_sem */
+retry:
+ if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
+ /*
+		 * Someone who is holding the mmap_sem might be waiting in
+ * waitq. So we cancel all extra charges, wake up all waiters,
+ * and retry. Because we cancel precharges, we might not be able
+ * to move enough charges, but moving charge is a best-effort
+ * feature anyway, so it wouldn't be a big problem.
+ */
+ __mem_cgroup_clear_mc();
+ cond_resched();
+ goto retry;
+ }
for (vma = mm->mmap; vma; vma = vma->vm_next) {
int ret;
struct mm_walk mem_cgroup_move_charge_walk = {
@@ -4874,6 +5106,7 @@ static void mem_cgroup_move_charge(struct mm_struct *mm)
*/
break;
}
+ up_read(&mm->mmap_sem);
}
static void mem_cgroup_move_task(struct cgroup_subsys *ss,
@@ -4882,11 +5115,17 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss,
struct task_struct *p,
bool threadgroup)
{
- if (!mc.mm)
+ struct mm_struct *mm;
+
+ if (!mc.to)
/* no need to move charge */
return;
- mem_cgroup_move_charge(mc.mm);
+ mm = get_task_mm(p);
+ if (mm) {
+ mem_cgroup_move_charge(mm);
+ mmput(mm);
+ }
mem_cgroup_clear_mc();
}
#else /* !CONFIG_MMU */
@@ -4930,9 +5169,9 @@ struct cgroup_subsys mem_cgroup_subsys = {
static int __init enable_swap_account(char *s)
{
/* consider enabled if no parameter or 1 is given */
- if (!s || !strcmp(s, "1"))
+ if (!(*s) || !strcmp(s, "=1"))
really_do_swap_account = 1;
- else if (!strcmp(s, "0"))
+ else if (!strcmp(s, "=0"))
really_do_swap_account = 0;
return 1;
}
@@ -4940,7 +5179,8 @@ __setup("swapaccount", enable_swap_account);
static int __init disable_swap_account(char *s)
{
- enable_swap_account("0");
+ printk_once("noswapaccount is deprecated and will be removed in 2.6.40. Use swapaccount=0 instead\n");
+ enable_swap_account("=0");
return 1;
}
__setup("noswapaccount", disable_swap_account);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 46ab2c044b0e..2b9a5eef39e0 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -203,12 +203,12 @@ static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno,
#ifdef __ARCH_SI_TRAPNO
si.si_trapno = trapno;
#endif
- si.si_addr_lsb = compound_order(compound_head(page)) + PAGE_SHIFT;
+ si.si_addr_lsb = compound_trans_order(compound_head(page)) + PAGE_SHIFT;
/*
* Don't use force here, it's convenient if the signal
* can be temporarily blocked.
* This could cause a loop when the user sets SIGBUS
- * to SIG_IGN, but hopefully noone will do that?
+ * to SIG_IGN, but hopefully no one will do that?
*/
ret = send_sig_info(SIGBUS, &si, t); /* synchronous? */
if (ret < 0)
@@ -233,8 +233,8 @@ void shake_page(struct page *p, int access)
}
/*
- * Only all shrink_slab here (which would also
- * shrink other caches) if access is not potentially fatal.
+ * Only call shrink_slab here (which would also shrink other caches) if
+ * access is not potentially fatal.
*/
if (access) {
int nr;
@@ -634,7 +634,7 @@ static int me_pagecache_dirty(struct page *p, unsigned long pfn)
* when the page is reread or dropped. If an
* application assumes it will always get error on
* fsync, but does other operations on the fd before
- * and the page is dropped inbetween then the error
+	 * and the page is dropped in between, then the error
* will not be properly reported.
*
* This can already happen even without hwpoisoned
@@ -728,7 +728,7 @@ static int me_huge_page(struct page *p, unsigned long pfn)
* The table matches them in order and calls the right handler.
*
* This is quite tricky because we can access page at any time
- * in its live cycle, so all accesses have to be extremly careful.
+ * in its life cycle, so all accesses have to be extremely careful.
*
* This is not complete. More states could be added.
* For any missing state don't attempt recovery.
@@ -854,6 +854,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
int ret;
int kill = 1;
struct page *hpage = compound_head(p);
+ struct page *ppage;
if (PageReserved(p) || PageSlab(p))
return SWAP_SUCCESS;
@@ -895,6 +896,44 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
}
/*
+	 * ppage: the poisoned page to work on:
+	 *   if p is a regular (4k) page, ppage == the real poisoned page;
+	 *   if p is hugetlb or THP, ppage == the head page.
+ */
+ ppage = hpage;
+
+ if (PageTransHuge(hpage)) {
+ /*
+		 * Verify that this isn't a hugetlbfs head page. The check for
+		 * PageAnon is just to avoid tripping a split_huge_page
+		 * internal debug check, as split_huge_page refuses to deal with
+		 * anything that isn't an anon page. PageAnon can't go away from
+		 * under us because we hold a refcount on the hpage; without a
+		 * refcount on the hpage, split_huge_page can't be safely called
+		 * in the first place, and having a refcount on the tail isn't
+		 * enough to be safe.
+ */
+ if (!PageHuge(hpage) && PageAnon(hpage)) {
+ if (unlikely(split_huge_page(hpage))) {
+ /*
+ * FIXME: if splitting THP is failed, it is
+ * better to stop the following operation rather
+ * than causing panic by unmapping. System might
+ * survive if the page is freed later.
+ */
+ printk(KERN_INFO
+ "MCE %#lx: failed to split THP\n", pfn);
+
+ BUG_ON(!PageHWPoison(p));
+ return SWAP_FAIL;
+ }
+ /* THP is split, so ppage should be the real poisoned page. */
+ ppage = p;
+ }
+ }
+
+ /*
* First collect all the processes that have the page
* mapped in dirty form. This has to be done before try_to_unmap,
* because ttu takes the rmap data structures down.
@@ -903,12 +942,18 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
* there's nothing that can be done.
*/
if (kill)
- collect_procs(hpage, &tokill);
+ collect_procs(ppage, &tokill);
- ret = try_to_unmap(hpage, ttu);
+ if (hpage != ppage)
+ lock_page(ppage);
+
+ ret = try_to_unmap(ppage, ttu);
if (ret != SWAP_SUCCESS)
printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n",
- pfn, page_mapcount(hpage));
+ pfn, page_mapcount(ppage));
+
+ if (hpage != ppage)
+ unlock_page(ppage);
/*
* Now that the dirty bit has been propagated to the
@@ -919,7 +964,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
	 * use a more forceful, uncatchable kill to prevent
* any accesses to the poisoned memory.
*/
- kill_procs_ao(&tokill, !!PageDirty(hpage), trapno,
+ kill_procs_ao(&tokill, !!PageDirty(ppage), trapno,
ret != SWAP_SUCCESS, p, pfn);
return ret;
@@ -928,7 +973,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
static void set_page_hwpoison_huge_page(struct page *hpage)
{
int i;
- int nr_pages = 1 << compound_order(hpage);
+ int nr_pages = 1 << compound_trans_order(hpage);
for (i = 0; i < nr_pages; i++)
SetPageHWPoison(hpage + i);
}
@@ -936,7 +981,7 @@ static void set_page_hwpoison_huge_page(struct page *hpage)
static void clear_page_hwpoison_huge_page(struct page *hpage)
{
int i;
- int nr_pages = 1 << compound_order(hpage);
+ int nr_pages = 1 << compound_trans_order(hpage);
for (i = 0; i < nr_pages; i++)
ClearPageHWPoison(hpage + i);
}
@@ -966,7 +1011,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
return 0;
}
- nr_pages = 1 << compound_order(hpage);
+ nr_pages = 1 << compound_trans_order(hpage);
atomic_long_add(nr_pages, &mce_bad_pages);
/*
@@ -993,7 +1038,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
* Check "just unpoisoned", "filter hit", and
* "race with other subpage."
*/
- lock_page_nosync(hpage);
+ lock_page(hpage);
if (!PageHWPoison(hpage)
|| (hwpoison_filter(p) && TestClearPageHWPoison(p))
|| (p != hpage && TestSetPageHWPoison(hpage))) {
@@ -1020,19 +1065,22 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
* The check (unnecessarily) ignores LRU pages being isolated and
* walked by the page reclaim code, however that's not a big loss.
*/
- if (!PageLRU(p) && !PageHuge(p))
- shake_page(p, 0);
- if (!PageLRU(p) && !PageHuge(p)) {
- /*
- * shake_page could have turned it free.
- */
- if (is_free_buddy_page(p)) {
- action_result(pfn, "free buddy, 2nd try", DELAYED);
- return 0;
+ if (!PageHuge(p) && !PageTransCompound(p)) {
+ if (!PageLRU(p))
+ shake_page(p, 0);
+ if (!PageLRU(p)) {
+ /*
+ * shake_page could have turned it free.
+ */
+ if (is_free_buddy_page(p)) {
+ action_result(pfn, "free buddy, 2nd try",
+ DELAYED);
+ return 0;
+ }
+ action_result(pfn, "non LRU", IGNORED);
+ put_page(p);
+ return -EBUSY;
}
- action_result(pfn, "non LRU", IGNORED);
- put_page(p);
- return -EBUSY;
}
/*
@@ -1040,7 +1088,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
* It's very difficult to mess with pages currently under IO
* and in many cases impossible, so we just avoid it here.
*/
- lock_page_nosync(hpage);
+ lock_page(hpage);
/*
* unpoison always clear PG_hwpoison inside page lock
@@ -1062,7 +1110,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
* For error on the tail page, we should set PG_hwpoison
* on the head page to show that the hugepage is hwpoisoned
*/
- if (PageTail(p) && TestSetPageHWPoison(hpage)) {
+ if (PageHuge(p) && PageTail(p) && TestSetPageHWPoison(hpage)) {
action_result(pfn, "hugepage already hardware poisoned",
IGNORED);
unlock_page(hpage);
@@ -1082,7 +1130,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
/*
* Now take care of user space mappings.
- * Abort on fail: __remove_from_page_cache() assumes unmapped page.
+ * Abort on fail: __delete_from_page_cache() assumes unmapped page.
*/
if (hwpoison_user_mappings(p, pfn, trapno) != SWAP_SUCCESS) {
printk(KERN_ERR "MCE %#lx: cannot unmap page, give up\n", pfn);
@@ -1164,7 +1212,7 @@ int unpoison_memory(unsigned long pfn)
return 0;
}
- nr_pages = 1 << compound_order(page);
+ nr_pages = 1 << compound_trans_order(page);
if (!get_page_unless_zero(page)) {
/*
@@ -1183,7 +1231,7 @@ int unpoison_memory(unsigned long pfn)
return 0;
}
- lock_page_nosync(page);
+ lock_page(page);
/*
* This test is racy because PG_hwpoison is set outside of page lock.
* That's acceptable because that won't trigger kernel panic. Instead,
@@ -1290,9 +1338,13 @@ static int soft_offline_huge_page(struct page *page, int flags)
/* Keep page count to indicate a given hugepage is isolated. */
list_add(&hpage->lru, &pagelist);
- ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0);
+ ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0,
+ true);
if (ret) {
- putback_lru_pages(&pagelist);
+ struct page *page1, *page2;
+ list_for_each_entry_safe(page1, page2, &pagelist, lru)
+ put_page(page1);
+
pr_debug("soft offline: %#lx: migration failed %d, type %lx\n",
pfn, ret, page->flags);
if (ret > 0)
@@ -1301,7 +1353,7 @@ static int soft_offline_huge_page(struct page *page, int flags)
}
done:
if (!PageHWPoison(hpage))
- atomic_long_add(1 << compound_order(hpage), &mce_bad_pages);
+ atomic_long_add(1 << compound_trans_order(hpage), &mce_bad_pages);
set_page_hwpoison_huge_page(hpage);
dequeue_hwpoisoned_huge_page(hpage);
/* keep elevated page count for bad page */
@@ -1413,8 +1465,10 @@ int soft_offline_page(struct page *page, int flags)
LIST_HEAD(pagelist);
list_add(&page->lru, &pagelist);
- ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0);
+ ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
+ 0, true);
if (ret) {
+ putback_lru_pages(&pagelist);
pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
pfn, ret, page->flags);
if (ret > 0)
@@ -1433,35 +1487,3 @@ done:
/* keep elevated page count for bad page */
return ret;
}
-
-/*
- * The caller must hold current->mm->mmap_sem in read mode.
- */
-int is_hwpoison_address(unsigned long addr)
-{
- pgd_t *pgdp;
- pud_t pud, *pudp;
- pmd_t pmd, *pmdp;
- pte_t pte, *ptep;
- swp_entry_t entry;
-
- pgdp = pgd_offset(current->mm, addr);
- if (!pgd_present(*pgdp))
- return 0;
- pudp = pud_offset(pgdp, addr);
- pud = *pudp;
- if (!pud_present(pud) || pud_large(pud))
- return 0;
- pmdp = pmd_offset(pudp, addr);
- pmd = *pmdp;
- if (!pmd_present(pmd) || pmd_large(pmd))
- return 0;
- ptep = pte_offset_map(pmdp, addr);
- pte = *ptep;
- pte_unmap(ptep);
- if (!is_swap_pte(pte))
- return 0;
- entry = pte_to_swp_entry(pte);
- return is_hwpoison_entry(entry);
-}
-EXPORT_SYMBOL_GPL(is_hwpoison_address);
diff --git a/mm/memory.c b/mm/memory.c
index 02e48aa0ed13..61e66f026563 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -394,9 +394,11 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
}
}
-int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
+int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
+ pmd_t *pmd, unsigned long address)
{
pgtable_t new = pte_alloc_one(mm, address);
+ int wait_split_huge_page;
if (!new)
return -ENOMEM;
@@ -416,14 +418,18 @@ int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */
spin_lock(&mm->page_table_lock);
- if (!pmd_present(*pmd)) { /* Has another populated it ? */
+ wait_split_huge_page = 0;
+ if (likely(pmd_none(*pmd))) { /* Has another populated it ? */
mm->nr_ptes++;
pmd_populate(mm, pmd, new);
new = NULL;
- }
+ } else if (unlikely(pmd_trans_splitting(*pmd)))
+ wait_split_huge_page = 1;
spin_unlock(&mm->page_table_lock);
if (new)
pte_free(mm, new);
+ if (wait_split_huge_page)
+ wait_split_huge_page(vma->anon_vma, pmd);
return 0;
}
@@ -436,10 +442,11 @@ int __pte_alloc_kernel(pmd_t *pmd, unsigned long address)
smp_wmb(); /* See comment in __pte_alloc */
spin_lock(&init_mm.page_table_lock);
- if (!pmd_present(*pmd)) { /* Has another populated it ? */
+ if (likely(pmd_none(*pmd))) { /* Has another populated it ? */
pmd_populate_kernel(&init_mm, pmd, new);
new = NULL;
- }
+ } else
+ VM_BUG_ON(pmd_trans_splitting(*pmd));
spin_unlock(&init_mm.page_table_lock);
if (new)
pte_free_kernel(&init_mm, new);
@@ -719,9 +726,9 @@ out_set_pte:
return 0;
}
-static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
- pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
- unsigned long addr, unsigned long end)
+int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
+ pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
+ unsigned long addr, unsigned long end)
{
pte_t *orig_src_pte, *orig_dst_pte;
pte_t *src_pte, *dst_pte;
@@ -795,6 +802,17 @@ static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src
src_pmd = pmd_offset(src_pud, addr);
do {
next = pmd_addr_end(addr, end);
+ if (pmd_trans_huge(*src_pmd)) {
+ int err;
+ VM_BUG_ON(next-addr != HPAGE_PMD_SIZE);
+ err = copy_huge_pmd(dst_mm, src_mm,
+ dst_pmd, src_pmd, addr, vma);
+ if (err == -ENOMEM)
+ return -ENOMEM;
+ if (!err)
+ continue;
+ /* fall through */
+ }
if (pmd_none_or_clear_bad(src_pmd))
continue;
if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd,
@@ -997,6 +1015,16 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
pmd = pmd_offset(pud, addr);
do {
next = pmd_addr_end(addr, end);
+ if (pmd_trans_huge(*pmd)) {
+ if (next-addr != HPAGE_PMD_SIZE) {
+ VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem));
+ split_huge_page_pmd(vma->vm_mm, pmd);
+ } else if (zap_huge_pmd(tlb, vma, pmd)) {
+ (*zap_work)--;
+ continue;
+ }
+ /* fall through */
+ }
if (pmd_none_or_clear_bad(pmd)) {
(*zap_work)--;
continue;
@@ -1262,7 +1290,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
pud = pud_offset(pgd, address);
if (pud_none(*pud))
goto no_page_table;
- if (pud_huge(*pud)) {
+ if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) {
BUG_ON(flags & FOLL_GET);
page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE);
goto out;
@@ -1273,11 +1301,32 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
pmd = pmd_offset(pud, address);
if (pmd_none(*pmd))
goto no_page_table;
- if (pmd_huge(*pmd)) {
+ if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) {
BUG_ON(flags & FOLL_GET);
page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
goto out;
}
+ if (pmd_trans_huge(*pmd)) {
+ if (flags & FOLL_SPLIT) {
+ split_huge_page_pmd(mm, pmd);
+ goto split_fallthrough;
+ }
+ spin_lock(&mm->page_table_lock);
+ if (likely(pmd_trans_huge(*pmd))) {
+ if (unlikely(pmd_trans_splitting(*pmd))) {
+ spin_unlock(&mm->page_table_lock);
+ wait_split_huge_page(vma->anon_vma, pmd);
+ } else {
+ page = follow_trans_huge_pmd(mm, address,
+ pmd, flags);
+ spin_unlock(&mm->page_table_lock);
+ goto out;
+ }
+ } else
+ spin_unlock(&mm->page_table_lock);
+ /* fall through */
+ }
+split_fallthrough:
if (unlikely(pmd_bad(*pmd)))
goto no_page_table;
@@ -1310,6 +1359,28 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
*/
mark_page_accessed(page);
}
+ if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
+ /*
+ * The preliminary mapping check is mainly to avoid the
+ * pointless overhead of lock_page on the ZERO_PAGE
+ * which might bounce very badly if there is contention.
+ *
+ * If the page is already locked, we don't need to
+ * handle it now - vmscan will handle it later if and
+ * when it attempts to reclaim the page.
+ */
+ if (page->mapping && trylock_page(page)) {
+ lru_add_drain(); /* push cached pages to LRU */
+ /*
+ * Because we lock page here and migration is
+ * blocked by the pte's page reference, we need
+ * only check for file-cache page truncation.
+ */
+ if (page->mapping)
+ mlock_vma_page(page);
+ unlock_page(page);
+ }
+ }
unlock:
pte_unmap_unlock(ptep, ptl);
out:
@@ -1339,9 +1410,65 @@ no_page_table:
return page;
}
+static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long addr)
+{
+ return stack_guard_page_start(vma, addr) ||
+ stack_guard_page_end(vma, addr+PAGE_SIZE);
+}
+
+/**
+ * __get_user_pages() - pin user pages in memory
+ * @tsk: task_struct of target task
+ * @mm: mm_struct of target mm
+ * @start: starting user address
+ * @nr_pages: number of pages from start to pin
+ * @gup_flags: flags modifying pin behaviour
+ * @pages: array that receives pointers to the pages pinned.
+ * Should be at least nr_pages long. Or NULL, if caller
+ * only intends to ensure the pages are faulted in.
+ * @vmas: array of pointers to vmas corresponding to each page.
+ * Or NULL if the caller does not require them.
+ * @nonblocking: whether waiting for disk IO or mmap_sem contention
+ *
+ * Returns number of pages pinned. This may be fewer than the number
+ * requested. If nr_pages is 0 or negative, returns 0. If no pages
+ * were pinned, returns -errno. Each page returned must be released
+ * with a put_page() call when it is finished with. vmas will only
+ * remain valid while mmap_sem is held.
+ *
+ * Must be called with mmap_sem held for read or write.
+ *
+ * __get_user_pages walks a process's page tables and takes a reference to
+ * each struct page that each user address corresponds to at a given
+ * instant. That is, it takes the page that would be accessed if a user
+ * thread accesses the given user virtual address at that instant.
+ *
+ * This does not guarantee that the page exists in the user mappings when
+ * __get_user_pages returns, and there may even be a completely different
+ * page there in some cases (eg. if mmapped pagecache has been invalidated
+ * and subsequently re faulted). However it does guarantee that the page
+ * won't be freed completely. And mostly callers simply care that the page
+ * contains data that was valid *at some point in time*. Typically, an IO
+ * or similar operation cannot guarantee anything stronger anyway because
+ * locks can't be held over the syscall boundary.
+ *
+ * If @gup_flags & FOLL_WRITE == 0, the page must not be written to. If
+ * the page is written to, set_page_dirty (or set_page_dirty_lock, as
+ * appropriate) must be called after the page is finished with, and
+ * before put_page is called.
+ *
+ * If @nonblocking != NULL, __get_user_pages will not wait for disk IO
+ * or mmap_sem contention, and if waiting is needed to pin all pages,
+ * *@nonblocking will be set to 0.
+ *
+ * In most cases, get_user_pages or get_user_pages_fast should be used
+ * instead of __get_user_pages. __get_user_pages should be used only if
+ * you need some special @gup_flags.
+ */
int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
unsigned long start, int nr_pages, unsigned int gup_flags,
- struct page **pages, struct vm_area_struct **vmas)
+ struct page **pages, struct vm_area_struct **vmas,
+ int *nonblocking)
{
int i;
unsigned long vm_flags;
@@ -1365,9 +1492,8 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
struct vm_area_struct *vma;
vma = find_extend_vma(mm, start);
- if (!vma && in_gate_area(tsk, start)) {
+ if (!vma && in_gate_area(mm, start)) {
unsigned long pg = start & PAGE_MASK;
- struct vm_area_struct *gate_vma = get_gate_vma(tsk);
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd;
@@ -1386,15 +1512,17 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
pmd = pmd_offset(pud, pg);
if (pmd_none(*pmd))
return i ? : -EFAULT;
+ VM_BUG_ON(pmd_trans_huge(*pmd));
pte = pte_offset_map(pmd, pg);
if (pte_none(*pte)) {
pte_unmap(pte);
return i ? : -EFAULT;
}
+ vma = get_gate_vma(mm);
if (pages) {
struct page *page;
- page = vm_normal_page(gate_vma, start, *pte);
+ page = vm_normal_page(vma, start, *pte);
if (!page) {
if (!(gup_flags & FOLL_DUMP) &&
is_zero_pfn(pte_pfn(*pte)))
@@ -1408,12 +1536,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
get_page(page);
}
pte_unmap(pte);
- if (vmas)
- vmas[i] = gate_vma;
- i++;
- start += PAGE_SIZE;
- nr_pages--;
- continue;
+ goto next_page;
}
if (!vma ||
@@ -1441,24 +1564,52 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
cond_resched();
while (!(page = follow_page(vma, start, foll_flags))) {
int ret;
+ unsigned int fault_flags = 0;
+
+ /* For mlock, just skip the stack guard page. */
+ if (foll_flags & FOLL_MLOCK) {
+ if (stack_guard_page(vma, start))
+ goto next_page;
+ }
+ if (foll_flags & FOLL_WRITE)
+ fault_flags |= FAULT_FLAG_WRITE;
+ if (nonblocking)
+ fault_flags |= FAULT_FLAG_ALLOW_RETRY;
+ if (foll_flags & FOLL_NOWAIT)
+ fault_flags |= (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT);
ret = handle_mm_fault(mm, vma, start,
- (foll_flags & FOLL_WRITE) ?
- FAULT_FLAG_WRITE : 0);
+ fault_flags);
if (ret & VM_FAULT_ERROR) {
if (ret & VM_FAULT_OOM)
return i ? i : -ENOMEM;
- if (ret &
- (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE|
- VM_FAULT_SIGBUS))
+ if (ret & (VM_FAULT_HWPOISON |
+ VM_FAULT_HWPOISON_LARGE)) {
+ if (i)
+ return i;
+ else if (gup_flags & FOLL_HWPOISON)
+ return -EHWPOISON;
+ else
+ return -EFAULT;
+ }
+ if (ret & VM_FAULT_SIGBUS)
return i ? i : -EFAULT;
BUG();
}
- if (ret & VM_FAULT_MAJOR)
- tsk->maj_flt++;
- else
- tsk->min_flt++;
+
+ if (tsk) {
+ if (ret & VM_FAULT_MAJOR)
+ tsk->maj_flt++;
+ else
+ tsk->min_flt++;
+ }
+
+ if (ret & VM_FAULT_RETRY) {
+ if (nonblocking)
+ *nonblocking = 0;
+ return i;
+ }
/*
* The VM_FAULT_WRITE bit tells us that
@@ -1486,6 +1637,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
flush_anon_page(vma, page, start);
flush_dcache_page(page);
}
+next_page:
if (vmas)
vmas[i] = vma;
i++;
@@ -1495,10 +1647,12 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
} while (nr_pages);
return i;
}
+EXPORT_SYMBOL(__get_user_pages);
/**
* get_user_pages() - pin user pages in memory
- * @tsk: task_struct of target task
+ * @tsk: the task_struct to use for page fault accounting, or
+ * NULL if faults are not to be recorded.
* @mm: mm_struct of target mm
* @start: starting user address
* @nr_pages: number of pages from start to pin
@@ -1559,7 +1713,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
if (force)
flags |= FOLL_FORCE;
- return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas);
+ return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas,
+ NULL);
}
EXPORT_SYMBOL(get_user_pages);
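
A hedged usage sketch of the interface documented above (not part of this patch; the helper name is made up): pin a single page of the current process for reading and hand it back to the caller, who must drop the reference with put_page() when done.

	static int example_peek_user_page(unsigned long addr, struct page **pagep)
	{
		struct mm_struct *mm = current->mm;
		int ret;

		down_read(&mm->mmap_sem);
		/* the tsk argument may now be NULL if no fault accounting is wanted */
		ret = get_user_pages(current, mm, addr & PAGE_MASK, 1,
				     0 /* write */, 0 /* force */, pagep, NULL);
		up_read(&mm->mmap_sem);

		return ret < 1 ? (ret < 0 ? ret : -EFAULT) : 0;
	}
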
@@ -1584,7 +1739,8 @@ struct page *get_dump_page(unsigned long addr)
struct page *page;
if (__get_user_pages(current, current->mm, addr, 1,
- FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma) < 1)
+ FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma,
+ NULL) < 1)
return NULL;
flush_cache_page(vma, addr, page_to_pfn(page));
return page;
@@ -1598,8 +1754,10 @@ pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
pud_t * pud = pud_alloc(mm, pgd, addr);
if (pud) {
pmd_t * pmd = pmd_alloc(mm, pud, addr);
- if (pmd)
+ if (pmd) {
+ VM_BUG_ON(pmd_trans_huge(*pmd));
return pte_alloc_map_lock(mm, pmd, addr, ptl);
+ }
}
return NULL;
}
@@ -1818,6 +1976,7 @@ static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
pmd = pmd_alloc(mm, pud, addr);
if (!pmd)
return -ENOMEM;
+ VM_BUG_ON(pmd_trans_huge(*pmd));
do {
next = pmd_addr_end(addr, end);
if (remap_pte_range(mm, pmd, addr, next,
@@ -2027,10 +2186,10 @@ EXPORT_SYMBOL_GPL(apply_to_page_range);
* handle_pte_fault chooses page fault handler according to an entry
* which was read non-atomically. Before making any commitment, on
* those architectures or configurations (e.g. i386 with PAE) which
- * might give a mix of unmatched parts, do_swap_page and do_file_page
+ * might give a mix of unmatched parts, do_swap_page and do_nonlinear_fault
* must check under lock before unmapping the pte and proceeding
* (but do_wp_page is only called after already making such a check;
- * and do_anonymous_page and do_no_page can safely check later on).
+ * and do_anonymous_page can safely check later on).
*/
static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
pte_t *page_table, pte_t orig_pte)
@@ -2048,19 +2207,6 @@ static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
return same;
}
-/*
- * Do pte_mkwrite, but only if the vma says VM_WRITE. We do this when
- * servicing faults for write access. In the normal case, do always want
- * pte_mkwrite. But get_user_pages can cause write faults for mappings
- * that do not have writing enabled, when used by access_process_vm.
- */
-static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
-{
- if (likely(vma->vm_flags & VM_WRITE))
- pte = pte_mkwrite(pte);
- return pte;
-}
-
static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma)
{
/*
@@ -2112,7 +2258,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
{
struct page *old_page, *new_page;
pte_t entry;
- int reuse = 0, ret = 0;
+ int ret = 0;
int page_mkwrite = 0;
struct page *dirty_page = NULL;
@@ -2144,19 +2290,20 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
&ptl);
if (!pte_same(*page_table, orig_pte)) {
unlock_page(old_page);
- page_cache_release(old_page);
goto unlock;
}
page_cache_release(old_page);
}
- reuse = reuse_swap_page(old_page);
- if (reuse)
+ if (reuse_swap_page(old_page)) {
/*
* The page is all ours. Move it to our anon_vma so
* the rmap code will not search our parent or siblings.
* Protected against the rmap code by the page lock.
*/
page_move_anon_rmap(old_page, vma, address);
+ unlock_page(old_page);
+ goto reuse;
+ }
unlock_page(old_page);
} else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
(VM_WRITE|VM_SHARED))) {
@@ -2212,7 +2359,6 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
&ptl);
if (!pte_same(*page_table, orig_pte)) {
unlock_page(old_page);
- page_cache_release(old_page);
goto unlock;
}
@@ -2220,18 +2366,52 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
}
dirty_page = old_page;
get_page(dirty_page);
- reuse = 1;
- }
- if (reuse) {
reuse:
flush_cache_page(vma, address, pte_pfn(orig_pte));
entry = pte_mkyoung(orig_pte);
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
if (ptep_set_access_flags(vma, address, page_table, entry,1))
update_mmu_cache(vma, address, page_table);
+ pte_unmap_unlock(page_table, ptl);
ret |= VM_FAULT_WRITE;
- goto unlock;
+
+ if (!dirty_page)
+ return ret;
+
+ /*
+ * Yes, Virginia, this is actually required to prevent a race
+ * with clear_page_dirty_for_io() from clearing the page dirty
+	 * bit after it clears all dirty ptes, but before a racing
+ * do_wp_page installs a dirty pte.
+ *
+ * __do_fault is protected similarly.
+ */
+ if (!page_mkwrite) {
+ wait_on_page_locked(dirty_page);
+ set_page_dirty_balance(dirty_page, page_mkwrite);
+ }
+ put_page(dirty_page);
+ if (page_mkwrite) {
+ struct address_space *mapping = dirty_page->mapping;
+
+ set_page_dirty(dirty_page);
+ unlock_page(dirty_page);
+ page_cache_release(dirty_page);
+ if (mapping) {
+ /*
+ * Some device drivers do not set page.mapping
+ * but still dirty their pages
+ */
+ balance_dirty_pages_ratelimited(mapping);
+ }
+ }
+
+ /* file_update_time outside page_lock */
+ if (vma->vm_file)
+ file_update_time(vma->vm_file);
+
+ return ret;
}
/*
@@ -2256,16 +2436,6 @@ gotten:
}
__SetPageUptodate(new_page);
- /*
- * Don't let another task, with possibly unlocked vma,
- * keep the mlocked page.
- */
- if ((vma->vm_flags & VM_LOCKED) && old_page) {
- lock_page(old_page); /* for LRU manipulation */
- clear_page_mlock(old_page);
- unlock_page(old_page);
- }
-
if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))
goto oom_free_new;
@@ -2333,42 +2503,19 @@ gotten:
if (new_page)
page_cache_release(new_page);
- if (old_page)
- page_cache_release(old_page);
unlock:
pte_unmap_unlock(page_table, ptl);
- if (dirty_page) {
+ if (old_page) {
/*
- * Yes, Virginia, this is actually required to prevent a race
- * with clear_page_dirty_for_io() from clearing the page dirty
- * bit after it clear all dirty ptes, but before a racing
- * do_wp_page installs a dirty pte.
- *
- * do_no_page is protected similarly.
+ * Don't let another task, with possibly unlocked vma,
+ * keep the mlocked page.
*/
- if (!page_mkwrite) {
- wait_on_page_locked(dirty_page);
- set_page_dirty_balance(dirty_page, page_mkwrite);
- }
- put_page(dirty_page);
- if (page_mkwrite) {
- struct address_space *mapping = dirty_page->mapping;
-
- set_page_dirty(dirty_page);
- unlock_page(dirty_page);
- page_cache_release(dirty_page);
- if (mapping) {
- /*
- * Some device drivers do not set page.mapping
- * but still dirty their pages
- */
- balance_dirty_pages_ratelimited(mapping);
- }
+ if ((ret & VM_FAULT_WRITE) && (vma->vm_flags & VM_LOCKED)) {
+ lock_page(old_page); /* LRU manipulation */
+ munlock_vma_page(old_page);
+ unlock_page(old_page);
}
-
- /* file_update_time outside page_lock */
- if (vma->vm_file)
- file_update_time(vma->vm_file);
+ page_cache_release(old_page);
}
return ret;
oom_free_new:
@@ -2572,6 +2719,7 @@ void unmap_mapping_range(struct address_space *mapping,
details.last_index = ULONG_MAX;
details.i_mmap_lock = &mapping->i_mmap_lock;
+ mutex_lock(&mapping->unmap_mutex);
spin_lock(&mapping->i_mmap_lock);
/* Protect against endless unmapping loops */
@@ -2588,6 +2736,7 @@ void unmap_mapping_range(struct address_space *mapping,
if (unlikely(!list_empty(&mapping->i_mmap_nonlinear)))
unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details);
spin_unlock(&mapping->i_mmap_lock);
+ mutex_unlock(&mapping->unmap_mutex);
}
EXPORT_SYMBOL(unmap_mapping_range);
@@ -2629,7 +2778,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
swp_entry_t entry;
pte_t pte;
int locked;
- struct mem_cgroup *ptr = NULL;
+ struct mem_cgroup *ptr;
int exclusive = 0;
int ret = 0;
@@ -2975,12 +3124,6 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
goto out;
}
charged = 1;
- /*
- * Don't let another task, with possibly unlocked vma,
- * keep the mlocked page.
- */
- if (vma->vm_flags & VM_LOCKED)
- clear_page_mlock(vmf.page);
copy_user_highpage(page, vmf.page, address, vma);
__SetPageUptodate(page);
} else {
@@ -3147,9 +3290,9 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
* but allow concurrent faults), and pte mapped but not yet locked.
* We return with mmap_sem still held, but pte unmapped and unlocked.
*/
-static inline int handle_pte_fault(struct mm_struct *mm,
- struct vm_area_struct *vma, unsigned long address,
- pte_t *pte, pmd_t *pmd, unsigned int flags)
+int handle_pte_fault(struct mm_struct *mm,
+ struct vm_area_struct *vma, unsigned long address,
+ pte_t *pte, pmd_t *pmd, unsigned int flags)
{
pte_t entry;
spinlock_t *ptl;
@@ -3228,9 +3371,40 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
pmd = pmd_alloc(mm, pud, address);
if (!pmd)
return VM_FAULT_OOM;
- pte = pte_alloc_map(mm, pmd, address);
- if (!pte)
+ if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) {
+ if (!vma->vm_ops)
+ return do_huge_pmd_anonymous_page(mm, vma, address,
+ pmd, flags);
+ } else {
+ pmd_t orig_pmd = *pmd;
+ barrier();
+ if (pmd_trans_huge(orig_pmd)) {
+ if (flags & FAULT_FLAG_WRITE &&
+ !pmd_write(orig_pmd) &&
+ !pmd_trans_splitting(orig_pmd))
+ return do_huge_pmd_wp_page(mm, vma, address,
+ pmd, orig_pmd);
+ return 0;
+ }
+ }
+
+ /*
+ * Use __pte_alloc instead of pte_alloc_map, because we can't
+	 * run pte_offset_map on the pmd, if a huge pmd could
+ * materialize from under us from a different thread.
+ */
+ if (unlikely(pmd_none(*pmd)) && __pte_alloc(mm, vma, pmd, address))
return VM_FAULT_OOM;
+	/* if a huge pmd materialized from under us just retry later */
+ if (unlikely(pmd_trans_huge(*pmd)))
+ return 0;
+ /*
+ * A regular pmd is established and it can't morph into a huge pmd
+ * from under us anymore at this point because we hold the mmap_sem
+ * read mode and khugepaged takes it in write mode. So now it's
+ * safe to run pte_offset_map().
+ */
+ pte = pte_offset_map(pmd, address);
return handle_pte_fault(mm, vma, address, pte, pmd, flags);
}
@@ -3296,7 +3470,12 @@ int make_pages_present(unsigned long addr, unsigned long end)
vma = find_vma(current->mm, addr);
if (!vma)
return -ENOMEM;
- write = (vma->vm_flags & VM_WRITE) != 0;
+ /*
+ * We want to touch writable mappings with a write fault in order
+ * to break COW, except for shared mappings because these don't COW
+ * and we would not want to dirty them for nothing.
+ */
+ write = (vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE;
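	/*
	 * Editor's note (illustrative, not part of this patch): the
	 * mask-and-compare above is true only for private writable
	 * mappings, i.e. VM_WRITE set and VM_SHARED clear; shared
	 * writable and read-only mappings both evaluate to false and
	 * are therefore faulted in without a write fault.
	 */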
BUG_ON(addr >= end);
BUG_ON(end > vma->vm_end);
len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE;
@@ -3331,7 +3510,7 @@ static int __init gate_vma_init(void)
__initcall(gate_vma_init);
#endif
-struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
+struct vm_area_struct *get_gate_vma(struct mm_struct *mm)
{
#ifdef AT_SYSINFO_EHDR
return &gate_vma;
@@ -3340,7 +3519,7 @@ struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
#endif
}
-int in_gate_area_no_task(unsigned long addr)
+int in_gate_area_no_mm(unsigned long addr)
{
#ifdef AT_SYSINFO_EHDR
if ((addr >= FIXADDR_USER_START) && (addr < FIXADDR_USER_END))
@@ -3368,6 +3547,7 @@ static int __follow_pte(struct mm_struct *mm, unsigned long address,
goto out;
pmd = pmd_offset(pud, address);
+ VM_BUG_ON(pmd_trans_huge(*pmd));
if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
goto out;
@@ -3480,20 +3660,15 @@ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
#endif
/*
- * Access another process' address space.
- * Source/target buffer must be kernel space,
- * Do not walk the page table directly, use get_user_pages
+ * Access another process' address space as given in mm. If non-NULL, use the
+ * given task for page fault accounting.
*/
-int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write)
+static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
+ unsigned long addr, void *buf, int len, int write)
{
- struct mm_struct *mm;
struct vm_area_struct *vma;
void *old_buf = buf;
- mm = get_task_mm(tsk);
- if (!mm)
- return 0;
-
down_read(&mm->mmap_sem);
/* ignore errors, just check how much was successfully transferred */
while (len) {
@@ -3510,7 +3685,7 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in
*/
#ifdef CONFIG_HAVE_IOREMAP_PROT
vma = find_vma(mm, addr);
- if (!vma)
+ if (!vma || vma->vm_start > addr)
break;
if (vma->vm_ops && vma->vm_ops->access)
ret = vma->vm_ops->access(vma, addr, buf,
@@ -3542,11 +3717,47 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in
addr += bytes;
}
up_read(&mm->mmap_sem);
- mmput(mm);
return buf - old_buf;
}
+/**
+ * access_remote_vm - access another process' address space
+ * @mm: the mm_struct of the target address space
+ * @addr: start address to access
+ * @buf: source or destination buffer
+ * @len: number of bytes to transfer
+ * @write: whether the access is a write
+ *
+ * The caller must hold a reference on @mm.
+ */
+int access_remote_vm(struct mm_struct *mm, unsigned long addr,
+ void *buf, int len, int write)
+{
+ return __access_remote_vm(NULL, mm, addr, buf, len, write);
+}
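/*
 * Editor's note: a minimal caller sketch, not part of this patch. It assumes
 * "mm" (with a reference already held) and "addr" are in scope:
 *
 *	char buf[64];
 *	int copied = access_remote_vm(mm, addr, buf, sizeof(buf), 0);
 *	if (copied < (int)sizeof(buf))
 *		;	/* short read, e.g. part of the range was unmapped */
 */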
+
+/*
+ * Access another process' address space.
+ * Source/target buffer must be kernel space,
+ * Do not walk the page table directly, use get_user_pages
+ */
+int access_process_vm(struct task_struct *tsk, unsigned long addr,
+ void *buf, int len, int write)
+{
+ struct mm_struct *mm;
+ int ret;
+
+ mm = get_task_mm(tsk);
+ if (!mm)
+ return 0;
+
+ ret = __access_remote_vm(tsk, mm, addr, buf, len, write);
+ mmput(mm);
+
+ return ret;
+}
+
/*
* Print the name of a VMA.
*/
@@ -3608,3 +3819,74 @@ void might_fault(void)
}
EXPORT_SYMBOL(might_fault);
#endif
+
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
+static void clear_gigantic_page(struct page *page,
+ unsigned long addr,
+ unsigned int pages_per_huge_page)
+{
+ int i;
+ struct page *p = page;
+
+ might_sleep();
+ for (i = 0; i < pages_per_huge_page;
+ i++, p = mem_map_next(p, page, i)) {
+ cond_resched();
+ clear_user_highpage(p, addr + i * PAGE_SIZE);
+ }
+}
+void clear_huge_page(struct page *page,
+ unsigned long addr, unsigned int pages_per_huge_page)
+{
+ int i;
+
+ if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
+ clear_gigantic_page(page, addr, pages_per_huge_page);
+ return;
+ }
+
+ might_sleep();
+ for (i = 0; i < pages_per_huge_page; i++) {
+ cond_resched();
+ clear_user_highpage(page + i, addr + i * PAGE_SIZE);
+ }
+}
+
+static void copy_user_gigantic_page(struct page *dst, struct page *src,
+ unsigned long addr,
+ struct vm_area_struct *vma,
+ unsigned int pages_per_huge_page)
+{
+ int i;
+ struct page *dst_base = dst;
+ struct page *src_base = src;
+
+ for (i = 0; i < pages_per_huge_page; ) {
+ cond_resched();
+ copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma);
+
+ i++;
+ dst = mem_map_next(dst, dst_base, i);
+ src = mem_map_next(src, src_base, i);
+ }
+}
+
+void copy_user_huge_page(struct page *dst, struct page *src,
+ unsigned long addr, struct vm_area_struct *vma,
+ unsigned int pages_per_huge_page)
+{
+ int i;
+
+ if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
+ copy_user_gigantic_page(dst, src, addr, vma,
+ pages_per_huge_page);
+ return;
+ }
+
+ might_sleep();
+ for (i = 0; i < pages_per_huge_page; i++) {
+ cond_resched();
+ copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma);
+ }
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */
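Editor's note (illustrative sketch, not part of this patch): a hugetlb or THP
fault path would typically use these helpers on a freshly allocated compound
page; assuming an hstate "h", pages "hpage"/"old_hpage", the vma, and a
huge-page-aligned fault address "haddr" are in scope, the calls look roughly
like:

	clear_huge_page(hpage, haddr, pages_per_huge_page(h));
	copy_user_huge_page(hpage, old_hpage, haddr, vma, pages_per_huge_page(h));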
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 2c6523af5473..9ca1d604f7cd 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -82,9 +82,10 @@ static void release_memory_resource(struct resource *res)
#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
#ifndef CONFIG_SPARSEMEM_VMEMMAP
-static void get_page_bootmem(unsigned long info, struct page *page, int type)
+static void get_page_bootmem(unsigned long info, struct page *page,
+ unsigned long type)
{
- atomic_set(&page->_mapcount, type);
+ page->lru.next = (struct list_head *) type;
SetPagePrivate(page);
set_page_private(page, info);
atomic_inc(&page->_count);
@@ -94,15 +95,16 @@ static void get_page_bootmem(unsigned long info, struct page *page, int type)
* so use __ref to tell modpost not to generate a warning */
void __ref put_page_bootmem(struct page *page)
{
- int type;
+ unsigned long type;
- type = atomic_read(&page->_mapcount);
- BUG_ON(type >= -1);
+ type = (unsigned long) page->lru.next;
+ BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE ||
+ type > MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE);
if (atomic_dec_return(&page->_count) == 1) {
ClearPagePrivate(page);
set_page_private(page, 0);
- reset_page_mapcount(page);
+ INIT_LIST_HEAD(&page->lru);
__free_pages_bootmem(page, 0);
}
@@ -373,7 +375,7 @@ void online_page(struct page *page)
#endif
#ifdef CONFIG_FLATMEM
- max_mapnr = max(page_to_pfn(page), max_mapnr);
+ max_mapnr = max(pfn, max_mapnr);
#endif
ClearPageReserved(page);
@@ -407,6 +409,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
int ret;
struct memory_notify arg;
+ lock_memory_hotplug();
arg.start_pfn = pfn;
arg.nr_pages = nr_pages;
arg.status_change_nid = -1;
@@ -419,6 +422,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
ret = notifier_to_errno(ret);
if (ret) {
memory_notify(MEM_CANCEL_ONLINE, &arg);
+ unlock_memory_hotplug();
return ret;
}
/*
@@ -443,6 +447,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
printk(KERN_DEBUG "online_pages %lx at %lx failed\n",
nr_pages, pfn);
memory_notify(MEM_CANCEL_ONLINE, &arg);
+ unlock_memory_hotplug();
return ret;
}
@@ -467,6 +472,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
if (onlined_pages)
memory_notify(MEM_ONLINE, &arg);
+ unlock_memory_hotplug();
return 0;
}
@@ -718,7 +724,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
pfn);
dump_page(page);
#endif
- /* Becasue we don't have big zone->lock. we should
+ /* Because we don't have big zone->lock. we should
check this again here. */
if (page_count(page)) {
not_managed++;
@@ -733,7 +739,8 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
goto out;
}
/* this function returns # of failed pages */
- ret = migrate_pages(&source, hotremove_migrate_alloc, 0, 1);
+ ret = migrate_pages(&source, hotremove_migrate_alloc, 0,
+ true, true);
if (ret)
putback_lru_pages(&source);
}
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 11ff260fb282..959a8b8c7350 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -514,6 +514,7 @@ static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
pmd = pmd_offset(pud, addr);
do {
next = pmd_addr_end(addr, end);
+ split_huge_page_pmd(vma->vm_mm, pmd);
if (pmd_none_or_clear_bad(pmd))
continue;
if (check_pte_range(vma, pmd, addr, next, nodes,
@@ -935,7 +936,8 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
return PTR_ERR(vma);
if (!list_empty(&pagelist)) {
- err = migrate_pages(&pagelist, new_node_page, dest, 0);
+ err = migrate_pages(&pagelist, new_node_page, dest,
+ false, true);
if (err)
putback_lru_pages(&pagelist);
}
@@ -991,7 +993,7 @@ int do_migrate_pages(struct mm_struct *mm,
* most recent <s, d> pair that moved (s != d). If we find a pair
* that not only moved, but what's better, moved to an empty slot
* (d is not set in tmp), then we break out then, with that pair.
- * Otherwise when we finish scannng from_tmp, we at least have the
+ * Otherwise when we finish scanning from_tmp, we at least have the
* most recent <s, d> pair that moved. If we get all the way through
* the scan of tmp without finding any node that moved, much less
* moved to an empty node, then there is nothing left worth migrating.
@@ -1155,7 +1157,8 @@ static long do_mbind(unsigned long start, unsigned long len,
if (!list_empty(&pagelist)) {
nr_failed = migrate_pages(&pagelist, new_vma_page,
- (unsigned long)vma, 0);
+ (unsigned long)vma,
+ false, true);
if (nr_failed)
putback_lru_pages(&pagelist);
}
@@ -1308,16 +1311,13 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
/* Find the mm_struct */
rcu_read_lock();
- read_lock(&tasklist_lock);
task = pid ? find_task_by_vpid(pid) : current;
if (!task) {
- read_unlock(&tasklist_lock);
rcu_read_unlock();
err = -ESRCH;
goto out;
}
mm = get_task_mm(task);
- read_unlock(&tasklist_lock);
rcu_read_unlock();
err = -EINVAL;
@@ -1524,10 +1524,9 @@ static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
}
/* Return a zonelist indicated by gfp for node representing a mempolicy */
-static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy)
+static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy,
+ int nd)
{
- int nd = numa_node_id();
-
switch (policy->mode) {
case MPOL_PREFERRED:
if (!(policy->flags & MPOL_F_LOCAL))
@@ -1679,7 +1678,7 @@ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
zl = node_zonelist(interleave_nid(*mpol, vma, addr,
huge_page_shift(hstate_vma(vma))), gfp_flags);
} else {
- zl = policy_zonelist(gfp_flags, *mpol);
+ zl = policy_zonelist(gfp_flags, *mpol, numa_node_id());
if ((*mpol)->mode == MPOL_BIND)
*nodemask = &(*mpol)->v.nodes;
}
@@ -1796,7 +1795,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
}
/**
- * alloc_page_vma - Allocate a page for a VMA.
+ * alloc_pages_vma - Allocate a page for a VMA.
*
* @gfp:
* %GFP_USER user allocation.
@@ -1805,6 +1804,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
* %GFP_FS allocation should not call back into a file system.
* %GFP_ATOMIC don't sleep.
*
+ * @order: Order of the GFP allocation.
* @vma: Pointer to VMA or NULL if not available.
* @addr: Virtual Address of the allocation. Must be inside the VMA.
*
@@ -1818,7 +1818,8 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
 * Should be called with the mm_sem of the vma held.
*/
struct page *
-alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
+alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
+ unsigned long addr, int node)
{
struct mempolicy *pol = get_vma_policy(current, vma, addr);
struct zonelist *zl;
@@ -1828,18 +1829,18 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
unsigned nid;
- nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
+ nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
mpol_cond_put(pol);
- page = alloc_page_interleave(gfp, 0, nid);
+ page = alloc_page_interleave(gfp, order, nid);
put_mems_allowed();
return page;
}
- zl = policy_zonelist(gfp, pol);
+ zl = policy_zonelist(gfp, pol, node);
if (unlikely(mpol_needs_cond_ref(pol))) {
/*
* slow path: ref counted shared policy
*/
- struct page *page = __alloc_pages_nodemask(gfp, 0,
+ struct page *page = __alloc_pages_nodemask(gfp, order,
zl, policy_nodemask(gfp, pol));
__mpol_put(pol);
put_mems_allowed();
@@ -1848,7 +1849,8 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
/*
* fast path: default or task policy
*/
- page = __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol));
+ page = __alloc_pages_nodemask(gfp, order, zl,
+ policy_nodemask(gfp, pol));
put_mems_allowed();
return page;
}
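Editor's note (illustrative, not part of this patch): with the new order and
node parameters, the THP fault path in mm/huge_memory.c can allocate huge
pages through the same policy logic, roughly:

	page = alloc_pages_vma(gfp, HPAGE_PMD_ORDER, vma, haddr, numa_node_id());

where "gfp" and the huge-page-aligned address "haddr" are assumed to come
from the caller.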
@@ -1889,7 +1891,8 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
else
page = __alloc_pages_nodemask(gfp, order,
- policy_zonelist(gfp, pol), policy_nodemask(gfp, pol));
+ policy_zonelist(gfp, pol, numa_node_id()),
+ policy_nodemask(gfp, pol));
put_mems_allowed();
return page;
}
@@ -1976,8 +1979,7 @@ int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
case MPOL_INTERLEAVE:
return nodes_equal(a->v.nodes, b->v.nodes);
case MPOL_PREFERRED:
- return a->v.preferred_node == b->v.preferred_node &&
- a->flags == b->flags;
+ return a->v.preferred_node == b->v.preferred_node;
default:
BUG();
return 0;
diff --git a/mm/migrate.c b/mm/migrate.c
index fe5a3c6a5426..34132f8e9109 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -35,6 +35,8 @@
#include <linux/hugetlb.h>
#include <linux/gfp.h>
+#include <asm/tlbflush.h>
+
#include "internal.h"
#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
@@ -111,6 +113,8 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
goto out;
pmd = pmd_offset(pud, addr);
+ if (pmd_trans_huge(*pmd))
+ goto out;
if (!pmd_present(*pmd))
goto out;
@@ -244,7 +248,7 @@ static int migrate_page_move_mapping(struct address_space *mapping,
expected_count = 2 + page_has_private(page);
if (page_count(page) != expected_count ||
- (struct page *)radix_tree_deref_slot(pslot) != page) {
+ radix_tree_deref_slot_protected(pslot, &mapping->tree_lock) != page) {
spin_unlock_irq(&mapping->tree_lock);
return -EAGAIN;
}
@@ -316,7 +320,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
expected_count = 2 + page_has_private(page);
if (page_count(page) != expected_count ||
- (struct page *)radix_tree_deref_slot(pslot) != page) {
+ radix_tree_deref_slot_protected(pslot, &mapping->tree_lock) != page) {
spin_unlock_irq(&mapping->tree_lock);
return -EAGAIN;
}
@@ -371,7 +375,7 @@ void migrate_page_copy(struct page *newpage, struct page *page)
* redo the accounting that clear_page_dirty_for_io undid,
* but we can't use set_page_dirty because that function
* is actually a signal that all of the page has become dirty.
- * Wheras only part of our page may be dirty.
+ * Whereas only part of our page may be dirty.
*/
__set_page_dirty_nobuffers(newpage);
}
@@ -560,7 +564,7 @@ static int fallback_migrate_page(struct address_space *mapping,
* == 0 - success
*/
static int move_to_new_page(struct page *newpage, struct page *page,
- int remap_swapcache)
+ int remap_swapcache, bool sync)
{
struct address_space *mapping;
int rc;
@@ -582,18 +586,28 @@ static int move_to_new_page(struct page *newpage, struct page *page,
mapping = page_mapping(page);
if (!mapping)
rc = migrate_page(mapping, newpage, page);
- else if (mapping->a_ops->migratepage)
+ else {
/*
- * Most pages have a mapping and most filesystems
- * should provide a migration function. Anonymous
- * pages are part of swap space which also has its
- * own migration function. This is the most common
- * path for page migration.
+ * Do not writeback pages if !sync and migratepage is
+ * not pointing to migrate_page() which is nonblocking
+ * (swapcache/tmpfs uses migratepage = migrate_page).
*/
- rc = mapping->a_ops->migratepage(mapping,
- newpage, page);
- else
- rc = fallback_migrate_page(mapping, newpage, page);
+ if (PageDirty(page) && !sync &&
+ mapping->a_ops->migratepage != migrate_page)
+ rc = -EBUSY;
+ else if (mapping->a_ops->migratepage)
+ /*
+ * Most pages have a mapping and most filesystems
+ * should provide a migration function. Anonymous
+ * pages are part of swap space which also has its
+ * own migration function. This is the most common
+ * path for page migration.
+ */
+ rc = mapping->a_ops->migratepage(mapping,
+ newpage, page);
+ else
+ rc = fallback_migrate_page(mapping, newpage, page);
+ }
if (rc) {
newpage->mapping = NULL;
@@ -612,15 +626,14 @@ static int move_to_new_page(struct page *newpage, struct page *page,
* to the newly allocated page in newpage.
*/
static int unmap_and_move(new_page_t get_new_page, unsigned long private,
- struct page *page, int force, int offlining)
+ struct page *page, int force, bool offlining, bool sync)
{
int rc = 0;
int *result = NULL;
struct page *newpage = get_new_page(page, private, &result);
int remap_swapcache = 1;
- int rcu_locked = 0;
int charge = 0;
- struct mem_cgroup *mem = NULL;
+ struct mem_cgroup *mem;
struct anon_vma *anon_vma = NULL;
if (!newpage)
@@ -630,13 +643,33 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
/* page was freed from under us. So we are done. */
goto move_newpage;
}
+ if (unlikely(PageTransHuge(page)))
+ if (unlikely(split_huge_page(page)))
+ goto move_newpage;
/* prepare cgroup just returns 0 or -ENOMEM */
rc = -EAGAIN;
if (!trylock_page(page)) {
- if (!force)
+ if (!force || !sync)
goto move_newpage;
+
+ /*
+ * It's not safe for direct compaction to call lock_page.
+ * For example, during page readahead pages are added locked
+ * to the LRU. Later, when the IO completes the pages are
+ * marked uptodate and unlocked. However, the queueing
+ * could be merging multiple pages for one bio (e.g.
+ * mpage_readpages). If an allocation happens for the
+ * second or third page, the process can end up locking
+ * the same page twice and deadlocking. Rather than
+ * trying to be clever about what pages can be locked,
+ * avoid the use of lock_page for direct compaction
+ * altogether.
+ */
+ if (current->flags & PF_MEMALLOC)
+ goto move_newpage;
+
lock_page(page);
}
@@ -655,7 +688,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
}
/* charge against new page */
- charge = mem_cgroup_prepare_migration(page, newpage, &mem);
+ charge = mem_cgroup_prepare_migration(page, newpage, &mem, GFP_KERNEL);
if (charge == -ENOMEM) {
rc = -ENOMEM;
goto unlock;
@@ -663,6 +696,14 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
BUG_ON(charge);
if (PageWriteback(page)) {
+ /*
+ * For !sync, there is no point retrying as the retry loop
+ * is expected to be too short for PageWriteback to be cleared
+ */
+ if (!sync) {
+ rc = -EBUSY;
+ goto uncharge;
+ }
if (!force)
goto uncharge;
wait_on_page_writeback(page);
@@ -670,20 +711,26 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
/*
* By try_to_unmap(), page->mapcount goes down to 0 here. In this case,
 * we cannot notice that anon_vma is freed while we migrate a page.
- * This rcu_read_lock() delays freeing anon_vma pointer until the end
+ * This get_anon_vma() delays freeing anon_vma pointer until the end
* of migration. File cache pages are no problem because of page_lock()
* File Caches may use write_page() or lock_page() in migration, then,
* just care Anon page here.
*/
if (PageAnon(page)) {
- rcu_read_lock();
- rcu_locked = 1;
-
- /* Determine how to safely use anon_vma */
- if (!page_mapped(page)) {
- if (!PageSwapCache(page))
- goto rcu_unlock;
-
+ /*
+ * Only page_lock_anon_vma() understands the subtleties of
+ * getting a hold on an anon_vma from outside one of its mms.
+ */
+ anon_vma = page_lock_anon_vma(page);
+ if (anon_vma) {
+ /*
+ * Take a reference count on the anon_vma if the
+ * page is mapped so that it is guaranteed to
+ * exist when the page is remapped later
+ */
+ get_anon_vma(anon_vma);
+ page_unlock_anon_vma(anon_vma);
+ } else if (PageSwapCache(page)) {
/*
* We cannot be sure that the anon_vma of an unmapped
* swapcache page is safe to use because we don't
@@ -698,13 +745,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
*/
remap_swapcache = 0;
} else {
- /*
- * Take a reference count on the anon_vma if the
- * page is mapped so that it is guaranteed to
- * exist when the page is remapped later
- */
- anon_vma = page_anon_vma(page);
- get_anon_vma(anon_vma);
+ goto uncharge;
}
}
@@ -721,16 +762,10 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
* free the metadata, so the page can be freed.
*/
if (!page->mapping) {
- if (!PageAnon(page) && page_has_private(page)) {
- /*
- * Go direct to try_to_free_buffers() here because
- * a) that's what try_to_release_page() would do anyway
- * b) we may be under rcu_read_lock() here, so we can't
- * use GFP_KERNEL which is what try_to_release_page()
- * needs to be effective.
- */
+ VM_BUG_ON(PageAnon(page));
+ if (page_has_private(page)) {
try_to_free_buffers(page);
- goto rcu_unlock;
+ goto uncharge;
}
goto skip_unmap;
}
@@ -740,24 +775,22 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
skip_unmap:
if (!page_mapped(page))
- rc = move_to_new_page(newpage, page, remap_swapcache);
+ rc = move_to_new_page(newpage, page, remap_swapcache, sync);
if (rc && remap_swapcache)
remove_migration_ptes(page, page);
-rcu_unlock:
/* Drop an anon_vma reference if we took one */
if (anon_vma)
- drop_anon_vma(anon_vma);
+ put_anon_vma(anon_vma);
- if (rcu_locked)
- rcu_read_unlock();
uncharge:
if (!charge)
- mem_cgroup_end_migration(mem, page, newpage);
+ mem_cgroup_end_migration(mem, page, newpage, rc == 0);
unlock:
unlock_page(page);
+move_newpage:
if (rc != -EAGAIN) {
/*
* A page that has been migrated has all references
@@ -771,8 +804,6 @@ unlock:
putback_lru_page(page);
}
-move_newpage:
-
/*
* Move the new page to the LRU. If migration was not successful
* then this will free the page.
@@ -808,12 +839,11 @@ move_newpage:
*/
static int unmap_and_move_huge_page(new_page_t get_new_page,
unsigned long private, struct page *hpage,
- int force, int offlining)
+ int force, bool offlining, bool sync)
{
int rc = 0;
int *result = NULL;
struct page *new_hpage = get_new_page(hpage, private, &result);
- int rcu_locked = 0;
struct anon_vma *anon_vma = NULL;
if (!new_hpage)
@@ -822,39 +852,29 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
rc = -EAGAIN;
if (!trylock_page(hpage)) {
- if (!force)
+ if (!force || !sync)
goto out;
lock_page(hpage);
}
if (PageAnon(hpage)) {
- rcu_read_lock();
- rcu_locked = 1;
-
- if (page_mapped(hpage)) {
- anon_vma = page_anon_vma(hpage);
- atomic_inc(&anon_vma->external_refcount);
+ anon_vma = page_lock_anon_vma(hpage);
+ if (anon_vma) {
+ get_anon_vma(anon_vma);
+ page_unlock_anon_vma(anon_vma);
}
}
try_to_unmap(hpage, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
if (!page_mapped(hpage))
- rc = move_to_new_page(new_hpage, hpage, 1);
+ rc = move_to_new_page(new_hpage, hpage, 1, sync);
if (rc)
remove_migration_ptes(hpage, hpage);
- if (anon_vma && atomic_dec_and_lock(&anon_vma->external_refcount,
- &anon_vma->lock)) {
- int empty = list_empty(&anon_vma->head);
- spin_unlock(&anon_vma->lock);
- if (empty)
- anon_vma_free(anon_vma);
- }
-
- if (rcu_locked)
- rcu_read_unlock();
+ if (anon_vma)
+ put_anon_vma(anon_vma);
out:
unlock_page(hpage);
@@ -885,12 +905,13 @@ out:
* are movable anymore because to has become empty
* or no retryable pages exist anymore.
* Caller should call putback_lru_pages to return pages to the LRU
- * or free list.
+ * or free list only if ret != 0.
*
* Return: Number of pages not migrated or error code.
*/
int migrate_pages(struct list_head *from,
- new_page_t get_new_page, unsigned long private, int offlining)
+ new_page_t get_new_page, unsigned long private, bool offlining,
+ bool sync)
{
int retry = 1;
int nr_failed = 0;
@@ -910,7 +931,8 @@ int migrate_pages(struct list_head *from,
cond_resched();
rc = unmap_and_move(get_new_page, private,
- page, pass > 2, offlining);
+ page, pass > 2, offlining,
+ sync);
switch(rc) {
case -ENOMEM:
@@ -939,7 +961,8 @@ out:
}
int migrate_huge_pages(struct list_head *from,
- new_page_t get_new_page, unsigned long private, int offlining)
+ new_page_t get_new_page, unsigned long private, bool offlining,
+ bool sync)
{
int retry = 1;
int nr_failed = 0;
@@ -955,7 +978,8 @@ int migrate_huge_pages(struct list_head *from,
cond_resched();
rc = unmap_and_move_huge_page(get_new_page,
- private, page, pass > 2, offlining);
+ private, page, pass > 2, offlining,
+ sync);
switch(rc) {
case -ENOMEM:
@@ -974,10 +998,6 @@ int migrate_huge_pages(struct list_head *from,
}
rc = 0;
out:
-
- list_for_each_entry_safe(page, page2, from, lru)
- put_page(page);
-
if (rc)
return rc;
@@ -1040,7 +1060,7 @@ static int do_move_page_to_node_array(struct mm_struct *mm,
if (!vma || pp->addr < vma->vm_start || !vma_migratable(vma))
goto set_status;
- page = follow_page(vma, pp->addr, FOLL_GET);
+ page = follow_page(vma, pp->addr, FOLL_GET|FOLL_SPLIT);
err = PTR_ERR(page);
if (IS_ERR(page))
@@ -1088,7 +1108,7 @@ set_status:
err = 0;
if (!list_empty(&pagelist)) {
err = migrate_pages(&pagelist, new_page_node,
- (unsigned long)pm, 0);
+ (unsigned long)pm, 0, true);
if (err)
putback_lru_pages(&pagelist);
}
@@ -1285,14 +1305,14 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
return -EPERM;
/* Find the mm_struct */
- read_lock(&tasklist_lock);
+ rcu_read_lock();
task = pid ? find_task_by_vpid(pid) : current;
if (!task) {
- read_unlock(&tasklist_lock);
+ rcu_read_unlock();
return -ESRCH;
}
mm = get_task_mm(task);
- read_unlock(&tasklist_lock);
+ rcu_read_unlock();
if (!mm)
return -EINVAL;
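Editor's note (illustrative, not part of this patch): after the signature
change every caller spells out both flags, e.g. the synchronous migration used
for memory offlining and mbind() above:

	err = migrate_pages(&pagelist, new_node_page, dest, false, true);

while async callers such as direct compaction pass sync == false to get the
new nonblocking behaviour (no lock_page() wait, no writeback wait, -EBUSY for
dirty pages that would need writeout).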
diff --git a/mm/mincore.c b/mm/mincore.c
index 9ac42dc6d7b6..a4e6b9d75c76 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -154,6 +154,13 @@ static void mincore_pmd_range(struct vm_area_struct *vma, pud_t *pud,
pmd = pmd_offset(pud, addr);
do {
next = pmd_addr_end(addr, end);
+ if (pmd_trans_huge(*pmd)) {
+ if (mincore_huge_pmd(vma, pmd, addr, next, vec)) {
+ vec += (next - addr) >> PAGE_SHIFT;
+ continue;
+ }
+ /* fall through */
+ }
if (pmd_none_or_clear_bad(pmd))
mincore_unmapped_range(vma, addr, next, vec);
else
diff --git a/mm/mlock.c b/mm/mlock.c
index b70919ce4f72..516b2c2ddd5a 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -135,13 +135,6 @@ void munlock_vma_page(struct page *page)
}
}
-static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long addr)
-{
- return (vma->vm_flags & VM_GROWSDOWN) &&
- (vma->vm_start == addr) &&
- !vma_stack_continue(vma->vm_prev, addr);
-}
-
/**
* __mlock_vma_pages_range() - mlock a range of pages in the vma.
* @vma: target vma
@@ -155,13 +148,12 @@ static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long add
* vma->vm_mm->mmap_sem must be held for at least read.
*/
static long __mlock_vma_pages_range(struct vm_area_struct *vma,
- unsigned long start, unsigned long end)
+ unsigned long start, unsigned long end,
+ int *nonblocking)
{
struct mm_struct *mm = vma->vm_mm;
unsigned long addr = start;
- struct page *pages[16]; /* 16 gives a reasonable batch */
int nr_pages = (end - start) / PAGE_SIZE;
- int ret = 0;
int gup_flags;
VM_BUG_ON(start & ~PAGE_MASK);
@@ -170,73 +162,24 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma,
VM_BUG_ON(end > vma->vm_end);
VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem));
- gup_flags = FOLL_TOUCH | FOLL_GET;
- if (vma->vm_flags & VM_WRITE)
+ gup_flags = FOLL_TOUCH | FOLL_MLOCK;
+ /*
+ * We want to touch writable mappings with a write fault in order
+ * to break COW, except for shared mappings because these don't COW
+ * and we would not want to dirty them for nothing.
+ */
+ if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE)
gup_flags |= FOLL_WRITE;
- /* We don't try to access the guard page of a stack vma */
- if (stack_guard_page(vma, start)) {
- addr += PAGE_SIZE;
- nr_pages--;
- }
-
- while (nr_pages > 0) {
- int i;
-
- cond_resched();
-
- /*
- * get_user_pages makes pages present if we are
- * setting mlock. and this extra reference count will
- * disable migration of this page. However, page may
- * still be truncated out from under us.
- */
- ret = __get_user_pages(current, mm, addr,
- min_t(int, nr_pages, ARRAY_SIZE(pages)),
- gup_flags, pages, NULL);
- /*
- * This can happen for, e.g., VM_NONLINEAR regions before
- * a page has been allocated and mapped at a given offset,
- * or for addresses that map beyond end of a file.
- * We'll mlock the pages if/when they get faulted in.
- */
- if (ret < 0)
- break;
-
- lru_add_drain(); /* push cached pages to LRU */
-
- for (i = 0; i < ret; i++) {
- struct page *page = pages[i];
-
- if (page->mapping) {
- /*
- * That preliminary check is mainly to avoid
- * the pointless overhead of lock_page on the
- * ZERO_PAGE: which might bounce very badly if
- * there is contention. However, we're still
- * dirtying its cacheline with get/put_page:
- * we'll add another __get_user_pages flag to
- * avoid it if that case turns out to matter.
- */
- lock_page(page);
- /*
- * Because we lock page here and migration is
- * blocked by the elevated reference, we need
- * only check for file-cache page truncation.
- */
- if (page->mapping)
- mlock_vma_page(page);
- unlock_page(page);
- }
- put_page(page); /* ref from get_user_pages() */
- }
-
- addr += ret * PAGE_SIZE;
- nr_pages -= ret;
- ret = 0;
- }
+ /*
+ * We want mlock to succeed for regions that have any permissions
+ * other than PROT_NONE.
+ */
+ if (vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC))
+ gup_flags |= FOLL_FORCE;
- return ret; /* 0 or negative error code */
+ return __get_user_pages(current, mm, addr, nr_pages, gup_flags,
+ NULL, NULL, nonblocking);
}
/*
@@ -278,9 +221,9 @@ long mlock_vma_pages_range(struct vm_area_struct *vma,
if (!((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) ||
is_vm_hugetlb_page(vma) ||
- vma == get_gate_vma(current))) {
+ vma == get_gate_vma(current->mm))) {
- __mlock_vma_pages_range(vma, start, end);
+ __mlock_vma_pages_range(vma, start, end, NULL);
/* Hide errors from mmap() and other callers */
return 0;
@@ -372,18 +315,10 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
int ret = 0;
int lock = newflags & VM_LOCKED;
- if (newflags == vma->vm_flags ||
- (vma->vm_flags & (VM_IO | VM_PFNMAP)))
+ if (newflags == vma->vm_flags || (vma->vm_flags & VM_SPECIAL) ||
+ is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm))
goto out; /* don't set VM_LOCKED, don't count */
- if ((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) ||
- is_vm_hugetlb_page(vma) ||
- vma == get_gate_vma(current)) {
- if (lock)
- make_pages_present(start, end);
- goto out; /* don't set VM_LOCKED, don't count */
- }
-
pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
*prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma,
vma->vm_file, pgoff, vma_policy(vma));
@@ -419,14 +354,10 @@ success:
* set VM_LOCKED, __mlock_vma_pages_range will bring it back.
*/
- if (lock) {
+ if (lock)
vma->vm_flags = newflags;
- ret = __mlock_vma_pages_range(vma, start, end);
- if (ret < 0)
- ret = __mlock_posix_error_return(ret);
- } else {
+ else
munlock_vma_pages_range(vma, start, end);
- }
out:
*prev = vma;
@@ -439,7 +370,8 @@ static int do_mlock(unsigned long start, size_t len, int on)
struct vm_area_struct * vma, * prev;
int error;
- len = PAGE_ALIGN(len);
+ VM_BUG_ON(start & ~PAGE_MASK);
+ VM_BUG_ON(len != PAGE_ALIGN(len));
end = start + len;
if (end < start)
return -EINVAL;
@@ -482,6 +414,62 @@ static int do_mlock(unsigned long start, size_t len, int on)
return error;
}
+static int do_mlock_pages(unsigned long start, size_t len, int ignore_errors)
+{
+ struct mm_struct *mm = current->mm;
+ unsigned long end, nstart, nend;
+ struct vm_area_struct *vma = NULL;
+ int locked = 0;
+ int ret = 0;
+
+ VM_BUG_ON(start & ~PAGE_MASK);
+ VM_BUG_ON(len != PAGE_ALIGN(len));
+ end = start + len;
+
+ for (nstart = start; nstart < end; nstart = nend) {
+ /*
+ * We want to fault in pages for [nstart; end) address range.
+ * Find first corresponding VMA.
+ */
+ if (!locked) {
+ locked = 1;
+ down_read(&mm->mmap_sem);
+ vma = find_vma(mm, nstart);
+ } else if (nstart >= vma->vm_end)
+ vma = vma->vm_next;
+ if (!vma || vma->vm_start >= end)
+ break;
+ /*
+ * Set [nstart; nend) to intersection of desired address
+ * range with the first VMA. Also, skip undesirable VMA types.
+ */
+ nend = min(end, vma->vm_end);
+ if (vma->vm_flags & (VM_IO | VM_PFNMAP))
+ continue;
+ if (nstart < vma->vm_start)
+ nstart = vma->vm_start;
+ /*
+ * Now fault in a range of pages. __mlock_vma_pages_range()
+ * double checks the vma flags, so that it won't mlock pages
+ * if the vma was already munlocked.
+ */
+ ret = __mlock_vma_pages_range(vma, nstart, nend, &locked);
+ if (ret < 0) {
+ if (ignore_errors) {
+ ret = 0;
+ continue; /* continue at next VMA */
+ }
+ ret = __mlock_posix_error_return(ret);
+ break;
+ }
+ nend = nstart + ret * PAGE_SIZE;
+ ret = 0;
+ }
+ if (locked)
+ up_read(&mm->mmap_sem);
+ return ret; /* 0 or negative error code */
+}
+
SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
{
unsigned long locked;
@@ -507,6 +495,8 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
if ((locked <= lock_limit) || capable(CAP_IPC_LOCK))
error = do_mlock(start, len, 1);
up_write(&current->mm->mmap_sem);
+ if (!error)
+ error = do_mlock_pages(start, len, 0);
return error;
}
@@ -571,6 +561,10 @@ SYSCALL_DEFINE1(mlockall, int, flags)
capable(CAP_IPC_LOCK))
ret = do_mlockall(flags);
up_write(&current->mm->mmap_sem);
+ if (!ret && (flags & MCL_CURRENT)) {
+ /* Ignore errors */
+ do_mlock_pages(0, TASK_SIZE, 1);
+ }
out:
return ret;
}
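Editor's note (illustrative, not part of this patch): __mlock_vma_pages_range()
now simply forwards to __get_user_pages(), and do_mlock_pages() passes &locked
so the fault path may drop mmap_sem while blocking; the syscall flow mirrors
the hunks above, roughly:

	down_write(&current->mm->mmap_sem);
	error = do_mlock(start, len, 1);
	up_write(&current->mm->mmap_sem);
	if (!error)
		error = do_mlock_pages(start, len, 0);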
diff --git a/mm/mmap.c b/mm/mmap.c
index b179abb1474a..772140c53ab1 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -29,6 +29,7 @@
#include <linux/mmu_notifier.h>
#include <linux/perf_event.h>
#include <linux/audit.h>
+#include <linux/khugepaged.h>
#include <asm/uaccess.h>
#include <asm/cacheflush.h>
@@ -253,7 +254,15 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
down_write(&mm->mmap_sem);
#ifdef CONFIG_COMPAT_BRK
- min_brk = mm->end_code;
+ /*
+ * CONFIG_COMPAT_BRK can still be overridden by setting
+ * randomize_va_space to 2, which will still cause mm->start_brk
+ * to be arbitrarily shifted
+ */
+ if (current->brk_randomized)
+ min_brk = mm->start_brk;
+ else
+ min_brk = mm->end_data;
#else
min_brk = mm->start_brk;
#endif
@@ -588,6 +597,8 @@ again: remove_next = 1 + (end > next->vm_end);
}
}
+ vma_adjust_trans_huge(vma, start, end, adjust_next);
+
/*
* When changing only vma->vm_end, we don't really need anon_vma
* lock. This is a fairly rare case by itself, but the anon_vma
@@ -815,6 +826,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
end, prev->vm_pgoff, NULL);
if (err)
return NULL;
+ khugepaged_enter_vma_merge(prev);
return prev;
}
@@ -833,6 +845,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
next->vm_pgoff - pglen, NULL);
if (err)
return NULL;
+ khugepaged_enter_vma_merge(area);
return area;
}
@@ -1754,13 +1767,17 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
size = address - vma->vm_start;
grow = (address - vma->vm_end) >> PAGE_SHIFT;
- error = acct_stack_growth(vma, size, grow);
- if (!error) {
- vma->vm_end = address;
- perf_event_mmap(vma);
+ error = -ENOMEM;
+ if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) {
+ error = acct_stack_growth(vma, size, grow);
+ if (!error) {
+ vma->vm_end = address;
+ perf_event_mmap(vma);
+ }
}
}
vma_unlock_anon_vma(vma);
+ khugepaged_enter_vma_merge(vma);
return error;
}
#endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */
@@ -1800,14 +1817,18 @@ static int expand_downwards(struct vm_area_struct *vma,
size = vma->vm_end - address;
grow = (vma->vm_start - address) >> PAGE_SHIFT;
- error = acct_stack_growth(vma, size, grow);
- if (!error) {
- vma->vm_start = address;
- vma->vm_pgoff -= grow;
- perf_event_mmap(vma);
+ error = -ENOMEM;
+ if (grow <= vma->vm_pgoff) {
+ error = acct_stack_growth(vma, size, grow);
+ if (!error) {
+ vma->vm_start = address;
+ vma->vm_pgoff -= grow;
+ perf_event_mmap(vma);
+ }
}
}
vma_unlock_anon_vma(vma);
+ khugepaged_enter_vma_merge(vma);
return error;
}
@@ -2462,6 +2483,7 @@ int install_special_mapping(struct mm_struct *mm,
unsigned long addr, unsigned long len,
unsigned long vm_flags, struct page **pages)
{
+ int ret;
struct vm_area_struct *vma;
vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
@@ -2479,16 +2501,23 @@ int install_special_mapping(struct mm_struct *mm,
vma->vm_ops = &special_mapping_vmops;
vma->vm_private_data = pages;
- if (unlikely(insert_vm_struct(mm, vma))) {
- kmem_cache_free(vm_area_cachep, vma);
- return -ENOMEM;
- }
+ ret = security_file_mmap(NULL, 0, 0, 0, vma->vm_start, 1);
+ if (ret)
+ goto out;
+
+ ret = insert_vm_struct(mm, vma);
+ if (ret)
+ goto out;
mm->total_vm += len >> PAGE_SHIFT;
perf_event_mmap(vma);
return 0;
+
+out:
+ kmem_cache_free(vm_area_cachep, vma);
+ return ret;
}
static DEFINE_MUTEX(mm_all_locks_mutex);
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 438951d366f2..8d032de4088e 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -100,6 +100,26 @@ int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
return young;
}
+int __mmu_notifier_test_young(struct mm_struct *mm,
+ unsigned long address)
+{
+ struct mmu_notifier *mn;
+ struct hlist_node *n;
+ int young = 0;
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
+ if (mn->ops->test_young) {
+ young = mn->ops->test_young(mn, mm, address);
+ if (young)
+ break;
+ }
+ }
+ rcu_read_unlock();
+
+ return young;
+}
+
void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address,
pte_t pte)
{
diff --git a/mm/mmzone.c b/mm/mmzone.c
index e35bfb82c855..f5b7d1760213 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -87,24 +87,3 @@ int memmap_valid_within(unsigned long pfn,
return 1;
}
#endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */
-
-#ifdef CONFIG_SMP
-/* Called when a more accurate view of NR_FREE_PAGES is needed */
-unsigned long zone_nr_free_pages(struct zone *zone)
-{
- unsigned long nr_free_pages = zone_page_state(zone, NR_FREE_PAGES);
-
- /*
- * While kswapd is awake, it is considered the zone is under some
- * memory pressure. Under pressure, there is a risk that
- * per-cpu-counter-drift will allow the min watermark to be breached
- * potentially causing a live-lock. While kswapd is awake and
- * free pages are low, get a better estimate for free pages
- */
- if (nr_free_pages < zone->percpu_drift_mark &&
- !waitqueue_active(&zone->zone_pgdat->kswapd_wait))
- return zone_page_state_snapshot(zone, NR_FREE_PAGES);
-
- return nr_free_pages;
-}
-#endif /* CONFIG_SMP */
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 4c5133873097..5a688a2756be 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -78,7 +78,7 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
pte_unmap_unlock(pte - 1, ptl);
}
-static inline void change_pmd_range(struct mm_struct *mm, pud_t *pud,
+static inline void change_pmd_range(struct vm_area_struct *vma, pud_t *pud,
unsigned long addr, unsigned long end, pgprot_t newprot,
int dirty_accountable)
{
@@ -88,13 +88,21 @@ static inline void change_pmd_range(struct mm_struct *mm, pud_t *pud,
pmd = pmd_offset(pud, addr);
do {
next = pmd_addr_end(addr, end);
+ if (pmd_trans_huge(*pmd)) {
+ if (next - addr != HPAGE_PMD_SIZE)
+ split_huge_page_pmd(vma->vm_mm, pmd);
+ else if (change_huge_pmd(vma, pmd, addr, newprot))
+ continue;
+ /* fall through */
+ }
if (pmd_none_or_clear_bad(pmd))
continue;
- change_pte_range(mm, pmd, addr, next, newprot, dirty_accountable);
+ change_pte_range(vma->vm_mm, pmd, addr, next, newprot,
+ dirty_accountable);
} while (pmd++, addr = next, addr != end);
}
-static inline void change_pud_range(struct mm_struct *mm, pgd_t *pgd,
+static inline void change_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
unsigned long addr, unsigned long end, pgprot_t newprot,
int dirty_accountable)
{
@@ -106,7 +114,8 @@ static inline void change_pud_range(struct mm_struct *mm, pgd_t *pgd,
next = pud_addr_end(addr, end);
if (pud_none_or_clear_bad(pud))
continue;
- change_pmd_range(mm, pud, addr, next, newprot, dirty_accountable);
+ change_pmd_range(vma, pud, addr, next, newprot,
+ dirty_accountable);
} while (pud++, addr = next, addr != end);
}
@@ -126,7 +135,8 @@ static void change_protection(struct vm_area_struct *vma,
next = pgd_addr_end(addr, end);
if (pgd_none_or_clear_bad(pgd))
continue;
- change_pud_range(mm, pgd, addr, next, newprot, dirty_accountable);
+ change_pud_range(vma, pgd, addr, next, newprot,
+ dirty_accountable);
} while (pgd++, addr = next, addr != end);
flush_tlb_range(vma, start, end);
}
diff --git a/mm/mremap.c b/mm/mremap.c
index 563fbdd6293a..a7c1f9f9b941 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -41,13 +41,15 @@ static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr)
return NULL;
pmd = pmd_offset(pud, addr);
+ split_huge_page_pmd(mm, pmd);
if (pmd_none_or_clear_bad(pmd))
return NULL;
return pmd;
}
-static pmd_t *alloc_new_pmd(struct mm_struct *mm, unsigned long addr)
+static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long addr)
{
pgd_t *pgd;
pud_t *pud;
@@ -62,7 +64,8 @@ static pmd_t *alloc_new_pmd(struct mm_struct *mm, unsigned long addr)
if (!pmd)
return NULL;
- if (!pmd_present(*pmd) && __pte_alloc(mm, pmd, addr))
+ VM_BUG_ON(pmd_trans_huge(*pmd));
+ if (pmd_none(*pmd) && __pte_alloc(mm, vma, pmd, addr))
return NULL;
return pmd;
@@ -91,9 +94,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
*/
mapping = vma->vm_file->f_mapping;
spin_lock(&mapping->i_mmap_lock);
- if (new_vma->vm_truncate_count &&
- new_vma->vm_truncate_count != vma->vm_truncate_count)
- new_vma->vm_truncate_count = 0;
+ new_vma->vm_truncate_count = 0;
}
/*
@@ -147,7 +148,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
old_pmd = get_old_pmd(vma->vm_mm, old_addr);
if (!old_pmd)
continue;
- new_pmd = alloc_new_pmd(vma->vm_mm, new_addr);
+ new_pmd = alloc_new_pmd(vma->vm_mm, vma, new_addr);
if (!new_pmd)
break;
next = (new_addr + PMD_SIZE) & PMD_MASK;
@@ -276,9 +277,16 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
if (old_len > vma->vm_end - addr)
goto Efault;
- if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)) {
- if (new_len > old_len)
+ /* Need to be careful about a growing mapping */
+ if (new_len > old_len) {
+ unsigned long pgoff;
+
+ if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP))
goto Efault;
+ pgoff = (addr - vma->vm_start) >> PAGE_SHIFT;
+ pgoff += vma->vm_pgoff;
+ if (pgoff + (new_len >> PAGE_SHIFT) < pgoff)
+ goto Einval;
}
if (vma->vm_flags & VM_LOCKED) {
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
new file mode 100644
index 000000000000..9109049f0bbc
--- /dev/null
+++ b/mm/nobootmem.c
@@ -0,0 +1,427 @@
+/*
+ * bootmem - A boot-time physical memory allocator and configurator
+ *
+ * Copyright (C) 1999 Ingo Molnar
+ * 1999 Kanoj Sarcar, SGI
+ * 2008 Johannes Weiner
+ *
+ * Access to this subsystem has to be serialized externally (which is true
+ * for the boot process anyway).
+ */
+#include <linux/init.h>
+#include <linux/pfn.h>
+#include <linux/slab.h>
+#include <linux/bootmem.h>
+#include <linux/module.h>
+#include <linux/kmemleak.h>
+#include <linux/range.h>
+#include <linux/memblock.h>
+
+#include <asm/bug.h>
+#include <asm/io.h>
+#include <asm/processor.h>
+
+#include "internal.h"
+
+#ifndef CONFIG_NEED_MULTIPLE_NODES
+struct pglist_data __refdata contig_page_data;
+EXPORT_SYMBOL(contig_page_data);
+#endif
+
+unsigned long max_low_pfn;
+unsigned long min_low_pfn;
+unsigned long max_pfn;
+
+static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
+ u64 goal, u64 limit)
+{
+ void *ptr;
+ u64 addr;
+
+ if (limit > memblock.current_limit)
+ limit = memblock.current_limit;
+
+ addr = find_memory_core_early(nid, size, align, goal, limit);
+
+ if (addr == MEMBLOCK_ERROR)
+ return NULL;
+
+ ptr = phys_to_virt(addr);
+ memset(ptr, 0, size);
+ memblock_x86_reserve_range(addr, addr + size, "BOOTMEM");
+ /*
+ * The min_count is set to 0 so that bootmem allocated blocks
+ * are never reported as leaks.
+ */
+ kmemleak_alloc(ptr, size, 0, 0);
+ return ptr;
+}
+
+/*
+ * free_bootmem_late - free bootmem pages directly to page allocator
+ * @addr: starting address of the range
+ * @size: size of the range in bytes
+ *
+ * This is only useful when the bootmem allocator has already been torn
+ * down, but we are still initializing the system. Pages are given directly
+ * to the page allocator, no bootmem metadata is updated because it is gone.
+ */
+void __init free_bootmem_late(unsigned long addr, unsigned long size)
+{
+ unsigned long cursor, end;
+
+ kmemleak_free_part(__va(addr), size);
+
+ cursor = PFN_UP(addr);
+ end = PFN_DOWN(addr + size);
+
+ for (; cursor < end; cursor++) {
+ __free_pages_bootmem(pfn_to_page(cursor), 0);
+ totalram_pages++;
+ }
+}
+
+static void __init __free_pages_memory(unsigned long start, unsigned long end)
+{
+ int i;
+ unsigned long start_aligned, end_aligned;
+ int order = ilog2(BITS_PER_LONG);
+
+ start_aligned = (start + (BITS_PER_LONG - 1)) & ~(BITS_PER_LONG - 1);
+ end_aligned = end & ~(BITS_PER_LONG - 1);
+
+ if (end_aligned <= start_aligned) {
+ for (i = start; i < end; i++)
+ __free_pages_bootmem(pfn_to_page(i), 0);
+
+ return;
+ }
+
+ for (i = start; i < start_aligned; i++)
+ __free_pages_bootmem(pfn_to_page(i), 0);
+
+ for (i = start_aligned; i < end_aligned; i += BITS_PER_LONG)
+ __free_pages_bootmem(pfn_to_page(i), order);
+
+ for (i = end_aligned; i < end; i++)
+ __free_pages_bootmem(pfn_to_page(i), 0);
+}
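/*
 * Editor's note, a worked example (not part of this patch): with
 * BITS_PER_LONG == 64 (order 6), start = 100 and end = 300 give
 * start_aligned = 128 and end_aligned = 256, so pfns 100-127 and
 * 256-299 are freed as single pages while 128-255 go out as two
 * order-6 blocks.
 */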
+
+unsigned long __init free_all_memory_core_early(int nodeid)
+{
+ int i;
+ u64 start, end;
+ unsigned long count = 0;
+ struct range *range = NULL;
+ int nr_range;
+
+ nr_range = get_free_all_memory_range(&range, nodeid);
+
+ for (i = 0; i < nr_range; i++) {
+ start = range[i].start;
+ end = range[i].end;
+ count += end - start;
+ __free_pages_memory(start, end);
+ }
+
+ return count;
+}
+
+/**
+ * free_all_bootmem_node - release a node's free pages to the buddy allocator
+ * @pgdat: node to be released
+ *
+ * Returns the number of pages actually released.
+ */
+unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
+{
+ register_page_bootmem_info_node(pgdat);
+
+ /* free_all_memory_core_early(MAX_NUMNODES) will be called later */
+ return 0;
+}
+
+/**
+ * free_all_bootmem - release free pages to the buddy allocator
+ *
+ * Returns the number of pages actually released.
+ */
+unsigned long __init free_all_bootmem(void)
+{
+ /*
+ * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id
+	 * because in some cases, such as when Node0 has no RAM installed,
+	 * low RAM will be on Node1.
+	 * Using MAX_NUMNODES makes sure all ranges in early_node_map[]
+	 * will be used instead of only Node0-related ones.
+ */
+ return free_all_memory_core_early(MAX_NUMNODES);
+}
+
+/**
+ * free_bootmem_node - mark a page range as usable
+ * @pgdat: node the range resides on
+ * @physaddr: starting address of the range
+ * @size: size of the range in bytes
+ *
+ * Partial pages will be considered reserved and left as they are.
+ *
+ * The range must reside completely on the specified node.
+ */
+void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
+ unsigned long size)
+{
+ kmemleak_free_part(__va(physaddr), size);
+ memblock_x86_free_range(physaddr, physaddr + size);
+}
+
+/**
+ * free_bootmem - mark a page range as usable
+ * @addr: starting address of the range
+ * @size: size of the range in bytes
+ *
+ * Partial pages will be considered reserved and left as they are.
+ *
+ * The range must be contiguous but may span node boundaries.
+ */
+void __init free_bootmem(unsigned long addr, unsigned long size)
+{
+ kmemleak_free_part(__va(addr), size);
+ memblock_x86_free_range(addr, addr + size);
+}
+
+static void * __init ___alloc_bootmem_nopanic(unsigned long size,
+ unsigned long align,
+ unsigned long goal,
+ unsigned long limit)
+{
+ void *ptr;
+
+ if (WARN_ON_ONCE(slab_is_available()))
+ return kzalloc(size, GFP_NOWAIT);
+
+restart:
+
+ ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, goal, limit);
+
+ if (ptr)
+ return ptr;
+
+ if (goal != 0) {
+ goal = 0;
+ goto restart;
+ }
+
+ return NULL;
+}
+
+/**
+ * __alloc_bootmem_nopanic - allocate boot memory without panicking
+ * @size: size of the request in bytes
+ * @align: alignment of the region
+ * @goal: preferred starting address of the region
+ *
+ * The goal is dropped if it can not be satisfied and the allocation will
+ * fall back to memory below @goal.
+ *
+ * Allocation may happen on any node in the system.
+ *
+ * Returns NULL on failure.
+ */
+void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align,
+ unsigned long goal)
+{
+ unsigned long limit = -1UL;
+
+ return ___alloc_bootmem_nopanic(size, align, goal, limit);
+}
+
+static void * __init ___alloc_bootmem(unsigned long size, unsigned long align,
+ unsigned long goal, unsigned long limit)
+{
+ void *mem = ___alloc_bootmem_nopanic(size, align, goal, limit);
+
+ if (mem)
+ return mem;
+ /*
+ * Whoops, we cannot satisfy the allocation request.
+ */
+ printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size);
+ panic("Out of memory");
+ return NULL;
+}
+
+/**
+ * __alloc_bootmem - allocate boot memory
+ * @size: size of the request in bytes
+ * @align: alignment of the region
+ * @goal: preferred starting address of the region
+ *
+ * The goal is dropped if it can not be satisfied and the allocation will
+ * fall back to memory below @goal.
+ *
+ * Allocation may happen on any node in the system.
+ *
+ * The function panics if the request can not be satisfied.
+ */
+void * __init __alloc_bootmem(unsigned long size, unsigned long align,
+ unsigned long goal)
+{
+ unsigned long limit = -1UL;
+
+ return ___alloc_bootmem(size, align, goal, limit);
+}
+
+/**
+ * __alloc_bootmem_node - allocate boot memory from a specific node
+ * @pgdat: node to allocate from
+ * @size: size of the request in bytes
+ * @align: alignment of the region
+ * @goal: preferred starting address of the region
+ *
+ * The goal is dropped if it can not be satisfied and the allocation will
+ * fall back to memory below @goal.
+ *
+ * Allocation may fall back to any node in the system if the specified node
+ * can not hold the requested memory.
+ *
+ * The function panics if the request can not be satisfied.
+ */
+void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
+ unsigned long align, unsigned long goal)
+{
+ void *ptr;
+
+ if (WARN_ON_ONCE(slab_is_available()))
+ return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
+
+ ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
+ goal, -1ULL);
+ if (ptr)
+ return ptr;
+
+ return __alloc_memory_core_early(MAX_NUMNODES, size, align,
+ goal, -1ULL);
+}
+
+void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,
+ unsigned long align, unsigned long goal)
+{
+#ifdef MAX_DMA32_PFN
+ unsigned long end_pfn;
+
+ if (WARN_ON_ONCE(slab_is_available()))
+ return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
+
+ /* update goal according ...MAX_DMA32_PFN */
+ end_pfn = pgdat->node_start_pfn + pgdat->node_spanned_pages;
+
+ if (end_pfn > MAX_DMA32_PFN + (128 >> (20 - PAGE_SHIFT)) &&
+ (goal >> PAGE_SHIFT) < MAX_DMA32_PFN) {
+ void *ptr;
+ unsigned long new_goal;
+
+ new_goal = MAX_DMA32_PFN << PAGE_SHIFT;
+ ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
+ new_goal, -1ULL);
+ if (ptr)
+ return ptr;
+ }
+#endif
+
+ return __alloc_bootmem_node(pgdat, size, align, goal);
+
+}
+
+#ifdef CONFIG_SPARSEMEM
+/**
+ * alloc_bootmem_section - allocate boot memory from a specific section
+ * @size: size of the request in bytes
+ * @section_nr: sparse map section to allocate from
+ *
+ * Return NULL on failure.
+ */
+void * __init alloc_bootmem_section(unsigned long size,
+ unsigned long section_nr)
+{
+ unsigned long pfn, goal, limit;
+
+ pfn = section_nr_to_pfn(section_nr);
+ goal = pfn << PAGE_SHIFT;
+ limit = section_nr_to_pfn(section_nr + 1) << PAGE_SHIFT;
+
+ return __alloc_memory_core_early(early_pfn_to_nid(pfn), size,
+ SMP_CACHE_BYTES, goal, limit);
+}
+#endif
+
+void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
+ unsigned long align, unsigned long goal)
+{
+ void *ptr;
+
+ if (WARN_ON_ONCE(slab_is_available()))
+ return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
+
+ ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
+ goal, -1ULL);
+ if (ptr)
+ return ptr;
+
+ return __alloc_bootmem_nopanic(size, align, goal);
+}
+
+#ifndef ARCH_LOW_ADDRESS_LIMIT
+#define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL
+#endif
+
+/**
+ * __alloc_bootmem_low - allocate low boot memory
+ * @size: size of the request in bytes
+ * @align: alignment of the region
+ * @goal: preferred starting address of the region
+ *
+ * The goal is dropped if it can not be satisfied and the allocation will
+ * fall back to memory below @goal.
+ *
+ * Allocation may happen on any node in the system.
+ *
+ * The function panics if the request can not be satisfied.
+ */
+void * __init __alloc_bootmem_low(unsigned long size, unsigned long align,
+ unsigned long goal)
+{
+ return ___alloc_bootmem(size, align, goal, ARCH_LOW_ADDRESS_LIMIT);
+}
+
+/**
+ * __alloc_bootmem_low_node - allocate low boot memory from a specific node
+ * @pgdat: node to allocate from
+ * @size: size of the request in bytes
+ * @align: alignment of the region
+ * @goal: preferred starting address of the region
+ *
+ * The goal is dropped if it can not be satisfied and the allocation will
+ * fall back to memory below @goal.
+ *
+ * Allocation may fall back to any node in the system if the specified node
+ * can not hold the requested memory.
+ *
+ * The function panics if the request can not be satisfied.
+ */
+void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size,
+ unsigned long align, unsigned long goal)
+{
+ void *ptr;
+
+ if (WARN_ON_ONCE(slab_is_available()))
+ return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
+
+ ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
+ goal, ARCH_LOW_ADDRESS_LIMIT);
+ if (ptr)
+ return ptr;
+
+ return __alloc_memory_core_early(MAX_NUMNODES, size, align,
+ goal, ARCH_LOW_ADDRESS_LIMIT);
+}
diff --git a/mm/nommu.c b/mm/nommu.c
index 27a9ac588516..c4c542c736a9 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -10,7 +10,7 @@
* Copyright (c) 2000-2003 David McCullough <davidm@snapgear.com>
* Copyright (c) 2000-2001 D Jeff Dionne <jeff@uClinux.org>
* Copyright (c) 2002 Greg Ungerer <gerg@snapgear.com>
- * Copyright (c) 2007-2009 Paul Mundt <lethal@linux-sh.org>
+ * Copyright (c) 2007-2010 Paul Mundt <lethal@linux-sh.org>
*/
#include <linux/module.h>
@@ -127,7 +127,8 @@ unsigned int kobjsize(const void *objp)
int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
unsigned long start, int nr_pages, unsigned int foll_flags,
- struct page **pages, struct vm_area_struct **vmas)
+ struct page **pages, struct vm_area_struct **vmas,
+ int *retry)
{
struct vm_area_struct *vma;
unsigned long vm_flags;
@@ -185,7 +186,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
if (force)
flags |= FOLL_FORCE;
- return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas);
+ return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas,
+ NULL);
}
EXPORT_SYMBOL(get_user_pages);
@@ -328,6 +330,7 @@ void *vmalloc_node(unsigned long size, int node)
{
return vmalloc(size);
}
+EXPORT_SYMBOL(vmalloc_node);
/**
* vzalloc_node - allocate memory on a specific node with zero fill
@@ -440,6 +443,31 @@ void __attribute__((weak)) vmalloc_sync_all(void)
{
}
+/**
+ * alloc_vm_area - allocate a range of kernel address space
+ * @size: size of the area
+ *
+ * Returns: NULL on failure, vm_struct on success
+ *
+ * This function reserves a range of kernel address space, and
+ * allocates pagetables to map that range. No actual mappings
+ * are created. If the kernel address space is not shared
+ * between processes, it syncs the pagetable across all
+ * processes.
+ */
+struct vm_struct *alloc_vm_area(size_t size)
+{
+ BUG();
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(alloc_vm_area);
+
+void free_vm_area(struct vm_struct *area)
+{
+ BUG();
+}
+EXPORT_SYMBOL_GPL(free_vm_area);
+
int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
struct page *page)
{
@@ -1814,10 +1842,6 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
}
EXPORT_SYMBOL(remap_vmalloc_range);
-void swap_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
-{
-}
-
unsigned long arch_get_unmapped_area(struct file *file, unsigned long addr,
unsigned long len, unsigned long pgoff, unsigned long flags)
{
@@ -1935,7 +1959,7 @@ error:
return -ENOMEM;
}
-int in_gate_area_no_task(unsigned long addr)
+int in_gate_area_no_mm(unsigned long addr)
{
return 0;
}
@@ -1947,21 +1971,10 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
}
EXPORT_SYMBOL(filemap_fault);
-/*
- * Access another process' address space.
- * - source/target buffer must be kernel space
- */
-int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write)
+static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
+ unsigned long addr, void *buf, int len, int write)
{
struct vm_area_struct *vma;
- struct mm_struct *mm;
-
- if (addr + len < addr)
- return 0;
-
- mm = get_task_mm(tsk);
- if (!mm)
- return 0;
down_read(&mm->mmap_sem);
@@ -1986,6 +1999,43 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in
}
up_read(&mm->mmap_sem);
+
+ return len;
+}
+
+/**
+ * access_remote_vm - access another process' address space
+ * @mm: the mm_struct of the target address space
+ * @addr: start address to access
+ * @buf: source or destination buffer
+ * @len: number of bytes to transfer
+ * @write: whether the access is a write
+ *
+ * The caller must hold a reference on @mm.
+ */
+int access_remote_vm(struct mm_struct *mm, unsigned long addr,
+ void *buf, int len, int write)
+{
+ return __access_remote_vm(NULL, mm, addr, buf, len, write);
+}
+
+/*
+ * Access another process' address space.
+ * - source/target buffer must be kernel space
+ */
+int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write)
+{
+ struct mm_struct *mm;
+
+ if (addr + len < addr)
+ return 0;
+
+ mm = get_task_mm(tsk);
+ if (!mm)
+ return 0;
+
+ len = __access_remote_vm(tsk, mm, addr, buf, len, write);
+
mmput(mm);
return len;
}
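The `addr + len < addr` guard retained in access_process_vm() above is the standard unsigned-overflow check: the sum wraps exactly when it ends up smaller than the starting address. A minimal sketch of that idiom (illustration only, not kernel code):

#include <stdbool.h>
#include <stdio.h>
#include <limits.h>

/* True exactly when addr + len wraps past the top of the address space. */
static bool range_wraps(unsigned long addr, unsigned long len)
{
	return addr + len < addr;
}

int main(void)
{
	printf("%d %d\n", range_wraps(0x1000, 16), range_wraps(ULONG_MAX - 4, 16));
	/* prints: 0 1 */
	return 0;
}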
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 7dcca55ede7c..f52e85c80e8d 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -31,6 +31,7 @@
#include <linux/memcontrol.h>
#include <linux/mempolicy.h>
#include <linux/security.h>
+#include <linux/ptrace.h>
int sysctl_panic_on_oom;
int sysctl_oom_kill_allocating_task;
@@ -83,24 +84,6 @@ static bool has_intersects_mems_allowed(struct task_struct *tsk,
#endif /* CONFIG_NUMA */
/*
- * If this is a system OOM (not a memcg OOM) and the task selected to be
- * killed is not already running at high (RT) priorities, speed up the
- * recovery by boosting the dying task to the lowest FIFO priority.
- * That helps with the recovery and avoids interfering with RT tasks.
- */
-static void boost_dying_task_prio(struct task_struct *p,
- struct mem_cgroup *mem)
-{
- struct sched_param param = { .sched_priority = 1 };
-
- if (mem)
- return;
-
- if (!rt_task(p))
- sched_setscheduler_nocheck(p, SCHED_FIFO, &param);
-}
-
-/*
* The process p may have detached its own ->mm while exiting or through
* use_mm(), but one or more of its subthreads may still have a valid
* pointer. Return p, or any of its subthreads with a valid ->mm, with
@@ -189,10 +172,13 @@ unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem,
/*
* The baseline for the badness score is the proportion of RAM that each
- * task's rss and swap space use.
+ * task's rss, pagetable and swap space use.
*/
- points = (get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS)) * 1000 /
- totalpages;
+ points = get_mm_rss(p->mm) + p->mm->nr_ptes;
+ points += get_mm_counter(p->mm, MM_SWAPENTS);
+
+ points *= 1000;
+ points /= totalpages;
task_unlock(p);
/*
@@ -292,13 +278,15 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
unsigned long totalpages, struct mem_cgroup *mem,
const nodemask_t *nodemask)
{
- struct task_struct *p;
+ struct task_struct *g, *p;
struct task_struct *chosen = NULL;
*ppoints = 0;
- for_each_process(p) {
+ do_each_thread(g, p) {
unsigned int points;
+ if (!p->mm)
+ continue;
if (oom_unkillable_task(p, mem, nodemask))
continue;
@@ -314,22 +302,29 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
if (test_tsk_thread_flag(p, TIF_MEMDIE))
return ERR_PTR(-1UL);
- /*
- * This is in the process of releasing memory so wait for it
- * to finish before killing some other task by mistake.
- *
- * However, if p is the current task, we allow the 'kill' to
- * go ahead if it is exiting: this will simply set TIF_MEMDIE,
- * which will allow it to gain access to memory reserves in
- * the process of exiting and releasing its resources.
- * Otherwise we could get an easy OOM deadlock.
- */
- if (thread_group_empty(p) && (p->flags & PF_EXITING) && p->mm) {
- if (p != current)
- return ERR_PTR(-1UL);
-
- chosen = p;
- *ppoints = 1000;
+ if (p->flags & PF_EXITING) {
+ /*
+ * If p is the current task and is in the process of
+ * releasing memory, we allow the "kill" to set
+ * TIF_MEMDIE, which will allow it to gain access to
+ * memory reserves. Otherwise, it may stall forever.
+ *
+ * The loop isn't broken here, however, in case other
+ * threads are found to have already been oom killed.
+ */
+ if (p == current) {
+ chosen = p;
+ *ppoints = 1000;
+ } else {
+ /*
+ * If this task is not being ptraced on exit,
+ * then wait for it to finish before killing
+ * some other task unnecessarily.
+ */
+ if (!(task_ptrace(p->group_leader) &
+ PT_TRACE_EXIT))
+ return ERR_PTR(-1UL);
+ }
}
points = oom_badness(p, mem, nodemask, totalpages);
@@ -337,7 +332,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
chosen = p;
*ppoints = points;
}
- }
+ } while_each_thread(g, p);
return chosen;
}
@@ -396,7 +391,7 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
task_unlock(current);
dump_stack();
mem_cgroup_print_oom_info(mem, p);
- show_mem();
+ show_mem(SHOW_MEM_FILTER_NODES);
if (sysctl_oom_dump_tasks)
dump_tasks(mem, nodemask);
}
@@ -442,13 +437,6 @@ static int oom_kill_task(struct task_struct *p, struct mem_cgroup *mem)
set_tsk_thread_flag(p, TIF_MEMDIE);
force_sig(SIGKILL, p);
- /*
- * We give our sacrificial lamb high priority and access to
- * all the memory it needs. That way it should be able to
- * exit() and clear out its resources quickly...
- */
- boost_dying_task_prio(p, mem);
-
return 0;
}
#undef K
@@ -472,7 +460,6 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
*/
if (p->flags & PF_EXITING) {
set_tsk_thread_flag(p, TIF_MEMDIE);
- boost_dying_task_prio(p, mem);
return 0;
}
@@ -491,6 +478,8 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
list_for_each_entry(child, &t->children, sibling) {
unsigned int child_points;
+ if (child->mm == p->mm)
+ continue;
/*
* oom_badness() returns 0 if the thread is unkillable
*/
@@ -537,6 +526,16 @@ void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask)
unsigned int points = 0;
struct task_struct *p;
+ /*
+ * If current has a pending SIGKILL, then automatically select it. The
+ * goal is to allow it to allocate so that it may quickly exit and free
+ * its memory.
+ */
+ if (fatal_signal_pending(current)) {
+ set_thread_flag(TIF_MEMDIE);
+ return;
+ }
+
check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, 0, NULL);
limit = mem_cgroup_get_limit(mem) >> PAGE_SHIFT;
read_lock(&tasklist_lock);
@@ -689,7 +688,6 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
*/
if (fatal_signal_pending(current)) {
set_thread_flag(TIF_MEMDIE);
- boost_dying_task_prio(current, NULL);
return;
}
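To make the reworked oom_badness() baseline above concrete: the score is now rss plus page-table pages plus swap entries, scaled to thousandths of totalpages. A small sketch with purely hypothetical numbers (not part of the patch):

#include <stdio.h>

int main(void)
{
	unsigned long rss = 200000;		/* resident pages */
	unsigned long nr_ptes = 500;		/* page-table pages */
	unsigned long swapents = 50000;		/* swapped-out entries */
	unsigned long totalpages = 2000000;	/* ~8 GiB of 4 KiB pages */

	unsigned long points = (rss + nr_ptes + swapents) * 1000 / totalpages;

	printf("badness baseline = %lu (out of 1000)\n", points); /* 125 */
	return 0;
}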
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index b840afa89761..31f698862420 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -404,15 +404,18 @@ unsigned long determine_dirtyable_memory(void)
* - vm.dirty_background_ratio or vm.dirty_background_bytes
* - vm.dirty_ratio or vm.dirty_bytes
* The dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and
- * runtime tasks.
+ * real-time tasks.
*/
void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
{
unsigned long background;
unsigned long dirty;
- unsigned long available_memory = determine_dirtyable_memory();
+ unsigned long uninitialized_var(available_memory);
struct task_struct *tsk;
+ if (!vm_dirty_bytes || !dirty_background_bytes)
+ available_memory = determine_dirtyable_memory();
+
if (vm_dirty_bytes)
dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE);
else
@@ -563,7 +566,7 @@ static void balance_dirty_pages(struct address_space *mapping,
break; /* We've done our duty */
}
trace_wbc_balance_dirty_wait(&wbc, bdi);
- __set_current_state(TASK_INTERRUPTIBLE);
+ __set_current_state(TASK_UNINTERRUPTIBLE);
io_schedule_timeout(pause);
/*
@@ -924,7 +927,7 @@ retry:
break;
}
- done_index = page->index + 1;
+ done_index = page->index;
lock_page(page);
@@ -974,6 +977,7 @@ continue_unlock:
* not be suitable for data integrity
* writeout).
*/
+ done_index = page->index + 1;
done = 1;
break;
}
@@ -1036,11 +1040,17 @@ static int __writepage(struct page *page, struct writeback_control *wbc,
int generic_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
+ struct blk_plug plug;
+ int ret;
+
/* deal with chardevs and other special file */
if (!mapping->a_ops->writepage)
return 0;
- return write_cache_pages(mapping, wbc, __writepage, mapping);
+ blk_start_plug(&plug);
+ ret = write_cache_pages(mapping, wbc, __writepage, mapping);
+ blk_finish_plug(&plug);
+ return ret;
}
EXPORT_SYMBOL(generic_writepages);
@@ -1103,7 +1113,7 @@ EXPORT_SYMBOL(write_one_page);
int __set_page_dirty_no_writeback(struct page *page)
{
if (!PageDirty(page))
- SetPageDirty(page);
+ return !TestSetPageDirty(page);
return 0;
}
@@ -1208,6 +1218,17 @@ int set_page_dirty(struct page *page)
if (likely(mapping)) {
int (*spd)(struct page *) = mapping->a_ops->set_page_dirty;
+ /*
+ * readahead/lru_deactivate_page could leave PG_readahead/PG_reclaim
+ * set due to a race with end_page_writeback().
+ * For readahead, if the page is written to, the flag is reset, so
+ * there is no problem.
+ * For lru_deactivate_page, if the page is redirtied, the flag is
+ * also reset; but if the page is then reused by readahead, the stale
+ * flag confuses readahead and makes it restart its size ramp-up
+ * process. That is only a minor problem, though.
+ */
+ ClearPageReclaim(page);
#ifdef CONFIG_BLOCK
if (!spd)
spd = __set_page_dirty_buffers;
@@ -1236,7 +1257,7 @@ int set_page_dirty_lock(struct page *page)
{
int ret;
- lock_page_nosync(page);
+ lock_page(page);
ret = set_page_dirty(page);
unlock_page(page);
return ret;
@@ -1263,7 +1284,6 @@ int clear_page_dirty_for_io(struct page *page)
BUG_ON(!PageLocked(page));
- ClearPageReclaim(page);
if (mapping && mapping_cap_account_dirty(mapping)) {
/*
* Yes, Virginia, this is indeed insane.
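For reference, the global_dirty_limits() change earlier in this file only skips computing dirtyable memory when both byte-based knobs are set; when the ratio knobs are in effect the thresholds remain a straight percentage of dirtyable pages. A rough sketch of that arithmetic, with illustrative sysctl values (not part of the patch):

#include <stdio.h>

int main(void)
{
	unsigned long available_memory = 1000000;	/* dirtyable pages, hypothetical */
	unsigned long vm_dirty_ratio = 20;		/* illustrative sysctl values */
	unsigned long dirty_background_ratio = 10;

	unsigned long dirty = vm_dirty_ratio * available_memory / 100;
	unsigned long background = dirty_background_ratio * available_memory / 100;

	printf("dirty = %lu pages, background = %lu pages\n", dirty, background);
	/* 200000 and 100000 pages respectively */
	return 0;
}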
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index ff7e15872398..9f8a97b9a350 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -53,6 +53,7 @@
#include <linux/compaction.h>
#include <trace/events/kmem.h>
#include <linux/ftrace_event.h>
+#include <linux/memcontrol.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
@@ -286,7 +287,7 @@ static void bad_page(struct page *page)
/* Don't complain about poisoned pages */
if (PageHWPoison(page)) {
- __ClearPageBuddy(page);
+ reset_page_mapcount(page); /* remove PageBuddy */
return;
}
@@ -317,7 +318,7 @@ static void bad_page(struct page *page)
dump_stack();
out:
/* Leave bad fields for debug, except PageBuddy could make trouble */
- __ClearPageBuddy(page);
+ reset_page_mapcount(page); /* remove PageBuddy */
add_taint(TAINT_BAD_PAGE);
}
@@ -357,6 +358,7 @@ void prep_compound_page(struct page *page, unsigned long order)
}
}
+/* update __split_huge_page_refcount if you change this function */
static int destroy_compound_page(struct page *page, unsigned long order)
{
int i;
@@ -426,18 +428,10 @@ static inline void rmv_page_order(struct page *page)
*
* Assumption: *_mem_map is contiguous at least up to MAX_ORDER
*/
-static inline struct page *
-__page_find_buddy(struct page *page, unsigned long page_idx, unsigned int order)
-{
- unsigned long buddy_idx = page_idx ^ (1 << order);
-
- return page + (buddy_idx - page_idx);
-}
-
static inline unsigned long
-__find_combined_index(unsigned long page_idx, unsigned int order)
+__find_buddy_index(unsigned long page_idx, unsigned int order)
{
- return (page_idx & ~(1 << order));
+ return page_idx ^ (1 << order);
}
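The rename above reflects what the helper really computes: the buddy of a block is found by flipping bit `order` of its index, and (as used further down in __free_one_page()) the start of the merged block is the bitwise AND of the two indices. A tiny standalone sketch of that arithmetic (illustration only):

#include <stdio.h>

static unsigned long find_buddy_index(unsigned long page_idx, unsigned int order)
{
	return page_idx ^ (1UL << order);
}

int main(void)
{
	unsigned long page_idx = 12;	/* 0b1100: an order-2 block at index 12 */
	unsigned int order = 2;

	unsigned long buddy_idx = find_buddy_index(page_idx, order);	/* 8 */
	unsigned long combined_idx = buddy_idx & page_idx;		/* 8: start of the merged order-3 block */

	printf("buddy = %lu, combined = %lu\n", buddy_idx, combined_idx);
	return 0;
}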
/*
@@ -448,8 +442,8 @@ __find_combined_index(unsigned long page_idx, unsigned int order)
* (c) a page and its buddy have the same order &&
* (d) a page and its buddy are in the same zone.
*
- * For recording whether a page is in the buddy system, we use PG_buddy.
- * Setting, clearing, and testing PG_buddy is serialized by zone->lock.
+ * For recording whether a page is in the buddy system, we set ->_mapcount to -2.
+ * Setting, clearing, and testing _mapcount == -2 is serialized by zone->lock.
*
* For recording page's order, we use page_private(page).
*/
@@ -482,7 +476,7 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
* as necessary, plus some accounting needed to play nicely with other
* parts of the VM system.
* At each level, we keep a list of pages, which are heads of continuous
- * free pages of length of (1 << order) and marked with PG_buddy. Page's
+ * free pages of length (1 << order) and marked with _mapcount == -2. Page's
* order is recorded in page_private(page) field.
* So when we are allocating or freeing one, we can derive the state of the
* other. That is, if we allocate a small block, and both were
@@ -499,6 +493,7 @@ static inline void __free_one_page(struct page *page,
{
unsigned long page_idx;
unsigned long combined_idx;
+ unsigned long uninitialized_var(buddy_idx);
struct page *buddy;
if (unlikely(PageCompound(page)))
@@ -513,7 +508,8 @@ static inline void __free_one_page(struct page *page,
VM_BUG_ON(bad_range(zone, page));
while (order < MAX_ORDER-1) {
- buddy = __page_find_buddy(page, page_idx, order);
+ buddy_idx = __find_buddy_index(page_idx, order);
+ buddy = page + (buddy_idx - page_idx);
if (!page_is_buddy(page, buddy, order))
break;
@@ -521,7 +517,7 @@ static inline void __free_one_page(struct page *page,
list_del(&buddy->lru);
zone->free_area[order].nr_free--;
rmv_page_order(buddy);
- combined_idx = __find_combined_index(page_idx, order);
+ combined_idx = buddy_idx & page_idx;
page = page + (combined_idx - page_idx);
page_idx = combined_idx;
order++;
@@ -538,9 +534,10 @@ static inline void __free_one_page(struct page *page,
*/
if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) {
struct page *higher_page, *higher_buddy;
- combined_idx = __find_combined_index(page_idx, order);
- higher_page = page + combined_idx - page_idx;
- higher_buddy = __page_find_buddy(higher_page, combined_idx, order + 1);
+ combined_idx = buddy_idx & page_idx;
+ higher_page = page + (combined_idx - page_idx);
+ buddy_idx = __find_buddy_index(combined_idx, order + 1);
+ higher_buddy = page + (buddy_idx - combined_idx);
if (page_is_buddy(higher_page, higher_buddy, order + 1)) {
list_add_tail(&page->lru,
&zone->free_area[order].free_list[migratetype]);
@@ -569,7 +566,8 @@ static inline int free_pages_check(struct page *page)
if (unlikely(page_mapcount(page) |
(page->mapping != NULL) |
(atomic_read(&page->_count) != 0) |
- (page->flags & PAGE_FLAGS_CHECK_AT_FREE))) {
+ (page->flags & PAGE_FLAGS_CHECK_AT_FREE) |
+ (mem_cgroup_bad_page_check(page)))) {
bad_page(page);
return 1;
}
@@ -618,6 +616,10 @@ static void free_pcppages_bulk(struct zone *zone, int count,
list = &pcp->lists[migratetype];
} while (list_empty(list));
+ /* This is the only non-empty list. Free them all. */
+ if (batch_free == MIGRATE_PCPTYPES)
+ batch_free = to_free;
+
do {
page = list_entry(list->prev, struct page, lru);
/* must delete as __free_one_page list manipulates */
@@ -651,13 +653,10 @@ static bool free_pages_prepare(struct page *page, unsigned int order)
trace_mm_page_free_direct(page, order);
kmemcheck_free_shadow(page, order);
- for (i = 0; i < (1 << order); i++) {
- struct page *pg = page + i;
-
- if (PageAnon(pg))
- pg->mapping = NULL;
- bad += free_pages_check(pg);
- }
+ if (PageAnon(page))
+ page->mapping = NULL;
+ for (i = 0; i < (1 << order); i++)
+ bad += free_pages_check(page + i);
if (bad)
return false;
@@ -757,7 +756,8 @@ static inline int check_new_page(struct page *page)
if (unlikely(page_mapcount(page) |
(page->mapping != NULL) |
(atomic_read(&page->_count) != 0) |
- (page->flags & PAGE_FLAGS_CHECK_AT_PREP))) {
+ (page->flags & PAGE_FLAGS_CHECK_AT_PREP) |
+ (mem_cgroup_bad_page_check(page)))) {
bad_page(page);
return 1;
}
@@ -870,9 +870,8 @@ static int move_freepages(struct zone *zone,
}
order = page_order(page);
- list_del(&page->lru);
- list_add(&page->lru,
- &zone->free_area[order].free_list[migratetype]);
+ list_move(&page->lru,
+ &zone->free_area[order].free_list[migratetype]);
page += 1 << order;
pages_moved += 1 << order;
}
@@ -943,7 +942,7 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
* If breaking a large block of pages, move all free
* pages to the preferred allocation list. If falling
* back for a reclaimable kernel allocation, be more
- * agressive about taking ownership of free pages
+ * aggressive about taking ownership of free pages
*/
if (unlikely(current_order >= (pageblock_order >> 1)) ||
start_migratetype == MIGRATE_RECLAIMABLE ||
@@ -1095,8 +1094,10 @@ static void drain_pages(unsigned int cpu)
pset = per_cpu_ptr(zone->pageset, cpu);
pcp = &pset->pcp;
- free_pcppages_bulk(zone, pcp->count, pcp);
- pcp->count = 0;
+ if (pcp->count) {
+ free_pcppages_bulk(zone, pcp->count, pcp);
+ pcp->count = 0;
+ }
local_irq_restore(flags);
}
}
@@ -1338,7 +1339,7 @@ again:
}
__count_zone_vm_events(PGALLOC, zone, 1 << order);
- zone_statistics(preferred_zone, zone);
+ zone_statistics(preferred_zone, zone, gfp_flags);
local_irq_restore(flags);
VM_BUG_ON(bad_range(zone, page));
@@ -1460,24 +1461,24 @@ static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
#endif /* CONFIG_FAIL_PAGE_ALLOC */
/*
- * Return 1 if free pages are above 'mark'. This takes into account the order
+ * Return true if free pages are above 'mark'. This takes into account the order
* of the allocation.
*/
-int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
- int classzone_idx, int alloc_flags)
+static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
+ int classzone_idx, int alloc_flags, long free_pages)
{
/* free_pages may go negative - that's OK */
long min = mark;
- long free_pages = zone_nr_free_pages(z) - (1 << order) + 1;
int o;
+ free_pages -= (1 << order) + 1;
if (alloc_flags & ALLOC_HIGH)
min -= min / 2;
if (alloc_flags & ALLOC_HARDER)
min -= min / 4;
if (free_pages <= min + z->lowmem_reserve[classzone_idx])
- return 0;
+ return false;
for (o = 0; o < order; o++) {
/* At the next order, this order's pages become unavailable */
free_pages -= z->free_area[o].nr_free << o;
@@ -1486,9 +1487,28 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
min >>= 1;
if (free_pages <= min)
- return 0;
+ return false;
}
- return 1;
+ return true;
+}
+
+bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
+ int classzone_idx, int alloc_flags)
+{
+ return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
+ zone_page_state(z, NR_FREE_PAGES));
+}
+
+bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
+ int classzone_idx, int alloc_flags)
+{
+ long free_pages = zone_page_state(z, NR_FREE_PAGES);
+
+ if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
+ free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
+
+ return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
+ free_pages);
}
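The split above lets zone_watermark_ok_safe() re-run the same check against a drift-corrected free count. The per-order logic itself is unchanged: subtract the pages of the request, then walk the lower orders, discounting their free pages and halving the minimum at each step. A hedged userspace sketch of that loop, with made-up numbers (not part of the patch):

#include <stdio.h>
#include <stdbool.h>

static bool watermark_ok(long free_pages, long mark, long lowmem_reserve,
			 unsigned int order, const long *nr_free)
{
	long min = mark;
	unsigned int o;

	free_pages -= (1 << order) + 1;
	if (free_pages <= min + lowmem_reserve)
		return false;
	for (o = 0; o < order; o++) {
		free_pages -= nr_free[o] << o;	/* lower orders cannot satisfy this request */
		min >>= 1;			/* require less headroom at higher orders */
		if (free_pages <= min)
			return false;
	}
	return true;
}

int main(void)
{
	long nr_free[3] = { 600, 200, 50 };	/* free blocks at orders 0..2, hypothetical */

	printf("%d\n", watermark_ok(2000, 1024, 0, 2, nr_free));	/* prints: 1 */
	return 0;
}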
#ifdef CONFIG_NUMA
@@ -1700,6 +1720,20 @@ try_next_zone:
return page;
}
+/*
+ * Large machines with many possible nodes should not always dump per-node
+ * meminfo in irq context.
+ */
+static inline bool should_suppress_show_mem(void)
+{
+ bool ret = false;
+
+#if NODES_SHIFT > 8
+ ret = in_interrupt();
+#endif
+ return ret;
+}
+
static inline int
should_alloc_retry(gfp_t gfp_mask, unsigned int order,
unsigned long pages_reclaimed)
@@ -1793,15 +1827,18 @@ static struct page *
__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist, enum zone_type high_zoneidx,
nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
- int migratetype, unsigned long *did_some_progress)
+ int migratetype, unsigned long *did_some_progress,
+ bool sync_migration)
{
struct page *page;
if (!order || compaction_deferred(preferred_zone))
return NULL;
+ current->flags |= PF_MEMALLOC;
*did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
- nodemask);
+ nodemask, sync_migration);
+ current->flags &= ~PF_MEMALLOC;
if (*did_some_progress != COMPACT_SKIPPED) {
/* Page migration frees to the PCP lists but we want merging */
@@ -1837,7 +1874,8 @@ static inline struct page *
__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist, enum zone_type high_zoneidx,
nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
- int migratetype, unsigned long *did_some_progress)
+ int migratetype, unsigned long *did_some_progress,
+ bool sync_migration)
{
return NULL;
}
@@ -1852,23 +1890,22 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
{
struct page *page = NULL;
struct reclaim_state reclaim_state;
- struct task_struct *p = current;
bool drained = false;
cond_resched();
/* We now go into synchronous reclaim */
cpuset_memory_pressure_bump();
- p->flags |= PF_MEMALLOC;
+ current->flags |= PF_MEMALLOC;
lockdep_set_current_reclaim_state(gfp_mask);
reclaim_state.reclaimed_slab = 0;
- p->reclaim_state = &reclaim_state;
+ current->reclaim_state = &reclaim_state;
*did_some_progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask);
- p->reclaim_state = NULL;
+ current->reclaim_state = NULL;
lockdep_clear_current_reclaim_state();
- p->flags &= ~PF_MEMALLOC;
+ current->flags &= ~PF_MEMALLOC;
cond_resched();
@@ -1920,19 +1957,19 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
static inline
void wake_all_kswapd(unsigned int order, struct zonelist *zonelist,
- enum zone_type high_zoneidx)
+ enum zone_type high_zoneidx,
+ enum zone_type classzone_idx)
{
struct zoneref *z;
struct zone *zone;
for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
- wakeup_kswapd(zone, order);
+ wakeup_kswapd(zone, order, classzone_idx);
}
static inline int
gfp_to_alloc_flags(gfp_t gfp_mask)
{
- struct task_struct *p = current;
int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
const gfp_t wait = gfp_mask & __GFP_WAIT;
@@ -1948,18 +1985,23 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH);
if (!wait) {
- alloc_flags |= ALLOC_HARDER;
+ /*
+ * Not worth trying to allocate harder for
+ * __GFP_NOMEMALLOC even if it can't schedule.
+ */
+ if (!(gfp_mask & __GFP_NOMEMALLOC))
+ alloc_flags |= ALLOC_HARDER;
/*
* Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
* See also cpuset_zone_allowed() comment in kernel/cpuset.c.
*/
alloc_flags &= ~ALLOC_CPUSET;
- } else if (unlikely(rt_task(p)) && !in_interrupt())
+ } else if (unlikely(rt_task(current)) && !in_interrupt())
alloc_flags |= ALLOC_HARDER;
if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) {
if (!in_interrupt() &&
- ((p->flags & PF_MEMALLOC) ||
+ ((current->flags & PF_MEMALLOC) ||
unlikely(test_thread_flag(TIF_MEMDIE))))
alloc_flags |= ALLOC_NO_WATERMARKS;
}
@@ -1978,7 +2020,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
int alloc_flags;
unsigned long pages_reclaimed = 0;
unsigned long did_some_progress;
- struct task_struct *p = current;
+ bool sync_migration = false;
/*
* In the slowpath, we sanity check order to avoid ever trying to
@@ -2003,7 +2045,9 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
goto nopage;
restart:
- wake_all_kswapd(order, zonelist, high_zoneidx);
+ if (!(gfp_mask & __GFP_NO_KSWAPD))
+ wake_all_kswapd(order, zonelist, high_zoneidx,
+ zone_idx(preferred_zone));
/*
* OK, we're below the kswapd watermark and have kicked background
@@ -2012,6 +2056,14 @@ restart:
*/
alloc_flags = gfp_to_alloc_flags(gfp_mask);
+ /*
+ * Find the true preferred zone if the allocation is unconstrained by
+ * cpusets.
+ */
+ if (!(alloc_flags & ALLOC_CPUSET) && !nodemask)
+ first_zones_zonelist(zonelist, high_zoneidx, NULL,
+ &preferred_zone);
+
/* This is the last chance, in general, before the goto nopage. */
page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS,
@@ -2034,21 +2086,26 @@ rebalance:
goto nopage;
/* Avoid recursion of direct reclaim */
- if (p->flags & PF_MEMALLOC)
+ if (current->flags & PF_MEMALLOC)
goto nopage;
/* Avoid allocations with no watermarks from looping endlessly */
if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
goto nopage;
- /* Try direct compaction */
+ /*
+ * Try direct compaction. The first pass is asynchronous. Subsequent
+ * attempts after direct reclaim are synchronous
+ */
page = __alloc_pages_direct_compact(gfp_mask, order,
zonelist, high_zoneidx,
nodemask,
alloc_flags, preferred_zone,
- migratetype, &did_some_progress);
+ migratetype, &did_some_progress,
+ sync_migration);
if (page)
goto got_pg;
+ sync_migration = !(gfp_mask & __GFP_NO_KSWAPD);
/* Try direct reclaim and then allocating */
page = __alloc_pages_direct_reclaim(gfp_mask, order,
@@ -2102,15 +2159,43 @@ rebalance:
/* Wait for some write requests to complete then retry */
wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
goto rebalance;
+ } else {
+ /*
+ * High-order allocations do not necessarily loop after direct
+ * reclaim, and reclaim/compaction depends on compaction being
+ * called after reclaim, so call it directly here if necessary.
+ */
+ page = __alloc_pages_direct_compact(gfp_mask, order,
+ zonelist, high_zoneidx,
+ nodemask,
+ alloc_flags, preferred_zone,
+ migratetype, &did_some_progress,
+ sync_migration);
+ if (page)
+ goto got_pg;
}
nopage:
if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) {
- printk(KERN_WARNING "%s: page allocation failure."
- " order:%d, mode:0x%x\n",
- p->comm, order, gfp_mask);
+ unsigned int filter = SHOW_MEM_FILTER_NODES;
+
+ /*
+ * This documents exceptions given to allocations in certain
+ * contexts that are allowed to allocate outside current's set
+ * of allowed nodes.
+ */
+ if (!(gfp_mask & __GFP_NOMEMALLOC))
+ if (test_thread_flag(TIF_MEMDIE) ||
+ (current->flags & (PF_MEMALLOC | PF_EXITING)))
+ filter &= ~SHOW_MEM_FILTER_NODES;
+ if (in_interrupt() || !wait)
+ filter &= ~SHOW_MEM_FILTER_NODES;
+
+ pr_warning("%s: page allocation failure. order:%d, mode:0x%x\n",
+ current->comm, order, gfp_mask);
dump_stack();
- show_mem();
+ if (!should_suppress_show_mem())
+ show_mem(filter);
}
return page;
got_pg:
@@ -2151,7 +2236,9 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
get_mems_allowed();
/* The preferred zone is used for statistics later */
- first_zones_zonelist(zonelist, high_zoneidx, nodemask, &preferred_zone);
+ first_zones_zonelist(zonelist, high_zoneidx,
+ nodemask ? : &cpuset_current_mems_allowed,
+ &preferred_zone);
if (!preferred_zone) {
put_mems_allowed();
return NULL;
@@ -2358,19 +2445,42 @@ void si_meminfo_node(struct sysinfo *val, int nid)
}
#endif
+/*
+ * Determine whether the zone's node should be displayed or not, depending on
+ * whether SHOW_MEM_FILTER_NODES was passed to __show_free_areas().
+ */
+static bool skip_free_areas_zone(unsigned int flags, const struct zone *zone)
+{
+ bool ret = false;
+
+ if (!(flags & SHOW_MEM_FILTER_NODES))
+ goto out;
+
+ get_mems_allowed();
+ ret = !node_isset(zone->zone_pgdat->node_id,
+ cpuset_current_mems_allowed);
+ put_mems_allowed();
+out:
+ return ret;
+}
+
#define K(x) ((x) << (PAGE_SHIFT-10))
/*
* Show free area list (used inside shift_scroll-lock stuff)
* We also calculate the percentage fragmentation. We do this by counting the
* memory on each free list with the exception of the first item on the list.
+ * Suppresses nodes that are not allowed by current's cpuset if
+ * SHOW_MEM_FILTER_NODES is passed.
*/
-void show_free_areas(void)
+void __show_free_areas(unsigned int filter)
{
int cpu;
struct zone *zone;
for_each_populated_zone(zone) {
+ if (skip_free_areas_zone(filter, zone))
+ continue;
show_node(zone);
printk("%s per-cpu:\n", zone->name);
@@ -2412,6 +2522,8 @@ void show_free_areas(void)
for_each_populated_zone(zone) {
int i;
+ if (skip_free_areas_zone(filter, zone))
+ continue;
show_node(zone);
printk("%s"
" free:%lukB"
@@ -2442,7 +2554,7 @@ void show_free_areas(void)
" all_unreclaimable? %s"
"\n",
zone->name,
- K(zone_nr_free_pages(zone)),
+ K(zone_page_state(zone, NR_FREE_PAGES)),
K(min_wmark_pages(zone)),
K(low_wmark_pages(zone)),
K(high_wmark_pages(zone)),
@@ -2479,6 +2591,8 @@ void show_free_areas(void)
for_each_populated_zone(zone) {
unsigned long nr[MAX_ORDER], flags, order, total = 0;
+ if (skip_free_areas_zone(filter, zone))
+ continue;
show_node(zone);
printk("%s: ", zone->name);
@@ -2498,6 +2612,11 @@ void show_free_areas(void)
show_swap_cache_info();
}
+void show_free_areas(void)
+{
+ __show_free_areas(0);
+}
+
static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
{
zoneref->zone = zone;
@@ -2585,9 +2704,16 @@ static int __parse_numa_zonelist_order(char *s)
static __init int setup_numa_zonelist_order(char *s)
{
- if (s)
- return __parse_numa_zonelist_order(s);
- return 0;
+ int ret;
+
+ if (!s)
+ return 0;
+
+ ret = __parse_numa_zonelist_order(s);
+ if (ret == 0)
+ strlcpy(numa_zonelist_order, s, NUMA_ZONELIST_ORDER_LEN);
+
+ return ret;
}
early_param("numa_zonelist_order", setup_numa_zonelist_order);
@@ -3050,7 +3176,7 @@ static __init_refok int __build_all_zonelists(void *data)
* Called with zonelists_mutex held always
* unless system_state == SYSTEM_BOOTING.
*/
-void build_all_zonelists(void *data)
+void __ref build_all_zonelists(void *data)
{
set_zonelist_order();
@@ -3639,13 +3765,45 @@ void __init free_bootmem_with_active_regions(int nid,
}
#ifdef CONFIG_HAVE_MEMBLOCK
+/*
+ * Basic iterator support. Return the last range of PFNs for a node
+ * Note: nid == MAX_NUMNODES returns last region regardless of node
+ */
+static int __meminit last_active_region_index_in_nid(int nid)
+{
+ int i;
+
+ for (i = nr_nodemap_entries - 1; i >= 0; i--)
+ if (nid == MAX_NUMNODES || early_node_map[i].nid == nid)
+ return i;
+
+ return -1;
+}
+
+/*
+ * Basic iterator support. Return the previous active range of PFNs for a node
+ * Note: nid == MAX_NUMNODES returns the previous region regardless of node
+ */
+static int __meminit previous_active_region_index_in_nid(int index, int nid)
+{
+ for (index = index - 1; index >= 0; index--)
+ if (nid == MAX_NUMNODES || early_node_map[index].nid == nid)
+ return index;
+
+ return -1;
+}
+
+#define for_each_active_range_index_in_nid_reverse(i, nid) \
+ for (i = last_active_region_index_in_nid(nid); i != -1; \
+ i = previous_active_region_index_in_nid(i, nid))
+
u64 __init find_memory_core_early(int nid, u64 size, u64 align,
u64 goal, u64 limit)
{
int i;
/* Need to go over early_node_map to find out good range for node */
- for_each_active_range_index_in_nid(i, nid) {
+ for_each_active_range_index_in_nid_reverse(i, nid) {
u64 addr;
u64 ei_start, ei_last;
u64 final_start, final_end;
@@ -3688,34 +3846,6 @@ int __init add_from_early_node_map(struct range *range, int az,
return nr_range;
}
-#ifdef CONFIG_NO_BOOTMEM
-void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
- u64 goal, u64 limit)
-{
- void *ptr;
- u64 addr;
-
- if (limit > memblock.current_limit)
- limit = memblock.current_limit;
-
- addr = find_memory_core_early(nid, size, align, goal, limit);
-
- if (addr == MEMBLOCK_ERROR)
- return NULL;
-
- ptr = phys_to_virt(addr);
- memset(ptr, 0, size);
- memblock_x86_reserve_range(addr, addr + size, "BOOTMEM");
- /*
- * The min_count is set to 0 so that bootmem allocated blocks
- * are never reported as leaks.
- */
- kmemleak_alloc(ptr, size, 0, 0);
- return ptr;
-}
-#endif
-
-
void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data)
{
int i;
@@ -3796,7 +3926,7 @@ static void __init find_usable_zone_for_movable(void)
/*
* The zone ranges provided by the architecture do not include ZONE_MOVABLE
- * because it is sized independant of architecture. Unlike the other zones,
+ * because it is sized independent of architecture. Unlike the other zones,
* the starting point for ZONE_MOVABLE is not fixed. It may be different
* in each node depending on the size of each node and how evenly kernelcore
* is distributed. This helper function adjusts the zone ranges
@@ -4014,7 +4144,7 @@ static void __init setup_usemap(struct pglist_data *pgdat,
zone->pageblock_flags = alloc_bootmem_node(pgdat, usemapsize);
}
#else
-static void inline setup_usemap(struct pglist_data *pgdat,
+static inline void setup_usemap(struct pglist_data *pgdat,
struct zone *zone, unsigned long zonesize) {}
#endif /* CONFIG_SPARSEMEM */
@@ -4749,15 +4879,6 @@ void __init set_dma_reserve(unsigned long new_dma_reserve)
dma_reserve = new_dma_reserve;
}
-#ifndef CONFIG_NEED_MULTIPLE_NODES
-struct pglist_data __refdata contig_page_data = {
-#ifndef CONFIG_NO_BOOTMEM
- .bdata = &bootmem_node_data[0]
-#endif
- };
-EXPORT_SYMBOL(contig_page_data);
-#endif
-
void __init free_area_init(unsigned long *zones_size)
{
free_area_init_node(0, zones_size,
@@ -5316,10 +5437,9 @@ __count_immobile_pages(struct zone *zone, struct page *page, int count)
for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) {
unsigned long check = pfn + iter;
- if (!pfn_valid_within(check)) {
- iter++;
+ if (!pfn_valid_within(check))
continue;
- }
+
page = pfn_to_page(check);
if (!page_count(page)) {
if (PageBuddy(page))
@@ -5517,7 +5637,6 @@ static struct trace_print_flags pageflag_names[] = {
{1UL << PG_swapcache, "swapcache" },
{1UL << PG_mappedtodisk, "mappedtodisk" },
{1UL << PG_reclaim, "reclaim" },
- {1UL << PG_buddy, "buddy" },
{1UL << PG_swapbacked, "swapbacked" },
{1UL << PG_unevictable, "unevictable" },
#ifdef CONFIG_MMU
@@ -5565,7 +5684,8 @@ void dump_page(struct page *page)
{
printk(KERN_ALERT
"page:%p count:%d mapcount:%d mapping:%p index:%#lx\n",
- page, page_count(page), page_mapcount(page),
+ page, atomic_read(&page->_count), page_mapcount(page),
page->mapping, page->index);
dump_page_flags(page->flags);
+ mem_cgroup_print_bad_page(page);
}
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index 5bffada7cde1..99055010cece 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -11,12 +11,11 @@
#include <linux/swapops.h>
#include <linux/kmemleak.h>
-static void __meminit
-__init_page_cgroup(struct page_cgroup *pc, unsigned long pfn)
+static void __meminit init_page_cgroup(struct page_cgroup *pc, unsigned long id)
{
pc->flags = 0;
+ set_page_cgroup_array_id(pc, id);
pc->mem_cgroup = NULL;
- pc->page = pfn_to_page(pfn);
INIT_LIST_HEAD(&pc->lru);
}
static unsigned long total_usage;
@@ -43,6 +42,19 @@ struct page_cgroup *lookup_page_cgroup(struct page *page)
return base + offset;
}
+struct page *lookup_cgroup_page(struct page_cgroup *pc)
+{
+ unsigned long pfn;
+ struct page *page;
+ pg_data_t *pgdat;
+
+ pgdat = NODE_DATA(page_cgroup_array_id(pc));
+ pfn = pc - pgdat->node_page_cgroup + pgdat->node_start_pfn;
+ page = pfn_to_page(pfn);
+ VM_BUG_ON(pc != lookup_page_cgroup(page));
+ return page;
+}
+
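The new lookup_cgroup_page() above inverts the flatmem lookup by plain pointer arithmetic: the page_cgroup array is indexed by pfn offset from node_start_pfn, so subtracting the array base recovers the pfn. A minimal sketch of that round trip with hypothetical values (not kernel code):

#include <stdio.h>

struct page_cgroup {
	unsigned long flags;
};

int main(void)
{
	struct page_cgroup node_page_cgroup[16];	/* stands in for the per-node array */
	unsigned long node_start_pfn = 0x10000;

	struct page_cgroup *pc = &node_page_cgroup[5];	/* entry stored for some page */
	unsigned long pfn = (pc - node_page_cgroup) + node_start_pfn;

	printf("pfn = %#lx\n", pfn);	/* 0x10005 */
	return 0;
}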
static int __init alloc_node_page_cgroup(int nid)
{
struct page_cgroup *base, *pc;
@@ -63,7 +75,7 @@ static int __init alloc_node_page_cgroup(int nid)
return -ENOMEM;
for (index = 0; index < nr_pages; index++) {
pc = base + index;
- __init_page_cgroup(pc, start_pfn + index);
+ init_page_cgroup(pc, nid);
}
NODE_DATA(nid)->node_page_cgroup = base;
total_usage += table_size;
@@ -105,46 +117,75 @@ struct page_cgroup *lookup_page_cgroup(struct page *page)
return section->page_cgroup + pfn;
}
-/* __alloc_bootmem...() is protected by !slab_available() */
+struct page *lookup_cgroup_page(struct page_cgroup *pc)
+{
+ struct mem_section *section;
+ struct page *page;
+ unsigned long nr;
+
+ nr = page_cgroup_array_id(pc);
+ section = __nr_to_section(nr);
+ page = pfn_to_page(pc - section->page_cgroup);
+ VM_BUG_ON(pc != lookup_page_cgroup(page));
+ return page;
+}
+
+static void *__init_refok alloc_page_cgroup(size_t size, int nid)
+{
+ void *addr = NULL;
+
+ addr = alloc_pages_exact(size, GFP_KERNEL | __GFP_NOWARN);
+ if (addr)
+ return addr;
+
+ if (node_state(nid, N_HIGH_MEMORY))
+ addr = vmalloc_node(size, nid);
+ else
+ addr = vmalloc(size);
+
+ return addr;
+}
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+static void free_page_cgroup(void *addr)
+{
+ if (is_vmalloc_addr(addr)) {
+ vfree(addr);
+ } else {
+ struct page *page = virt_to_page(addr);
+ size_t table_size =
+ sizeof(struct page_cgroup) * PAGES_PER_SECTION;
+
+ BUG_ON(PageReserved(page));
+ free_pages_exact(addr, table_size);
+ }
+}
+#endif
+
static int __init_refok init_section_page_cgroup(unsigned long pfn)
{
- struct mem_section *section = __pfn_to_section(pfn);
struct page_cgroup *base, *pc;
+ struct mem_section *section;
unsigned long table_size;
+ unsigned long nr;
int nid, index;
- if (!section->page_cgroup) {
- nid = page_to_nid(pfn_to_page(pfn));
- table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION;
- VM_BUG_ON(!slab_is_available());
- if (node_state(nid, N_HIGH_MEMORY)) {
- base = kmalloc_node(table_size,
- GFP_KERNEL | __GFP_NOWARN, nid);
- if (!base)
- base = vmalloc_node(table_size, nid);
- } else {
- base = kmalloc(table_size, GFP_KERNEL | __GFP_NOWARN);
- if (!base)
- base = vmalloc(table_size);
- }
- /*
- * The value stored in section->page_cgroup is (base - pfn)
- * and it does not point to the memory block allocated above,
- * causing kmemleak false positives.
- */
- kmemleak_not_leak(base);
- } else {
- /*
- * We don't have to allocate page_cgroup again, but
- * address of memmap may be changed. So, we have to initialize
- * again.
- */
- base = section->page_cgroup + pfn;
- table_size = 0;
- /* check address of memmap is changed or not. */
- if (base->page == pfn_to_page(pfn))
- return 0;
- }
+ nr = pfn_to_section_nr(pfn);
+ section = __nr_to_section(nr);
+
+ if (section->page_cgroup)
+ return 0;
+
+ nid = page_to_nid(pfn_to_page(pfn));
+ table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION;
+ base = alloc_page_cgroup(table_size, nid);
+
+ /*
+ * The value stored in section->page_cgroup is (base - pfn)
+ * and it does not point to the memory block allocated above,
+ * causing kmemleak false positives.
+ */
+ kmemleak_not_leak(base);
if (!base) {
printk(KERN_ERR "page cgroup allocation failure\n");
@@ -153,7 +194,7 @@ static int __init_refok init_section_page_cgroup(unsigned long pfn)
for (index = 0; index < PAGES_PER_SECTION; index++) {
pc = base + index;
- __init_page_cgroup(pc, pfn + index);
+ init_page_cgroup(pc, nr);
}
section->page_cgroup = base - pfn;
@@ -170,16 +211,8 @@ void __free_page_cgroup(unsigned long pfn)
if (!ms || !ms->page_cgroup)
return;
base = ms->page_cgroup + pfn;
- if (is_vmalloc_addr(base)) {
- vfree(base);
- ms->page_cgroup = NULL;
- } else {
- struct page *page = virt_to_page(base);
- if (!PageReserved(page)) { /* Is bootmem ? */
- kfree(base);
- ms->page_cgroup = NULL;
- }
- }
+ free_page_cgroup(base);
+ ms->page_cgroup = NULL;
}
int __meminit online_page_cgroup(unsigned long start_pfn,
@@ -243,12 +276,7 @@ static int __meminit page_cgroup_callback(struct notifier_block *self,
break;
}
- if (ret)
- ret = notifier_from_errno(ret);
- else
- ret = NOTIFY_OK;
-
- return ret;
+ return notifier_from_errno(ret);
}
#endif
@@ -349,7 +377,7 @@ not_enough_page:
* @new: new id
*
* Returns old id at success, 0 at failure.
- * (There is no mem_cgroup useing 0 as its id)
+ * (There is no mem_cgroup using 0 as its id)
*/
unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
unsigned short old, unsigned short new)
diff --git a/mm/page_io.c b/mm/page_io.c
index 2dee975bf469..dc76b4d0611e 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -106,7 +106,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
goto out;
}
if (wbc->sync_mode == WB_SYNC_ALL)
- rw |= REQ_SYNC | REQ_UNPLUG;
+ rw |= REQ_SYNC;
count_vm_event(PSWPOUT);
set_page_writeback(page);
unlock_page(page);
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 38cc58b8b2b0..c3450d533611 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -33,18 +33,35 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
pmd = pmd_offset(pud, addr);
do {
+again:
next = pmd_addr_end(addr, end);
- if (pmd_none_or_clear_bad(pmd)) {
+ if (pmd_none(*pmd)) {
if (walk->pte_hole)
err = walk->pte_hole(addr, next, walk);
if (err)
break;
continue;
}
+ /*
+ * This implies that each ->pmd_entry() handler
+ * needs to know about pmd_trans_huge() pmds
+ */
if (walk->pmd_entry)
err = walk->pmd_entry(pmd, addr, next, walk);
- if (!err && walk->pte_entry)
- err = walk_pte_range(pmd, addr, next, walk);
+ if (err)
+ break;
+
+ /*
+ * Check this here so we only break down trans_huge
+ * pages when we _need_ to
+ */
+ if (!walk->pte_entry)
+ continue;
+
+ split_huge_page_pmd(walk->mm, pmd);
+ if (pmd_none_or_clear_bad(pmd))
+ goto again;
+ err = walk_pte_range(pmd, addr, next, walk);
if (err)
break;
} while (pmd++, addr = next, addr != end);
diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c
index 7d9c1d0ebd3f..ea534960a04b 100644
--- a/mm/percpu-vm.c
+++ b/mm/percpu-vm.c
@@ -421,7 +421,7 @@ static struct pcpu_chunk *pcpu_create_chunk(void)
return NULL;
vms = pcpu_get_vm_areas(pcpu_group_offsets, pcpu_group_sizes,
- pcpu_nr_groups, pcpu_atom_size, GFP_KERNEL);
+ pcpu_nr_groups, pcpu_atom_size);
if (!vms) {
pcpu_free_chunk(chunk);
return NULL;
diff --git a/mm/percpu.c b/mm/percpu.c
index efe816856a9d..a160db39b810 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -258,7 +258,7 @@ static void __maybe_unused pcpu_next_pop(struct pcpu_chunk *chunk,
/*
* (Un)populated page region iterators. Iterate over (un)populated
- * page regions betwen @start and @end in @chunk. @rs and @re should
+ * page regions between @start and @end in @chunk. @rs and @re should
* be integer variables and will be set to start and end page index of
* the current region.
*/
@@ -293,12 +293,8 @@ static void *pcpu_mem_alloc(size_t size)
if (size <= PAGE_SIZE)
return kzalloc(size, GFP_KERNEL);
- else {
- void *ptr = vmalloc(size);
- if (ptr)
- memset(ptr, 0, size);
- return ptr;
- }
+ else
+ return vzalloc(size);
}
/**
@@ -346,7 +342,7 @@ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
* @chunk: chunk of interest
*
* Determine whether area map of @chunk needs to be extended to
- * accomodate a new allocation.
+ * accommodate a new allocation.
*
* CONTEXT:
* pcpu_lock.
@@ -435,7 +431,7 @@ out_unlock:
* depending on @head, is reduced by @tail bytes and @tail byte block
* is inserted after the target block.
*
- * @chunk->map must have enough free slots to accomodate the split.
+ * @chunk->map must have enough free slots to accommodate the split.
*
* CONTEXT:
* pcpu_lock.
@@ -1012,8 +1008,7 @@ phys_addr_t per_cpu_ptr_to_phys(void *addr)
}
if (in_first_chunk) {
- if ((unsigned long)addr < VMALLOC_START ||
- (unsigned long)addr >= VMALLOC_END)
+ if (!is_vmalloc_addr(addr))
return __pa(addr);
else
return page_to_phys(vmalloc_to_page(addr));
@@ -1268,7 +1263,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
/* we're done parsing the input, undefine BUG macro and dump config */
#undef PCPU_SETUP_BUG_ON
- pcpu_dump_alloc_info(KERN_INFO, ai);
+ pcpu_dump_alloc_info(KERN_DEBUG, ai);
pcpu_nr_groups = ai->nr_groups;
pcpu_group_offsets = group_offsets;
@@ -1440,7 +1435,7 @@ static struct pcpu_alloc_info * __init pcpu_build_alloc_info(
/*
* Determine min_unit_size, alloc_size and max_upa such that
* alloc_size is multiple of atom_size and is the smallest
- * which can accomodate 4k aligned segments which are equal to
+ * which can accommodate 4k aligned segments which are equal to
* or larger than min_unit_size.
*/
min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);
@@ -1555,7 +1550,7 @@ static struct pcpu_alloc_info * __init pcpu_build_alloc_info(
* @atom_size: allocation atom size
* @cpu_distance_fn: callback to determine distance between cpus, optional
* @alloc_fn: function to allocate percpu page
- * @free_fn: funtion to free percpu page
+ * @free_fn: function to free percpu page
*
* This is a helper to ease setting up embedded first percpu chunk and
* can be called where pcpu_setup_first_chunk() is expected.
@@ -1683,7 +1678,7 @@ out_free:
* pcpu_page_first_chunk - map the first chunk using PAGE_SIZE pages
* @reserved_size: the size of reserved percpu area in bytes
* @alloc_fn: function to allocate percpu page, always called with PAGE_SIZE
- * @free_fn: funtion to free percpu page, always called with PAGE_SIZE
+ * @free_fn: function to free percpu page, always called with PAGE_SIZE
* @populate_pte_fn: function to populate pte
*
* This is a helper to ease setting up page-remapped first percpu
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
new file mode 100644
index 000000000000..eb663fb533e0
--- /dev/null
+++ b/mm/pgtable-generic.c
@@ -0,0 +1,121 @@
+/*
+ * mm/pgtable-generic.c
+ *
+ * Generic pgtable methods declared in asm-generic/pgtable.h
+ *
+ * Copyright (C) 2010 Linus Torvalds
+ */
+
+#include <linux/pagemap.h>
+#include <asm/tlb.h>
+#include <asm-generic/pgtable.h>
+
+#ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
+/*
+ * Only sets the access flags (dirty, accessed, and
+ * writable). Furthermore, we know it always gets set to a "more
+ * permissive" setting, which allows most architectures to optimize
+ * this. We return whether the PTE actually changed, which in turn
+ * instructs the caller to do things like update_mmu_cache(). This
+ * used to be done in the caller, but sparc needs minor faults to
+ * force that call on sun4c so we changed this macro slightly
+ */
+int ptep_set_access_flags(struct vm_area_struct *vma,
+ unsigned long address, pte_t *ptep,
+ pte_t entry, int dirty)
+{
+ int changed = !pte_same(*ptep, entry);
+ if (changed) {
+ set_pte_at(vma->vm_mm, address, ptep, entry);
+ flush_tlb_page(vma, address);
+ }
+ return changed;
+}
+#endif
+
+#ifndef __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
+int pmdp_set_access_flags(struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmdp,
+ pmd_t entry, int dirty)
+{
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ int changed = !pmd_same(*pmdp, entry);
+ VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+ if (changed) {
+ set_pmd_at(vma->vm_mm, address, pmdp, entry);
+ flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+ }
+ return changed;
+#else /* CONFIG_TRANSPARENT_HUGEPAGE */
+ BUG();
+ return 0;
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+}
+#endif
+
+#ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
+int ptep_clear_flush_young(struct vm_area_struct *vma,
+ unsigned long address, pte_t *ptep)
+{
+ int young;
+ young = ptep_test_and_clear_young(vma, address, ptep);
+ if (young)
+ flush_tlb_page(vma, address);
+ return young;
+}
+#endif
+
+#ifndef __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH
+int pmdp_clear_flush_young(struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmdp)
+{
+ int young;
+#ifndef CONFIG_TRANSPARENT_HUGEPAGE
+ BUG();
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+ VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+ young = pmdp_test_and_clear_young(vma, address, pmdp);
+ if (young)
+ flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+ return young;
+}
+#endif
+
+#ifndef __HAVE_ARCH_PTEP_CLEAR_FLUSH
+pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address,
+ pte_t *ptep)
+{
+ pte_t pte;
+ pte = ptep_get_and_clear((vma)->vm_mm, address, ptep);
+ flush_tlb_page(vma, address);
+ return pte;
+}
+#endif
+
+#ifndef __HAVE_ARCH_PMDP_CLEAR_FLUSH
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+pmd_t pmdp_clear_flush(struct vm_area_struct *vma, unsigned long address,
+ pmd_t *pmdp)
+{
+ pmd_t pmd;
+ VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+ pmd = pmdp_get_and_clear(vma->vm_mm, address, pmdp);
+ flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+ return pmd;
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+#endif
+
+#ifndef __HAVE_ARCH_PMDP_SPLITTING_FLUSH
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+pmd_t pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
+ pmd_t *pmdp)
+{
+ pmd_t pmd = pmd_mksplitting(*pmdp);
+ VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+ set_pmd_at(vma->vm_mm, address, pmdp, pmd);
+ /* tlb flush only to serialize against gup-fast */
+ flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+#endif
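Several of the pmd helpers in this new file assert VM_BUG_ON(address & ~HPAGE_PMD_MASK), i.e. that the address is huge-pmd aligned. A small sketch of that mask check, assuming the usual 2 MiB huge pmds of x86 (constants are illustrative, not taken from the patch):

#include <stdio.h>

#define HPAGE_PMD_SHIFT	21			/* assumed: 2 MiB huge pmds */
#define HPAGE_PMD_SIZE	(1UL << HPAGE_PMD_SHIFT)
#define HPAGE_PMD_MASK	(~(HPAGE_PMD_SIZE - 1))

int main(void)
{
	unsigned long aligned = 0x200000;
	unsigned long unaligned = 0x201000;

	printf("%lu %lu\n", aligned & ~HPAGE_PMD_MASK, unaligned & ~HPAGE_PMD_MASK);
	/* prints: 0 4096 -- only the unaligned address would trip the assertion */
	return 0;
}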
diff --git a/mm/readahead.c b/mm/readahead.c
index 77506a291a2d..2c0cc489e288 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -109,9 +109,12 @@ EXPORT_SYMBOL(read_cache_pages);
static int read_pages(struct address_space *mapping, struct file *filp,
struct list_head *pages, unsigned nr_pages)
{
+ struct blk_plug plug;
unsigned page_idx;
int ret;
+ blk_start_plug(&plug);
+
if (mapping->a_ops->readpages) {
ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages);
/* Clean up the remaining pages */
@@ -129,7 +132,10 @@ static int read_pages(struct address_space *mapping, struct file *filp,
page_cache_release(page);
}
ret = 0;
+
out:
+ blk_finish_plug(&plug);
+
return ret;
}
@@ -554,17 +560,5 @@ page_cache_async_readahead(struct address_space *mapping,
/* do read-ahead */
ondemand_readahead(mapping, ra, filp, true, offset, req_size);
-
-#ifdef CONFIG_BLOCK
- /*
- * Normally the current page is !uptodate and lock_page() will be
- * immediately called to implicitly unplug the device. However this
- * is not always true for RAID conifgurations, where data arrives
- * not strictly in their submission order. In this case we need to
- * explicitly kick off the IO.
- */
- if (PageUptodate(page))
- blk_run_backing_dev(mapping->backing_dev_info, NULL);
-#endif
}
EXPORT_SYMBOL_GPL(page_cache_async_readahead);
diff --git a/mm/rmap.c b/mm/rmap.c
index 1a8bf76bfd03..8da044a1db0f 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -31,11 +31,12 @@
* swap_lock (in swap_duplicate, swap_info_get)
* mmlist_lock (in mmput, drain_mmlist and others)
* mapping->private_lock (in __set_page_dirty_buffers)
- * inode_lock (in set_page_dirty's __mark_inode_dirty)
+ * inode->i_lock (in set_page_dirty's __mark_inode_dirty)
+ * inode_wb_list_lock (in set_page_dirty's __mark_inode_dirty)
* sb_lock (within inode_lock in fs/fs-writeback.c)
* mapping->tree_lock (widely used, in set_page_dirty,
* in arch-dependent flush_dcache_mmap_lock,
- * within inode_lock in __sync_single_inode)
+ * within inode_wb_list_lock in __sync_single_inode)
*
* (code doesn't rely on that order so it could be switched around)
* ->tasklist_lock
@@ -67,11 +68,24 @@ static struct kmem_cache *anon_vma_chain_cachep;
static inline struct anon_vma *anon_vma_alloc(void)
{
- return kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL);
+ struct anon_vma *anon_vma;
+
+ anon_vma = kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL);
+ if (anon_vma) {
+ atomic_set(&anon_vma->refcount, 1);
+ /*
+ * Initialise the anon_vma root to point to itself. If called
+ * from fork, the root will be reset to the parent's anon_vma.
+ */
+ anon_vma->root = anon_vma;
+ }
+
+ return anon_vma;
}
-void anon_vma_free(struct anon_vma *anon_vma)
+static inline void anon_vma_free(struct anon_vma *anon_vma)
{
+ VM_BUG_ON(atomic_read(&anon_vma->refcount));
kmem_cache_free(anon_vma_cachep, anon_vma);
}
@@ -94,7 +108,7 @@ static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain)
* anonymous pages mapped into it with that anon_vma.
*
* The common case will be that we already have one, but if
- * if not we either need to find an adjacent mapping that we
+ * not we either need to find an adjacent mapping that we
* can re-use the anon_vma from (very common when the only
* reason for splitting a vma has been mprotect()), or we
* allocate a new one.
@@ -133,11 +147,6 @@ int anon_vma_prepare(struct vm_area_struct *vma)
if (unlikely(!anon_vma))
goto out_enomem_free_avc;
allocated = anon_vma;
- /*
- * This VMA had no anon_vma yet. This anon_vma is
- * the root of any anon_vma tree that might form.
- */
- anon_vma->root = anon_vma;
}
anon_vma_lock(anon_vma);
@@ -156,7 +165,7 @@ int anon_vma_prepare(struct vm_area_struct *vma)
anon_vma_unlock(anon_vma);
if (unlikely(allocated))
- anon_vma_free(allocated);
+ put_anon_vma(allocated);
if (unlikely(avc))
anon_vma_chain_free(avc);
}
@@ -177,6 +186,10 @@ static void anon_vma_chain_link(struct vm_area_struct *vma,
list_add(&avc->same_vma, &vma->anon_vma_chain);
anon_vma_lock(anon_vma);
+ /*
+ * It's critical to add new vmas to the tail of the anon_vma,
+ * see comment in huge_memory.c:__split_huge_page().
+ */
list_add_tail(&avc->same_anon_vma, &anon_vma->head);
anon_vma_unlock(anon_vma);
}
@@ -237,9 +250,9 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
*/
anon_vma->root = pvma->anon_vma->root;
/*
- * With KSM refcounts, an anon_vma can stay around longer than the
- * process it belongs to. The root anon_vma needs to be pinned
- * until this anon_vma is freed, because the lock lives in the root.
+ * With refcounts, an anon_vma can stay around longer than the
+ * process it belongs to. The root anon_vma needs to be pinned until
+ * this anon_vma is freed, because the lock lives in the root.
*/
get_anon_vma(anon_vma->root);
/* Mark this anon_vma as the one where our new (COWed) pages go. */
@@ -249,7 +262,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
return 0;
out_error_free_anon_vma:
- anon_vma_free(anon_vma);
+ put_anon_vma(anon_vma);
out_error:
unlink_anon_vmas(vma);
return -ENOMEM;
@@ -268,15 +281,11 @@ static void anon_vma_unlink(struct anon_vma_chain *anon_vma_chain)
list_del(&anon_vma_chain->same_anon_vma);
/* We must garbage collect the anon_vma if it's empty */
- empty = list_empty(&anon_vma->head) && !anonvma_external_refcount(anon_vma);
+ empty = list_empty(&anon_vma->head);
anon_vma_unlock(anon_vma);
- if (empty) {
- /* We no longer need the root anon_vma */
- if (anon_vma->root != anon_vma)
- drop_anon_vma(anon_vma->root);
- anon_vma_free(anon_vma);
- }
+ if (empty)
+ put_anon_vma(anon_vma);
}
void unlink_anon_vmas(struct vm_area_struct *vma)
@@ -299,7 +308,7 @@ static void anon_vma_ctor(void *data)
struct anon_vma *anon_vma = data;
spin_lock_init(&anon_vma->lock);
- anonvma_external_refcount_init(anon_vma);
+ atomic_set(&anon_vma->refcount, 0);
INIT_LIST_HEAD(&anon_vma->head);
}
@@ -360,7 +369,7 @@ void page_unlock_anon_vma(struct anon_vma *anon_vma)
* Returns virtual address or -EFAULT if page's index/offset is not
* within the range mapped the @vma.
*/
-static inline unsigned long
+inline unsigned long
vma_address(struct page *page, struct vm_area_struct *vma)
{
pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
@@ -435,6 +444,8 @@ pte_t *__page_check_address(struct page *page, struct mm_struct *mm,
pmd = pmd_offset(pud, address);
if (!pmd_present(*pmd))
return NULL;
+ if (pmd_trans_huge(*pmd))
+ return NULL;
pte = pte_offset_map(pmd, address);
/* Make a quick check before getting the lock */
@@ -489,35 +500,65 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma,
unsigned long *vm_flags)
{
struct mm_struct *mm = vma->vm_mm;
- pte_t *pte;
- spinlock_t *ptl;
int referenced = 0;
- pte = page_check_address(page, mm, address, &ptl, 0);
- if (!pte)
- goto out;
-
- /*
- * Don't want to elevate referenced for mlocked page that gets this far,
- * in order that it progresses to try_to_unmap and is moved to the
- * unevictable list.
- */
- if (vma->vm_flags & VM_LOCKED) {
- *mapcount = 1; /* break early from loop */
- *vm_flags |= VM_LOCKED;
- goto out_unmap;
- }
+ if (unlikely(PageTransHuge(page))) {
+ pmd_t *pmd;
- if (ptep_clear_flush_young_notify(vma, address, pte)) {
+ spin_lock(&mm->page_table_lock);
/*
- * Don't treat a reference through a sequentially read
- * mapping as such. If the page has been used in
- * another mapping, we will catch it; if this other
- * mapping is already gone, the unmap path will have
- * set PG_referenced or activated the page.
+ * rmap might return false positives; we must filter
+ * these out using page_check_address_pmd().
*/
- if (likely(!VM_SequentialReadHint(vma)))
+ pmd = page_check_address_pmd(page, mm, address,
+ PAGE_CHECK_ADDRESS_PMD_FLAG);
+ if (!pmd) {
+ spin_unlock(&mm->page_table_lock);
+ goto out;
+ }
+
+ if (vma->vm_flags & VM_LOCKED) {
+ spin_unlock(&mm->page_table_lock);
+ *mapcount = 0; /* break early from loop */
+ *vm_flags |= VM_LOCKED;
+ goto out;
+ }
+
+ /* go ahead even if the pmd is pmd_trans_splitting() */
+ if (pmdp_clear_flush_young_notify(vma, address, pmd))
referenced++;
+ spin_unlock(&mm->page_table_lock);
+ } else {
+ pte_t *pte;
+ spinlock_t *ptl;
+
+ /*
+ * rmap might return false positives; we must filter
+ * these out using page_check_address().
+ */
+ pte = page_check_address(page, mm, address, &ptl, 0);
+ if (!pte)
+ goto out;
+
+ if (vma->vm_flags & VM_LOCKED) {
+ pte_unmap_unlock(pte, ptl);
+ *mapcount = 0; /* break early from loop */
+ *vm_flags |= VM_LOCKED;
+ goto out;
+ }
+
+ if (ptep_clear_flush_young_notify(vma, address, pte)) {
+ /*
+ * Don't treat a reference through a sequentially read
+ * mapping as such. If the page has been used in
+ * another mapping, we will catch it; if this other
+ * mapping is already gone, the unmap path will have
+ * set PG_referenced or activated the page.
+ */
+ if (likely(!VM_SequentialReadHint(vma)))
+ referenced++;
+ }
+ pte_unmap_unlock(pte, ptl);
}
/* Pretend the page is referenced if the task has the
@@ -526,9 +567,7 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma,
rwsem_is_locked(&mm->mmap_sem))
referenced++;
-out_unmap:
(*mapcount)--;
- pte_unmap_unlock(pte, ptl);
if (referenced)
*vm_flags |= vma->vm_flags;
@@ -864,8 +903,13 @@ void do_page_add_anon_rmap(struct page *page,
struct vm_area_struct *vma, unsigned long address, int exclusive)
{
int first = atomic_inc_and_test(&page->_mapcount);
- if (first)
- __inc_zone_page_state(page, NR_ANON_PAGES);
+ if (first) {
+ if (!PageTransHuge(page))
+ __inc_zone_page_state(page, NR_ANON_PAGES);
+ else
+ __inc_zone_page_state(page,
+ NR_ANON_TRANSPARENT_HUGEPAGES);
+ }
if (unlikely(PageKsm(page)))
return;
@@ -893,7 +937,10 @@ void page_add_new_anon_rmap(struct page *page,
VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
SetPageSwapBacked(page);
atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */
- __inc_zone_page_state(page, NR_ANON_PAGES);
+ if (!PageTransHuge(page))
+ __inc_zone_page_state(page, NR_ANON_PAGES);
+ else
+ __inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
__page_set_anon_rmap(page, vma, address, 1);
if (page_evictable(page, vma))
lru_cache_add_lru(page, LRU_ACTIVE_ANON);
@@ -911,7 +958,7 @@ void page_add_file_rmap(struct page *page)
{
if (atomic_inc_and_test(&page->_mapcount)) {
__inc_zone_page_state(page, NR_FILE_MAPPED);
- mem_cgroup_update_file_mapped(page, 1);
+ mem_cgroup_inc_page_stat(page, MEMCG_NR_FILE_MAPPED);
}
}
@@ -946,10 +993,14 @@ void page_remove_rmap(struct page *page)
return;
if (PageAnon(page)) {
mem_cgroup_uncharge_page(page);
- __dec_zone_page_state(page, NR_ANON_PAGES);
+ if (!PageTransHuge(page))
+ __dec_zone_page_state(page, NR_ANON_PAGES);
+ else
+ __dec_zone_page_state(page,
+ NR_ANON_TRANSPARENT_HUGEPAGES);
} else {
__dec_zone_page_state(page, NR_FILE_MAPPED);
- mem_cgroup_update_file_mapped(page, -1);
+ mem_cgroup_dec_page_stat(page, MEMCG_NR_FILE_MAPPED);
}
/*
* It would be tidy to reset the PageAnon mapping here,
@@ -1202,7 +1253,7 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
return ret;
}
-static bool is_vma_temporary_stack(struct vm_area_struct *vma)
+bool is_vma_temporary_stack(struct vm_area_struct *vma)
{
int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP);
@@ -1400,6 +1451,7 @@ int try_to_unmap(struct page *page, enum ttu_flags flags)
int ret;
BUG_ON(!PageLocked(page));
+ VM_BUG_ON(!PageHuge(page) && PageTransHuge(page));
if (unlikely(PageKsm(page)))
ret = try_to_unmap_ksm(page, flags);
@@ -1439,41 +1491,15 @@ int try_to_munlock(struct page *page)
return try_to_unmap_file(page, TTU_MUNLOCK);
}
-#if defined(CONFIG_KSM) || defined(CONFIG_MIGRATION)
-/*
- * Drop an anon_vma refcount, freeing the anon_vma and anon_vma->root
- * if necessary. Be careful to do all the tests under the lock. Once
- * we know we are the last user, nobody else can get a reference and we
- * can do the freeing without the lock.
- */
-void drop_anon_vma(struct anon_vma *anon_vma)
+void __put_anon_vma(struct anon_vma *anon_vma)
{
- BUG_ON(atomic_read(&anon_vma->external_refcount) <= 0);
- if (atomic_dec_and_lock(&anon_vma->external_refcount, &anon_vma->root->lock)) {
- struct anon_vma *root = anon_vma->root;
- int empty = list_empty(&anon_vma->head);
- int last_root_user = 0;
- int root_empty = 0;
+ struct anon_vma *root = anon_vma->root;
- /*
- * The refcount on a non-root anon_vma got dropped. Drop
- * the refcount on the root and check if we need to free it.
- */
- if (empty && anon_vma != root) {
- BUG_ON(atomic_read(&root->external_refcount) <= 0);
- last_root_user = atomic_dec_and_test(&root->external_refcount);
- root_empty = list_empty(&root->head);
- }
- anon_vma_unlock(anon_vma);
+ if (root != anon_vma && atomic_dec_and_test(&root->refcount))
+ anon_vma_free(root);
- if (empty) {
- anon_vma_free(anon_vma);
- if (root_empty && last_root_user)
- anon_vma_free(root);
- }
- }
+ anon_vma_free(anon_vma);
}
-#endif
#ifdef CONFIG_MIGRATION
/*
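The hunk above collapses the old KSM/migration-only external_refcount into an unconditional anon_vma->refcount, with __put_anon_vma() as the slow path once the count hits zero. The fast-path helper lives on the header side and is not part of this hunk; a minimal sketch of how the two halves fit together (reconstructed for context, so treat the exact helper as an assumption rather than quoted source):

	/* header-side counterpart of __put_anon_vma() above (sketch) */
	static inline void put_anon_vma(struct anon_vma *anon_vma)
	{
		if (atomic_dec_and_test(&anon_vma->refcount))
			__put_anon_vma(anon_vma);
	}

Every caller that used to pick between anon_vma_free() and drop_anon_vma() now funnels through this single refcount drop, which is why the empty/root bookkeeping in the old drop_anon_vma() could simply be deleted.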
diff --git a/mm/shmem.c b/mm/shmem.c
index 47fdeeb9d636..8fa27e4e582a 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -224,7 +224,6 @@ static const struct vm_operations_struct shmem_vm_ops;
static struct backing_dev_info shmem_backing_dev_info __read_mostly = {
.ra_pages = 0, /* No readahead */
.capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED,
- .unplug_io_fn = default_unplug_io_fn,
};
static LIST_HEAD(shmem_swaplist);
@@ -422,7 +421,8 @@ static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long
* a waste to allocate index if we cannot allocate data.
*/
if (sbinfo->max_blocks) {
- if (percpu_counter_compare(&sbinfo->used_blocks, (sbinfo->max_blocks - 1)) > 0)
+ if (percpu_counter_compare(&sbinfo->used_blocks,
+ sbinfo->max_blocks - 1) >= 0)
return ERR_PTR(-ENOSPC);
percpu_counter_inc(&sbinfo->used_blocks);
spin_lock(&inode->i_lock);
@@ -779,7 +779,7 @@ static int shmem_notify_change(struct dentry *dentry, struct iattr *attr)
* If truncating down to a partial page, then
* if that page is already allocated, hold it
* in memory until the truncation is over, so
- * truncate_partial_page cannnot miss it were
+ * truncate_partial_page cannot miss it were
* it assigned to swap.
*/
if (newsize & (PAGE_CACHE_SIZE-1)) {
@@ -1081,7 +1081,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
shmem_recalc_inode(inode);
if (swap.val && add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) {
- remove_from_page_cache(page);
+ delete_from_page_cache(page);
shmem_swp_set(info, entry, swap.val);
shmem_swp_unmap(entry);
if (list_empty(&info->swaplist))
@@ -1091,7 +1091,6 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
spin_unlock(&info->lock);
swap_shmem_alloc(swap);
BUG_ON(page_mapped(page));
- page_cache_release(page); /* pagecache ref */
swap_writepage(page, wbc);
if (inode) {
mutex_lock(&shmem_swaplist_mutex);
@@ -1399,7 +1398,8 @@ repeat:
shmem_swp_unmap(entry);
sbinfo = SHMEM_SB(inode->i_sb);
if (sbinfo->max_blocks) {
- if ((percpu_counter_compare(&sbinfo->used_blocks, sbinfo->max_blocks) > 0) ||
+ if (percpu_counter_compare(&sbinfo->used_blocks,
+ sbinfo->max_blocks) >= 0 ||
shmem_acct_block(info->flags)) {
spin_unlock(&info->lock);
error = -ENOSPC;
@@ -1843,8 +1843,9 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
inode = shmem_get_inode(dir->i_sb, dir, mode, dev, VM_NORESERVE);
if (inode) {
- error = security_inode_init_security(inode, dir, NULL, NULL,
- NULL);
+ error = security_inode_init_security(inode, dir,
+ &dentry->d_name, NULL,
+ NULL, NULL);
if (error) {
if (error != -EOPNOTSUPP) {
iput(inode);
@@ -1983,8 +1984,8 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
if (!inode)
return -ENOSPC;
- error = security_inode_init_security(inode, dir, NULL, NULL,
- NULL);
+ error = security_inode_init_security(inode, dir, &dentry->d_name, NULL,
+ NULL, NULL);
if (error) {
if (error != -EOPNOTSUPP) {
iput(inode);
@@ -2144,8 +2145,10 @@ static int shmem_encode_fh(struct dentry *dentry, __u32 *fh, int *len,
{
struct inode *inode = dentry->d_inode;
- if (*len < 3)
+ if (*len < 3) {
+ *len = 3;
return 255;
+ }
if (inode_unhashed(inode)) {
/* Unfortunately insert_inode_hash is not idempotent,
@@ -2415,13 +2418,20 @@ static struct inode *shmem_alloc_inode(struct super_block *sb)
return &p->vfs_inode;
}
+static void shmem_i_callback(struct rcu_head *head)
+{
+ struct inode *inode = container_of(head, struct inode, i_rcu);
+ INIT_LIST_HEAD(&inode->i_dentry);
+ kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode));
+}
+
static void shmem_destroy_inode(struct inode *inode)
{
if ((inode->i_mode & S_IFMT) == S_IFREG) {
/* only struct inode is valid if it's an inline symlink */
mpol_free_shared_policy(&SHMEM_I(inode)->policy);
}
- kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode));
+ call_rcu(&inode->i_rcu, shmem_i_callback);
}
static void init_once(void *foo)
@@ -2784,5 +2794,6 @@ int shmem_zero_setup(struct vm_area_struct *vma)
fput(vma->vm_file);
vma->vm_file = file;
vma->vm_ops = &shmem_vm_ops;
+ vma->vm_flags |= VM_CAN_NONLINEAR;
return 0;
}
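The shmem_i_callback() conversion above defers the kmem_cache_free() of the inode by one RCU grace period. That is needed because RCU path walk may inspect an inode without holding any locks; a hypothetical reader of that kind, shown only to illustrate what the grace period protects (the helper name and body are invented):

	static bool example_inode_still_hashed(struct dentry *dentry)
	{
		struct inode *inode;
		bool hashed;

		rcu_read_lock();
		inode = ACCESS_ONCE(dentry->d_inode);
		/*
		 * Safe even if the inode is being destroyed concurrently:
		 * the memory is not handed back to the slab until the
		 * call_rcu() callback above has run.
		 */
		hashed = inode && !inode_unhashed(inode);
		rcu_read_unlock();
		return hashed;
	}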
diff --git a/mm/slab.c b/mm/slab.c
index b1e40dafbab3..46a9c163a92f 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -191,22 +191,6 @@ typedef unsigned int kmem_bufctl_t;
#define SLAB_LIMIT (((kmem_bufctl_t)(~0U))-3)
/*
- * struct slab
- *
- * Manages the objs in a slab. Placed either at the beginning of mem allocated
- * for a slab, or allocated from an general cache.
- * Slabs are chained into three list: fully used, partial, fully free slabs.
- */
-struct slab {
- struct list_head list;
- unsigned long colouroff;
- void *s_mem; /* including colour offset */
- unsigned int inuse; /* num of objs active in slab */
- kmem_bufctl_t free;
- unsigned short nodeid;
-};
-
-/*
* struct slab_rcu
*
* slab_destroy on a SLAB_DESTROY_BY_RCU cache uses this structure to
@@ -219,8 +203,6 @@ struct slab {
*
* rcu_read_lock before reading the address, then rcu_read_unlock after
* taking the spinlock within the structure expected at that address.
- *
- * We assume struct slab_rcu can overlay struct slab when destroying.
*/
struct slab_rcu {
struct rcu_head head;
@@ -229,6 +211,27 @@ struct slab_rcu {
};
/*
+ * struct slab
+ *
+ * Manages the objs in a slab. Placed either at the beginning of mem allocated
+ * for a slab, or allocated from a general cache.
+ * Slabs are chained into three lists: fully used, partial, fully free slabs.
+ */
+struct slab {
+ union {
+ struct {
+ struct list_head list;
+ unsigned long colouroff;
+ void *s_mem; /* including colour offset */
+ unsigned int inuse; /* num of objs active in slab */
+ kmem_bufctl_t free;
+ unsigned short nodeid;
+ };
+ struct slab_rcu __slab_cover_slab_rcu;
+ };
+};
+
+/*
* struct array_cache
*
* Purpose:
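With the union above, the overlay that the old comment merely assumed ("struct slab_rcu can overlay struct slab") is now enforced by the compiler, since struct slab is sized to hold whichever member is larger. Purely as an illustration (this check is not in the patch and is redundant given the union), the old assumption could have been spelled out as:

	static inline void slab_rcu_overlay_check(void)
	{
		/* always true now that slab_rcu is a member of the union */
		BUILD_BUG_ON(sizeof(struct slab_rcu) > sizeof(struct slab));
	}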
@@ -284,7 +287,7 @@ struct kmem_list3 {
* Need this for bootstrapping a per node allocator.
*/
#define NUM_INIT_LISTS (3 * MAX_NUMNODES)
-struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS];
+static struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS];
#define CACHE_CACHE 0
#define SIZE_AC MAX_NUMNODES
#define SIZE_L3 (2 * MAX_NUMNODES)
@@ -829,12 +832,12 @@ static void init_reap_node(int cpu)
static void next_reap_node(void)
{
- int node = __get_cpu_var(slab_reap_node);
+ int node = __this_cpu_read(slab_reap_node);
node = next_node(node, node_online_map);
if (unlikely(node >= MAX_NUMNODES))
node = first_node(node_online_map);
- __get_cpu_var(slab_reap_node) = node;
+ __this_cpu_write(slab_reap_node, node);
}
#else
@@ -875,7 +878,7 @@ static struct array_cache *alloc_arraycache(int node, int entries,
nc = kmalloc_node(memsize, gfp, node);
/*
* The array_cache structures contain pointers to free object.
- * However, when such objects are allocated or transfered to another
+ * However, when such objects are allocated or transferred to another
* cache the pointers are not cleared and they could be counted as
* valid references during a kmemleak scan. Therefore, kmemleak must
* not scan such objects.
@@ -1012,7 +1015,7 @@ static void __drain_alien_cache(struct kmem_cache *cachep,
*/
static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3)
{
- int node = __get_cpu_var(slab_reap_node);
+ int node = __this_cpu_read(slab_reap_node);
if (l3->alien) {
struct array_cache *ac = l3->alien[node];
@@ -1293,7 +1296,7 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb,
* anything expensive but will only modify reap_work
* and reschedule the timer.
*/
- cancel_rearming_delayed_work(&per_cpu(slab_reap_work, cpu));
+ cancel_delayed_work_sync(&per_cpu(slab_reap_work, cpu));
/* Now the cache_reaper is guaranteed to be not running. */
per_cpu(slab_reap_work, cpu).work.func = NULL;
break;
@@ -1387,7 +1390,7 @@ static int __meminit slab_memory_callback(struct notifier_block *self,
break;
}
out:
- return ret ? notifier_from_errno(ret) : NOTIFY_OK;
+ return notifier_from_errno(ret);
}
#endif /* CONFIG_NUMA && CONFIG_MEMORY_HOTPLUG */
@@ -2147,8 +2150,6 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
*
* @name must be valid until the cache is destroyed. This implies that
* the module calling this has to destroy the cache before getting unloaded.
- * Note that kmem_cache_name() is not guaranteed to return the same pointer,
- * therefore applications must manage it themselves.
*
* The flags are
*
@@ -2288,8 +2289,8 @@ kmem_cache_create (const char *name, size_t size, size_t align,
if (ralign < align) {
ralign = align;
}
- /* disable debug if not aligning with REDZONE_ALIGN */
- if (ralign & (__alignof__(unsigned long long) - 1))
+ /* disable debug if necessary */
+ if (ralign > __alignof__(unsigned long long))
flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
/*
* 4) Store it.
@@ -2315,8 +2316,8 @@ kmem_cache_create (const char *name, size_t size, size_t align,
*/
if (flags & SLAB_RED_ZONE) {
/* add space for red zone words */
- cachep->obj_offset += align;
- size += align + sizeof(unsigned long long);
+ cachep->obj_offset += sizeof(unsigned long long);
+ size += 2 * sizeof(unsigned long long);
}
if (flags & SLAB_STORE_USER) {
/* user store requires one word storage behind the end of
@@ -2605,7 +2606,7 @@ EXPORT_SYMBOL(kmem_cache_shrink);
*
* The cache must be empty before calling this function.
*
- * The caller must guarantee that noone will allocate memory from the cache
+ * The caller must guarantee that no one will allocate memory from the cache
* during the kmem_cache_destroy().
*/
void kmem_cache_destroy(struct kmem_cache *cachep)
@@ -2781,7 +2782,7 @@ static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp,
/*
* Map pages beginning at addr to the given cache and slab. This is required
* for the slab allocator to be able to lookup the cache and slab of a
- * virtual address for kfree, ksize, kmem_ptr_validate, and slab debugging.
+ * virtual address for kfree, ksize, and slab debugging.
*/
static void slab_map_pages(struct kmem_cache *cache, struct slab *slab,
void *addr)
@@ -3653,42 +3654,19 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
EXPORT_SYMBOL(kmem_cache_alloc);
#ifdef CONFIG_TRACING
-void *kmem_cache_alloc_notrace(struct kmem_cache *cachep, gfp_t flags)
+void *
+kmem_cache_alloc_trace(size_t size, struct kmem_cache *cachep, gfp_t flags)
{
- return __cache_alloc(cachep, flags, __builtin_return_address(0));
-}
-EXPORT_SYMBOL(kmem_cache_alloc_notrace);
-#endif
+ void *ret;
-/**
- * kmem_ptr_validate - check if an untrusted pointer might be a slab entry.
- * @cachep: the cache we're checking against
- * @ptr: pointer to validate
- *
- * This verifies that the untrusted pointer looks sane;
- * it is _not_ a guarantee that the pointer is actually
- * part of the slab cache in question, but it at least
- * validates that the pointer can be dereferenced and
- * looks half-way sane.
- *
- * Currently only used for dentry validation.
- */
-int kmem_ptr_validate(struct kmem_cache *cachep, const void *ptr)
-{
- unsigned long size = cachep->buffer_size;
- struct page *page;
+ ret = __cache_alloc(cachep, flags, __builtin_return_address(0));
- if (unlikely(!kern_ptr_validate(ptr, size)))
- goto out;
- page = virt_to_page(ptr);
- if (unlikely(!PageSlab(page)))
- goto out;
- if (unlikely(page_get_cache(page) != cachep))
- goto out;
- return 1;
-out:
- return 0;
+ trace_kmalloc(_RET_IP_, ret,
+ size, slab_buffer_size(cachep), flags);
+ return ret;
}
+EXPORT_SYMBOL(kmem_cache_alloc_trace);
+#endif
#ifdef CONFIG_NUMA
void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
@@ -3705,31 +3683,32 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
EXPORT_SYMBOL(kmem_cache_alloc_node);
#ifdef CONFIG_TRACING
-void *kmem_cache_alloc_node_notrace(struct kmem_cache *cachep,
- gfp_t flags,
- int nodeid)
+void *kmem_cache_alloc_node_trace(size_t size,
+ struct kmem_cache *cachep,
+ gfp_t flags,
+ int nodeid)
{
- return __cache_alloc_node(cachep, flags, nodeid,
+ void *ret;
+
+ ret = __cache_alloc_node(cachep, flags, nodeid,
__builtin_return_address(0));
+ trace_kmalloc_node(_RET_IP_, ret,
+ size, slab_buffer_size(cachep),
+ flags, nodeid);
+ return ret;
}
-EXPORT_SYMBOL(kmem_cache_alloc_node_notrace);
+EXPORT_SYMBOL(kmem_cache_alloc_node_trace);
#endif
static __always_inline void *
__do_kmalloc_node(size_t size, gfp_t flags, int node, void *caller)
{
struct kmem_cache *cachep;
- void *ret;
cachep = kmem_find_general_cachep(size, flags);
if (unlikely(ZERO_OR_NULL_PTR(cachep)))
return cachep;
- ret = kmem_cache_alloc_node_notrace(cachep, flags, node);
-
- trace_kmalloc_node((unsigned long) caller, ret,
- size, cachep->buffer_size, flags, node);
-
- return ret;
+ return kmem_cache_alloc_node_trace(size, cachep, flags, node);
}
#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING)
@@ -3862,12 +3841,6 @@ unsigned int kmem_cache_size(struct kmem_cache *cachep)
}
EXPORT_SYMBOL(kmem_cache_size);
-const char *kmem_cache_name(struct kmem_cache *cachep)
-{
- return cachep->name;
-}
-EXPORT_SYMBOL_GPL(kmem_cache_name);
-
/*
* This initializes kmem_list3 or resizes various caches for all nodes.
*/
@@ -4075,7 +4048,7 @@ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
* necessary. Note that the l3 listlock also protects the array_cache
* if drain_array() is used on the shared array.
*/
-void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3,
+static void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3,
struct array_cache *ac, int force, int node)
{
int tofree;
@@ -4339,7 +4312,7 @@ static const struct seq_operations slabinfo_op = {
* @count: data length
* @ppos: unused
*/
-ssize_t slabinfo_write(struct file *file, const char __user * buffer,
+static ssize_t slabinfo_write(struct file *file, const char __user *buffer,
size_t count, loff_t *ppos)
{
char kbuf[MAX_SLABINFO_WRITE + 1], *tmp;
diff --git a/mm/slob.c b/mm/slob.c
index 617b6d6c42c7..46e0aee33a23 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -666,23 +666,12 @@ unsigned int kmem_cache_size(struct kmem_cache *c)
}
EXPORT_SYMBOL(kmem_cache_size);
-const char *kmem_cache_name(struct kmem_cache *c)
-{
- return c->name;
-}
-EXPORT_SYMBOL(kmem_cache_name);
-
int kmem_cache_shrink(struct kmem_cache *d)
{
return 0;
}
EXPORT_SYMBOL(kmem_cache_shrink);
-int kmem_ptr_validate(struct kmem_cache *a, const void *b)
-{
- return 0;
-}
-
static unsigned int slob_ready __read_mostly;
int slab_is_available(void)
diff --git a/mm/slub.c b/mm/slub.c
index bec0e355fbad..9d2e5e46bf09 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -28,6 +28,8 @@
#include <linux/math64.h>
#include <linux/fault-inject.h>
+#include <trace/events/kmem.h>
+
/*
* Lock order:
* 1. slab_lock(page)
@@ -62,7 +64,7 @@
* we must stay away from it for a while since we may cause a bouncing
* cacheline if we try to acquire the lock. So go onto the next slab.
* If all pages are busy then we may allocate a new slab instead of reusing
- * a partial slab. A new slab has noone operating on it and thus there is
+ * a partial slab. A new slab has no one operating on it and thus there is
* no danger of cacheline contention.
*
* Interrupts are disabled during allocation and deallocation in order to
@@ -215,7 +217,7 @@ static inline void sysfs_slab_remove(struct kmem_cache *s)
#endif
-static inline void stat(struct kmem_cache *s, enum stat_item si)
+static inline void stat(const struct kmem_cache *s, enum stat_item si)
{
#ifdef CONFIG_SLUB_STATS
__this_cpu_inc(s->cpu_slab->stat[si]);
@@ -279,11 +281,40 @@ static inline int slab_index(void *p, struct kmem_cache *s, void *addr)
return (p - addr) / s->size;
}
+static inline size_t slab_ksize(const struct kmem_cache *s)
+{
+#ifdef CONFIG_SLUB_DEBUG
+ /*
+ * Debugging requires use of the padding between object
+ * and whatever may come after it.
+ */
+ if (s->flags & (SLAB_RED_ZONE | SLAB_POISON))
+ return s->objsize;
+
+#endif
+ /*
+ * If we have the need to store the freelist pointer
+ * back there or track user information then we can
+ * only use the space before that information.
+ */
+ if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER))
+ return s->inuse;
+ /*
+ * Else we can use all the padding etc for the allocation
+ */
+ return s->size;
+}
+
+static inline int order_objects(int order, unsigned long size, int reserved)
+{
+ return ((PAGE_SIZE << order) - reserved) / size;
+}
+
static inline struct kmem_cache_order_objects oo_make(int order,
- unsigned long size)
+ unsigned long size, int reserved)
{
struct kmem_cache_order_objects x = {
- (order << OO_SHIFT) + (PAGE_SIZE << order) / size
+ (order << OO_SHIFT) + order_objects(order, size, reserved)
};
return x;
@@ -615,7 +646,7 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page)
return 1;
start = page_address(page);
- length = (PAGE_SIZE << compound_order(page));
+ length = (PAGE_SIZE << compound_order(page)) - s->reserved;
end = start + length;
remainder = length % s->size;
if (!remainder)
@@ -696,7 +727,7 @@ static int check_slab(struct kmem_cache *s, struct page *page)
return 0;
}
- maxobj = (PAGE_SIZE << compound_order(page)) / s->size;
+ maxobj = order_objects(compound_order(page), s->size, s->reserved);
if (page->objects > maxobj) {
slab_err(s, page, "objects %u > max %u",
s->name, page->objects, maxobj);
@@ -746,7 +777,7 @@ static int on_freelist(struct kmem_cache *s, struct page *page, void *search)
nr++;
}
- max_objects = (PAGE_SIZE << compound_order(page)) / s->size;
+ max_objects = order_objects(compound_order(page), s->size, s->reserved);
if (max_objects > MAX_OBJS_PER_PAGE)
max_objects = MAX_OBJS_PER_PAGE;
@@ -798,21 +829,31 @@ static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags)
static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, void *object)
{
flags &= gfp_allowed_mask;
- kmemcheck_slab_alloc(s, flags, object, s->objsize);
+ kmemcheck_slab_alloc(s, flags, object, slab_ksize(s));
kmemleak_alloc_recursive(object, s->objsize, 1, s->flags, flags);
}
static inline void slab_free_hook(struct kmem_cache *s, void *x)
{
kmemleak_free_recursive(x, s->flags);
-}
-static inline void slab_free_hook_irq(struct kmem_cache *s, void *object)
-{
- kmemcheck_slab_free(s, object, s->objsize);
- debug_check_no_locks_freed(object, s->objsize);
+ /*
+ * Trouble is that we may no longer disable interrupts in the fast path,
+ * so in order to make the debug calls that expect irqs to be
+ * disabled we need to disable interrupts temporarily.
+ */
+#if defined(CONFIG_KMEMCHECK) || defined(CONFIG_LOCKDEP)
+ {
+ unsigned long flags;
+
+ local_irq_save(flags);
+ kmemcheck_slab_free(s, x, s->objsize);
+ debug_check_no_locks_freed(x, s->objsize);
+ local_irq_restore(flags);
+ }
+#endif
if (!(s->flags & SLAB_DEBUG_OBJECTS))
- debug_check_no_obj_freed(object, s->objsize);
+ debug_check_no_obj_freed(x, s->objsize);
}
/*
@@ -1099,9 +1140,6 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags,
static inline void slab_free_hook(struct kmem_cache *s, void *x) {}
-static inline void slab_free_hook_irq(struct kmem_cache *s,
- void *object) {}
-
#endif /* CONFIG_SLUB_DEBUG */
/*
@@ -1247,21 +1285,38 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
__free_pages(page, order);
}
+#define need_reserve_slab_rcu \
+ (sizeof(((struct page *)NULL)->lru) < sizeof(struct rcu_head))
+
static void rcu_free_slab(struct rcu_head *h)
{
struct page *page;
- page = container_of((struct list_head *)h, struct page, lru);
+ if (need_reserve_slab_rcu)
+ page = virt_to_head_page(h);
+ else
+ page = container_of((struct list_head *)h, struct page, lru);
+
__free_slab(page->slab, page);
}
static void free_slab(struct kmem_cache *s, struct page *page)
{
if (unlikely(s->flags & SLAB_DESTROY_BY_RCU)) {
- /*
- * RCU free overloads the RCU head over the LRU
- */
- struct rcu_head *head = (void *)&page->lru;
+ struct rcu_head *head;
+
+ if (need_reserve_slab_rcu) {
+ int order = compound_order(page);
+ int offset = (PAGE_SIZE << order) - s->reserved;
+
+ VM_BUG_ON(s->reserved != sizeof(*head));
+ head = page_address(page) + offset;
+ } else {
+ /*
+ * RCU free overloads the RCU head over the LRU
+ */
+ head = (void *)&page->lru;
+ }
call_rcu(head, rcu_free_slab);
} else
@@ -1485,6 +1540,78 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail)
}
}
+#ifdef CONFIG_CMPXCHG_LOCAL
+#ifdef CONFIG_PREEMPT
+/*
+ * Calculate the next globally unique transaction for disambiguation
+ * during cmpxchg. The transactions start with the cpu number and are then
+ * incremented by CONFIG_NR_CPUS.
+ */
+#define TID_STEP roundup_pow_of_two(CONFIG_NR_CPUS)
+#else
+/*
+ * No preemption supported therefore also no need to check for
+ * different cpus.
+ */
+#define TID_STEP 1
+#endif
+
+static inline unsigned long next_tid(unsigned long tid)
+{
+ return tid + TID_STEP;
+}
+
+static inline unsigned int tid_to_cpu(unsigned long tid)
+{
+ return tid % TID_STEP;
+}
+
+static inline unsigned long tid_to_event(unsigned long tid)
+{
+ return tid / TID_STEP;
+}
+
+static inline unsigned int init_tid(int cpu)
+{
+ return cpu;
+}
+
+static inline void note_cmpxchg_failure(const char *n,
+ const struct kmem_cache *s, unsigned long tid)
+{
+#ifdef SLUB_DEBUG_CMPXCHG
+ unsigned long actual_tid = __this_cpu_read(s->cpu_slab->tid);
+
+ printk(KERN_INFO "%s %s: cmpxchg redo ", n, s->name);
+
+#ifdef CONFIG_PREEMPT
+ if (tid_to_cpu(tid) != tid_to_cpu(actual_tid))
+ printk("due to cpu change %d -> %d\n",
+ tid_to_cpu(tid), tid_to_cpu(actual_tid));
+ else
+#endif
+ if (tid_to_event(tid) != tid_to_event(actual_tid))
+ printk("due to cpu running other code. Event %ld->%ld\n",
+ tid_to_event(tid), tid_to_event(actual_tid));
+ else
+ printk("for unknown reason: actual=%lx was=%lx target=%lx\n",
+ actual_tid, tid, next_tid(tid));
+#endif
+ stat(s, CMPXCHG_DOUBLE_CPU_FAIL);
+}
+
+#endif
+
+void init_kmem_cache_cpus(struct kmem_cache *s)
+{
+#ifdef CONFIG_CMPXCHG_LOCAL
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ per_cpu_ptr(s->cpu_slab, cpu)->tid = init_tid(cpu);
+#endif
+
+}
/*
* Remove the cpu slab
*/
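A small worked example of the transaction ids introduced above; the numbers are illustrative only and assume CONFIG_PREEMPT with CONFIG_NR_CPUS=4, so TID_STEP is 4:

	static void tid_example(void)
	{
		unsigned long tid = init_tid(2);	/* cpu 2 starts at tid 2 */

		tid = next_tid(tid);			/* 6: one operation done */
		BUG_ON(tid_to_cpu(tid) != 2);		/* low bits keep the cpu */
		BUG_ON(tid_to_event(tid) != 1);		/* high bits count events */
	}

A tid sampled on cpu 2 can never compare equal to one stored by cpu 3, and a tid sampled before an operation never matches the value stored after it, which is exactly the disambiguation the cmpxchg fastpath relies on.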
@@ -1516,6 +1643,9 @@ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
page->inuse--;
}
c->page = NULL;
+#ifdef CONFIG_CMPXCHG_LOCAL
+ c->tid = next_tid(c->tid);
+#endif
unfreeze_slab(s, page, tail);
}
@@ -1650,6 +1780,19 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
{
void **object;
struct page *new;
+#ifdef CONFIG_CMPXCHG_LOCAL
+ unsigned long flags;
+
+ local_irq_save(flags);
+#ifdef CONFIG_PREEMPT
+ /*
+ * We may have been preempted and rescheduled on a different
+ * cpu before disabling interrupts. Need to reload cpu area
+ * pointer.
+ */
+ c = this_cpu_ptr(s->cpu_slab);
+#endif
+#endif
/* We handle __GFP_ZERO in the caller */
gfpflags &= ~__GFP_ZERO;
@@ -1676,6 +1819,10 @@ load_freelist:
c->node = page_to_nid(c->page);
unlock_out:
slab_unlock(c->page);
+#ifdef CONFIG_CMPXCHG_LOCAL
+ c->tid = next_tid(c->tid);
+ local_irq_restore(flags);
+#endif
stat(s, ALLOC_SLOWPATH);
return object;
@@ -1711,6 +1858,9 @@ new_slab:
}
if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit())
slab_out_of_memory(s, gfpflags, node);
+#ifdef CONFIG_CMPXCHG_LOCAL
+ local_irq_restore(flags);
+#endif
return NULL;
debug:
if (!alloc_debug_processing(s, c->page, object, addr))
@@ -1737,23 +1887,76 @@ static __always_inline void *slab_alloc(struct kmem_cache *s,
{
void **object;
struct kmem_cache_cpu *c;
+#ifdef CONFIG_CMPXCHG_LOCAL
+ unsigned long tid;
+#else
unsigned long flags;
+#endif
if (slab_pre_alloc_hook(s, gfpflags))
return NULL;
+#ifndef CONFIG_CMPXCHG_LOCAL
local_irq_save(flags);
+#else
+redo:
+#endif
+
+ /*
+ * Must read kmem_cache cpu data via this cpu ptr. Preemption is
+ * enabled. We may switch back and forth between cpus while
+ * reading from one cpu area. That does not matter as long
+ * as we end up on the original cpu again when doing the cmpxchg.
+ */
c = __this_cpu_ptr(s->cpu_slab);
+
+#ifdef CONFIG_CMPXCHG_LOCAL
+ /*
+ * The transaction ids are globally unique per cpu and per operation on
+ * a per cpu queue. Thus they can guarantee that the cmpxchg_double
+ * occurs on the right processor and that there was no operation on the
+ * linked list in between.
+ */
+ tid = c->tid;
+ barrier();
+#endif
+
object = c->freelist;
if (unlikely(!object || !node_match(c, node)))
object = __slab_alloc(s, gfpflags, node, addr, c);
else {
+#ifdef CONFIG_CMPXCHG_LOCAL
+ /*
+ * The cmpxchg will only match if there was no additional
+ * operation and if we are on the right processor.
+ *
+ * The cmpxchg does the following atomically (without lock semantics!)
+ * 1. Relocate first pointer to the current per cpu area.
+ * 2. Verify that tid and freelist have not been changed
+ * 3. If they were not changed replace tid and freelist
+ *
+ * Since this is without lock semantics the protection is only against
+ * code executing on this cpu *not* from access by other cpus.
+ */
+ if (unlikely(!irqsafe_cpu_cmpxchg_double(
+ s->cpu_slab->freelist, s->cpu_slab->tid,
+ object, tid,
+ get_freepointer(s, object), next_tid(tid)))) {
+
+ note_cmpxchg_failure("slab_alloc", s, tid);
+ goto redo;
+ }
+#else
c->freelist = get_freepointer(s, object);
+#endif
stat(s, ALLOC_FASTPATH);
}
+
+#ifndef CONFIG_CMPXCHG_LOCAL
local_irq_restore(flags);
+#endif
if (unlikely(gfpflags & __GFP_ZERO) && object)
memset(object, 0, s->objsize);
@@ -1774,11 +1977,21 @@ void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
EXPORT_SYMBOL(kmem_cache_alloc);
#ifdef CONFIG_TRACING
-void *kmem_cache_alloc_notrace(struct kmem_cache *s, gfp_t gfpflags)
+void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size)
{
- return slab_alloc(s, gfpflags, NUMA_NO_NODE, _RET_IP_);
+ void *ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, _RET_IP_);
+ trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags);
+ return ret;
}
-EXPORT_SYMBOL(kmem_cache_alloc_notrace);
+EXPORT_SYMBOL(kmem_cache_alloc_trace);
+
+void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order)
+{
+ void *ret = kmalloc_order(size, flags, order);
+ trace_kmalloc(_RET_IP_, ret, size, PAGE_SIZE << order, flags);
+ return ret;
+}
+EXPORT_SYMBOL(kmalloc_order_trace);
#endif
#ifdef CONFIG_NUMA
@@ -1794,13 +2007,17 @@ void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node)
EXPORT_SYMBOL(kmem_cache_alloc_node);
#ifdef CONFIG_TRACING
-void *kmem_cache_alloc_node_notrace(struct kmem_cache *s,
+void *kmem_cache_alloc_node_trace(struct kmem_cache *s,
gfp_t gfpflags,
- int node)
+ int node, size_t size)
{
- return slab_alloc(s, gfpflags, node, _RET_IP_);
+ void *ret = slab_alloc(s, gfpflags, node, _RET_IP_);
+
+ trace_kmalloc_node(_RET_IP_, ret,
+ size, s->size, gfpflags, node);
+ return ret;
}
-EXPORT_SYMBOL(kmem_cache_alloc_node_notrace);
+EXPORT_SYMBOL(kmem_cache_alloc_node_trace);
#endif
#endif
@@ -1817,9 +2034,13 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
{
void *prior;
void **object = (void *)x;
+#ifdef CONFIG_CMPXCHG_LOCAL
+ unsigned long flags;
- stat(s, FREE_SLOWPATH);
+ local_irq_save(flags);
+#endif
slab_lock(page);
+ stat(s, FREE_SLOWPATH);
if (kmem_cache_debug(s))
goto debug;
@@ -1849,6 +2070,9 @@ checks_ok:
out_unlock:
slab_unlock(page);
+#ifdef CONFIG_CMPXCHG_LOCAL
+ local_irq_restore(flags);
+#endif
return;
slab_empty:
@@ -1860,6 +2084,9 @@ slab_empty:
stat(s, FREE_REMOVE_PARTIAL);
}
slab_unlock(page);
+#ifdef CONFIG_CMPXCHG_LOCAL
+ local_irq_restore(flags);
+#endif
stat(s, FREE_SLAB);
discard_slab(s, page);
return;
@@ -1886,23 +2113,56 @@ static __always_inline void slab_free(struct kmem_cache *s,
{
void **object = (void *)x;
struct kmem_cache_cpu *c;
+#ifdef CONFIG_CMPXCHG_LOCAL
+ unsigned long tid;
+#else
unsigned long flags;
+#endif
slab_free_hook(s, x);
+#ifndef CONFIG_CMPXCHG_LOCAL
local_irq_save(flags);
+
+#else
+redo:
+#endif
+
+ /*
+ * Determine the current cpu's per cpu slab.
+ * The cpu may change afterward. However that does not matter since
+ * data is retrieved via this pointer. If we are on the same cpu
+ * during the cmpxchg then the free will succeed.
+ */
c = __this_cpu_ptr(s->cpu_slab);
- slab_free_hook_irq(s, x);
+#ifdef CONFIG_CMPXCHG_LOCAL
+ tid = c->tid;
+ barrier();
+#endif
if (likely(page == c->page && c->node != NUMA_NO_NODE)) {
set_freepointer(s, object, c->freelist);
+
+#ifdef CONFIG_CMPXCHG_LOCAL
+ if (unlikely(!irqsafe_cpu_cmpxchg_double(
+ s->cpu_slab->freelist, s->cpu_slab->tid,
+ c->freelist, tid,
+ object, next_tid(tid)))) {
+
+ note_cmpxchg_failure("slab_free", s, tid);
+ goto redo;
+ }
+#else
c->freelist = object;
+#endif
stat(s, FREE_FASTPATH);
} else
__slab_free(s, page, x, addr);
+#ifndef CONFIG_CMPXCHG_LOCAL
local_irq_restore(flags);
+#endif
}
void kmem_cache_free(struct kmem_cache *s, void *x)
@@ -1917,17 +2177,6 @@ void kmem_cache_free(struct kmem_cache *s, void *x)
}
EXPORT_SYMBOL(kmem_cache_free);
-/* Figure out on which slab page the object resides */
-static struct page *get_object_page(const void *x)
-{
- struct page *page = virt_to_head_page(x);
-
- if (!PageSlab(page))
- return NULL;
-
- return page;
-}
-
/*
* Object placement in a slab is made very easy because we always start at
* offset 0. If we tune the size of the object to the alignment then we can
@@ -1983,13 +2232,13 @@ static int slub_nomerge;
* the smallest order which will fit the object.
*/
static inline int slab_order(int size, int min_objects,
- int max_order, int fract_leftover)
+ int max_order, int fract_leftover, int reserved)
{
int order;
int rem;
int min_order = slub_min_order;
- if ((PAGE_SIZE << min_order) / size > MAX_OBJS_PER_PAGE)
+ if (order_objects(min_order, size, reserved) > MAX_OBJS_PER_PAGE)
return get_order(size * MAX_OBJS_PER_PAGE) - 1;
for (order = max(min_order,
@@ -1998,10 +2247,10 @@ static inline int slab_order(int size, int min_objects,
unsigned long slab_size = PAGE_SIZE << order;
- if (slab_size < min_objects * size)
+ if (slab_size < min_objects * size + reserved)
continue;
- rem = slab_size % size;
+ rem = (slab_size - reserved) % size;
if (rem <= slab_size / fract_leftover)
break;
@@ -2011,7 +2260,7 @@ static inline int slab_order(int size, int min_objects,
return order;
}
-static inline int calculate_order(int size)
+static inline int calculate_order(int size, int reserved)
{
int order;
int min_objects;
@@ -2029,14 +2278,14 @@ static inline int calculate_order(int size)
min_objects = slub_min_objects;
if (!min_objects)
min_objects = 4 * (fls(nr_cpu_ids) + 1);
- max_objects = (PAGE_SIZE << slub_max_order)/size;
+ max_objects = order_objects(slub_max_order, size, reserved);
min_objects = min(min_objects, max_objects);
while (min_objects > 1) {
fraction = 16;
while (fraction >= 4) {
order = slab_order(size, min_objects,
- slub_max_order, fraction);
+ slub_max_order, fraction, reserved);
if (order <= slub_max_order)
return order;
fraction /= 2;
@@ -2048,14 +2297,14 @@ static inline int calculate_order(int size)
* We were unable to place multiple objects in a slab. Now
* lets see if we can place a single object there.
*/
- order = slab_order(size, 1, slub_max_order, 1);
+ order = slab_order(size, 1, slub_max_order, 1, reserved);
if (order <= slub_max_order)
return order;
/*
* Doh this slab cannot be placed using slub_max_order.
*/
- order = slab_order(size, 1, MAX_ORDER, 1);
+ order = slab_order(size, 1, MAX_ORDER, 1, reserved);
if (order < MAX_ORDER)
return order;
return -ENOSYS;
@@ -2105,9 +2354,23 @@ static inline int alloc_kmem_cache_cpus(struct kmem_cache *s)
BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE <
SLUB_PAGE_SHIFT * sizeof(struct kmem_cache_cpu));
+#ifdef CONFIG_CMPXCHG_LOCAL
+ /*
+ * Must align to double word boundary for the double cmpxchg instructions
+ * to work.
+ */
+ s->cpu_slab = __alloc_percpu(sizeof(struct kmem_cache_cpu), 2 * sizeof(void *));
+#else
+ /* Regular alignment is sufficient */
s->cpu_slab = alloc_percpu(struct kmem_cache_cpu);
+#endif
+
+ if (!s->cpu_slab)
+ return 0;
+
+ init_kmem_cache_cpus(s);
- return s->cpu_slab != NULL;
+ return 1;
}
static struct kmem_cache *kmem_cache_node;
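The stricter 2 * sizeof(void *) alignment requested above exists because the fastpath updates the freelist pointer and the transaction id as one unit with a double-word cmpxchg, and cmpxchg16b on x86-64 requires a 16-byte aligned operand. The two fields therefore have to sit next to each other at the start of the per-cpu structure; the companion slub_def.h change is not shown in this diff, so the layout below is a reconstruction, not quoted source:

	struct kmem_cache_cpu {
		void **freelist;	/* first word: next free object */
	#ifdef CONFIG_CMPXCHG_LOCAL
		unsigned long tid;	/* second word: transaction id */
	#endif
		struct page *page;	/* slab page we are allocating from */
		int node;		/* node of the current slab */
		/* ... */
	};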
@@ -2306,7 +2569,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
if (forced_order >= 0)
order = forced_order;
else
- order = calculate_order(size);
+ order = calculate_order(size, s->reserved);
if (order < 0)
return 0;
@@ -2324,8 +2587,8 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
/*
* Determine the number of objects per slab
*/
- s->oo = oo_make(order, size);
- s->min = oo_make(get_order(size), size);
+ s->oo = oo_make(order, size, s->reserved);
+ s->min = oo_make(get_order(size), size, s->reserved);
if (oo_objects(s->oo) > oo_objects(s->max))
s->max = s->oo;
@@ -2344,6 +2607,10 @@ static int kmem_cache_open(struct kmem_cache *s,
s->objsize = size;
s->align = align;
s->flags = kmem_cache_flags(size, flags, name, ctor);
+ s->reserved = 0;
+
+ if (need_reserve_slab_rcu && (s->flags & SLAB_DESTROY_BY_RCU))
+ s->reserved = sizeof(struct rcu_head);
if (!calculate_sizes(s, -1))
goto error;
@@ -2386,35 +2653,6 @@ error:
}
/*
- * Check if a given pointer is valid
- */
-int kmem_ptr_validate(struct kmem_cache *s, const void *object)
-{
- struct page *page;
-
- if (!kern_ptr_validate(object, s->size))
- return 0;
-
- page = get_object_page(object);
-
- if (!page || s != page->slab)
- /* No slab or wrong slab */
- return 0;
-
- if (!check_valid_pointer(s, page, object))
- return 0;
-
- /*
- * We could also check if the object is on the slabs freelist.
- * But this would be too expensive and it seems that the main
- * purpose of kmem_ptr_valid() is to check if the object belongs
- * to a certain slab.
- */
- return 1;
-}
-EXPORT_SYMBOL(kmem_ptr_validate);
-
-/*
* Determine the size of a slab object
*/
unsigned int kmem_cache_size(struct kmem_cache *s)
@@ -2423,12 +2661,6 @@ unsigned int kmem_cache_size(struct kmem_cache *s)
}
EXPORT_SYMBOL(kmem_cache_size);
-const char *kmem_cache_name(struct kmem_cache *s)
-{
- return s->name;
-}
-EXPORT_SYMBOL(kmem_cache_name);
-
static void list_slab_objects(struct kmem_cache *s, struct page *page,
const char *text)
{
@@ -2720,7 +2952,6 @@ EXPORT_SYMBOL(__kmalloc_node);
size_t ksize(const void *object)
{
struct page *page;
- struct kmem_cache *s;
if (unlikely(object == ZERO_SIZE_PTR))
return 0;
@@ -2731,28 +2962,8 @@ size_t ksize(const void *object)
WARN_ON(!PageCompound(page));
return PAGE_SIZE << compound_order(page);
}
- s = page->slab;
-
-#ifdef CONFIG_SLUB_DEBUG
- /*
- * Debugging requires use of the padding between object
- * and whatever may come after it.
- */
- if (s->flags & (SLAB_RED_ZONE | SLAB_POISON))
- return s->objsize;
-#endif
- /*
- * If we have the need to store the freelist pointer
- * back there or track user information then we can
- * only use the space before that information.
- */
- if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER))
- return s->inuse;
- /*
- * Else we can use all the padding etc for the allocation
- */
- return s->size;
+ return slab_ksize(page->slab);
}
EXPORT_SYMBOL(ksize);
@@ -3336,7 +3547,7 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller)
ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, caller);
- /* Honor the call site pointer we recieved. */
+ /* Honor the call site pointer we received. */
trace_kmalloc(caller, ret, size, s->size, gfpflags);
return ret;
@@ -3366,7 +3577,7 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
ret = slab_alloc(s, gfpflags, node, caller);
- /* Honor the call site pointer we recieved. */
+ /* Honor the call site pointer we received. */
trace_kmalloc_node(caller, ret, size, s->size, gfpflags, node);
return ret;
@@ -3660,7 +3871,7 @@ static int list_locations(struct kmem_cache *s, char *buf,
len += sprintf(buf + len, "%7ld ", l->count);
if (l->addr)
- len += sprint_symbol(buf + len, (unsigned long)l->addr);
+ len += sprintf(buf + len, "%pS", (void *)l->addr);
else
len += sprintf(buf + len, "<not-available>");
@@ -3821,7 +4032,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
}
}
- down_read(&slub_lock);
+ lock_memory_hotplug();
#ifdef CONFIG_SLUB_DEBUG
if (flags & SO_ALL) {
for_each_node_state(node, N_NORMAL_MEMORY) {
@@ -3862,7 +4073,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
x += sprintf(buf + x, " N%d=%lu",
node, nodes[node]);
#endif
- up_read(&slub_lock);
+ unlock_memory_hotplug();
kfree(nodes);
return x + sprintf(buf + x, "\n");
}
@@ -3970,12 +4181,9 @@ SLAB_ATTR(min_partial);
static ssize_t ctor_show(struct kmem_cache *s, char *buf)
{
- if (s->ctor) {
- int n = sprint_symbol(buf, (unsigned long)s->ctor);
-
- return n + sprintf(buf + n, "\n");
- }
- return 0;
+ if (!s->ctor)
+ return 0;
+ return sprintf(buf, "%pS\n", s->ctor);
}
SLAB_ATTR_RO(ctor);
@@ -4044,6 +4252,12 @@ static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf)
}
SLAB_ATTR_RO(destroy_by_rcu);
+static ssize_t reserved_show(struct kmem_cache *s, char *buf)
+{
+ return sprintf(buf, "%d\n", s->reserved);
+}
+SLAB_ATTR_RO(reserved);
+
#ifdef CONFIG_SLUB_DEBUG
static ssize_t slabs_show(struct kmem_cache *s, char *buf)
{
@@ -4330,6 +4544,7 @@ static struct attribute *slab_attrs[] = {
&reclaim_account_attr.attr,
&destroy_by_rcu_attr.attr,
&shrink_attr.attr,
+ &reserved_attr.attr,
#ifdef CONFIG_SLUB_DEBUG
&total_objects_attr.attr,
&slabs_attr.attr,
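The new s->reserved accounting threaded through slab_order(), order_objects() and oo_make() above keeps a few bytes at the tail of every slab page free, so that SLAB_DESTROY_BY_RCU caches have somewhere to put the rcu_head that no longer fits in page->lru. A worked example, illustrative only (it assumes 4K pages and a two-pointer, 16-byte struct rcu_head on 64-bit):

	static int reserved_example(void)
	{
		/* plain cache: 4096 / 64 = 64 objects per order-0 slab */
		int plain = order_objects(0, 64, 0);

		/*
		 * RCU cache: (4096 - 16) / 64 = 63 objects; the final 16
		 * bytes of the page are where rcu_free_slab() finds the
		 * rcu_head via virt_to_head_page().
		 */
		int rcu = order_objects(0, 64, sizeof(struct rcu_head));

		return plain - rcu;	/* 1 */
	}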
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index 29d6cbffb283..64b984091edb 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -9,7 +9,7 @@
*
* However, virtual mappings need a page table and TLBs. Many Linux
* architectures already map their physical space using 1-1 mappings
- * via TLBs. For those arches the virtual memmory map is essentially
+ * via TLBs. For those arches the virtual memory map is essentially
* for free if we use the same page size as the 1-1 mappings. In that
* case the overhead consists of a few additional pages that are
* allocated to create a view of memory for vmemmap.
diff --git a/mm/sparse.c b/mm/sparse.c
index 95ac219af379..aa64b12831a2 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -500,7 +500,7 @@ void __init sparse_init(void)
* so alloc 2M (with 2M align) and 24 bytes in turn will
* make next 2M slip to one more 2M later.
* then in big system, the memory will have a lot of holes...
- * here try to allocate 2M pages continously.
+ * here try to allocate 2M pages continuously.
*
* powerpc need to call sparse_init_one_section right after each
* sparse_early_mem_map_alloc, so allocate usemap_map at first.
@@ -671,10 +671,10 @@ static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages)
static void free_map_bootmem(struct page *page, unsigned long nr_pages)
{
unsigned long maps_section_nr, removing_section_nr, i;
- int magic;
+ unsigned long magic;
for (i = 0; i < nr_pages; i++, page++) {
- magic = atomic_read(&page->_mapcount);
+ magic = (unsigned long) page->lru.next;
BUG_ON(magic == NODE_INFO);
diff --git a/mm/swap.c b/mm/swap.c
index 3f4854205b16..a448db377cb0 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -39,6 +39,7 @@ int page_cluster;
static DEFINE_PER_CPU(struct pagevec[NR_LRU_LISTS], lru_add_pvecs);
static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs);
+static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs);
/*
* This path almost never happens for VM activity - pages are normally
@@ -56,17 +57,97 @@ static void __page_cache_release(struct page *page)
del_page_from_lru(zone, page);
spin_unlock_irqrestore(&zone->lru_lock, flags);
}
+}
+
+static void __put_single_page(struct page *page)
+{
+ __page_cache_release(page);
free_hot_cold_page(page, 0);
}
-static void put_compound_page(struct page *page)
+static void __put_compound_page(struct page *page)
{
- page = compound_head(page);
- if (put_page_testzero(page)) {
- compound_page_dtor *dtor;
+ compound_page_dtor *dtor;
- dtor = get_compound_page_dtor(page);
- (*dtor)(page);
+ __page_cache_release(page);
+ dtor = get_compound_page_dtor(page);
+ (*dtor)(page);
+}
+
+static void put_compound_page(struct page *page)
+{
+ if (unlikely(PageTail(page))) {
+ /* __split_huge_page_refcount can run under us */
+ struct page *page_head = page->first_page;
+ smp_rmb();
+ /*
+ * If PageTail is still set after smp_rmb() we can be sure
+ * that the page->first_page we read wasn't a dangling pointer.
+ * See __split_huge_page_refcount() smp_wmb().
+ */
+ if (likely(PageTail(page) && get_page_unless_zero(page_head))) {
+ unsigned long flags;
+ /*
+ * Verify that our page_head wasn't converted
+ * to a a regular page before we got a
+ * reference on it.
+ */
+ if (unlikely(!PageHead(page_head))) {
+ /* PageHead is cleared after PageTail */
+ smp_rmb();
+ VM_BUG_ON(PageTail(page));
+ goto out_put_head;
+ }
+ /*
+ * Only run compound_lock on a valid PageHead,
+ * after having it pinned with
+ * get_page_unless_zero() above.
+ */
+ smp_mb();
+ /* page_head wasn't a dangling pointer */
+ flags = compound_lock_irqsave(page_head);
+ if (unlikely(!PageTail(page))) {
+ /* __split_huge_page_refcount run before us */
+ compound_unlock_irqrestore(page_head, flags);
+ VM_BUG_ON(PageHead(page_head));
+ out_put_head:
+ if (put_page_testzero(page_head))
+ __put_single_page(page_head);
+ out_put_single:
+ if (put_page_testzero(page))
+ __put_single_page(page);
+ return;
+ }
+ VM_BUG_ON(page_head != page->first_page);
+ /*
+ * We can release the refcount taken by
+ * get_page_unless_zero now that
+ * split_huge_page_refcount is blocked on the
+ * compound_lock.
+ */
+ if (put_page_testzero(page_head))
+ VM_BUG_ON(1);
+ /* __split_huge_page_refcount will wait now */
+ VM_BUG_ON(atomic_read(&page->_count) <= 0);
+ atomic_dec(&page->_count);
+ VM_BUG_ON(atomic_read(&page_head->_count) <= 0);
+ compound_unlock_irqrestore(page_head, flags);
+ if (put_page_testzero(page_head)) {
+ if (PageHead(page_head))
+ __put_compound_page(page_head);
+ else
+ __put_single_page(page_head);
+ }
+ } else {
+ /* page_head is a dangling pointer */
+ VM_BUG_ON(PageTail(page));
+ goto out_put_single;
+ }
+ } else if (put_page_testzero(page)) {
+ if (PageHead(page))
+ __put_compound_page(page);
+ else
+ __put_single_page(page);
}
}
@@ -75,7 +156,7 @@ void put_page(struct page *page)
if (unlikely(PageCompound(page)))
put_compound_page(page);
else if (put_page_testzero(page))
- __page_cache_release(page);
+ __put_single_page(page);
}
EXPORT_SYMBOL(put_page);
@@ -98,15 +179,13 @@ void put_pages_list(struct list_head *pages)
}
EXPORT_SYMBOL(put_pages_list);
-/*
- * pagevec_move_tail() must be called with IRQ disabled.
- * Otherwise this may cause nasty races.
- */
-static void pagevec_move_tail(struct pagevec *pvec)
+static void pagevec_lru_move_fn(struct pagevec *pvec,
+ void (*move_fn)(struct page *page, void *arg),
+ void *arg)
{
int i;
- int pgmoved = 0;
struct zone *zone = NULL;
+ unsigned long flags = 0;
for (i = 0; i < pagevec_count(pvec); i++) {
struct page *page = pvec->pages[i];
@@ -114,29 +193,50 @@ static void pagevec_move_tail(struct pagevec *pvec)
if (pagezone != zone) {
if (zone)
- spin_unlock(&zone->lru_lock);
+ spin_unlock_irqrestore(&zone->lru_lock, flags);
zone = pagezone;
- spin_lock(&zone->lru_lock);
- }
- if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
- int lru = page_lru_base_type(page);
- list_move_tail(&page->lru, &zone->lru[lru].list);
- pgmoved++;
+ spin_lock_irqsave(&zone->lru_lock, flags);
}
+
+ (*move_fn)(page, arg);
}
if (zone)
- spin_unlock(&zone->lru_lock);
- __count_vm_events(PGROTATED, pgmoved);
+ spin_unlock_irqrestore(&zone->lru_lock, flags);
release_pages(pvec->pages, pvec->nr, pvec->cold);
pagevec_reinit(pvec);
}
+static void pagevec_move_tail_fn(struct page *page, void *arg)
+{
+ int *pgmoved = arg;
+ struct zone *zone = page_zone(page);
+
+ if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
+ enum lru_list lru = page_lru_base_type(page);
+ list_move_tail(&page->lru, &zone->lru[lru].list);
+ mem_cgroup_rotate_reclaimable_page(page);
+ (*pgmoved)++;
+ }
+}
+
+/*
+ * pagevec_move_tail() must be called with IRQ disabled.
+ * Otherwise this may cause nasty races.
+ */
+static void pagevec_move_tail(struct pagevec *pvec)
+{
+ int pgmoved = 0;
+
+ pagevec_lru_move_fn(pvec, pagevec_move_tail_fn, &pgmoved);
+ __count_vm_events(PGROTATED, pgmoved);
+}
+
/*
* Writeback is about to end against a page which has been marked for immediate
* reclaim. If it still appears to be reclaimable, move it to the tail of the
* inactive list.
*/
-void rotate_reclaimable_page(struct page *page)
+void rotate_reclaimable_page(struct page *page)
{
if (!PageLocked(page) && !PageDirty(page) && !PageActive(page) &&
!PageUnevictable(page) && PageLRU(page)) {
@@ -267,6 +367,71 @@ void add_page_to_unevictable_list(struct page *page)
}
/*
+ * If the page can not be invalidated, it is moved to the
+ * inactive list to speed up its reclaim. It is moved to the
+ * head of the list, rather than the tail, to give the flusher
+ * threads some time to write it out, as this is much more
+ * effective than the single-page writeout from reclaim.
+ *
+ * If the page isn't page_mapped but is dirty or under writeback, it
+ * can be reclaimed asap by setting PG_reclaim.
+ *
+ * 1. active, mapped page -> none
+ * 2. active, dirty/writeback page -> inactive, head, PG_reclaim
+ * 3. inactive, mapped page -> none
+ * 4. inactive, dirty/writeback page -> inactive, head, PG_reclaim
+ * 5. inactive, clean -> inactive, tail
+ * 6. Others -> none
+ *
+ * In case 4 the page is moved to the head of the inactive list because
+ * the VM expects the flusher threads to write it out, which is much
+ * more effective than the single-page writeout from reclaim.
+ */
+static void lru_deactivate_fn(struct page *page, void *arg)
+{
+ int lru, file;
+ bool active;
+ struct zone *zone = page_zone(page);
+
+ if (!PageLRU(page))
+ return;
+
+ /* Some processes are using the page */
+ if (page_mapped(page))
+ return;
+
+ active = PageActive(page);
+
+ file = page_is_file_cache(page);
+ lru = page_lru_base_type(page);
+ del_page_from_lru_list(zone, page, lru + active);
+ ClearPageActive(page);
+ ClearPageReferenced(page);
+ add_page_to_lru_list(zone, page, lru);
+
+ if (PageWriteback(page) || PageDirty(page)) {
+ /*
+ * PG_reclaim can race with end_page_writeback.
+ * That can confuse readahead, but the race window
+ * is _really_ small and it's a non-critical problem.
+ */
+ SetPageReclaim(page);
+ } else {
+ /*
+ * The page's writeback ended while it sat in the pagevec,
+ * so move the page to the tail of the inactive list.
+ */
+ list_move_tail(&page->lru, &zone->lru[lru].list);
+ mem_cgroup_rotate_reclaimable_page(page);
+ __count_vm_event(PGROTATED);
+ }
+
+ if (active)
+ __count_vm_event(PGDEACTIVATE);
+ update_page_reclaim_stat(zone, page, file, 0);
+}
+
+/*
* Drain pages out of the cpu's pagevecs.
* Either "cpu" is the current CPU, and preemption has already been
* disabled; or "cpu" is being hot-unplugged, and is already dead.
@@ -292,6 +457,29 @@ static void drain_cpu_pagevecs(int cpu)
pagevec_move_tail(pvec);
local_irq_restore(flags);
}
+
+ pvec = &per_cpu(lru_deactivate_pvecs, cpu);
+ if (pagevec_count(pvec))
+ pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
+}
+
+/**
+ * deactivate_page - forcefully deactivate a page
+ * @page: page to deactivate
+ *
+ * This function hints the VM that @page is a good reclaim candidate,
+ * for example if its invalidation fails due to the page being dirty
+ * or under writeback.
+ */
+void deactivate_page(struct page *page)
+{
+ if (likely(get_page_unless_zero(page))) {
+ struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs);
+
+ if (!pagevec_add(pvec, page))
+ pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
+ put_cpu_var(lru_deactivate_pvecs);
+ }
}
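deactivate_page() above is only a hint; in this series the caller it is wired up to is invalidate_mapping_pages(), which cannot drop dirty or writeback pages and now ages them faster instead (stated from the surrounding patch series for context, not from this hunk). The usage pattern is simply:

		/* sketch of the invalidation loop, not verbatim */
		ret = invalidate_inode_page(page);
		if (!ret)
			/* dirty or under writeback: cannot drop it now,
			 * so tell the VM to reclaim it sooner */
			deactivate_page(page);
		count += ret;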
void lru_add_drain(void)
@@ -399,44 +587,70 @@ void __pagevec_release(struct pagevec *pvec)
EXPORT_SYMBOL(__pagevec_release);
+/* used by __split_huge_page_refcount() */
+void lru_add_page_tail(struct zone* zone,
+ struct page *page, struct page *page_tail)
+{
+ int active;
+ enum lru_list lru;
+ const int file = 0;
+ struct list_head *head;
+
+ VM_BUG_ON(!PageHead(page));
+ VM_BUG_ON(PageCompound(page_tail));
+ VM_BUG_ON(PageLRU(page_tail));
+ VM_BUG_ON(!spin_is_locked(&zone->lru_lock));
+
+ SetPageLRU(page_tail);
+
+ if (page_evictable(page_tail, NULL)) {
+ if (PageActive(page)) {
+ SetPageActive(page_tail);
+ active = 1;
+ lru = LRU_ACTIVE_ANON;
+ } else {
+ active = 0;
+ lru = LRU_INACTIVE_ANON;
+ }
+ update_page_reclaim_stat(zone, page_tail, file, active);
+ if (likely(PageLRU(page)))
+ head = page->lru.prev;
+ else
+ head = &zone->lru[lru].list;
+ __add_page_to_lru_list(zone, page_tail, lru, head);
+ } else {
+ SetPageUnevictable(page_tail);
+ add_page_to_lru_list(zone, page_tail, LRU_UNEVICTABLE);
+ }
+}
+
+static void ____pagevec_lru_add_fn(struct page *page, void *arg)
+{
+ enum lru_list lru = (enum lru_list)arg;
+ struct zone *zone = page_zone(page);
+ int file = is_file_lru(lru);
+ int active = is_active_lru(lru);
+
+ VM_BUG_ON(PageActive(page));
+ VM_BUG_ON(PageUnevictable(page));
+ VM_BUG_ON(PageLRU(page));
+
+ SetPageLRU(page);
+ if (active)
+ SetPageActive(page);
+ update_page_reclaim_stat(zone, page, file, active);
+ add_page_to_lru_list(zone, page, lru);
+}
+
/*
* Add the passed pages to the LRU, then drop the caller's refcount
* on them. Reinitialises the caller's pagevec.
*/
void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru)
{
- int i;
- struct zone *zone = NULL;
-
VM_BUG_ON(is_unevictable_lru(lru));
- for (i = 0; i < pagevec_count(pvec); i++) {
- struct page *page = pvec->pages[i];
- struct zone *pagezone = page_zone(page);
- int file;
- int active;
-
- if (pagezone != zone) {
- if (zone)
- spin_unlock_irq(&zone->lru_lock);
- zone = pagezone;
- spin_lock_irq(&zone->lru_lock);
- }
- VM_BUG_ON(PageActive(page));
- VM_BUG_ON(PageUnevictable(page));
- VM_BUG_ON(PageLRU(page));
- SetPageLRU(page);
- active = is_active_lru(lru);
- file = is_file_lru(lru);
- if (active)
- SetPageActive(page);
- update_page_reclaim_stat(zone, page, file, active);
- add_page_to_lru_list(zone, page, lru);
- }
- if (zone)
- spin_unlock_irq(&zone->lru_lock);
- release_pages(pvec->pages, pvec->nr, pvec->cold);
- pagevec_reinit(pvec);
+ pagevec_lru_move_fn(pvec, ____pagevec_lru_add_fn, (void *)lru);
}
EXPORT_SYMBOL(____pagevec_lru_add);
diff --git a/mm/swap_state.c b/mm/swap_state.c
index e10f5833167f..46680461785b 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -24,12 +24,10 @@
/*
* swapper_space is a fiction, retained to simplify the path through
- * vmscan's shrink_page_list, to make sync_page look nicer, and to allow
- * future use of radix_tree tags in the swap cache.
+ * vmscan's shrink_page_list.
*/
static const struct address_space_operations swap_aops = {
.writepage = swap_writepage,
- .sync_page = block_sync_page,
.set_page_dirty = __set_page_dirty_nobuffers,
.migratepage = migrate_page,
};
@@ -37,7 +35,6 @@ static const struct address_space_operations swap_aops = {
static struct backing_dev_info swap_backing_dev_info = {
.name = "swap",
.capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED,
- .unplug_io_fn = swap_unplug_io_fn,
};
struct address_space swapper_space = {
@@ -157,6 +154,12 @@ int add_to_swap(struct page *page)
if (!entry.val)
return 0;
+ if (unlikely(PageTransHuge(page)))
+ if (unlikely(split_huge_page(page))) {
+ swapcache_free(entry, NULL);
+ return 0;
+ }
+
/*
* Radix-tree node allocations from PF_MEMALLOC contexts could
* completely exhaust the page allocator. __GFP_NOMEMALLOC
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 67ddaaf98c74..8c6b3ce38f09 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -95,39 +95,6 @@ __try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset)
}
/*
- * We need this because the bdev->unplug_fn can sleep and we cannot
- * hold swap_lock while calling the unplug_fn. And swap_lock
- * cannot be turned into a mutex.
- */
-static DECLARE_RWSEM(swap_unplug_sem);
-
-void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page)
-{
- swp_entry_t entry;
-
- down_read(&swap_unplug_sem);
- entry.val = page_private(page);
- if (PageSwapCache(page)) {
- struct block_device *bdev = swap_info[swp_type(entry)]->bdev;
- struct backing_dev_info *bdi;
-
- /*
- * If the page is removed from swapcache from under us (with a
- * racy try_to_unuse/swapoff) we need an additional reference
- * count to avoid reading garbage from page_private(page) above.
- * If the WARN_ON triggers during a swapoff it maybe the race
- * condition and it's harmless. However if it triggers without
- * swapoff it signals a problem.
- */
- WARN_ON(page_count(page) <= 1);
-
- bdi = bdev->bd_inode->i_mapping->backing_dev_info;
- blk_run_backing_dev(bdi, page);
- }
- up_read(&swap_unplug_sem);
-}
-
-/*
* swapon tell device that all the old swap contents can be discarded,
* to allow the swap device to optimize its wear-levelling.
*/
@@ -212,8 +179,8 @@ static int wait_for_discard(void *word)
#define SWAPFILE_CLUSTER 256
#define LATENCY_LIMIT 256
-static inline unsigned long scan_swap_map(struct swap_info_struct *si,
- unsigned char usage)
+static unsigned long scan_swap_map(struct swap_info_struct *si,
+ unsigned char usage)
{
unsigned long offset;
unsigned long scan_base;
@@ -880,7 +847,7 @@ unsigned int count_swap_pages(int type, int free)
static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, swp_entry_t entry, struct page *page)
{
- struct mem_cgroup *ptr = NULL;
+ struct mem_cgroup *ptr;
spinlock_t *ptl;
pte_t *pte;
int ret = 1;
@@ -964,6 +931,8 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
pmd = pmd_offset(pud, addr);
do {
next = pmd_addr_end(addr, end);
+ if (unlikely(pmd_trans_huge(*pmd)))
+ continue;
if (pmd_none_or_clear_bad(pmd))
continue;
ret = unuse_pte_range(vma, pmd, addr, next, entry, page);
@@ -1548,6 +1517,36 @@ bad_bmap:
goto out;
}
+static void enable_swap_info(struct swap_info_struct *p, int prio,
+ unsigned char *swap_map)
+{
+ int i, prev;
+
+ spin_lock(&swap_lock);
+ if (prio >= 0)
+ p->prio = prio;
+ else
+ p->prio = --least_priority;
+ p->swap_map = swap_map;
+ p->flags |= SWP_WRITEOK;
+ nr_swap_pages += p->pages;
+ total_swap_pages += p->pages;
+
+ /* insert swap space into swap_list: */
+ prev = -1;
+ for (i = swap_list.head; i >= 0; i = swap_info[i]->next) {
+ if (p->prio >= swap_info[i]->prio)
+ break;
+ prev = i;
+ }
+ p->next = i;
+ if (prev < 0)
+ swap_list.head = swap_list.next = p->type;
+ else
+ swap_info[prev]->next = p->type;
+ spin_unlock(&swap_lock);
+}
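
enable_swap_info() threads the newly activated area into swap_list in descending priority order by walking the ->next indices. The same insertion logic, pulled out of the kernel types into a standalone sketch (plain arrays stand in for swap_info[], -1 terminates the list, and the priorities are made up), behaves like this:

#include <stdio.h>

#define NTYPES 4

static int prio[NTYPES];	/* stand-in for swap_info[i]->prio */
static int next[NTYPES];	/* stand-in for swap_info[i]->next */
static int head = -1;		/* stand-in for swap_list.head */

/* insert entry 'type' keeping the list sorted by descending priority */
static void insert(int type)
{
	int i, prev = -1;

	for (i = head; i >= 0; i = next[i]) {
		if (prio[type] >= prio[i])
			break;
		prev = i;
	}
	next[type] = i;
	if (prev < 0)
		head = type;
	else
		next[prev] = type;
}

int main(void)
{
	int p[NTYPES] = { -1, 5, -2, 5 };

	for (int t = 0; t < NTYPES; t++) {
		prio[t] = p[t];
		insert(t);
	}
	for (int i = head; i >= 0; i = next[i])
		printf("type %d, prio %d\n", i, prio[i]);
	return 0;
}

A later entry with an equal priority lands ahead of the earlier one because the walk breaks on >=, which is the same behaviour as the open-coded loop this helper replaces.
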
+
SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
{
struct swap_info_struct *p = NULL;
@@ -1619,32 +1618,17 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
current->flags &= ~PF_OOM_ORIGIN;
if (err) {
+ /*
+ * reading p->prio and p->swap_map outside the lock is
+ * safe here because only sys_swapon and sys_swapoff
+ * change them, and there can be no other sys_swapon or
+ * sys_swapoff for this swap_info_struct at this point.
+ */
/* re-insert swap space back into swap_list */
- spin_lock(&swap_lock);
- if (p->prio < 0)
- p->prio = --least_priority;
- prev = -1;
- for (i = swap_list.head; i >= 0; i = swap_info[i]->next) {
- if (p->prio >= swap_info[i]->prio)
- break;
- prev = i;
- }
- p->next = i;
- if (prev < 0)
- swap_list.head = swap_list.next = type;
- else
- swap_info[prev]->next = type;
- nr_swap_pages += p->pages;
- total_swap_pages += p->pages;
- p->flags |= SWP_WRITEOK;
- spin_unlock(&swap_lock);
+ enable_swap_info(p, p->prio, p->swap_map);
goto out_dput;
}
- /* wait for any unplug function to finish */
- down_write(&swap_unplug_sem);
- up_write(&swap_unplug_sem);
-
destroy_swap_extents(p);
if (p->flags & SWP_CONTINUED)
free_swap_count_continuations(p);
@@ -1677,7 +1661,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
if (S_ISBLK(inode->i_mode)) {
struct block_device *bdev = I_BDEV(inode);
set_blocksize(bdev, p->old_block_size);
- bd_release(bdev);
+ blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
} else {
mutex_lock(&inode->i_mutex);
inode->i_flags &= ~S_SWAPFILE;
@@ -1842,49 +1826,24 @@ static int __init max_swapfiles_check(void)
late_initcall(max_swapfiles_check);
#endif
-/*
- * Written 01/25/92 by Simmule Turner, heavily changed by Linus.
- *
- * The swapon system call
- */
-SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
+static struct swap_info_struct *alloc_swap_info(void)
{
struct swap_info_struct *p;
- char *name = NULL;
- struct block_device *bdev = NULL;
- struct file *swap_file = NULL;
- struct address_space *mapping;
unsigned int type;
- int i, prev;
- int error;
- union swap_header *swap_header;
- unsigned int nr_good_pages;
- int nr_extents = 0;
- sector_t span;
- unsigned long maxpages;
- unsigned long swapfilepages;
- unsigned char *swap_map = NULL;
- struct page *page = NULL;
- struct inode *inode = NULL;
- int did_down = 0;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
p = kzalloc(sizeof(*p), GFP_KERNEL);
if (!p)
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
spin_lock(&swap_lock);
for (type = 0; type < nr_swapfiles; type++) {
if (!(swap_info[type]->flags & SWP_USED))
break;
}
- error = -EPERM;
if (type >= MAX_SWAPFILES) {
spin_unlock(&swap_lock);
kfree(p);
- goto out;
+ return ERR_PTR(-EPERM);
}
if (type >= nr_swapfiles) {
p->type = type;
@@ -1909,80 +1868,49 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
p->next = -1;
spin_unlock(&swap_lock);
- name = getname(specialfile);
- error = PTR_ERR(name);
- if (IS_ERR(name)) {
- name = NULL;
- goto bad_swap_2;
- }
- swap_file = filp_open(name, O_RDWR|O_LARGEFILE, 0);
- error = PTR_ERR(swap_file);
- if (IS_ERR(swap_file)) {
- swap_file = NULL;
- goto bad_swap_2;
- }
-
- p->swap_file = swap_file;
- mapping = swap_file->f_mapping;
- inode = mapping->host;
-
- error = -EBUSY;
- for (i = 0; i < nr_swapfiles; i++) {
- struct swap_info_struct *q = swap_info[i];
+ return p;
+}
- if (i == type || !q->swap_file)
- continue;
- if (mapping == q->swap_file->f_mapping)
- goto bad_swap;
- }
+static int claim_swapfile(struct swap_info_struct *p, struct inode *inode)
+{
+ int error;
- error = -EINVAL;
if (S_ISBLK(inode->i_mode)) {
- bdev = I_BDEV(inode);
- error = bd_claim(bdev, sys_swapon);
+ p->bdev = bdgrab(I_BDEV(inode));
+ error = blkdev_get(p->bdev,
+ FMODE_READ | FMODE_WRITE | FMODE_EXCL,
+ sys_swapon);
if (error < 0) {
- bdev = NULL;
- error = -EINVAL;
- goto bad_swap;
+ p->bdev = NULL;
+ return -EINVAL;
}
- p->old_block_size = block_size(bdev);
- error = set_blocksize(bdev, PAGE_SIZE);
+ p->old_block_size = block_size(p->bdev);
+ error = set_blocksize(p->bdev, PAGE_SIZE);
if (error < 0)
- goto bad_swap;
- p->bdev = bdev;
+ return error;
p->flags |= SWP_BLKDEV;
} else if (S_ISREG(inode->i_mode)) {
p->bdev = inode->i_sb->s_bdev;
mutex_lock(&inode->i_mutex);
- did_down = 1;
- if (IS_SWAPFILE(inode)) {
- error = -EBUSY;
- goto bad_swap;
- }
- } else {
- goto bad_swap;
- }
+ if (IS_SWAPFILE(inode))
+ return -EBUSY;
+ } else
+ return -EINVAL;
- swapfilepages = i_size_read(inode) >> PAGE_SHIFT;
+ return 0;
+}
- /*
- * Read the swap header.
- */
- if (!mapping->a_ops->readpage) {
- error = -EINVAL;
- goto bad_swap;
- }
- page = read_mapping_page(mapping, 0, swap_file);
- if (IS_ERR(page)) {
- error = PTR_ERR(page);
- goto bad_swap;
- }
- swap_header = kmap(page);
+static unsigned long read_swap_header(struct swap_info_struct *p,
+ union swap_header *swap_header,
+ struct inode *inode)
+{
+ int i;
+ unsigned long maxpages;
+ unsigned long swapfilepages;
if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) {
printk(KERN_ERR "Unable to find swap-space signature\n");
- error = -EINVAL;
- goto bad_swap;
+ return 0;
}
 /* swap partition endianness hack... */
@@ -1998,8 +1926,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
printk(KERN_WARNING
"Unable to handle swap header version %d\n",
swap_header->info.version);
- error = -EINVAL;
- goto bad_swap;
+ return 0;
}
p->lowest_bit = 1;
@@ -2030,61 +1957,155 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
}
p->highest_bit = maxpages - 1;
- error = -EINVAL;
if (!maxpages)
- goto bad_swap;
+ return 0;
+ swapfilepages = i_size_read(inode) >> PAGE_SHIFT;
if (swapfilepages && maxpages > swapfilepages) {
printk(KERN_WARNING
"Swap area shorter than signature indicates\n");
- goto bad_swap;
+ return 0;
}
if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode))
- goto bad_swap;
+ return 0;
if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
- goto bad_swap;
+ return 0;
- /* OK, set up the swap map and apply the bad block list */
- swap_map = vmalloc(maxpages);
- if (!swap_map) {
- error = -ENOMEM;
- goto bad_swap;
- }
+ return maxpages;
+}
+
+static int setup_swap_map_and_extents(struct swap_info_struct *p,
+ union swap_header *swap_header,
+ unsigned char *swap_map,
+ unsigned long maxpages,
+ sector_t *span)
+{
+ int i;
+ unsigned int nr_good_pages;
+ int nr_extents;
- memset(swap_map, 0, maxpages);
nr_good_pages = maxpages - 1; /* omit header page */
for (i = 0; i < swap_header->info.nr_badpages; i++) {
unsigned int page_nr = swap_header->info.badpages[i];
- if (page_nr == 0 || page_nr > swap_header->info.last_page) {
- error = -EINVAL;
- goto bad_swap;
- }
+ if (page_nr == 0 || page_nr > swap_header->info.last_page)
+ return -EINVAL;
if (page_nr < maxpages) {
swap_map[page_nr] = SWAP_MAP_BAD;
nr_good_pages--;
}
}
- error = swap_cgroup_swapon(type, maxpages);
- if (error)
- goto bad_swap;
-
if (nr_good_pages) {
swap_map[0] = SWAP_MAP_BAD;
p->max = maxpages;
p->pages = nr_good_pages;
- nr_extents = setup_swap_extents(p, &span);
- if (nr_extents < 0) {
- error = nr_extents;
- goto bad_swap;
- }
+ nr_extents = setup_swap_extents(p, span);
+ if (nr_extents < 0)
+ return nr_extents;
nr_good_pages = p->pages;
}
if (!nr_good_pages) {
printk(KERN_WARNING "Empty swap-file\n");
+ return -EINVAL;
+ }
+
+ return nr_extents;
+}
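
setup_swap_map_and_extents() starts from maxpages - 1 usable slots (the header page is excluded), marks every in-range bad page SWAP_MAP_BAD and subtracts it from the good-page count. A standalone sketch of that bookkeeping, with an invented bad-page list and SWAP_MAP_BAD written as a local constant (the sketch skips the validation against the header's last_page field):

#include <stdio.h>

#define SWAP_MAP_BAD 0x3f		/* stand-in for the kernel constant */

int main(void)
{
	unsigned long maxpages = 1024;
	unsigned char swap_map[1024] = { 0 };		/* vzalloc()'d in the kernel */
	unsigned int badpages[] = { 7, 300, 2000 };	/* hypothetical header data */
	unsigned long nr_good = maxpages - 1;		/* omit header page */

	swap_map[0] = SWAP_MAP_BAD;	/* slot 0 holds the swap header */
	for (unsigned int i = 0; i < sizeof(badpages) / sizeof(badpages[0]); i++) {
		unsigned int page_nr = badpages[i];

		if (page_nr < maxpages) {	/* entries beyond maxpages are ignored */
			swap_map[page_nr] = SWAP_MAP_BAD;
			nr_good--;
		}
	}
	printf("usable swap slots: %lu of %lu\n", nr_good, maxpages);
	return 0;
}
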
+
+SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
+{
+ struct swap_info_struct *p;
+ char *name;
+ struct file *swap_file = NULL;
+ struct address_space *mapping;
+ int i;
+ int prio;
+ int error;
+ union swap_header *swap_header;
+ int nr_extents;
+ sector_t span;
+ unsigned long maxpages;
+ unsigned char *swap_map = NULL;
+ struct page *page = NULL;
+ struct inode *inode = NULL;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ p = alloc_swap_info();
+ if (IS_ERR(p))
+ return PTR_ERR(p);
+
+ name = getname(specialfile);
+ if (IS_ERR(name)) {
+ error = PTR_ERR(name);
+ name = NULL;
+ goto bad_swap;
+ }
+ swap_file = filp_open(name, O_RDWR|O_LARGEFILE, 0);
+ if (IS_ERR(swap_file)) {
+ error = PTR_ERR(swap_file);
+ swap_file = NULL;
+ goto bad_swap;
+ }
+
+ p->swap_file = swap_file;
+ mapping = swap_file->f_mapping;
+
+ for (i = 0; i < nr_swapfiles; i++) {
+ struct swap_info_struct *q = swap_info[i];
+
+ if (q == p || !q->swap_file)
+ continue;
+ if (mapping == q->swap_file->f_mapping) {
+ error = -EBUSY;
+ goto bad_swap;
+ }
+ }
+
+ inode = mapping->host;
+ /* If S_ISREG(inode->i_mode), claim_swapfile() will do mutex_lock(&inode->i_mutex) */
+ error = claim_swapfile(p, inode);
+ if (unlikely(error))
+ goto bad_swap;
+
+ /*
+ * Read the swap header.
+ */
+ if (!mapping->a_ops->readpage) {
error = -EINVAL;
goto bad_swap;
}
+ page = read_mapping_page(mapping, 0, swap_file);
+ if (IS_ERR(page)) {
+ error = PTR_ERR(page);
+ goto bad_swap;
+ }
+ swap_header = kmap(page);
+
+ maxpages = read_swap_header(p, swap_header, inode);
+ if (unlikely(!maxpages)) {
+ error = -EINVAL;
+ goto bad_swap;
+ }
+
+ /* OK, set up the swap map and apply the bad block list */
+ swap_map = vzalloc(maxpages);
+ if (!swap_map) {
+ error = -ENOMEM;
+ goto bad_swap;
+ }
+
+ error = swap_cgroup_swapon(p->type, maxpages);
+ if (error)
+ goto bad_swap;
+
+ nr_extents = setup_swap_map_and_extents(p, swap_header, swap_map,
+ maxpages, &span);
+ if (unlikely(nr_extents < 0)) {
+ error = nr_extents;
+ goto bad_swap;
+ }
if (p->bdev) {
if (blk_queue_nonrot(bdev_get_queue(p->bdev))) {
@@ -2096,58 +2117,46 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
}
mutex_lock(&swapon_mutex);
- spin_lock(&swap_lock);
+ prio = -1;
if (swap_flags & SWAP_FLAG_PREFER)
- p->prio =
+ prio =
(swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT;
- else
- p->prio = --least_priority;
- p->swap_map = swap_map;
- p->flags |= SWP_WRITEOK;
- nr_swap_pages += nr_good_pages;
- total_swap_pages += nr_good_pages;
+ enable_swap_info(p, prio, swap_map);
printk(KERN_INFO "Adding %uk swap on %s. "
"Priority:%d extents:%d across:%lluk %s%s\n",
- nr_good_pages<<(PAGE_SHIFT-10), name, p->prio,
+ p->pages<<(PAGE_SHIFT-10), name, p->prio,
nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10),
(p->flags & SWP_SOLIDSTATE) ? "SS" : "",
(p->flags & SWP_DISCARDABLE) ? "D" : "");
- /* insert swap space into swap_list: */
- prev = -1;
- for (i = swap_list.head; i >= 0; i = swap_info[i]->next) {
- if (p->prio >= swap_info[i]->prio)
- break;
- prev = i;
- }
- p->next = i;
- if (prev < 0)
- swap_list.head = swap_list.next = type;
- else
- swap_info[prev]->next = type;
- spin_unlock(&swap_lock);
mutex_unlock(&swapon_mutex);
atomic_inc(&proc_poll_event);
wake_up_interruptible(&proc_poll_wait);
+ if (S_ISREG(inode->i_mode))
+ inode->i_flags |= S_SWAPFILE;
error = 0;
goto out;
bad_swap:
- if (bdev) {
- set_blocksize(bdev, p->old_block_size);
- bd_release(bdev);
+ if (inode && S_ISBLK(inode->i_mode) && p->bdev) {
+ set_blocksize(p->bdev, p->old_block_size);
+ blkdev_put(p->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
}
destroy_swap_extents(p);
- swap_cgroup_swapoff(type);
-bad_swap_2:
+ swap_cgroup_swapoff(p->type);
spin_lock(&swap_lock);
p->swap_file = NULL;
p->flags = 0;
spin_unlock(&swap_lock);
vfree(swap_map);
- if (swap_file)
+ if (swap_file) {
+ if (inode && S_ISREG(inode->i_mode)) {
+ mutex_unlock(&inode->i_mutex);
+ inode = NULL;
+ }
filp_close(swap_file, NULL);
+ }
out:
if (page && !IS_ERR(page)) {
kunmap(page);
@@ -2155,11 +2164,8 @@ out:
}
if (name)
putname(name);
- if (did_down) {
- if (!error)
- inode->i_flags |= S_SWAPFILE;
+ if (inode && S_ISREG(inode->i_mode))
mutex_unlock(&inode->i_mutex);
- }
return error;
}
diff --git a/mm/truncate.c b/mm/truncate.c
index ba887bff48c5..a95667529135 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -106,9 +106,8 @@ truncate_complete_page(struct address_space *mapping, struct page *page)
cancel_dirty_page(page, PAGE_CACHE_SIZE);
clear_page_mlock(page);
- remove_from_page_cache(page);
ClearPageMappedToDisk(page);
- page_cache_release(page); /* pagecache ref */
+ delete_from_page_cache(page);
return 0;
}
@@ -225,6 +224,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
next = start;
while (next <= end &&
pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
+ mem_cgroup_uncharge_start();
for (i = 0; i < pagevec_count(&pvec); i++) {
struct page *page = pvec.pages[i];
pgoff_t page_index = page->index;
@@ -247,6 +247,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
unlock_page(page);
}
pagevec_release(&pvec);
+ mem_cgroup_uncharge_end();
cond_resched();
}
@@ -320,11 +321,12 @@ EXPORT_SYMBOL(truncate_inode_pages);
* pagetables.
*/
unsigned long invalidate_mapping_pages(struct address_space *mapping,
- pgoff_t start, pgoff_t end)
+ pgoff_t start, pgoff_t end)
{
struct pagevec pvec;
pgoff_t next = start;
- unsigned long ret = 0;
+ unsigned long ret;
+ unsigned long count = 0;
int i;
pagevec_init(&pvec, 0);
@@ -351,9 +353,15 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
if (lock_failed)
continue;
- ret += invalidate_inode_page(page);
-
+ ret = invalidate_inode_page(page);
unlock_page(page);
+ /*
+ * Invalidation is a hint that the page is no longer
+ * of interest; try to speed up its reclaim.
+ */
+ if (!ret)
+ deactivate_page(page);
+ count += ret;
if (next > end)
break;
}
@@ -361,7 +369,7 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
mem_cgroup_uncharge_end();
cond_resched();
}
- return ret;
+ return count;
}
EXPORT_SYMBOL(invalidate_mapping_pages);
@@ -387,9 +395,13 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page)
clear_page_mlock(page);
BUG_ON(page_has_private(page));
- __remove_from_page_cache(page);
+ __delete_from_page_cache(page);
spin_unlock_irq(&mapping->tree_lock);
mem_cgroup_uncharge_cache_page(page);
+
+ if (mapping->a_ops->freepage)
+ mapping->a_ops->freepage(page);
+
page_cache_release(page); /* pagecache ref */
return 1;
failed:
@@ -545,13 +557,12 @@ EXPORT_SYMBOL(truncate_pagecache);
* @inode: inode
* @newsize: new file size
*
- * truncate_setsize updastes i_size update and performs pagecache
- * truncation (if necessary) for a file size updates. It will be
- * typically be called from the filesystem's setattr function when
- * ATTR_SIZE is passed in.
+ * truncate_setsize updates i_size and performs pagecache truncation (if
+ * necessary) to @newsize. It will typically be called from the filesystem's
+ * setattr function when ATTR_SIZE is passed in.
*
- * Must be called with inode_mutex held and after all filesystem
- * specific block truncation has been performed.
+ * Must be called with inode_mutex held and before all filesystem specific
+ * block truncation has been performed.
*/
void truncate_setsize(struct inode *inode, loff_t newsize)
{
diff --git a/mm/util.c b/mm/util.c
index 73dac81e9f78..e7b103a6fd21 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -186,27 +186,6 @@ void kzfree(const void *p)
}
EXPORT_SYMBOL(kzfree);
-int kern_ptr_validate(const void *ptr, unsigned long size)
-{
- unsigned long addr = (unsigned long)ptr;
- unsigned long min_addr = PAGE_OFFSET;
- unsigned long align_mask = sizeof(void *) - 1;
-
- if (unlikely(addr < min_addr))
- goto out;
- if (unlikely(addr > (unsigned long)high_memory - size))
- goto out;
- if (unlikely(addr & align_mask))
- goto out;
- if (unlikely(!kern_addr_valid(addr)))
- goto out;
- if (unlikely(!kern_addr_valid(addr + size - 1)))
- goto out;
- return 1;
-out:
- return 0;
-}
-
/*
* strndup_user - duplicate an existing string from user space
* @s: The string to duplicate
@@ -248,7 +227,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
/*
* Like get_user_pages_fast() except its IRQ-safe in that it won't fall
* back to the regular GUP.
- * If the architecture not support this fucntion, simply return with no
+ * If the architecture does not support this function, simply return with no
* page pinned
*/
int __attribute__((weak)) __get_user_pages_fast(unsigned long start,
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index eb5cc7d00c5a..5d6030235d7a 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -261,8 +261,15 @@ struct vmap_area {
};
static DEFINE_SPINLOCK(vmap_area_lock);
-static struct rb_root vmap_area_root = RB_ROOT;
static LIST_HEAD(vmap_area_list);
+static struct rb_root vmap_area_root = RB_ROOT;
+
+/* The vmap cache globals are protected by vmap_area_lock */
+static struct rb_node *free_vmap_cache;
+static unsigned long cached_hole_size;
+static unsigned long cached_vstart;
+static unsigned long cached_align;
+
static unsigned long vmap_area_pcpu_hole;
static struct vmap_area *__find_vmap_area(unsigned long addr)
@@ -331,9 +338,11 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
struct rb_node *n;
unsigned long addr;
int purged = 0;
+ struct vmap_area *first;
BUG_ON(!size);
BUG_ON(size & ~PAGE_MASK);
+ BUG_ON(!is_power_of_2(align));
va = kmalloc_node(sizeof(struct vmap_area),
gfp_mask & GFP_RECLAIM_MASK, node);
@@ -341,79 +350,106 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
return ERR_PTR(-ENOMEM);
retry:
- addr = ALIGN(vstart, align);
-
spin_lock(&vmap_area_lock);
- if (addr + size - 1 < addr)
- goto overflow;
+ /*
+ * Invalidate cache if we have more permissive parameters.
+ * cached_hole_size notes the largest hole noticed _below_
+ * the vmap_area cached in free_vmap_cache: if size fits
+ * into that hole, we want to scan from vstart to reuse
+ * the hole instead of allocating above free_vmap_cache.
+ * Note that __free_vmap_area may update free_vmap_cache
+ * without updating cached_hole_size or cached_align.
+ */
+ if (!free_vmap_cache ||
+ size < cached_hole_size ||
+ vstart < cached_vstart ||
+ align < cached_align) {
+nocache:
+ cached_hole_size = 0;
+ free_vmap_cache = NULL;
+ }
+ /* record if we encounter less permissive parameters */
+ cached_vstart = vstart;
+ cached_align = align;
+
+ /* find starting point for our search */
+ if (free_vmap_cache) {
+ first = rb_entry(free_vmap_cache, struct vmap_area, rb_node);
+ addr = ALIGN(first->va_end + PAGE_SIZE, align);
+ if (addr < vstart)
+ goto nocache;
+ if (addr + size - 1 < addr)
+ goto overflow;
+
+ } else {
+ addr = ALIGN(vstart, align);
+ if (addr + size - 1 < addr)
+ goto overflow;
- /* XXX: could have a last_hole cache */
- n = vmap_area_root.rb_node;
- if (n) {
- struct vmap_area *first = NULL;
+ n = vmap_area_root.rb_node;
+ first = NULL;
- do {
+ while (n) {
struct vmap_area *tmp;
tmp = rb_entry(n, struct vmap_area, rb_node);
if (tmp->va_end >= addr) {
- if (!first && tmp->va_start < addr + size)
- first = tmp;
- n = n->rb_left;
- } else {
first = tmp;
+ if (tmp->va_start <= addr)
+ break;
+ n = n->rb_left;
+ } else
n = n->rb_right;
- }
- } while (n);
+ }
if (!first)
goto found;
-
- if (first->va_end < addr) {
- n = rb_next(&first->rb_node);
- if (n)
- first = rb_entry(n, struct vmap_area, rb_node);
- else
- goto found;
- }
-
- while (addr + size > first->va_start && addr + size <= vend) {
- addr = ALIGN(first->va_end + PAGE_SIZE, align);
- if (addr + size - 1 < addr)
- goto overflow;
-
- n = rb_next(&first->rb_node);
- if (n)
- first = rb_entry(n, struct vmap_area, rb_node);
- else
- goto found;
- }
}
-found:
- if (addr + size > vend) {
-overflow:
- spin_unlock(&vmap_area_lock);
- if (!purged) {
- purge_vmap_area_lazy();
- purged = 1;
- goto retry;
- }
- if (printk_ratelimit())
- printk(KERN_WARNING
- "vmap allocation for size %lu failed: "
- "use vmalloc=<size> to increase size.\n", size);
- kfree(va);
- return ERR_PTR(-EBUSY);
+
+ /* from the starting point, walk areas until a suitable hole is found */
+ while (addr + size >= first->va_start && addr + size <= vend) {
+ if (addr + cached_hole_size < first->va_start)
+ cached_hole_size = first->va_start - addr;
+ addr = ALIGN(first->va_end + PAGE_SIZE, align);
+ if (addr + size - 1 < addr)
+ goto overflow;
+
+ n = rb_next(&first->rb_node);
+ if (n)
+ first = rb_entry(n, struct vmap_area, rb_node);
+ else
+ goto found;
}
- BUG_ON(addr & (align-1));
+found:
+ if (addr + size > vend)
+ goto overflow;
va->va_start = addr;
va->va_end = addr + size;
va->flags = 0;
__insert_vmap_area(va);
+ free_vmap_cache = &va->rb_node;
spin_unlock(&vmap_area_lock);
+ BUG_ON(va->va_start & (align-1));
+ BUG_ON(va->va_start < vstart);
+ BUG_ON(va->va_end > vend);
+
return va;
+
+overflow:
+ spin_unlock(&vmap_area_lock);
+ if (!purged) {
+ purge_vmap_area_lazy();
+ purged = 1;
+ goto retry;
+ }
+ if (printk_ratelimit())
+ printk(KERN_WARNING
+ "vmap allocation for size %lu failed: "
+ "use vmalloc=<size> to increase size.\n", size);
+ kfree(va);
+ return ERR_PTR(-EBUSY);
}
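
The rewritten search above steps through existing areas with addr = ALIGN(va_end + PAGE_SIZE, align) and remembers the largest hole it skipped in cached_hole_size, so that a later, smaller request restarts from vstart instead of piling up behind the cache. The address arithmetic on its own can be checked in a standalone sketch; the area list, sizes and alignment below are made-up values, not taken from the patch:

#include <stdio.h>

#define PAGE_SIZE 4096UL
#define ALIGN(x, a) (((x) + (a) - 1) & ~((a) - 1))

struct area { unsigned long start, end; };

int main(void)
{
	/* hypothetical in-use areas, sorted by address */
	struct area areas[] = {
		{ 0x100000, 0x104000 },
		{ 0x110000, 0x130000 },
		{ 0x200000, 0x220000 },
	};
	unsigned long size = 0x40000, align = 0x10000;
	unsigned long addr = ALIGN(0x100000, align);
	unsigned long hole = 0;		/* plays the role of cached_hole_size */

	for (int i = 0; i < 3; i++) {
		if (addr + size < areas[i].start)
			break;		/* the request fits before this area */
		if (addr + hole < areas[i].start)
			hole = areas[i].start - addr;
		addr = ALIGN(areas[i].end + PAGE_SIZE, align);
	}
	printf("placed at %#lx, largest skipped hole %#lx\n", addr, hole);
	return 0;
}
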
static void rcu_free_va(struct rcu_head *head)
@@ -426,6 +462,22 @@ static void rcu_free_va(struct rcu_head *head)
static void __free_vmap_area(struct vmap_area *va)
{
BUG_ON(RB_EMPTY_NODE(&va->rb_node));
+
+ if (free_vmap_cache) {
+ if (va->va_end < cached_vstart) {
+ free_vmap_cache = NULL;
+ } else {
+ struct vmap_area *cache;
+ cache = rb_entry(free_vmap_cache, struct vmap_area, rb_node);
+ if (va->va_start <= cache->va_start) {
+ free_vmap_cache = rb_prev(&va->rb_node);
+ /*
+ * We don't try to update cached_hole_size or
+ * cached_align, but it won't go very wrong.
+ */
+ }
+ }
+ }
rb_erase(&va->rb_node, &vmap_area_root);
RB_CLEAR_NODE(&va->rb_node);
list_del_rcu(&va->list);
@@ -748,7 +800,7 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask)
va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE,
VMALLOC_START, VMALLOC_END,
node, gfp_mask);
- if (unlikely(IS_ERR(va))) {
+ if (IS_ERR(va)) {
kfree(vb);
return ERR_CAST(va);
}
@@ -1175,6 +1227,7 @@ void unmap_kernel_range_noflush(unsigned long addr, unsigned long size)
{
vunmap_page_range(addr, addr + size);
}
+EXPORT_SYMBOL_GPL(unmap_kernel_range_noflush);
/**
* unmap_kernel_range - unmap kernel VM area and flush cache and TLB
@@ -1315,13 +1368,6 @@ struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags,
-1, GFP_KERNEL, caller);
}
-struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags,
- int node, gfp_t gfp_mask)
-{
- return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END,
- node, gfp_mask, __builtin_return_address(0));
-}
-
static struct vm_struct *find_vm_area(const void *addr)
{
struct vmap_area *va;
@@ -1537,25 +1583,12 @@ fail:
return NULL;
}
-void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot)
-{
- void *addr = __vmalloc_area_node(area, gfp_mask, prot, -1,
- __builtin_return_address(0));
-
- /*
- * A ref_count = 3 is needed because the vm_struct and vmap_area
- * structures allocated in the __get_vm_area_node() function contain
- * references to the virtual address of the vmalloc'ed block.
- */
- kmemleak_alloc(addr, area->size - PAGE_SIZE, 3, gfp_mask);
-
- return addr;
-}
-
/**
- * __vmalloc_node - allocate virtually contiguous memory
+ * __vmalloc_node_range - allocate virtually contiguous memory
* @size: allocation size
* @align: desired alignment
+ * @start: vm area range start
+ * @end: vm area range end
* @gfp_mask: flags for the page level allocator
* @prot: protection mask for the allocated pages
* @node: node to use for allocation or -1
@@ -1565,9 +1598,9 @@ void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot)
* allocator with @gfp_mask flags. Map them into contiguous
* kernel virtual space, using a pagetable protection of @prot.
*/
-static void *__vmalloc_node(unsigned long size, unsigned long align,
- gfp_t gfp_mask, pgprot_t prot,
- int node, void *caller)
+void *__vmalloc_node_range(unsigned long size, unsigned long align,
+ unsigned long start, unsigned long end, gfp_t gfp_mask,
+ pgprot_t prot, int node, void *caller)
{
struct vm_struct *area;
void *addr;
@@ -1577,8 +1610,8 @@ static void *__vmalloc_node(unsigned long size, unsigned long align,
if (!size || (size >> PAGE_SHIFT) > totalram_pages)
return NULL;
- area = __get_vm_area_node(size, align, VM_ALLOC, VMALLOC_START,
- VMALLOC_END, node, gfp_mask, caller);
+ area = __get_vm_area_node(size, align, VM_ALLOC, start, end, node,
+ gfp_mask, caller);
if (!area)
return NULL;
@@ -1595,6 +1628,27 @@ static void *__vmalloc_node(unsigned long size, unsigned long align,
return addr;
}
+/**
+ * __vmalloc_node - allocate virtually contiguous memory
+ * @size: allocation size
+ * @align: desired alignment
+ * @gfp_mask: flags for the page level allocator
+ * @prot: protection mask for the allocated pages
+ * @node: node to use for allocation or -1
+ * @caller: caller's return address
+ *
+ * Allocate enough pages to cover @size from the page level
+ * allocator with @gfp_mask flags. Map them into contiguous
+ * kernel virtual space, using a pagetable protection of @prot.
+ */
+static void *__vmalloc_node(unsigned long size, unsigned long align,
+ gfp_t gfp_mask, pgprot_t prot,
+ int node, void *caller)
+{
+ return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END,
+ gfp_mask, prot, node, caller);
+}
+
void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
{
return __vmalloc_node(size, 1, gfp_mask, prot, -1,
@@ -1949,8 +2003,6 @@ finished:
* should know vmalloc() area is valid and can use memcpy().
* This is for routines which have to access vmalloc area without
 * any information, as /dev/kmem.
- *
- * The caller should guarantee KM_USER1 is not used.
*/
long vwrite(char *buf, char *addr, unsigned long count)
@@ -2203,17 +2255,16 @@ static unsigned long pvm_determine_end(struct vmap_area **pnext,
* @sizes: array containing size of each area
* @nr_vms: the number of areas to allocate
* @align: alignment, all entries in @offsets and @sizes must be aligned to this
- * @gfp_mask: allocation mask
*
* Returns: kmalloc'd vm_struct pointer array pointing to allocated
* vm_structs on success, %NULL on failure
*
* Percpu allocator wants to use congruent vm areas so that it can
* maintain the offsets among percpu areas. This function allocates
- * congruent vmalloc areas for it. These areas tend to be scattered
- * pretty far, distance between two areas easily going up to
- * gigabytes. To avoid interacting with regular vmallocs, these areas
- * are allocated from top.
+ * congruent vmalloc areas for it with GFP_KERNEL. These areas tend to
+ * be scattered pretty far, distance between two areas easily going up
+ * to gigabytes. To avoid interacting with regular vmallocs, these
+ * areas are allocated from top.
*
* Despite its complicated look, this allocator is rather simple. It
* does everything top-down and scans areas from the end looking for
@@ -2224,7 +2275,7 @@ static unsigned long pvm_determine_end(struct vmap_area **pnext,
*/
struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
const size_t *sizes, int nr_vms,
- size_t align, gfp_t gfp_mask)
+ size_t align)
{
const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align);
const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
@@ -2234,8 +2285,6 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
unsigned long base, start, end, last_end;
bool purged = false;
- gfp_mask &= GFP_RECLAIM_MASK;
-
/* verify parameters and allocate data structures */
BUG_ON(align & ~PAGE_MASK || !is_power_of_2(align));
for (last_area = 0, area = 0; area < nr_vms; area++) {
@@ -2268,14 +2317,14 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
return NULL;
}
- vms = kzalloc(sizeof(vms[0]) * nr_vms, gfp_mask);
- vas = kzalloc(sizeof(vas[0]) * nr_vms, gfp_mask);
+ vms = kzalloc(sizeof(vms[0]) * nr_vms, GFP_KERNEL);
+ vas = kzalloc(sizeof(vas[0]) * nr_vms, GFP_KERNEL);
if (!vas || !vms)
goto err_free;
for (area = 0; area < nr_vms; area++) {
- vas[area] = kzalloc(sizeof(struct vmap_area), gfp_mask);
- vms[area] = kzalloc(sizeof(struct vm_struct), gfp_mask);
+ vas[area] = kzalloc(sizeof(struct vmap_area), GFP_KERNEL);
+ vms[area] = kzalloc(sizeof(struct vm_struct), GFP_KERNEL);
if (!vas[area] || !vms[area])
goto err_free;
}
@@ -2456,13 +2505,8 @@ static int s_show(struct seq_file *m, void *p)
seq_printf(m, "0x%p-0x%p %7ld",
v->addr, v->addr + v->size, v->size);
- if (v->caller) {
- char buff[KSYM_SYMBOL_LEN];
-
- seq_putc(m, ' ');
- sprint_symbol(buff, (unsigned long)v->caller);
- seq_puts(m, buff);
- }
+ if (v->caller)
+ seq_printf(m, " %pS", v->caller);
if (v->nr_pages)
seq_printf(m, " pages=%d", v->nr_pages);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index d31d7ce52c0e..f6b435c80079 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -32,6 +32,7 @@
#include <linux/topology.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
+#include <linux/compaction.h>
#include <linux/notifier.h>
#include <linux/rwsem.h>
#include <linux/delay.h>
@@ -40,6 +41,7 @@
#include <linux/memcontrol.h>
#include <linux/delayacct.h>
#include <linux/sysctl.h>
+#include <linux/oom.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
@@ -51,11 +53,23 @@
#define CREATE_TRACE_POINTS
#include <trace/events/vmscan.h>
-enum lumpy_mode {
- LUMPY_MODE_NONE,
- LUMPY_MODE_ASYNC,
- LUMPY_MODE_SYNC,
-};
+/*
+ * reclaim_mode determines how the inactive list is shrunk
+ * RECLAIM_MODE_SINGLE: Reclaim only order-0 pages
+ * RECLAIM_MODE_ASYNC: Do not block
+ * RECLAIM_MODE_SYNC: Allow blocking e.g. call wait_on_page_writeback
+ * RECLAIM_MODE_LUMPYRECLAIM: For high-order allocations, take a reference
+ * page from the LRU and reclaim all pages within a
+ * naturally aligned range
+ * RECLAIM_MODE_COMPACTION: For high-order allocations, reclaim a number of
+ * order-0 pages and then compact the zone
+ */
+typedef unsigned __bitwise__ reclaim_mode_t;
+#define RECLAIM_MODE_SINGLE ((__force reclaim_mode_t)0x01u)
+#define RECLAIM_MODE_ASYNC ((__force reclaim_mode_t)0x02u)
+#define RECLAIM_MODE_SYNC ((__force reclaim_mode_t)0x04u)
+#define RECLAIM_MODE_LUMPYRECLAIM ((__force reclaim_mode_t)0x08u)
+#define RECLAIM_MODE_COMPACTION ((__force reclaim_mode_t)0x10u)
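
reclaim_mode is an ordinary bitmask: set_reclaim_mode() below combines one of the SINGLE/LUMPYRECLAIM/COMPACTION bits with ASYNC or SYNC, and the rest of vmscan tests individual bits with &. A tiny standalone illustration of that flag algebra (plain unsigned here instead of the kernel's __bitwise type):

#include <stdio.h>

#define RECLAIM_MODE_SINGLE		0x01u
#define RECLAIM_MODE_ASYNC		0x02u
#define RECLAIM_MODE_SYNC		0x04u
#define RECLAIM_MODE_LUMPYRECLAIM	0x08u
#define RECLAIM_MODE_COMPACTION		0x10u

int main(void)
{
	/* high-order request on a compaction-enabled kernel, blocking allowed */
	unsigned int mode = RECLAIM_MODE_COMPACTION | RECLAIM_MODE_SYNC;

	if (mode & RECLAIM_MODE_SYNC)
		printf("may wait on page writeback\n");
	if (!(mode & RECLAIM_MODE_LUMPYRECLAIM))
		printf("page references are still honoured (no lumpy reclaim)\n");

	/* what reset_reclaim_mode() falls back to */
	mode = RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC;
	printf("mode after reset: %#x\n", mode);
	return 0;
}
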
struct scan_control {
/* Incremented by the number of inactive pages that were scanned */
@@ -88,7 +102,7 @@ struct scan_control {
* Intend to reclaim enough continuous memory rather than reclaim
* enough amount of memory. i.e, mode for high order allocation.
*/
- enum lumpy_mode lumpy_reclaim_mode;
+ reclaim_mode_t reclaim_mode;
/* Which cgroup do we reclaim from */
struct mem_cgroup *mem_cgroup;
@@ -271,34 +285,37 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
return ret;
}
-static void set_lumpy_reclaim_mode(int priority, struct scan_control *sc,
+static void set_reclaim_mode(int priority, struct scan_control *sc,
bool sync)
{
- enum lumpy_mode mode = sync ? LUMPY_MODE_SYNC : LUMPY_MODE_ASYNC;
+ reclaim_mode_t syncmode = sync ? RECLAIM_MODE_SYNC : RECLAIM_MODE_ASYNC;
/*
- * Some reclaim have alredy been failed. No worth to try synchronous
- * lumpy reclaim.
+ * Initially assume we are entering either lumpy reclaim or
+ * reclaim/compaction. Depending on the order, we will either set the
+ * sync mode or just reclaim order-0 pages later.
*/
- if (sync && sc->lumpy_reclaim_mode == LUMPY_MODE_NONE)
- return;
+ if (COMPACTION_BUILD)
+ sc->reclaim_mode = RECLAIM_MODE_COMPACTION;
+ else
+ sc->reclaim_mode = RECLAIM_MODE_LUMPYRECLAIM;
/*
- * If we need a large contiguous chunk of memory, or have
- * trouble getting a small set of contiguous pages, we
- * will reclaim both active and inactive pages.
+ * Avoid using lumpy reclaim or reclaim/compaction if possible by
+ * restricting when it is set to either costly allocations or when
+ * under memory pressure.
*/
if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
- sc->lumpy_reclaim_mode = mode;
+ sc->reclaim_mode |= syncmode;
else if (sc->order && priority < DEF_PRIORITY - 2)
- sc->lumpy_reclaim_mode = mode;
+ sc->reclaim_mode |= syncmode;
else
- sc->lumpy_reclaim_mode = LUMPY_MODE_NONE;
+ sc->reclaim_mode = RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC;
}
-static void disable_lumpy_reclaim_mode(struct scan_control *sc)
+static void reset_reclaim_mode(struct scan_control *sc)
{
- sc->lumpy_reclaim_mode = LUMPY_MODE_NONE;
+ sc->reclaim_mode = RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC;
}
static inline int is_page_cache_freeable(struct page *page)
@@ -342,7 +359,7 @@ static int may_write_to_queue(struct backing_dev_info *bdi,
static void handle_write_error(struct address_space *mapping,
struct page *page, int error)
{
- lock_page_nosync(page);
+ lock_page(page);
if (page_mapping(page) == mapping)
mapping_set_error(mapping, error);
unlock_page(page);
@@ -429,7 +446,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
* first attempt to free a range of pages fails.
*/
if (PageWriteback(page) &&
- sc->lumpy_reclaim_mode == LUMPY_MODE_SYNC)
+ (sc->reclaim_mode & RECLAIM_MODE_SYNC))
wait_on_page_writeback(page);
if (!PageWriteback(page)) {
@@ -437,7 +454,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
ClearPageReclaim(page);
}
trace_mm_vmscan_writepage(page,
- trace_reclaim_flags(page, sc->lumpy_reclaim_mode));
+ trace_reclaim_flags(page, sc->reclaim_mode));
inc_zone_page_state(page, NR_VMSCAN_WRITE);
return PAGE_SUCCESS;
}
@@ -494,9 +511,16 @@ static int __remove_mapping(struct address_space *mapping, struct page *page)
spin_unlock_irq(&mapping->tree_lock);
swapcache_free(swap, page);
} else {
- __remove_from_page_cache(page);
+ void (*freepage)(struct page *);
+
+ freepage = mapping->a_ops->freepage;
+
+ __delete_from_page_cache(page);
spin_unlock_irq(&mapping->tree_lock);
mem_cgroup_uncharge_cache_page(page);
+
+ if (freepage != NULL)
+ freepage(page);
}
return 1;
@@ -615,7 +639,7 @@ static enum page_references page_check_references(struct page *page,
referenced_page = TestClearPageReferenced(page);
/* Lumpy reclaim - ignore references */
- if (sc->lumpy_reclaim_mode != LUMPY_MODE_NONE)
+ if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM)
return PAGEREF_RECLAIM;
/*
@@ -732,7 +756,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
* for any page for which writeback has already
* started.
*/
- if (sc->lumpy_reclaim_mode == LUMPY_MODE_SYNC &&
+ if ((sc->reclaim_mode & RECLAIM_MODE_SYNC) &&
may_enter_fs)
wait_on_page_writeback(page);
else {
@@ -888,7 +912,7 @@ cull_mlocked:
try_to_free_swap(page);
unlock_page(page);
putback_lru_page(page);
- disable_lumpy_reclaim_mode(sc);
+ reset_reclaim_mode(sc);
continue;
activate_locked:
@@ -901,7 +925,7 @@ activate_locked:
keep_locked:
unlock_page(page);
keep:
- disable_lumpy_reclaim_mode(sc);
+ reset_reclaim_mode(sc);
keep_lumpy:
list_add(&page->lru, &ret_pages);
VM_BUG_ON(PageLRU(page) || PageUnevictable(page));
@@ -1021,7 +1045,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
case 0:
list_move(&page->lru, dst);
mem_cgroup_del_lru(page);
- nr_taken++;
+ nr_taken += hpage_nr_pages(page);
break;
case -EBUSY:
@@ -1042,7 +1066,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
* surrounding the tag page. Only take those pages of
* the same active state as that tag page. We may safely
* round the target page pfn down to the requested order
- * as the mem_map is guarenteed valid out to MAX_ORDER,
+ * as the mem_map is guaranteed valid out to MAX_ORDER,
* where that page is in a different zone we will detect
* it from its zone id and abort this block scan.
*/
@@ -1079,7 +1103,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
if (__isolate_lru_page(cursor_page, mode, file) == 0) {
list_move(&cursor_page->lru, dst);
mem_cgroup_del_lru(cursor_page);
- nr_taken++;
+ nr_taken += hpage_nr_pages(page);
nr_lumpy_taken++;
if (PageDirty(cursor_page))
nr_lumpy_dirty++;
@@ -1134,14 +1158,15 @@ static unsigned long clear_active_flags(struct list_head *page_list,
struct page *page;
list_for_each_entry(page, page_list, lru) {
+ int numpages = hpage_nr_pages(page);
lru = page_lru_base_type(page);
if (PageActive(page)) {
lru += LRU_ACTIVE;
ClearPageActive(page);
- nr_active++;
+ nr_active += numpages;
}
if (count)
- count[lru]++;
+ count[lru] += numpages;
}
return nr_active;
@@ -1251,7 +1276,8 @@ putback_lru_pages(struct zone *zone, struct scan_control *sc,
add_page_to_lru_list(zone, page, lru);
if (is_active_lru(lru)) {
int file = is_file_lru(lru);
- reclaim_stat->recent_rotated[file]++;
+ int numpages = hpage_nr_pages(page);
+ reclaim_stat->recent_rotated[file] += numpages;
}
if (!pagevec_add(&pvec, page)) {
spin_unlock_irq(&zone->lru_lock);
@@ -1317,7 +1343,7 @@ static inline bool should_reclaim_stall(unsigned long nr_taken,
return false;
/* Only stall on lumpy reclaim */
- if (sc->lumpy_reclaim_mode == LUMPY_MODE_NONE)
+ if (sc->reclaim_mode & RECLAIM_MODE_SINGLE)
return false;
 /* If we have reclaimed everything on the isolated list, no stall */
@@ -1361,15 +1387,15 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
return SWAP_CLUSTER_MAX;
}
- set_lumpy_reclaim_mode(priority, sc, false);
+ set_reclaim_mode(priority, sc, false);
lru_add_drain();
spin_lock_irq(&zone->lru_lock);
if (scanning_global_lru(sc)) {
nr_taken = isolate_pages_global(nr_to_scan,
&page_list, &nr_scanned, sc->order,
- sc->lumpy_reclaim_mode == LUMPY_MODE_NONE ?
- ISOLATE_INACTIVE : ISOLATE_BOTH,
+ sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ?
+ ISOLATE_BOTH : ISOLATE_INACTIVE,
zone, 0, file);
zone->pages_scanned += nr_scanned;
if (current_is_kswapd())
@@ -1381,8 +1407,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
} else {
nr_taken = mem_cgroup_isolate_pages(nr_to_scan,
&page_list, &nr_scanned, sc->order,
- sc->lumpy_reclaim_mode == LUMPY_MODE_NONE ?
- ISOLATE_INACTIVE : ISOLATE_BOTH,
+ sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ?
+ ISOLATE_BOTH : ISOLATE_INACTIVE,
zone, sc->mem_cgroup,
0, file);
/*
@@ -1404,7 +1430,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
 /* Check if we should synchronously wait for writeback */
if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) {
- set_lumpy_reclaim_mode(priority, sc, true);
+ set_reclaim_mode(priority, sc, true);
nr_reclaimed += shrink_page_list(&page_list, zone, sc);
}
@@ -1419,7 +1445,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
zone_idx(zone),
nr_scanned, nr_reclaimed,
priority,
- trace_shrink_flags(file, sc->lumpy_reclaim_mode));
+ trace_shrink_flags(file, sc->reclaim_mode));
return nr_reclaimed;
}
@@ -1459,7 +1485,7 @@ static void move_active_pages_to_lru(struct zone *zone,
list_move(&page->lru, &zone->lru[lru].list);
mem_cgroup_add_lru_list(page, lru);
- pgmoved++;
+ pgmoved += hpage_nr_pages(page);
if (!pagevec_add(&pvec, page) || list_empty(list)) {
spin_unlock_irq(&zone->lru_lock);
@@ -1527,7 +1553,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
}
if (page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) {
- nr_rotated++;
+ nr_rotated += hpage_nr_pages(page);
/*
* Identify referenced, file-backed active pages and
* give them one more trip around the active list. So
@@ -1798,6 +1824,69 @@ out:
}
/*
+ * Reclaim/compaction depends on a number of pages being freed. To avoid
+ * disruption to the system, a small number of order-0 pages continue to be
+ * rotated and reclaimed in the normal fashion. However, by the time we get
+ * back to the allocator and call try_to_compact_zone(), we ensure that
+ * there are enough free pages for it to be likely successful.
+ */
+static inline bool should_continue_reclaim(struct zone *zone,
+ unsigned long nr_reclaimed,
+ unsigned long nr_scanned,
+ struct scan_control *sc)
+{
+ unsigned long pages_for_compaction;
+ unsigned long inactive_lru_pages;
+
+ /* If not in reclaim/compaction mode, stop */
+ if (!(sc->reclaim_mode & RECLAIM_MODE_COMPACTION))
+ return false;
+
+ /* Consider stopping depending on scan and reclaim activity */
+ if (sc->gfp_mask & __GFP_REPEAT) {
+ /*
+ * For __GFP_REPEAT allocations, stop reclaiming if the
+ * full LRU list has been scanned and we are still failing
+ * to reclaim pages. This full LRU scan is potentially
+ * expensive but a __GFP_REPEAT caller really wants to succeed
+ */
+ if (!nr_reclaimed && !nr_scanned)
+ return false;
+ } else {
+ /*
+ * For non-__GFP_REPEAT allocations which can presumably
+ * fail without consequence, stop if we failed to reclaim
+ * any pages from the last SWAP_CLUSTER_MAX number of
+ * pages that were scanned. This will return to the
+ * caller faster at the risk that reclaim/compaction and the
+ * resulting allocation attempt fail.
+ */
+ if (!nr_reclaimed)
+ return false;
+ }
+
+ /*
+ * If we have not reclaimed enough pages for compaction and the
+ * inactive lists are large enough, continue reclaiming
+ */
+ pages_for_compaction = (2UL << sc->order);
+ inactive_lru_pages = zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON) +
+ zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
+ if (sc->nr_reclaimed < pages_for_compaction &&
+ inactive_lru_pages > pages_for_compaction)
+ return true;
+
+ /* If compaction would go ahead or the allocation would succeed, stop */
+ switch (compaction_suitable(zone, sc->order)) {
+ case COMPACT_PARTIAL:
+ case COMPACT_CONTINUE:
+ return false;
+ default:
+ return true;
+ }
+}
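
The compaction target above is pages_for_compaction = 2UL << order, i.e. twice the requested block expressed in order-0 pages: reclaim keeps going while fewer than that many pages have been reclaimed and the inactive lists still hold more than that. The numbers for typical orders are easy to tabulate (order 9 as a 2MB huge page with a 4KB base page is an illustrative x86 case, not something the patch states):

#include <stdio.h>

int main(void)
{
	for (int order = 0; order <= 9; order++) {
		unsigned long pages_for_compaction = 2UL << order;

		/* order 9 -> 1024 order-0 pages, i.e. 4MB worth with 4KB pages */
		printf("order %d: reclaim/compaction target is %4lu order-0 pages\n",
		       order, pages_for_compaction);
	}
	return 0;
}
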
+
+/*
* This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
*/
static void shrink_zone(int priority, struct zone *zone,
@@ -1806,9 +1895,12 @@ static void shrink_zone(int priority, struct zone *zone,
unsigned long nr[NR_LRU_LISTS];
unsigned long nr_to_scan;
enum lru_list l;
- unsigned long nr_reclaimed = sc->nr_reclaimed;
+ unsigned long nr_reclaimed, nr_scanned;
unsigned long nr_to_reclaim = sc->nr_to_reclaim;
+restart:
+ nr_reclaimed = 0;
+ nr_scanned = sc->nr_scanned;
get_scan_count(zone, sc, nr, priority);
while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
@@ -1834,8 +1926,7 @@ static void shrink_zone(int priority, struct zone *zone,
if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY)
break;
}
-
- sc->nr_reclaimed = nr_reclaimed;
+ sc->nr_reclaimed += nr_reclaimed;
/*
* Even if we did not try to evict anon pages at all, we want to
@@ -1844,6 +1935,11 @@ static void shrink_zone(int priority, struct zone *zone,
if (inactive_anon_is_low(zone, sc))
shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0);
+ /* reclaim/compaction might need reclaim to continue */
+ if (should_continue_reclaim(zone, nr_reclaimed,
+ sc->nr_scanned - nr_scanned, sc))
+ goto restart;
+
throttle_vm_writeout(sc->gfp_mask);
}
@@ -1893,17 +1989,12 @@ static bool zone_reclaimable(struct zone *zone)
return zone->pages_scanned < zone_reclaimable_pages(zone) * 6;
}
-/*
- * As hibernation is going on, kswapd is freezed so that it can't mark
- * the zone into all_unreclaimable. It can't handle OOM during hibernation.
- * So let's check zone's unreclaimable in direct reclaim as well as kswapd.
- */
+/* All zones in zonelist are unreclaimable? */
static bool all_unreclaimable(struct zonelist *zonelist,
struct scan_control *sc)
{
struct zoneref *z;
struct zone *zone;
- bool all_unreclaimable = true;
for_each_zone_zonelist_nodemask(zone, z, zonelist,
gfp_zone(sc->gfp_mask), sc->nodemask) {
@@ -1911,13 +2002,11 @@ static bool all_unreclaimable(struct zonelist *zonelist,
continue;
if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
continue;
- if (zone_reclaimable(zone)) {
- all_unreclaimable = false;
- break;
- }
+ if (!zone->all_unreclaimable)
+ return false;
}
- return all_unreclaimable;
+ return true;
}
/*
@@ -2000,7 +2089,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
struct zone *preferred_zone;
first_zones_zonelist(zonelist, gfp_zone(sc->gfp_mask),
- NULL, &preferred_zone);
+ &cpuset_current_mems_allowed,
+ &preferred_zone);
wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/10);
}
}
@@ -2012,6 +2102,14 @@ out:
if (sc->nr_reclaimed)
return sc->nr_reclaimed;
+ /*
+ * As hibernation is going on, kswapd is freezed so that it can't mark
+ * the zone into all_unreclaimable. Thus bypassing all_unreclaimable
+ * check.
+ */
+ if (oom_killer_disabled)
+ return 0;
+
/* top priority shrink_zones still had more to do? don't OOM, then */
if (scanning_global_lru(sc) && !all_unreclaimable(zonelist, sc))
return 1;
@@ -2117,38 +2215,87 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
}
#endif
+/*
+ * pgdat_balanced is used when checking if a node is balanced for high-order
+ * allocations. Only zones that meet watermarks and are in a zone allowed
+ * by the caller's classzone_idx are added to balanced_pages. The total of
+ * balanced pages must be at least 25% of the zones allowed by classzone_idx
+ * for the node to be considered balanced. Forcing all zones to be balanced
+ * for high orders can cause excessive reclaim when there are imbalanced zones.
+ * The choice of 25% is due to
+ * o a 16M DMA zone that is balanced will not balance a zone on any
+ * reasonable sized machine
+ * o On all other machines, the top zone must be at least a reasonable
+ * percentage of the middle zones. For example, on 32-bit x86, highmem
+ * would need to be at least 256M for it to balance a whole node.
+ * Similarly, on x86-64 the Normal zone would need to be at least 1G
+ * to balance a node on its own. These seemed like reasonable ratios.
+ */
+static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages,
+ int classzone_idx)
+{
+ unsigned long present_pages = 0;
+ int i;
+
+ for (i = 0; i <= classzone_idx; i++)
+ present_pages += pgdat->node_zones[i].present_pages;
+
+ return balanced_pages > (present_pages >> 2);
+}
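
Concretely, pgdat_balanced() sums present_pages over zones 0..classzone_idx and asks whether the balanced total exceeds a quarter of it (present_pages >> 2). A standalone sketch with invented zone sizes (roughly a 32-bit box with 16MB DMA, 880MB Normal and 3GB HighMem, counted in 4KB pages) shows why a balanced DMA+Normal pair alone does not balance such a node:

#include <stdio.h>
#include <stdbool.h>

static bool pgdat_balanced(const unsigned long *present, int classzone_idx,
			   unsigned long balanced_pages)
{
	unsigned long present_pages = 0;

	for (int i = 0; i <= classzone_idx; i++)
		present_pages += present[i];

	return balanced_pages > (present_pages >> 2);
}

int main(void)
{
	/* hypothetical DMA, Normal and HighMem zone sizes in 4KB pages */
	unsigned long present[3] = { 4096, 225280, 786432 };

	/* only DMA + Normal balanced: 229376 of 1015808 pages, under 25% */
	printf("DMA+Normal balanced:  %s\n",
	       pgdat_balanced(present, 2, 4096 + 225280) ? "balanced" : "not balanced");

	/* HighMem balanced as well clears the threshold comfortably */
	printf("all three balanced:   %s\n",
	       pgdat_balanced(present, 2, 4096 + 225280 + 786432) ? "balanced" : "not balanced");
	return 0;
}
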
+
/* is kswapd sleeping prematurely? */
-static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining)
+static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
+ int classzone_idx)
{
int i;
+ unsigned long balanced = 0;
+ bool all_zones_ok = true;
/* If a direct reclaimer woke kswapd within HZ/10, it's premature */
if (remaining)
- return 1;
+ return true;
- /* If after HZ/10, a zone is below the high mark, it's premature */
+ /* Check the watermark levels */
for (i = 0; i < pgdat->nr_zones; i++) {
struct zone *zone = pgdat->node_zones + i;
if (!populated_zone(zone))
continue;
- if (zone->all_unreclaimable)
+ /*
+ * balance_pgdat() skips over all_unreclaimable after
+ * DEF_PRIORITY. Effectively, it considers them balanced so
+ * they must be considered balanced here as well if kswapd
+ * is to sleep
+ */
+ if (zone->all_unreclaimable) {
+ balanced += zone->present_pages;
continue;
+ }
- if (!zone_watermark_ok(zone, order, high_wmark_pages(zone),
- 0, 0))
- return 1;
+ if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone),
+ classzone_idx, 0))
+ all_zones_ok = false;
+ else
+ balanced += zone->present_pages;
}
- return 0;
+ /*
+ * For high-order requests, the balanced zones must contain at least
+ * 25% of the node's pages for kswapd to sleep. For order-0, all zones
+ * must be balanced.
+ */
+ if (order)
+ return pgdat_balanced(pgdat, balanced, classzone_idx);
+ else
+ return !all_zones_ok;
}
/*
* For kswapd, balance_pgdat() will work across all this node's zones until
* they are all at high_wmark_pages(zone).
*
- * Returns the number of pages which were actually freed.
+ * Returns the final order kswapd was reclaiming at
*
* There is special handling here for zones which are full of pinned pages.
* This can happen if the pages are all mlocked, or if they are all used by
@@ -2165,11 +2312,14 @@ static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining)
* interoperates with the page allocator fallback scheme to ensure that aging
* of pages is balanced across the zones.
*/
-static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
+static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
+ int *classzone_idx)
{
int all_zones_ok;
+ unsigned long balanced;
int priority;
int i;
+ int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
unsigned long total_scanned;
struct reclaim_state *reclaim_state = current->reclaim_state;
struct scan_control sc = {
@@ -2192,7 +2342,6 @@ loop_again:
count_vm_event(PAGEOUTRUN);
for (priority = DEF_PRIORITY; priority >= 0; priority--) {
- int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
unsigned long lru_pages = 0;
int has_under_min_watermark_zone = 0;
@@ -2201,6 +2350,7 @@ loop_again:
disable_swap_token();
all_zones_ok = 1;
+ balanced = 0;
/*
* Scan in the highmem->dma direction for the highest
@@ -2223,9 +2373,10 @@ loop_again:
shrink_active_list(SWAP_CLUSTER_MAX, zone,
&sc, priority, 0);
- if (!zone_watermark_ok(zone, order,
+ if (!zone_watermark_ok_safe(zone, order,
high_wmark_pages(zone), 0, 0)) {
end_zone = i;
+ *classzone_idx = i;
break;
}
}
@@ -2250,6 +2401,7 @@ loop_again:
for (i = 0; i <= end_zone; i++) {
struct zone *zone = pgdat->node_zones + i;
int nr_slab;
+ unsigned long balance_gap;
if (!populated_zone(zone))
continue;
@@ -2266,20 +2418,31 @@ loop_again:
mem_cgroup_soft_limit_reclaim(zone, order, sc.gfp_mask);
/*
- * We put equal pressure on every zone, unless one
- * zone has way too many pages free already.
+ * We put equal pressure on every zone, unless
+ * one zone has way too many pages free
+ * already. The "too many pages" is defined
+ * as the high wmark plus a "gap" where the
+ * gap is either the low watermark or 1%
+ * of the zone, whichever is smaller.
*/
- if (!zone_watermark_ok(zone, order,
- 8*high_wmark_pages(zone), end_zone, 0))
+ balance_gap = min(low_wmark_pages(zone),
+ (zone->present_pages +
+ KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
+ KSWAPD_ZONE_BALANCE_GAP_RATIO);
+ if (!zone_watermark_ok_safe(zone, order,
+ high_wmark_pages(zone) + balance_gap,
+ end_zone, 0))
shrink_zone(priority, zone, &sc);
reclaim_state->reclaimed_slab = 0;
nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
lru_pages);
sc.nr_reclaimed += reclaim_state->reclaimed_slab;
total_scanned += sc.nr_scanned;
+
if (zone->all_unreclaimable)
continue;
- if (nr_slab == 0 && !zone_reclaimable(zone))
+ if (nr_slab == 0 &&
+ !zone_reclaimable(zone))
zone->all_unreclaimable = 1;
/*
* If we've done a decent amount of scanning and
@@ -2290,7 +2453,7 @@ loop_again:
total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2)
sc.may_writepage = 1;
- if (!zone_watermark_ok(zone, order,
+ if (!zone_watermark_ok_safe(zone, order,
high_wmark_pages(zone), end_zone, 0)) {
all_zones_ok = 0;
/*
@@ -2298,7 +2461,7 @@ loop_again:
* means that we have a GFP_ATOMIC allocation
* failure risk. Hurry up!
*/
- if (!zone_watermark_ok(zone, order,
+ if (!zone_watermark_ok_safe(zone, order,
min_wmark_pages(zone), end_zone, 0))
has_under_min_watermark_zone = 1;
} else {
@@ -2310,10 +2473,12 @@ loop_again:
 * speculatively avoid congestion waits
*/
zone_clear_flag(zone, ZONE_CONGESTED);
+ if (i <= *classzone_idx)
+ balanced += zone->present_pages;
}
}
- if (all_zones_ok)
+ if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))
break; /* kswapd: all done */
/*
* OK, kswapd is getting into trouble. Take a nap, then take
@@ -2336,7 +2501,13 @@ loop_again:
break;
}
out:
- if (!all_zones_ok) {
+
+ /*
+ * order-0: All zones must meet high watermark for a balanced node
+ * high-order: Balanced zones must make up at least 25% of the node
+ * for the node to be balanced
+ */
+ if (!(all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))) {
cond_resched();
try_to_freeze();
@@ -2361,7 +2532,88 @@ out:
goto loop_again;
}
- return sc.nr_reclaimed;
+ /*
+ * If kswapd was reclaiming at a higher order, it has the option of
+ * sleeping without all zones being balanced. Before it does, it must
+ * ensure that the watermarks for order-0 on *all* zones are met and
+ * that the congestion flags are cleared. The congestion flag must
+ * be cleared as kswapd is the only mechanism that clears the flag
+ * and it is potentially going to sleep here.
+ */
+ if (order) {
+ for (i = 0; i <= end_zone; i++) {
+ struct zone *zone = pgdat->node_zones + i;
+
+ if (!populated_zone(zone))
+ continue;
+
+ if (zone->all_unreclaimable && priority != DEF_PRIORITY)
+ continue;
+
+ /* Confirm the zone is balanced for order-0 */
+ if (!zone_watermark_ok(zone, 0,
+ high_wmark_pages(zone), 0, 0)) {
+ order = sc.order = 0;
+ goto loop_again;
+ }
+
+ /* If balanced, clear the congested flag */
+ zone_clear_flag(zone, ZONE_CONGESTED);
+ }
+ }
+
+ /*
+ * Return the order we were reclaiming at so sleeping_prematurely()
+ * makes a decision on the order we were last reclaiming at. However,
+ * if another caller entered the allocator slow path while kswapd
+ * was awake, order will remain at the higher level
+ */
+ *classzone_idx = end_zone;
+ return order;
+}
+
+static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
+{
+ long remaining = 0;
+ DEFINE_WAIT(wait);
+
+ if (freezing(current) || kthread_should_stop())
+ return;
+
+ prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
+
+ /* Try to sleep for a short interval */
+ if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) {
+ remaining = schedule_timeout(HZ/10);
+ finish_wait(&pgdat->kswapd_wait, &wait);
+ prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
+ }
+
+ /*
+ * After a short sleep, check if it was a premature sleep. If not, then
+ * go fully to sleep until explicitly woken up.
+ */
+ if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) {
+ trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
+
+ /*
+ * vmstat counters are not perfectly accurate and the estimated
+ * value for counters such as NR_FREE_PAGES can deviate from the
+ * true value by nr_online_cpus * threshold. To avoid the zone
+ * watermarks being breached while under pressure, we reduce the
+ * per-cpu vmstat threshold while kswapd is awake and restore
+ * them before going back to sleep.
+ */
+ set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);
+ schedule();
+ set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold);
+ } else {
+ if (remaining)
+ count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
+ else
+ count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY);
+ }
+ finish_wait(&pgdat->kswapd_wait, &wait);
}
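
kswapd_try_to_sleep() is a two-phase sleep: a short HZ/10 nap first, and only if that nap was not premature does kswapd raise the per-cpu vmstat thresholds back to their normal values and sleep indefinitely, dropping back to the reduced pressure thresholds when it wakes. The userspace model below is illustrative only; sleeping_prematurely_model() is a stand-in for the real balance check and the puts() calls mark where the kernel would act:

#include <stdbool.h>
#include <stdio.h>

static bool sleeping_prematurely_model(int order, long remaining)
{
        /* stand-in: pretend the node is balanced, so sleeping is fine */
        (void)order;
        (void)remaining;
        return false;
}

static void kswapd_try_to_sleep_model(int order)
{
        long remaining = 0;

        /* phase one: a short nap, skipped if it would be premature */
        if (!sleeping_prematurely_model(order, remaining))
                puts("short nap (HZ/10)");

        /* phase two: full sleep only if the nap was not premature */
        if (!sleeping_prematurely_model(order, remaining)) {
                puts("restore normal vmstat thresholds");
                puts("sleep until explicitly woken");
                puts("switch back to reduced pressure thresholds");
        } else {
                puts(remaining ? "count KSWAPD_LOW_WMARK_HIT_QUICKLY"
                               : "count KSWAPD_HIGH_WMARK_HIT_QUICKLY");
        }
}

int main(void)
{
        kswapd_try_to_sleep_model(0);
        return 0;
}
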
/*
@@ -2380,9 +2632,10 @@ out:
static int kswapd(void *p)
{
unsigned long order;
+ int classzone_idx;
pg_data_t *pgdat = (pg_data_t*)p;
struct task_struct *tsk = current;
- DEFINE_WAIT(wait);
+
struct reclaim_state reclaim_state = {
.reclaimed_slab = 0,
};
@@ -2410,49 +2663,30 @@ static int kswapd(void *p)
set_freezable();
order = 0;
+ classzone_idx = MAX_NR_ZONES - 1;
for ( ; ; ) {
unsigned long new_order;
+ int new_classzone_idx;
int ret;
- prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
new_order = pgdat->kswapd_max_order;
+ new_classzone_idx = pgdat->classzone_idx;
pgdat->kswapd_max_order = 0;
- if (order < new_order) {
+ pgdat->classzone_idx = MAX_NR_ZONES - 1;
+ if (order < new_order || classzone_idx > new_classzone_idx) {
/*
* Don't sleep if someone wants a larger 'order'
- * allocation
+ * allocation or has tighter zone constraints
*/
order = new_order;
+ classzone_idx = new_classzone_idx;
} else {
- if (!freezing(current) && !kthread_should_stop()) {
- long remaining = 0;
-
- /* Try to sleep for a short interval */
- if (!sleeping_prematurely(pgdat, order, remaining)) {
- remaining = schedule_timeout(HZ/10);
- finish_wait(&pgdat->kswapd_wait, &wait);
- prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
- }
-
- /*
- * After a short sleep, check if it was a
- * premature sleep. If not, then go fully
- * to sleep until explicitly woken up
- */
- if (!sleeping_prematurely(pgdat, order, remaining)) {
- trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
- schedule();
- } else {
- if (remaining)
- count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
- else
- count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY);
- }
- }
-
+ kswapd_try_to_sleep(pgdat, order, classzone_idx);
order = pgdat->kswapd_max_order;
+ classzone_idx = pgdat->classzone_idx;
+ pgdat->kswapd_max_order = 0;
+ pgdat->classzone_idx = MAX_NR_ZONES - 1;
}
- finish_wait(&pgdat->kswapd_wait, &wait);
ret = try_to_freeze();
if (kthread_should_stop())
@@ -2464,7 +2698,7 @@ static int kswapd(void *p)
*/
if (!ret) {
trace_mm_vmscan_kswapd_wake(pgdat->node_id, order);
- balance_pgdat(pgdat, order);
+ order = balance_pgdat(pgdat, order, &classzone_idx);
}
}
return 0;
@@ -2473,23 +2707,26 @@ static int kswapd(void *p)
/*
* A zone is low on free memory, so wake its kswapd task to service it.
*/
-void wakeup_kswapd(struct zone *zone, int order)
+void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
{
pg_data_t *pgdat;
if (!populated_zone(zone))
return;
- pgdat = zone->zone_pgdat;
- if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0))
- return;
- if (pgdat->kswapd_max_order < order)
- pgdat->kswapd_max_order = order;
- trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
return;
+ pgdat = zone->zone_pgdat;
+ if (pgdat->kswapd_max_order < order) {
+ pgdat->kswapd_max_order = order;
+ pgdat->classzone_idx = min(pgdat->classzone_idx, classzone_idx);
+ }
if (!waitqueue_active(&pgdat->kswapd_wait))
return;
+ if (zone_watermark_ok_safe(zone, order, low_wmark_pages(zone), 0, 0))
+ return;
+
+ trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
wake_up_interruptible(&pgdat->kswapd_wait);
}
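
Note that in the hunk above the pgdat only tightens its remembered classzone index when a waker also raises the order. A small standalone model of that bookkeeping (struct fake_pgdat and record_wakeup() are invented names):

#include <stdio.h>

#define MAX_NR_ZONES 4

struct fake_pgdat {
        int kswapd_max_order;
        int classzone_idx;
};

static void record_wakeup(struct fake_pgdat *pgdat, int order,
                          int classzone_idx)
{
        if (pgdat->kswapd_max_order < order) {
                pgdat->kswapd_max_order = order;
                if (classzone_idx < pgdat->classzone_idx)
                        pgdat->classzone_idx = classzone_idx;
        }
}

int main(void)
{
        struct fake_pgdat pgdat = { 0, MAX_NR_ZONES - 1 };

        record_wakeup(&pgdat, 3, 2);    /* huge-page-sized request, ZONE_NORMAL */
        record_wakeup(&pgdat, 1, 0);    /* smaller but DMA-constrained request */

        printf("order=%d classzone_idx=%d\n",
               pgdat.kswapd_max_order, pgdat.classzone_idx);
        return 0;
}

The second, more constrained request does not lower classzone_idx because it does not raise the order; kswapd reads and resets both values in its main loop.
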
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 8f62f17ee1c7..897ea9e88238 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -83,7 +83,31 @@ EXPORT_SYMBOL(vm_stat);
#ifdef CONFIG_SMP
-static int calculate_threshold(struct zone *zone)
+int calculate_pressure_threshold(struct zone *zone)
+{
+ int threshold;
+ int watermark_distance;
+
+ /*
+ * As vmstats are not up to date, there is drift between the estimated
+ * and real values. For high thresholds and a high number of CPUs, it
+ * is possible for the min watermark to be breached while the estimated
+ * value looks fine. The pressure threshold is a reduced value such
+ * that even the maximum amount of drift will not accidentally breach
+ * the min watermark
+ */
+ watermark_distance = low_wmark_pages(zone) - min_wmark_pages(zone);
+ threshold = max(1, (int)(watermark_distance / num_online_cpus()));
+
+ /*
+ * Maximum threshold is 125
+ */
+ threshold = min(125, threshold);
+
+ return threshold;
+}
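
A worked example of the pressure threshold with assumed numbers: the low-to-min watermark distance is divided across the online CPUs and clamped to the range [1, 125].

#include <stdio.h>

/* standalone model of calculate_pressure_threshold(); inputs are hypothetical */
static int pressure_threshold(unsigned long low_wmark, unsigned long min_wmark,
                              int online_cpus)
{
        int threshold = (int)((low_wmark - min_wmark) / online_cpus);

        if (threshold < 1)
                threshold = 1;
        if (threshold > 125)
                threshold = 125;
        return threshold;
}

int main(void)
{
        /* low wmark 5120 pages, min wmark 4096 pages, 16 CPUs -> threshold 64 */
        printf("threshold = %d\n", pressure_threshold(5120, 4096, 16));
        return 0;
}

With a per-cpu threshold of 64, the worst-case drift (16 * 64 = 1024 pages) cannot exceed the low-to-min watermark distance.
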
+
+int calculate_normal_threshold(struct zone *zone)
{
int threshold;
int mem; /* memory in 128 MB units */
@@ -142,7 +166,7 @@ static void refresh_zone_stat_thresholds(void)
for_each_populated_zone(zone) {
unsigned long max_drift, tolerate_drift;
- threshold = calculate_threshold(zone);
+ threshold = calculate_normal_threshold(zone);
for_each_online_cpu(cpu)
per_cpu_ptr(zone->pageset, cpu)->stat_threshold
@@ -161,42 +185,50 @@ static void refresh_zone_stat_thresholds(void)
}
}
+void set_pgdat_percpu_threshold(pg_data_t *pgdat,
+ int (*calculate_pressure)(struct zone *))
+{
+ struct zone *zone;
+ int cpu;
+ int threshold;
+ int i;
+
+ for (i = 0; i < pgdat->nr_zones; i++) {
+ zone = &pgdat->node_zones[i];
+ if (!zone->percpu_drift_mark)
+ continue;
+
+ threshold = (*calculate_pressure)(zone);
+ for_each_possible_cpu(cpu)
+ per_cpu_ptr(zone->pageset, cpu)->stat_threshold
+ = threshold;
+ }
+}
+
/*
* For use when we know that interrupts are disabled.
*/
void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
int delta)
{
- struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset);
-
- s8 *p = pcp->vm_stat_diff + item;
+ struct per_cpu_pageset __percpu *pcp = zone->pageset;
+ s8 __percpu *p = pcp->vm_stat_diff + item;
long x;
+ long t;
- x = delta + *p;
+ x = delta + __this_cpu_read(*p);
- if (unlikely(x > pcp->stat_threshold || x < -pcp->stat_threshold)) {
+ t = __this_cpu_read(pcp->stat_threshold);
+
+ if (unlikely(x > t || x < -t)) {
zone_page_state_add(x, zone, item);
x = 0;
}
- *p = x;
+ __this_cpu_write(*p, x);
}
EXPORT_SYMBOL(__mod_zone_page_state);
/*
- * For an unknown interrupt state
- */
-void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
- int delta)
-{
- unsigned long flags;
-
- local_irq_save(flags);
- __mod_zone_page_state(zone, item, delta);
- local_irq_restore(flags);
-}
-EXPORT_SYMBOL(mod_zone_page_state);
-
-/*
* Optimized increment and decrement functions.
*
* These are only for a single page and therefore can take a struct page *
@@ -221,16 +253,17 @@ EXPORT_SYMBOL(mod_zone_page_state);
*/
void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
{
- struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset);
- s8 *p = pcp->vm_stat_diff + item;
-
- (*p)++;
+ struct per_cpu_pageset __percpu *pcp = zone->pageset;
+ s8 __percpu *p = pcp->vm_stat_diff + item;
+ s8 v, t;
- if (unlikely(*p > pcp->stat_threshold)) {
- int overstep = pcp->stat_threshold / 2;
+ v = __this_cpu_inc_return(*p);
+ t = __this_cpu_read(pcp->stat_threshold);
+ if (unlikely(v > t)) {
+ s8 overstep = t >> 1;
- zone_page_state_add(*p + overstep, zone, item);
- *p = -overstep;
+ zone_page_state_add(v + overstep, zone, item);
+ __this_cpu_write(*p, -overstep);
}
}
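
The overstep keeps the per-cpu diff and the global zone counter consistent: once the diff crosses the threshold, threshold/2 extra pages are folded into the zone counter and the diff is rewound to minus that amount, so the next few increments remain cheap per-cpu updates. A standalone model with an assumed threshold of 8:

#include <stdio.h>

int main(void)
{
        long global = 0;
        signed char diff = 0;                   /* models one cpu's vm_stat_diff */
        const signed char threshold = 8;
        int i;

        for (i = 0; i < 20; i++) {
                signed char v = ++diff;

                if (v > threshold) {
                        signed char overstep = threshold >> 1;

                        global += v + overstep; /* fold into the zone counter */
                        diff = -overstep;       /* leave headroom for more incs */
                }
        }
        printf("global=%ld diff=%d (true count=20)\n", global, diff);
        return 0;
}

At every point global + diff equals the true count; only the split between the cheap per-cpu part and the shared counter changes.
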
@@ -242,16 +275,17 @@ EXPORT_SYMBOL(__inc_zone_page_state);
void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
{
- struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset);
- s8 *p = pcp->vm_stat_diff + item;
+ struct per_cpu_pageset __percpu *pcp = zone->pageset;
+ s8 __percpu *p = pcp->vm_stat_diff + item;
+ s8 v, t;
- (*p)--;
+ v = __this_cpu_dec_return(*p);
+ t = __this_cpu_read(pcp->stat_threshold);
+ if (unlikely(v < - t)) {
+ s8 overstep = t >> 1;
- if (unlikely(*p < - pcp->stat_threshold)) {
- int overstep = pcp->stat_threshold / 2;
-
- zone_page_state_add(*p - overstep, zone, item);
- *p = overstep;
+ zone_page_state_add(v - overstep, zone, item);
+ __this_cpu_write(*p, overstep);
}
}
@@ -261,6 +295,95 @@ void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
}
EXPORT_SYMBOL(__dec_zone_page_state);
+#ifdef CONFIG_CMPXCHG_LOCAL
+/*
+ * If we have cmpxchg_local support then we do not need to incur the overhead
+ * that comes with local_irq_save/restore if we use this_cpu_cmpxchg.
+ *
+ * mod_state() modifies the zone counter state through atomic per cpu
+ * operations.
+ *
+ * Overstep mode specifies how overstep should be handled:
+ * 0 No overstepping
+ * 1 Overstepping half of threshold
+ * -1 Overstepping minus half of threshold
+*/
+static inline void mod_state(struct zone *zone,
+ enum zone_stat_item item, int delta, int overstep_mode)
+{
+ struct per_cpu_pageset __percpu *pcp = zone->pageset;
+ s8 __percpu *p = pcp->vm_stat_diff + item;
+ long o, n, t, z;
+
+ do {
+ z = 0; /* overflow to zone counters */
+
+ /*
+ * The fetching of the stat_threshold is racy. We may apply
+ * a counter threshold to the wrong cpu if we get
+ * rescheduled while executing here. However, the next
+ * counter update will apply the threshold again and
+ * therefore bring the counter under the threshold again.
+ *
+ * Most of the time the thresholds are the same anyways
+ * for all cpus in a zone.
+ */
+ t = this_cpu_read(pcp->stat_threshold);
+
+ o = this_cpu_read(*p);
+ n = delta + o;
+
+ if (n > t || n < -t) {
+ int os = overstep_mode * (t >> 1);
+
+ /* Overflow must be added to zone counters */
+ z = n + os;
+ n = -os;
+ }
+ } while (this_cpu_cmpxchg(*p, o, n) != o);
+
+ if (z)
+ zone_page_state_add(z, zone, item);
+}
+
+void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
+ int delta)
+{
+ mod_state(zone, item, delta, 0);
+}
+EXPORT_SYMBOL(mod_zone_page_state);
+
+void inc_zone_state(struct zone *zone, enum zone_stat_item item)
+{
+ mod_state(zone, item, 1, 1);
+}
+
+void inc_zone_page_state(struct page *page, enum zone_stat_item item)
+{
+ mod_state(page_zone(page), item, 1, 1);
+}
+EXPORT_SYMBOL(inc_zone_page_state);
+
+void dec_zone_page_state(struct page *page, enum zone_stat_item item)
+{
+ mod_state(page_zone(page), item, -1, -1);
+}
+EXPORT_SYMBOL(dec_zone_page_state);
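
The cmpxchg-based mod_state() above avoids local_irq_save/restore by retrying the read-modify-write until no concurrent update slipped in between. A userspace analogue using C11 atomics, where a single global stands in for the per-cpu s8 diff, sketches the same loop:

#include <stdatomic.h>
#include <stdio.h>

static _Atomic long percpu_diff;        /* stand-in for the per-cpu counter */
static _Atomic long zone_counter;       /* stand-in for the global zone state */

static void mod_state_model(long delta, long threshold, int overstep_mode)
{
        long o, n, z;

        do {
                z = 0;
                o = atomic_load(&percpu_diff);
                n = o + delta;

                if (n > threshold || n < -threshold) {
                        long os = overstep_mode * (threshold >> 1);

                        z = n + os;     /* overflow goes to the zone counter */
                        n = -os;
                }
        } while (!atomic_compare_exchange_weak(&percpu_diff, &o, n));

        if (z)
                atomic_fetch_add(&zone_counter, z);
}

int main(void)
{
        int i;

        for (i = 0; i < 100; i++)
                mod_state_model(1, 8, 1);

        printf("zone=%ld diff=%ld (true count=100)\n",
               atomic_load(&zone_counter), atomic_load(&percpu_diff));
        return 0;
}
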
+#else
+/*
+ * Use interrupt disable to serialize counter updates
+ */
+void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
+ int delta)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ __mod_zone_page_state(zone, item, delta);
+ local_irq_restore(flags);
+}
+EXPORT_SYMBOL(mod_zone_page_state);
+
void inc_zone_state(struct zone *zone, enum zone_stat_item item)
{
unsigned long flags;
@@ -291,6 +414,7 @@ void dec_zone_page_state(struct page *page, enum zone_stat_item item)
local_irq_restore(flags);
}
EXPORT_SYMBOL(dec_zone_page_state);
+#endif
/*
* Update the zone counters for one cpu.
@@ -379,8 +503,12 @@ void refresh_cpu_vm_stats(int cpu)
* z = the zone from which the allocation occurred.
*
* Must be called with interrupts disabled.
+ *
+ * When __GFP_OTHER_NODE is set assume the node of the preferred
+ * zone is the local node. This is useful for daemons who allocate
+ * memory on behalf of other processes.
*/
-void zone_statistics(struct zone *preferred_zone, struct zone *z)
+void zone_statistics(struct zone *preferred_zone, struct zone *z, gfp_t flags)
{
if (z->zone_pgdat == preferred_zone->zone_pgdat) {
__inc_zone_state(z, NUMA_HIT);
@@ -388,7 +516,8 @@ void zone_statistics(struct zone *preferred_zone, struct zone *z)
__inc_zone_state(z, NUMA_MISS);
__inc_zone_state(preferred_zone, NUMA_FOREIGN);
}
- if (z->node == numa_node_id())
+ if (z->node == ((flags & __GFP_OTHER_NODE) ?
+ preferred_zone->node : numa_node_id()))
__inc_zone_state(z, NUMA_LOCAL);
else
__inc_zone_state(z, NUMA_OTHER);
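
With the (new in this series) __GFP_OTHER_NODE flag, "local" is judged from the preferred zone's node rather than from the node of the CPU doing the allocation, so a daemon such as khugepaged allocating memory on behalf of a task on another node no longer inflates NUMA_OTHER. A small model of just that test (the helper name is invented):

#include <stdbool.h>
#include <stdio.h>

static bool counts_as_local(int zone_node, int preferred_node,
                            int current_cpu_node, bool gfp_other_node)
{
        return zone_node == (gfp_other_node ? preferred_node
                                            : current_cpu_node);
}

int main(void)
{
        /* khugepaged runs on node 0, collapsing pages for a task on node 1 */
        printf("without __GFP_OTHER_NODE: local=%d\n",
               counts_as_local(1, 1, 0, false));
        printf("with __GFP_OTHER_NODE:    local=%d\n",
               counts_as_local(1, 1, 0, true));
        return 0;
}
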
@@ -759,6 +888,7 @@ static const char * const vmstat_text[] = {
"numa_local",
"numa_other",
#endif
+ "nr_anon_transparent_hugepages",
"nr_dirty_threshold",
"nr_dirty_background_threshold",
@@ -818,7 +948,16 @@ static const char * const vmstat_text[] = {
"unevictable_pgs_cleared",
"unevictable_pgs_stranded",
"unevictable_pgs_mlockfreed",
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ "thp_fault_alloc",
+ "thp_fault_fallback",
+ "thp_collapse_alloc",
+ "thp_collapse_alloc_failed",
+ "thp_split",
#endif
+
+#endif /* CONFIG_VM_EVENT_COUNTERS */
};
static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
@@ -834,7 +973,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
"\n scanned %lu"
"\n spanned %lu"
"\n present %lu",
- zone_nr_free_pages(zone),
+ zone_page_state(zone, NR_FREE_PAGES),
min_wmark_pages(zone),
low_wmark_pages(zone),
high_wmark_pages(zone),
@@ -1033,7 +1172,7 @@ static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb,
break;
case CPU_DOWN_PREPARE:
case CPU_DOWN_PREPARE_FROZEN:
- cancel_rearming_delayed_work(&per_cpu(vmstat_work, cpu));
+ cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu));
per_cpu(vmstat_work, cpu).work.func = NULL;
break;
case CPU_DOWN_FAILED: