diff options
author | Stefan Richter <stefanr@s5r6.in-berlin.de> | 2011-05-10 20:52:07 +0200 |
---|---|---|
committer | Stefan Richter <stefanr@s5r6.in-berlin.de> | 2011-05-10 22:50:41 +0200 |
commit | 020abf03cd659388f94cb328e1e1df0656e0d7ff (patch) | |
tree | 40d05011708ad1b4a05928d167eb120420581aa6 /mm | |
parent | 0ff8fbc61727c926883eec381fbd3d32d1fab504 (diff) | |
parent | 693d92a1bbc9e42681c42ed190bd42b636ca876f (diff) | |
download | linux-020abf03cd659388f94cb328e1e1df0656e0d7ff.tar.bz2 |
Merge tag 'v2.6.39-rc7'
in order to pull in changes in drivers/media/dvb/firewire/ and
sound/firewire/.
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 40 | ||||
-rw-r--r-- | mm/Kconfig.debug | 25 | ||||
-rw-r--r-- | mm/Makefile | 11 | ||||
-rw-r--r-- | mm/backing-dev.c | 18 | ||||
-rw-r--r-- | mm/bootmem.c | 188 | ||||
-rw-r--r-- | mm/compaction.c | 209 | ||||
-rw-r--r-- | mm/dmapool.c | 16 | ||||
-rw-r--r-- | mm/filemap.c | 236 | ||||
-rw-r--r-- | mm/huge_memory.c | 2393 | ||||
-rw-r--r-- | mm/hugetlb.c | 127 | ||||
-rw-r--r-- | mm/hwpoison-inject.c | 2 | ||||
-rw-r--r-- | mm/internal.h | 10 | ||||
-rw-r--r-- | mm/kmemleak-test.c | 6 | ||||
-rw-r--r-- | mm/kmemleak.c | 19 | ||||
-rw-r--r-- | mm/ksm.c | 106 | ||||
-rw-r--r-- | mm/madvise.c | 10 | ||||
-rw-r--r-- | mm/memblock.c | 251 | ||||
-rw-r--r-- | mm/memcontrol.c | 928 | ||||
-rw-r--r-- | mm/memory-failure.c | 156 | ||||
-rw-r--r-- | mm/memory.c | 530 | ||||
-rw-r--r-- | mm/memory_hotplug.c | 25 | ||||
-rw-r--r-- | mm/mempolicy.c | 42 | ||||
-rw-r--r-- | mm/migrate.c | 192 | ||||
-rw-r--r-- | mm/mincore.c | 7 | ||||
-rw-r--r-- | mm/mlock.c | 180 | ||||
-rw-r--r-- | mm/mmap.c | 57 | ||||
-rw-r--r-- | mm/mmu_notifier.c | 20 | ||||
-rw-r--r-- | mm/mmzone.c | 21 | ||||
-rw-r--r-- | mm/mprotect.c | 20 | ||||
-rw-r--r-- | mm/mremap.c | 24 | ||||
-rw-r--r-- | mm/nobootmem.c | 427 | ||||
-rw-r--r-- | mm/nommu.c | 92 | ||||
-rw-r--r-- | mm/oom_kill.c | 98 | ||||
-rw-r--r-- | mm/page-writeback.c | 36 | ||||
-rw-r--r-- | mm/page_alloc.c | 356 | ||||
-rw-r--r-- | mm/page_cgroup.c | 140 | ||||
-rw-r--r-- | mm/page_io.c | 2 | ||||
-rw-r--r-- | mm/pagewalk.c | 23 | ||||
-rw-r--r-- | mm/percpu-vm.c | 2 | ||||
-rw-r--r-- | mm/percpu.c | 25 | ||||
-rw-r--r-- | mm/pgtable-generic.c | 121 | ||||
-rw-r--r-- | mm/readahead.c | 18 | ||||
-rw-r--r-- | mm/rmap.c | 200 | ||||
-rw-r--r-- | mm/shmem.c | 35 | ||||
-rw-r--r-- | mm/slab.c | 143 | ||||
-rw-r--r-- | mm/slob.c | 11 | ||||
-rw-r--r-- | mm/slub.c | 461 | ||||
-rw-r--r-- | mm/sparse-vmemmap.c | 2 | ||||
-rw-r--r-- | mm/sparse.c | 6 | ||||
-rw-r--r-- | mm/swap.c | 320 | ||||
-rw-r--r-- | mm/swap_state.c | 11 | ||||
-rw-r--r-- | mm/swapfile.c | 416 | ||||
-rw-r--r-- | mm/truncate.c | 39 | ||||
-rw-r--r-- | mm/util.c | 23 | ||||
-rw-r--r-- | mm/vmalloc.c | 248 | ||||
-rw-r--r-- | mm/vmscan.c | 477 | ||||
-rw-r--r-- | mm/vmstat.c | 223 |
57 files changed, 7274 insertions, 2550 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index c2c8a4a11898..e9c0c61f2ddd 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -179,7 +179,7 @@ config SPLIT_PTLOCK_CPUS config COMPACTION bool "Allow for memory compaction" select MIGRATION - depends on EXPERIMENTAL && HUGETLB_PAGE && MMU + depends on MMU help Allows the compaction of memory for the allocation of huge pages. @@ -302,6 +302,44 @@ config NOMMU_INITIAL_TRIM_EXCESS See Documentation/nommu-mmap.txt for more information. +config TRANSPARENT_HUGEPAGE + bool "Transparent Hugepage Support" + depends on X86 && MMU + select COMPACTION + help + Transparent Hugepages allows the kernel to use huge pages and + huge tlb transparently to the applications whenever possible. + This feature can improve computing performance to certain + applications by speeding up page faults during memory + allocation, by reducing the number of tlb misses and by speeding + up the pagetable walking. + + If memory constrained on embedded, you may want to say N. + +choice + prompt "Transparent Hugepage Support sysfs defaults" + depends on TRANSPARENT_HUGEPAGE + default TRANSPARENT_HUGEPAGE_ALWAYS + help + Selects the sysfs defaults for Transparent Hugepage Support. + + config TRANSPARENT_HUGEPAGE_ALWAYS + bool "always" + help + Enabling Transparent Hugepage always, can increase the + memory footprint of applications without a guaranteed + benefit but it will work automatically for all applications. + + config TRANSPARENT_HUGEPAGE_MADVISE + bool "madvise" + help + Enabling Transparent Hugepage madvise, will only provide a + performance improvement benefit to the applications using + madvise(MADV_HUGEPAGE) but it won't risk to increase the + memory footprint of applications without a guaranteed + benefit. 
+endchoice + # # UP and nommu archs use km based percpu allocator # diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index af7cfb43d2f0..8b1a477162dc 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug @@ -1,27 +1,24 @@ config DEBUG_PAGEALLOC bool "Debug page memory allocations" - depends on DEBUG_KERNEL && ARCH_SUPPORTS_DEBUG_PAGEALLOC - depends on !HIBERNATION || !PPC && !SPARC + depends on DEBUG_KERNEL + depends on !HIBERNATION || ARCH_SUPPORTS_DEBUG_PAGEALLOC && !PPC && !SPARC depends on !KMEMCHECK + select PAGE_POISONING if !ARCH_SUPPORTS_DEBUG_PAGEALLOC ---help--- Unmap pages from the kernel linear mapping after free_pages(). This results in a large slowdown, but helps to find certain types of memory corruption. + For architectures which don't enable ARCH_SUPPORTS_DEBUG_PAGEALLOC, + fill the pages with poison patterns after free_pages() and verify + the patterns before alloc_pages(). Additionally, + this option cannot be enabled in combination with hibernation as + that would result in incorrect warnings of memory corruption after + a resume because free pages are not saved to the suspend image. + config WANT_PAGE_DEBUG_FLAGS bool config PAGE_POISONING - bool "Debug page memory allocations" - depends on DEBUG_KERNEL && !ARCH_SUPPORTS_DEBUG_PAGEALLOC - depends on !HIBERNATION - select DEBUG_PAGEALLOC + bool select WANT_PAGE_DEBUG_FLAGS - ---help--- - Fill the pages with poison patterns after free_pages() and verify - the patterns before alloc_pages(). This results in a large slowdown, - but helps to find certain types of memory corruption. - - This option cannot be enabled in combination with hibernation as - that would result in incorrect warnings of memory corruption after - a resume because free pages are not saved to the suspend image. 
diff --git a/mm/Makefile b/mm/Makefile index f73f75a29f82..42a8326c3e3d 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -5,9 +5,9 @@ mmu-y := nommu.o mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \ mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ - vmalloc.o pagewalk.o + vmalloc.o pagewalk.o pgtable-generic.o -obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ +obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ maccess.o page_alloc.o page-writeback.o \ readahead.o swap.o truncate.o vmscan.o shmem.o \ prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ @@ -15,6 +15,12 @@ obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ $(mmu-y) obj-y += init-mm.o +ifdef CONFIG_NO_BOOTMEM + obj-y += nobootmem.o +else + obj-y += bootmem.o +endif + obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o obj-$(CONFIG_BOUNCE) += bounce.o @@ -37,6 +43,7 @@ obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o obj-$(CONFIG_FS_XIP) += filemap_xip.o obj-$(CONFIG_MIGRATION) += migrate.o obj-$(CONFIG_QUICKLIST) += quicklist.o +obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 027100d30227..befc87531e4f 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -14,17 +14,11 @@ static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0); -void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page) -{ -} -EXPORT_SYMBOL(default_unplug_io_fn); - struct backing_dev_info default_backing_dev_info = { .name = "default", .ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE, .state = 0, .capabilities = BDI_CAP_MAP_COPY, - .unplug_io_fn = default_unplug_io_fn, }; EXPORT_SYMBOL_GPL(default_backing_dev_info); @@ -73,14 +67,14 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) struct inode *inode; nr_wb = nr_dirty = nr_io = nr_more_io = 0; - 
spin_lock(&inode_lock); + spin_lock(&inode_wb_list_lock); list_for_each_entry(inode, &wb->b_dirty, i_wb_list) nr_dirty++; list_for_each_entry(inode, &wb->b_io, i_wb_list) nr_io++; list_for_each_entry(inode, &wb->b_more_io, i_wb_list) nr_more_io++; - spin_unlock(&inode_lock); + spin_unlock(&inode_wb_list_lock); global_dirty_limits(&background_thresh, &dirty_thresh); bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); @@ -604,7 +598,7 @@ static void bdi_prune_sb(struct backing_dev_info *bdi) spin_lock(&sb_lock); list_for_each_entry(sb, &super_blocks, s_list) { if (sb->s_bdi == bdi) - sb->s_bdi = NULL; + sb->s_bdi = &default_backing_dev_info; } spin_unlock(&sb_lock); } @@ -682,11 +676,11 @@ void bdi_destroy(struct backing_dev_info *bdi) if (bdi_has_dirty_io(bdi)) { struct bdi_writeback *dst = &default_backing_dev_info.wb; - spin_lock(&inode_lock); + spin_lock(&inode_wb_list_lock); list_splice(&bdi->wb.b_dirty, &dst->b_dirty); list_splice(&bdi->wb.b_io, &dst->b_io); list_splice(&bdi->wb.b_more_io, &dst->b_more_io); - spin_unlock(&inode_lock); + spin_unlock(&inode_wb_list_lock); } bdi_unregister(bdi); @@ -793,7 +787,7 @@ EXPORT_SYMBOL(congestion_wait); * jiffies for either a BDI to exit congestion of the given @sync queue * or a write to complete. * - * In the absense of zone congestion, cond_resched() is called to yield + * In the absence of zone congestion, cond_resched() is called to yield * the processor if necessary but otherwise does not sleep. * * The return value is 0 if the sleep is for the full timeout. 
Otherwise, diff --git a/mm/bootmem.c b/mm/bootmem.c index 13b0caa9793c..01d5a4b3dd0c 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -23,19 +23,17 @@ #include "internal.h" +#ifndef CONFIG_NEED_MULTIPLE_NODES +struct pglist_data __refdata contig_page_data = { + .bdata = &bootmem_node_data[0] +}; +EXPORT_SYMBOL(contig_page_data); +#endif + unsigned long max_low_pfn; unsigned long min_low_pfn; unsigned long max_pfn; -#ifdef CONFIG_CRASH_DUMP -/* - * If we have booted due to a crash, max_pfn will be a very low value. We need - * to know the amount of memory that the previous kernel used. - */ -unsigned long saved_max_pfn; -#endif - -#ifndef CONFIG_NO_BOOTMEM bootmem_data_t bootmem_node_data[MAX_NUMNODES] __initdata; static struct list_head bdata_list __initdata = LIST_HEAD_INIT(bdata_list); @@ -146,7 +144,7 @@ unsigned long __init init_bootmem(unsigned long start, unsigned long pages) min_low_pfn = start; return init_bootmem_core(NODE_DATA(0)->bdata, start, 0, pages); } -#endif + /* * free_bootmem_late - free bootmem pages directly to page allocator * @addr: starting address of the range @@ -171,53 +169,6 @@ void __init free_bootmem_late(unsigned long addr, unsigned long size) } } -#ifdef CONFIG_NO_BOOTMEM -static void __init __free_pages_memory(unsigned long start, unsigned long end) -{ - int i; - unsigned long start_aligned, end_aligned; - int order = ilog2(BITS_PER_LONG); - - start_aligned = (start + (BITS_PER_LONG - 1)) & ~(BITS_PER_LONG - 1); - end_aligned = end & ~(BITS_PER_LONG - 1); - - if (end_aligned <= start_aligned) { - for (i = start; i < end; i++) - __free_pages_bootmem(pfn_to_page(i), 0); - - return; - } - - for (i = start; i < start_aligned; i++) - __free_pages_bootmem(pfn_to_page(i), 0); - - for (i = start_aligned; i < end_aligned; i += BITS_PER_LONG) - __free_pages_bootmem(pfn_to_page(i), order); - - for (i = end_aligned; i < end; i++) - __free_pages_bootmem(pfn_to_page(i), 0); -} - -unsigned long __init free_all_memory_core_early(int nodeid) -{ - int 
i; - u64 start, end; - unsigned long count = 0; - struct range *range = NULL; - int nr_range; - - nr_range = get_free_all_memory_range(&range, nodeid); - - for (i = 0; i < nr_range; i++) { - start = range[i].start; - end = range[i].end; - count += end - start; - __free_pages_memory(start, end); - } - - return count; -} -#else static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) { int aligned; @@ -278,7 +229,6 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) return count; } -#endif /** * free_all_bootmem_node - release a node's free pages to the buddy allocator @@ -289,12 +239,7 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) { register_page_bootmem_info_node(pgdat); -#ifdef CONFIG_NO_BOOTMEM - /* free_all_memory_core_early(MAX_NUMNODES) will be called later */ - return 0; -#else return free_all_bootmem_core(pgdat->bdata); -#endif } /** @@ -304,16 +249,6 @@ unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) */ unsigned long __init free_all_bootmem(void) { -#ifdef CONFIG_NO_BOOTMEM - /* - * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id - * because in some case like Node0 doesnt have RAM installed - * low ram will be on Node1 - * Use MAX_NUMNODES will make sure all ranges in early_node_map[] - * will be used instead of only Node0 related - */ - return free_all_memory_core_early(MAX_NUMNODES); -#else unsigned long total_pages = 0; bootmem_data_t *bdata; @@ -321,10 +256,8 @@ unsigned long __init free_all_bootmem(void) total_pages += free_all_bootmem_core(bdata); return total_pages; -#endif } -#ifndef CONFIG_NO_BOOTMEM static void __init __free(bootmem_data_t *bdata, unsigned long sidx, unsigned long eidx) { @@ -419,7 +352,6 @@ static int __init mark_bootmem(unsigned long start, unsigned long end, } BUG(); } -#endif /** * free_bootmem_node - mark a page range as usable @@ -434,10 +366,6 @@ static int __init 
mark_bootmem(unsigned long start, unsigned long end, void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, unsigned long size) { -#ifdef CONFIG_NO_BOOTMEM - kmemleak_free_part(__va(physaddr), size); - memblock_x86_free_range(physaddr, physaddr + size); -#else unsigned long start, end; kmemleak_free_part(__va(physaddr), size); @@ -446,7 +374,6 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, end = PFN_DOWN(physaddr + size); mark_bootmem_node(pgdat->bdata, start, end, 0, 0); -#endif } /** @@ -460,10 +387,6 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, */ void __init free_bootmem(unsigned long addr, unsigned long size) { -#ifdef CONFIG_NO_BOOTMEM - kmemleak_free_part(__va(addr), size); - memblock_x86_free_range(addr, addr + size); -#else unsigned long start, end; kmemleak_free_part(__va(addr), size); @@ -472,7 +395,6 @@ void __init free_bootmem(unsigned long addr, unsigned long size) end = PFN_DOWN(addr + size); mark_bootmem(start, end, 0, 0); -#endif } /** @@ -489,17 +411,12 @@ void __init free_bootmem(unsigned long addr, unsigned long size) int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, unsigned long size, int flags) { -#ifdef CONFIG_NO_BOOTMEM - panic("no bootmem"); - return 0; -#else unsigned long start, end; start = PFN_DOWN(physaddr); end = PFN_UP(physaddr + size); return mark_bootmem_node(pgdat->bdata, start, end, 1, flags); -#endif } /** @@ -515,20 +432,14 @@ int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, int __init reserve_bootmem(unsigned long addr, unsigned long size, int flags) { -#ifdef CONFIG_NO_BOOTMEM - panic("no bootmem"); - return 0; -#else unsigned long start, end; start = PFN_DOWN(addr); end = PFN_UP(addr + size); return mark_bootmem(start, end, 1, flags); -#endif } -#ifndef CONFIG_NO_BOOTMEM int __weak __init reserve_bootmem_generic(unsigned long phys, unsigned long len, int flags) { @@ -685,33 +596,12 @@ static void * 
__init alloc_arch_preferred_bootmem(bootmem_data_t *bdata, #endif return NULL; } -#endif static void * __init ___alloc_bootmem_nopanic(unsigned long size, unsigned long align, unsigned long goal, unsigned long limit) { -#ifdef CONFIG_NO_BOOTMEM - void *ptr; - - if (WARN_ON_ONCE(slab_is_available())) - return kzalloc(size, GFP_NOWAIT); - -restart: - - ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, goal, limit); - - if (ptr) - return ptr; - - if (goal != 0) { - goal = 0; - goto restart; - } - - return NULL; -#else bootmem_data_t *bdata; void *region; @@ -737,7 +627,6 @@ restart: } return NULL; -#endif } /** @@ -758,10 +647,6 @@ void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align, { unsigned long limit = 0; -#ifdef CONFIG_NO_BOOTMEM - limit = -1UL; -#endif - return ___alloc_bootmem_nopanic(size, align, goal, limit); } @@ -798,14 +683,9 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align, { unsigned long limit = 0; -#ifdef CONFIG_NO_BOOTMEM - limit = -1UL; -#endif - return ___alloc_bootmem(size, align, goal, limit); } -#ifndef CONFIG_NO_BOOTMEM static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata, unsigned long size, unsigned long align, unsigned long goal, unsigned long limit) @@ -822,7 +702,6 @@ static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata, return ___alloc_bootmem(size, align, goal, limit); } -#endif /** * __alloc_bootmem_node - allocate boot memory from a specific node @@ -842,24 +721,10 @@ static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata, void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal) { - void *ptr; - if (WARN_ON_ONCE(slab_is_available())) return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); -#ifdef CONFIG_NO_BOOTMEM - ptr = __alloc_memory_core_early(pgdat->node_id, size, align, - goal, -1ULL); - if (ptr) - return ptr; - - ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, - goal, 
-1ULL); -#else - ptr = ___alloc_bootmem_node(pgdat->bdata, size, align, goal, 0); -#endif - - return ptr; + return ___alloc_bootmem_node(pgdat->bdata, size, align, goal, 0); } void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, @@ -880,13 +745,8 @@ void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, unsigned long new_goal; new_goal = MAX_DMA32_PFN << PAGE_SHIFT; -#ifdef CONFIG_NO_BOOTMEM - ptr = __alloc_memory_core_early(pgdat->node_id, size, align, - new_goal, -1ULL); -#else ptr = alloc_bootmem_core(pgdat->bdata, size, align, new_goal, 0); -#endif if (ptr) return ptr; } @@ -907,16 +767,6 @@ void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, void * __init alloc_bootmem_section(unsigned long size, unsigned long section_nr) { -#ifdef CONFIG_NO_BOOTMEM - unsigned long pfn, goal, limit; - - pfn = section_nr_to_pfn(section_nr); - goal = pfn << PAGE_SHIFT; - limit = section_nr_to_pfn(section_nr + 1) << PAGE_SHIFT; - - return __alloc_memory_core_early(early_pfn_to_nid(pfn), size, - SMP_CACHE_BYTES, goal, limit); -#else bootmem_data_t *bdata; unsigned long pfn, goal, limit; @@ -926,7 +776,6 @@ void * __init alloc_bootmem_section(unsigned long size, bdata = &bootmem_node_data[early_pfn_to_nid(pfn)]; return alloc_bootmem_core(bdata, size, SMP_CACHE_BYTES, goal, limit); -#endif } #endif @@ -938,16 +787,11 @@ void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size, if (WARN_ON_ONCE(slab_is_available())) return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); -#ifdef CONFIG_NO_BOOTMEM - ptr = __alloc_memory_core_early(pgdat->node_id, size, align, - goal, -1ULL); -#else ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size, align, goal, 0); if (ptr) return ptr; ptr = alloc_bootmem_core(pgdat->bdata, size, align, goal, 0); -#endif if (ptr) return ptr; @@ -995,21 +839,9 @@ void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, void * __init 
__alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal) { - void *ptr; - if (WARN_ON_ONCE(slab_is_available())) return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); -#ifdef CONFIG_NO_BOOTMEM - ptr = __alloc_memory_core_early(pgdat->node_id, size, align, + return ___alloc_bootmem_node(pgdat->bdata, size, align, goal, ARCH_LOW_ADDRESS_LIMIT); - if (ptr) - return ptr; - ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, - goal, ARCH_LOW_ADDRESS_LIMIT); -#else - ptr = ___alloc_bootmem_node(pgdat->bdata, size, align, - goal, ARCH_LOW_ADDRESS_LIMIT); -#endif - return ptr; } diff --git a/mm/compaction.c b/mm/compaction.c index 4d709ee59013..021a2960ef9e 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -16,6 +16,9 @@ #include <linux/sysfs.h> #include "internal.h" +#define CREATE_TRACE_POINTS +#include <trace/events/compaction.h> + /* * compact_control is used to track pages being migrated and the free pages * they are being migrated to during memory compaction. 
The free_pfn starts @@ -30,6 +33,7 @@ struct compact_control { unsigned long nr_migratepages; /* Number of pages to migrate */ unsigned long free_pfn; /* isolate_freepages search base */ unsigned long migrate_pfn; /* isolate_migratepages search base */ + bool sync; /* Synchronous migration */ /* Account for isolated anon and file pages */ unsigned long nr_anon; @@ -60,7 +64,7 @@ static unsigned long isolate_freepages_block(struct zone *zone, struct list_head *freelist) { unsigned long zone_end_pfn, end_pfn; - int total_isolated = 0; + int nr_scanned = 0, total_isolated = 0; struct page *cursor; /* Get the last PFN we should scan for free pages at */ @@ -81,6 +85,7 @@ static unsigned long isolate_freepages_block(struct zone *zone, if (!pfn_valid_within(blockpfn)) continue; + nr_scanned++; if (!PageBuddy(page)) continue; @@ -100,6 +105,7 @@ static unsigned long isolate_freepages_block(struct zone *zone, } } + trace_mm_compaction_isolate_freepages(nr_scanned, total_isolated); return total_isolated; } @@ -147,7 +153,6 @@ static void isolate_freepages(struct zone *zone, * pages on cc->migratepages. We stop searching if the migrate * and free page scanners meet or enough free pages are isolated. */ - spin_lock_irqsave(&zone->lock, flags); for (; pfn > low_pfn && cc->nr_migratepages > nr_freepages; pfn -= pageblock_nr_pages) { unsigned long isolated; @@ -170,9 +175,19 @@ static void isolate_freepages(struct zone *zone, if (!suitable_migration_target(page)) continue; - /* Found a block suitable for isolating free pages from */ - isolated = isolate_freepages_block(zone, pfn, freelist); - nr_freepages += isolated; + /* + * Found a block suitable for isolating free pages from. Now + * we disabled interrupts, double check things are ok and + * isolate the pages. 
This is to minimise the time IRQs + * are disabled + */ + isolated = 0; + spin_lock_irqsave(&zone->lock, flags); + if (suitable_migration_target(page)) { + isolated = isolate_freepages_block(zone, pfn, freelist); + nr_freepages += isolated; + } + spin_unlock_irqrestore(&zone->lock, flags); /* * Record the highest PFN we isolated pages from. When next @@ -182,7 +197,6 @@ static void isolate_freepages(struct zone *zone, if (isolated) high_pfn = max(high_pfn, pfn); } - spin_unlock_irqrestore(&zone->lock, flags); /* split_free_page does not map the pages */ list_for_each_entry(page, freelist, lru) { @@ -234,6 +248,8 @@ static unsigned long isolate_migratepages(struct zone *zone, struct compact_control *cc) { unsigned long low_pfn, end_pfn; + unsigned long last_pageblock_nr = 0, pageblock_nr; + unsigned long nr_scanned = 0, nr_isolated = 0; struct list_head *migratelist = &cc->migratepages; /* Do not scan outside zone boundaries */ @@ -261,26 +277,74 @@ static unsigned long isolate_migratepages(struct zone *zone, } /* Time to isolate some pages for migration */ + cond_resched(); spin_lock_irq(&zone->lru_lock); for (; low_pfn < end_pfn; low_pfn++) { struct page *page; + bool locked = true; + + /* give a chance to irqs before checking need_resched() */ + if (!((low_pfn+1) % SWAP_CLUSTER_MAX)) { + spin_unlock_irq(&zone->lru_lock); + locked = false; + } + if (need_resched() || spin_is_contended(&zone->lru_lock)) { + if (locked) + spin_unlock_irq(&zone->lru_lock); + cond_resched(); + spin_lock_irq(&zone->lru_lock); + if (fatal_signal_pending(current)) + break; + } else if (!locked) + spin_lock_irq(&zone->lru_lock); + if (!pfn_valid_within(low_pfn)) continue; + nr_scanned++; /* Get the page and skip if free */ page = pfn_to_page(low_pfn); if (PageBuddy(page)) continue; + /* + * For async migration, also only scan in MOVABLE blocks. 
Async + * migration is optimistic to see if the minimum amount of work + * satisfies the allocation + */ + pageblock_nr = low_pfn >> pageblock_order; + if (!cc->sync && last_pageblock_nr != pageblock_nr && + get_pageblock_migratetype(page) != MIGRATE_MOVABLE) { + low_pfn += pageblock_nr_pages; + low_pfn = ALIGN(low_pfn, pageblock_nr_pages) - 1; + last_pageblock_nr = pageblock_nr; + continue; + } + + if (!PageLRU(page)) + continue; + + /* + * PageLRU is set, and lru_lock excludes isolation, + * splitting and collapsing (collapsing has already + * happened if PageLRU is set). + */ + if (PageTransHuge(page)) { + low_pfn += (1 << compound_order(page)) - 1; + continue; + } + /* Try isolate the page */ if (__isolate_lru_page(page, ISOLATE_BOTH, 0) != 0) continue; + VM_BUG_ON(PageTransCompound(page)); + /* Successfully isolated */ del_page_from_lru_list(zone, page, page_lru(page)); list_add(&page->lru, migratelist); - mem_cgroup_del_lru(page); cc->nr_migratepages++; + nr_isolated++; /* Avoid isolating too much */ if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) @@ -292,6 +356,8 @@ static unsigned long isolate_migratepages(struct zone *zone, spin_unlock_irq(&zone->lru_lock); cc->migrate_pfn = low_pfn; + trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); + return cc->nr_migratepages; } @@ -342,10 +408,10 @@ static void update_nr_listpages(struct compact_control *cc) } static int compact_finished(struct zone *zone, - struct compact_control *cc) + struct compact_control *cc) { unsigned int order; - unsigned long watermark = low_wmark_pages(zone) + (1 << cc->order); + unsigned long watermark; if (fatal_signal_pending(current)) return COMPACT_PARTIAL; @@ -355,9 +421,16 @@ static int compact_finished(struct zone *zone, return COMPACT_COMPLETE; /* Compaction run is not finished if the watermark is not met */ + watermark = low_wmark_pages(zone); + watermark += (1 << cc->order); + if (!zone_watermark_ok(zone, cc->order, watermark, 0, 0)) return COMPACT_CONTINUE; + /* 
+ * order == -1 is expected when compacting via + * /proc/sys/vm/compact_memory + */ if (cc->order == -1) return COMPACT_CONTINUE; @@ -375,10 +448,69 @@ static int compact_finished(struct zone *zone, return COMPACT_CONTINUE; } +/* + * compaction_suitable: Is this suitable to run compaction on this zone now? + * Returns + * COMPACT_SKIPPED - If there are too few free pages for compaction + * COMPACT_PARTIAL - If the allocation would succeed without compaction + * COMPACT_CONTINUE - If compaction should run now + */ +unsigned long compaction_suitable(struct zone *zone, int order) +{ + int fragindex; + unsigned long watermark; + + /* + * Watermarks for order-0 must be met for compaction. Note the 2UL. + * This is because during migration, copies of pages need to be + * allocated and for a short time, the footprint is higher + */ + watermark = low_wmark_pages(zone) + (2UL << order); + if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) + return COMPACT_SKIPPED; + + /* + * order == -1 is expected when compacting via + * /proc/sys/vm/compact_memory + */ + if (order == -1) + return COMPACT_CONTINUE; + + /* + * fragmentation index determines if allocation failures are due to + * low memory or external fragmentation + * + * index of -1 implies allocations might succeed dependingon watermarks + * index towards 0 implies failure is due to lack of memory + * index towards 1000 implies failure is due to fragmentation + * + * Only compact if a failure would be due to fragmentation. 
+ */ + fragindex = fragmentation_index(zone, order); + if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold) + return COMPACT_SKIPPED; + + if (fragindex == -1 && zone_watermark_ok(zone, order, watermark, 0, 0)) + return COMPACT_PARTIAL; + + return COMPACT_CONTINUE; +} + static int compact_zone(struct zone *zone, struct compact_control *cc) { int ret; + ret = compaction_suitable(zone, cc->order); + switch (ret) { + case COMPACT_PARTIAL: + case COMPACT_SKIPPED: + /* Compaction is likely to fail */ + return ret; + case COMPACT_CONTINUE: + /* Fall through to compaction */ + ; + } + /* Setup to move all movable pages to the end of the zone */ cc->migrate_pfn = zone->zone_start_pfn; cc->free_pfn = cc->migrate_pfn + zone->spanned_pages; @@ -388,13 +520,15 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) { unsigned long nr_migrate, nr_remaining; + int err; if (!isolate_migratepages(zone, cc)) continue; nr_migrate = cc->nr_migratepages; - migrate_pages(&cc->migratepages, compaction_alloc, - (unsigned long)cc, 0); + err = migrate_pages(&cc->migratepages, compaction_alloc, + (unsigned long)cc, false, + cc->sync); update_nr_listpages(cc); nr_remaining = cc->nr_migratepages; @@ -402,9 +536,11 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) count_vm_events(COMPACTPAGES, nr_migrate - nr_remaining); if (nr_remaining) count_vm_events(COMPACTPAGEFAILED, nr_remaining); + trace_mm_compaction_migratepages(nr_migrate - nr_remaining, + nr_remaining); /* Release LRU pages not migrated */ - if (!list_empty(&cc->migratepages)) { + if (err) { putback_lru_pages(&cc->migratepages); cc->nr_migratepages = 0; } @@ -418,8 +554,9 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) return ret; } -static unsigned long compact_zone_order(struct zone *zone, - int order, gfp_t gfp_mask) +unsigned long compact_zone_order(struct zone *zone, + int order, gfp_t 
gfp_mask, + bool sync) { struct compact_control cc = { .nr_freepages = 0, @@ -427,6 +564,7 @@ static unsigned long compact_zone_order(struct zone *zone, .order = order, .migratetype = allocflags_to_migratetype(gfp_mask), .zone = zone, + .sync = sync, }; INIT_LIST_HEAD(&cc.freepages); INIT_LIST_HEAD(&cc.migratepages); @@ -442,16 +580,17 @@ int sysctl_extfrag_threshold = 500; * @order: The order of the current allocation * @gfp_mask: The GFP mask of the current allocation * @nodemask: The allowed nodes to allocate from + * @sync: Whether migration is synchronous or not * * This is the main entry point for direct page compaction. */ unsigned long try_to_compact_pages(struct zonelist *zonelist, - int order, gfp_t gfp_mask, nodemask_t *nodemask) + int order, gfp_t gfp_mask, nodemask_t *nodemask, + bool sync) { enum zone_type high_zoneidx = gfp_zone(gfp_mask); int may_enter_fs = gfp_mask & __GFP_FS; int may_perform_io = gfp_mask & __GFP_IO; - unsigned long watermark; struct zoneref *z; struct zone *zone; int rc = COMPACT_SKIPPED; @@ -461,7 +600,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, * made because an assumption is made that the page allocator can satisfy * the "cheaper" orders without taking special steps */ - if (order <= PAGE_ALLOC_COSTLY_ORDER || !may_enter_fs || !may_perform_io) + if (!order || !may_enter_fs || !may_perform_io) return rc; count_vm_event(COMPACTSTALL); @@ -469,43 +608,13 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, /* Compact each zone in the list */ for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, nodemask) { - int fragindex; int status; - /* - * Watermarks for order-0 must be met for compaction. Note - * the 2UL. 
This is because during migration, copies of - * pages need to be allocated and for a short time, the - * footprint is higher - */ - watermark = low_wmark_pages(zone) + (2UL << order); - if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) - continue; - - /* - * fragmentation index determines if allocation failures are - * due to low memory or external fragmentation - * - * index of -1 implies allocations might succeed depending - * on watermarks - * index towards 0 implies failure is due to lack of memory - * index towards 1000 implies failure is due to fragmentation - * - * Only compact if a failure would be due to fragmentation. - */ - fragindex = fragmentation_index(zone, order); - if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold) - continue; - - if (fragindex == -1 && zone_watermark_ok(zone, order, watermark, 0, 0)) { - rc = COMPACT_PARTIAL; - break; - } - - status = compact_zone_order(zone, order, gfp_mask); + status = compact_zone_order(zone, order, gfp_mask, sync); rc = max(status, rc); - if (zone_watermark_ok(zone, order, watermark, 0, 0)) + /* If a normal allocation would succeed, stop compacting */ + if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0)) break; } diff --git a/mm/dmapool.c b/mm/dmapool.c index 4df2de77e069..03bf3bb4519a 100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c @@ -324,7 +324,7 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags, if (mem_flags & __GFP_WAIT) { DECLARE_WAITQUEUE(wait, current); - __set_current_state(TASK_INTERRUPTIBLE); + __set_current_state(TASK_UNINTERRUPTIBLE); __add_wait_queue(&pool->waitq, &wait); spin_unlock_irqrestore(&pool->lock, flags); @@ -355,20 +355,15 @@ EXPORT_SYMBOL(dma_pool_alloc); static struct dma_page *pool_find_page(struct dma_pool *pool, dma_addr_t dma) { - unsigned long flags; struct dma_page *page; - spin_lock_irqsave(&pool->lock, flags); list_for_each_entry(page, &pool->page_list, page_list) { if (dma < page->dma) continue; if (dma < (page->dma + pool->allocation)) - 
goto done; + return page; } - page = NULL; - done: - spin_unlock_irqrestore(&pool->lock, flags); - return page; + return NULL; } /** @@ -386,8 +381,10 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma) unsigned long flags; unsigned int offset; + spin_lock_irqsave(&pool->lock, flags); page = pool_find_page(pool, dma); if (!page) { + spin_unlock_irqrestore(&pool->lock, flags); if (pool->dev) dev_err(pool->dev, "dma_pool_free %s, %p/%lx (bad dma)\n", @@ -401,6 +398,7 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma) offset = vaddr - page->vaddr; #ifdef DMAPOOL_DEBUG if ((dma - page->dma) != offset) { + spin_unlock_irqrestore(&pool->lock, flags); if (pool->dev) dev_err(pool->dev, "dma_pool_free %s, %p (bad vaddr)/%Lx\n", @@ -418,6 +416,7 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma) chain = *(int *)(page->vaddr + chain); continue; } + spin_unlock_irqrestore(&pool->lock, flags); if (pool->dev) dev_err(pool->dev, "dma_pool_free %s, dma %Lx " "already free\n", pool->name, @@ -432,7 +431,6 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma) memset(vaddr, POOL_POISON_FREED, pool->size); #endif - spin_lock_irqsave(&pool->lock, flags); page->in_use--; *(int *)vaddr = page->offset; page->offset = offset; diff --git a/mm/filemap.c b/mm/filemap.c index ea89840fc65f..c641edf553a9 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -80,8 +80,8 @@ * ->i_mutex * ->i_alloc_sem (various) * - * ->inode_lock - * ->sb_lock (fs/fs-writeback.c) + * inode_wb_list_lock + * sb_lock (fs/fs-writeback.c) * ->mapping->tree_lock (__sync_single_inode) * * ->i_mmap_lock @@ -98,24 +98,23 @@ * ->zone.lru_lock (check_pte_range->isolate_lru_page) * ->private_lock (page_remove_rmap->set_page_dirty) * ->tree_lock (page_remove_rmap->set_page_dirty) - * ->inode_lock (page_remove_rmap->set_page_dirty) - * ->inode_lock (zap_pte_range->set_page_dirty) + * inode_wb_list_lock (page_remove_rmap->set_page_dirty) + * 
->inode->i_lock (page_remove_rmap->set_page_dirty) + * inode_wb_list_lock (zap_pte_range->set_page_dirty) + * ->inode->i_lock (zap_pte_range->set_page_dirty) * ->private_lock (zap_pte_range->__set_page_dirty_buffers) * - * ->task->proc_lock - * ->dcache_lock (proc_pid_lookup) - * * (code doesn't rely on that order, so you could switch it around) * ->tasklist_lock (memory_failure, collect_procs_ao) * ->i_mmap_lock */ /* - * Remove a page from the page cache and free it. Caller has to make + * Delete a page from the page cache and free it. Caller has to make * sure the page is locked and that nobody else uses it - or that usage * is safe. The caller must hold the mapping's tree_lock. */ -void __remove_from_page_cache(struct page *page) +void __delete_from_page_cache(struct page *page) { struct address_space *mapping = page->mapping; @@ -140,58 +139,42 @@ void __remove_from_page_cache(struct page *page) } } -void remove_from_page_cache(struct page *page) +/** + * delete_from_page_cache - delete page from page cache + * @page: the page which the kernel is trying to remove from page cache + * + * This must be called only on pages that have been verified to be in the page + * cache and locked. It will never put the page into the free list, the caller + * has a reference on the page. 
+ */ +void delete_from_page_cache(struct page *page) { struct address_space *mapping = page->mapping; + void (*freepage)(struct page *); BUG_ON(!PageLocked(page)); + freepage = mapping->a_ops->freepage; spin_lock_irq(&mapping->tree_lock); - __remove_from_page_cache(page); + __delete_from_page_cache(page); spin_unlock_irq(&mapping->tree_lock); mem_cgroup_uncharge_cache_page(page); + + if (freepage) + freepage(page); + page_cache_release(page); } -EXPORT_SYMBOL(remove_from_page_cache); +EXPORT_SYMBOL(delete_from_page_cache); -static int sync_page(void *word) +static int sleep_on_page(void *word) { - struct address_space *mapping; - struct page *page; - - page = container_of((unsigned long *)word, struct page, flags); - - /* - * page_mapping() is being called without PG_locked held. - * Some knowledge of the state and use of the page is used to - * reduce the requirements down to a memory barrier. - * The danger here is of a stale page_mapping() return value - * indicating a struct address_space different from the one it's - * associated with when it is associated with one. - * After smp_mb(), it's either the correct page_mapping() for - * the page, or an old page_mapping() and the page's own - * page_mapping() has gone NULL. - * The ->sync_page() address_space operation must tolerate - * page_mapping() going NULL. By an amazing coincidence, - * this comes about because none of the users of the page - * in the ->sync_page() methods make essential use of the - * page_mapping(), merely passing the page down to the backing - * device's unplug functions when it's non-NULL, which in turn - * ignore it for all cases but swap, where only page_private(page) is - * of interest. When page_mapping() does go NULL, the entire - * call stack gracefully ignores the page and returns. 
- * -- wli - */ - smp_mb(); - mapping = page_mapping(page); - if (mapping && mapping->a_ops && mapping->a_ops->sync_page) - mapping->a_ops->sync_page(page); io_schedule(); return 0; } -static int sync_page_killable(void *word) +static int sleep_on_page_killable(void *word) { - sync_page(word); + sleep_on_page(word); return fatal_signal_pending(current) ? -EINTR : 0; } @@ -296,7 +279,7 @@ int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte, continue; wait_on_page_writeback(page); - if (PageError(page)) + if (TestClearPageError(page)) ret = -EIO; } pagevec_release(&pvec); @@ -385,6 +368,76 @@ int filemap_write_and_wait_range(struct address_space *mapping, EXPORT_SYMBOL(filemap_write_and_wait_range); /** + * replace_page_cache_page - replace a pagecache page with a new one + * @old: page to be replaced + * @new: page to replace with + * @gfp_mask: allocation mode + * + * This function replaces a page in the pagecache with a new one. On + * success it acquires the pagecache reference for the new page and + * drops it for the old page. Both the old and new pages must be + * locked. This function does not add the new page to the LRU, the + * caller must do that. + * + * The remove + add is atomic. The only way this function can fail is + * memory allocation failure. + */ +int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) +{ + int error; + struct mem_cgroup *memcg = NULL; + + VM_BUG_ON(!PageLocked(old)); + VM_BUG_ON(!PageLocked(new)); + VM_BUG_ON(new->mapping); + + /* + * This is not page migration, but prepare_migration and + * end_migration does enough work for charge replacement. + * + * In the longer term we probably want a specialized function + * for moving the charge from old to new in a more efficient + * manner. 
+ */ + error = mem_cgroup_prepare_migration(old, new, &memcg, gfp_mask); + if (error) + return error; + + error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM); + if (!error) { + struct address_space *mapping = old->mapping; + void (*freepage)(struct page *); + + pgoff_t offset = old->index; + freepage = mapping->a_ops->freepage; + + page_cache_get(new); + new->mapping = mapping; + new->index = offset; + + spin_lock_irq(&mapping->tree_lock); + __delete_from_page_cache(old); + error = radix_tree_insert(&mapping->page_tree, offset, new); + BUG_ON(error); + mapping->nrpages++; + __inc_zone_page_state(new, NR_FILE_PAGES); + if (PageSwapBacked(new)) + __inc_zone_page_state(new, NR_SHMEM); + spin_unlock_irq(&mapping->tree_lock); + radix_tree_preload_end(); + if (freepage) + freepage(old); + page_cache_release(old); + mem_cgroup_end_migration(memcg, old, new, true); + } else { + mem_cgroup_end_migration(memcg, old, new, false); + } + + return error; +} +EXPORT_SYMBOL_GPL(replace_page_cache_page); + +/** * add_to_page_cache_locked - add a locked page to the pagecache * @page: page to add * @mapping: the page's address_space @@ -477,12 +530,6 @@ struct page *__page_cache_alloc(gfp_t gfp) EXPORT_SYMBOL(__page_cache_alloc); #endif -static int __sleep_on_page_lock(void *word) -{ - io_schedule(); - return 0; -} - /* * In order to wait for pages to become available there must be * waitqueues associated with pages. By using a hash table of @@ -510,7 +557,7 @@ void wait_on_page_bit(struct page *page, int bit_nr) DEFINE_WAIT_BIT(wait, &page->flags, bit_nr); if (test_bit(bit_nr, &page->flags)) - __wait_on_bit(page_waitqueue(page), &wait, sync_page, + __wait_on_bit(page_waitqueue(page), &wait, sleep_on_page, TASK_UNINTERRUPTIBLE); } EXPORT_SYMBOL(wait_on_page_bit); @@ -574,17 +621,12 @@ EXPORT_SYMBOL(end_page_writeback); /** * __lock_page - get a lock on the page, assuming we need to sleep to get it * @page: the page to lock - * - * Ugly. 
Running sync_page() in state TASK_UNINTERRUPTIBLE is scary. If some - * random driver's requestfn sets TASK_RUNNING, we could busywait. However - * chances are that on the second loop, the block layer's plug list is empty, - * so sync_page() will then return in state TASK_UNINTERRUPTIBLE. */ void __lock_page(struct page *page) { DEFINE_WAIT_BIT(wait, &page->flags, PG_locked); - __wait_on_bit_lock(page_waitqueue(page), &wait, sync_page, + __wait_on_bit_lock(page_waitqueue(page), &wait, sleep_on_page, TASK_UNINTERRUPTIBLE); } EXPORT_SYMBOL(__lock_page); @@ -594,24 +636,10 @@ int __lock_page_killable(struct page *page) DEFINE_WAIT_BIT(wait, &page->flags, PG_locked); return __wait_on_bit_lock(page_waitqueue(page), &wait, - sync_page_killable, TASK_KILLABLE); + sleep_on_page_killable, TASK_KILLABLE); } EXPORT_SYMBOL_GPL(__lock_page_killable); -/** - * __lock_page_nosync - get a lock on the page, without calling sync_page() - * @page: the page to lock - * - * Variant of lock_page that does not require the caller to hold a reference - * on the page's mapping. - */ -void __lock_page_nosync(struct page *page) -{ - DEFINE_WAIT_BIT(wait, &page->flags, PG_locked); - __wait_on_bit_lock(page_waitqueue(page), &wait, __sleep_on_page_lock, - TASK_UNINTERRUPTIBLE); -} - int __lock_page_or_retry(struct page *page, struct mm_struct *mm, unsigned int flags) { @@ -619,8 +647,10 @@ int __lock_page_or_retry(struct page *page, struct mm_struct *mm, __lock_page(page); return 1; } else { - up_read(&mm->mmap_sem); - wait_on_page_locked(page); + if (!(flags & FAULT_FLAG_RETRY_NOWAIT)) { + up_read(&mm->mmap_sem); + wait_on_page_locked(page); + } return 0; } } @@ -780,9 +810,13 @@ repeat: page = radix_tree_deref_slot((void **)pages[i]); if (unlikely(!page)) continue; + + /* + * This can only trigger when the entry at index 0 moves out + * of or back to the root: none yet gotten, safe to restart. 
+ */ if (radix_tree_deref_retry(page)) { - if (ret) - start = pages[ret-1]->index; + WARN_ON(start | i); goto restart; } @@ -798,6 +832,13 @@ repeat: pages[ret] = page; ret++; } + + /* + * If all entries were removed before we could secure them, + * try again, because callers stop trying once 0 is returned. + */ + if (unlikely(!ret && nr_found)) + goto restart; rcu_read_unlock(); return ret; } @@ -832,12 +873,14 @@ repeat: page = radix_tree_deref_slot((void **)pages[i]); if (unlikely(!page)) continue; + + /* + * This can only trigger when the entry at index 0 moves out + * of or back to the root: none yet gotten, safe to restart. + */ if (radix_tree_deref_retry(page)) goto restart; - if (page->mapping == NULL || page->index != index) - break; - if (!page_cache_get_speculative(page)) goto repeat; @@ -847,6 +890,16 @@ repeat: goto repeat; } + /* + * must check mapping and index after taking the ref. + * otherwise we can get both false positives and false + * negatives, which is just confusing to the caller. + */ + if (page->mapping == NULL || page->index != index) { + page_cache_release(page); + break; + } + pages[ret] = page; ret++; index++; @@ -885,6 +938,11 @@ repeat: page = radix_tree_deref_slot((void **)pages[i]); if (unlikely(!page)) continue; + + /* + * This can only trigger when the entry at index 0 moves out + * of or back to the root: none yet gotten, safe to restart. + */ if (radix_tree_deref_retry(page)) goto restart; @@ -900,6 +958,13 @@ repeat: pages[ret] = page; ret++; } + + /* + * If all entries were removed before we could secure them, + * try again, because callers stop trying once 0 is returned. 
+ */ + if (unlikely(!ret && nr_found)) + goto restart; rcu_read_unlock(); if (ret) @@ -1289,12 +1354,15 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, unsigned long seg = 0; size_t count; loff_t *ppos = &iocb->ki_pos; + struct blk_plug plug; count = 0; retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE); if (retval) return retval; + blk_start_plug(&plug); + /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ if (filp->f_flags & O_DIRECT) { loff_t size; @@ -1367,6 +1435,7 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, break; } out: + blk_finish_plug(&plug); return retval; } EXPORT_SYMBOL(generic_file_aio_read); @@ -2218,7 +2287,7 @@ struct page *grab_cache_page_write_begin(struct address_space *mapping, gfp_notmask = __GFP_FS; repeat: page = find_lock_page(mapping, index); - if (likely(page)) + if (page) return page; page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~gfp_notmask); @@ -2478,11 +2547,13 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; + struct blk_plug plug; ssize_t ret; BUG_ON(iocb->ki_pos != pos); mutex_lock(&inode->i_mutex); + blk_start_plug(&plug); ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); mutex_unlock(&inode->i_mutex); @@ -2493,6 +2564,7 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, if (err < 0 && ret > 0) ret = err; } + blk_finish_plug(&plug); return ret; } EXPORT_SYMBOL(generic_file_aio_write); diff --git a/mm/huge_memory.c b/mm/huge_memory.c new file mode 100644 index 000000000000..83326ad66d9b --- /dev/null +++ b/mm/huge_memory.c @@ -0,0 +1,2393 @@ +/* + * Copyright (C) 2009 Red Hat, Inc. + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. 
+ */ + +#include <linux/mm.h> +#include <linux/sched.h> +#include <linux/highmem.h> +#include <linux/hugetlb.h> +#include <linux/mmu_notifier.h> +#include <linux/rmap.h> +#include <linux/swap.h> +#include <linux/mm_inline.h> +#include <linux/kthread.h> +#include <linux/khugepaged.h> +#include <linux/freezer.h> +#include <linux/mman.h> +#include <asm/tlb.h> +#include <asm/pgalloc.h> +#include "internal.h" + +/* + * By default transparent hugepage support is enabled for all mappings + * and khugepaged scans all mappings. Defrag is only invoked by + * khugepaged hugepage allocations and by page faults inside + * MADV_HUGEPAGE regions to avoid the risk of slowing down short lived + * allocations. + */ +unsigned long transparent_hugepage_flags __read_mostly = +#ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS + (1<<TRANSPARENT_HUGEPAGE_FLAG)| +#endif +#ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE + (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)| +#endif + (1<<TRANSPARENT_HUGEPAGE_DEFRAG_FLAG)| + (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG); + +/* default scan 8*512 pte (or vmas) every 30 second */ +static unsigned int khugepaged_pages_to_scan __read_mostly = HPAGE_PMD_NR*8; +static unsigned int khugepaged_pages_collapsed; +static unsigned int khugepaged_full_scans; +static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000; +/* during fragmentation poll the hugepage allocator once every minute */ +static unsigned int khugepaged_alloc_sleep_millisecs __read_mostly = 60000; +static struct task_struct *khugepaged_thread __read_mostly; +static DEFINE_MUTEX(khugepaged_mutex); +static DEFINE_SPINLOCK(khugepaged_mm_lock); +static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait); +/* + * default collapse hugepages if there is at least one pte mapped like + * it would have happened if the vma was large enough during page + * fault. 
+ */ +static unsigned int khugepaged_max_ptes_none __read_mostly = HPAGE_PMD_NR-1; + +static int khugepaged(void *none); +static int mm_slots_hash_init(void); +static int khugepaged_slab_init(void); +static void khugepaged_slab_free(void); + +#define MM_SLOTS_HASH_HEADS 1024 +static struct hlist_head *mm_slots_hash __read_mostly; +static struct kmem_cache *mm_slot_cache __read_mostly; + +/** + * struct mm_slot - hash lookup from mm to mm_slot + * @hash: hash collision list + * @mm_node: khugepaged scan list headed in khugepaged_scan.mm_head + * @mm: the mm that this information is valid for + */ +struct mm_slot { + struct hlist_node hash; + struct list_head mm_node; + struct mm_struct *mm; +}; + +/** + * struct khugepaged_scan - cursor for scanning + * @mm_head: the head of the mm list to scan + * @mm_slot: the current mm_slot we are scanning + * @address: the next address inside that to be scanned + * + * There is only the one khugepaged_scan instance of this cursor structure. + */ +struct khugepaged_scan { + struct list_head mm_head; + struct mm_slot *mm_slot; + unsigned long address; +} khugepaged_scan = { + .mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head), +}; + + +static int set_recommended_min_free_kbytes(void) +{ + struct zone *zone; + int nr_zones = 0; + unsigned long recommended_min; + extern int min_free_kbytes; + + if (!test_bit(TRANSPARENT_HUGEPAGE_FLAG, + &transparent_hugepage_flags) && + !test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, + &transparent_hugepage_flags)) + return 0; + + for_each_populated_zone(zone) + nr_zones++; + + /* Make sure at least 2 hugepages are free for MIGRATE_RESERVE */ + recommended_min = pageblock_nr_pages * nr_zones * 2; + + /* + * Make sure that on average at least two pageblocks are almost free + * of another type, one for a migratetype to fall back to and a + * second to avoid subsequent fallbacks of other types There are 3 + * MIGRATE_TYPES we care about. 
+ */ + recommended_min += pageblock_nr_pages * nr_zones * + MIGRATE_PCPTYPES * MIGRATE_PCPTYPES; + + /* don't ever allow to reserve more than 5% of the lowmem */ + recommended_min = min(recommended_min, + (unsigned long) nr_free_buffer_pages() / 20); + recommended_min <<= (PAGE_SHIFT-10); + + if (recommended_min > min_free_kbytes) + min_free_kbytes = recommended_min; + setup_per_zone_wmarks(); + return 0; +} +late_initcall(set_recommended_min_free_kbytes); + +static int start_khugepaged(void) +{ + int err = 0; + if (khugepaged_enabled()) { + int wakeup; + if (unlikely(!mm_slot_cache || !mm_slots_hash)) { + err = -ENOMEM; + goto out; + } + mutex_lock(&khugepaged_mutex); + if (!khugepaged_thread) + khugepaged_thread = kthread_run(khugepaged, NULL, + "khugepaged"); + if (unlikely(IS_ERR(khugepaged_thread))) { + printk(KERN_ERR + "khugepaged: kthread_run(khugepaged) failed\n"); + err = PTR_ERR(khugepaged_thread); + khugepaged_thread = NULL; + } + wakeup = !list_empty(&khugepaged_scan.mm_head); + mutex_unlock(&khugepaged_mutex); + if (wakeup) + wake_up_interruptible(&khugepaged_wait); + + set_recommended_min_free_kbytes(); + } else + /* wakeup to exit */ + wake_up_interruptible(&khugepaged_wait); +out: + return err; +} + +#ifdef CONFIG_SYSFS + +static ssize_t double_flag_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf, + enum transparent_hugepage_flag enabled, + enum transparent_hugepage_flag req_madv) +{ + if (test_bit(enabled, &transparent_hugepage_flags)) { + VM_BUG_ON(test_bit(req_madv, &transparent_hugepage_flags)); + return sprintf(buf, "[always] madvise never\n"); + } else if (test_bit(req_madv, &transparent_hugepage_flags)) + return sprintf(buf, "always [madvise] never\n"); + else + return sprintf(buf, "always madvise [never]\n"); +} +static ssize_t double_flag_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count, + enum transparent_hugepage_flag enabled, + enum transparent_hugepage_flag req_madv) +{ + if 
(!memcmp("always", buf, + min(sizeof("always")-1, count))) { + set_bit(enabled, &transparent_hugepage_flags); + clear_bit(req_madv, &transparent_hugepage_flags); + } else if (!memcmp("madvise", buf, + min(sizeof("madvise")-1, count))) { + clear_bit(enabled, &transparent_hugepage_flags); + set_bit(req_madv, &transparent_hugepage_flags); + } else if (!memcmp("never", buf, + min(sizeof("never")-1, count))) { + clear_bit(enabled, &transparent_hugepage_flags); + clear_bit(req_madv, &transparent_hugepage_flags); + } else + return -EINVAL; + + return count; +} + +static ssize_t enabled_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return double_flag_show(kobj, attr, buf, + TRANSPARENT_HUGEPAGE_FLAG, + TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG); +} +static ssize_t enabled_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + ssize_t ret; + + ret = double_flag_store(kobj, attr, buf, count, + TRANSPARENT_HUGEPAGE_FLAG, + TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG); + + if (ret > 0) { + int err = start_khugepaged(); + if (err) + ret = err; + } + + if (ret > 0 && + (test_bit(TRANSPARENT_HUGEPAGE_FLAG, + &transparent_hugepage_flags) || + test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, + &transparent_hugepage_flags))) + set_recommended_min_free_kbytes(); + + return ret; +} +static struct kobj_attribute enabled_attr = + __ATTR(enabled, 0644, enabled_show, enabled_store); + +static ssize_t single_flag_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf, + enum transparent_hugepage_flag flag) +{ + return sprintf(buf, "%d\n", + !!test_bit(flag, &transparent_hugepage_flags)); +} + +static ssize_t single_flag_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count, + enum transparent_hugepage_flag flag) +{ + unsigned long value; + int ret; + + ret = kstrtoul(buf, 10, &value); + if (ret < 0) + return ret; + if (value > 1) + return -EINVAL; + + if (value) + set_bit(flag, 
&transparent_hugepage_flags); + else + clear_bit(flag, &transparent_hugepage_flags); + + return count; +} + +/* + * Currently defrag only disables __GFP_NOWAIT for allocation. A blind + * __GFP_REPEAT is too aggressive, it's never worth swapping tons of + * memory just to allocate one more hugepage. + */ +static ssize_t defrag_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return double_flag_show(kobj, attr, buf, + TRANSPARENT_HUGEPAGE_DEFRAG_FLAG, + TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG); +} +static ssize_t defrag_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + return double_flag_store(kobj, attr, buf, count, + TRANSPARENT_HUGEPAGE_DEFRAG_FLAG, + TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG); +} +static struct kobj_attribute defrag_attr = + __ATTR(defrag, 0644, defrag_show, defrag_store); + +#ifdef CONFIG_DEBUG_VM +static ssize_t debug_cow_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return single_flag_show(kobj, attr, buf, + TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG); +} +static ssize_t debug_cow_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + return single_flag_store(kobj, attr, buf, count, + TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG); +} +static struct kobj_attribute debug_cow_attr = + __ATTR(debug_cow, 0644, debug_cow_show, debug_cow_store); +#endif /* CONFIG_DEBUG_VM */ + +static struct attribute *hugepage_attr[] = { + &enabled_attr.attr, + &defrag_attr.attr, +#ifdef CONFIG_DEBUG_VM + &debug_cow_attr.attr, +#endif + NULL, +}; + +static struct attribute_group hugepage_attr_group = { + .attrs = hugepage_attr, +}; + +static ssize_t scan_sleep_millisecs_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%u\n", khugepaged_scan_sleep_millisecs); +} + +static ssize_t scan_sleep_millisecs_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ 
+ unsigned long msecs; + int err; + + err = strict_strtoul(buf, 10, &msecs); + if (err || msecs > UINT_MAX) + return -EINVAL; + + khugepaged_scan_sleep_millisecs = msecs; + wake_up_interruptible(&khugepaged_wait); + + return count; +} +static struct kobj_attribute scan_sleep_millisecs_attr = + __ATTR(scan_sleep_millisecs, 0644, scan_sleep_millisecs_show, + scan_sleep_millisecs_store); + +static ssize_t alloc_sleep_millisecs_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%u\n", khugepaged_alloc_sleep_millisecs); +} + +static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + unsigned long msecs; + int err; + + err = strict_strtoul(buf, 10, &msecs); + if (err || msecs > UINT_MAX) + return -EINVAL; + + khugepaged_alloc_sleep_millisecs = msecs; + wake_up_interruptible(&khugepaged_wait); + + return count; +} +static struct kobj_attribute alloc_sleep_millisecs_attr = + __ATTR(alloc_sleep_millisecs, 0644, alloc_sleep_millisecs_show, + alloc_sleep_millisecs_store); + +static ssize_t pages_to_scan_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%u\n", khugepaged_pages_to_scan); +} +static ssize_t pages_to_scan_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int err; + unsigned long pages; + + err = strict_strtoul(buf, 10, &pages); + if (err || !pages || pages > UINT_MAX) + return -EINVAL; + + khugepaged_pages_to_scan = pages; + + return count; +} +static struct kobj_attribute pages_to_scan_attr = + __ATTR(pages_to_scan, 0644, pages_to_scan_show, + pages_to_scan_store); + +static ssize_t pages_collapsed_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%u\n", khugepaged_pages_collapsed); +} +static struct kobj_attribute pages_collapsed_attr = + __ATTR_RO(pages_collapsed); + +static ssize_t 
full_scans_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%u\n", khugepaged_full_scans); +} +static struct kobj_attribute full_scans_attr = + __ATTR_RO(full_scans); + +static ssize_t khugepaged_defrag_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return single_flag_show(kobj, attr, buf, + TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG); +} +static ssize_t khugepaged_defrag_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + return single_flag_store(kobj, attr, buf, count, + TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG); +} +static struct kobj_attribute khugepaged_defrag_attr = + __ATTR(defrag, 0644, khugepaged_defrag_show, + khugepaged_defrag_store); + +/* + * max_ptes_none controls if khugepaged should collapse hugepages over + * any unmapped ptes in turn potentially increasing the memory + * footprint of the vmas. When max_ptes_none is 0 khugepaged will not + * reduce the available free memory in the system as it + * runs. Increasing max_ptes_none will instead potentially reduce the + * free memory in the system during the khugepaged scan. 
+ */ +static ssize_t khugepaged_max_ptes_none_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%u\n", khugepaged_max_ptes_none); +} +static ssize_t khugepaged_max_ptes_none_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int err; + unsigned long max_ptes_none; + + err = strict_strtoul(buf, 10, &max_ptes_none); + if (err || max_ptes_none > HPAGE_PMD_NR-1) + return -EINVAL; + + khugepaged_max_ptes_none = max_ptes_none; + + return count; +} +static struct kobj_attribute khugepaged_max_ptes_none_attr = + __ATTR(max_ptes_none, 0644, khugepaged_max_ptes_none_show, + khugepaged_max_ptes_none_store); + +static struct attribute *khugepaged_attr[] = { + &khugepaged_defrag_attr.attr, + &khugepaged_max_ptes_none_attr.attr, + &pages_to_scan_attr.attr, + &pages_collapsed_attr.attr, + &full_scans_attr.attr, + &scan_sleep_millisecs_attr.attr, + &alloc_sleep_millisecs_attr.attr, + NULL, +}; + +static struct attribute_group khugepaged_attr_group = { + .attrs = khugepaged_attr, + .name = "khugepaged", +}; +#endif /* CONFIG_SYSFS */ + +static int __init hugepage_init(void) +{ + int err; +#ifdef CONFIG_SYSFS + static struct kobject *hugepage_kobj; +#endif + + err = -EINVAL; + if (!has_transparent_hugepage()) { + transparent_hugepage_flags = 0; + goto out; + } + +#ifdef CONFIG_SYSFS + err = -ENOMEM; + hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj); + if (unlikely(!hugepage_kobj)) { + printk(KERN_ERR "hugepage: failed kobject create\n"); + goto out; + } + + err = sysfs_create_group(hugepage_kobj, &hugepage_attr_group); + if (err) { + printk(KERN_ERR "hugepage: failed register hugeage group\n"); + goto out; + } + + err = sysfs_create_group(hugepage_kobj, &khugepaged_attr_group); + if (err) { + printk(KERN_ERR "hugepage: failed register hugeage group\n"); + goto out; + } +#endif + + err = khugepaged_slab_init(); + if (err) + goto out; + + err = mm_slots_hash_init(); + 
if (err) { + khugepaged_slab_free(); + goto out; + } + + /* + * By default disable transparent hugepages on smaller systems, + * where the extra memory used could hurt more than TLB overhead + * is likely to save. The admin can still enable it through /sys. + */ + if (totalram_pages < (512 << (20 - PAGE_SHIFT))) + transparent_hugepage_flags = 0; + + start_khugepaged(); + + set_recommended_min_free_kbytes(); + +out: + return err; +} +module_init(hugepage_init) + +static int __init setup_transparent_hugepage(char *str) +{ + int ret = 0; + if (!str) + goto out; + if (!strcmp(str, "always")) { + set_bit(TRANSPARENT_HUGEPAGE_FLAG, + &transparent_hugepage_flags); + clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, + &transparent_hugepage_flags); + ret = 1; + } else if (!strcmp(str, "madvise")) { + clear_bit(TRANSPARENT_HUGEPAGE_FLAG, + &transparent_hugepage_flags); + set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, + &transparent_hugepage_flags); + ret = 1; + } else if (!strcmp(str, "never")) { + clear_bit(TRANSPARENT_HUGEPAGE_FLAG, + &transparent_hugepage_flags); + clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, + &transparent_hugepage_flags); + ret = 1; + } +out: + if (!ret) + printk(KERN_WARNING + "transparent_hugepage= cannot parse, ignored\n"); + return ret; +} +__setup("transparent_hugepage=", setup_transparent_hugepage); + +static void prepare_pmd_huge_pte(pgtable_t pgtable, + struct mm_struct *mm) +{ + assert_spin_locked(&mm->page_table_lock); + + /* FIFO */ + if (!mm->pmd_huge_pte) + INIT_LIST_HEAD(&pgtable->lru); + else + list_add(&pgtable->lru, &mm->pmd_huge_pte->lru); + mm->pmd_huge_pte = pgtable; +} + +static inline pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) +{ + if (likely(vma->vm_flags & VM_WRITE)) + pmd = pmd_mkwrite(pmd); + return pmd; +} + +static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long haddr, pmd_t *pmd, + struct page *page) +{ + int ret = 0; + pgtable_t pgtable; + + 
VM_BUG_ON(!PageCompound(page)); + pgtable = pte_alloc_one(mm, haddr); + if (unlikely(!pgtable)) { + mem_cgroup_uncharge_page(page); + put_page(page); + return VM_FAULT_OOM; + } + + clear_huge_page(page, haddr, HPAGE_PMD_NR); + __SetPageUptodate(page); + + spin_lock(&mm->page_table_lock); + if (unlikely(!pmd_none(*pmd))) { + spin_unlock(&mm->page_table_lock); + mem_cgroup_uncharge_page(page); + put_page(page); + pte_free(mm, pgtable); + } else { + pmd_t entry; + entry = mk_pmd(page, vma->vm_page_prot); + entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); + entry = pmd_mkhuge(entry); + /* + * The spinlocking to take the lru_lock inside + * page_add_new_anon_rmap() acts as a full memory + * barrier to be sure clear_huge_page writes become + * visible after the set_pmd_at() write. + */ + page_add_new_anon_rmap(page, vma, haddr); + set_pmd_at(mm, haddr, pmd, entry); + prepare_pmd_huge_pte(pgtable, mm); + add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); + spin_unlock(&mm->page_table_lock); + } + + return ret; +} + +static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp) +{ + return (GFP_TRANSHUGE & ~(defrag ? 
0 : __GFP_WAIT)) | extra_gfp; +} + +static inline struct page *alloc_hugepage_vma(int defrag, + struct vm_area_struct *vma, + unsigned long haddr, int nd, + gfp_t extra_gfp) +{ + return alloc_pages_vma(alloc_hugepage_gfpmask(defrag, extra_gfp), + HPAGE_PMD_ORDER, vma, haddr, nd); +} + +#ifndef CONFIG_NUMA +static inline struct page *alloc_hugepage(int defrag) +{ + return alloc_pages(alloc_hugepage_gfpmask(defrag, 0), + HPAGE_PMD_ORDER); +} +#endif + +int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pmd_t *pmd, + unsigned int flags) +{ + struct page *page; + unsigned long haddr = address & HPAGE_PMD_MASK; + pte_t *pte; + + if (haddr >= vma->vm_start && haddr + HPAGE_PMD_SIZE <= vma->vm_end) { + if (unlikely(anon_vma_prepare(vma))) + return VM_FAULT_OOM; + if (unlikely(khugepaged_enter(vma))) + return VM_FAULT_OOM; + page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), + vma, haddr, numa_node_id(), 0); + if (unlikely(!page)) { + count_vm_event(THP_FAULT_FALLBACK); + goto out; + } + count_vm_event(THP_FAULT_ALLOC); + if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) { + put_page(page); + goto out; + } + + return __do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page); + } +out: + /* + * Use __pte_alloc instead of pte_alloc_map, because we can't + * run pte_offset_map on the pmd, if an huge pmd could + * materialize from under us from a different thread. + */ + if (unlikely(__pte_alloc(mm, vma, pmd, address))) + return VM_FAULT_OOM; + /* if an huge pmd materialized from under us just retry later */ + if (unlikely(pmd_trans_huge(*pmd))) + return 0; + /* + * A regular pmd is established and it can't morph into a huge pmd + * from under us anymore at this point because we hold the mmap_sem + * read mode and khugepaged takes it in write mode. So now it's + * safe to run pte_offset_map(). 
+ */ + pte = pte_offset_map(pmd, address); + return handle_pte_fault(mm, vma, address, pte, pmd, flags); +} + +int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, + pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, + struct vm_area_struct *vma) +{ + struct page *src_page; + pmd_t pmd; + pgtable_t pgtable; + int ret; + + ret = -ENOMEM; + pgtable = pte_alloc_one(dst_mm, addr); + if (unlikely(!pgtable)) + goto out; + + spin_lock(&dst_mm->page_table_lock); + spin_lock_nested(&src_mm->page_table_lock, SINGLE_DEPTH_NESTING); + + ret = -EAGAIN; + pmd = *src_pmd; + if (unlikely(!pmd_trans_huge(pmd))) { + pte_free(dst_mm, pgtable); + goto out_unlock; + } + if (unlikely(pmd_trans_splitting(pmd))) { + /* split huge page running from under us */ + spin_unlock(&src_mm->page_table_lock); + spin_unlock(&dst_mm->page_table_lock); + pte_free(dst_mm, pgtable); + + wait_split_huge_page(vma->anon_vma, src_pmd); /* src_vma */ + goto out; + } + src_page = pmd_page(pmd); + VM_BUG_ON(!PageHead(src_page)); + get_page(src_page); + page_dup_rmap(src_page); + add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR); + + pmdp_set_wrprotect(src_mm, addr, src_pmd); + pmd = pmd_mkold(pmd_wrprotect(pmd)); + set_pmd_at(dst_mm, addr, dst_pmd, pmd); + prepare_pmd_huge_pte(pgtable, dst_mm); + + ret = 0; +out_unlock: + spin_unlock(&src_mm->page_table_lock); + spin_unlock(&dst_mm->page_table_lock); +out: + return ret; +} + +/* no "address" argument so destroys page coloring of some arch */ +pgtable_t get_pmd_huge_pte(struct mm_struct *mm) +{ + pgtable_t pgtable; + + assert_spin_locked(&mm->page_table_lock); + + /* FIFO */ + pgtable = mm->pmd_huge_pte; + if (list_empty(&pgtable->lru)) + mm->pmd_huge_pte = NULL; + else { + mm->pmd_huge_pte = list_entry(pgtable->lru.next, + struct page, lru); + list_del(&pgtable->lru); + } + return pgtable; +} + +static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long address, + pmd_t *pmd, pmd_t orig_pmd, + 
struct page *page, + unsigned long haddr) +{ + pgtable_t pgtable; + pmd_t _pmd; + int ret = 0, i; + struct page **pages; + + pages = kmalloc(sizeof(struct page *) * HPAGE_PMD_NR, + GFP_KERNEL); + if (unlikely(!pages)) { + ret |= VM_FAULT_OOM; + goto out; + } + + for (i = 0; i < HPAGE_PMD_NR; i++) { + pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE | + __GFP_OTHER_NODE, + vma, address, page_to_nid(page)); + if (unlikely(!pages[i] || + mem_cgroup_newpage_charge(pages[i], mm, + GFP_KERNEL))) { + if (pages[i]) + put_page(pages[i]); + mem_cgroup_uncharge_start(); + while (--i >= 0) { + mem_cgroup_uncharge_page(pages[i]); + put_page(pages[i]); + } + mem_cgroup_uncharge_end(); + kfree(pages); + ret |= VM_FAULT_OOM; + goto out; + } + } + + for (i = 0; i < HPAGE_PMD_NR; i++) { + /* + * Subpage i lives PAGE_SIZE * i bytes above haddr; the user + * virtual address must advance by the page size, not by the + * shift count. + */ + copy_user_highpage(pages[i], page + i, + haddr + PAGE_SIZE * i, vma); + __SetPageUptodate(pages[i]); + cond_resched(); + } + + spin_lock(&mm->page_table_lock); + if (unlikely(!pmd_same(*pmd, orig_pmd))) + goto out_free_pages; + VM_BUG_ON(!PageHead(page)); + + pmdp_clear_flush_notify(vma, haddr, pmd); + /* leave pmd empty until pte is filled */ + + pgtable = get_pmd_huge_pte(mm); + pmd_populate(mm, &_pmd, pgtable); + + for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { + pte_t *pte, entry; + entry = mk_pte(pages[i], vma->vm_page_prot); + entry = maybe_mkwrite(pte_mkdirty(entry), vma); + page_add_new_anon_rmap(pages[i], vma, haddr); + pte = pte_offset_map(&_pmd, haddr); + VM_BUG_ON(!pte_none(*pte)); + set_pte_at(mm, haddr, pte, entry); + pte_unmap(pte); + } + kfree(pages); + + mm->nr_ptes++; + smp_wmb(); /* make pte visible before pmd */ + pmd_populate(mm, pmd, pgtable); + page_remove_rmap(page); + spin_unlock(&mm->page_table_lock); + + ret |= VM_FAULT_WRITE; + put_page(page); + +out: + return ret; + +out_free_pages: + spin_unlock(&mm->page_table_lock); + mem_cgroup_uncharge_start(); + for (i = 0; i < HPAGE_PMD_NR; i++) { + mem_cgroup_uncharge_page(pages[i]); + put_page(pages[i]); + } +
mem_cgroup_uncharge_end(); + kfree(pages); + goto out; +} + +int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pmd_t *pmd, pmd_t orig_pmd) +{ + int ret = 0; + struct page *page, *new_page; + unsigned long haddr; + + VM_BUG_ON(!vma->anon_vma); + spin_lock(&mm->page_table_lock); + if (unlikely(!pmd_same(*pmd, orig_pmd))) + goto out_unlock; + + page = pmd_page(orig_pmd); + VM_BUG_ON(!PageCompound(page) || !PageHead(page)); + haddr = address & HPAGE_PMD_MASK; + if (page_mapcount(page) == 1) { + pmd_t entry; + entry = pmd_mkyoung(orig_pmd); + entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); + if (pmdp_set_access_flags(vma, haddr, pmd, entry, 1)) + update_mmu_cache(vma, address, entry); + ret |= VM_FAULT_WRITE; + goto out_unlock; + } + get_page(page); + spin_unlock(&mm->page_table_lock); + + if (transparent_hugepage_enabled(vma) && + !transparent_hugepage_debug_cow()) + new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), + vma, haddr, numa_node_id(), 0); + else + new_page = NULL; + + if (unlikely(!new_page)) { + count_vm_event(THP_FAULT_FALLBACK); + ret = do_huge_pmd_wp_page_fallback(mm, vma, address, + pmd, orig_pmd, page, haddr); + put_page(page); + goto out; + } + count_vm_event(THP_FAULT_ALLOC); + + if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { + put_page(new_page); + put_page(page); + ret |= VM_FAULT_OOM; + goto out; + } + + copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR); + __SetPageUptodate(new_page); + + spin_lock(&mm->page_table_lock); + put_page(page); + if (unlikely(!pmd_same(*pmd, orig_pmd))) { + mem_cgroup_uncharge_page(new_page); + put_page(new_page); + } else { + pmd_t entry; + VM_BUG_ON(!PageHead(page)); + entry = mk_pmd(new_page, vma->vm_page_prot); + entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); + entry = pmd_mkhuge(entry); + pmdp_clear_flush_notify(vma, haddr, pmd); + page_add_new_anon_rmap(new_page, vma, haddr); + set_pmd_at(mm, haddr, pmd, 
entry); + update_mmu_cache(vma, address, entry); + page_remove_rmap(page); + put_page(page); + ret |= VM_FAULT_WRITE; + } +out_unlock: + spin_unlock(&mm->page_table_lock); +out: + return ret; +} + +struct page *follow_trans_huge_pmd(struct mm_struct *mm, + unsigned long addr, + pmd_t *pmd, + unsigned int flags) +{ + struct page *page = NULL; + + assert_spin_locked(&mm->page_table_lock); + + if (flags & FOLL_WRITE && !pmd_write(*pmd)) + goto out; + + page = pmd_page(*pmd); + VM_BUG_ON(!PageHead(page)); + if (flags & FOLL_TOUCH) { + pmd_t _pmd; + /* + * We should set the dirty bit only for FOLL_WRITE but + * for now the dirty bit in the pmd is meaningless. + * And if the dirty bit will become meaningful and + * we'll only set it with FOLL_WRITE, an atomic + * set_bit will be required on the pmd to set the + * young bit, instead of the current set_pmd_at. + */ + _pmd = pmd_mkyoung(pmd_mkdirty(*pmd)); + set_pmd_at(mm, addr & HPAGE_PMD_MASK, pmd, _pmd); + } + page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT; + VM_BUG_ON(!PageCompound(page)); + if (flags & FOLL_GET) + get_page(page); + +out: + return page; +} + +int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, + pmd_t *pmd) +{ + int ret = 0; + + spin_lock(&tlb->mm->page_table_lock); + if (likely(pmd_trans_huge(*pmd))) { + if (unlikely(pmd_trans_splitting(*pmd))) { + spin_unlock(&tlb->mm->page_table_lock); + wait_split_huge_page(vma->anon_vma, + pmd); + } else { + struct page *page; + pgtable_t pgtable; + pgtable = get_pmd_huge_pte(tlb->mm); + page = pmd_page(*pmd); + pmd_clear(pmd); + page_remove_rmap(page); + VM_BUG_ON(page_mapcount(page) < 0); + add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); + VM_BUG_ON(!PageHead(page)); + spin_unlock(&tlb->mm->page_table_lock); + tlb_remove_page(tlb, page); + pte_free(tlb->mm, pgtable); + ret = 1; + } + } else + spin_unlock(&tlb->mm->page_table_lock); + + return ret; +} + +int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, + unsigned long addr, 
unsigned long end, + unsigned char *vec) +{ + int ret = 0; + + spin_lock(&vma->vm_mm->page_table_lock); + if (likely(pmd_trans_huge(*pmd))) { + ret = !pmd_trans_splitting(*pmd); + spin_unlock(&vma->vm_mm->page_table_lock); + if (unlikely(!ret)) + wait_split_huge_page(vma->anon_vma, pmd); + else { + /* + * All logical pages in the range are present + * if backed by a huge page. + */ + memset(vec, 1, (end - addr) >> PAGE_SHIFT); + } + } else + spin_unlock(&vma->vm_mm->page_table_lock); + + return ret; +} + +int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, + unsigned long addr, pgprot_t newprot) +{ + struct mm_struct *mm = vma->vm_mm; + int ret = 0; + + spin_lock(&mm->page_table_lock); + if (likely(pmd_trans_huge(*pmd))) { + if (unlikely(pmd_trans_splitting(*pmd))) { + spin_unlock(&mm->page_table_lock); + wait_split_huge_page(vma->anon_vma, pmd); + } else { + pmd_t entry; + + entry = pmdp_get_and_clear(mm, addr, pmd); + entry = pmd_modify(entry, newprot); + set_pmd_at(mm, addr, pmd, entry); + spin_unlock(&vma->vm_mm->page_table_lock); + flush_tlb_range(vma, addr, addr + HPAGE_PMD_SIZE); + ret = 1; + } + } else + spin_unlock(&vma->vm_mm->page_table_lock); + + return ret; +} + +pmd_t *page_check_address_pmd(struct page *page, + struct mm_struct *mm, + unsigned long address, + enum page_check_address_pmd_flag flag) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd, *ret = NULL; + + if (address & ~HPAGE_PMD_MASK) + goto out; + + pgd = pgd_offset(mm, address); + if (!pgd_present(*pgd)) + goto out; + + pud = pud_offset(pgd, address); + if (!pud_present(*pud)) + goto out; + + pmd = pmd_offset(pud, address); + if (pmd_none(*pmd)) + goto out; + if (pmd_page(*pmd) != page) + goto out; + /* + * split_vma() may create temporary aliased mappings. There is + * no risk as long as all huge pmd are found and have their + * splitting bit set before __split_huge_page_refcount + * runs. Finding the same huge pmd more than once during the + * same rmap walk is not a problem. 
+ */ + if (flag == PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG && + pmd_trans_splitting(*pmd)) + goto out; + if (pmd_trans_huge(*pmd)) { + VM_BUG_ON(flag == PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG && + !pmd_trans_splitting(*pmd)); + ret = pmd; + } +out: + return ret; +} + +static int __split_huge_page_splitting(struct page *page, + struct vm_area_struct *vma, + unsigned long address) +{ + struct mm_struct *mm = vma->vm_mm; + pmd_t *pmd; + int ret = 0; + + spin_lock(&mm->page_table_lock); + pmd = page_check_address_pmd(page, mm, address, + PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG); + if (pmd) { + /* + * We can't temporarily set the pmd to null in order + * to split it, the pmd must remain marked huge at all + * times or the VM won't take the pmd_trans_huge paths + * and it won't wait on the anon_vma->root->lock to + * serialize against split_huge_page*. + */ + pmdp_splitting_flush_notify(vma, address, pmd); + ret = 1; + } + spin_unlock(&mm->page_table_lock); + + return ret; +} + +static void __split_huge_page_refcount(struct page *page) +{ + int i; + unsigned long head_index = page->index; + struct zone *zone = page_zone(page); + int zonestat; + + /* prevent PageLRU to go away from under us, and freeze lru stats */ + spin_lock_irq(&zone->lru_lock); + compound_lock(page); + + for (i = 1; i < HPAGE_PMD_NR; i++) { + struct page *page_tail = page + i; + + /* tail_page->_count cannot change */ + atomic_sub(atomic_read(&page_tail->_count), &page->_count); + BUG_ON(page_count(page) <= 0); + atomic_add(page_mapcount(page) + 1, &page_tail->_count); + BUG_ON(atomic_read(&page_tail->_count) <= 0); + + /* after clearing PageTail the gup refcount can be released */ + smp_mb(); + + /* + * retain hwpoison flag of the poisoned tail page: + * fix for the unsuitable process killed on Guest Machine(KVM) + * by the memory-failure. 
+ */ + page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP | __PG_HWPOISON; + page_tail->flags |= (page->flags & + ((1L << PG_referenced) | + (1L << PG_swapbacked) | + (1L << PG_mlocked) | + (1L << PG_uptodate))); + page_tail->flags |= (1L << PG_dirty); + + /* + * 1) clear PageTail before overwriting first_page + * 2) clear PageTail before clearing PageHead for VM_BUG_ON + */ + smp_wmb(); + + /* + * __split_huge_page_splitting() already set the + * splitting bit in all pmd that could map this + * hugepage, that will ensure no CPU can alter the + * mapcount on the head page. The mapcount is only + * accounted in the head page and it has to be + * transferred to all tail pages in the below code. So + * for this code to be safe, the split the mapcount + * can't change. But that doesn't mean userland can't + * keep changing and reading the page contents while + * we transfer the mapcount, so the pmd splitting + * status is achieved setting a reserved bit in the + * pmd, not by clearing the present bit. + */ + BUG_ON(page_mapcount(page_tail)); + page_tail->_mapcount = page->_mapcount; + + BUG_ON(page_tail->mapping); + page_tail->mapping = page->mapping; + + page_tail->index = ++head_index; + + BUG_ON(!PageAnon(page_tail)); + BUG_ON(!PageUptodate(page_tail)); + BUG_ON(!PageDirty(page_tail)); + BUG_ON(!PageSwapBacked(page_tail)); + + mem_cgroup_split_huge_fixup(page, page_tail); + + lru_add_page_tail(zone, page, page_tail); + } + + __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); + __mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR); + + /* + * A hugepage counts for HPAGE_PMD_NR pages on the LRU statistics, + * so adjust those appropriately if this page is on the LRU. 
+ */ + if (PageLRU(page)) { + zonestat = NR_LRU_BASE + page_lru(page); + __mod_zone_page_state(zone, zonestat, -(HPAGE_PMD_NR-1)); + } + + ClearPageCompound(page); + compound_unlock(page); + spin_unlock_irq(&zone->lru_lock); + + for (i = 1; i < HPAGE_PMD_NR; i++) { + struct page *page_tail = page + i; + BUG_ON(page_count(page_tail) <= 0); + /* + * Tail pages may be freed if there wasn't any mapping + * like if add_to_swap() is running on a lru page that + * had its mapping zapped. And freeing these pages + * requires taking the lru_lock so we do the put_page + * of the tail pages after the split is complete. + */ + put_page(page_tail); + } + + /* + * Only the head page (now become a regular page) is required + * to be pinned by the caller. + */ + BUG_ON(page_count(page) <= 0); +} + +static int __split_huge_page_map(struct page *page, + struct vm_area_struct *vma, + unsigned long address) +{ + struct mm_struct *mm = vma->vm_mm; + pmd_t *pmd, _pmd; + int ret = 0, i; + pgtable_t pgtable; + unsigned long haddr; + + spin_lock(&mm->page_table_lock); + pmd = page_check_address_pmd(page, mm, address, + PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG); + if (pmd) { + pgtable = get_pmd_huge_pte(mm); + pmd_populate(mm, &_pmd, pgtable); + + for (i = 0, haddr = address; i < HPAGE_PMD_NR; + i++, haddr += PAGE_SIZE) { + pte_t *pte, entry; + BUG_ON(PageCompound(page+i)); + entry = mk_pte(page + i, vma->vm_page_prot); + entry = maybe_mkwrite(pte_mkdirty(entry), vma); + if (!pmd_write(*pmd)) + entry = pte_wrprotect(entry); + else + BUG_ON(page_mapcount(page) != 1); + if (!pmd_young(*pmd)) + entry = pte_mkold(entry); + pte = pte_offset_map(&_pmd, haddr); + BUG_ON(!pte_none(*pte)); + set_pte_at(mm, haddr, pte, entry); + pte_unmap(pte); + } + + mm->nr_ptes++; + smp_wmb(); /* make pte visible before pmd */ + /* + * Up to this point the pmd is present and huge and + * userland has the whole access to the hugepage + * during the split (which happens in place). 
If we + * overwrite the pmd with the not-huge version + * pointing to the pte here (which of course we could + * if all CPUs were bug free), userland could trigger + * a small page size TLB miss on the small sized TLB + * while the hugepage TLB entry is still established + * in the huge TLB. Some CPU doesn't like that. See + * http://support.amd.com/us/Processor_TechDocs/41322.pdf, + * Erratum 383 on page 93. Intel should be safe but is + * also warns that it's only safe if the permission + * and cache attributes of the two entries loaded in + * the two TLB is identical (which should be the case + * here). But it is generally safer to never allow + * small and huge TLB entries for the same virtual + * address to be loaded simultaneously. So instead of + * doing "pmd_populate(); flush_tlb_range();" we first + * mark the current pmd notpresent (atomically because + * here the pmd_trans_huge and pmd_trans_splitting + * must remain set at all times on the pmd until the + * split is complete for this pmd), then we flush the + * SMP TLB and finally we write the non-huge version + * of the pmd entry with pmd_populate. + */ + set_pmd_at(mm, address, pmd, pmd_mknotpresent(*pmd)); + flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); + pmd_populate(mm, pmd, pgtable); + ret = 1; + } + spin_unlock(&mm->page_table_lock); + + return ret; +} + +/* must be called with anon_vma->root->lock hold */ +static void __split_huge_page(struct page *page, + struct anon_vma *anon_vma) +{ + int mapcount, mapcount2; + struct anon_vma_chain *avc; + + BUG_ON(!PageHead(page)); + BUG_ON(PageTail(page)); + + mapcount = 0; + list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { + struct vm_area_struct *vma = avc->vma; + unsigned long addr = vma_address(page, vma); + BUG_ON(is_vma_temporary_stack(vma)); + if (addr == -EFAULT) + continue; + mapcount += __split_huge_page_splitting(page, vma, addr); + } + /* + * It is critical that new vmas are added to the tail of the + * anon_vma list. 
This guarantes that if copy_huge_pmd() runs + * and establishes a child pmd before + * __split_huge_page_splitting() freezes the parent pmd (so if + * we fail to prevent copy_huge_pmd() from running until the + * whole __split_huge_page() is complete), we will still see + * the newly established pmd of the child later during the + * walk, to be able to set it as pmd_trans_splitting too. + */ + if (mapcount != page_mapcount(page)) + printk(KERN_ERR "mapcount %d page_mapcount %d\n", + mapcount, page_mapcount(page)); + BUG_ON(mapcount != page_mapcount(page)); + + __split_huge_page_refcount(page); + + mapcount2 = 0; + list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { + struct vm_area_struct *vma = avc->vma; + unsigned long addr = vma_address(page, vma); + BUG_ON(is_vma_temporary_stack(vma)); + if (addr == -EFAULT) + continue; + mapcount2 += __split_huge_page_map(page, vma, addr); + } + if (mapcount != mapcount2) + printk(KERN_ERR "mapcount %d mapcount2 %d page_mapcount %d\n", + mapcount, mapcount2, page_mapcount(page)); + BUG_ON(mapcount != mapcount2); +} + +int split_huge_page(struct page *page) +{ + struct anon_vma *anon_vma; + int ret = 1; + + BUG_ON(!PageAnon(page)); + anon_vma = page_lock_anon_vma(page); + if (!anon_vma) + goto out; + ret = 0; + if (!PageCompound(page)) + goto out_unlock; + + BUG_ON(!PageSwapBacked(page)); + __split_huge_page(page, anon_vma); + count_vm_event(THP_SPLIT); + + BUG_ON(PageCompound(page)); +out_unlock: + page_unlock_anon_vma(anon_vma); +out: + return ret; +} + +#define VM_NO_THP (VM_SPECIAL|VM_INSERTPAGE|VM_MIXEDMAP|VM_SAO| \ + VM_HUGETLB|VM_SHARED|VM_MAYSHARE) + +int hugepage_madvise(struct vm_area_struct *vma, + unsigned long *vm_flags, int advice) +{ + switch (advice) { + case MADV_HUGEPAGE: + /* + * Be somewhat over-protective like KSM for now! 
+ */ + if (*vm_flags & (VM_HUGEPAGE | VM_NO_THP)) + return -EINVAL; + *vm_flags &= ~VM_NOHUGEPAGE; + *vm_flags |= VM_HUGEPAGE; + /* + * If the vma become good for khugepaged to scan, + * register it here without waiting a page fault that + * may not happen any time soon. + */ + if (unlikely(khugepaged_enter_vma_merge(vma))) + return -ENOMEM; + break; + case MADV_NOHUGEPAGE: + /* + * Be somewhat over-protective like KSM for now! + */ + if (*vm_flags & (VM_NOHUGEPAGE | VM_NO_THP)) + return -EINVAL; + *vm_flags &= ~VM_HUGEPAGE; + *vm_flags |= VM_NOHUGEPAGE; + /* + * Setting VM_NOHUGEPAGE will prevent khugepaged from scanning + * this vma even if we leave the mm registered in khugepaged if + * it got registered before VM_NOHUGEPAGE was set. + */ + break; + } + + return 0; +} + +static int __init khugepaged_slab_init(void) +{ + mm_slot_cache = kmem_cache_create("khugepaged_mm_slot", + sizeof(struct mm_slot), + __alignof__(struct mm_slot), 0, NULL); + if (!mm_slot_cache) + return -ENOMEM; + + return 0; +} + +static void __init khugepaged_slab_free(void) +{ + kmem_cache_destroy(mm_slot_cache); + mm_slot_cache = NULL; +} + +static inline struct mm_slot *alloc_mm_slot(void) +{ + if (!mm_slot_cache) /* initialization failed */ + return NULL; + return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL); +} + +static inline void free_mm_slot(struct mm_slot *mm_slot) +{ + kmem_cache_free(mm_slot_cache, mm_slot); +} + +static int __init mm_slots_hash_init(void) +{ + mm_slots_hash = kzalloc(MM_SLOTS_HASH_HEADS * sizeof(struct hlist_head), + GFP_KERNEL); + if (!mm_slots_hash) + return -ENOMEM; + return 0; +} + +#if 0 +static void __init mm_slots_hash_free(void) +{ + kfree(mm_slots_hash); + mm_slots_hash = NULL; +} +#endif + +static struct mm_slot *get_mm_slot(struct mm_struct *mm) +{ + struct mm_slot *mm_slot; + struct hlist_head *bucket; + struct hlist_node *node; + + bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct)) + % MM_SLOTS_HASH_HEADS]; + 
hlist_for_each_entry(mm_slot, node, bucket, hash) { + if (mm == mm_slot->mm) + return mm_slot; + } + return NULL; +} + +static void insert_to_mm_slots_hash(struct mm_struct *mm, + struct mm_slot *mm_slot) +{ + struct hlist_head *bucket; + + bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct)) + % MM_SLOTS_HASH_HEADS]; + mm_slot->mm = mm; + hlist_add_head(&mm_slot->hash, bucket); +} + +static inline int khugepaged_test_exit(struct mm_struct *mm) +{ + return atomic_read(&mm->mm_users) == 0; +} + +int __khugepaged_enter(struct mm_struct *mm) +{ + struct mm_slot *mm_slot; + int wakeup; + + mm_slot = alloc_mm_slot(); + if (!mm_slot) + return -ENOMEM; + + /* __khugepaged_exit() must not run from under us */ + VM_BUG_ON(khugepaged_test_exit(mm)); + if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) { + free_mm_slot(mm_slot); + return 0; + } + + spin_lock(&khugepaged_mm_lock); + insert_to_mm_slots_hash(mm, mm_slot); + /* + * Insert just behind the scanning cursor, to let the area settle + * down a little. + */ + wakeup = list_empty(&khugepaged_scan.mm_head); + list_add_tail(&mm_slot->mm_node, &khugepaged_scan.mm_head); + spin_unlock(&khugepaged_mm_lock); + + atomic_inc(&mm->mm_count); + if (wakeup) + wake_up_interruptible(&khugepaged_wait); + + return 0; +} + +int khugepaged_enter_vma_merge(struct vm_area_struct *vma) +{ + unsigned long hstart, hend; + if (!vma->anon_vma) + /* + * Not yet faulted in so we will register later in the + * page fault if needed. + */ + return 0; + if (vma->vm_ops) + /* khugepaged not yet working on file or special mappings */ + return 0; + /* + * If is_pfn_mapping() is true is_learn_pfn_mapping() must be + * true too, verify it here. 
+ */ + VM_BUG_ON(is_linear_pfn_mapping(vma) || vma->vm_flags & VM_NO_THP); + hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; + hend = vma->vm_end & HPAGE_PMD_MASK; + if (hstart < hend) + return khugepaged_enter(vma); + return 0; +} + +void __khugepaged_exit(struct mm_struct *mm) +{ + struct mm_slot *mm_slot; + int free = 0; + + spin_lock(&khugepaged_mm_lock); + mm_slot = get_mm_slot(mm); + if (mm_slot && khugepaged_scan.mm_slot != mm_slot) { + hlist_del(&mm_slot->hash); + list_del(&mm_slot->mm_node); + free = 1; + } + + if (free) { + spin_unlock(&khugepaged_mm_lock); + clear_bit(MMF_VM_HUGEPAGE, &mm->flags); + free_mm_slot(mm_slot); + mmdrop(mm); + } else if (mm_slot) { + spin_unlock(&khugepaged_mm_lock); + /* + * This is required to serialize against + * khugepaged_test_exit() (which is guaranteed to run + * under mmap sem read mode). Stop here (after we + * return all pagetables will be destroyed) until + * khugepaged has finished working on the pagetables + * under the mmap_sem. 
+ */ + down_write(&mm->mmap_sem); + up_write(&mm->mmap_sem); + } else + spin_unlock(&khugepaged_mm_lock); +} + +static void release_pte_page(struct page *page) +{ + /* 0 stands for page_is_file_cache(page) == false */ + dec_zone_page_state(page, NR_ISOLATED_ANON + 0); + unlock_page(page); + putback_lru_page(page); +} + +static void release_pte_pages(pte_t *pte, pte_t *_pte) +{ + while (--_pte >= pte) { + pte_t pteval = *_pte; + if (!pte_none(pteval)) + release_pte_page(pte_page(pteval)); + } +} + +static void release_all_pte_pages(pte_t *pte) +{ + release_pte_pages(pte, pte + HPAGE_PMD_NR); +} + +static int __collapse_huge_page_isolate(struct vm_area_struct *vma, + unsigned long address, + pte_t *pte) +{ + struct page *page; + pte_t *_pte; + int referenced = 0, isolated = 0, none = 0; + for (_pte = pte; _pte < pte+HPAGE_PMD_NR; + _pte++, address += PAGE_SIZE) { + pte_t pteval = *_pte; + if (pte_none(pteval)) { + if (++none <= khugepaged_max_ptes_none) + continue; + else { + release_pte_pages(pte, _pte); + goto out; + } + } + if (!pte_present(pteval) || !pte_write(pteval)) { + release_pte_pages(pte, _pte); + goto out; + } + page = vm_normal_page(vma, address, pteval); + if (unlikely(!page)) { + release_pte_pages(pte, _pte); + goto out; + } + VM_BUG_ON(PageCompound(page)); + BUG_ON(!PageAnon(page)); + VM_BUG_ON(!PageSwapBacked(page)); + + /* cannot use mapcount: can't collapse if there's a gup pin */ + if (page_count(page) != 1) { + release_pte_pages(pte, _pte); + goto out; + } + /* + * We can do it before isolate_lru_page because the + * page can't be freed from under us. NOTE: PG_lock + * is needed to serialize against split_huge_page + * when invoked from the VM. + */ + if (!trylock_page(page)) { + release_pte_pages(pte, _pte); + goto out; + } + /* + * Isolate the page to avoid collapsing an hugepage + * currently in use by the VM. 
+ */ + if (isolate_lru_page(page)) { + unlock_page(page); + release_pte_pages(pte, _pte); + goto out; + } + /* 0 stands for page_is_file_cache(page) == false */ + inc_zone_page_state(page, NR_ISOLATED_ANON + 0); + VM_BUG_ON(!PageLocked(page)); + VM_BUG_ON(PageLRU(page)); + + /* If there is no mapped pte young don't collapse the page */ + if (pte_young(pteval) || PageReferenced(page) || + mmu_notifier_test_young(vma->vm_mm, address)) + referenced = 1; + } + if (unlikely(!referenced)) + release_all_pte_pages(pte); + else + isolated = 1; +out: + return isolated; +} + +static void __collapse_huge_page_copy(pte_t *pte, struct page *page, + struct vm_area_struct *vma, + unsigned long address, + spinlock_t *ptl) +{ + pte_t *_pte; + for (_pte = pte; _pte < pte+HPAGE_PMD_NR; _pte++) { + pte_t pteval = *_pte; + struct page *src_page; + + if (pte_none(pteval)) { + clear_user_highpage(page, address); + add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1); + } else { + src_page = pte_page(pteval); + copy_user_highpage(page, src_page, address, vma); + VM_BUG_ON(page_mapcount(src_page) != 1); + VM_BUG_ON(page_count(src_page) != 2); + release_pte_page(src_page); + /* + * ptl mostly unnecessary, but preempt has to + * be disabled to update the per-cpu stats + * inside page_remove_rmap(). + */ + spin_lock(ptl); + /* + * paravirt calls inside pte_clear here are + * superfluous. 
+ */ + pte_clear(vma->vm_mm, address, _pte); + page_remove_rmap(src_page); + spin_unlock(ptl); + free_page_and_swap_cache(src_page); + } + + address += PAGE_SIZE; + page++; + } +} + +static void collapse_huge_page(struct mm_struct *mm, + unsigned long address, + struct page **hpage, + struct vm_area_struct *vma, + int node) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd, _pmd; + pte_t *pte; + pgtable_t pgtable; + struct page *new_page; + spinlock_t *ptl; + int isolated; + unsigned long hstart, hend; + + VM_BUG_ON(address & ~HPAGE_PMD_MASK); +#ifndef CONFIG_NUMA + VM_BUG_ON(!*hpage); + new_page = *hpage; + if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { + up_read(&mm->mmap_sem); + return; + } +#else + VM_BUG_ON(*hpage); + /* + * Allocate the page while the vma is still valid and under + * the mmap_sem read mode so there is no memory allocation + * later when we take the mmap_sem in write mode. This is more + * friendly behavior (OTOH it may actually hide bugs) to + * filesystems in userland with daemons allocating memory in + * the userland I/O paths. Allocating memory with the + * mmap_sem in read mode is good idea also to allow greater + * scalability. + */ + new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address, + node, __GFP_OTHER_NODE); + if (unlikely(!new_page)) { + up_read(&mm->mmap_sem); + count_vm_event(THP_COLLAPSE_ALLOC_FAILED); + *hpage = ERR_PTR(-ENOMEM); + return; + } + count_vm_event(THP_COLLAPSE_ALLOC); + if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { + up_read(&mm->mmap_sem); + put_page(new_page); + return; + } +#endif + + /* after allocating the hugepage upgrade to mmap_sem write mode */ + up_read(&mm->mmap_sem); + + /* + * Prevent all access to pagetables with the exception of + * gup_fast later hanlded by the ptep_clear_flush and the VM + * handled by the anon_vma lock + PG_lock. 
+ */ + down_write(&mm->mmap_sem); + if (unlikely(khugepaged_test_exit(mm))) + goto out; + + /* + * mmap_sem was released by up_read() and only now re-taken in + * write mode, so the vma looked up by the caller may be gone: + * bail out if find_vma() finds nothing for this address. + */ + vma = find_vma(mm, address); + if (unlikely(!vma)) + goto out; + hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; + hend = vma->vm_end & HPAGE_PMD_MASK; + if (address < hstart || address + HPAGE_PMD_SIZE > hend) + goto out; + + if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) || + (vma->vm_flags & VM_NOHUGEPAGE)) + goto out; + + if (!vma->anon_vma || vma->vm_ops) + goto out; + if (is_vma_temporary_stack(vma)) + goto out; + /* + * If is_pfn_mapping() is true is_linear_pfn_mapping() must be + * true too, verify it here. + */ + VM_BUG_ON(is_linear_pfn_mapping(vma) || vma->vm_flags & VM_NO_THP); + + pgd = pgd_offset(mm, address); + if (!pgd_present(*pgd)) + goto out; + + pud = pud_offset(pgd, address); + if (!pud_present(*pud)) + goto out; + + pmd = pmd_offset(pud, address); + /* pmd can't go away or become huge under us */ + if (!pmd_present(*pmd) || pmd_trans_huge(*pmd)) + goto out; + + anon_vma_lock(vma->anon_vma); + + pte = pte_offset_map(pmd, address); + ptl = pte_lockptr(mm, pmd); + + spin_lock(&mm->page_table_lock); /* probably unnecessary */ + /* + * After this gup_fast can't run anymore. This also removes + * any huge TLB entry from the CPU so we won't allow + * huge and small TLB entries for the same virtual address + * to avoid the risk of CPU bugs in that area. + */ + _pmd = pmdp_clear_flush_notify(vma, address, pmd); + spin_unlock(&mm->page_table_lock); + + spin_lock(ptl); + isolated = __collapse_huge_page_isolate(vma, address, pte); + spin_unlock(ptl); + + if (unlikely(!isolated)) { + pte_unmap(pte); + spin_lock(&mm->page_table_lock); + BUG_ON(!pmd_none(*pmd)); + set_pmd_at(mm, address, pmd, _pmd); + spin_unlock(&mm->page_table_lock); + anon_vma_unlock(vma->anon_vma); + goto out; + } + + /* + * All pages are isolated and locked so anon_vma rmap + * can't run anymore.
+ */ + anon_vma_unlock(vma->anon_vma); + + __collapse_huge_page_copy(pte, new_page, vma, address, ptl); + pte_unmap(pte); + __SetPageUptodate(new_page); + pgtable = pmd_pgtable(_pmd); + VM_BUG_ON(page_count(pgtable) != 1); + VM_BUG_ON(page_mapcount(pgtable) != 0); + + _pmd = mk_pmd(new_page, vma->vm_page_prot); + _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma); + _pmd = pmd_mkhuge(_pmd); + + /* + * spin_lock() below is not the equivalent of smp_wmb(), so + * this is needed to avoid the copy_huge_page writes to become + * visible after the set_pmd_at() write. + */ + smp_wmb(); + + spin_lock(&mm->page_table_lock); + BUG_ON(!pmd_none(*pmd)); + page_add_new_anon_rmap(new_page, vma, address); + set_pmd_at(mm, address, pmd, _pmd); + update_mmu_cache(vma, address, entry); + prepare_pmd_huge_pte(pgtable, mm); + mm->nr_ptes--; + spin_unlock(&mm->page_table_lock); + +#ifndef CONFIG_NUMA + *hpage = NULL; +#endif + khugepaged_pages_collapsed++; +out_up_write: + up_write(&mm->mmap_sem); + return; + +out: + mem_cgroup_uncharge_page(new_page); +#ifdef CONFIG_NUMA + put_page(new_page); +#endif + goto out_up_write; +} + +static int khugepaged_scan_pmd(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long address, + struct page **hpage) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte, *_pte; + int ret = 0, referenced = 0, none = 0; + struct page *page; + unsigned long _address; + spinlock_t *ptl; + int node = -1; + + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + + pgd = pgd_offset(mm, address); + if (!pgd_present(*pgd)) + goto out; + + pud = pud_offset(pgd, address); + if (!pud_present(*pud)) + goto out; + + pmd = pmd_offset(pud, address); + if (!pmd_present(*pmd) || pmd_trans_huge(*pmd)) + goto out; + + pte = pte_offset_map_lock(mm, pmd, address, &ptl); + for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR; + _pte++, _address += PAGE_SIZE) { + pte_t pteval = *_pte; + if (pte_none(pteval)) { + if (++none <= khugepaged_max_ptes_none) + continue; + else + 
goto out_unmap; + } + if (!pte_present(pteval) || !pte_write(pteval)) + goto out_unmap; + page = vm_normal_page(vma, _address, pteval); + if (unlikely(!page)) + goto out_unmap; + /* + * Chose the node of the first page. This could + * be more sophisticated and look at more pages, + * but isn't for now. + */ + if (node == -1) + node = page_to_nid(page); + VM_BUG_ON(PageCompound(page)); + if (!PageLRU(page) || PageLocked(page) || !PageAnon(page)) + goto out_unmap; + /* cannot use mapcount: can't collapse if there's a gup pin */ + if (page_count(page) != 1) + goto out_unmap; + if (pte_young(pteval) || PageReferenced(page) || + mmu_notifier_test_young(vma->vm_mm, address)) + referenced = 1; + } + if (referenced) + ret = 1; +out_unmap: + pte_unmap_unlock(pte, ptl); + if (ret) + /* collapse_huge_page will return with the mmap_sem released */ + collapse_huge_page(mm, address, hpage, vma, node); +out: + return ret; +} + +static void collect_mm_slot(struct mm_slot *mm_slot) +{ + struct mm_struct *mm = mm_slot->mm; + + VM_BUG_ON(!spin_is_locked(&khugepaged_mm_lock)); + + if (khugepaged_test_exit(mm)) { + /* free mm_slot */ + hlist_del(&mm_slot->hash); + list_del(&mm_slot->mm_node); + + /* + * Not strictly needed because the mm exited already. 
+ * + * clear_bit(MMF_VM_HUGEPAGE, &mm->flags); + */ + + /* khugepaged_mm_lock actually not necessary for the below */ + free_mm_slot(mm_slot); + mmdrop(mm); + } +} + +static unsigned int khugepaged_scan_mm_slot(unsigned int pages, + struct page **hpage) +{ + struct mm_slot *mm_slot; + struct mm_struct *mm; + struct vm_area_struct *vma; + int progress = 0; + + VM_BUG_ON(!pages); + VM_BUG_ON(!spin_is_locked(&khugepaged_mm_lock)); + + if (khugepaged_scan.mm_slot) + mm_slot = khugepaged_scan.mm_slot; + else { + mm_slot = list_entry(khugepaged_scan.mm_head.next, + struct mm_slot, mm_node); + khugepaged_scan.address = 0; + khugepaged_scan.mm_slot = mm_slot; + } + spin_unlock(&khugepaged_mm_lock); + + mm = mm_slot->mm; + down_read(&mm->mmap_sem); + if (unlikely(khugepaged_test_exit(mm))) + vma = NULL; + else + vma = find_vma(mm, khugepaged_scan.address); + + progress++; + for (; vma; vma = vma->vm_next) { + unsigned long hstart, hend; + + cond_resched(); + if (unlikely(khugepaged_test_exit(mm))) { + progress++; + break; + } + + if ((!(vma->vm_flags & VM_HUGEPAGE) && + !khugepaged_always()) || + (vma->vm_flags & VM_NOHUGEPAGE)) { + skip: + progress++; + continue; + } + if (!vma->anon_vma || vma->vm_ops) + goto skip; + if (is_vma_temporary_stack(vma)) + goto skip; + /* + * If is_pfn_mapping() is true is_learn_pfn_mapping() + * must be true too, verify it here. 
+ */ + VM_BUG_ON(is_linear_pfn_mapping(vma) || + vma->vm_flags & VM_NO_THP); + + hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; + hend = vma->vm_end & HPAGE_PMD_MASK; + if (hstart >= hend) + goto skip; + if (khugepaged_scan.address > hend) + goto skip; + if (khugepaged_scan.address < hstart) + khugepaged_scan.address = hstart; + VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK); + + while (khugepaged_scan.address < hend) { + int ret; + cond_resched(); + if (unlikely(khugepaged_test_exit(mm))) + goto breakouterloop; + + VM_BUG_ON(khugepaged_scan.address < hstart || + khugepaged_scan.address + HPAGE_PMD_SIZE > + hend); + ret = khugepaged_scan_pmd(mm, vma, + khugepaged_scan.address, + hpage); + /* move to next address */ + khugepaged_scan.address += HPAGE_PMD_SIZE; + progress += HPAGE_PMD_NR; + if (ret) + /* we released mmap_sem so break loop */ + goto breakouterloop_mmap_sem; + if (progress >= pages) + goto breakouterloop; + } + } +breakouterloop: + up_read(&mm->mmap_sem); /* exit_mmap will destroy ptes after this */ +breakouterloop_mmap_sem: + + spin_lock(&khugepaged_mm_lock); + VM_BUG_ON(khugepaged_scan.mm_slot != mm_slot); + /* + * Release the current mm_slot if this mm is about to die, or + * if we scanned all vmas of this mm. + */ + if (khugepaged_test_exit(mm) || !vma) { + /* + * Make sure that if mm_users is reaching zero while + * khugepaged runs here, khugepaged_exit will find + * mm_slot not pointing to the exiting mm. 
+ */ + if (mm_slot->mm_node.next != &khugepaged_scan.mm_head) { + khugepaged_scan.mm_slot = list_entry( + mm_slot->mm_node.next, + struct mm_slot, mm_node); + khugepaged_scan.address = 0; + } else { + khugepaged_scan.mm_slot = NULL; + khugepaged_full_scans++; + } + + collect_mm_slot(mm_slot); + } + + return progress; +} + +static int khugepaged_has_work(void) +{ + return !list_empty(&khugepaged_scan.mm_head) && + khugepaged_enabled(); +} + +static int khugepaged_wait_event(void) +{ + return !list_empty(&khugepaged_scan.mm_head) || + !khugepaged_enabled(); +} + +static void khugepaged_do_scan(struct page **hpage) +{ + unsigned int progress = 0, pass_through_head = 0; + unsigned int pages = khugepaged_pages_to_scan; + + barrier(); /* write khugepaged_pages_to_scan to local stack */ + + while (progress < pages) { + cond_resched(); + +#ifndef CONFIG_NUMA + if (!*hpage) { + *hpage = alloc_hugepage(khugepaged_defrag()); + if (unlikely(!*hpage)) { + count_vm_event(THP_COLLAPSE_ALLOC_FAILED); + break; + } + count_vm_event(THP_COLLAPSE_ALLOC); + } +#else + if (IS_ERR(*hpage)) + break; +#endif + + if (unlikely(kthread_should_stop() || freezing(current))) + break; + + spin_lock(&khugepaged_mm_lock); + if (!khugepaged_scan.mm_slot) + pass_through_head++; + if (khugepaged_has_work() && + pass_through_head < 2) + progress += khugepaged_scan_mm_slot(pages - progress, + hpage); + else + progress = pages; + spin_unlock(&khugepaged_mm_lock); + } +} + +static void khugepaged_alloc_sleep(void) +{ + DEFINE_WAIT(wait); + add_wait_queue(&khugepaged_wait, &wait); + schedule_timeout_interruptible( + msecs_to_jiffies( + khugepaged_alloc_sleep_millisecs)); + remove_wait_queue(&khugepaged_wait, &wait); +} + +#ifndef CONFIG_NUMA +static struct page *khugepaged_alloc_hugepage(void) +{ + struct page *hpage; + + do { + hpage = alloc_hugepage(khugepaged_defrag()); + if (!hpage) { + count_vm_event(THP_COLLAPSE_ALLOC_FAILED); + khugepaged_alloc_sleep(); + } else + count_vm_event(THP_COLLAPSE_ALLOC); 
+ } while (unlikely(!hpage) && + likely(khugepaged_enabled())); + return hpage; +} +#endif + +static void khugepaged_loop(void) +{ + struct page *hpage; + +#ifdef CONFIG_NUMA + hpage = NULL; +#endif + while (likely(khugepaged_enabled())) { +#ifndef CONFIG_NUMA + hpage = khugepaged_alloc_hugepage(); + if (unlikely(!hpage)) { + count_vm_event(THP_COLLAPSE_ALLOC_FAILED); + break; + } + count_vm_event(THP_COLLAPSE_ALLOC); +#else + if (IS_ERR(hpage)) { + khugepaged_alloc_sleep(); + hpage = NULL; + } +#endif + + khugepaged_do_scan(&hpage); +#ifndef CONFIG_NUMA + if (hpage) + put_page(hpage); +#endif + try_to_freeze(); + if (unlikely(kthread_should_stop())) + break; + if (khugepaged_has_work()) { + DEFINE_WAIT(wait); + if (!khugepaged_scan_sleep_millisecs) + continue; + add_wait_queue(&khugepaged_wait, &wait); + schedule_timeout_interruptible( + msecs_to_jiffies( + khugepaged_scan_sleep_millisecs)); + remove_wait_queue(&khugepaged_wait, &wait); + } else if (khugepaged_enabled()) + wait_event_freezable(khugepaged_wait, + khugepaged_wait_event()); + } +} + +static int khugepaged(void *none) +{ + struct mm_slot *mm_slot; + + set_freezable(); + set_user_nice(current, 19); + + /* serialize with start_khugepaged() */ + mutex_lock(&khugepaged_mutex); + + for (;;) { + mutex_unlock(&khugepaged_mutex); + VM_BUG_ON(khugepaged_thread != current); + khugepaged_loop(); + VM_BUG_ON(khugepaged_thread != current); + + mutex_lock(&khugepaged_mutex); + if (!khugepaged_enabled()) + break; + if (unlikely(kthread_should_stop())) + break; + } + + spin_lock(&khugepaged_mm_lock); + mm_slot = khugepaged_scan.mm_slot; + khugepaged_scan.mm_slot = NULL; + if (mm_slot) + collect_mm_slot(mm_slot); + spin_unlock(&khugepaged_mm_lock); + + khugepaged_thread = NULL; + mutex_unlock(&khugepaged_mutex); + + return 0; +} + +void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd) +{ + struct page *page; + + spin_lock(&mm->page_table_lock); + if (unlikely(!pmd_trans_huge(*pmd))) { + 
spin_unlock(&mm->page_table_lock); + return; + } + page = pmd_page(*pmd); + VM_BUG_ON(!page_count(page)); + get_page(page); + spin_unlock(&mm->page_table_lock); + + split_huge_page(page); + + put_page(page); + BUG_ON(pmd_trans_huge(*pmd)); +} + +static void split_huge_page_address(struct mm_struct *mm, + unsigned long address) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + + VM_BUG_ON(!(address & ~HPAGE_PMD_MASK)); + + pgd = pgd_offset(mm, address); + if (!pgd_present(*pgd)) + return; + + pud = pud_offset(pgd, address); + if (!pud_present(*pud)) + return; + + pmd = pmd_offset(pud, address); + if (!pmd_present(*pmd)) + return; + /* + * Caller holds the mmap_sem write mode, so a huge pmd cannot + * materialize from under us. + */ + split_huge_page_pmd(mm, pmd); +} + +void __vma_adjust_trans_huge(struct vm_area_struct *vma, + unsigned long start, + unsigned long end, + long adjust_next) +{ + /* + * If the new start address isn't hpage aligned and it could + * previously contain an hugepage: check if we need to split + * an huge pmd. + */ + if (start & ~HPAGE_PMD_MASK && + (start & HPAGE_PMD_MASK) >= vma->vm_start && + (start & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end) + split_huge_page_address(vma->vm_mm, start); + + /* + * If the new end address isn't hpage aligned and it could + * previously contain an hugepage: check if we need to split + * an huge pmd. + */ + if (end & ~HPAGE_PMD_MASK && + (end & HPAGE_PMD_MASK) >= vma->vm_start && + (end & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end) + split_huge_page_address(vma->vm_mm, end); + + /* + * If we're also updating the vma->vm_next->vm_start, if the new + * vm_next->vm_start isn't page aligned and it could previously + * contain an hugepage: check if we need to split an huge pmd. 
+ */ + if (adjust_next > 0) { + struct vm_area_struct *next = vma->vm_next; + unsigned long nstart = next->vm_start; + nstart += adjust_next << PAGE_SHIFT; + if (nstart & ~HPAGE_PMD_MASK && + (nstart & HPAGE_PMD_MASK) >= next->vm_start && + (nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end) + split_huge_page_address(next->vm_mm, nstart); + } +} diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 85855240933d..8ee3bd8ec5b5 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -146,7 +146,7 @@ static long region_chg(struct list_head *head, long f, long t) if (rg->from > t) return chg; - /* We overlap with this area, if it extends futher than + /* We overlap with this area, if it extends further than * us then we must extend ourselves. Account for its * existing reservation. */ if (rg->to > t) { @@ -394,71 +394,6 @@ static int vma_has_reserves(struct vm_area_struct *vma) return 0; } -static void clear_gigantic_page(struct page *page, - unsigned long addr, unsigned long sz) -{ - int i; - struct page *p = page; - - might_sleep(); - for (i = 0; i < sz/PAGE_SIZE; i++, p = mem_map_next(p, page, i)) { - cond_resched(); - clear_user_highpage(p, addr + i * PAGE_SIZE); - } -} -static void clear_huge_page(struct page *page, - unsigned long addr, unsigned long sz) -{ - int i; - - if (unlikely(sz/PAGE_SIZE > MAX_ORDER_NR_PAGES)) { - clear_gigantic_page(page, addr, sz); - return; - } - - might_sleep(); - for (i = 0; i < sz/PAGE_SIZE; i++) { - cond_resched(); - clear_user_highpage(page + i, addr + i * PAGE_SIZE); - } -} - -static void copy_user_gigantic_page(struct page *dst, struct page *src, - unsigned long addr, struct vm_area_struct *vma) -{ - int i; - struct hstate *h = hstate_vma(vma); - struct page *dst_base = dst; - struct page *src_base = src; - - for (i = 0; i < pages_per_huge_page(h); ) { - cond_resched(); - copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma); - - i++; - dst = mem_map_next(dst, dst_base, i); - src = mem_map_next(src, src_base, i); - } -} - -static void 
copy_user_huge_page(struct page *dst, struct page *src, - unsigned long addr, struct vm_area_struct *vma) -{ - int i; - struct hstate *h = hstate_vma(vma); - - if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) { - copy_user_gigantic_page(dst, src, addr, vma); - return; - } - - might_sleep(); - for (i = 0; i < pages_per_huge_page(h); i++) { - cond_resched(); - copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma); - } -} - static void copy_gigantic_page(struct page *dst, struct page *src) { int i; @@ -907,7 +842,7 @@ struct page *alloc_huge_page_node(struct hstate *h, int nid) } /* - * Increase the hugetlb pool such that it can accomodate a reservation + * Increase the hugetlb pool such that it can accommodate a reservation * of size 'delta'. */ static int gather_surplus_pages(struct hstate *h, int delta) @@ -955,7 +890,7 @@ retry: /* * The surplus_list now contains _at_least_ the number of extra pages - * needed to accomodate the reservation. Add the appropriate number + * needed to accommodate the reservation. Add the appropriate number * of pages to the hugetlb pool and free the extras back to the buddy * allocator. 
Commit the entire reservation here to prevent another * process from stealing the pages as they are added to the pool but @@ -1428,6 +1363,7 @@ static ssize_t nr_hugepages_show_common(struct kobject *kobj, return sprintf(buf, "%lu\n", nr_huge_pages); } + static ssize_t nr_hugepages_store_common(bool obey_mempolicy, struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t len) @@ -1440,9 +1376,14 @@ static ssize_t nr_hugepages_store_common(bool obey_mempolicy, err = strict_strtoul(buf, 10, &count); if (err) - return 0; + goto out; h = kobj_to_hstate(kobj, &nid); + if (h->order >= MAX_ORDER) { + err = -EINVAL; + goto out; + } + if (nid == NUMA_NO_NODE) { /* * global hstate attribute @@ -1468,6 +1409,9 @@ static ssize_t nr_hugepages_store_common(bool obey_mempolicy, NODEMASK_FREE(nodes_allowed); return len; +out: + NODEMASK_FREE(nodes_allowed); + return err; } static ssize_t nr_hugepages_show(struct kobject *kobj, @@ -1510,6 +1454,7 @@ static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj, struct hstate *h = kobj_to_hstate(kobj, NULL); return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages); } + static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { @@ -1517,9 +1462,12 @@ static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj, unsigned long input; struct hstate *h = kobj_to_hstate(kobj, NULL); + if (h->order >= MAX_ORDER) + return -EINVAL; + err = strict_strtoul(buf, 10, &input); if (err) - return 0; + return err; spin_lock(&hugetlb_lock); h->nr_overcommit_huge_pages = input; @@ -1922,13 +1870,18 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy, { struct hstate *h = &default_hstate; unsigned long tmp; + int ret; + + tmp = h->max_huge_pages; - if (!write) - tmp = h->max_huge_pages; + if (write && h->order >= MAX_ORDER) + return -EINVAL; table->data = &tmp; table->maxlen = sizeof(unsigned long); - proc_doulongvec_minmax(table, write, buffer, 
length, ppos); + ret = proc_doulongvec_minmax(table, write, buffer, length, ppos); + if (ret) + goto out; if (write) { NODEMASK_ALLOC(nodemask_t, nodes_allowed, @@ -1943,8 +1896,8 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy, if (nodes_allowed != &node_states[N_HIGH_MEMORY]) NODEMASK_FREE(nodes_allowed); } - - return 0; +out: + return ret; } int hugetlb_sysctl_handler(struct ctl_table *table, int write, @@ -1982,21 +1935,26 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write, { struct hstate *h = &default_hstate; unsigned long tmp; + int ret; - if (!write) - tmp = h->nr_overcommit_huge_pages; + tmp = h->nr_overcommit_huge_pages; + + if (write && h->order >= MAX_ORDER) + return -EINVAL; table->data = &tmp; table->maxlen = sizeof(unsigned long); - proc_doulongvec_minmax(table, write, buffer, length, ppos); + ret = proc_doulongvec_minmax(table, write, buffer, length, ppos); + if (ret) + goto out; if (write) { spin_lock(&hugetlb_lock); h->nr_overcommit_huge_pages = tmp; spin_unlock(&hugetlb_lock); } - - return 0; +out: + return ret; } #endif /* CONFIG_SYSCTL */ @@ -2085,7 +2043,7 @@ static void hugetlb_vm_op_open(struct vm_area_struct *vma) * This new VMA should share its siblings reservation map if present. * The VMA will only ever have a valid reservation map pointer where * it is being copied for another still existing VMA. As that VMA - * has a reference to the reservation map it cannot dissappear until + * has a reference to the reservation map it cannot disappear until * after this open call completes. It is therefore safe to take a * new reference here without additional locking. 
*/ @@ -2454,7 +2412,8 @@ retry_avoidcopy: return VM_FAULT_OOM; } - copy_user_huge_page(new_page, old_page, address, vma); + copy_user_huge_page(new_page, old_page, address, vma, + pages_per_huge_page(h)); __SetPageUptodate(new_page); /* @@ -2531,7 +2490,7 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, /* * Currently, we are forced to kill the process in the event the * original mapper has unmapped pages from the child due to a failed - * COW. Warn that such a situation has occured as it may not be obvious + * COW. Warn that such a situation has occurred as it may not be obvious */ if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) { printk(KERN_WARNING @@ -2558,7 +2517,7 @@ retry: ret = -PTR_ERR(page); goto out; } - clear_huge_page(page, address, huge_page_size(h)); + clear_huge_page(page, address, pages_per_huge_page(h)); __SetPageUptodate(page); if (vma->vm_flags & VM_MAYSHARE) { diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c index 0948f1072d6b..c7fc7fd00e32 100644 --- a/mm/hwpoison-inject.c +++ b/mm/hwpoison-inject.c @@ -1,4 +1,4 @@ -/* Inject a hwpoison memory failure on a arbitary pfn */ +/* Inject a hwpoison memory failure on a arbitrary pfn */ #include <linux/module.h> #include <linux/debugfs.h> #include <linux/kernel.h> diff --git a/mm/internal.h b/mm/internal.h index dedb0aff673f..9d0ced8e505e 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -134,6 +134,10 @@ static inline void mlock_migrate_page(struct page *newpage, struct page *page) } } +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +extern unsigned long vma_address(struct page *page, + struct vm_area_struct *vma); +#endif #else /* !CONFIG_MMU */ static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p) { @@ -158,7 +162,7 @@ static inline struct page *mem_map_offset(struct page *base, int offset) } /* - * Iterator over all subpages withing the maximally aligned gigantic + * Iterator over all subpages within the maximally aligned gigantic * page 'base'. 
Handle any discontiguity in the mem_map. */ static inline struct page *mem_map_next(struct page *iter, @@ -241,10 +245,6 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn, } #endif /* CONFIG_SPARSEMEM */ -int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, int len, unsigned int foll_flags, - struct page **pages, struct vm_area_struct **vmas); - #define ZONE_RECLAIM_NOSCAN -2 #define ZONE_RECLAIM_FULL -1 #define ZONE_RECLAIM_SOME 0 diff --git a/mm/kmemleak-test.c b/mm/kmemleak-test.c index 177a5169bbde..ff0d9779cec8 100644 --- a/mm/kmemleak-test.c +++ b/mm/kmemleak-test.c @@ -75,13 +75,11 @@ static int __init kmemleak_test_init(void) * after the module is removed. */ for (i = 0; i < 10; i++) { - elem = kmalloc(sizeof(*elem), GFP_KERNEL); - pr_info("kmemleak: kmalloc(sizeof(*elem)) = %p\n", elem); + elem = kzalloc(sizeof(*elem), GFP_KERNEL); + pr_info("kmemleak: kzalloc(sizeof(*elem)) = %p\n", elem); if (!elem) return -ENOMEM; - memset(elem, 0, sizeof(*elem)); INIT_LIST_HEAD(&elem->list); - list_add_tail(&elem->list, &test_list); } diff --git a/mm/kmemleak.c b/mm/kmemleak.c index bd9bc214091b..c1d5867543e4 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -113,7 +113,9 @@ #define BYTES_PER_POINTER sizeof(void *) /* GFP bitmask for kmemleak internal allocations */ -#define GFP_KMEMLEAK_MASK (GFP_KERNEL | GFP_ATOMIC) +#define gfp_kmemleak_mask(gfp) (((gfp) & (GFP_KERNEL | GFP_ATOMIC)) | \ + __GFP_NORETRY | __GFP_NOMEMALLOC | \ + __GFP_NOWARN) /* scanning area inside a memory block */ struct kmemleak_scan_area { @@ -263,7 +265,7 @@ static void kmemleak_disable(void); } while (0) /* - * Macro invoked when a serious kmemleak condition occured and cannot be + * Macro invoked when a serious kmemleak condition occurred and cannot be * recovered from. Kmemleak will be disabled and further allocation/freeing * tracing no longer available. 
*/ @@ -511,9 +513,10 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size, struct kmemleak_object *object; struct prio_tree_node *node; - object = kmem_cache_alloc(object_cache, gfp & GFP_KMEMLEAK_MASK); + object = kmem_cache_alloc(object_cache, gfp_kmemleak_mask(gfp)); if (!object) { - kmemleak_stop("Cannot allocate a kmemleak_object structure\n"); + pr_warning("Cannot allocate a kmemleak_object structure\n"); + kmemleak_disable(); return NULL; } @@ -734,9 +737,9 @@ static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp) return; } - area = kmem_cache_alloc(scan_area_cache, gfp & GFP_KMEMLEAK_MASK); + area = kmem_cache_alloc(scan_area_cache, gfp_kmemleak_mask(gfp)); if (!area) { - kmemleak_warn("Cannot allocate a scan area\n"); + pr_warning("Cannot allocate a scan area\n"); goto out; } @@ -1003,7 +1006,7 @@ static bool update_checksum(struct kmemleak_object *object) /* * Memory scanning is a long process and it needs to be interruptable. This - * function checks whether such interrupt condition occured. + * function checks whether such interrupt condition occurred. */ static int scan_should_stop(void) { @@ -1730,7 +1733,7 @@ static int __init kmemleak_late_init(void) if (atomic_read(&kmemleak_error)) { /* - * Some error occured and kmemleak was disabled. There is a + * Some error occurred and kmemleak was disabled. There is a * small chance that kmemleak_disable() was called immediately * after setting kmemleak_initialized and we may end up with * two clean-up threads but serialized by scan_mutex. 
@@ -34,6 +34,7 @@ #include <linux/swap.h> #include <linux/ksm.h> #include <linux/hash.h> +#include <linux/freezer.h> #include <asm/tlbflush.h> #include "internal.h" @@ -300,20 +301,6 @@ static inline int in_stable_tree(struct rmap_item *rmap_item) return rmap_item->address & STABLE_FLAG; } -static void hold_anon_vma(struct rmap_item *rmap_item, - struct anon_vma *anon_vma) -{ - rmap_item->anon_vma = anon_vma; - get_anon_vma(anon_vma); -} - -static void ksm_drop_anon_vma(struct rmap_item *rmap_item) -{ - struct anon_vma *anon_vma = rmap_item->anon_vma; - - drop_anon_vma(anon_vma); -} - /* * ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's * page tables after it has passed through ksm_exit() - which, if necessary, @@ -396,7 +383,7 @@ static void break_cow(struct rmap_item *rmap_item) * It is not an accident that whenever we want to break COW * to undo, we also need to drop a reference to the anon_vma. */ - ksm_drop_anon_vma(rmap_item); + put_anon_vma(rmap_item->anon_vma); down_read(&mm->mmap_sem); if (ksm_test_exit(mm)) @@ -411,6 +398,20 @@ out: up_read(&mm->mmap_sem); } +static struct page *page_trans_compound_anon(struct page *page) +{ + if (PageTransCompound(page)) { + struct page *head = compound_trans_head(page); + /* + * head may actually be splitted and freed from under + * us but it's ok here. 
+ */ + if (PageAnon(head)) + return head; + } + return NULL; +} + static struct page *get_mergeable_page(struct rmap_item *rmap_item) { struct mm_struct *mm = rmap_item->mm; @@ -430,7 +431,7 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item) page = follow_page(vma, addr, FOLL_GET); if (IS_ERR_OR_NULL(page)) goto out; - if (PageAnon(page)) { + if (PageAnon(page) || page_trans_compound_anon(page)) { flush_anon_page(vma, page, addr); flush_dcache_page(page); } else { @@ -451,7 +452,7 @@ static void remove_node_from_stable_tree(struct stable_node *stable_node) ksm_pages_sharing--; else ksm_pages_shared--; - ksm_drop_anon_vma(rmap_item); + put_anon_vma(rmap_item->anon_vma); rmap_item->address &= PAGE_MASK; cond_resched(); } @@ -539,7 +540,7 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item) else ksm_pages_shared--; - ksm_drop_anon_vma(rmap_item); + put_anon_vma(rmap_item->anon_vma); rmap_item->address &= PAGE_MASK; } else if (rmap_item->address & UNSTABLE_FLAG) { @@ -708,6 +709,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, if (addr == -EFAULT) goto out; + BUG_ON(PageTransCompound(page)); ptep = page_check_address(page, mm, addr, &ptl, 0); if (!ptep) goto out; @@ -718,7 +720,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, swapped = PageSwapCache(page); flush_cache_page(vma, addr, page_to_pfn(page)); /* - * Ok this is tricky, when get_user_pages_fast() run it doesnt + * Ok this is tricky, when get_user_pages_fast() run it doesn't * take any lock, therefore the check that we are going to make * with the pagecount against the mapcount is racey and * O_DIRECT can happen right after the check. 
@@ -783,6 +785,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, goto out; pmd = pmd_offset(pud, addr); + BUG_ON(pmd_trans_huge(*pmd)); if (!pmd_present(*pmd)) goto out; @@ -800,6 +803,8 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot)); page_remove_rmap(page); + if (!page_mapped(page)) + try_to_free_swap(page); put_page(page); pte_unmap_unlock(ptep, ptl); @@ -808,6 +813,33 @@ out: return err; } +static int page_trans_compound_anon_split(struct page *page) +{ + int ret = 0; + struct page *transhuge_head = page_trans_compound_anon(page); + if (transhuge_head) { + /* Get the reference on the head to split it. */ + if (get_page_unless_zero(transhuge_head)) { + /* + * Recheck we got the reference while the head + * was still anonymous. + */ + if (PageAnon(transhuge_head)) + ret = split_huge_page(transhuge_head); + else + /* + * Retry later if split_huge_page run + * from under us. + */ + ret = 1; + put_page(transhuge_head); + } else + /* Retry later if split_huge_page run from under us. 
*/ + ret = 1; + } + return ret; +} + /* * try_to_merge_one_page - take two pages and merge them into one * @vma: the vma that holds the pte pointing to page @@ -828,6 +860,9 @@ static int try_to_merge_one_page(struct vm_area_struct *vma, if (!(vma->vm_flags & VM_MERGEABLE)) goto out; + if (PageTransCompound(page) && page_trans_compound_anon_split(page)) + goto out; + BUG_ON(PageTransCompound(page)); if (!PageAnon(page)) goto out; @@ -900,7 +935,8 @@ static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item, goto out; /* Must get reference to anon_vma while still holding mmap_sem */ - hold_anon_vma(rmap_item, vma->anon_vma); + rmap_item->anon_vma = vma->anon_vma; + get_anon_vma(vma->anon_vma); out: up_read(&mm->mmap_sem); return err; @@ -1247,6 +1283,18 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page) slot = ksm_scan.mm_slot; if (slot == &ksm_mm_head) { + /* + * A number of pages can hang around indefinitely on per-cpu + * pagevecs, raised page count preventing write_protect_page + * from merging them. Though it doesn't really matter much, + * it is puzzling to see some stuck in pages_volatile until + * other activity jostles them out, and they also prevented + * LTP's KSM test from succeeding deterministically; so drain + * them here (here rather than on entry to ksm_do_scan(), + * so we don't IPI too often when pages_to_scan is set low). 
+ */ + lru_add_drain_all(); + root_unstable_tree = RB_ROOT; spin_lock(&ksm_mmlist_lock); @@ -1277,7 +1325,13 @@ next_mm: if (ksm_test_exit(mm)) break; *page = follow_page(vma, ksm_scan.address, FOLL_GET); - if (!IS_ERR_OR_NULL(*page) && PageAnon(*page)) { + if (IS_ERR_OR_NULL(*page)) { + ksm_scan.address += PAGE_SIZE; + cond_resched(); + continue; + } + if (PageAnon(*page) || + page_trans_compound_anon(*page)) { flush_anon_page(vma, *page, ksm_scan.address); flush_dcache_page(*page); rmap_item = get_next_rmap_item(slot, @@ -1291,8 +1345,7 @@ next_mm: up_read(&mm->mmap_sem); return rmap_item; } - if (!IS_ERR_OR_NULL(*page)) - put_page(*page); + put_page(*page); ksm_scan.address += PAGE_SIZE; cond_resched(); } @@ -1352,7 +1405,7 @@ static void ksm_do_scan(unsigned int scan_npages) struct rmap_item *rmap_item; struct page *uninitialized_var(page); - while (scan_npages--) { + while (scan_npages-- && likely(!freezing(current))) { cond_resched(); rmap_item = scan_get_next_rmap_item(&page); if (!rmap_item) @@ -1370,6 +1423,7 @@ static int ksmd_should_run(void) static int ksm_scan_thread(void *nothing) { + set_freezable(); set_user_nice(current, 5); while (!kthread_should_stop()) { @@ -1378,11 +1432,13 @@ static int ksm_scan_thread(void *nothing) ksm_do_scan(ksm_thread_pages_to_scan); mutex_unlock(&ksm_thread_mutex); + try_to_freeze(); + if (ksmd_should_run()) { schedule_timeout_interruptible( msecs_to_jiffies(ksm_thread_sleep_millisecs)); } else { - wait_event_interruptible(ksm_thread_wait, + wait_event_freezable(ksm_thread_wait, ksmd_should_run() || kthread_should_stop()); } } diff --git a/mm/madvise.c b/mm/madvise.c index 319528b8db74..2221491ed503 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -71,6 +71,12 @@ static long madvise_behavior(struct vm_area_struct * vma, if (error) goto out; break; + case MADV_HUGEPAGE: + case MADV_NOHUGEPAGE: + error = hugepage_madvise(vma, &new_flags, behavior); + if (error) + goto out; + break; } if (new_flags == vma->vm_flags) { @@ 
-283,6 +289,10 @@ madvise_behavior_valid(int behavior) case MADV_MERGEABLE: case MADV_UNMERGEABLE: #endif +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + case MADV_HUGEPAGE: + case MADV_NOHUGEPAGE: +#endif return 1; default: diff --git a/mm/memblock.c b/mm/memblock.c index 400dc62697d7..a0562d1a6ad4 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -58,28 +58,6 @@ static unsigned long __init_memblock memblock_addrs_overlap(phys_addr_t base1, p return ((base1 < (base2 + size2)) && (base2 < (base1 + size1))); } -static long __init_memblock memblock_addrs_adjacent(phys_addr_t base1, phys_addr_t size1, - phys_addr_t base2, phys_addr_t size2) -{ - if (base2 == base1 + size1) - return 1; - else if (base1 == base2 + size2) - return -1; - - return 0; -} - -static long __init_memblock memblock_regions_adjacent(struct memblock_type *type, - unsigned long r1, unsigned long r2) -{ - phys_addr_t base1 = type->regions[r1].base; - phys_addr_t size1 = type->regions[r1].size; - phys_addr_t base2 = type->regions[r2].base; - phys_addr_t size2 = type->regions[r2].size; - - return memblock_addrs_adjacent(base1, size1, base2, size2); -} - long __init_memblock memblock_overlaps_region(struct memblock_type *type, phys_addr_t base, phys_addr_t size) { unsigned long i; @@ -137,8 +115,6 @@ static phys_addr_t __init_memblock memblock_find_base(phys_addr_t size, BUG_ON(0 == size); - size = memblock_align_up(size, align); - /* Pump up max_addr */ if (end == MEMBLOCK_ALLOC_ACCESSIBLE) end = memblock.current_limit; @@ -208,14 +184,13 @@ static void __init_memblock memblock_remove_region(struct memblock_type *type, u type->regions[i].size = type->regions[i + 1].size; } type->cnt--; -} -/* Assumption: base addr of region 1 < base addr of region 2 */ -static void __init_memblock memblock_coalesce_regions(struct memblock_type *type, - unsigned long r1, unsigned long r2) -{ - type->regions[r1].size += type->regions[r2].size; - memblock_remove_region(type, r2); + /* Special case for empty arrays */ + if 
(type->cnt == 0) { + type->cnt = 1; + type->regions[0].base = 0; + type->regions[0].size = 0; + } } /* Defined below but needed now */ @@ -278,7 +253,7 @@ static int __init_memblock memblock_double_array(struct memblock_type *type) return 0; /* Add the new reserved region now. Should not fail ! */ - BUG_ON(memblock_add_region(&memblock.reserved, addr, new_size) < 0); + BUG_ON(memblock_add_region(&memblock.reserved, addr, new_size)); /* If the array wasn't our static init one, then free it. We only do * that before SLAB is available as later on, we don't know whether @@ -298,58 +273,99 @@ extern int __init_memblock __weak memblock_memory_can_coalesce(phys_addr_t addr1 return 1; } -static long __init_memblock memblock_add_region(struct memblock_type *type, phys_addr_t base, phys_addr_t size) +static long __init_memblock memblock_add_region(struct memblock_type *type, + phys_addr_t base, phys_addr_t size) { - unsigned long coalesced = 0; - long adjacent, i; - - if ((type->cnt == 1) && (type->regions[0].size == 0)) { - type->regions[0].base = base; - type->regions[0].size = size; - return 0; - } + phys_addr_t end = base + size; + int i, slot = -1; - /* First try and coalesce this MEMBLOCK with another. 
*/ + /* First try and coalesce this MEMBLOCK with others */ for (i = 0; i < type->cnt; i++) { - phys_addr_t rgnbase = type->regions[i].base; - phys_addr_t rgnsize = type->regions[i].size; + struct memblock_region *rgn = &type->regions[i]; + phys_addr_t rend = rgn->base + rgn->size; - if ((rgnbase == base) && (rgnsize == size)) - /* Already have this region, so we're done */ + /* Exit if there's no possible hits */ + if (rgn->base > end || rgn->size == 0) + break; + + /* Check if we are fully enclosed within an existing + * block + */ + if (rgn->base <= base && rend >= end) return 0; - adjacent = memblock_addrs_adjacent(base, size, rgnbase, rgnsize); - /* Check if arch allows coalescing */ - if (adjacent != 0 && type == &memblock.memory && - !memblock_memory_can_coalesce(base, size, rgnbase, rgnsize)) - break; - if (adjacent > 0) { - type->regions[i].base -= size; - type->regions[i].size += size; - coalesced++; - break; - } else if (adjacent < 0) { - type->regions[i].size += size; - coalesced++; - break; + /* Check if we overlap or are adjacent with the bottom + * of a block. + */ + if (base < rgn->base && end >= rgn->base) { + /* If we can't coalesce, create a new block */ + if (!memblock_memory_can_coalesce(base, size, + rgn->base, + rgn->size)) { + /* Overlap & can't coalesce are mutually + * exclusive, if you do that, be prepared + * for trouble + */ + WARN_ON(end != rgn->base); + goto new_block; + } + /* We extend the bottom of the block down to our + * base + */ + rgn->base = base; + rgn->size = rend - base; + + /* Return if we have nothing else to allocate + * (fully coalesced) + */ + if (rend >= end) + return 0; + + /* We continue processing from the end of the + * coalesced block. 
+ */ + base = rend; + size = end - base; + } + + /* Now check if we overlap or are adjacent with the + * top of a block + */ + if (base <= rend && end >= rend) { + /* If we can't coalesce, create a new block */ + if (!memblock_memory_can_coalesce(rgn->base, + rgn->size, + base, size)) { + /* Overlap & can't coalesce are mutually + * exclusive, if you do that, be prepared + * for trouble + */ + WARN_ON(rend != base); + goto new_block; + } + /* We adjust our base down to enclose the + * original block and destroy it. It will be + * part of our new allocation. Since we've + * freed an entry, we know we won't fail + * to allocate one later, so we won't risk + * losing the original block allocation. + */ + size += (base - rgn->base); + base = rgn->base; + memblock_remove_region(type, i--); } } - /* If we plugged a hole, we may want to also coalesce with the - * next region + /* If the array is empty, special case, replace the fake + * filler region and return */ - if ((i < type->cnt - 1) && memblock_regions_adjacent(type, i, i+1) && - ((type != &memblock.memory || memblock_memory_can_coalesce(type->regions[i].base, - type->regions[i].size, - type->regions[i+1].base, - type->regions[i+1].size)))) { - memblock_coalesce_regions(type, i, i+1); - coalesced++; + if ((type->cnt == 1) && (type->regions[0].size == 0)) { + type->regions[0].base = base; + type->regions[0].size = size; + return 0; } - if (coalesced) - return coalesced; - + new_block: /* If we are out of space, we fail. It's too late to resize the array * but then this shouldn't have happened in the first place. 
*/ @@ -364,13 +380,14 @@ static long __init_memblock memblock_add_region(struct memblock_type *type, phys } else { type->regions[i+1].base = base; type->regions[i+1].size = size; + slot = i + 1; break; } } - if (base < type->regions[0].base) { type->regions[0].base = base; type->regions[0].size = size; + slot = 0; } type->cnt++; @@ -378,7 +395,8 @@ static long __init_memblock memblock_add_region(struct memblock_type *type, phys * our allocation and return an error */ if (type->cnt == type->max && memblock_double_array(type)) { - type->cnt--; + BUG_ON(slot < 0); + memblock_remove_region(type, slot); return -1; } @@ -391,52 +409,55 @@ long __init_memblock memblock_add(phys_addr_t base, phys_addr_t size) } -static long __init_memblock __memblock_remove(struct memblock_type *type, phys_addr_t base, phys_addr_t size) +static long __init_memblock __memblock_remove(struct memblock_type *type, + phys_addr_t base, phys_addr_t size) { - phys_addr_t rgnbegin, rgnend; phys_addr_t end = base + size; int i; - rgnbegin = rgnend = 0; /* supress gcc warnings */ - - /* Find the region where (base, size) belongs to */ - for (i=0; i < type->cnt; i++) { - rgnbegin = type->regions[i].base; - rgnend = rgnbegin + type->regions[i].size; + /* Walk through the array for collisions */ + for (i = 0; i < type->cnt; i++) { + struct memblock_region *rgn = &type->regions[i]; + phys_addr_t rend = rgn->base + rgn->size; - if ((rgnbegin <= base) && (end <= rgnend)) + /* Nothing more to do, exit */ + if (rgn->base > end || rgn->size == 0) break; - } - /* Didn't find the region */ - if (i == type->cnt) - return -1; + /* If we fully enclose the block, drop it */ + if (base <= rgn->base && end >= rend) { + memblock_remove_region(type, i--); + continue; + } - /* Check to see if we are removing entire region */ - if ((rgnbegin == base) && (rgnend == end)) { - memblock_remove_region(type, i); - return 0; - } + /* If we are fully enclosed within a block + * then we need to split it and we are done + */ + if 
(base > rgn->base && end < rend) { + rgn->size = base - rgn->base; + if (!memblock_add_region(type, end, rend - end)) + return 0; + /* Failure to split is bad, we at least + * restore the block before erroring + */ + rgn->size = rend - rgn->base; + WARN_ON(1); + return -1; + } - /* Check to see if region is matching at the front */ - if (rgnbegin == base) { - type->regions[i].base = end; - type->regions[i].size -= size; - return 0; - } + /* Check if we need to trim the bottom of a block */ + if (rgn->base < end && rend > end) { + rgn->size -= end - rgn->base; + rgn->base = end; + break; + } - /* Check to see if the region is matching at the end */ - if (rgnend == end) { - type->regions[i].size -= size; - return 0; - } + /* And check if we need to trim the top of a block */ + if (base < rend) + rgn->size -= rend - base; - /* - * We need to split the entry - adjust the current one to the - * beginging of the hole and add the region after hole. - */ - type->regions[i].size = base - type->regions[i].base; - return memblock_add_region(type, end, rgnend - end); + } + return 0; } long __init_memblock memblock_remove(phys_addr_t base, phys_addr_t size) @@ -469,7 +490,7 @@ phys_addr_t __init __memblock_alloc_base(phys_addr_t size, phys_addr_t align, ph found = memblock_find_base(size, align, 0, max_addr); if (found != MEMBLOCK_ERROR && - memblock_add_region(&memblock.reserved, found, size) >= 0) + !memblock_add_region(&memblock.reserved, found, size)) return found; return 0; @@ -550,7 +571,7 @@ static phys_addr_t __init memblock_alloc_nid_region(struct memblock_region *mp, if (this_nid == nid) { phys_addr_t ret = memblock_find_region(start, this_end, size, align); if (ret != MEMBLOCK_ERROR && - memblock_add_region(&memblock.reserved, ret, size) >= 0) + !memblock_add_region(&memblock.reserved, ret, size)) return ret; } start = this_end; @@ -683,13 +704,13 @@ int __init_memblock memblock_is_memory(phys_addr_t addr) int __init_memblock memblock_is_region_memory(phys_addr_t 
base, phys_addr_t size) { - int idx = memblock_search(&memblock.reserved, base); + int idx = memblock_search(&memblock.memory, base); if (idx == -1) return 0; - return memblock.reserved.regions[idx].base <= base && - (memblock.reserved.regions[idx].base + - memblock.reserved.regions[idx].size) >= (base + size); + return memblock.memory.regions[idx].base <= base && + (memblock.memory.regions[idx].base + + memblock.memory.regions[idx].size) >= (base + size); } int __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t size) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 7a22b4129211..010f9166fa6e 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -73,15 +73,6 @@ static int really_do_swap_account __initdata = 0; #define do_swap_account (0) #endif -/* - * Per memcg event counter is incremented at every pagein/pageout. This counter - * is used for trigger some periodic events. This is straightforward and better - * than using jiffies etc. to handle periodic memcg event. - * - * These values will be used as !((event) & ((1 <<(thresh)) - 1)) - */ -#define THRESHOLDS_EVENTS_THRESH (7) /* once in 128 */ -#define SOFTLIMIT_EVENTS_THRESH (10) /* once in 1024 */ /* * Statistics for memory cgroup. 
@@ -93,19 +84,36 @@ enum mem_cgroup_stat_index { MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */ MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */ - MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */ - MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */ MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */ MEM_CGROUP_STAT_DATA, /* end of data requires synchronization */ - /* incremented at every pagein/pageout */ - MEM_CGROUP_EVENTS = MEM_CGROUP_STAT_DATA, MEM_CGROUP_ON_MOVE, /* someone is moving account between groups */ - MEM_CGROUP_STAT_NSTATS, }; +enum mem_cgroup_events_index { + MEM_CGROUP_EVENTS_PGPGIN, /* # of pages paged in */ + MEM_CGROUP_EVENTS_PGPGOUT, /* # of pages paged out */ + MEM_CGROUP_EVENTS_COUNT, /* # of pages paged in/out */ + MEM_CGROUP_EVENTS_NSTATS, +}; +/* + * Per memcg event counter is incremented at every pagein/pageout. With THP, + * it will be incremated by the number of pages. This counter is used for + * for trigger some periodic events. This is straightforward and better + * than using jiffies etc. to handle periodic memcg event. + */ +enum mem_cgroup_events_target { + MEM_CGROUP_TARGET_THRESH, + MEM_CGROUP_TARGET_SOFTLIMIT, + MEM_CGROUP_NTARGETS, +}; +#define THRESHOLDS_EVENTS_TARGET (128) +#define SOFTLIMIT_EVENTS_TARGET (1024) + struct mem_cgroup_stat_cpu { - s64 count[MEM_CGROUP_STAT_NSTATS]; + long count[MEM_CGROUP_STAT_NSTATS]; + unsigned long events[MEM_CGROUP_EVENTS_NSTATS]; + unsigned long targets[MEM_CGROUP_NTARGETS]; }; /* @@ -218,12 +226,6 @@ struct mem_cgroup { * per zone LRU lists. */ struct mem_cgroup_lru_info info; - - /* - protect against reclaim related member. - */ - spinlock_t reclaim_param_lock; - /* * While reclaiming in a hierarchy, we cache the last child we * reclaimed from. 
@@ -292,7 +294,6 @@ static struct move_charge_struct { unsigned long moved_charge; unsigned long moved_swap; struct task_struct *moving_task; /* a task moving charges */ - struct mm_struct *mm; wait_queue_head_t waitq; /* a waitq for other context */ } mc = { .lock = __SPIN_LOCK_UNLOCKED(mc.lock), @@ -328,13 +329,6 @@ enum charge_type { NR_CHARGE_TYPE, }; -/* only for here (for easy reading.) */ -#define PCGF_CACHE (1UL << PCG_CACHE) -#define PCGF_USED (1UL << PCG_USED) -#define PCGF_LOCK (1UL << PCG_LOCK) -/* Not used, but added here for completeness */ -#define PCGF_ACCT (1UL << PCG_ACCT) - /* for encoding cft->private value on file */ #define _MEM (0) #define _MEMSWAP (1) @@ -372,14 +366,10 @@ struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *mem) } static struct mem_cgroup_per_zone * -page_cgroup_zoneinfo(struct page_cgroup *pc) +page_cgroup_zoneinfo(struct mem_cgroup *mem, struct page *page) { - struct mem_cgroup *mem = pc->mem_cgroup; - int nid = page_cgroup_nid(pc); - int zid = page_cgroup_zid(pc); - - if (!mem) - return NULL; + int nid = page_to_nid(page); + int zid = page_zonenum(page); return mem_cgroup_zoneinfo(mem, nid, zid); } @@ -505,11 +495,6 @@ static void mem_cgroup_remove_from_trees(struct mem_cgroup *mem) } } -static inline unsigned long mem_cgroup_get_excess(struct mem_cgroup *mem) -{ - return res_counter_soft_limit_excess(&mem->res) >> PAGE_SHIFT; -} - static struct mem_cgroup_per_zone * __mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) { @@ -566,11 +551,11 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) * common workload, threashold and synchonization as vmstat[] should be * implemented. 
*/ -static s64 mem_cgroup_read_stat(struct mem_cgroup *mem, - enum mem_cgroup_stat_index idx) +static long mem_cgroup_read_stat(struct mem_cgroup *mem, + enum mem_cgroup_stat_index idx) { + long val = 0; int cpu; - s64 val = 0; get_online_cpus(); for_each_online_cpu(cpu) @@ -584,9 +569,9 @@ static s64 mem_cgroup_read_stat(struct mem_cgroup *mem, return val; } -static s64 mem_cgroup_local_usage(struct mem_cgroup *mem) +static long mem_cgroup_local_usage(struct mem_cgroup *mem) { - s64 ret; + long ret; ret = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS); ret += mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE); @@ -600,24 +585,41 @@ static void mem_cgroup_swap_statistics(struct mem_cgroup *mem, this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_SWAPOUT], val); } -static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, - struct page_cgroup *pc, - bool charge) +static unsigned long mem_cgroup_read_events(struct mem_cgroup *mem, + enum mem_cgroup_events_index idx) { - int val = (charge) ? 1 : -1; + unsigned long val = 0; + int cpu; + for_each_online_cpu(cpu) + val += per_cpu(mem->stat->events[idx], cpu); +#ifdef CONFIG_HOTPLUG_CPU + spin_lock(&mem->pcp_counter_lock); + val += mem->nocpu_base.events[idx]; + spin_unlock(&mem->pcp_counter_lock); +#endif + return val; +} + +static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, + bool file, int nr_pages) +{ preempt_disable(); - if (PageCgroupCache(pc)) - __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_CACHE], val); + if (file) + __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_CACHE], nr_pages); else - __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_RSS], val); + __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_RSS], nr_pages); + + /* pagein of a big page is an event. 
So, ignore page size */ + if (nr_pages > 0) + __this_cpu_inc(mem->stat->events[MEM_CGROUP_EVENTS_PGPGIN]); + else { + __this_cpu_inc(mem->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]); + nr_pages = -nr_pages; /* for event */ + } - if (charge) - __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGIN_COUNT]); - else - __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGOUT_COUNT]); - __this_cpu_inc(mem->stat->count[MEM_CGROUP_EVENTS]); + __this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_COUNT], nr_pages); preempt_enable(); } @@ -637,13 +639,34 @@ static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem, return total; } -static bool __memcg_event_check(struct mem_cgroup *mem, int event_mask_shift) +static bool __memcg_event_check(struct mem_cgroup *mem, int target) { - s64 val; + unsigned long val, next; + + val = this_cpu_read(mem->stat->events[MEM_CGROUP_EVENTS_COUNT]); + next = this_cpu_read(mem->stat->targets[target]); + /* from time_after() in jiffies.h */ + return ((long)next - (long)val < 0); +} - val = this_cpu_read(mem->stat->count[MEM_CGROUP_EVENTS]); +static void __mem_cgroup_target_update(struct mem_cgroup *mem, int target) +{ + unsigned long val, next; + + val = this_cpu_read(mem->stat->events[MEM_CGROUP_EVENTS_COUNT]); - return !(val & ((1 << event_mask_shift) - 1)); + switch (target) { + case MEM_CGROUP_TARGET_THRESH: + next = val + THRESHOLDS_EVENTS_TARGET; + break; + case MEM_CGROUP_TARGET_SOFTLIMIT: + next = val + SOFTLIMIT_EVENTS_TARGET; + break; + default: + return; + } + + this_cpu_write(mem->stat->targets[target], next); } /* @@ -653,10 +676,15 @@ static bool __memcg_event_check(struct mem_cgroup *mem, int event_mask_shift) static void memcg_check_events(struct mem_cgroup *mem, struct page *page) { /* threshold event is triggered in finer grain than soft limit */ - if (unlikely(__memcg_event_check(mem, THRESHOLDS_EVENTS_THRESH))) { + if (unlikely(__memcg_event_check(mem, MEM_CGROUP_TARGET_THRESH))) { mem_cgroup_threshold(mem); - if 
(unlikely(__memcg_event_check(mem, SOFTLIMIT_EVENTS_THRESH))) + __mem_cgroup_target_update(mem, MEM_CGROUP_TARGET_THRESH); + if (unlikely(__memcg_event_check(mem, + MEM_CGROUP_TARGET_SOFTLIMIT))){ mem_cgroup_update_tree(mem, page); + __mem_cgroup_target_update(mem, + MEM_CGROUP_TARGET_SOFTLIMIT); + } } } @@ -815,13 +843,13 @@ void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru) * We don't check PCG_USED bit. It's cleared when the "page" is finally * removed from global LRU. */ - mz = page_cgroup_zoneinfo(pc); - MEM_CGROUP_ZSTAT(mz, lru) -= 1; + mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); + /* huge page split is done under lru_lock. so, we have no races. */ + MEM_CGROUP_ZSTAT(mz, lru) -= 1 << compound_order(page); if (mem_cgroup_is_root(pc->mem_cgroup)) return; VM_BUG_ON(list_empty(&pc->lru)); list_del_init(&pc->lru); - return; } void mem_cgroup_del_lru(struct page *page) @@ -829,24 +857,49 @@ void mem_cgroup_del_lru(struct page *page) mem_cgroup_del_lru_list(page, page_lru(page)); } -void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru) +/* + * Writeback is about to end against a page which has been marked for immediate + * reclaim. If it still appears to be reclaimable, move it to the tail of the + * inactive list. + */ +void mem_cgroup_rotate_reclaimable_page(struct page *page) { struct mem_cgroup_per_zone *mz; struct page_cgroup *pc; + enum lru_list lru = page_lru(page); if (mem_cgroup_disabled()) return; pc = lookup_page_cgroup(page); - /* - * Used bit is set without atomic ops but after smp_wmb(). - * For making pc->mem_cgroup visible, insert smp_rmb() here. - */ + /* unused or root page is not rotated. */ + if (!PageCgroupUsed(pc)) + return; + /* Ensure pc->mem_cgroup is visible after reading PCG_USED. 
*/ smp_rmb(); + if (mem_cgroup_is_root(pc->mem_cgroup)) + return; + mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); + list_move_tail(&pc->lru, &mz->lists[lru]); +} + +void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru) +{ + struct mem_cgroup_per_zone *mz; + struct page_cgroup *pc; + + if (mem_cgroup_disabled()) + return; + + pc = lookup_page_cgroup(page); /* unused or root page is not rotated. */ - if (!PageCgroupUsed(pc) || mem_cgroup_is_root(pc->mem_cgroup)) + if (!PageCgroupUsed(pc)) return; - mz = page_cgroup_zoneinfo(pc); + /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ + smp_rmb(); + if (mem_cgroup_is_root(pc->mem_cgroup)) + return; + mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); list_move(&pc->lru, &mz->lists[lru]); } @@ -859,16 +912,13 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru) return; pc = lookup_page_cgroup(page); VM_BUG_ON(PageCgroupAcctLRU(pc)); - /* - * Used bit is set without atomic ops but after smp_wmb(). - * For making pc->mem_cgroup visible, insert smp_rmb() here. - */ - smp_rmb(); if (!PageCgroupUsed(pc)) return; - - mz = page_cgroup_zoneinfo(pc); - MEM_CGROUP_ZSTAT(mz, lru) += 1; + /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ + smp_rmb(); + mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); + /* huge page split is done under lru_lock. so, we have no races. */ + MEM_CGROUP_ZSTAT(mz, lru) += 1 << compound_order(page); SetPageCgroupAcctLRU(pc); if (mem_cgroup_is_root(pc->mem_cgroup)) return; @@ -876,18 +926,28 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru) } /* - * At handling SwapCache, pc->mem_cgroup may be changed while it's linked to - * lru because the page may.be reused after it's fully uncharged (because of - * SwapCache behavior).To handle that, unlink page_cgroup from LRU when charge - * it again. This function is only used to charge SwapCache. It's done under - * lock_page and expected that zone->lru_lock is never held. 
+ * At handling SwapCache and other FUSE stuff, pc->mem_cgroup may be changed + * while it's linked to lru because the page may be reused after it's fully + * uncharged. To handle that, unlink page_cgroup from LRU when charge it again. + * It's done under lock_page and expected that zone->lru_lock isnever held. */ -static void mem_cgroup_lru_del_before_commit_swapcache(struct page *page) +static void mem_cgroup_lru_del_before_commit(struct page *page) { unsigned long flags; struct zone *zone = page_zone(page); struct page_cgroup *pc = lookup_page_cgroup(page); + /* + * Doing this check without taking ->lru_lock seems wrong but this + * is safe. Because if page_cgroup's USED bit is unset, the page + * will not be added to any memcg's LRU. If page_cgroup's USED bit is + * set, the commit after this will fail, anyway. + * This all charge/uncharge is done under some mutual execustion. + * So, we don't need to taking care of changes in USED bit. + */ + if (likely(!PageLRU(page))) + return; + spin_lock_irqsave(&zone->lru_lock, flags); /* * Forget old LRU when this page_cgroup is *not* used. This Used bit @@ -898,12 +958,15 @@ static void mem_cgroup_lru_del_before_commit_swapcache(struct page *page) spin_unlock_irqrestore(&zone->lru_lock, flags); } -static void mem_cgroup_lru_add_after_commit_swapcache(struct page *page) +static void mem_cgroup_lru_add_after_commit(struct page *page) { unsigned long flags; struct zone *zone = page_zone(page); struct page_cgroup *pc = lookup_page_cgroup(page); + /* taking care of that the page is added to LRU while we commit it */ + if (likely(!PageLRU(page))) + return; spin_lock_irqsave(&zone->lru_lock, flags); /* link when the page is linked to LRU but page_cgroup isn't */ if (PageLRU(page) && !PageCgroupAcctLRU(pc)) @@ -1032,18 +1095,11 @@ mem_cgroup_get_reclaim_stat_from_page(struct page *page) return NULL; pc = lookup_page_cgroup(page); - /* - * Used bit is set without atomic ops but after smp_wmb(). 
- * For making pc->mem_cgroup visible, insert smp_rmb() here. - */ - smp_rmb(); if (!PageCgroupUsed(pc)) return NULL; - - mz = page_cgroup_zoneinfo(pc); - if (!mz) - return NULL; - + /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ + smp_rmb(); + mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); return &mz->reclaim_stat; } @@ -1075,9 +1131,11 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, if (scan >= nr_to_scan) break; - page = pc->page; if (unlikely(!PageCgroupUsed(pc))) continue; + + page = lookup_cgroup_page(pc); + if (unlikely(!PageLRU(page))) continue; @@ -1087,7 +1145,7 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, case 0: list_move(&page->lru, dst); mem_cgroup_del_lru(page); - nr_taken++; + nr_taken += hpage_nr_pages(page); break; case -EBUSY: /* we don't affect global LRU but rotate in our LRU */ @@ -1109,32 +1167,32 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, #define mem_cgroup_from_res_counter(counter, member) \ container_of(counter, struct mem_cgroup, member) -static bool mem_cgroup_check_under_limit(struct mem_cgroup *mem) +/** + * mem_cgroup_margin - calculate chargeable space of a memory cgroup + * @mem: the memory cgroup + * + * Returns the maximum amount of memory @mem can be charged with, in + * pages. + */ +static unsigned long mem_cgroup_margin(struct mem_cgroup *mem) { - if (do_swap_account) { - if (res_counter_check_under_limit(&mem->res) && - res_counter_check_under_limit(&mem->memsw)) - return true; - } else - if (res_counter_check_under_limit(&mem->res)) - return true; - return false; + unsigned long long margin; + + margin = res_counter_margin(&mem->res); + if (do_swap_account) + margin = min(margin, res_counter_margin(&mem->memsw)); + return margin >> PAGE_SHIFT; } static unsigned int get_swappiness(struct mem_cgroup *memcg) { struct cgroup *cgrp = memcg->css.cgroup; - unsigned int swappiness; /* root ? 
*/ if (cgrp->parent == NULL) return vm_swappiness; - spin_lock(&memcg->reclaim_param_lock); - swappiness = memcg->swappiness; - spin_unlock(&memcg->reclaim_param_lock); - - return swappiness; + return memcg->swappiness; } static void mem_cgroup_start_move(struct mem_cgroup *mem) @@ -1312,8 +1370,9 @@ u64 mem_cgroup_get_limit(struct mem_cgroup *memcg) u64 limit; u64 memsw; - limit = res_counter_read_u64(&memcg->res, RES_LIMIT) + - total_swap_pages; + limit = res_counter_read_u64(&memcg->res, RES_LIMIT); + limit += total_swap_pages << PAGE_SHIFT; + memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT); /* * If memsw is finite and limits the amount of swap space available @@ -1349,13 +1408,11 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem) rcu_read_unlock(); /* Updates scanning parameter */ - spin_lock(&root_mem->reclaim_param_lock); if (!css) { /* this means start scan from ID:1 */ root_mem->last_scanned_child = 0; } else root_mem->last_scanned_child = found; - spin_unlock(&root_mem->reclaim_param_lock); } return ret; @@ -1384,7 +1441,9 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP; bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK; bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT; - unsigned long excess = mem_cgroup_get_excess(root_mem); + unsigned long excess; + + excess = res_counter_soft_limit_excess(&root_mem->res) >> PAGE_SHIFT; /* If memsw_is_minimum==1, swap-out is of-no-use. */ if (root_mem->memsw_is_minimum) @@ -1407,7 +1466,7 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, break; } /* - * We want to do more targetted reclaim. + * We want to do more targeted reclaim. 
* excess >> 2 is not to excessive so as to * reclaim too much, nor too less that we keep * coming back to reclaim from this cgroup @@ -1441,9 +1500,9 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, return ret; total += ret; if (check_soft) { - if (res_counter_check_under_soft_limit(&root_mem->res)) + if (!res_counter_soft_limit_excess(&root_mem->res)) return total; - } else if (mem_cgroup_check_under_limit(root_mem)) + } else if (mem_cgroup_margin(root_mem)) return 1 + total; } return total; @@ -1600,11 +1659,13 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask) * possibility of race condition. If there is, we take a lock. */ -static void mem_cgroup_update_file_stat(struct page *page, int idx, int val) +void mem_cgroup_update_page_stat(struct page *page, + enum mem_cgroup_page_stat_item idx, int val) { struct mem_cgroup *mem; struct page_cgroup *pc = lookup_page_cgroup(page); bool need_unlock = false; + unsigned long uninitialized_var(flags); if (unlikely(!pc)) return; @@ -1614,55 +1675,52 @@ static void mem_cgroup_update_file_stat(struct page *page, int idx, int val) if (unlikely(!mem || !PageCgroupUsed(pc))) goto out; /* pc->mem_cgroup is unstable ? 
*/ - if (unlikely(mem_cgroup_stealed(mem))) { + if (unlikely(mem_cgroup_stealed(mem)) || PageTransHuge(page)) { /* take a lock against to access pc->mem_cgroup */ - lock_page_cgroup(pc); + move_lock_page_cgroup(pc, &flags); need_unlock = true; mem = pc->mem_cgroup; if (!mem || !PageCgroupUsed(pc)) goto out; } - this_cpu_add(mem->stat->count[idx], val); - switch (idx) { - case MEM_CGROUP_STAT_FILE_MAPPED: + case MEMCG_NR_FILE_MAPPED: if (val > 0) SetPageCgroupFileMapped(pc); else if (!page_mapped(page)) ClearPageCgroupFileMapped(pc); + idx = MEM_CGROUP_STAT_FILE_MAPPED; break; default: BUG(); } + this_cpu_add(mem->stat->count[idx], val); + out: if (unlikely(need_unlock)) - unlock_page_cgroup(pc); + move_unlock_page_cgroup(pc, &flags); rcu_read_unlock(); return; } - -void mem_cgroup_update_file_mapped(struct page *page, int val) -{ - mem_cgroup_update_file_stat(page, MEM_CGROUP_STAT_FILE_MAPPED, val); -} +EXPORT_SYMBOL(mem_cgroup_update_page_stat); /* * size of first charge trial. "32" comes from vmscan.c's magic value. * TODO: maybe necessary to use big numbers in big irons. */ -#define CHARGE_SIZE (32 * PAGE_SIZE) +#define CHARGE_BATCH 32U struct memcg_stock_pcp { struct mem_cgroup *cached; /* this never be root cgroup */ - int charge; + unsigned int nr_pages; struct work_struct work; }; static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); static atomic_t memcg_drain_count; /* - * Try to consume stocked charge on this cpu. If success, PAGE_SIZE is consumed + * Try to consume stocked charge on this cpu. If success, one page is consumed * from local stock and true is returned. If the stock is 0 or charges from a * cgroup which is not current target, returns false. This stock will be * refilled. 
@@ -1673,8 +1731,8 @@ static bool consume_stock(struct mem_cgroup *mem) bool ret = true; stock = &get_cpu_var(memcg_stock); - if (mem == stock->cached && stock->charge) - stock->charge -= PAGE_SIZE; + if (mem == stock->cached && stock->nr_pages) + stock->nr_pages--; else /* need to call res_counter_charge */ ret = false; put_cpu_var(memcg_stock); @@ -1688,13 +1746,15 @@ static void drain_stock(struct memcg_stock_pcp *stock) { struct mem_cgroup *old = stock->cached; - if (stock->charge) { - res_counter_uncharge(&old->res, stock->charge); + if (stock->nr_pages) { + unsigned long bytes = stock->nr_pages * PAGE_SIZE; + + res_counter_uncharge(&old->res, bytes); if (do_swap_account) - res_counter_uncharge(&old->memsw, stock->charge); + res_counter_uncharge(&old->memsw, bytes); + stock->nr_pages = 0; } stock->cached = NULL; - stock->charge = 0; } /* @@ -1711,7 +1771,7 @@ static void drain_local_stock(struct work_struct *dummy) * Cache charges(val) which is from res_counter, to local per_cpu area. * This will be consumed by consume_stock() function, later. 
*/ -static void refill_stock(struct mem_cgroup *mem, int val) +static void refill_stock(struct mem_cgroup *mem, unsigned int nr_pages) { struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock); @@ -1719,7 +1779,7 @@ static void refill_stock(struct mem_cgroup *mem, int val) drain_stock(stock); stock->cached = mem; } - stock->charge += val; + stock->nr_pages += nr_pages; put_cpu_var(memcg_stock); } @@ -1771,11 +1831,17 @@ static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *mem, int cpu) spin_lock(&mem->pcp_counter_lock); for (i = 0; i < MEM_CGROUP_STAT_DATA; i++) { - s64 x = per_cpu(mem->stat->count[i], cpu); + long x = per_cpu(mem->stat->count[i], cpu); per_cpu(mem->stat->count[i], cpu) = 0; mem->nocpu_base.count[i] += x; } + for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) { + unsigned long x = per_cpu(mem->stat->events[i], cpu); + + per_cpu(mem->stat->events[i], cpu) = 0; + mem->nocpu_base.events[i] += x; + } /* need to clear ON_MOVE value, works as a kind of lock. */ per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) = 0; spin_unlock(&mem->pcp_counter_lock); @@ -1825,9 +1891,10 @@ enum { CHARGE_OOM_DIE, /* the current is killed because of OOM */ }; -static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask, - int csize, bool oom_check) +static int mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask, + unsigned int nr_pages, bool oom_check) { + unsigned long csize = nr_pages * PAGE_SIZE; struct mem_cgroup *mem_over_limit; struct res_counter *fail_res; unsigned long flags = 0; @@ -1842,27 +1909,38 @@ static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask, if (likely(!ret)) return CHARGE_OK; + res_counter_uncharge(&mem->res, csize); mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw); flags |= MEM_CGROUP_RECLAIM_NOSWAP; } else mem_over_limit = mem_cgroup_from_res_counter(fail_res, res); - - if (csize > PAGE_SIZE) /* change csize and retry */ + /* + * nr_pages can be either a huge page (HPAGE_PMD_NR), a batch 
+ * of regular pages (CHARGE_BATCH), or a single regular page (1). + * + * Never reclaim on behalf of optional batching, retry with a + * single page instead. + */ + if (nr_pages == CHARGE_BATCH) return CHARGE_RETRY; if (!(gfp_mask & __GFP_WAIT)) return CHARGE_WOULDBLOCK; ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL, - gfp_mask, flags); + gfp_mask, flags); + if (mem_cgroup_margin(mem_over_limit) >= nr_pages) + return CHARGE_RETRY; /* - * try_to_free_mem_cgroup_pages() might not give us a full - * picture of reclaim. Some pages are reclaimed and might be - * moved to swap cache or just unmapped from the cgroup. - * Check the limit again to see if the reclaim reduced the - * current usage of the cgroup before giving up + * Even though the limit is exceeded at this point, reclaim + * may have been able to free some pages. Retry the charge + * before killing the task. + * + * Only for regular pages, though: huge pages are rather + * unlikely to succeed so close to the limit, and we fall back + * to regular pages anyway in case of failure. */ - if (ret || mem_cgroup_check_under_limit(mem_over_limit)) + if (nr_pages == 1 && ret) return CHARGE_RETRY; /* @@ -1887,12 +1965,15 @@ static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask, * oom-killer can be invoked. 
*/ static int __mem_cgroup_try_charge(struct mm_struct *mm, - gfp_t gfp_mask, struct mem_cgroup **memcg, bool oom) + gfp_t gfp_mask, + unsigned int nr_pages, + struct mem_cgroup **memcg, + bool oom) { + unsigned int batch = max(CHARGE_BATCH, nr_pages); int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; struct mem_cgroup *mem = NULL; int ret; - int csize = CHARGE_SIZE; /* * Unlike gloval-vm's OOM-kill, we're not in memory shortage @@ -1917,7 +1998,7 @@ again: VM_BUG_ON(css_is_removed(&mem->css)); if (mem_cgroup_is_root(mem)) goto done; - if (consume_stock(mem)) + if (nr_pages == 1 && consume_stock(mem)) goto done; css_get(&mem->css); } else { @@ -1925,23 +2006,22 @@ again: rcu_read_lock(); p = rcu_dereference(mm->owner); - VM_BUG_ON(!p); /* - * because we don't have task_lock(), "p" can exit while - * we're here. In that case, "mem" can point to root - * cgroup but never be NULL. (and task_struct itself is freed - * by RCU, cgroup itself is RCU safe.) Then, we have small - * risk here to get wrong cgroup. But such kind of mis-account - * by race always happens because we don't have cgroup_mutex(). - * It's overkill and we allow that small race, here. + * Because we don't have task_lock(), "p" can exit. + * In that case, "mem" can point to root or p can be NULL with + * race with swapoff. Then, we have small risk of mis-accouning. + * But such kind of mis-account by race always happens because + * we don't have cgroup_mutex(). It's overkill and we allo that + * small race, here. + * (*) swapoff at el will charge against mm-struct not against + * task-struct. So, mm->owner can be NULL. */ mem = mem_cgroup_from_task(p); - VM_BUG_ON(!mem); - if (mem_cgroup_is_root(mem)) { + if (!mem || mem_cgroup_is_root(mem)) { rcu_read_unlock(); goto done; } - if (consume_stock(mem)) { + if (nr_pages == 1 && consume_stock(mem)) { /* * It seems dagerous to access memcg without css_get(). 
* But considering how consume_stok works, it's not @@ -1976,13 +2056,12 @@ again: nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; } - ret = __mem_cgroup_do_charge(mem, gfp_mask, csize, oom_check); - + ret = mem_cgroup_do_charge(mem, gfp_mask, batch, oom_check); switch (ret) { case CHARGE_OK: break; case CHARGE_RETRY: /* not in OOM situation but retry */ - csize = PAGE_SIZE; + batch = nr_pages; css_put(&mem->css); mem = NULL; goto again; @@ -2003,8 +2082,8 @@ again: } } while (ret != CHARGE_OK); - if (csize > PAGE_SIZE) - refill_stock(mem, csize - PAGE_SIZE); + if (batch > nr_pages) + refill_stock(mem, batch - nr_pages); css_put(&mem->css); done: *memcg = mem; @@ -2023,20 +2102,17 @@ bypass: * gotten by try_charge(). */ static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem, - unsigned long count) + unsigned int nr_pages) { if (!mem_cgroup_is_root(mem)) { - res_counter_uncharge(&mem->res, PAGE_SIZE * count); + unsigned long bytes = nr_pages * PAGE_SIZE; + + res_counter_uncharge(&mem->res, bytes); if (do_swap_account) - res_counter_uncharge(&mem->memsw, PAGE_SIZE * count); + res_counter_uncharge(&mem->memsw, bytes); } } -static void mem_cgroup_cancel_charge(struct mem_cgroup *mem) -{ - __mem_cgroup_cancel_charge(mem, 1); -} - /* * A helper function to get mem_cgroup from ID. must be called under * rcu_read_lock(). The caller must check css_is_removed() or some if @@ -2084,26 +2160,22 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) return mem; } -/* - * commit a charge got by __mem_cgroup_try_charge() and makes page_cgroup to be - * USED state. If already USED, uncharge and return. - */ - static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, - struct page_cgroup *pc, - enum charge_type ctype) + struct page *page, + unsigned int nr_pages, + struct page_cgroup *pc, + enum charge_type ctype) { - /* try_charge() can return NULL to *memcg, taking care of it. 
*/ - if (!mem) - return; - lock_page_cgroup(pc); if (unlikely(PageCgroupUsed(pc))) { unlock_page_cgroup(pc); - mem_cgroup_cancel_charge(mem); + __mem_cgroup_cancel_charge(mem, nr_pages); return; } - + /* + * we don't need page_cgroup_lock about tail pages, becase they are not + * accessed by any other context at this point. + */ pc->mem_cgroup = mem; /* * We access a page_cgroup asynchronously without lock_page_cgroup(). @@ -2127,19 +2199,62 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, break; } - mem_cgroup_charge_statistics(mem, pc, true); - + mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), nr_pages); unlock_page_cgroup(pc); /* * "charge_statistics" updated event counter. Then, check it. * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. * if they exceeds softlimit. */ - memcg_check_events(mem, pc->page); + memcg_check_events(mem, page); +} + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + +#define PCGF_NOCOPY_AT_SPLIT ((1 << PCG_LOCK) | (1 << PCG_MOVE_LOCK) |\ + (1 << PCG_ACCT_LRU) | (1 << PCG_MIGRATION)) +/* + * Because tail pages are not marked as "used", set it. We're under + * zone->lru_lock, 'splitting on pmd' and compund_lock. + */ +void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail) +{ + struct page_cgroup *head_pc = lookup_page_cgroup(head); + struct page_cgroup *tail_pc = lookup_page_cgroup(tail); + unsigned long flags; + + if (mem_cgroup_disabled()) + return; + /* + * We have no races with charge/uncharge but will have races with + * page state accounting. + */ + move_lock_page_cgroup(head_pc, &flags); + + tail_pc->mem_cgroup = head_pc->mem_cgroup; + smp_wmb(); /* see __commit_charge() */ + if (PageCgroupAcctLRU(head_pc)) { + enum lru_list lru; + struct mem_cgroup_per_zone *mz; + + /* + * LRU flags cannot be copied because we need to add tail + *.page to LRU by generic call and our hook will be called. + * We hold lru_lock, then, reduce counter directly. 
+ */ + lru = page_lru(head); + mz = page_cgroup_zoneinfo(head_pc->mem_cgroup, head); + MEM_CGROUP_ZSTAT(mz, lru) -= 1; + } + tail_pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT; + move_unlock_page_cgroup(head_pc, &flags); } +#endif /** - * __mem_cgroup_move_account - move account of the page + * mem_cgroup_move_account - move account of the page + * @page: the page + * @nr_pages: number of regular pages (>1 for huge pages) * @pc: page_cgroup of the page. * @from: mem_cgroup which the page is moved from. * @to: mem_cgroup which the page is moved to. @from != @to. @@ -2147,22 +2262,42 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, * * The caller must confirm following. * - page is not on LRU (isolate_page() is useful.) - * - the pc is locked, used, and ->mem_cgroup points to @from. + * - compound_lock is held when nr_pages > 1 * * This function doesn't do "charge" nor css_get to new cgroup. It should be - * done by a caller(__mem_cgroup_try_charge would be usefull). If @uncharge is + * done by a caller(__mem_cgroup_try_charge would be useful). If @uncharge is * true, this function does "uncharge" from old cgroup, but it doesn't if * @uncharge is false, so a caller should do "uncharge". */ - -static void __mem_cgroup_move_account(struct page_cgroup *pc, - struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge) +static int mem_cgroup_move_account(struct page *page, + unsigned int nr_pages, + struct page_cgroup *pc, + struct mem_cgroup *from, + struct mem_cgroup *to, + bool uncharge) { + unsigned long flags; + int ret; + VM_BUG_ON(from == to); - VM_BUG_ON(PageLRU(pc->page)); - VM_BUG_ON(!page_is_cgroup_locked(pc)); - VM_BUG_ON(!PageCgroupUsed(pc)); - VM_BUG_ON(pc->mem_cgroup != from); + VM_BUG_ON(PageLRU(page)); + /* + * The page is isolated from LRU. So, collapse function + * will not handle this page. But page splitting can happen. + * Do this check under compound_page_lock(). The caller should + * hold it. 
+ */ + ret = -EBUSY; + if (nr_pages > 1 && !PageTransHuge(page)) + goto out; + + lock_page_cgroup(pc); + + ret = -EINVAL; + if (!PageCgroupUsed(pc) || pc->mem_cgroup != from) + goto unlock; + + move_lock_page_cgroup(pc, &flags); if (PageCgroupFileMapped(pc)) { /* Update mapped_file data for mem_cgroup */ @@ -2171,42 +2306,31 @@ static void __mem_cgroup_move_account(struct page_cgroup *pc, __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); preempt_enable(); } - mem_cgroup_charge_statistics(from, pc, false); + mem_cgroup_charge_statistics(from, PageCgroupCache(pc), -nr_pages); if (uncharge) /* This is not "cancel", but cancel_charge does all we need. */ - mem_cgroup_cancel_charge(from); + __mem_cgroup_cancel_charge(from, nr_pages); /* caller should have done css_get */ pc->mem_cgroup = to; - mem_cgroup_charge_statistics(to, pc, true); + mem_cgroup_charge_statistics(to, PageCgroupCache(pc), nr_pages); /* * We charges against "to" which may not have any tasks. Then, "to" * can be under rmdir(). But in current implementation, caller of * this function is just force_empty() and move charge, so it's - * garanteed that "to" is never removed. So, we don't check rmdir + * guaranteed that "to" is never removed. So, we don't check rmdir * status here. 
*/ -} - -/* - * check whether the @pc is valid for moving account and call - * __mem_cgroup_move_account() - */ -static int mem_cgroup_move_account(struct page_cgroup *pc, - struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge) -{ - int ret = -EINVAL; - lock_page_cgroup(pc); - if (PageCgroupUsed(pc) && pc->mem_cgroup == from) { - __mem_cgroup_move_account(pc, from, to, uncharge); - ret = 0; - } + move_unlock_page_cgroup(pc, &flags); + ret = 0; +unlock: unlock_page_cgroup(pc); /* * check events */ - memcg_check_events(to, pc->page); - memcg_check_events(from, pc->page); + memcg_check_events(to, page); + memcg_check_events(from, page); +out: return ret; } @@ -2214,14 +2338,16 @@ static int mem_cgroup_move_account(struct page_cgroup *pc, * move charges to its parent. */ -static int mem_cgroup_move_parent(struct page_cgroup *pc, +static int mem_cgroup_move_parent(struct page *page, + struct page_cgroup *pc, struct mem_cgroup *child, gfp_t gfp_mask) { - struct page *page = pc->page; struct cgroup *cg = child->css.cgroup; struct cgroup *pcg = cg->parent; struct mem_cgroup *parent; + unsigned int nr_pages; + unsigned long uninitialized_var(flags); int ret; /* Is ROOT ? 
*/ @@ -2234,14 +2360,22 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc, if (isolate_lru_page(page)) goto put; + nr_pages = hpage_nr_pages(page); + parent = mem_cgroup_from_cont(pcg); - ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false); + ret = __mem_cgroup_try_charge(NULL, gfp_mask, nr_pages, &parent, false); if (ret || !parent) goto put_back; - ret = mem_cgroup_move_account(pc, child, parent, true); + if (nr_pages > 1) + flags = compound_lock_irqsave(page); + + ret = mem_cgroup_move_account(page, nr_pages, pc, child, parent, true); if (ret) - mem_cgroup_cancel_charge(parent); + __mem_cgroup_cancel_charge(parent, nr_pages); + + if (nr_pages > 1) + compound_unlock_irqrestore(page, flags); put_back: putback_lru_page(page); put: @@ -2260,20 +2394,29 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, gfp_t gfp_mask, enum charge_type ctype) { struct mem_cgroup *mem = NULL; + unsigned int nr_pages = 1; struct page_cgroup *pc; + bool oom = true; int ret; + if (PageTransHuge(page)) { + nr_pages <<= compound_order(page); + VM_BUG_ON(!PageTransHuge(page)); + /* + * Never OOM-kill a process for a huge page. The + * fault handler will fall back to regular pages. + */ + oom = false; + } + pc = lookup_page_cgroup(page); - /* can happen at boot */ - if (unlikely(!pc)) - return 0; - prefetchw(pc); + BUG_ON(!pc); /* XXX: remove this and move pc lookup into commit */ - ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true); + ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &mem, oom); if (ret || !mem) return ret; - __mem_cgroup_commit_charge(mem, pc, ctype); + __mem_cgroup_commit_charge(mem, page, nr_pages, pc, ctype); return 0; } @@ -2282,8 +2425,6 @@ int mem_cgroup_newpage_charge(struct page *page, { if (mem_cgroup_disabled()) return 0; - if (PageCompound(page)) - return 0; /* * If already mapped, we don't have to account. * If page cache, page->mapping has address_space. 
@@ -2303,9 +2444,26 @@ static void __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, enum charge_type ctype); +static void +__mem_cgroup_commit_charge_lrucare(struct page *page, struct mem_cgroup *mem, + enum charge_type ctype) +{ + struct page_cgroup *pc = lookup_page_cgroup(page); + /* + * In some case, SwapCache, FUSE(splice_buf->radixtree), the page + * is already on LRU. It means the page may on some other page_cgroup's + * LRU. Take care of it. + */ + mem_cgroup_lru_del_before_commit(page); + __mem_cgroup_commit_charge(mem, page, 1, pc, ctype); + mem_cgroup_lru_add_after_commit(page); + return; +} + int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) { + struct mem_cgroup *mem = NULL; int ret; if (mem_cgroup_disabled()) @@ -2340,14 +2498,22 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, if (unlikely(!mm)) mm = &init_mm; - if (page_is_file_cache(page)) - return mem_cgroup_charge_common(page, mm, gfp_mask, - MEM_CGROUP_CHARGE_TYPE_CACHE); + if (page_is_file_cache(page)) { + ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, &mem, true); + if (ret || !mem) + return ret; + /* + * FUSE reuses pages without going through the final + * put that would remove them from the LRU list, make + * sure that they get relinked properly. 
+ */ + __mem_cgroup_commit_charge_lrucare(page, mem, + MEM_CGROUP_CHARGE_TYPE_CACHE); + return ret; + } /* shmem */ if (PageSwapCache(page)) { - struct mem_cgroup *mem = NULL; - ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem); if (!ret) __mem_cgroup_commit_charge_swapin(page, mem, @@ -2372,6 +2538,8 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct mem_cgroup *mem; int ret; + *ptr = NULL; + if (mem_cgroup_disabled()) return 0; @@ -2389,30 +2557,26 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm, if (!mem) goto charge_cur_mm; *ptr = mem; - ret = __mem_cgroup_try_charge(NULL, mask, ptr, true); + ret = __mem_cgroup_try_charge(NULL, mask, 1, ptr, true); css_put(&mem->css); return ret; charge_cur_mm: if (unlikely(!mm)) mm = &init_mm; - return __mem_cgroup_try_charge(mm, mask, ptr, true); + return __mem_cgroup_try_charge(mm, mask, 1, ptr, true); } static void __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, enum charge_type ctype) { - struct page_cgroup *pc; - if (mem_cgroup_disabled()) return; if (!ptr) return; cgroup_exclude_rmdir(&ptr->css); - pc = lookup_page_cgroup(page); - mem_cgroup_lru_del_before_commit_swapcache(page); - __mem_cgroup_commit_charge(ptr, pc, ctype); - mem_cgroup_lru_add_after_commit_swapcache(page); + + __mem_cgroup_commit_charge_lrucare(page, ptr, ctype); /* * Now swap is on-memory. This means this page may be * counted both as mem and swap....double count. 
@@ -2460,14 +2624,16 @@ void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem) return; if (!mem) return; - mem_cgroup_cancel_charge(mem); + __mem_cgroup_cancel_charge(mem, 1); } -static void -__do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype) +static void mem_cgroup_do_uncharge(struct mem_cgroup *mem, + unsigned int nr_pages, + const enum charge_type ctype) { struct memcg_batch_info *batch = NULL; bool uncharge_memsw = true; + /* If swapout, usage of swap doesn't decrease */ if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) uncharge_memsw = false; @@ -2482,7 +2648,7 @@ __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype) batch->memcg = mem; /* * do_batch > 0 when unmapping pages or inode invalidate/truncate. - * In those cases, all pages freed continously can be expected to be in + * In those cases, all pages freed continuously can be expected to be in * the same cgroup and we have chance to coalesce uncharges. * But we do uncharge one by one if this is killed by OOM(TIF_MEMDIE) * because we want to do uncharge as soon as possible. @@ -2491,6 +2657,9 @@ __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype) if (!batch->do_batch || test_thread_flag(TIF_MEMDIE)) goto direct_uncharge; + if (nr_pages > 1) + goto direct_uncharge; + /* * In typical case, batch->memcg == mem. This means we can * merge a series of uncharges to an uncharge of res_counter. 
@@ -2499,14 +2668,14 @@ __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype) if (batch->memcg != mem) goto direct_uncharge; /* remember freed charge and uncharge it later */ - batch->bytes += PAGE_SIZE; + batch->nr_pages++; if (uncharge_memsw) - batch->memsw_bytes += PAGE_SIZE; + batch->memsw_nr_pages++; return; direct_uncharge: - res_counter_uncharge(&mem->res, PAGE_SIZE); + res_counter_uncharge(&mem->res, nr_pages * PAGE_SIZE); if (uncharge_memsw) - res_counter_uncharge(&mem->memsw, PAGE_SIZE); + res_counter_uncharge(&mem->memsw, nr_pages * PAGE_SIZE); if (unlikely(batch->memcg != mem)) memcg_oom_recover(mem); return; @@ -2518,8 +2687,9 @@ direct_uncharge: static struct mem_cgroup * __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) { - struct page_cgroup *pc; struct mem_cgroup *mem = NULL; + unsigned int nr_pages = 1; + struct page_cgroup *pc; if (mem_cgroup_disabled()) return NULL; @@ -2527,6 +2697,10 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) if (PageSwapCache(page)) return NULL; + if (PageTransHuge(page)) { + nr_pages <<= compound_order(page); + VM_BUG_ON(!PageTransHuge(page)); + } /* * Check if our page_cgroup is valid */ @@ -2559,7 +2733,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) break; } - mem_cgroup_charge_statistics(mem, pc, false); + mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), -nr_pages); ClearPageCgroupUsed(pc); /* @@ -2580,7 +2754,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) mem_cgroup_get(mem); } if (!mem_cgroup_is_root(mem)) - __do_uncharge(mem, ctype); + mem_cgroup_do_uncharge(mem, nr_pages, ctype); return mem; @@ -2620,8 +2794,8 @@ void mem_cgroup_uncharge_start(void) /* We can do nest. 
*/ if (current->memcg_batch.do_batch == 1) { current->memcg_batch.memcg = NULL; - current->memcg_batch.bytes = 0; - current->memcg_batch.memsw_bytes = 0; + current->memcg_batch.nr_pages = 0; + current->memcg_batch.memsw_nr_pages = 0; } } @@ -2642,10 +2816,12 @@ void mem_cgroup_uncharge_end(void) * This "batch->memcg" is valid without any css_get/put etc... * bacause we hide charges behind us. */ - if (batch->bytes) - res_counter_uncharge(&batch->memcg->res, batch->bytes); - if (batch->memsw_bytes) - res_counter_uncharge(&batch->memcg->memsw, batch->memsw_bytes); + if (batch->nr_pages) + res_counter_uncharge(&batch->memcg->res, + batch->nr_pages * PAGE_SIZE); + if (batch->memsw_nr_pages) + res_counter_uncharge(&batch->memcg->memsw, + batch->memsw_nr_pages * PAGE_SIZE); memcg_oom_recover(batch->memcg); /* forget this pointer (for sanity check) */ batch->memcg = NULL; @@ -2768,13 +2944,16 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry, * page belongs to. */ int mem_cgroup_prepare_migration(struct page *page, - struct page *newpage, struct mem_cgroup **ptr) + struct page *newpage, struct mem_cgroup **ptr, gfp_t gfp_mask) { - struct page_cgroup *pc; struct mem_cgroup *mem = NULL; + struct page_cgroup *pc; enum charge_type ctype; int ret = 0; + *ptr = NULL; + + VM_BUG_ON(PageTransHuge(page)); if (mem_cgroup_disabled()) return 0; @@ -2824,7 +3003,7 @@ int mem_cgroup_prepare_migration(struct page *page, return 0; *ptr = mem; - ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, ptr, false); + ret = __mem_cgroup_try_charge(NULL, gfp_mask, 1, ptr, false); css_put(&mem->css);/* drop extra refcnt */ if (ret || *ptr == NULL) { if (PageAnon(page)) { @@ -2851,13 +3030,13 @@ int mem_cgroup_prepare_migration(struct page *page, ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; else ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; - __mem_cgroup_commit_charge(mem, pc, ctype); + __mem_cgroup_commit_charge(mem, page, 1, pc, ctype); return ret; } /* remove redundant charge if migration 
failed*/ void mem_cgroup_end_migration(struct mem_cgroup *mem, - struct page *oldpage, struct page *newpage) + struct page *oldpage, struct page *newpage, bool migration_ok) { struct page *used, *unused; struct page_cgroup *pc; @@ -2866,8 +3045,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *mem, return; /* blocks rmdir() */ cgroup_exclude_rmdir(&mem->css); - /* at migration success, oldpage->mapping is NULL. */ - if (oldpage->mapping) { + if (!migration_ok) { used = oldpage; unused = newpage; } else { @@ -2917,7 +3095,7 @@ int mem_cgroup_shmem_charge_fallback(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) { - struct mem_cgroup *mem = NULL; + struct mem_cgroup *mem; int ret; if (mem_cgroup_disabled()) @@ -2930,6 +3108,52 @@ int mem_cgroup_shmem_charge_fallback(struct page *page, return ret; } +#ifdef CONFIG_DEBUG_VM +static struct page_cgroup *lookup_page_cgroup_used(struct page *page) +{ + struct page_cgroup *pc; + + pc = lookup_page_cgroup(page); + if (likely(pc) && PageCgroupUsed(pc)) + return pc; + return NULL; +} + +bool mem_cgroup_bad_page_check(struct page *page) +{ + if (mem_cgroup_disabled()) + return false; + + return lookup_page_cgroup_used(page) != NULL; +} + +void mem_cgroup_print_bad_page(struct page *page) +{ + struct page_cgroup *pc; + + pc = lookup_page_cgroup_used(page); + if (pc) { + int ret = -1; + char *path; + + printk(KERN_ALERT "pc:%p pc->flags:%lx pc->mem_cgroup:%p", + pc, pc->flags, pc->mem_cgroup); + + path = kmalloc(PATH_MAX, GFP_KERNEL); + if (path) { + rcu_read_lock(); + ret = cgroup_path(pc->mem_cgroup->css.cgroup, + path, PATH_MAX); + rcu_read_unlock(); + } + + printk(KERN_CONT "(%s)\n", + (ret < 0) ? 
"cannot get the path" : path); + kfree(path); + } +} +#endif + static DEFINE_MUTEX(set_limit_mutex); static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, @@ -3173,6 +3397,8 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *mem, loop += 256; busy = NULL; while (loop--) { + struct page *page; + ret = 0; spin_lock_irqsave(&zone->lru_lock, flags); if (list_empty(list)) { @@ -3188,7 +3414,9 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *mem, } spin_unlock_irqrestore(&zone->lru_lock, flags); - ret = mem_cgroup_move_parent(pc, mem, GFP_KERNEL); + page = lookup_cgroup_page(pc); + + ret = mem_cgroup_move_parent(page, pc, mem, GFP_KERNEL); if (ret == -ENOMEM) break; @@ -3336,13 +3564,13 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft, } -static u64 mem_cgroup_get_recursive_idx_stat(struct mem_cgroup *mem, - enum mem_cgroup_stat_index idx) +static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *mem, + enum mem_cgroup_stat_index idx) { struct mem_cgroup *iter; - s64 val = 0; + long val = 0; - /* each per cpu's value can be minus.Then, use s64 */ + /* Per-cpu values can be negative, use a signed accumulator */ for_each_mem_cgroup_tree(iter, mem) val += mem_cgroup_read_stat(iter, idx); @@ -3362,12 +3590,11 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *mem, bool swap) return res_counter_read_u64(&mem->memsw, RES_USAGE); } - val = mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_CACHE); - val += mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_RSS); + val = mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_CACHE); + val += mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_RSS); if (swap) - val += mem_cgroup_get_recursive_idx_stat(mem, - MEM_CGROUP_STAT_SWAPOUT); + val += mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_SWAPOUT); return val << PAGE_SHIFT; } @@ -3587,9 +3814,9 @@ mem_cgroup_get_local_stat(struct mem_cgroup *mem, struct mcs_total_stat *s) s->stat[MCS_RSS] += val * PAGE_SIZE; 
val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_MAPPED); s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE; - val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_PGPGIN_COUNT); + val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGPGIN); s->stat[MCS_PGPGIN] += val; - val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_PGPGOUT_COUNT); + val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGPGOUT); s->stat[MCS_PGPGOUT] += val; if (do_swap_account) { val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT); @@ -3713,9 +3940,7 @@ static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft, return -EINVAL; } - spin_lock(&memcg->reclaim_param_lock); memcg->swappiness = val; - spin_unlock(&memcg->reclaim_param_lock); cgroup_unlock(); @@ -4177,13 +4402,11 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) */ if (!node_state(node, N_NORMAL_MEMORY)) tmp = -1; - pn = kmalloc_node(sizeof(*pn), GFP_KERNEL, tmp); + pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp); if (!pn) return 1; mem->info.nodeinfo[node] = pn; - memset(pn, 0, sizeof(*pn)); - for (zone = 0; zone < MAX_NR_ZONES; zone++) { mz = &pn->zoneinfo[zone]; for_each_lru(l) @@ -4207,14 +4430,13 @@ static struct mem_cgroup *mem_cgroup_alloc(void) /* Can be very big if MAX_NUMNODES is very big */ if (size < PAGE_SIZE) - mem = kmalloc(size, GFP_KERNEL); + mem = kzalloc(size, GFP_KERNEL); else - mem = vmalloc(size); + mem = vzalloc(size); if (!mem) return NULL; - memset(mem, 0, size); mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu); if (!mem->stat) goto out_free; @@ -4374,7 +4596,6 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) res_counter_init(&mem->memsw, NULL); } mem->last_scanned_child = 0; - spin_lock_init(&mem->reclaim_param_lock); INIT_LIST_HEAD(&mem->oom_notify); if (parent) @@ -4462,7 +4683,7 @@ one_by_one: batch_count = PRECHARGE_COUNT_AT_ONCE; cond_resched(); } - ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false); + ret = 
__mem_cgroup_try_charge(NULL, GFP_KERNEL, 1, &mem, false); if (ret || !mem) /* mem_cgroup_clear_mc() will do uncharge later */ return -ENOMEM; @@ -4624,6 +4845,8 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, pte_t *pte; spinlock_t *ptl; + split_huge_page_pmd(walk->mm, pmd); + pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); for (; addr != end; pte++, addr += PAGE_SIZE) if (is_target_pte_for_mc(vma, addr, *pte, NULL)) @@ -4639,7 +4862,7 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) unsigned long precharge; struct vm_area_struct *vma; - /* We've already held the mmap_sem */ + down_read(&mm->mmap_sem); for (vma = mm->mmap; vma; vma = vma->vm_next) { struct mm_walk mem_cgroup_count_precharge_walk = { .pmd_entry = mem_cgroup_count_precharge_pte_range, @@ -4651,6 +4874,7 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) walk_page_range(vma->vm_start, vma->vm_end, &mem_cgroup_count_precharge_walk); } + up_read(&mm->mmap_sem); precharge = mc.precharge; mc.precharge = 0; @@ -4660,10 +4884,15 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) static int mem_cgroup_precharge_mc(struct mm_struct *mm) { - return mem_cgroup_do_precharge(mem_cgroup_count_precharge(mm)); + unsigned long precharge = mem_cgroup_count_precharge(mm); + + VM_BUG_ON(mc.moving_task); + mc.moving_task = current; + return mem_cgroup_do_precharge(precharge); } -static void mem_cgroup_clear_mc(void) +/* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. 
*/ +static void __mem_cgroup_clear_mc(void) { struct mem_cgroup *from = mc.from; struct mem_cgroup *to = mc.to; @@ -4698,23 +4927,28 @@ static void mem_cgroup_clear_mc(void) PAGE_SIZE * mc.moved_swap); } /* we've already done mem_cgroup_get(mc.to) */ - mc.moved_swap = 0; } - if (mc.mm) { - up_read(&mc.mm->mmap_sem); - mmput(mc.mm); - } + memcg_oom_recover(from); + memcg_oom_recover(to); + wake_up_all(&mc.waitq); +} + +static void mem_cgroup_clear_mc(void) +{ + struct mem_cgroup *from = mc.from; + + /* + * we must clear moving_task before waking up waiters at the end of + * task migration. + */ + mc.moving_task = NULL; + __mem_cgroup_clear_mc(); spin_lock(&mc.lock); mc.from = NULL; mc.to = NULL; spin_unlock(&mc.lock); - mc.moving_task = NULL; - mc.mm = NULL; mem_cgroup_end_move(from); - memcg_oom_recover(from); - memcg_oom_recover(to); - wake_up_all(&mc.waitq); } static int mem_cgroup_can_attach(struct cgroup_subsys *ss, @@ -4736,38 +4970,23 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss, return 0; /* We move charges only when we move a owner of the mm */ if (mm->owner == p) { - /* - * We do all the move charge works under one mmap_sem to - * avoid deadlock with down_write(&mmap_sem) - * -> try_charge() -> if (mc.moving_task) -> sleep. - */ - down_read(&mm->mmap_sem); - VM_BUG_ON(mc.from); VM_BUG_ON(mc.to); VM_BUG_ON(mc.precharge); VM_BUG_ON(mc.moved_charge); VM_BUG_ON(mc.moved_swap); - VM_BUG_ON(mc.moving_task); - VM_BUG_ON(mc.mm); - mem_cgroup_start_move(from); spin_lock(&mc.lock); mc.from = from; mc.to = mem; - mc.precharge = 0; - mc.moved_charge = 0; - mc.moved_swap = 0; spin_unlock(&mc.lock); - mc.moving_task = current; - mc.mm = mm; + /* We set mc.moving_task later */ ret = mem_cgroup_precharge_mc(mm); if (ret) mem_cgroup_clear_mc(); - /* We call up_read() and mmput() in clear_mc(). 
*/ - } else - mmput(mm); + } + mmput(mm); } return ret; } @@ -4789,6 +5008,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, pte_t *pte; spinlock_t *ptl; + split_huge_page_pmd(walk->mm, pmd); retry: pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); for (; addr != end; addr += PAGE_SIZE) { @@ -4809,8 +5029,8 @@ retry: if (isolate_lru_page(page)) goto put; pc = lookup_page_cgroup(page); - if (!mem_cgroup_move_account(pc, - mc.from, mc.to, false)) { + if (!mem_cgroup_move_account(page, 1, pc, + mc.from, mc.to, false)) { mc.precharge--; /* we uncharge from mc.from later. */ mc.moved_charge++; @@ -4855,7 +5075,19 @@ static void mem_cgroup_move_charge(struct mm_struct *mm) struct vm_area_struct *vma; lru_add_drain_all(); - /* We've already held the mmap_sem */ +retry: + if (unlikely(!down_read_trylock(&mm->mmap_sem))) { + /* + * Someone who are holding the mmap_sem might be waiting in + * waitq. So we cancel all extra charges, wake up all waiters, + * and retry. Because we cancel precharges, we might not be able + * to move enough charges, but moving charge is a best-effort + * feature anyway, so it wouldn't be a big problem. 
+ */ + __mem_cgroup_clear_mc(); + cond_resched(); + goto retry; + } for (vma = mm->mmap; vma; vma = vma->vm_next) { int ret; struct mm_walk mem_cgroup_move_charge_walk = { @@ -4874,6 +5106,7 @@ static void mem_cgroup_move_charge(struct mm_struct *mm) */ break; } + up_read(&mm->mmap_sem); } static void mem_cgroup_move_task(struct cgroup_subsys *ss, @@ -4882,11 +5115,17 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss, struct task_struct *p, bool threadgroup) { - if (!mc.mm) + struct mm_struct *mm; + + if (!mc.to) /* no need to move charge */ return; - mem_cgroup_move_charge(mc.mm); + mm = get_task_mm(p); + if (mm) { + mem_cgroup_move_charge(mm); + mmput(mm); + } mem_cgroup_clear_mc(); } #else /* !CONFIG_MMU */ @@ -4930,9 +5169,9 @@ struct cgroup_subsys mem_cgroup_subsys = { static int __init enable_swap_account(char *s) { /* consider enabled if no parameter or 1 is given */ - if (!s || !strcmp(s, "1")) + if (!(*s) || !strcmp(s, "=1")) really_do_swap_account = 1; - else if (!strcmp(s, "0")) + else if (!strcmp(s, "=0")) really_do_swap_account = 0; return 1; } @@ -4940,7 +5179,8 @@ __setup("swapaccount", enable_swap_account); static int __init disable_swap_account(char *s) { - enable_swap_account("0"); + printk_once("noswapaccount is deprecated and will be removed in 2.6.40. Use swapaccount=0 instead\n"); + enable_swap_account("=0"); return 1; } __setup("noswapaccount", disable_swap_account); diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 46ab2c044b0e..2b9a5eef39e0 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -203,12 +203,12 @@ static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno, #ifdef __ARCH_SI_TRAPNO si.si_trapno = trapno; #endif - si.si_addr_lsb = compound_order(compound_head(page)) + PAGE_SHIFT; + si.si_addr_lsb = compound_trans_order(compound_head(page)) + PAGE_SHIFT; /* * Don't use force here, it's convenient if the signal * can be temporarily blocked. 
* This could cause a loop when the user sets SIGBUS - * to SIG_IGN, but hopefully noone will do that? + * to SIG_IGN, but hopefully no one will do that? */ ret = send_sig_info(SIGBUS, &si, t); /* synchronous? */ if (ret < 0) @@ -233,8 +233,8 @@ void shake_page(struct page *p, int access) } /* - * Only all shrink_slab here (which would also - * shrink other caches) if access is not potentially fatal. + * Only call shrink_slab here (which would also shrink other caches) if + * access is not potentially fatal. */ if (access) { int nr; @@ -634,7 +634,7 @@ static int me_pagecache_dirty(struct page *p, unsigned long pfn) * when the page is reread or dropped. If an * application assumes it will always get error on * fsync, but does other operations on the fd before - * and the page is dropped inbetween then the error + * and the page is dropped between then the error * will not be properly reported. * * This can already happen even without hwpoisoned @@ -728,7 +728,7 @@ static int me_huge_page(struct page *p, unsigned long pfn) * The table matches them in order and calls the right handler. * * This is quite tricky because we can access page at any time - * in its live cycle, so all accesses have to be extremly careful. + * in its live cycle, so all accesses have to be extremely careful. * * This is not complete. More states could be added. * For any missing state don't attempt recovery. @@ -854,6 +854,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, int ret; int kill = 1; struct page *hpage = compound_head(p); + struct page *ppage; if (PageReserved(p) || PageSlab(p)) return SWAP_SUCCESS; @@ -895,6 +896,44 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, } /* + * ppage: poisoned page + * if p is regular page(4k page) + * ppage == real poisoned page; + * else p is hugetlb or THP, ppage == head page. 
+ */ + ppage = hpage; + + if (PageTransHuge(hpage)) { + /* + * Verify that this isn't a hugetlbfs head page, the check for + * PageAnon is just for avoid tripping a split_huge_page + * internal debug check, as split_huge_page refuses to deal with + * anything that isn't an anon page. PageAnon can't go away fro + * under us because we hold a refcount on the hpage, without a + * refcount on the hpage. split_huge_page can't be safely called + * in the first place, having a refcount on the tail isn't + * enough * to be safe. + */ + if (!PageHuge(hpage) && PageAnon(hpage)) { + if (unlikely(split_huge_page(hpage))) { + /* + * FIXME: if splitting THP is failed, it is + * better to stop the following operation rather + * than causing panic by unmapping. System might + * survive if the page is freed later. + */ + printk(KERN_INFO + "MCE %#lx: failed to split THP\n", pfn); + + BUG_ON(!PageHWPoison(p)); + return SWAP_FAIL; + } + /* THP is split, so ppage should be the real poisoned page. */ + ppage = p; + } + } + + /* * First collect all the processes that have the page * mapped in dirty form. This has to be done before try_to_unmap, * because ttu takes the rmap data structures down. @@ -903,12 +942,18 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, * there's nothing that can be done. */ if (kill) - collect_procs(hpage, &tokill); + collect_procs(ppage, &tokill); - ret = try_to_unmap(hpage, ttu); + if (hpage != ppage) + lock_page(ppage); + + ret = try_to_unmap(ppage, ttu); if (ret != SWAP_SUCCESS) printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n", - pfn, page_mapcount(hpage)); + pfn, page_mapcount(ppage)); + + if (hpage != ppage) + unlock_page(ppage); /* * Now that the dirty bit has been propagated to the @@ -919,7 +964,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, * use a more force-full uncatchable kill to prevent * any accesses to the poisoned memory. 
*/ - kill_procs_ao(&tokill, !!PageDirty(hpage), trapno, + kill_procs_ao(&tokill, !!PageDirty(ppage), trapno, ret != SWAP_SUCCESS, p, pfn); return ret; @@ -928,7 +973,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, static void set_page_hwpoison_huge_page(struct page *hpage) { int i; - int nr_pages = 1 << compound_order(hpage); + int nr_pages = 1 << compound_trans_order(hpage); for (i = 0; i < nr_pages; i++) SetPageHWPoison(hpage + i); } @@ -936,7 +981,7 @@ static void set_page_hwpoison_huge_page(struct page *hpage) static void clear_page_hwpoison_huge_page(struct page *hpage) { int i; - int nr_pages = 1 << compound_order(hpage); + int nr_pages = 1 << compound_trans_order(hpage); for (i = 0; i < nr_pages; i++) ClearPageHWPoison(hpage + i); } @@ -966,7 +1011,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags) return 0; } - nr_pages = 1 << compound_order(hpage); + nr_pages = 1 << compound_trans_order(hpage); atomic_long_add(nr_pages, &mce_bad_pages); /* @@ -993,7 +1038,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags) * Check "just unpoisoned", "filter hit", and * "race with other subpage." */ - lock_page_nosync(hpage); + lock_page(hpage); if (!PageHWPoison(hpage) || (hwpoison_filter(p) && TestClearPageHWPoison(p)) || (p != hpage && TestSetPageHWPoison(hpage))) { @@ -1020,19 +1065,22 @@ int __memory_failure(unsigned long pfn, int trapno, int flags) * The check (unnecessarily) ignores LRU pages being isolated and * walked by the page reclaim code, however that's not a big loss. */ - if (!PageLRU(p) && !PageHuge(p)) - shake_page(p, 0); - if (!PageLRU(p) && !PageHuge(p)) { - /* - * shake_page could have turned it free. - */ - if (is_free_buddy_page(p)) { - action_result(pfn, "free buddy, 2nd try", DELAYED); - return 0; + if (!PageHuge(p) && !PageTransCompound(p)) { + if (!PageLRU(p)) + shake_page(p, 0); + if (!PageLRU(p)) { + /* + * shake_page could have turned it free. 
+ */ + if (is_free_buddy_page(p)) { + action_result(pfn, "free buddy, 2nd try", + DELAYED); + return 0; + } + action_result(pfn, "non LRU", IGNORED); + put_page(p); + return -EBUSY; } - action_result(pfn, "non LRU", IGNORED); - put_page(p); - return -EBUSY; } /* @@ -1040,7 +1088,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags) * It's very difficult to mess with pages currently under IO * and in many cases impossible, so we just avoid it here. */ - lock_page_nosync(hpage); + lock_page(hpage); /* * unpoison always clear PG_hwpoison inside page lock @@ -1062,7 +1110,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags) * For error on the tail page, we should set PG_hwpoison * on the head page to show that the hugepage is hwpoisoned */ - if (PageTail(p) && TestSetPageHWPoison(hpage)) { + if (PageHuge(p) && PageTail(p) && TestSetPageHWPoison(hpage)) { action_result(pfn, "hugepage already hardware poisoned", IGNORED); unlock_page(hpage); @@ -1082,7 +1130,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags) /* * Now take care of user space mappings. - * Abort on fail: __remove_from_page_cache() assumes unmapped page. + * Abort on fail: __delete_from_page_cache() assumes unmapped page. */ if (hwpoison_user_mappings(p, pfn, trapno) != SWAP_SUCCESS) { printk(KERN_ERR "MCE %#lx: cannot unmap page, give up\n", pfn); @@ -1164,7 +1212,7 @@ int unpoison_memory(unsigned long pfn) return 0; } - nr_pages = 1 << compound_order(page); + nr_pages = 1 << compound_trans_order(page); if (!get_page_unless_zero(page)) { /* @@ -1183,7 +1231,7 @@ int unpoison_memory(unsigned long pfn) return 0; } - lock_page_nosync(page); + lock_page(page); /* * This test is racy because PG_hwpoison is set outside of page lock. * That's acceptable because that won't trigger kernel panic. Instead, @@ -1290,9 +1338,13 @@ static int soft_offline_huge_page(struct page *page, int flags) /* Keep page count to indicate a given hugepage is isolated. 
*/ list_add(&hpage->lru, &pagelist); - ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0); + ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0, + true); if (ret) { - putback_lru_pages(&pagelist); + struct page *page1, *page2; + list_for_each_entry_safe(page1, page2, &pagelist, lru) + put_page(page1); + pr_debug("soft offline: %#lx: migration failed %d, type %lx\n", pfn, ret, page->flags); if (ret > 0) @@ -1301,7 +1353,7 @@ static int soft_offline_huge_page(struct page *page, int flags) } done: if (!PageHWPoison(hpage)) - atomic_long_add(1 << compound_order(hpage), &mce_bad_pages); + atomic_long_add(1 << compound_trans_order(hpage), &mce_bad_pages); set_page_hwpoison_huge_page(hpage); dequeue_hwpoisoned_huge_page(hpage); /* keep elevated page count for bad page */ @@ -1413,8 +1465,10 @@ int soft_offline_page(struct page *page, int flags) LIST_HEAD(pagelist); list_add(&page->lru, &pagelist); - ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0); + ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, + 0, true); if (ret) { + putback_lru_pages(&pagelist); pr_info("soft offline: %#lx: migration failed %d, type %lx\n", pfn, ret, page->flags); if (ret > 0) @@ -1433,35 +1487,3 @@ done: /* keep elevated page count for bad page */ return ret; } - -/* - * The caller must hold current->mm->mmap_sem in read mode. 
- */ -int is_hwpoison_address(unsigned long addr) -{ - pgd_t *pgdp; - pud_t pud, *pudp; - pmd_t pmd, *pmdp; - pte_t pte, *ptep; - swp_entry_t entry; - - pgdp = pgd_offset(current->mm, addr); - if (!pgd_present(*pgdp)) - return 0; - pudp = pud_offset(pgdp, addr); - pud = *pudp; - if (!pud_present(pud) || pud_large(pud)) - return 0; - pmdp = pmd_offset(pudp, addr); - pmd = *pmdp; - if (!pmd_present(pmd) || pmd_large(pmd)) - return 0; - ptep = pte_offset_map(pmdp, addr); - pte = *ptep; - pte_unmap(ptep); - if (!is_swap_pte(pte)) - return 0; - entry = pte_to_swp_entry(pte); - return is_hwpoison_entry(entry); -} -EXPORT_SYMBOL_GPL(is_hwpoison_address); diff --git a/mm/memory.c b/mm/memory.c index 02e48aa0ed13..61e66f026563 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -394,9 +394,11 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma, } } -int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address) +int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, + pmd_t *pmd, unsigned long address) { pgtable_t new = pte_alloc_one(mm, address); + int wait_split_huge_page; if (!new) return -ENOMEM; @@ -416,14 +418,18 @@ int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address) smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */ spin_lock(&mm->page_table_lock); - if (!pmd_present(*pmd)) { /* Has another populated it ? */ + wait_split_huge_page = 0; + if (likely(pmd_none(*pmd))) { /* Has another populated it ? */ mm->nr_ptes++; pmd_populate(mm, pmd, new); new = NULL; - } + } else if (unlikely(pmd_trans_splitting(*pmd))) + wait_split_huge_page = 1; spin_unlock(&mm->page_table_lock); if (new) pte_free(mm, new); + if (wait_split_huge_page) + wait_split_huge_page(vma->anon_vma, pmd); return 0; } @@ -436,10 +442,11 @@ int __pte_alloc_kernel(pmd_t *pmd, unsigned long address) smp_wmb(); /* See comment in __pte_alloc */ spin_lock(&init_mm.page_table_lock); - if (!pmd_present(*pmd)) { /* Has another populated it ? 
*/ + if (likely(pmd_none(*pmd))) { /* Has another populated it ? */ pmd_populate_kernel(&init_mm, pmd, new); new = NULL; - } + } else + VM_BUG_ON(pmd_trans_splitting(*pmd)); spin_unlock(&init_mm.page_table_lock); if (new) pte_free_kernel(&init_mm, new); @@ -719,9 +726,9 @@ out_set_pte: return 0; } -static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, - pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma, - unsigned long addr, unsigned long end) +int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, + pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma, + unsigned long addr, unsigned long end) { pte_t *orig_src_pte, *orig_dst_pte; pte_t *src_pte, *dst_pte; @@ -795,6 +802,17 @@ static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src src_pmd = pmd_offset(src_pud, addr); do { next = pmd_addr_end(addr, end); + if (pmd_trans_huge(*src_pmd)) { + int err; + VM_BUG_ON(next-addr != HPAGE_PMD_SIZE); + err = copy_huge_pmd(dst_mm, src_mm, + dst_pmd, src_pmd, addr, vma); + if (err == -ENOMEM) + return -ENOMEM; + if (!err) + continue; + /* fall through */ + } if (pmd_none_or_clear_bad(src_pmd)) continue; if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd, @@ -997,6 +1015,16 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); + if (pmd_trans_huge(*pmd)) { + if (next-addr != HPAGE_PMD_SIZE) { + VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem)); + split_huge_page_pmd(vma->vm_mm, pmd); + } else if (zap_huge_pmd(tlb, vma, pmd)) { + (*zap_work)--; + continue; + } + /* fall through */ + } if (pmd_none_or_clear_bad(pmd)) { (*zap_work)--; continue; @@ -1262,7 +1290,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, pud = pud_offset(pgd, address); if (pud_none(*pud)) goto no_page_table; - if (pud_huge(*pud)) { + if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) { BUG_ON(flags & FOLL_GET); page = 
follow_huge_pud(mm, address, pud, flags & FOLL_WRITE); goto out; @@ -1273,11 +1301,32 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, pmd = pmd_offset(pud, address); if (pmd_none(*pmd)) goto no_page_table; - if (pmd_huge(*pmd)) { + if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) { BUG_ON(flags & FOLL_GET); page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); goto out; } + if (pmd_trans_huge(*pmd)) { + if (flags & FOLL_SPLIT) { + split_huge_page_pmd(mm, pmd); + goto split_fallthrough; + } + spin_lock(&mm->page_table_lock); + if (likely(pmd_trans_huge(*pmd))) { + if (unlikely(pmd_trans_splitting(*pmd))) { + spin_unlock(&mm->page_table_lock); + wait_split_huge_page(vma->anon_vma, pmd); + } else { + page = follow_trans_huge_pmd(mm, address, + pmd, flags); + spin_unlock(&mm->page_table_lock); + goto out; + } + } else + spin_unlock(&mm->page_table_lock); + /* fall through */ + } +split_fallthrough: if (unlikely(pmd_bad(*pmd))) goto no_page_table; @@ -1310,6 +1359,28 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, */ mark_page_accessed(page); } + if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { + /* + * The preliminary mapping check is mainly to avoid the + * pointless overhead of lock_page on the ZERO_PAGE + * which might bounce very badly if there is contention. + * + * If the page is already locked, we don't need to + * handle it now - vmscan will handle it later if and + * when it attempts to reclaim the page. + */ + if (page->mapping && trylock_page(page)) { + lru_add_drain(); /* push cached pages to LRU */ + /* + * Because we lock page here and migration is + * blocked by the pte's page reference, we need + * only check for file-cache page truncation. 
+ */ + if (page->mapping) + mlock_vma_page(page); + unlock_page(page); + } + } unlock: pte_unmap_unlock(ptep, ptl); out: @@ -1339,9 +1410,65 @@ no_page_table: return page; } +static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long addr) +{ + return stack_guard_page_start(vma, addr) || + stack_guard_page_end(vma, addr+PAGE_SIZE); +} + +/** + * __get_user_pages() - pin user pages in memory + * @tsk: task_struct of target task + * @mm: mm_struct of target mm + * @start: starting user address + * @nr_pages: number of pages from start to pin + * @gup_flags: flags modifying pin behaviour + * @pages: array that receives pointers to the pages pinned. + * Should be at least nr_pages long. Or NULL, if caller + * only intends to ensure the pages are faulted in. + * @vmas: array of pointers to vmas corresponding to each page. + * Or NULL if the caller does not require them. + * @nonblocking: whether waiting for disk IO or mmap_sem contention + * + * Returns number of pages pinned. This may be fewer than the number + * requested. If nr_pages is 0 or negative, returns 0. If no pages + * were pinned, returns -errno. Each page returned must be released + * with a put_page() call when it is finished with. vmas will only + * remain valid while mmap_sem is held. + * + * Must be called with mmap_sem held for read or write. + * + * __get_user_pages walks a process's page tables and takes a reference to + * each struct page that each user address corresponds to at a given + * instant. That is, it takes the page that would be accessed if a user + * thread accesses the given user virtual address at that instant. + * + * This does not guarantee that the page exists in the user mappings when + * __get_user_pages returns, and there may even be a completely different + * page there in some cases (eg. if mmapped pagecache has been invalidated + * and subsequently re faulted). However it does guarantee that the page + * won't be freed completely. 
And mostly callers simply care that the page + * contains data that was valid *at some point in time*. Typically, an IO + * or similar operation cannot guarantee anything stronger anyway because + * locks can't be held over the syscall boundary. + * + * If @gup_flags & FOLL_WRITE == 0, the page must not be written to. If + * the page is written to, set_page_dirty (or set_page_dirty_lock, as + * appropriate) must be called after the page is finished with, and + * before put_page is called. + * + * If @nonblocking != NULL, __get_user_pages will not wait for disk IO + * or mmap_sem contention, and if waiting is needed to pin all pages, + * *@nonblocking will be set to 0. + * + * In most cases, get_user_pages or get_user_pages_fast should be used + * instead of __get_user_pages. __get_user_pages should be used only if + * you need some special @gup_flags. + */ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, int nr_pages, unsigned int gup_flags, - struct page **pages, struct vm_area_struct **vmas) + struct page **pages, struct vm_area_struct **vmas, + int *nonblocking) { int i; unsigned long vm_flags; @@ -1365,9 +1492,8 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, struct vm_area_struct *vma; vma = find_extend_vma(mm, start); - if (!vma && in_gate_area(tsk, start)) { + if (!vma && in_gate_area(mm, start)) { unsigned long pg = start & PAGE_MASK; - struct vm_area_struct *gate_vma = get_gate_vma(tsk); pgd_t *pgd; pud_t *pud; pmd_t *pmd; @@ -1386,15 +1512,17 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, pmd = pmd_offset(pud, pg); if (pmd_none(*pmd)) return i ? : -EFAULT; + VM_BUG_ON(pmd_trans_huge(*pmd)); pte = pte_offset_map(pmd, pg); if (pte_none(*pte)) { pte_unmap(pte); return i ? 
: -EFAULT; } + vma = get_gate_vma(mm); if (pages) { struct page *page; - page = vm_normal_page(gate_vma, start, *pte); + page = vm_normal_page(vma, start, *pte); if (!page) { if (!(gup_flags & FOLL_DUMP) && is_zero_pfn(pte_pfn(*pte))) @@ -1408,12 +1536,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, get_page(page); } pte_unmap(pte); - if (vmas) - vmas[i] = gate_vma; - i++; - start += PAGE_SIZE; - nr_pages--; - continue; + goto next_page; } if (!vma || @@ -1441,24 +1564,52 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, cond_resched(); while (!(page = follow_page(vma, start, foll_flags))) { int ret; + unsigned int fault_flags = 0; + + /* For mlock, just skip the stack guard page. */ + if (foll_flags & FOLL_MLOCK) { + if (stack_guard_page(vma, start)) + goto next_page; + } + if (foll_flags & FOLL_WRITE) + fault_flags |= FAULT_FLAG_WRITE; + if (nonblocking) + fault_flags |= FAULT_FLAG_ALLOW_RETRY; + if (foll_flags & FOLL_NOWAIT) + fault_flags |= (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT); ret = handle_mm_fault(mm, vma, start, - (foll_flags & FOLL_WRITE) ? - FAULT_FLAG_WRITE : 0); + fault_flags); if (ret & VM_FAULT_ERROR) { if (ret & VM_FAULT_OOM) return i ? i : -ENOMEM; - if (ret & - (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE| - VM_FAULT_SIGBUS)) + if (ret & (VM_FAULT_HWPOISON | + VM_FAULT_HWPOISON_LARGE)) { + if (i) + return i; + else if (gup_flags & FOLL_HWPOISON) + return -EHWPOISON; + else + return -EFAULT; + } + if (ret & VM_FAULT_SIGBUS) return i ? 
i : -EFAULT; BUG(); } - if (ret & VM_FAULT_MAJOR) - tsk->maj_flt++; - else - tsk->min_flt++; + + if (tsk) { + if (ret & VM_FAULT_MAJOR) + tsk->maj_flt++; + else + tsk->min_flt++; + } + + if (ret & VM_FAULT_RETRY) { + if (nonblocking) + *nonblocking = 0; + return i; + } /* * The VM_FAULT_WRITE bit tells us that @@ -1486,6 +1637,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, flush_anon_page(vma, page, start); flush_dcache_page(page); } +next_page: if (vmas) vmas[i] = vma; i++; @@ -1495,10 +1647,12 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, } while (nr_pages); return i; } +EXPORT_SYMBOL(__get_user_pages); /** * get_user_pages() - pin user pages in memory - * @tsk: task_struct of target task + * @tsk: the task_struct to use for page fault accounting, or + * NULL if faults are not to be recorded. * @mm: mm_struct of target mm * @start: starting user address * @nr_pages: number of pages from start to pin @@ -1559,7 +1713,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, if (force) flags |= FOLL_FORCE; - return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas); + return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas, + NULL); } EXPORT_SYMBOL(get_user_pages); @@ -1584,7 +1739,8 @@ struct page *get_dump_page(unsigned long addr) struct page *page; if (__get_user_pages(current, current->mm, addr, 1, - FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma) < 1) + FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma, + NULL) < 1) return NULL; flush_cache_page(vma, addr, page_to_pfn(page)); return page; @@ -1598,8 +1754,10 @@ pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr, pud_t * pud = pud_alloc(mm, pgd, addr); if (pud) { pmd_t * pmd = pmd_alloc(mm, pud, addr); - if (pmd) + if (pmd) { + VM_BUG_ON(pmd_trans_huge(*pmd)); return pte_alloc_map_lock(mm, pmd, addr, ptl); + } } return NULL; } @@ -1818,6 +1976,7 @@ static inline int remap_pmd_range(struct mm_struct *mm, pud_t 
*pud, pmd = pmd_alloc(mm, pud, addr); if (!pmd) return -ENOMEM; + VM_BUG_ON(pmd_trans_huge(*pmd)); do { next = pmd_addr_end(addr, end); if (remap_pte_range(mm, pmd, addr, next, @@ -2027,10 +2186,10 @@ EXPORT_SYMBOL_GPL(apply_to_page_range); * handle_pte_fault chooses page fault handler according to an entry * which was read non-atomically. Before making any commitment, on * those architectures or configurations (e.g. i386 with PAE) which - * might give a mix of unmatched parts, do_swap_page and do_file_page + * might give a mix of unmatched parts, do_swap_page and do_nonlinear_fault * must check under lock before unmapping the pte and proceeding * (but do_wp_page is only called after already making such a check; - * and do_anonymous_page and do_no_page can safely check later on). + * and do_anonymous_page can safely check later on). */ static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd, pte_t *page_table, pte_t orig_pte) @@ -2048,19 +2207,6 @@ static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd, return same; } -/* - * Do pte_mkwrite, but only if the vma says VM_WRITE. We do this when - * servicing faults for write access. In the normal case, do always want - * pte_mkwrite. But get_user_pages can cause write faults for mappings - * that do not have writing enabled, when used by access_process_vm. 
- */ -static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) -{ - if (likely(vma->vm_flags & VM_WRITE)) - pte = pte_mkwrite(pte); - return pte; -} - static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma) { /* @@ -2112,7 +2258,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, { struct page *old_page, *new_page; pte_t entry; - int reuse = 0, ret = 0; + int ret = 0; int page_mkwrite = 0; struct page *dirty_page = NULL; @@ -2144,19 +2290,20 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, &ptl); if (!pte_same(*page_table, orig_pte)) { unlock_page(old_page); - page_cache_release(old_page); goto unlock; } page_cache_release(old_page); } - reuse = reuse_swap_page(old_page); - if (reuse) + if (reuse_swap_page(old_page)) { /* * The page is all ours. Move it to our anon_vma so * the rmap code will not search our parent or siblings. * Protected against the rmap code by the page lock. 
*/ page_move_anon_rmap(old_page, vma, address); + unlock_page(old_page); + goto reuse; + } unlock_page(old_page); } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == (VM_WRITE|VM_SHARED))) { @@ -2212,7 +2359,6 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, &ptl); if (!pte_same(*page_table, orig_pte)) { unlock_page(old_page); - page_cache_release(old_page); goto unlock; } @@ -2220,18 +2366,52 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, } dirty_page = old_page; get_page(dirty_page); - reuse = 1; - } - if (reuse) { reuse: flush_cache_page(vma, address, pte_pfn(orig_pte)); entry = pte_mkyoung(orig_pte); entry = maybe_mkwrite(pte_mkdirty(entry), vma); if (ptep_set_access_flags(vma, address, page_table, entry,1)) update_mmu_cache(vma, address, page_table); + pte_unmap_unlock(page_table, ptl); ret |= VM_FAULT_WRITE; - goto unlock; + + if (!dirty_page) + return ret; + + /* + * Yes, Virginia, this is actually required to prevent a race + * with clear_page_dirty_for_io() from clearing the page dirty + * bit after it clear all dirty ptes, but before a racing + * do_wp_page installs a dirty pte. + * + * __do_fault is protected similarly. + */ + if (!page_mkwrite) { + wait_on_page_locked(dirty_page); + set_page_dirty_balance(dirty_page, page_mkwrite); + } + put_page(dirty_page); + if (page_mkwrite) { + struct address_space *mapping = dirty_page->mapping; + + set_page_dirty(dirty_page); + unlock_page(dirty_page); + page_cache_release(dirty_page); + if (mapping) { + /* + * Some device drivers do not set page.mapping + * but still dirty their pages + */ + balance_dirty_pages_ratelimited(mapping); + } + } + + /* file_update_time outside page_lock */ + if (vma->vm_file) + file_update_time(vma->vm_file); + + return ret; } /* @@ -2256,16 +2436,6 @@ gotten: } __SetPageUptodate(new_page); - /* - * Don't let another task, with possibly unlocked vma, - * keep the mlocked page. 
- */ - if ((vma->vm_flags & VM_LOCKED) && old_page) { - lock_page(old_page); /* for LRU manipulation */ - clear_page_mlock(old_page); - unlock_page(old_page); - } - if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)) goto oom_free_new; @@ -2333,42 +2503,19 @@ gotten: if (new_page) page_cache_release(new_page); - if (old_page) - page_cache_release(old_page); unlock: pte_unmap_unlock(page_table, ptl); - if (dirty_page) { + if (old_page) { /* - * Yes, Virginia, this is actually required to prevent a race - * with clear_page_dirty_for_io() from clearing the page dirty - * bit after it clear all dirty ptes, but before a racing - * do_wp_page installs a dirty pte. - * - * do_no_page is protected similarly. + * Don't let another task, with possibly unlocked vma, + * keep the mlocked page. */ - if (!page_mkwrite) { - wait_on_page_locked(dirty_page); - set_page_dirty_balance(dirty_page, page_mkwrite); - } - put_page(dirty_page); - if (page_mkwrite) { - struct address_space *mapping = dirty_page->mapping; - - set_page_dirty(dirty_page); - unlock_page(dirty_page); - page_cache_release(dirty_page); - if (mapping) { - /* - * Some device drivers do not set page.mapping - * but still dirty their pages - */ - balance_dirty_pages_ratelimited(mapping); - } + if ((ret & VM_FAULT_WRITE) && (vma->vm_flags & VM_LOCKED)) { + lock_page(old_page); /* LRU manipulation */ + munlock_vma_page(old_page); + unlock_page(old_page); } - - /* file_update_time outside page_lock */ - if (vma->vm_file) - file_update_time(vma->vm_file); + page_cache_release(old_page); } return ret; oom_free_new: @@ -2572,6 +2719,7 @@ void unmap_mapping_range(struct address_space *mapping, details.last_index = ULONG_MAX; details.i_mmap_lock = &mapping->i_mmap_lock; + mutex_lock(&mapping->unmap_mutex); spin_lock(&mapping->i_mmap_lock); /* Protect against endless unmapping loops */ @@ -2588,6 +2736,7 @@ void unmap_mapping_range(struct address_space *mapping, if (unlikely(!list_empty(&mapping->i_mmap_nonlinear))) 
unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details); spin_unlock(&mapping->i_mmap_lock); + mutex_unlock(&mapping->unmap_mutex); } EXPORT_SYMBOL(unmap_mapping_range); @@ -2629,7 +2778,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, swp_entry_t entry; pte_t pte; int locked; - struct mem_cgroup *ptr = NULL; + struct mem_cgroup *ptr; int exclusive = 0; int ret = 0; @@ -2975,12 +3124,6 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, goto out; } charged = 1; - /* - * Don't let another task, with possibly unlocked vma, - * keep the mlocked page. - */ - if (vma->vm_flags & VM_LOCKED) - clear_page_mlock(vmf.page); copy_user_highpage(page, vmf.page, address, vma); __SetPageUptodate(page); } else { @@ -3147,9 +3290,9 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, * but allow concurrent faults), and pte mapped but not yet locked. * We return with mmap_sem still held, but pte unmapped and unlocked. 
*/ -static inline int handle_pte_fault(struct mm_struct *mm, - struct vm_area_struct *vma, unsigned long address, - pte_t *pte, pmd_t *pmd, unsigned int flags) +int handle_pte_fault(struct mm_struct *mm, + struct vm_area_struct *vma, unsigned long address, + pte_t *pte, pmd_t *pmd, unsigned int flags) { pte_t entry; spinlock_t *ptl; @@ -3228,9 +3371,40 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, pmd = pmd_alloc(mm, pud, address); if (!pmd) return VM_FAULT_OOM; - pte = pte_alloc_map(mm, pmd, address); - if (!pte) + if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) { + if (!vma->vm_ops) + return do_huge_pmd_anonymous_page(mm, vma, address, + pmd, flags); + } else { + pmd_t orig_pmd = *pmd; + barrier(); + if (pmd_trans_huge(orig_pmd)) { + if (flags & FAULT_FLAG_WRITE && + !pmd_write(orig_pmd) && + !pmd_trans_splitting(orig_pmd)) + return do_huge_pmd_wp_page(mm, vma, address, + pmd, orig_pmd); + return 0; + } + } + + /* + * Use __pte_alloc instead of pte_alloc_map, because we can't + * run pte_offset_map on the pmd, if an huge pmd could + * materialize from under us from a different thread. + */ + if (unlikely(pmd_none(*pmd)) && __pte_alloc(mm, vma, pmd, address)) return VM_FAULT_OOM; + /* if an huge pmd materialized from under us just retry later */ + if (unlikely(pmd_trans_huge(*pmd))) + return 0; + /* + * A regular pmd is established and it can't morph into a huge pmd + * from under us anymore at this point because we hold the mmap_sem + * read mode and khugepaged takes it in write mode. So now it's + * safe to run pte_offset_map(). 
+ */ + pte = pte_offset_map(pmd, address); return handle_pte_fault(mm, vma, address, pte, pmd, flags); } @@ -3296,7 +3470,12 @@ int make_pages_present(unsigned long addr, unsigned long end) vma = find_vma(current->mm, addr); if (!vma) return -ENOMEM; - write = (vma->vm_flags & VM_WRITE) != 0; + /* + * We want to touch writable mappings with a write fault in order + * to break COW, except for shared mappings because these don't COW + * and we would not want to dirty them for nothing. + */ + write = (vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE; BUG_ON(addr >= end); BUG_ON(end > vma->vm_end); len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE; @@ -3331,7 +3510,7 @@ static int __init gate_vma_init(void) __initcall(gate_vma_init); #endif -struct vm_area_struct *get_gate_vma(struct task_struct *tsk) +struct vm_area_struct *get_gate_vma(struct mm_struct *mm) { #ifdef AT_SYSINFO_EHDR return &gate_vma; @@ -3340,7 +3519,7 @@ struct vm_area_struct *get_gate_vma(struct task_struct *tsk) #endif } -int in_gate_area_no_task(unsigned long addr) +int in_gate_area_no_mm(unsigned long addr) { #ifdef AT_SYSINFO_EHDR if ((addr >= FIXADDR_USER_START) && (addr < FIXADDR_USER_END)) @@ -3368,6 +3547,7 @@ static int __follow_pte(struct mm_struct *mm, unsigned long address, goto out; pmd = pmd_offset(pud, address); + VM_BUG_ON(pmd_trans_huge(*pmd)); if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) goto out; @@ -3480,20 +3660,15 @@ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, #endif /* - * Access another process' address space. - * Source/target buffer must be kernel space, - * Do not walk the page table directly, use get_user_pages + * Access another process' address space as given in mm. If non-NULL, use the + * given task for page fault accounting. 
*/ -int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write) +static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, + unsigned long addr, void *buf, int len, int write) { - struct mm_struct *mm; struct vm_area_struct *vma; void *old_buf = buf; - mm = get_task_mm(tsk); - if (!mm) - return 0; - down_read(&mm->mmap_sem); /* ignore errors, just check how much was successfully transferred */ while (len) { @@ -3510,7 +3685,7 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in */ #ifdef CONFIG_HAVE_IOREMAP_PROT vma = find_vma(mm, addr); - if (!vma) + if (!vma || vma->vm_start > addr) break; if (vma->vm_ops && vma->vm_ops->access) ret = vma->vm_ops->access(vma, addr, buf, @@ -3542,11 +3717,47 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in addr += bytes; } up_read(&mm->mmap_sem); - mmput(mm); return buf - old_buf; } +/** + * access_remote_vm - access another process' address space + * @mm: the mm_struct of the target address space + * @addr: start address to access + * @buf: source or destination buffer + * @len: number of bytes to transfer + * @write: whether the access is a write + * + * The caller must hold a reference on @mm. + */ +int access_remote_vm(struct mm_struct *mm, unsigned long addr, + void *buf, int len, int write) +{ + return __access_remote_vm(NULL, mm, addr, buf, len, write); +} + +/* + * Access another process' address space. + * Source/target buffer must be kernel space, + * Do not walk the page table directly, use get_user_pages + */ +int access_process_vm(struct task_struct *tsk, unsigned long addr, + void *buf, int len, int write) +{ + struct mm_struct *mm; + int ret; + + mm = get_task_mm(tsk); + if (!mm) + return 0; + + ret = __access_remote_vm(tsk, mm, addr, buf, len, write); + mmput(mm); + + return ret; +} + /* * Print the name of a VMA. 
*/ @@ -3608,3 +3819,74 @@ void might_fault(void) } EXPORT_SYMBOL(might_fault); #endif + +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS) +static void clear_gigantic_page(struct page *page, + unsigned long addr, + unsigned int pages_per_huge_page) +{ + int i; + struct page *p = page; + + might_sleep(); + for (i = 0; i < pages_per_huge_page; + i++, p = mem_map_next(p, page, i)) { + cond_resched(); + clear_user_highpage(p, addr + i * PAGE_SIZE); + } +} +void clear_huge_page(struct page *page, + unsigned long addr, unsigned int pages_per_huge_page) +{ + int i; + + if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) { + clear_gigantic_page(page, addr, pages_per_huge_page); + return; + } + + might_sleep(); + for (i = 0; i < pages_per_huge_page; i++) { + cond_resched(); + clear_user_highpage(page + i, addr + i * PAGE_SIZE); + } +} + +static void copy_user_gigantic_page(struct page *dst, struct page *src, + unsigned long addr, + struct vm_area_struct *vma, + unsigned int pages_per_huge_page) +{ + int i; + struct page *dst_base = dst; + struct page *src_base = src; + + for (i = 0; i < pages_per_huge_page; ) { + cond_resched(); + copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma); + + i++; + dst = mem_map_next(dst, dst_base, i); + src = mem_map_next(src, src_base, i); + } +} + +void copy_user_huge_page(struct page *dst, struct page *src, + unsigned long addr, struct vm_area_struct *vma, + unsigned int pages_per_huge_page) +{ + int i; + + if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) { + copy_user_gigantic_page(dst, src, addr, vma, + pages_per_huge_page); + return; + } + + might_sleep(); + for (i = 0; i < pages_per_huge_page; i++) { + cond_resched(); + copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma); + } +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 2c6523af5473..9ca1d604f7cd 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -82,9 
+82,10 @@ static void release_memory_resource(struct resource *res) #ifdef CONFIG_MEMORY_HOTPLUG_SPARSE #ifndef CONFIG_SPARSEMEM_VMEMMAP -static void get_page_bootmem(unsigned long info, struct page *page, int type) +static void get_page_bootmem(unsigned long info, struct page *page, + unsigned long type) { - atomic_set(&page->_mapcount, type); + page->lru.next = (struct list_head *) type; SetPagePrivate(page); set_page_private(page, info); atomic_inc(&page->_count); @@ -94,15 +95,16 @@ static void get_page_bootmem(unsigned long info, struct page *page, int type) * so use __ref to tell modpost not to generate a warning */ void __ref put_page_bootmem(struct page *page) { - int type; + unsigned long type; - type = atomic_read(&page->_mapcount); - BUG_ON(type >= -1); + type = (unsigned long) page->lru.next; + BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE || + type > MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE); if (atomic_dec_return(&page->_count) == 1) { ClearPagePrivate(page); set_page_private(page, 0); - reset_page_mapcount(page); + INIT_LIST_HEAD(&page->lru); __free_pages_bootmem(page, 0); } @@ -373,7 +375,7 @@ void online_page(struct page *page) #endif #ifdef CONFIG_FLATMEM - max_mapnr = max(page_to_pfn(page), max_mapnr); + max_mapnr = max(pfn, max_mapnr); #endif ClearPageReserved(page); @@ -407,6 +409,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) int ret; struct memory_notify arg; + lock_memory_hotplug(); arg.start_pfn = pfn; arg.nr_pages = nr_pages; arg.status_change_nid = -1; @@ -419,6 +422,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) ret = notifier_to_errno(ret); if (ret) { memory_notify(MEM_CANCEL_ONLINE, &arg); + unlock_memory_hotplug(); return ret; } /* @@ -443,6 +447,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) printk(KERN_DEBUG "online_pages %lx at %lx failed\n", nr_pages, pfn); memory_notify(MEM_CANCEL_ONLINE, &arg); + unlock_memory_hotplug(); return ret; } @@ -467,6 +472,7 @@ int online_pages(unsigned long 
pfn, unsigned long nr_pages) if (onlined_pages) memory_notify(MEM_ONLINE, &arg); + unlock_memory_hotplug(); return 0; } @@ -718,7 +724,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) pfn); dump_page(page); #endif - /* Becasue we don't have big zone->lock. we should + /* Because we don't have big zone->lock. we should check this again here. */ if (page_count(page)) { not_managed++; @@ -733,7 +739,8 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) goto out; } /* this function returns # of failed pages */ - ret = migrate_pages(&source, hotremove_migrate_alloc, 0, 1); + ret = migrate_pages(&source, hotremove_migrate_alloc, 0, + true, true); if (ret) putback_lru_pages(&source); } diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 11ff260fb282..959a8b8c7350 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -514,6 +514,7 @@ static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud, pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); + split_huge_page_pmd(vma->vm_mm, pmd); if (pmd_none_or_clear_bad(pmd)) continue; if (check_pte_range(vma, pmd, addr, next, nodes, @@ -935,7 +936,8 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, return PTR_ERR(vma); if (!list_empty(&pagelist)) { - err = migrate_pages(&pagelist, new_node_page, dest, 0); + err = migrate_pages(&pagelist, new_node_page, dest, + false, true); if (err) putback_lru_pages(&pagelist); } @@ -991,7 +993,7 @@ int do_migrate_pages(struct mm_struct *mm, * most recent <s, d> pair that moved (s != d). If we find a pair * that not only moved, but what's better, moved to an empty slot * (d is not set in tmp), then we break out then, with that pair. - * Otherwise when we finish scannng from_tmp, we at least have the + * Otherwise when we finish scanning from_tmp, we at least have the * most recent <s, d> pair that moved. 
If we get all the way through * the scan of tmp without finding any node that moved, much less * moved to an empty node, then there is nothing left worth migrating. @@ -1155,7 +1157,8 @@ static long do_mbind(unsigned long start, unsigned long len, if (!list_empty(&pagelist)) { nr_failed = migrate_pages(&pagelist, new_vma_page, - (unsigned long)vma, 0); + (unsigned long)vma, + false, true); if (nr_failed) putback_lru_pages(&pagelist); } @@ -1308,16 +1311,13 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode, /* Find the mm_struct */ rcu_read_lock(); - read_lock(&tasklist_lock); task = pid ? find_task_by_vpid(pid) : current; if (!task) { - read_unlock(&tasklist_lock); rcu_read_unlock(); err = -ESRCH; goto out; } mm = get_task_mm(task); - read_unlock(&tasklist_lock); rcu_read_unlock(); err = -EINVAL; @@ -1524,10 +1524,9 @@ static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy) } /* Return a zonelist indicated by gfp for node representing a mempolicy */ -static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy) +static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy, + int nd) { - int nd = numa_node_id(); - switch (policy->mode) { case MPOL_PREFERRED: if (!(policy->flags & MPOL_F_LOCAL)) @@ -1679,7 +1678,7 @@ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr, zl = node_zonelist(interleave_nid(*mpol, vma, addr, huge_page_shift(hstate_vma(vma))), gfp_flags); } else { - zl = policy_zonelist(gfp_flags, *mpol); + zl = policy_zonelist(gfp_flags, *mpol, numa_node_id()); if ((*mpol)->mode == MPOL_BIND) *nodemask = &(*mpol)->v.nodes; } @@ -1796,7 +1795,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, } /** - * alloc_page_vma - Allocate a page for a VMA. + * alloc_pages_vma - Allocate a page for a VMA. * * @gfp: * %GFP_USER user allocation. 
@@ -1805,6 +1804,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, * %GFP_FS allocation should not call back into a file system. * %GFP_ATOMIC don't sleep. * + * @order:Order of the GFP allocation. * @vma: Pointer to VMA or NULL if not available. * @addr: Virtual Address of the allocation. Must be inside the VMA. * @@ -1818,7 +1818,8 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, * Should be called with the mm_sem of the vma hold. */ struct page * -alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) +alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, + unsigned long addr, int node) { struct mempolicy *pol = get_vma_policy(current, vma, addr); struct zonelist *zl; @@ -1828,18 +1829,18 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) if (unlikely(pol->mode == MPOL_INTERLEAVE)) { unsigned nid; - nid = interleave_nid(pol, vma, addr, PAGE_SHIFT); + nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order); mpol_cond_put(pol); - page = alloc_page_interleave(gfp, 0, nid); + page = alloc_page_interleave(gfp, order, nid); put_mems_allowed(); return page; } - zl = policy_zonelist(gfp, pol); + zl = policy_zonelist(gfp, pol, node); if (unlikely(mpol_needs_cond_ref(pol))) { /* * slow path: ref counted shared policy */ - struct page *page = __alloc_pages_nodemask(gfp, 0, + struct page *page = __alloc_pages_nodemask(gfp, order, zl, policy_nodemask(gfp, pol)); __mpol_put(pol); put_mems_allowed(); @@ -1848,7 +1849,8 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) /* * fast path: default or task policy */ - page = __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol)); + page = __alloc_pages_nodemask(gfp, order, zl, + policy_nodemask(gfp, pol)); put_mems_allowed(); return page; } @@ -1889,7 +1891,8 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order) page = alloc_page_interleave(gfp, order, interleave_nodes(pol)); else page = 
__alloc_pages_nodemask(gfp, order, - policy_zonelist(gfp, pol), policy_nodemask(gfp, pol)); + policy_zonelist(gfp, pol, numa_node_id()), + policy_nodemask(gfp, pol)); put_mems_allowed(); return page; } @@ -1976,8 +1979,7 @@ int __mpol_equal(struct mempolicy *a, struct mempolicy *b) case MPOL_INTERLEAVE: return nodes_equal(a->v.nodes, b->v.nodes); case MPOL_PREFERRED: - return a->v.preferred_node == b->v.preferred_node && - a->flags == b->flags; + return a->v.preferred_node == b->v.preferred_node; default: BUG(); return 0; diff --git a/mm/migrate.c b/mm/migrate.c index fe5a3c6a5426..34132f8e9109 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -35,6 +35,8 @@ #include <linux/hugetlb.h> #include <linux/gfp.h> +#include <asm/tlbflush.h> + #include "internal.h" #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) @@ -111,6 +113,8 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, goto out; pmd = pmd_offset(pud, addr); + if (pmd_trans_huge(*pmd)) + goto out; if (!pmd_present(*pmd)) goto out; @@ -244,7 +248,7 @@ static int migrate_page_move_mapping(struct address_space *mapping, expected_count = 2 + page_has_private(page); if (page_count(page) != expected_count || - (struct page *)radix_tree_deref_slot(pslot) != page) { + radix_tree_deref_slot_protected(pslot, &mapping->tree_lock) != page) { spin_unlock_irq(&mapping->tree_lock); return -EAGAIN; } @@ -316,7 +320,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, expected_count = 2 + page_has_private(page); if (page_count(page) != expected_count || - (struct page *)radix_tree_deref_slot(pslot) != page) { + radix_tree_deref_slot_protected(pslot, &mapping->tree_lock) != page) { spin_unlock_irq(&mapping->tree_lock); return -EAGAIN; } @@ -371,7 +375,7 @@ void migrate_page_copy(struct page *newpage, struct page *page) * redo the accounting that clear_page_dirty_for_io undid, * but we can't use set_page_dirty because that function * is actually a signal that all 
of the page has become dirty. - * Wheras only part of our page may be dirty. + * Whereas only part of our page may be dirty. */ __set_page_dirty_nobuffers(newpage); } @@ -560,7 +564,7 @@ static int fallback_migrate_page(struct address_space *mapping, * == 0 - success */ static int move_to_new_page(struct page *newpage, struct page *page, - int remap_swapcache) + int remap_swapcache, bool sync) { struct address_space *mapping; int rc; @@ -582,18 +586,28 @@ static int move_to_new_page(struct page *newpage, struct page *page, mapping = page_mapping(page); if (!mapping) rc = migrate_page(mapping, newpage, page); - else if (mapping->a_ops->migratepage) + else { /* - * Most pages have a mapping and most filesystems - * should provide a migration function. Anonymous - * pages are part of swap space which also has its - * own migration function. This is the most common - * path for page migration. + * Do not writeback pages if !sync and migratepage is + * not pointing to migrate_page() which is nonblocking + * (swapcache/tmpfs uses migratepage = migrate_page). */ - rc = mapping->a_ops->migratepage(mapping, - newpage, page); - else - rc = fallback_migrate_page(mapping, newpage, page); + if (PageDirty(page) && !sync && + mapping->a_ops->migratepage != migrate_page) + rc = -EBUSY; + else if (mapping->a_ops->migratepage) + /* + * Most pages have a mapping and most filesystems + * should provide a migration function. Anonymous + * pages are part of swap space which also has its + * own migration function. This is the most common + * path for page migration. + */ + rc = mapping->a_ops->migratepage(mapping, + newpage, page); + else + rc = fallback_migrate_page(mapping, newpage, page); + } if (rc) { newpage->mapping = NULL; @@ -612,15 +626,14 @@ static int move_to_new_page(struct page *newpage, struct page *page, * to the newly allocated page in newpage. 
*/ static int unmap_and_move(new_page_t get_new_page, unsigned long private, - struct page *page, int force, int offlining) + struct page *page, int force, bool offlining, bool sync) { int rc = 0; int *result = NULL; struct page *newpage = get_new_page(page, private, &result); int remap_swapcache = 1; - int rcu_locked = 0; int charge = 0; - struct mem_cgroup *mem = NULL; + struct mem_cgroup *mem; struct anon_vma *anon_vma = NULL; if (!newpage) @@ -630,13 +643,33 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, /* page was freed from under us. So we are done. */ goto move_newpage; } + if (unlikely(PageTransHuge(page))) + if (unlikely(split_huge_page(page))) + goto move_newpage; /* prepare cgroup just returns 0 or -ENOMEM */ rc = -EAGAIN; if (!trylock_page(page)) { - if (!force) + if (!force || !sync) goto move_newpage; + + /* + * It's not safe for direct compaction to call lock_page. + * For example, during page readahead pages are added locked + * to the LRU. Later, when the IO completes the pages are + * marked uptodate and unlocked. However, the queueing + * could be merging multiple pages for one bio (e.g. + * mpage_readpages). If an allocation happens for the + * second or third page, the process can end up locking + * the same page twice and deadlocking. Rather than + * trying to be clever about what pages can be locked, + * avoid the use of lock_page for direct compaction + * altogether. 
+ */ + if (current->flags & PF_MEMALLOC) + goto move_newpage; + lock_page(page); } @@ -655,7 +688,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, } /* charge against new page */ - charge = mem_cgroup_prepare_migration(page, newpage, &mem); + charge = mem_cgroup_prepare_migration(page, newpage, &mem, GFP_KERNEL); if (charge == -ENOMEM) { rc = -ENOMEM; goto unlock; @@ -663,6 +696,14 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, BUG_ON(charge); if (PageWriteback(page)) { + /* + * For !sync, there is no point retrying as the retry loop + * is expected to be too short for PageWriteback to be cleared + */ + if (!sync) { + rc = -EBUSY; + goto uncharge; + } if (!force) goto uncharge; wait_on_page_writeback(page); @@ -670,20 +711,26 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, /* * By try_to_unmap(), page->mapcount goes down to 0 here. In this case, * we cannot notice that anon_vma is freed while we migrates a page. - * This rcu_read_lock() delays freeing anon_vma pointer until the end + * This get_anon_vma() delays freeing anon_vma pointer until the end * of migration. File cache pages are no problem because of page_lock() * File Caches may use write_page() or lock_page() in migration, then, * just care Anon page here. */ if (PageAnon(page)) { - rcu_read_lock(); - rcu_locked = 1; - - /* Determine how to safely use anon_vma */ - if (!page_mapped(page)) { - if (!PageSwapCache(page)) - goto rcu_unlock; - + /* + * Only page_lock_anon_vma() understands the subtleties of + * getting a hold on an anon_vma from outside one of its mms. 
+ */ + anon_vma = page_lock_anon_vma(page); + if (anon_vma) { + /* + * Take a reference count on the anon_vma if the + * page is mapped so that it is guaranteed to + * exist when the page is remapped later + */ + get_anon_vma(anon_vma); + page_unlock_anon_vma(anon_vma); + } else if (PageSwapCache(page)) { /* * We cannot be sure that the anon_vma of an unmapped * swapcache page is safe to use because we don't @@ -698,13 +745,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, */ remap_swapcache = 0; } else { - /* - * Take a reference count on the anon_vma if the - * page is mapped so that it is guaranteed to - * exist when the page is remapped later - */ - anon_vma = page_anon_vma(page); - get_anon_vma(anon_vma); + goto uncharge; } } @@ -721,16 +762,10 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, * free the metadata, so the page can be freed. */ if (!page->mapping) { - if (!PageAnon(page) && page_has_private(page)) { - /* - * Go direct to try_to_free_buffers() here because - * a) that's what try_to_release_page() would do anyway - * b) we may be under rcu_read_lock() here, so we can't - * use GFP_KERNEL which is what try_to_release_page() - * needs to be effective. 
- */ + VM_BUG_ON(PageAnon(page)); + if (page_has_private(page)) { try_to_free_buffers(page); - goto rcu_unlock; + goto uncharge; } goto skip_unmap; } @@ -740,24 +775,22 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, skip_unmap: if (!page_mapped(page)) - rc = move_to_new_page(newpage, page, remap_swapcache); + rc = move_to_new_page(newpage, page, remap_swapcache, sync); if (rc && remap_swapcache) remove_migration_ptes(page, page); -rcu_unlock: /* Drop an anon_vma reference if we took one */ if (anon_vma) - drop_anon_vma(anon_vma); + put_anon_vma(anon_vma); - if (rcu_locked) - rcu_read_unlock(); uncharge: if (!charge) - mem_cgroup_end_migration(mem, page, newpage); + mem_cgroup_end_migration(mem, page, newpage, rc == 0); unlock: unlock_page(page); +move_newpage: if (rc != -EAGAIN) { /* * A page that has been migrated has all references @@ -771,8 +804,6 @@ unlock: putback_lru_page(page); } -move_newpage: - /* * Move the new page to the LRU. If migration was not successful * then this will free the page. 
@@ -808,12 +839,11 @@ move_newpage: */ static int unmap_and_move_huge_page(new_page_t get_new_page, unsigned long private, struct page *hpage, - int force, int offlining) + int force, bool offlining, bool sync) { int rc = 0; int *result = NULL; struct page *new_hpage = get_new_page(hpage, private, &result); - int rcu_locked = 0; struct anon_vma *anon_vma = NULL; if (!new_hpage) @@ -822,39 +852,29 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, rc = -EAGAIN; if (!trylock_page(hpage)) { - if (!force) + if (!force || !sync) goto out; lock_page(hpage); } if (PageAnon(hpage)) { - rcu_read_lock(); - rcu_locked = 1; - - if (page_mapped(hpage)) { - anon_vma = page_anon_vma(hpage); - atomic_inc(&anon_vma->external_refcount); + anon_vma = page_lock_anon_vma(hpage); + if (anon_vma) { + get_anon_vma(anon_vma); + page_unlock_anon_vma(anon_vma); } } try_to_unmap(hpage, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS); if (!page_mapped(hpage)) - rc = move_to_new_page(new_hpage, hpage, 1); + rc = move_to_new_page(new_hpage, hpage, 1, sync); if (rc) remove_migration_ptes(hpage, hpage); - if (anon_vma && atomic_dec_and_lock(&anon_vma->external_refcount, - &anon_vma->lock)) { - int empty = list_empty(&anon_vma->head); - spin_unlock(&anon_vma->lock); - if (empty) - anon_vma_free(anon_vma); - } - - if (rcu_locked) - rcu_read_unlock(); + if (anon_vma) + put_anon_vma(anon_vma); out: unlock_page(hpage); @@ -885,12 +905,13 @@ out: * are movable anymore because to has become empty * or no retryable pages exist anymore. * Caller should call putback_lru_pages to return pages to the LRU - * or free list. + * or free list only if ret != 0. * * Return: Number of pages not migrated or error code. 
*/ int migrate_pages(struct list_head *from, - new_page_t get_new_page, unsigned long private, int offlining) + new_page_t get_new_page, unsigned long private, bool offlining, + bool sync) { int retry = 1; int nr_failed = 0; @@ -910,7 +931,8 @@ int migrate_pages(struct list_head *from, cond_resched(); rc = unmap_and_move(get_new_page, private, - page, pass > 2, offlining); + page, pass > 2, offlining, + sync); switch(rc) { case -ENOMEM: @@ -939,7 +961,8 @@ out: } int migrate_huge_pages(struct list_head *from, - new_page_t get_new_page, unsigned long private, int offlining) + new_page_t get_new_page, unsigned long private, bool offlining, + bool sync) { int retry = 1; int nr_failed = 0; @@ -955,7 +978,8 @@ int migrate_huge_pages(struct list_head *from, cond_resched(); rc = unmap_and_move_huge_page(get_new_page, - private, page, pass > 2, offlining); + private, page, pass > 2, offlining, + sync); switch(rc) { case -ENOMEM: @@ -974,10 +998,6 @@ int migrate_huge_pages(struct list_head *from, } rc = 0; out: - - list_for_each_entry_safe(page, page2, from, lru) - put_page(page); - if (rc) return rc; @@ -1040,7 +1060,7 @@ static int do_move_page_to_node_array(struct mm_struct *mm, if (!vma || pp->addr < vma->vm_start || !vma_migratable(vma)) goto set_status; - page = follow_page(vma, pp->addr, FOLL_GET); + page = follow_page(vma, pp->addr, FOLL_GET|FOLL_SPLIT); err = PTR_ERR(page); if (IS_ERR(page)) @@ -1088,7 +1108,7 @@ set_status: err = 0; if (!list_empty(&pagelist)) { err = migrate_pages(&pagelist, new_page_node, - (unsigned long)pm, 0); + (unsigned long)pm, 0, true); if (err) putback_lru_pages(&pagelist); } @@ -1285,14 +1305,14 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages, return -EPERM; /* Find the mm_struct */ - read_lock(&tasklist_lock); + rcu_read_lock(); task = pid ? 
find_task_by_vpid(pid) : current; if (!task) { - read_unlock(&tasklist_lock); + rcu_read_unlock(); return -ESRCH; } mm = get_task_mm(task); - read_unlock(&tasklist_lock); + rcu_read_unlock(); if (!mm) return -EINVAL; diff --git a/mm/mincore.c b/mm/mincore.c index 9ac42dc6d7b6..a4e6b9d75c76 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -154,6 +154,13 @@ static void mincore_pmd_range(struct vm_area_struct *vma, pud_t *pud, pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); + if (pmd_trans_huge(*pmd)) { + if (mincore_huge_pmd(vma, pmd, addr, next, vec)) { + vec += (next - addr) >> PAGE_SHIFT; + continue; + } + /* fall through */ + } if (pmd_none_or_clear_bad(pmd)) mincore_unmapped_range(vma, addr, next, vec); else diff --git a/mm/mlock.c b/mm/mlock.c index b70919ce4f72..516b2c2ddd5a 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -135,13 +135,6 @@ void munlock_vma_page(struct page *page) } } -static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long addr) -{ - return (vma->vm_flags & VM_GROWSDOWN) && - (vma->vm_start == addr) && - !vma_stack_continue(vma->vm_prev, addr); -} - /** * __mlock_vma_pages_range() - mlock a range of pages in the vma. * @vma: target vma @@ -155,13 +148,12 @@ static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long add * vma->vm_mm->mmap_sem must be held for at least read. 
*/ static long __mlock_vma_pages_range(struct vm_area_struct *vma, - unsigned long start, unsigned long end) + unsigned long start, unsigned long end, + int *nonblocking) { struct mm_struct *mm = vma->vm_mm; unsigned long addr = start; - struct page *pages[16]; /* 16 gives a reasonable batch */ int nr_pages = (end - start) / PAGE_SIZE; - int ret = 0; int gup_flags; VM_BUG_ON(start & ~PAGE_MASK); @@ -170,73 +162,24 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma, VM_BUG_ON(end > vma->vm_end); VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem)); - gup_flags = FOLL_TOUCH | FOLL_GET; - if (vma->vm_flags & VM_WRITE) + gup_flags = FOLL_TOUCH | FOLL_MLOCK; + /* + * We want to touch writable mappings with a write fault in order + * to break COW, except for shared mappings because these don't COW + * and we would not want to dirty them for nothing. + */ + if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE) gup_flags |= FOLL_WRITE; - /* We don't try to access the guard page of a stack vma */ - if (stack_guard_page(vma, start)) { - addr += PAGE_SIZE; - nr_pages--; - } - - while (nr_pages > 0) { - int i; - - cond_resched(); - - /* - * get_user_pages makes pages present if we are - * setting mlock. and this extra reference count will - * disable migration of this page. However, page may - * still be truncated out from under us. - */ - ret = __get_user_pages(current, mm, addr, - min_t(int, nr_pages, ARRAY_SIZE(pages)), - gup_flags, pages, NULL); - /* - * This can happen for, e.g., VM_NONLINEAR regions before - * a page has been allocated and mapped at a given offset, - * or for addresses that map beyond end of a file. - * We'll mlock the pages if/when they get faulted in. 
- */ - if (ret < 0) - break; - - lru_add_drain(); /* push cached pages to LRU */ - - for (i = 0; i < ret; i++) { - struct page *page = pages[i]; - - if (page->mapping) { - /* - * That preliminary check is mainly to avoid - * the pointless overhead of lock_page on the - * ZERO_PAGE: which might bounce very badly if - * there is contention. However, we're still - * dirtying its cacheline with get/put_page: - * we'll add another __get_user_pages flag to - * avoid it if that case turns out to matter. - */ - lock_page(page); - /* - * Because we lock page here and migration is - * blocked by the elevated reference, we need - * only check for file-cache page truncation. - */ - if (page->mapping) - mlock_vma_page(page); - unlock_page(page); - } - put_page(page); /* ref from get_user_pages() */ - } - - addr += ret * PAGE_SIZE; - nr_pages -= ret; - ret = 0; - } + /* + * We want mlock to succeed for regions that have any permissions + * other than PROT_NONE. + */ + if (vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC)) + gup_flags |= FOLL_FORCE; - return ret; /* 0 or negative error code */ + return __get_user_pages(current, mm, addr, nr_pages, gup_flags, + NULL, NULL, nonblocking); } /* @@ -278,9 +221,9 @@ long mlock_vma_pages_range(struct vm_area_struct *vma, if (!((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) || is_vm_hugetlb_page(vma) || - vma == get_gate_vma(current))) { + vma == get_gate_vma(current->mm))) { - __mlock_vma_pages_range(vma, start, end); + __mlock_vma_pages_range(vma, start, end, NULL); /* Hide errors from mmap() and other callers */ return 0; @@ -372,18 +315,10 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, int ret = 0; int lock = newflags & VM_LOCKED; - if (newflags == vma->vm_flags || - (vma->vm_flags & (VM_IO | VM_PFNMAP))) + if (newflags == vma->vm_flags || (vma->vm_flags & VM_SPECIAL) || + is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm)) goto out; /* don't set VM_LOCKED, don't count */ - if 
((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) || - is_vm_hugetlb_page(vma) || - vma == get_gate_vma(current)) { - if (lock) - make_pages_present(start, end); - goto out; /* don't set VM_LOCKED, don't count */ - } - pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); *prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma)); @@ -419,14 +354,10 @@ success: * set VM_LOCKED, __mlock_vma_pages_range will bring it back. */ - if (lock) { + if (lock) vma->vm_flags = newflags; - ret = __mlock_vma_pages_range(vma, start, end); - if (ret < 0) - ret = __mlock_posix_error_return(ret); - } else { + else munlock_vma_pages_range(vma, start, end); - } out: *prev = vma; @@ -439,7 +370,8 @@ static int do_mlock(unsigned long start, size_t len, int on) struct vm_area_struct * vma, * prev; int error; - len = PAGE_ALIGN(len); + VM_BUG_ON(start & ~PAGE_MASK); + VM_BUG_ON(len != PAGE_ALIGN(len)); end = start + len; if (end < start) return -EINVAL; @@ -482,6 +414,62 @@ static int do_mlock(unsigned long start, size_t len, int on) return error; } +static int do_mlock_pages(unsigned long start, size_t len, int ignore_errors) +{ + struct mm_struct *mm = current->mm; + unsigned long end, nstart, nend; + struct vm_area_struct *vma = NULL; + int locked = 0; + int ret = 0; + + VM_BUG_ON(start & ~PAGE_MASK); + VM_BUG_ON(len != PAGE_ALIGN(len)); + end = start + len; + + for (nstart = start; nstart < end; nstart = nend) { + /* + * We want to fault in pages for [nstart; end) address range. + * Find first corresponding VMA. + */ + if (!locked) { + locked = 1; + down_read(&mm->mmap_sem); + vma = find_vma(mm, nstart); + } else if (nstart >= vma->vm_end) + vma = vma->vm_next; + if (!vma || vma->vm_start >= end) + break; + /* + * Set [nstart; nend) to intersection of desired address + * range with the first VMA. Also, skip undesirable VMA types. 
+ */ + nend = min(end, vma->vm_end); + if (vma->vm_flags & (VM_IO | VM_PFNMAP)) + continue; + if (nstart < vma->vm_start) + nstart = vma->vm_start; + /* + * Now fault in a range of pages. __mlock_vma_pages_range() + * double checks the vma flags, so that it won't mlock pages + * if the vma was already munlocked. + */ + ret = __mlock_vma_pages_range(vma, nstart, nend, &locked); + if (ret < 0) { + if (ignore_errors) { + ret = 0; + continue; /* continue at next VMA */ + } + ret = __mlock_posix_error_return(ret); + break; + } + nend = nstart + ret * PAGE_SIZE; + ret = 0; + } + if (locked) + up_read(&mm->mmap_sem); + return ret; /* 0 or negative error code */ +} + SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) { unsigned long locked; @@ -507,6 +495,8 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) if ((locked <= lock_limit) || capable(CAP_IPC_LOCK)) error = do_mlock(start, len, 1); up_write(&current->mm->mmap_sem); + if (!error) + error = do_mlock_pages(start, len, 0); return error; } @@ -571,6 +561,10 @@ SYSCALL_DEFINE1(mlockall, int, flags) capable(CAP_IPC_LOCK)) ret = do_mlockall(flags); up_write(&current->mm->mmap_sem); + if (!ret && (flags & MCL_CURRENT)) { + /* Ignore errors */ + do_mlock_pages(0, TASK_SIZE, 1); + } out: return ret; } diff --git a/mm/mmap.c b/mm/mmap.c index b179abb1474a..772140c53ab1 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -29,6 +29,7 @@ #include <linux/mmu_notifier.h> #include <linux/perf_event.h> #include <linux/audit.h> +#include <linux/khugepaged.h> #include <asm/uaccess.h> #include <asm/cacheflush.h> @@ -253,7 +254,15 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) down_write(&mm->mmap_sem); #ifdef CONFIG_COMPAT_BRK - min_brk = mm->end_code; + /* + * CONFIG_COMPAT_BRK can still be overridden by setting + * randomize_va_space to 2, which will still cause mm->start_brk + * to be arbitrarily shifted + */ + if (current->brk_randomized) + min_brk = mm->start_brk; + else + min_brk = mm->end_data; #else min_brk = mm->start_brk; #endif @@
-588,6 +597,8 @@ again: remove_next = 1 + (end > next->vm_end); } } + vma_adjust_trans_huge(vma, start, end, adjust_next); + /* * When changing only vma->vm_end, we don't really need anon_vma * lock. This is a fairly rare case by itself, but the anon_vma @@ -815,6 +826,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, end, prev->vm_pgoff, NULL); if (err) return NULL; + khugepaged_enter_vma_merge(prev); return prev; } @@ -833,6 +845,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, next->vm_pgoff - pglen, NULL); if (err) return NULL; + khugepaged_enter_vma_merge(area); return area; } @@ -1754,13 +1767,17 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) size = address - vma->vm_start; grow = (address - vma->vm_end) >> PAGE_SHIFT; - error = acct_stack_growth(vma, size, grow); - if (!error) { - vma->vm_end = address; - perf_event_mmap(vma); + error = -ENOMEM; + if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) { + error = acct_stack_growth(vma, size, grow); + if (!error) { + vma->vm_end = address; + perf_event_mmap(vma); + } } } vma_unlock_anon_vma(vma); + khugepaged_enter_vma_merge(vma); return error; } #endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */ @@ -1800,14 +1817,18 @@ static int expand_downwards(struct vm_area_struct *vma, size = vma->vm_end - address; grow = (vma->vm_start - address) >> PAGE_SHIFT; - error = acct_stack_growth(vma, size, grow); - if (!error) { - vma->vm_start = address; - vma->vm_pgoff -= grow; - perf_event_mmap(vma); + error = -ENOMEM; + if (grow <= vma->vm_pgoff) { + error = acct_stack_growth(vma, size, grow); + if (!error) { + vma->vm_start = address; + vma->vm_pgoff -= grow; + perf_event_mmap(vma); + } } } vma_unlock_anon_vma(vma); + khugepaged_enter_vma_merge(vma); return error; } @@ -2462,6 +2483,7 @@ int install_special_mapping(struct mm_struct *mm, unsigned long addr, unsigned long len, unsigned long vm_flags, struct page **pages) { + int ret; struct vm_area_struct *vma; vma = 
kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); @@ -2479,16 +2501,23 @@ int install_special_mapping(struct mm_struct *mm, vma->vm_ops = &special_mapping_vmops; vma->vm_private_data = pages; - if (unlikely(insert_vm_struct(mm, vma))) { - kmem_cache_free(vm_area_cachep, vma); - return -ENOMEM; - } + ret = security_file_mmap(NULL, 0, 0, 0, vma->vm_start, 1); + if (ret) + goto out; + + ret = insert_vm_struct(mm, vma); + if (ret) + goto out; mm->total_vm += len >> PAGE_SHIFT; perf_event_mmap(vma); return 0; + +out: + kmem_cache_free(vm_area_cachep, vma); + return ret; } static DEFINE_MUTEX(mm_all_locks_mutex); diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 438951d366f2..8d032de4088e 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -100,6 +100,26 @@ int __mmu_notifier_clear_flush_young(struct mm_struct *mm, return young; } +int __mmu_notifier_test_young(struct mm_struct *mm, + unsigned long address) +{ + struct mmu_notifier *mn; + struct hlist_node *n; + int young = 0; + + rcu_read_lock(); + hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { + if (mn->ops->test_young) { + young = mn->ops->test_young(mn, mm, address); + if (young) + break; + } + } + rcu_read_unlock(); + + return young; +} + void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, pte_t pte) { diff --git a/mm/mmzone.c b/mm/mmzone.c index e35bfb82c855..f5b7d1760213 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c @@ -87,24 +87,3 @@ int memmap_valid_within(unsigned long pfn, return 1; } #endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */ - -#ifdef CONFIG_SMP -/* Called when a more accurate view of NR_FREE_PAGES is needed */ -unsigned long zone_nr_free_pages(struct zone *zone) -{ - unsigned long nr_free_pages = zone_page_state(zone, NR_FREE_PAGES); - - /* - * While kswapd is awake, it is considered the zone is under some - * memory pressure. 
Under pressure, there is a risk that - * per-cpu-counter-drift will allow the min watermark to be breached - * potentially causing a live-lock. While kswapd is awake and - * free pages are low, get a better estimate for free pages - */ - if (nr_free_pages < zone->percpu_drift_mark && - !waitqueue_active(&zone->zone_pgdat->kswapd_wait)) - return zone_page_state_snapshot(zone, NR_FREE_PAGES); - - return nr_free_pages; -} -#endif /* CONFIG_SMP */ diff --git a/mm/mprotect.c b/mm/mprotect.c index 4c5133873097..5a688a2756be 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -78,7 +78,7 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, pte_unmap_unlock(pte - 1, ptl); } -static inline void change_pmd_range(struct mm_struct *mm, pud_t *pud, +static inline void change_pmd_range(struct vm_area_struct *vma, pud_t *pud, unsigned long addr, unsigned long end, pgprot_t newprot, int dirty_accountable) { @@ -88,13 +88,21 @@ static inline void change_pmd_range(struct mm_struct *mm, pud_t *pud, pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); + if (pmd_trans_huge(*pmd)) { + if (next - addr != HPAGE_PMD_SIZE) + split_huge_page_pmd(vma->vm_mm, pmd); + else if (change_huge_pmd(vma, pmd, addr, newprot)) + continue; + /* fall through */ + } if (pmd_none_or_clear_bad(pmd)) continue; - change_pte_range(mm, pmd, addr, next, newprot, dirty_accountable); + change_pte_range(vma->vm_mm, pmd, addr, next, newprot, + dirty_accountable); } while (pmd++, addr = next, addr != end); } -static inline void change_pud_range(struct mm_struct *mm, pgd_t *pgd, +static inline void change_pud_range(struct vm_area_struct *vma, pgd_t *pgd, unsigned long addr, unsigned long end, pgprot_t newprot, int dirty_accountable) { @@ -106,7 +114,8 @@ static inline void change_pud_range(struct mm_struct *mm, pgd_t *pgd, next = pud_addr_end(addr, end); if (pud_none_or_clear_bad(pud)) continue; - change_pmd_range(mm, pud, addr, next, newprot, dirty_accountable); + change_pmd_range(vma, pud, 
addr, next, newprot, + dirty_accountable); } while (pud++, addr = next, addr != end); } @@ -126,7 +135,8 @@ static void change_protection(struct vm_area_struct *vma, next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) continue; - change_pud_range(mm, pgd, addr, next, newprot, dirty_accountable); + change_pud_range(vma, pgd, addr, next, newprot, + dirty_accountable); } while (pgd++, addr = next, addr != end); flush_tlb_range(vma, start, end); } diff --git a/mm/mremap.c b/mm/mremap.c index 563fbdd6293a..a7c1f9f9b941 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -41,13 +41,15 @@ static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr) return NULL; pmd = pmd_offset(pud, addr); + split_huge_page_pmd(mm, pmd); if (pmd_none_or_clear_bad(pmd)) return NULL; return pmd; } -static pmd_t *alloc_new_pmd(struct mm_struct *mm, unsigned long addr) +static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr) { pgd_t *pgd; pud_t *pud; @@ -62,7 +64,8 @@ static pmd_t *alloc_new_pmd(struct mm_struct *mm, unsigned long addr) if (!pmd) return NULL; - if (!pmd_present(*pmd) && __pte_alloc(mm, pmd, addr)) + VM_BUG_ON(pmd_trans_huge(*pmd)); + if (pmd_none(*pmd) && __pte_alloc(mm, vma, pmd, addr)) return NULL; return pmd; @@ -91,9 +94,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, */ mapping = vma->vm_file->f_mapping; spin_lock(&mapping->i_mmap_lock); - if (new_vma->vm_truncate_count && - new_vma->vm_truncate_count != vma->vm_truncate_count) - new_vma->vm_truncate_count = 0; + new_vma->vm_truncate_count = 0; } /* @@ -147,7 +148,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma, old_pmd = get_old_pmd(vma->vm_mm, old_addr); if (!old_pmd) continue; - new_pmd = alloc_new_pmd(vma->vm_mm, new_addr); + new_pmd = alloc_new_pmd(vma->vm_mm, vma, new_addr); if (!new_pmd) break; next = (new_addr + PMD_SIZE) & PMD_MASK; @@ -276,9 +277,16 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr, if 
(old_len > vma->vm_end - addr) goto Efault; - if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)) { - if (new_len > old_len) + /* Need to be careful about a growing mapping */ + if (new_len > old_len) { + unsigned long pgoff; + + if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)) goto Efault; + pgoff = (addr - vma->vm_start) >> PAGE_SHIFT; + pgoff += vma->vm_pgoff; + if (pgoff + (new_len >> PAGE_SHIFT) < pgoff) + goto Einval; } if (vma->vm_flags & VM_LOCKED) { diff --git a/mm/nobootmem.c b/mm/nobootmem.c new file mode 100644 index 000000000000..9109049f0bbc --- /dev/null +++ b/mm/nobootmem.c @@ -0,0 +1,427 @@ +/* + * bootmem - A boot-time physical memory allocator and configurator + * + * Copyright (C) 1999 Ingo Molnar + * 1999 Kanoj Sarcar, SGI + * 2008 Johannes Weiner + * + * Access to this subsystem has to be serialized externally (which is true + * for the boot process anyway). + */ +#include <linux/init.h> +#include <linux/pfn.h> +#include <linux/slab.h> +#include <linux/bootmem.h> +#include <linux/module.h> +#include <linux/kmemleak.h> +#include <linux/range.h> +#include <linux/memblock.h> + +#include <asm/bug.h> +#include <asm/io.h> +#include <asm/processor.h> + +#include "internal.h" + +#ifndef CONFIG_NEED_MULTIPLE_NODES +struct pglist_data __refdata contig_page_data; +EXPORT_SYMBOL(contig_page_data); +#endif + +unsigned long max_low_pfn; +unsigned long min_low_pfn; +unsigned long max_pfn; + +static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align, + u64 goal, u64 limit) +{ + void *ptr; + u64 addr; + + if (limit > memblock.current_limit) + limit = memblock.current_limit; + + addr = find_memory_core_early(nid, size, align, goal, limit); + + if (addr == MEMBLOCK_ERROR) + return NULL; + + ptr = phys_to_virt(addr); + memset(ptr, 0, size); + memblock_x86_reserve_range(addr, addr + size, "BOOTMEM"); + /* + * The min_count is set to 0 so that bootmem allocated blocks + * are never reported as leaks. 
+ */ + kmemleak_alloc(ptr, size, 0, 0); + return ptr; +} + +/* + * free_bootmem_late - free bootmem pages directly to page allocator + * @addr: starting address of the range + * @size: size of the range in bytes + * + * This is only useful when the bootmem allocator has already been torn + * down, but we are still initializing the system. Pages are given directly + * to the page allocator, no bootmem metadata is updated because it is gone. + */ +void __init free_bootmem_late(unsigned long addr, unsigned long size) +{ + unsigned long cursor, end; + + kmemleak_free_part(__va(addr), size); + + cursor = PFN_UP(addr); + end = PFN_DOWN(addr + size); + + for (; cursor < end; cursor++) { + __free_pages_bootmem(pfn_to_page(cursor), 0); + totalram_pages++; + } +} + +static void __init __free_pages_memory(unsigned long start, unsigned long end) +{ + int i; + unsigned long start_aligned, end_aligned; + int order = ilog2(BITS_PER_LONG); + + start_aligned = (start + (BITS_PER_LONG - 1)) & ~(BITS_PER_LONG - 1); + end_aligned = end & ~(BITS_PER_LONG - 1); + + if (end_aligned <= start_aligned) { + for (i = start; i < end; i++) + __free_pages_bootmem(pfn_to_page(i), 0); + + return; + } + + for (i = start; i < start_aligned; i++) + __free_pages_bootmem(pfn_to_page(i), 0); + + for (i = start_aligned; i < end_aligned; i += BITS_PER_LONG) + __free_pages_bootmem(pfn_to_page(i), order); + + for (i = end_aligned; i < end; i++) + __free_pages_bootmem(pfn_to_page(i), 0); +} + +unsigned long __init free_all_memory_core_early(int nodeid) +{ + int i; + u64 start, end; + unsigned long count = 0; + struct range *range = NULL; + int nr_range; + + nr_range = get_free_all_memory_range(&range, nodeid); + + for (i = 0; i < nr_range; i++) { + start = range[i].start; + end = range[i].end; + count += end - start; + __free_pages_memory(start, end); + } + + return count; +} + +/** + * free_all_bootmem_node - release a node's free pages to the buddy allocator + * @pgdat: node to be released + * + * Returns 
the number of pages actually released. + */ +unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) +{ + register_page_bootmem_info_node(pgdat); + + /* free_all_memory_core_early(MAX_NUMNODES) will be called later */ + return 0; +} + +/** + * free_all_bootmem - release free pages to the buddy allocator + * + * Returns the number of pages actually released. + */ +unsigned long __init free_all_bootmem(void) +{ + /* + * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id + * because in some case like Node0 doesn't have RAM installed + * low ram will be on Node1 + * Use MAX_NUMNODES will make sure all ranges in early_node_map[] + * will be used instead of only Node0 related + */ + return free_all_memory_core_early(MAX_NUMNODES); +} + +/** + * free_bootmem_node - mark a page range as usable + * @pgdat: node the range resides on + * @physaddr: starting address of the range + * @size: size of the range in bytes + * + * Partial pages will be considered reserved and left as they are. + * + * The range must reside completely on the specified node. + */ +void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, + unsigned long size) +{ + kmemleak_free_part(__va(physaddr), size); + memblock_x86_free_range(physaddr, physaddr + size); +} + +/** + * free_bootmem - mark a page range as usable + * @addr: starting address of the range + * @size: size of the range in bytes + * + * Partial pages will be considered reserved and left as they are. + * + * The range must be contiguous but may span node boundaries. 
+ */ +void __init free_bootmem(unsigned long addr, unsigned long size) +{ + kmemleak_free_part(__va(addr), size); + memblock_x86_free_range(addr, addr + size); +} + +static void * __init ___alloc_bootmem_nopanic(unsigned long size, + unsigned long align, + unsigned long goal, + unsigned long limit) +{ + void *ptr; + + if (WARN_ON_ONCE(slab_is_available())) + return kzalloc(size, GFP_NOWAIT); + +restart: + + ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, goal, limit); + + if (ptr) + return ptr; + + if (goal != 0) { + goal = 0; + goto restart; + } + + return NULL; +} + +/** + * __alloc_bootmem_nopanic - allocate boot memory without panicking + * @size: size of the request in bytes + * @align: alignment of the region + * @goal: preferred starting address of the region + * + * The goal is dropped if it can not be satisfied and the allocation will + * fall back to memory below @goal. + * + * Allocation may happen on any node in the system. + * + * Returns NULL on failure. + */ +void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align, + unsigned long goal) +{ + unsigned long limit = -1UL; + + return ___alloc_bootmem_nopanic(size, align, goal, limit); +} + +static void * __init ___alloc_bootmem(unsigned long size, unsigned long align, + unsigned long goal, unsigned long limit) +{ + void *mem = ___alloc_bootmem_nopanic(size, align, goal, limit); + + if (mem) + return mem; + /* + * Whoops, we cannot satisfy the allocation request. + */ + printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size); + panic("Out of memory"); + return NULL; +} + +/** + * __alloc_bootmem - allocate boot memory + * @size: size of the request in bytes + * @align: alignment of the region + * @goal: preferred starting address of the region + * + * The goal is dropped if it can not be satisfied and the allocation will + * fall back to memory below @goal. + * + * Allocation may happen on any node in the system. 
+ * + * The function panics if the request can not be satisfied. + */ +void * __init __alloc_bootmem(unsigned long size, unsigned long align, + unsigned long goal) +{ + unsigned long limit = -1UL; + + return ___alloc_bootmem(size, align, goal, limit); +} + +/** + * __alloc_bootmem_node - allocate boot memory from a specific node + * @pgdat: node to allocate from + * @size: size of the request in bytes + * @align: alignment of the region + * @goal: preferred starting address of the region + * + * The goal is dropped if it can not be satisfied and the allocation will + * fall back to memory below @goal. + * + * Allocation may fall back to any node in the system if the specified node + * can not hold the requested memory. + * + * The function panics if the request can not be satisfied. + */ +void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, + unsigned long align, unsigned long goal) +{ + void *ptr; + + if (WARN_ON_ONCE(slab_is_available())) + return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); + + ptr = __alloc_memory_core_early(pgdat->node_id, size, align, + goal, -1ULL); + if (ptr) + return ptr; + + return __alloc_memory_core_early(MAX_NUMNODES, size, align, + goal, -1ULL); +} + +void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, + unsigned long align, unsigned long goal) +{ +#ifdef MAX_DMA32_PFN + unsigned long end_pfn; + + if (WARN_ON_ONCE(slab_is_available())) + return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); + + /* update goal according ...MAX_DMA32_PFN */ + end_pfn = pgdat->node_start_pfn + pgdat->node_spanned_pages; + + if (end_pfn > MAX_DMA32_PFN + (128 >> (20 - PAGE_SHIFT)) && + (goal >> PAGE_SHIFT) < MAX_DMA32_PFN) { + void *ptr; + unsigned long new_goal; + + new_goal = MAX_DMA32_PFN << PAGE_SHIFT; + ptr = __alloc_memory_core_early(pgdat->node_id, size, align, + new_goal, -1ULL); + if (ptr) + return ptr; + } +#endif + + return __alloc_bootmem_node(pgdat, size, align, goal); + +} + +#ifdef 
CONFIG_SPARSEMEM +/** + * alloc_bootmem_section - allocate boot memory from a specific section + * @size: size of the request in bytes + * @section_nr: sparse map section to allocate from + * + * Return NULL on failure. + */ +void * __init alloc_bootmem_section(unsigned long size, + unsigned long section_nr) +{ + unsigned long pfn, goal, limit; + + pfn = section_nr_to_pfn(section_nr); + goal = pfn << PAGE_SHIFT; + limit = section_nr_to_pfn(section_nr + 1) << PAGE_SHIFT; + + return __alloc_memory_core_early(early_pfn_to_nid(pfn), size, + SMP_CACHE_BYTES, goal, limit); +} +#endif + +void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size, + unsigned long align, unsigned long goal) +{ + void *ptr; + + if (WARN_ON_ONCE(slab_is_available())) + return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); + + ptr = __alloc_memory_core_early(pgdat->node_id, size, align, + goal, -1ULL); + if (ptr) + return ptr; + + return __alloc_bootmem_nopanic(size, align, goal); +} + +#ifndef ARCH_LOW_ADDRESS_LIMIT +#define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL +#endif + +/** + * __alloc_bootmem_low - allocate low boot memory + * @size: size of the request in bytes + * @align: alignment of the region + * @goal: preferred starting address of the region + * + * The goal is dropped if it can not be satisfied and the allocation will + * fall back to memory below @goal. + * + * Allocation may happen on any node in the system. + * + * The function panics if the request can not be satisfied. 
+ */ +void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, + unsigned long goal) +{ + return ___alloc_bootmem(size, align, goal, ARCH_LOW_ADDRESS_LIMIT); +} + +/** + * __alloc_bootmem_low_node - allocate low boot memory from a specific node + * @pgdat: node to allocate from + * @size: size of the request in bytes + * @align: alignment of the region + * @goal: preferred starting address of the region + * + * The goal is dropped if it can not be satisfied and the allocation will + * fall back to memory below @goal. + * + * Allocation may fall back to any node in the system if the specified node + * can not hold the requested memory. + * + * The function panics if the request can not be satisfied. + */ +void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, + unsigned long align, unsigned long goal) +{ + void *ptr; + + if (WARN_ON_ONCE(slab_is_available())) + return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); + + ptr = __alloc_memory_core_early(pgdat->node_id, size, align, + goal, ARCH_LOW_ADDRESS_LIMIT); + if (ptr) + return ptr; + + return __alloc_memory_core_early(MAX_NUMNODES, size, align, + goal, ARCH_LOW_ADDRESS_LIMIT); +} diff --git a/mm/nommu.c b/mm/nommu.c index 27a9ac588516..c4c542c736a9 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -10,7 +10,7 @@ * Copyright (c) 2000-2003 David McCullough <davidm@snapgear.com> * Copyright (c) 2000-2001 D Jeff Dionne <jeff@uClinux.org> * Copyright (c) 2002 Greg Ungerer <gerg@snapgear.com> - * Copyright (c) 2007-2009 Paul Mundt <lethal@linux-sh.org> + * Copyright (c) 2007-2010 Paul Mundt <lethal@linux-sh.org> */ #include <linux/module.h> @@ -127,7 +127,8 @@ unsigned int kobjsize(const void *objp) int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, int nr_pages, unsigned int foll_flags, - struct page **pages, struct vm_area_struct **vmas) + struct page **pages, struct vm_area_struct **vmas, + int *retry) { struct vm_area_struct *vma; unsigned 
long vm_flags; @@ -185,7 +186,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, if (force) flags |= FOLL_FORCE; - return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas); + return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas, + NULL); } EXPORT_SYMBOL(get_user_pages); @@ -328,6 +330,7 @@ void *vmalloc_node(unsigned long size, int node) { return vmalloc(size); } +EXPORT_SYMBOL(vmalloc_node); /** * vzalloc_node - allocate memory on a specific node with zero fill @@ -440,6 +443,31 @@ void __attribute__((weak)) vmalloc_sync_all(void) { } +/** + * alloc_vm_area - allocate a range of kernel address space + * @size: size of the area + * + * Returns: NULL on failure, vm_struct on success + * + * This function reserves a range of kernel address space, and + * allocates pagetables to map that range. No actual mappings + * are created. If the kernel address space is not shared + * between processes, it syncs the pagetable across all + * processes. + */ +struct vm_struct *alloc_vm_area(size_t size) +{ + BUG(); + return NULL; +} +EXPORT_SYMBOL_GPL(alloc_vm_area); + +void free_vm_area(struct vm_struct *area) +{ + BUG(); +} +EXPORT_SYMBOL_GPL(free_vm_area); + int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, struct page *page) { @@ -1814,10 +1842,6 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, } EXPORT_SYMBOL(remap_vmalloc_range); -void swap_unplug_io_fn(struct backing_dev_info *bdi, struct page *page) -{ -} - unsigned long arch_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { @@ -1935,7 +1959,7 @@ error: return -ENOMEM; } -int in_gate_area_no_task(unsigned long addr) +int in_gate_area_no_mm(unsigned long addr) { return 0; } @@ -1947,21 +1971,10 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) } EXPORT_SYMBOL(filemap_fault); -/* - * Access another process' address space. 
- * - source/target buffer must be kernel space - */ -int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write) +static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, + unsigned long addr, void *buf, int len, int write) { struct vm_area_struct *vma; - struct mm_struct *mm; - - if (addr + len < addr) - return 0; - - mm = get_task_mm(tsk); - if (!mm) - return 0; down_read(&mm->mmap_sem); @@ -1986,6 +1999,43 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in } up_read(&mm->mmap_sem); + + return len; +} + +/** + * @access_remote_vm - access another process' address space + * @mm: the mm_struct of the target address space + * @addr: start address to access + * @buf: source or destination buffer + * @len: number of bytes to transfer + * @write: whether the access is a write + * + * The caller must hold a reference on @mm. + */ +int access_remote_vm(struct mm_struct *mm, unsigned long addr, + void *buf, int len, int write) +{ + return __access_remote_vm(NULL, mm, addr, buf, len, write); +} + +/* + * Access another process' address space. 
+ * - source/target buffer must be kernel space + */ +int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write) +{ + struct mm_struct *mm; + + if (addr + len < addr) + return 0; + + mm = get_task_mm(tsk); + if (!mm) + return 0; + + len = __access_remote_vm(tsk, mm, addr, buf, len, write); + mmput(mm); return len; } diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 7dcca55ede7c..f52e85c80e8d 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -31,6 +31,7 @@ #include <linux/memcontrol.h> #include <linux/mempolicy.h> #include <linux/security.h> +#include <linux/ptrace.h> int sysctl_panic_on_oom; int sysctl_oom_kill_allocating_task; @@ -83,24 +84,6 @@ static bool has_intersects_mems_allowed(struct task_struct *tsk, #endif /* CONFIG_NUMA */ /* - * If this is a system OOM (not a memcg OOM) and the task selected to be - * killed is not already running at high (RT) priorities, speed up the - * recovery by boosting the dying task to the lowest FIFO priority. - * That helps with the recovery and avoids interfering with RT tasks. - */ -static void boost_dying_task_prio(struct task_struct *p, - struct mem_cgroup *mem) -{ - struct sched_param param = { .sched_priority = 1 }; - - if (mem) - return; - - if (!rt_task(p)) - sched_setscheduler_nocheck(p, SCHED_FIFO, &param); -} - -/* * The process p may have detached its own ->mm while exiting or through * use_mm(), but one or more of its subthreads may still have a valid * pointer. Return p, or any of its subthreads with a valid ->mm, with @@ -189,10 +172,13 @@ unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem, /* * The baseline for the badness score is the proportion of RAM that each - * task's rss and swap space use. + * task's rss, pagetable and swap space use.
*/ - points = (get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS)) * 1000 / - totalpages; + points = get_mm_rss(p->mm) + p->mm->nr_ptes; + points += get_mm_counter(p->mm, MM_SWAPENTS); + + points *= 1000; + points /= totalpages; task_unlock(p); /* @@ -292,13 +278,15 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, unsigned long totalpages, struct mem_cgroup *mem, const nodemask_t *nodemask) { - struct task_struct *p; + struct task_struct *g, *p; struct task_struct *chosen = NULL; *ppoints = 0; - for_each_process(p) { + do_each_thread(g, p) { unsigned int points; + if (!p->mm) + continue; if (oom_unkillable_task(p, mem, nodemask)) continue; @@ -314,22 +302,29 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, if (test_tsk_thread_flag(p, TIF_MEMDIE)) return ERR_PTR(-1UL); - /* - * This is in the process of releasing memory so wait for it - * to finish before killing some other task by mistake. - * - * However, if p is the current task, we allow the 'kill' to - * go ahead if it is exiting: this will simply set TIF_MEMDIE, - * which will allow it to gain access to memory reserves in - * the process of exiting and releasing its resources. - * Otherwise we could get an easy OOM deadlock. - */ - if (thread_group_empty(p) && (p->flags & PF_EXITING) && p->mm) { - if (p != current) - return ERR_PTR(-1UL); - - chosen = p; - *ppoints = 1000; + if (p->flags & PF_EXITING) { + /* + * If p is the current task and is in the process of + * releasing memory, we allow the "kill" to set + * TIF_MEMDIE, which will allow it to gain access to + * memory reserves. Otherwise, it may stall forever. + * + * The loop isn't broken here, however, in case other + * threads are found to have already been oom killed. + */ + if (p == current) { + chosen = p; + *ppoints = 1000; + } else { + /* + * If this task is not being ptraced on exit, + * then wait for it to finish before killing + * some other task unnecessarily. 
+ */ + if (!(task_ptrace(p->group_leader) & + PT_TRACE_EXIT)) + return ERR_PTR(-1UL); + } } points = oom_badness(p, mem, nodemask, totalpages); @@ -337,7 +332,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, chosen = p; *ppoints = points; } - } + } while_each_thread(g, p); return chosen; } @@ -396,7 +391,7 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, task_unlock(current); dump_stack(); mem_cgroup_print_oom_info(mem, p); - show_mem(); + show_mem(SHOW_MEM_FILTER_NODES); if (sysctl_oom_dump_tasks) dump_tasks(mem, nodemask); } @@ -442,13 +437,6 @@ static int oom_kill_task(struct task_struct *p, struct mem_cgroup *mem) set_tsk_thread_flag(p, TIF_MEMDIE); force_sig(SIGKILL, p); - /* - * We give our sacrificial lamb high priority and access to - * all the memory it needs. That way it should be able to - * exit() and clear out its resources quickly... - */ - boost_dying_task_prio(p, mem); - return 0; } #undef K @@ -472,7 +460,6 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, */ if (p->flags & PF_EXITING) { set_tsk_thread_flag(p, TIF_MEMDIE); - boost_dying_task_prio(p, mem); return 0; } @@ -491,6 +478,8 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, list_for_each_entry(child, &t->children, sibling) { unsigned int child_points; + if (child->mm == p->mm) + continue; /* * oom_badness() returns 0 if the thread is unkillable */ @@ -537,6 +526,16 @@ void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask) unsigned int points = 0; struct task_struct *p; + /* + * If current has a pending SIGKILL, then automatically select it. The + * goal is to allow it to allocate so that it may quickly exit and free + * its memory. 
+ */ + if (fatal_signal_pending(current)) { + set_thread_flag(TIF_MEMDIE); + return; + } + check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, 0, NULL); limit = mem_cgroup_get_limit(mem) >> PAGE_SHIFT; read_lock(&tasklist_lock); @@ -689,7 +688,6 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, */ if (fatal_signal_pending(current)) { set_thread_flag(TIF_MEMDIE); - boost_dying_task_prio(current, NULL); return; } diff --git a/mm/page-writeback.c b/mm/page-writeback.c index b840afa89761..31f698862420 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -404,15 +404,18 @@ unsigned long determine_dirtyable_memory(void) * - vm.dirty_background_ratio or vm.dirty_background_bytes * - vm.dirty_ratio or vm.dirty_bytes * The dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and - * runtime tasks. + * real-time tasks. */ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty) { unsigned long background; unsigned long dirty; - unsigned long available_memory = determine_dirtyable_memory(); + unsigned long uninitialized_var(available_memory); struct task_struct *tsk; + if (!vm_dirty_bytes || !dirty_background_bytes) + available_memory = determine_dirtyable_memory(); + if (vm_dirty_bytes) dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE); else @@ -563,7 +566,7 @@ static void balance_dirty_pages(struct address_space *mapping, break; /* We've done our duty */ } trace_wbc_balance_dirty_wait(&wbc, bdi); - __set_current_state(TASK_INTERRUPTIBLE); + __set_current_state(TASK_UNINTERRUPTIBLE); io_schedule_timeout(pause); /* @@ -924,7 +927,7 @@ retry: break; } - done_index = page->index + 1; + done_index = page->index; lock_page(page); @@ -974,6 +977,7 @@ continue_unlock: * not be suitable for data integrity * writeout). 
*/ + done_index = page->index + 1; done = 1; break; } @@ -1036,11 +1040,17 @@ static int __writepage(struct page *page, struct writeback_control *wbc, int generic_writepages(struct address_space *mapping, struct writeback_control *wbc) { + struct blk_plug plug; + int ret; + /* deal with chardevs and other special file */ if (!mapping->a_ops->writepage) return 0; - return write_cache_pages(mapping, wbc, __writepage, mapping); + blk_start_plug(&plug); + ret = write_cache_pages(mapping, wbc, __writepage, mapping); + blk_finish_plug(&plug); + return ret; } EXPORT_SYMBOL(generic_writepages); @@ -1103,7 +1113,7 @@ EXPORT_SYMBOL(write_one_page); int __set_page_dirty_no_writeback(struct page *page) { if (!PageDirty(page)) - SetPageDirty(page); + return !TestSetPageDirty(page); return 0; } @@ -1208,6 +1218,17 @@ int set_page_dirty(struct page *page) if (likely(mapping)) { int (*spd)(struct page *) = mapping->a_ops->set_page_dirty; + /* + * readahead/lru_deactivate_page could remain + * PG_readahead/PG_reclaim due to race with end_page_writeback + * About readahead, if the page is written, the flags would be + * reset. So no problem. + * About lru_deactivate_page, if the page is redirty, the flag + * will be reset. So no problem. but if the page is used by readahead + * it will confuse readahead and make it restart the size rampup + * process. But it's a trivial problem. + */ + ClearPageReclaim(page); #ifdef CONFIG_BLOCK if (!spd) spd = __set_page_dirty_buffers; @@ -1236,7 +1257,7 @@ int set_page_dirty_lock(struct page *page) { int ret; - lock_page_nosync(page); + lock_page(page); ret = set_page_dirty(page); unlock_page(page); return ret; @@ -1263,7 +1284,6 @@ int clear_page_dirty_for_io(struct page *page) BUG_ON(!PageLocked(page)); - ClearPageReclaim(page); if (mapping && mapping_cap_account_dirty(mapping)) { /* * Yes, Virginia, this is indeed insane. 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ff7e15872398..9f8a97b9a350 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -53,6 +53,7 @@ #include <linux/compaction.h> #include <trace/events/kmem.h> #include <linux/ftrace_event.h> +#include <linux/memcontrol.h> #include <asm/tlbflush.h> #include <asm/div64.h> @@ -286,7 +287,7 @@ static void bad_page(struct page *page) /* Don't complain about poisoned pages */ if (PageHWPoison(page)) { - __ClearPageBuddy(page); + reset_page_mapcount(page); /* remove PageBuddy */ return; } @@ -317,7 +318,7 @@ static void bad_page(struct page *page) dump_stack(); out: /* Leave bad fields for debug, except PageBuddy could make trouble */ - __ClearPageBuddy(page); + reset_page_mapcount(page); /* remove PageBuddy */ add_taint(TAINT_BAD_PAGE); } @@ -357,6 +358,7 @@ void prep_compound_page(struct page *page, unsigned long order) } } +/* update __split_huge_page_refcount if you change this function */ static int destroy_compound_page(struct page *page, unsigned long order) { int i; @@ -426,18 +428,10 @@ static inline void rmv_page_order(struct page *page) * * Assumption: *_mem_map is contiguous at least up to MAX_ORDER */ -static inline struct page * -__page_find_buddy(struct page *page, unsigned long page_idx, unsigned int order) -{ - unsigned long buddy_idx = page_idx ^ (1 << order); - - return page + (buddy_idx - page_idx); -} - static inline unsigned long -__find_combined_index(unsigned long page_idx, unsigned int order) +__find_buddy_index(unsigned long page_idx, unsigned int order) { - return (page_idx & ~(1 << order)); + return page_idx ^ (1 << order); } /* @@ -448,8 +442,8 @@ __find_combined_index(unsigned long page_idx, unsigned int order) * (c) a page and its buddy have the same order && * (d) a page and its buddy are in the same zone. * - * For recording whether a page is in the buddy system, we use PG_buddy. - * Setting, clearing, and testing PG_buddy is serialized by zone->lock. 
+ * For recording whether a page is in the buddy system, we set ->_mapcount -2. + * Setting, clearing, and testing _mapcount -2 is serialized by zone->lock. * * For recording page's order, we use page_private(page). */ @@ -482,7 +476,7 @@ static inline int page_is_buddy(struct page *page, struct page *buddy, * as necessary, plus some accounting needed to play nicely with other * parts of the VM system. * At each level, we keep a list of pages, which are heads of continuous - * free pages of length of (1 << order) and marked with PG_buddy. Page's + * free pages of length of (1 << order) and marked with _mapcount -2. Page's * order is recorded in page_private(page) field. * So when we are allocating or freeing one, we can derive the state of the * other. That is, if we allocate a small block, and both were @@ -499,6 +493,7 @@ static inline void __free_one_page(struct page *page, { unsigned long page_idx; unsigned long combined_idx; + unsigned long uninitialized_var(buddy_idx); struct page *buddy; if (unlikely(PageCompound(page))) @@ -513,7 +508,8 @@ static inline void __free_one_page(struct page *page, VM_BUG_ON(bad_range(zone, page)); while (order < MAX_ORDER-1) { - buddy = __page_find_buddy(page, page_idx, order); + buddy_idx = __find_buddy_index(page_idx, order); + buddy = page + (buddy_idx - page_idx); if (!page_is_buddy(page, buddy, order)) break; @@ -521,7 +517,7 @@ static inline void __free_one_page(struct page *page, list_del(&buddy->lru); zone->free_area[order].nr_free--; rmv_page_order(buddy); - combined_idx = __find_combined_index(page_idx, order); + combined_idx = buddy_idx & page_idx; page = page + (combined_idx - page_idx); page_idx = combined_idx; order++; @@ -538,9 +534,10 @@ static inline void __free_one_page(struct page *page, */ if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) { struct page *higher_page, *higher_buddy; - combined_idx = __find_combined_index(page_idx, order); - higher_page = page + combined_idx - page_idx; - 
higher_buddy = __page_find_buddy(higher_page, combined_idx, order + 1); + combined_idx = buddy_idx & page_idx; + higher_page = page + (combined_idx - page_idx); + buddy_idx = __find_buddy_index(combined_idx, order + 1); + higher_buddy = page + (buddy_idx - combined_idx); if (page_is_buddy(higher_page, higher_buddy, order + 1)) { list_add_tail(&page->lru, &zone->free_area[order].free_list[migratetype]); @@ -569,7 +566,8 @@ static inline int free_pages_check(struct page *page) if (unlikely(page_mapcount(page) | (page->mapping != NULL) | (atomic_read(&page->_count) != 0) | - (page->flags & PAGE_FLAGS_CHECK_AT_FREE))) { + (page->flags & PAGE_FLAGS_CHECK_AT_FREE) | + (mem_cgroup_bad_page_check(page)))) { bad_page(page); return 1; } @@ -618,6 +616,10 @@ static void free_pcppages_bulk(struct zone *zone, int count, list = &pcp->lists[migratetype]; } while (list_empty(list)); + /* This is the only non-empty list. Free them all. */ + if (batch_free == MIGRATE_PCPTYPES) + batch_free = to_free; + do { page = list_entry(list->prev, struct page, lru); /* must delete as __free_one_page list manipulates */ @@ -651,13 +653,10 @@ static bool free_pages_prepare(struct page *page, unsigned int order) trace_mm_page_free_direct(page, order); kmemcheck_free_shadow(page, order); - for (i = 0; i < (1 << order); i++) { - struct page *pg = page + i; - - if (PageAnon(pg)) - pg->mapping = NULL; - bad += free_pages_check(pg); - } + if (PageAnon(page)) + page->mapping = NULL; + for (i = 0; i < (1 << order); i++) + bad += free_pages_check(page + i); if (bad) return false; @@ -757,7 +756,8 @@ static inline int check_new_page(struct page *page) if (unlikely(page_mapcount(page) | (page->mapping != NULL) | (atomic_read(&page->_count) != 0) | - (page->flags & PAGE_FLAGS_CHECK_AT_PREP))) { + (page->flags & PAGE_FLAGS_CHECK_AT_PREP) | + (mem_cgroup_bad_page_check(page)))) { bad_page(page); return 1; } @@ -870,9 +870,8 @@ static int move_freepages(struct zone *zone, } order = page_order(page); - 
list_del(&page->lru); - list_add(&page->lru, - &zone->free_area[order].free_list[migratetype]); + list_move(&page->lru, + &zone->free_area[order].free_list[migratetype]); page += 1 << order; pages_moved += 1 << order; } @@ -943,7 +942,7 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) * If breaking a large block of pages, move all free * pages to the preferred allocation list. If falling * back for a reclaimable kernel allocation, be more - * agressive about taking ownership of free pages + * aggressive about taking ownership of free pages */ if (unlikely(current_order >= (pageblock_order >> 1)) || start_migratetype == MIGRATE_RECLAIMABLE || @@ -1095,8 +1094,10 @@ static void drain_pages(unsigned int cpu) pset = per_cpu_ptr(zone->pageset, cpu); pcp = &pset->pcp; - free_pcppages_bulk(zone, pcp->count, pcp); - pcp->count = 0; + if (pcp->count) { + free_pcppages_bulk(zone, pcp->count, pcp); + pcp->count = 0; + } local_irq_restore(flags); } } @@ -1338,7 +1339,7 @@ again: } __count_zone_vm_events(PGALLOC, zone, 1 << order); - zone_statistics(preferred_zone, zone); + zone_statistics(preferred_zone, zone, gfp_flags); local_irq_restore(flags); VM_BUG_ON(bad_range(zone, page)); @@ -1460,24 +1461,24 @@ static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) #endif /* CONFIG_FAIL_PAGE_ALLOC */ /* - * Return 1 if free pages are above 'mark'. This takes into account the order + * Return true if free pages are above 'mark'. This takes into account the order * of the allocation. 
*/ -int zone_watermark_ok(struct zone *z, int order, unsigned long mark, - int classzone_idx, int alloc_flags) +static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, + int classzone_idx, int alloc_flags, long free_pages) { /* free_pages my go negative - that's OK */ long min = mark; - long free_pages = zone_nr_free_pages(z) - (1 << order) + 1; int o; + free_pages -= (1 << order) + 1; if (alloc_flags & ALLOC_HIGH) min -= min / 2; if (alloc_flags & ALLOC_HARDER) min -= min / 4; if (free_pages <= min + z->lowmem_reserve[classzone_idx]) - return 0; + return false; for (o = 0; o < order; o++) { /* At the next order, this order's pages become unavailable */ free_pages -= z->free_area[o].nr_free << o; @@ -1486,9 +1487,28 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark, min >>= 1; if (free_pages <= min) - return 0; + return false; } - return 1; + return true; +} + +bool zone_watermark_ok(struct zone *z, int order, unsigned long mark, + int classzone_idx, int alloc_flags) +{ + return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, + zone_page_state(z, NR_FREE_PAGES)); +} + +bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark, + int classzone_idx, int alloc_flags) +{ + long free_pages = zone_page_state(z, NR_FREE_PAGES); + + if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark) + free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES); + + return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, + free_pages); } #ifdef CONFIG_NUMA @@ -1700,6 +1720,20 @@ try_next_zone: return page; } +/* + * Large machines with many possible nodes should not always dump per-node + * meminfo in irq context. 
+ */ +static inline bool should_suppress_show_mem(void) +{ + bool ret = false; + +#if NODES_SHIFT > 8 + ret = in_interrupt(); +#endif + return ret; +} + static inline int should_alloc_retry(gfp_t gfp_mask, unsigned int order, unsigned long pages_reclaimed) @@ -1793,15 +1827,18 @@ static struct page * __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, - int migratetype, unsigned long *did_some_progress) + int migratetype, unsigned long *did_some_progress, + bool sync_migration) { struct page *page; if (!order || compaction_deferred(preferred_zone)) return NULL; + current->flags |= PF_MEMALLOC; *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, - nodemask); + nodemask, sync_migration); + current->flags &= ~PF_MEMALLOC; if (*did_some_progress != COMPACT_SKIPPED) { /* Page migration frees to the PCP lists but we want merging */ @@ -1837,7 +1874,8 @@ static inline struct page * __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, - int migratetype, unsigned long *did_some_progress) + int migratetype, unsigned long *did_some_progress, + bool sync_migration) { return NULL; } @@ -1852,23 +1890,22 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, { struct page *page = NULL; struct reclaim_state reclaim_state; - struct task_struct *p = current; bool drained = false; cond_resched(); /* We now go into synchronous reclaim */ cpuset_memory_pressure_bump(); - p->flags |= PF_MEMALLOC; + current->flags |= PF_MEMALLOC; lockdep_set_current_reclaim_state(gfp_mask); reclaim_state.reclaimed_slab = 0; - p->reclaim_state = &reclaim_state; + current->reclaim_state = &reclaim_state; *did_some_progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask); - p->reclaim_state = NULL; + 
current->reclaim_state = NULL; lockdep_clear_current_reclaim_state(); - p->flags &= ~PF_MEMALLOC; + current->flags &= ~PF_MEMALLOC; cond_resched(); @@ -1920,19 +1957,19 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, static inline void wake_all_kswapd(unsigned int order, struct zonelist *zonelist, - enum zone_type high_zoneidx) + enum zone_type high_zoneidx, + enum zone_type classzone_idx) { struct zoneref *z; struct zone *zone; for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) - wakeup_kswapd(zone, order); + wakeup_kswapd(zone, order, classzone_idx); } static inline int gfp_to_alloc_flags(gfp_t gfp_mask) { - struct task_struct *p = current; int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET; const gfp_t wait = gfp_mask & __GFP_WAIT; @@ -1948,18 +1985,23 @@ gfp_to_alloc_flags(gfp_t gfp_mask) alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH); if (!wait) { - alloc_flags |= ALLOC_HARDER; + /* + * Not worth trying to allocate harder for + * __GFP_NOMEMALLOC even if it can't schedule. + */ + if (!(gfp_mask & __GFP_NOMEMALLOC)) + alloc_flags |= ALLOC_HARDER; /* * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc. * See also cpuset_zone_allowed() comment in kernel/cpuset.c. 
*/ alloc_flags &= ~ALLOC_CPUSET; - } else if (unlikely(rt_task(p)) && !in_interrupt()) + } else if (unlikely(rt_task(current)) && !in_interrupt()) alloc_flags |= ALLOC_HARDER; if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) { if (!in_interrupt() && - ((p->flags & PF_MEMALLOC) || + ((current->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE)))) alloc_flags |= ALLOC_NO_WATERMARKS; } @@ -1978,7 +2020,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, int alloc_flags; unsigned long pages_reclaimed = 0; unsigned long did_some_progress; - struct task_struct *p = current; + bool sync_migration = false; /* * In the slowpath, we sanity check order to avoid ever trying to @@ -2003,7 +2045,9 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, goto nopage; restart: - wake_all_kswapd(order, zonelist, high_zoneidx); + if (!(gfp_mask & __GFP_NO_KSWAPD)) + wake_all_kswapd(order, zonelist, high_zoneidx, + zone_idx(preferred_zone)); /* * OK, we're below the kswapd watermark and have kicked background @@ -2012,6 +2056,14 @@ restart: */ alloc_flags = gfp_to_alloc_flags(gfp_mask); + /* + * Find the true preferred zone if the allocation is unconstrained by + * cpusets. + */ + if (!(alloc_flags & ALLOC_CPUSET) && !nodemask) + first_zones_zonelist(zonelist, high_zoneidx, NULL, + &preferred_zone); + /* This is the last chance, in general, before the goto nopage. */ page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, @@ -2034,21 +2086,26 @@ rebalance: goto nopage; /* Avoid recursion of direct reclaim */ - if (p->flags & PF_MEMALLOC) + if (current->flags & PF_MEMALLOC) goto nopage; /* Avoid allocations with no watermarks from looping endlessly */ if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL)) goto nopage; - /* Try direct compaction */ + /* + * Try direct compaction. The first pass is asynchronous. 
Subsequent + * attempts after direct reclaim are synchronous + */ page = __alloc_pages_direct_compact(gfp_mask, order, zonelist, high_zoneidx, nodemask, alloc_flags, preferred_zone, - migratetype, &did_some_progress); + migratetype, &did_some_progress, + sync_migration); if (page) goto got_pg; + sync_migration = !(gfp_mask & __GFP_NO_KSWAPD); /* Try direct reclaim and then allocating */ page = __alloc_pages_direct_reclaim(gfp_mask, order, @@ -2102,15 +2159,43 @@ rebalance: /* Wait for some write requests to complete then retry */ wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); goto rebalance; + } else { + /* + * High-order allocations do not necessarily loop after + * direct reclaim and reclaim/compaction depends on compaction + * being called after reclaim so call directly if necessary + */ + page = __alloc_pages_direct_compact(gfp_mask, order, + zonelist, high_zoneidx, + nodemask, + alloc_flags, preferred_zone, + migratetype, &did_some_progress, + sync_migration); + if (page) + goto got_pg; } nopage: if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) { - printk(KERN_WARNING "%s: page allocation failure." - " order:%d, mode:0x%x\n", - p->comm, order, gfp_mask); + unsigned int filter = SHOW_MEM_FILTER_NODES; + + /* + * This documents exceptions given to allocations in certain + * contexts that are allowed to allocate outside current's set + * of allowed nodes. + */ + if (!(gfp_mask & __GFP_NOMEMALLOC)) + if (test_thread_flag(TIF_MEMDIE) || + (current->flags & (PF_MEMALLOC | PF_EXITING))) + filter &= ~SHOW_MEM_FILTER_NODES; + if (in_interrupt() || !wait) + filter &= ~SHOW_MEM_FILTER_NODES; + + pr_warning("%s: page allocation failure. 
order:%d, mode:0x%x\n", + current->comm, order, gfp_mask); dump_stack(); - show_mem(); + if (!should_suppress_show_mem()) + show_mem(filter); } return page; got_pg: @@ -2151,7 +2236,9 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, get_mems_allowed(); /* The preferred zone is used for statistics later */ - first_zones_zonelist(zonelist, high_zoneidx, nodemask, &preferred_zone); + first_zones_zonelist(zonelist, high_zoneidx, + nodemask ? : &cpuset_current_mems_allowed, + &preferred_zone); if (!preferred_zone) { put_mems_allowed(); return NULL; @@ -2358,19 +2445,42 @@ void si_meminfo_node(struct sysinfo *val, int nid) } #endif +/* + * Determine whether the zone's node should be displayed or not, depending on + * whether SHOW_MEM_FILTER_NODES was passed to __show_free_areas(). + */ +static bool skip_free_areas_zone(unsigned int flags, const struct zone *zone) +{ + bool ret = false; + + if (!(flags & SHOW_MEM_FILTER_NODES)) + goto out; + + get_mems_allowed(); + ret = !node_isset(zone->zone_pgdat->node_id, + cpuset_current_mems_allowed); + put_mems_allowed(); +out: + return ret; +} + #define K(x) ((x) << (PAGE_SHIFT-10)) /* * Show free area list (used inside shift_scroll-lock stuff) * We also calculate the percentage fragmentation. We do this by counting the * memory on each free list with the exception of the first item on the list. + * Suppresses nodes that are not allowed by current's cpuset if + * SHOW_MEM_FILTER_NODES is passed. */ -void show_free_areas(void) +void __show_free_areas(unsigned int filter) { int cpu; struct zone *zone; for_each_populated_zone(zone) { + if (skip_free_areas_zone(filter, zone)) + continue; show_node(zone); printk("%s per-cpu:\n", zone->name); @@ -2412,6 +2522,8 @@ void show_free_areas(void) for_each_populated_zone(zone) { int i; + if (skip_free_areas_zone(filter, zone)) + continue; show_node(zone); printk("%s" " free:%lukB" @@ -2442,7 +2554,7 @@ void show_free_areas(void) " all_unreclaimable? 
%s" "\n", zone->name, - K(zone_nr_free_pages(zone)), + K(zone_page_state(zone, NR_FREE_PAGES)), K(min_wmark_pages(zone)), K(low_wmark_pages(zone)), K(high_wmark_pages(zone)), @@ -2479,6 +2591,8 @@ void show_free_areas(void) for_each_populated_zone(zone) { unsigned long nr[MAX_ORDER], flags, order, total = 0; + if (skip_free_areas_zone(filter, zone)) + continue; show_node(zone); printk("%s: ", zone->name); @@ -2498,6 +2612,11 @@ void show_free_areas(void) show_swap_cache_info(); } +void show_free_areas(void) +{ + __show_free_areas(0); +} + static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref) { zoneref->zone = zone; @@ -2585,9 +2704,16 @@ static int __parse_numa_zonelist_order(char *s) static __init int setup_numa_zonelist_order(char *s) { - if (s) - return __parse_numa_zonelist_order(s); - return 0; + int ret; + + if (!s) + return 0; + + ret = __parse_numa_zonelist_order(s); + if (ret == 0) + strlcpy(numa_zonelist_order, s, NUMA_ZONELIST_ORDER_LEN); + + return ret; } early_param("numa_zonelist_order", setup_numa_zonelist_order); @@ -3050,7 +3176,7 @@ static __init_refok int __build_all_zonelists(void *data) * Called with zonelists_mutex held always * unless system_state == SYSTEM_BOOTING. */ -void build_all_zonelists(void *data) +void __ref build_all_zonelists(void *data) { set_zonelist_order(); @@ -3639,13 +3765,45 @@ void __init free_bootmem_with_active_regions(int nid, } #ifdef CONFIG_HAVE_MEMBLOCK +/* + * Basic iterator support. Return the last range of PFNs for a node + * Note: nid == MAX_NUMNODES returns last region regardless of node + */ +static int __meminit last_active_region_index_in_nid(int nid) +{ + int i; + + for (i = nr_nodemap_entries - 1; i >= 0; i--) + if (nid == MAX_NUMNODES || early_node_map[i].nid == nid) + return i; + + return -1; +} + +/* + * Basic iterator support. 
Return the previous active range of PFNs for a node + * Note: nid == MAX_NUMNODES returns next region regardless of node + */ +static int __meminit previous_active_region_index_in_nid(int index, int nid) +{ + for (index = index - 1; index >= 0; index--) + if (nid == MAX_NUMNODES || early_node_map[index].nid == nid) + return index; + + return -1; +} + +#define for_each_active_range_index_in_nid_reverse(i, nid) \ + for (i = last_active_region_index_in_nid(nid); i != -1; \ + i = previous_active_region_index_in_nid(i, nid)) + u64 __init find_memory_core_early(int nid, u64 size, u64 align, u64 goal, u64 limit) { int i; /* Need to go over early_node_map to find out good range for node */ - for_each_active_range_index_in_nid(i, nid) { + for_each_active_range_index_in_nid_reverse(i, nid) { u64 addr; u64 ei_start, ei_last; u64 final_start, final_end; @@ -3688,34 +3846,6 @@ int __init add_from_early_node_map(struct range *range, int az, return nr_range; } -#ifdef CONFIG_NO_BOOTMEM -void * __init __alloc_memory_core_early(int nid, u64 size, u64 align, - u64 goal, u64 limit) -{ - void *ptr; - u64 addr; - - if (limit > memblock.current_limit) - limit = memblock.current_limit; - - addr = find_memory_core_early(nid, size, align, goal, limit); - - if (addr == MEMBLOCK_ERROR) - return NULL; - - ptr = phys_to_virt(addr); - memset(ptr, 0, size); - memblock_x86_reserve_range(addr, addr + size, "BOOTMEM"); - /* - * The min_count is set to 0 so that bootmem allocated blocks - * are never reported as leaks. - */ - kmemleak_alloc(ptr, size, 0, 0); - return ptr; -} -#endif - - void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data) { int i; @@ -3796,7 +3926,7 @@ static void __init find_usable_zone_for_movable(void) /* * The zone ranges provided by the architecture do not include ZONE_MOVABLE - * because it is sized independant of architecture. Unlike the other zones, + * because it is sized independent of architecture. 
Unlike the other zones, * the starting point for ZONE_MOVABLE is not fixed. It may be different * in each node depending on the size of each node and how evenly kernelcore * is distributed. This helper function adjusts the zone ranges @@ -4014,7 +4144,7 @@ static void __init setup_usemap(struct pglist_data *pgdat, zone->pageblock_flags = alloc_bootmem_node(pgdat, usemapsize); } #else -static void inline setup_usemap(struct pglist_data *pgdat, +static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone, unsigned long zonesize) {} #endif /* CONFIG_SPARSEMEM */ @@ -4749,15 +4879,6 @@ void __init set_dma_reserve(unsigned long new_dma_reserve) dma_reserve = new_dma_reserve; } -#ifndef CONFIG_NEED_MULTIPLE_NODES -struct pglist_data __refdata contig_page_data = { -#ifndef CONFIG_NO_BOOTMEM - .bdata = &bootmem_node_data[0] -#endif - }; -EXPORT_SYMBOL(contig_page_data); -#endif - void __init free_area_init(unsigned long *zones_size) { free_area_init_node(0, zones_size, @@ -5316,10 +5437,9 @@ __count_immobile_pages(struct zone *zone, struct page *page, int count) for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) { unsigned long check = pfn + iter; - if (!pfn_valid_within(check)) { - iter++; + if (!pfn_valid_within(check)) continue; - } + page = pfn_to_page(check); if (!page_count(page)) { if (PageBuddy(page)) @@ -5517,7 +5637,6 @@ static struct trace_print_flags pageflag_names[] = { {1UL << PG_swapcache, "swapcache" }, {1UL << PG_mappedtodisk, "mappedtodisk" }, {1UL << PG_reclaim, "reclaim" }, - {1UL << PG_buddy, "buddy" }, {1UL << PG_swapbacked, "swapbacked" }, {1UL << PG_unevictable, "unevictable" }, #ifdef CONFIG_MMU @@ -5565,7 +5684,8 @@ void dump_page(struct page *page) { printk(KERN_ALERT "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n", - page, page_count(page), page_mapcount(page), + page, atomic_read(&page->_count), page_mapcount(page), page->mapping, page->index); dump_page_flags(page->flags); + mem_cgroup_print_bad_page(page); } 
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index 5bffada7cde1..99055010cece 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c @@ -11,12 +11,11 @@ #include <linux/swapops.h> #include <linux/kmemleak.h> -static void __meminit -__init_page_cgroup(struct page_cgroup *pc, unsigned long pfn) +static void __meminit init_page_cgroup(struct page_cgroup *pc, unsigned long id) { pc->flags = 0; + set_page_cgroup_array_id(pc, id); pc->mem_cgroup = NULL; - pc->page = pfn_to_page(pfn); INIT_LIST_HEAD(&pc->lru); } static unsigned long total_usage; @@ -43,6 +42,19 @@ struct page_cgroup *lookup_page_cgroup(struct page *page) return base + offset; } +struct page *lookup_cgroup_page(struct page_cgroup *pc) +{ + unsigned long pfn; + struct page *page; + pg_data_t *pgdat; + + pgdat = NODE_DATA(page_cgroup_array_id(pc)); + pfn = pc - pgdat->node_page_cgroup + pgdat->node_start_pfn; + page = pfn_to_page(pfn); + VM_BUG_ON(pc != lookup_page_cgroup(page)); + return page; +} + static int __init alloc_node_page_cgroup(int nid) { struct page_cgroup *base, *pc; @@ -63,7 +75,7 @@ static int __init alloc_node_page_cgroup(int nid) return -ENOMEM; for (index = 0; index < nr_pages; index++) { pc = base + index; - __init_page_cgroup(pc, start_pfn + index); + init_page_cgroup(pc, nid); } NODE_DATA(nid)->node_page_cgroup = base; total_usage += table_size; @@ -105,46 +117,75 @@ struct page_cgroup *lookup_page_cgroup(struct page *page) return section->page_cgroup + pfn; } -/* __alloc_bootmem...() is protected by !slab_available() */ +struct page *lookup_cgroup_page(struct page_cgroup *pc) +{ + struct mem_section *section; + struct page *page; + unsigned long nr; + + nr = page_cgroup_array_id(pc); + section = __nr_to_section(nr); + page = pfn_to_page(pc - section->page_cgroup); + VM_BUG_ON(pc != lookup_page_cgroup(page)); + return page; +} + +static void *__init_refok alloc_page_cgroup(size_t size, int nid) +{ + void *addr = NULL; + + addr = alloc_pages_exact(size, GFP_KERNEL | __GFP_NOWARN); + if 
(addr) + return addr; + + if (node_state(nid, N_HIGH_MEMORY)) + addr = vmalloc_node(size, nid); + else + addr = vmalloc(size); + + return addr; +} + +#ifdef CONFIG_MEMORY_HOTPLUG +static void free_page_cgroup(void *addr) +{ + if (is_vmalloc_addr(addr)) { + vfree(addr); + } else { + struct page *page = virt_to_page(addr); + size_t table_size = + sizeof(struct page_cgroup) * PAGES_PER_SECTION; + + BUG_ON(PageReserved(page)); + free_pages_exact(addr, table_size); + } +} +#endif + static int __init_refok init_section_page_cgroup(unsigned long pfn) { - struct mem_section *section = __pfn_to_section(pfn); struct page_cgroup *base, *pc; + struct mem_section *section; unsigned long table_size; + unsigned long nr; int nid, index; - if (!section->page_cgroup) { - nid = page_to_nid(pfn_to_page(pfn)); - table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION; - VM_BUG_ON(!slab_is_available()); - if (node_state(nid, N_HIGH_MEMORY)) { - base = kmalloc_node(table_size, - GFP_KERNEL | __GFP_NOWARN, nid); - if (!base) - base = vmalloc_node(table_size, nid); - } else { - base = kmalloc(table_size, GFP_KERNEL | __GFP_NOWARN); - if (!base) - base = vmalloc(table_size); - } - /* - * The value stored in section->page_cgroup is (base - pfn) - * and it does not point to the memory block allocated above, - * causing kmemleak false positives. - */ - kmemleak_not_leak(base); - } else { - /* - * We don't have to allocate page_cgroup again, but - * address of memmap may be changed. So, we have to initialize - * again. - */ - base = section->page_cgroup + pfn; - table_size = 0; - /* check address of memmap is changed or not. 
*/ - if (base->page == pfn_to_page(pfn)) - return 0; - } + nr = pfn_to_section_nr(pfn); + section = __nr_to_section(nr); + + if (section->page_cgroup) + return 0; + + nid = page_to_nid(pfn_to_page(pfn)); + table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION; + base = alloc_page_cgroup(table_size, nid); + + /* + * The value stored in section->page_cgroup is (base - pfn) + * and it does not point to the memory block allocated above, + * causing kmemleak false positives. + */ + kmemleak_not_leak(base); if (!base) { printk(KERN_ERR "page cgroup allocation failure\n"); @@ -153,7 +194,7 @@ static int __init_refok init_section_page_cgroup(unsigned long pfn) for (index = 0; index < PAGES_PER_SECTION; index++) { pc = base + index; - __init_page_cgroup(pc, pfn + index); + init_page_cgroup(pc, nr); } section->page_cgroup = base - pfn; @@ -170,16 +211,8 @@ void __free_page_cgroup(unsigned long pfn) if (!ms || !ms->page_cgroup) return; base = ms->page_cgroup + pfn; - if (is_vmalloc_addr(base)) { - vfree(base); - ms->page_cgroup = NULL; - } else { - struct page *page = virt_to_page(base); - if (!PageReserved(page)) { /* Is bootmem ? */ - kfree(base); - ms->page_cgroup = NULL; - } - } + free_page_cgroup(base); + ms->page_cgroup = NULL; } int __meminit online_page_cgroup(unsigned long start_pfn, @@ -243,12 +276,7 @@ static int __meminit page_cgroup_callback(struct notifier_block *self, break; } - if (ret) - ret = notifier_from_errno(ret); - else - ret = NOTIFY_OK; - - return ret; + return notifier_from_errno(ret); } #endif @@ -349,7 +377,7 @@ not_enough_page: * @new: new id * * Returns old id at success, 0 at failure. 
- * (There is no mem_cgroup useing 0 as its id) + * (There is no mem_cgroup using 0 as its id) */ unsigned short swap_cgroup_cmpxchg(swp_entry_t ent, unsigned short old, unsigned short new) diff --git a/mm/page_io.c b/mm/page_io.c index 2dee975bf469..dc76b4d0611e 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -106,7 +106,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc) goto out; } if (wbc->sync_mode == WB_SYNC_ALL) - rw |= REQ_SYNC | REQ_UNPLUG; + rw |= REQ_SYNC; count_vm_event(PSWPOUT); set_page_writeback(page); unlock_page(page); diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 38cc58b8b2b0..c3450d533611 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -33,18 +33,35 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, pmd = pmd_offset(pud, addr); do { +again: next = pmd_addr_end(addr, end); - if (pmd_none_or_clear_bad(pmd)) { + if (pmd_none(*pmd)) { if (walk->pte_hole) err = walk->pte_hole(addr, next, walk); if (err) break; continue; } + /* + * This implies that each ->pmd_entry() handler + * needs to know about pmd_trans_huge() pmds + */ if (walk->pmd_entry) err = walk->pmd_entry(pmd, addr, next, walk); - if (!err && walk->pte_entry) - err = walk_pte_range(pmd, addr, next, walk); + if (err) + break; + + /* + * Check this here so we only break down trans_huge + * pages when we _need_ to + */ + if (!walk->pte_entry) + continue; + + split_huge_page_pmd(walk->mm, pmd); + if (pmd_none_or_clear_bad(pmd)) + goto again; + err = walk_pte_range(pmd, addr, next, walk); if (err) break; } while (pmd++, addr = next, addr != end); diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c index 7d9c1d0ebd3f..ea534960a04b 100644 --- a/mm/percpu-vm.c +++ b/mm/percpu-vm.c @@ -421,7 +421,7 @@ static struct pcpu_chunk *pcpu_create_chunk(void) return NULL; vms = pcpu_get_vm_areas(pcpu_group_offsets, pcpu_group_sizes, - pcpu_nr_groups, pcpu_atom_size, GFP_KERNEL); + pcpu_nr_groups, pcpu_atom_size); if (!vms) { pcpu_free_chunk(chunk); return 
NULL; diff --git a/mm/percpu.c b/mm/percpu.c index efe816856a9d..a160db39b810 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -258,7 +258,7 @@ static void __maybe_unused pcpu_next_pop(struct pcpu_chunk *chunk, /* * (Un)populated page region iterators. Iterate over (un)populated - * page regions betwen @start and @end in @chunk. @rs and @re should + * page regions between @start and @end in @chunk. @rs and @re should * be integer variables and will be set to start and end page index of * the current region. */ @@ -293,12 +293,8 @@ static void *pcpu_mem_alloc(size_t size) if (size <= PAGE_SIZE) return kzalloc(size, GFP_KERNEL); - else { - void *ptr = vmalloc(size); - if (ptr) - memset(ptr, 0, size); - return ptr; - } + else + return vzalloc(size); } /** @@ -346,7 +342,7 @@ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot) * @chunk: chunk of interest * * Determine whether area map of @chunk needs to be extended to - * accomodate a new allocation. + * accommodate a new allocation. * * CONTEXT: * pcpu_lock. @@ -435,7 +431,7 @@ out_unlock: * depending on @head, is reduced by @tail bytes and @tail byte block * is inserted after the target block. * - * @chunk->map must have enough free slots to accomodate the split. + * @chunk->map must have enough free slots to accommodate the split. * * CONTEXT: * pcpu_lock. 
@@ -1012,8 +1008,7 @@ phys_addr_t per_cpu_ptr_to_phys(void *addr) } if (in_first_chunk) { - if ((unsigned long)addr < VMALLOC_START || - (unsigned long)addr >= VMALLOC_END) + if (!is_vmalloc_addr(addr)) return __pa(addr); else return page_to_phys(vmalloc_to_page(addr)); @@ -1268,7 +1263,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, /* we're done parsing the input, undefine BUG macro and dump config */ #undef PCPU_SETUP_BUG_ON - pcpu_dump_alloc_info(KERN_INFO, ai); + pcpu_dump_alloc_info(KERN_DEBUG, ai); pcpu_nr_groups = ai->nr_groups; pcpu_group_offsets = group_offsets; @@ -1440,7 +1435,7 @@ static struct pcpu_alloc_info * __init pcpu_build_alloc_info( /* * Determine min_unit_size, alloc_size and max_upa such that * alloc_size is multiple of atom_size and is the smallest - * which can accomodate 4k aligned segments which are equal to + * which can accommodate 4k aligned segments which are equal to * or larger than min_unit_size. */ min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE); @@ -1555,7 +1550,7 @@ static struct pcpu_alloc_info * __init pcpu_build_alloc_info( * @atom_size: allocation atom size * @cpu_distance_fn: callback to determine distance between cpus, optional * @alloc_fn: function to allocate percpu page - * @free_fn: funtion to free percpu page + * @free_fn: function to free percpu page * * This is a helper to ease setting up embedded first percpu chunk and * can be called where pcpu_setup_first_chunk() is expected. 
@@ -1683,7 +1678,7 @@ out_free: * pcpu_page_first_chunk - map the first chunk using PAGE_SIZE pages * @reserved_size: the size of reserved percpu area in bytes * @alloc_fn: function to allocate percpu page, always called with PAGE_SIZE - * @free_fn: funtion to free percpu page, always called with PAGE_SIZE + * @free_fn: function to free percpu page, always called with PAGE_SIZE * @populate_pte_fn: function to populate pte * * This is a helper to ease setting up page-remapped first percpu diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c new file mode 100644 index 000000000000..eb663fb533e0 --- /dev/null +++ b/mm/pgtable-generic.c @@ -0,0 +1,121 @@ +/* + * mm/pgtable-generic.c + * + * Generic pgtable methods declared in asm-generic/pgtable.h + * + * Copyright (C) 2010 Linus Torvalds + */ + +#include <linux/pagemap.h> +#include <asm/tlb.h> +#include <asm-generic/pgtable.h> + +#ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS +/* + * Only sets the access flags (dirty, accessed, and + * writable). Furthermore, we know it always gets set to a "more + * permissive" setting, which allows most architectures to optimize + * this. We return whether the PTE actually changed, which in turn + * instructs the caller to do things like update__mmu_cache. 
This + * used to be done in the caller, but sparc needs minor faults to + * force that call on sun4c so we changed this macro slightly + */ +int ptep_set_access_flags(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep, + pte_t entry, int dirty) +{ + int changed = !pte_same(*ptep, entry); + if (changed) { + set_pte_at(vma->vm_mm, address, ptep, entry); + flush_tlb_page(vma, address); + } + return changed; +} +#endif + +#ifndef __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS +int pmdp_set_access_flags(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp, + pmd_t entry, int dirty) +{ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + int changed = !pmd_same(*pmdp, entry); + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + if (changed) { + set_pmd_at(vma->vm_mm, address, pmdp, entry); + flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); + } + return changed; +#else /* CONFIG_TRANSPARENT_HUGEPAGE */ + BUG(); + return 0; +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +} +#endif + +#ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH +int ptep_clear_flush_young(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep) +{ + int young; + young = ptep_test_and_clear_young(vma, address, ptep); + if (young) + flush_tlb_page(vma, address); + return young; +} +#endif + +#ifndef __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH +int pmdp_clear_flush_young(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp) +{ + int young; +#ifndef CONFIG_TRANSPARENT_HUGEPAGE + BUG(); +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + young = pmdp_test_and_clear_young(vma, address, pmdp); + if (young) + flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); + return young; +} +#endif + +#ifndef __HAVE_ARCH_PTEP_CLEAR_FLUSH +pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address, + pte_t *ptep) +{ + pte_t pte; + pte = ptep_get_and_clear((vma)->vm_mm, address, ptep); + flush_tlb_page(vma, address); + return pte; +} +#endif + +#ifndef 
__HAVE_ARCH_PMDP_CLEAR_FLUSH +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +pmd_t pmdp_clear_flush(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmdp) +{ + pmd_t pmd; + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + pmd = pmdp_get_and_clear(vma->vm_mm, address, pmdp); + flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); + return pmd; +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +#endif + +#ifndef __HAVE_ARCH_PMDP_SPLITTING_FLUSH +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +pmd_t pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmdp) +{ + pmd_t pmd = pmd_mksplitting(*pmdp); + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + set_pmd_at(vma->vm_mm, address, pmdp, pmd); + /* tlb flush only to serialize against gup-fast */ + flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +#endif diff --git a/mm/readahead.c b/mm/readahead.c index 77506a291a2d..2c0cc489e288 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -109,9 +109,12 @@ EXPORT_SYMBOL(read_cache_pages); static int read_pages(struct address_space *mapping, struct file *filp, struct list_head *pages, unsigned nr_pages) { + struct blk_plug plug; unsigned page_idx; int ret; + blk_start_plug(&plug); + if (mapping->a_ops->readpages) { ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages); /* Clean up the remaining pages */ @@ -129,7 +132,10 @@ static int read_pages(struct address_space *mapping, struct file *filp, page_cache_release(page); } ret = 0; + out: + blk_finish_plug(&plug); + return ret; } @@ -554,17 +560,5 @@ page_cache_async_readahead(struct address_space *mapping, /* do read-ahead */ ondemand_readahead(mapping, ra, filp, true, offset, req_size); - -#ifdef CONFIG_BLOCK - /* - * Normally the current page is !uptodate and lock_page() will be - * immediately called to implicitly unplug the device. However this - * is not always true for RAID conifgurations, where data arrives - * not strictly in their submission order. 
In this case we need to - * explicitly kick off the IO. - */ - if (PageUptodate(page)) - blk_run_backing_dev(mapping->backing_dev_info, NULL); -#endif } EXPORT_SYMBOL_GPL(page_cache_async_readahead); diff --git a/mm/rmap.c b/mm/rmap.c index 1a8bf76bfd03..8da044a1db0f 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -31,11 +31,12 @@ * swap_lock (in swap_duplicate, swap_info_get) * mmlist_lock (in mmput, drain_mmlist and others) * mapping->private_lock (in __set_page_dirty_buffers) - * inode_lock (in set_page_dirty's __mark_inode_dirty) + * inode->i_lock (in set_page_dirty's __mark_inode_dirty) + * inode_wb_list_lock (in set_page_dirty's __mark_inode_dirty) * sb_lock (within inode_lock in fs/fs-writeback.c) * mapping->tree_lock (widely used, in set_page_dirty, * in arch-dependent flush_dcache_mmap_lock, - * within inode_lock in __sync_single_inode) + * within inode_wb_list_lock in __sync_single_inode) * * (code doesn't rely on that order so it could be switched around) * ->tasklist_lock @@ -67,11 +68,24 @@ static struct kmem_cache *anon_vma_chain_cachep; static inline struct anon_vma *anon_vma_alloc(void) { - return kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL); + struct anon_vma *anon_vma; + + anon_vma = kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL); + if (anon_vma) { + atomic_set(&anon_vma->refcount, 1); + /* + * Initialise the anon_vma root to point to itself. If called + * from fork, the root will be reset to the parents anon_vma. + */ + anon_vma->root = anon_vma; + } + + return anon_vma; } -void anon_vma_free(struct anon_vma *anon_vma) +static inline void anon_vma_free(struct anon_vma *anon_vma) { + VM_BUG_ON(atomic_read(&anon_vma->refcount)); kmem_cache_free(anon_vma_cachep, anon_vma); } @@ -94,7 +108,7 @@ static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain) * anonymous pages mapped into it with that anon_vma. 
* * The common case will be that we already have one, but if - * if not we either need to find an adjacent mapping that we + * not we either need to find an adjacent mapping that we * can re-use the anon_vma from (very common when the only * reason for splitting a vma has been mprotect()), or we * allocate a new one. @@ -133,11 +147,6 @@ int anon_vma_prepare(struct vm_area_struct *vma) if (unlikely(!anon_vma)) goto out_enomem_free_avc; allocated = anon_vma; - /* - * This VMA had no anon_vma yet. This anon_vma is - * the root of any anon_vma tree that might form. - */ - anon_vma->root = anon_vma; } anon_vma_lock(anon_vma); @@ -156,7 +165,7 @@ int anon_vma_prepare(struct vm_area_struct *vma) anon_vma_unlock(anon_vma); if (unlikely(allocated)) - anon_vma_free(allocated); + put_anon_vma(allocated); if (unlikely(avc)) anon_vma_chain_free(avc); } @@ -177,6 +186,10 @@ static void anon_vma_chain_link(struct vm_area_struct *vma, list_add(&avc->same_vma, &vma->anon_vma_chain); anon_vma_lock(anon_vma); + /* + * It's critical to add new vmas to the tail of the anon_vma, + * see comment in huge_memory.c:__split_huge_page(). + */ list_add_tail(&avc->same_anon_vma, &anon_vma->head); anon_vma_unlock(anon_vma); } @@ -237,9 +250,9 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma) */ anon_vma->root = pvma->anon_vma->root; /* - * With KSM refcounts, an anon_vma can stay around longer than the - * process it belongs to. The root anon_vma needs to be pinned - * until this anon_vma is freed, because the lock lives in the root. + * With refcounts, an anon_vma can stay around longer than the + * process it belongs to. The root anon_vma needs to be pinned until + * this anon_vma is freed, because the lock lives in the root. */ get_anon_vma(anon_vma->root); /* Mark this anon_vma as the one where our new (COWed) pages go. 
*/ @@ -249,7 +262,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma) return 0; out_error_free_anon_vma: - anon_vma_free(anon_vma); + put_anon_vma(anon_vma); out_error: unlink_anon_vmas(vma); return -ENOMEM; @@ -268,15 +281,11 @@ static void anon_vma_unlink(struct anon_vma_chain *anon_vma_chain) list_del(&anon_vma_chain->same_anon_vma); /* We must garbage collect the anon_vma if it's empty */ - empty = list_empty(&anon_vma->head) && !anonvma_external_refcount(anon_vma); + empty = list_empty(&anon_vma->head); anon_vma_unlock(anon_vma); - if (empty) { - /* We no longer need the root anon_vma */ - if (anon_vma->root != anon_vma) - drop_anon_vma(anon_vma->root); - anon_vma_free(anon_vma); - } + if (empty) + put_anon_vma(anon_vma); } void unlink_anon_vmas(struct vm_area_struct *vma) @@ -299,7 +308,7 @@ static void anon_vma_ctor(void *data) struct anon_vma *anon_vma = data; spin_lock_init(&anon_vma->lock); - anonvma_external_refcount_init(anon_vma); + atomic_set(&anon_vma->refcount, 0); INIT_LIST_HEAD(&anon_vma->head); } @@ -360,7 +369,7 @@ void page_unlock_anon_vma(struct anon_vma *anon_vma) * Returns virtual address or -EFAULT if page's index/offset is not * within the range mapped the @vma. 
*/ -static inline unsigned long +inline unsigned long vma_address(struct page *page, struct vm_area_struct *vma) { pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); @@ -435,6 +444,8 @@ pte_t *__page_check_address(struct page *page, struct mm_struct *mm, pmd = pmd_offset(pud, address); if (!pmd_present(*pmd)) return NULL; + if (pmd_trans_huge(*pmd)) + return NULL; pte = pte_offset_map(pmd, address); /* Make a quick check before getting the lock */ @@ -489,35 +500,65 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma, unsigned long *vm_flags) { struct mm_struct *mm = vma->vm_mm; - pte_t *pte; - spinlock_t *ptl; int referenced = 0; - pte = page_check_address(page, mm, address, &ptl, 0); - if (!pte) - goto out; - - /* - * Don't want to elevate referenced for mlocked page that gets this far, - * in order that it progresses to try_to_unmap and is moved to the - * unevictable list. - */ - if (vma->vm_flags & VM_LOCKED) { - *mapcount = 1; /* break early from loop */ - *vm_flags |= VM_LOCKED; - goto out_unmap; - } + if (unlikely(PageTransHuge(page))) { + pmd_t *pmd; - if (ptep_clear_flush_young_notify(vma, address, pte)) { + spin_lock(&mm->page_table_lock); /* - * Don't treat a reference through a sequentially read - * mapping as such. If the page has been used in - * another mapping, we will catch it; if this other - * mapping is already gone, the unmap path will have - * set PG_referenced or activated the page. + * rmap might return false positives; we must filter + * these out using page_check_address_pmd(). 
*/ - if (likely(!VM_SequentialReadHint(vma))) + pmd = page_check_address_pmd(page, mm, address, + PAGE_CHECK_ADDRESS_PMD_FLAG); + if (!pmd) { + spin_unlock(&mm->page_table_lock); + goto out; + } + + if (vma->vm_flags & VM_LOCKED) { + spin_unlock(&mm->page_table_lock); + *mapcount = 0; /* break early from loop */ + *vm_flags |= VM_LOCKED; + goto out; + } + + /* go ahead even if the pmd is pmd_trans_splitting() */ + if (pmdp_clear_flush_young_notify(vma, address, pmd)) referenced++; + spin_unlock(&mm->page_table_lock); + } else { + pte_t *pte; + spinlock_t *ptl; + + /* + * rmap might return false positives; we must filter + * these out using page_check_address(). + */ + pte = page_check_address(page, mm, address, &ptl, 0); + if (!pte) + goto out; + + if (vma->vm_flags & VM_LOCKED) { + pte_unmap_unlock(pte, ptl); + *mapcount = 0; /* break early from loop */ + *vm_flags |= VM_LOCKED; + goto out; + } + + if (ptep_clear_flush_young_notify(vma, address, pte)) { + /* + * Don't treat a reference through a sequentially read + * mapping as such. If the page has been used in + * another mapping, we will catch it; if this other + * mapping is already gone, the unmap path will have + * set PG_referenced or activated the page. 
+ */ + if (likely(!VM_SequentialReadHint(vma))) + referenced++; + } + pte_unmap_unlock(pte, ptl); } /* Pretend the page is referenced if the task has the @@ -526,9 +567,7 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma, rwsem_is_locked(&mm->mmap_sem)) referenced++; -out_unmap: (*mapcount)--; - pte_unmap_unlock(pte, ptl); if (referenced) *vm_flags |= vma->vm_flags; @@ -864,8 +903,13 @@ void do_page_add_anon_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address, int exclusive) { int first = atomic_inc_and_test(&page->_mapcount); - if (first) - __inc_zone_page_state(page, NR_ANON_PAGES); + if (first) { + if (!PageTransHuge(page)) + __inc_zone_page_state(page, NR_ANON_PAGES); + else + __inc_zone_page_state(page, + NR_ANON_TRANSPARENT_HUGEPAGES); + } if (unlikely(PageKsm(page))) return; @@ -893,7 +937,10 @@ void page_add_new_anon_rmap(struct page *page, VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); SetPageSwapBacked(page); atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */ - __inc_zone_page_state(page, NR_ANON_PAGES); + if (!PageTransHuge(page)) + __inc_zone_page_state(page, NR_ANON_PAGES); + else + __inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); __page_set_anon_rmap(page, vma, address, 1); if (page_evictable(page, vma)) lru_cache_add_lru(page, LRU_ACTIVE_ANON); @@ -911,7 +958,7 @@ void page_add_file_rmap(struct page *page) { if (atomic_inc_and_test(&page->_mapcount)) { __inc_zone_page_state(page, NR_FILE_MAPPED); - mem_cgroup_update_file_mapped(page, 1); + mem_cgroup_inc_page_stat(page, MEMCG_NR_FILE_MAPPED); } } @@ -946,10 +993,14 @@ void page_remove_rmap(struct page *page) return; if (PageAnon(page)) { mem_cgroup_uncharge_page(page); - __dec_zone_page_state(page, NR_ANON_PAGES); + if (!PageTransHuge(page)) + __dec_zone_page_state(page, NR_ANON_PAGES); + else + __dec_zone_page_state(page, + NR_ANON_TRANSPARENT_HUGEPAGES); } else { __dec_zone_page_state(page, NR_FILE_MAPPED); - 
mem_cgroup_update_file_mapped(page, -1); + mem_cgroup_dec_page_stat(page, MEMCG_NR_FILE_MAPPED); } /* * It would be tidy to reset the PageAnon mapping here, @@ -1202,7 +1253,7 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, return ret; } -static bool is_vma_temporary_stack(struct vm_area_struct *vma) +bool is_vma_temporary_stack(struct vm_area_struct *vma) { int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP); @@ -1400,6 +1451,7 @@ int try_to_unmap(struct page *page, enum ttu_flags flags) int ret; BUG_ON(!PageLocked(page)); + VM_BUG_ON(!PageHuge(page) && PageTransHuge(page)); if (unlikely(PageKsm(page))) ret = try_to_unmap_ksm(page, flags); @@ -1439,41 +1491,15 @@ int try_to_munlock(struct page *page) return try_to_unmap_file(page, TTU_MUNLOCK); } -#if defined(CONFIG_KSM) || defined(CONFIG_MIGRATION) -/* - * Drop an anon_vma refcount, freeing the anon_vma and anon_vma->root - * if necessary. Be careful to do all the tests under the lock. Once - * we know we are the last user, nobody else can get a reference and we - * can do the freeing without the lock. - */ -void drop_anon_vma(struct anon_vma *anon_vma) +void __put_anon_vma(struct anon_vma *anon_vma) { - BUG_ON(atomic_read(&anon_vma->external_refcount) <= 0); - if (atomic_dec_and_lock(&anon_vma->external_refcount, &anon_vma->root->lock)) { - struct anon_vma *root = anon_vma->root; - int empty = list_empty(&anon_vma->head); - int last_root_user = 0; - int root_empty = 0; + struct anon_vma *root = anon_vma->root; - /* - * The refcount on a non-root anon_vma got dropped. Drop - * the refcount on the root and check if we need to free it. 
- */ - if (empty && anon_vma != root) { - BUG_ON(atomic_read(&root->external_refcount) <= 0); - last_root_user = atomic_dec_and_test(&root->external_refcount); - root_empty = list_empty(&root->head); - } - anon_vma_unlock(anon_vma); + if (root != anon_vma && atomic_dec_and_test(&root->refcount)) + anon_vma_free(root); - if (empty) { - anon_vma_free(anon_vma); - if (root_empty && last_root_user) - anon_vma_free(root); - } - } + anon_vma_free(anon_vma); } -#endif #ifdef CONFIG_MIGRATION /* diff --git a/mm/shmem.c b/mm/shmem.c index 47fdeeb9d636..8fa27e4e582a 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -224,7 +224,6 @@ static const struct vm_operations_struct shmem_vm_ops; static struct backing_dev_info shmem_backing_dev_info __read_mostly = { .ra_pages = 0, /* No readahead */ .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED, - .unplug_io_fn = default_unplug_io_fn, }; static LIST_HEAD(shmem_swaplist); @@ -422,7 +421,8 @@ static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long * a waste to allocate index if we cannot allocate data. */ if (sbinfo->max_blocks) { - if (percpu_counter_compare(&sbinfo->used_blocks, (sbinfo->max_blocks - 1)) > 0) + if (percpu_counter_compare(&sbinfo->used_blocks, + sbinfo->max_blocks - 1) >= 0) return ERR_PTR(-ENOSPC); percpu_counter_inc(&sbinfo->used_blocks); spin_lock(&inode->i_lock); @@ -779,7 +779,7 @@ static int shmem_notify_change(struct dentry *dentry, struct iattr *attr) * If truncating down to a partial page, then * if that page is already allocated, hold it * in memory until the truncation is over, so - * truncate_partial_page cannnot miss it were + * truncate_partial_page cannot miss it were * it assigned to swap. 
*/ if (newsize & (PAGE_CACHE_SIZE-1)) { @@ -1081,7 +1081,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) shmem_recalc_inode(inode); if (swap.val && add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) { - remove_from_page_cache(page); + delete_from_page_cache(page); shmem_swp_set(info, entry, swap.val); shmem_swp_unmap(entry); if (list_empty(&info->swaplist)) @@ -1091,7 +1091,6 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) spin_unlock(&info->lock); swap_shmem_alloc(swap); BUG_ON(page_mapped(page)); - page_cache_release(page); /* pagecache ref */ swap_writepage(page, wbc); if (inode) { mutex_lock(&shmem_swaplist_mutex); @@ -1399,7 +1398,8 @@ repeat: shmem_swp_unmap(entry); sbinfo = SHMEM_SB(inode->i_sb); if (sbinfo->max_blocks) { - if ((percpu_counter_compare(&sbinfo->used_blocks, sbinfo->max_blocks) > 0) || + if (percpu_counter_compare(&sbinfo->used_blocks, + sbinfo->max_blocks) >= 0 || shmem_acct_block(info->flags)) { spin_unlock(&info->lock); error = -ENOSPC; @@ -1843,8 +1843,9 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) inode = shmem_get_inode(dir->i_sb, dir, mode, dev, VM_NORESERVE); if (inode) { - error = security_inode_init_security(inode, dir, NULL, NULL, - NULL); + error = security_inode_init_security(inode, dir, + &dentry->d_name, NULL, + NULL, NULL); if (error) { if (error != -EOPNOTSUPP) { iput(inode); @@ -1983,8 +1984,8 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s if (!inode) return -ENOSPC; - error = security_inode_init_security(inode, dir, NULL, NULL, - NULL); + error = security_inode_init_security(inode, dir, &dentry->d_name, NULL, + NULL, NULL); if (error) { if (error != -EOPNOTSUPP) { iput(inode); @@ -2144,8 +2145,10 @@ static int shmem_encode_fh(struct dentry *dentry, __u32 *fh, int *len, { struct inode *inode = dentry->d_inode; - if (*len < 3) + if (*len < 3) { + *len = 3; return 255; + } if 
(inode_unhashed(inode)) { /* Unfortunately insert_inode_hash is not idempotent, @@ -2415,13 +2418,20 @@ static struct inode *shmem_alloc_inode(struct super_block *sb) return &p->vfs_inode; } +static void shmem_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); +} + static void shmem_destroy_inode(struct inode *inode) { if ((inode->i_mode & S_IFMT) == S_IFREG) { /* only struct inode is valid if it's an inline symlink */ mpol_free_shared_policy(&SHMEM_I(inode)->policy); } - kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); + call_rcu(&inode->i_rcu, shmem_i_callback); } static void init_once(void *foo) @@ -2784,5 +2794,6 @@ int shmem_zero_setup(struct vm_area_struct *vma) fput(vma->vm_file); vma->vm_file = file; vma->vm_ops = &shmem_vm_ops; + vma->vm_flags |= VM_CAN_NONLINEAR; return 0; } diff --git a/mm/slab.c b/mm/slab.c index b1e40dafbab3..46a9c163a92f 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -191,22 +191,6 @@ typedef unsigned int kmem_bufctl_t; #define SLAB_LIMIT (((kmem_bufctl_t)(~0U))-3) /* - * struct slab - * - * Manages the objs in a slab. Placed either at the beginning of mem allocated - * for a slab, or allocated from an general cache. - * Slabs are chained into three list: fully used, partial, fully free slabs. - */ -struct slab { - struct list_head list; - unsigned long colouroff; - void *s_mem; /* including colour offset */ - unsigned int inuse; /* num of objs active in slab */ - kmem_bufctl_t free; - unsigned short nodeid; -}; - -/* * struct slab_rcu * * slab_destroy on a SLAB_DESTROY_BY_RCU cache uses this structure to @@ -219,8 +203,6 @@ struct slab { * * rcu_read_lock before reading the address, then rcu_read_unlock after * taking the spinlock within the structure expected at that address. - * - * We assume struct slab_rcu can overlay struct slab when destroying. 
*/ struct slab_rcu { struct rcu_head head; @@ -229,6 +211,27 @@ struct slab_rcu { }; /* + * struct slab + * + * Manages the objs in a slab. Placed either at the beginning of mem allocated + * for a slab, or allocated from an general cache. + * Slabs are chained into three list: fully used, partial, fully free slabs. + */ +struct slab { + union { + struct { + struct list_head list; + unsigned long colouroff; + void *s_mem; /* including colour offset */ + unsigned int inuse; /* num of objs active in slab */ + kmem_bufctl_t free; + unsigned short nodeid; + }; + struct slab_rcu __slab_cover_slab_rcu; + }; +}; + +/* * struct array_cache * * Purpose: @@ -284,7 +287,7 @@ struct kmem_list3 { * Need this for bootstrapping a per node allocator. */ #define NUM_INIT_LISTS (3 * MAX_NUMNODES) -struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS]; +static struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS]; #define CACHE_CACHE 0 #define SIZE_AC MAX_NUMNODES #define SIZE_L3 (2 * MAX_NUMNODES) @@ -829,12 +832,12 @@ static void init_reap_node(int cpu) static void next_reap_node(void) { - int node = __get_cpu_var(slab_reap_node); + int node = __this_cpu_read(slab_reap_node); node = next_node(node, node_online_map); if (unlikely(node >= MAX_NUMNODES)) node = first_node(node_online_map); - __get_cpu_var(slab_reap_node) = node; + __this_cpu_write(slab_reap_node, node); } #else @@ -875,7 +878,7 @@ static struct array_cache *alloc_arraycache(int node, int entries, nc = kmalloc_node(memsize, gfp, node); /* * The array_cache structures contain pointers to free object. - * However, when such objects are allocated or transfered to another + * However, when such objects are allocated or transferred to another * cache the pointers are not cleared and they could be counted as * valid references during a kmemleak scan. Therefore, kmemleak must * not scan such objects. 
@@ -1012,7 +1015,7 @@ static void __drain_alien_cache(struct kmem_cache *cachep, */ static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3) { - int node = __get_cpu_var(slab_reap_node); + int node = __this_cpu_read(slab_reap_node); if (l3->alien) { struct array_cache *ac = l3->alien[node]; @@ -1293,7 +1296,7 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb, * anything expensive but will only modify reap_work * and reschedule the timer. */ - cancel_rearming_delayed_work(&per_cpu(slab_reap_work, cpu)); + cancel_delayed_work_sync(&per_cpu(slab_reap_work, cpu)); /* Now the cache_reaper is guaranteed to be not running. */ per_cpu(slab_reap_work, cpu).work.func = NULL; break; @@ -1387,7 +1390,7 @@ static int __meminit slab_memory_callback(struct notifier_block *self, break; } out: - return ret ? notifier_from_errno(ret) : NOTIFY_OK; + return notifier_from_errno(ret); } #endif /* CONFIG_NUMA && CONFIG_MEMORY_HOTPLUG */ @@ -2147,8 +2150,6 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) * * @name must be valid until the cache is destroyed. This implies that * the module calling this has to destroy the cache before getting unloaded. - * Note that kmem_cache_name() is not guaranteed to return the same pointer, - * therefore applications must manage it themselves. * * The flags are * @@ -2288,8 +2289,8 @@ kmem_cache_create (const char *name, size_t size, size_t align, if (ralign < align) { ralign = align; } - /* disable debug if not aligning with REDZONE_ALIGN */ - if (ralign & (__alignof__(unsigned long long) - 1)) + /* disable debug if necessary */ + if (ralign > __alignof__(unsigned long long)) flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); /* * 4) Store it. 
@@ -2315,8 +2316,8 @@ kmem_cache_create (const char *name, size_t size, size_t align, */ if (flags & SLAB_RED_ZONE) { /* add space for red zone words */ - cachep->obj_offset += align; - size += align + sizeof(unsigned long long); + cachep->obj_offset += sizeof(unsigned long long); + size += 2 * sizeof(unsigned long long); } if (flags & SLAB_STORE_USER) { /* user store requires one word storage behind the end of @@ -2605,7 +2606,7 @@ EXPORT_SYMBOL(kmem_cache_shrink); * * The cache must be empty before calling this function. * - * The caller must guarantee that noone will allocate memory from the cache + * The caller must guarantee that no one will allocate memory from the cache * during the kmem_cache_destroy(). */ void kmem_cache_destroy(struct kmem_cache *cachep) @@ -2781,7 +2782,7 @@ static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp, /* * Map pages beginning at addr to the given cache and slab. This is required * for the slab allocator to be able to lookup the cache and slab of a - * virtual address for kfree, ksize, kmem_ptr_validate, and slab debugging. + * virtual address for kfree, ksize, and slab debugging. */ static void slab_map_pages(struct kmem_cache *cache, struct slab *slab, void *addr) @@ -3653,42 +3654,19 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) EXPORT_SYMBOL(kmem_cache_alloc); #ifdef CONFIG_TRACING -void *kmem_cache_alloc_notrace(struct kmem_cache *cachep, gfp_t flags) +void * +kmem_cache_alloc_trace(size_t size, struct kmem_cache *cachep, gfp_t flags) { - return __cache_alloc(cachep, flags, __builtin_return_address(0)); -} -EXPORT_SYMBOL(kmem_cache_alloc_notrace); -#endif + void *ret; -/** - * kmem_ptr_validate - check if an untrusted pointer might be a slab entry. 
- * @cachep: the cache we're checking against - * @ptr: pointer to validate - * - * This verifies that the untrusted pointer looks sane; - * it is _not_ a guarantee that the pointer is actually - * part of the slab cache in question, but it at least - * validates that the pointer can be dereferenced and - * looks half-way sane. - * - * Currently only used for dentry validation. - */ -int kmem_ptr_validate(struct kmem_cache *cachep, const void *ptr) -{ - unsigned long size = cachep->buffer_size; - struct page *page; + ret = __cache_alloc(cachep, flags, __builtin_return_address(0)); - if (unlikely(!kern_ptr_validate(ptr, size))) - goto out; - page = virt_to_page(ptr); - if (unlikely(!PageSlab(page))) - goto out; - if (unlikely(page_get_cache(page) != cachep)) - goto out; - return 1; -out: - return 0; + trace_kmalloc(_RET_IP_, ret, + size, slab_buffer_size(cachep), flags); + return ret; } +EXPORT_SYMBOL(kmem_cache_alloc_trace); +#endif #ifdef CONFIG_NUMA void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) @@ -3705,31 +3683,32 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) EXPORT_SYMBOL(kmem_cache_alloc_node); #ifdef CONFIG_TRACING -void *kmem_cache_alloc_node_notrace(struct kmem_cache *cachep, - gfp_t flags, - int nodeid) +void *kmem_cache_alloc_node_trace(size_t size, + struct kmem_cache *cachep, + gfp_t flags, + int nodeid) { - return __cache_alloc_node(cachep, flags, nodeid, + void *ret; + + ret = __cache_alloc_node(cachep, flags, nodeid, __builtin_return_address(0)); + trace_kmalloc_node(_RET_IP_, ret, + size, slab_buffer_size(cachep), + flags, nodeid); + return ret; } -EXPORT_SYMBOL(kmem_cache_alloc_node_notrace); +EXPORT_SYMBOL(kmem_cache_alloc_node_trace); #endif static __always_inline void * __do_kmalloc_node(size_t size, gfp_t flags, int node, void *caller) { struct kmem_cache *cachep; - void *ret; cachep = kmem_find_general_cachep(size, flags); if (unlikely(ZERO_OR_NULL_PTR(cachep))) return 
cachep; - ret = kmem_cache_alloc_node_notrace(cachep, flags, node); - - trace_kmalloc_node((unsigned long) caller, ret, - size, cachep->buffer_size, flags, node); - - return ret; + return kmem_cache_alloc_node_trace(size, cachep, flags, node); } #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING) @@ -3862,12 +3841,6 @@ unsigned int kmem_cache_size(struct kmem_cache *cachep) } EXPORT_SYMBOL(kmem_cache_size); -const char *kmem_cache_name(struct kmem_cache *cachep) -{ - return cachep->name; -} -EXPORT_SYMBOL_GPL(kmem_cache_name); - /* * This initializes kmem_list3 or resizes various caches for all nodes. */ @@ -4075,7 +4048,7 @@ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp) * necessary. Note that the l3 listlock also protects the array_cache * if drain_array() is used on the shared array. */ -void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3, +static void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3, struct array_cache *ac, int force, int node) { int tofree; @@ -4339,7 +4312,7 @@ static const struct seq_operations slabinfo_op = { * @count: data length * @ppos: unused */ -ssize_t slabinfo_write(struct file *file, const char __user * buffer, +static ssize_t slabinfo_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) { char kbuf[MAX_SLABINFO_WRITE + 1], *tmp; diff --git a/mm/slob.c b/mm/slob.c index 617b6d6c42c7..46e0aee33a23 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -666,23 +666,12 @@ unsigned int kmem_cache_size(struct kmem_cache *c) } EXPORT_SYMBOL(kmem_cache_size); -const char *kmem_cache_name(struct kmem_cache *c) -{ - return c->name; -} -EXPORT_SYMBOL(kmem_cache_name); - int kmem_cache_shrink(struct kmem_cache *d) { return 0; } EXPORT_SYMBOL(kmem_cache_shrink); -int kmem_ptr_validate(struct kmem_cache *a, const void *b) -{ - return 0; -} - static unsigned int slob_ready __read_mostly; int slab_is_available(void) diff --git a/mm/slub.c b/mm/slub.c index bec0e355fbad..9d2e5e46bf09 
100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -28,6 +28,8 @@ #include <linux/math64.h> #include <linux/fault-inject.h> +#include <trace/events/kmem.h> + /* * Lock order: * 1. slab_lock(page) @@ -62,7 +64,7 @@ * we must stay away from it for a while since we may cause a bouncing * cacheline if we try to acquire the lock. So go onto the next slab. * If all pages are busy then we may allocate a new slab instead of reusing - * a partial slab. A new slab has noone operating on it and thus there is + * a partial slab. A new slab has no one operating on it and thus there is * no danger of cacheline contention. * * Interrupts are disabled during allocation and deallocation in order to @@ -215,7 +217,7 @@ static inline void sysfs_slab_remove(struct kmem_cache *s) #endif -static inline void stat(struct kmem_cache *s, enum stat_item si) +static inline void stat(const struct kmem_cache *s, enum stat_item si) { #ifdef CONFIG_SLUB_STATS __this_cpu_inc(s->cpu_slab->stat[si]); @@ -279,11 +281,40 @@ static inline int slab_index(void *p, struct kmem_cache *s, void *addr) return (p - addr) / s->size; } +static inline size_t slab_ksize(const struct kmem_cache *s) +{ +#ifdef CONFIG_SLUB_DEBUG + /* + * Debugging requires use of the padding between object + * and whatever may come after it. + */ + if (s->flags & (SLAB_RED_ZONE | SLAB_POISON)) + return s->objsize; + +#endif + /* + * If we have the need to store the freelist pointer + * back there or track user information then we can + * only use the space before that information. 
+ */ + if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER)) + return s->inuse; + /* + * Else we can use all the padding etc for the allocation + */ + return s->size; +} + +static inline int order_objects(int order, unsigned long size, int reserved) +{ + return ((PAGE_SIZE << order) - reserved) / size; +} + static inline struct kmem_cache_order_objects oo_make(int order, - unsigned long size) + unsigned long size, int reserved) { struct kmem_cache_order_objects x = { - (order << OO_SHIFT) + (PAGE_SIZE << order) / size + (order << OO_SHIFT) + order_objects(order, size, reserved) }; return x; @@ -615,7 +646,7 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page) return 1; start = page_address(page); - length = (PAGE_SIZE << compound_order(page)); + length = (PAGE_SIZE << compound_order(page)) - s->reserved; end = start + length; remainder = length % s->size; if (!remainder) @@ -696,7 +727,7 @@ static int check_slab(struct kmem_cache *s, struct page *page) return 0; } - maxobj = (PAGE_SIZE << compound_order(page)) / s->size; + maxobj = order_objects(compound_order(page), s->size, s->reserved); if (page->objects > maxobj) { slab_err(s, page, "objects %u > max %u", s->name, page->objects, maxobj); @@ -746,7 +777,7 @@ static int on_freelist(struct kmem_cache *s, struct page *page, void *search) nr++; } - max_objects = (PAGE_SIZE << compound_order(page)) / s->size; + max_objects = order_objects(compound_order(page), s->size, s->reserved); if (max_objects > MAX_OBJS_PER_PAGE) max_objects = MAX_OBJS_PER_PAGE; @@ -798,21 +829,31 @@ static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags) static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, void *object) { flags &= gfp_allowed_mask; - kmemcheck_slab_alloc(s, flags, object, s->objsize); + kmemcheck_slab_alloc(s, flags, object, slab_ksize(s)); kmemleak_alloc_recursive(object, s->objsize, 1, s->flags, flags); } static inline void slab_free_hook(struct kmem_cache *s, void 
*x) { kmemleak_free_recursive(x, s->flags); -} -static inline void slab_free_hook_irq(struct kmem_cache *s, void *object) -{ - kmemcheck_slab_free(s, object, s->objsize); - debug_check_no_locks_freed(object, s->objsize); + /* + * Trouble is that we may no longer disable interrupts in the fast path + * So in order to make the debug calls that expect irqs to be + * disabled we need to disable interrupts temporarily. + */ +#if defined(CONFIG_KMEMCHECK) || defined(CONFIG_LOCKDEP) + { + unsigned long flags; + + local_irq_save(flags); + kmemcheck_slab_free(s, x, s->objsize); + debug_check_no_locks_freed(x, s->objsize); + local_irq_restore(flags); + } +#endif if (!(s->flags & SLAB_DEBUG_OBJECTS)) - debug_check_no_obj_freed(object, s->objsize); + debug_check_no_obj_freed(x, s->objsize); } /* @@ -1099,9 +1140,6 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, static inline void slab_free_hook(struct kmem_cache *s, void *x) {} -static inline void slab_free_hook_irq(struct kmem_cache *s, - void *object) {} - #endif /* CONFIG_SLUB_DEBUG */ /* @@ -1247,21 +1285,38 @@ static void __free_slab(struct kmem_cache *s, struct page *page) __free_pages(page, order); } +#define need_reserve_slab_rcu \ + (sizeof(((struct page *)NULL)->lru) < sizeof(struct rcu_head)) + static void rcu_free_slab(struct rcu_head *h) { struct page *page; - page = container_of((struct list_head *)h, struct page, lru); + if (need_reserve_slab_rcu) + page = virt_to_head_page(h); + else + page = container_of((struct list_head *)h, struct page, lru); + __free_slab(page->slab, page); } static void free_slab(struct kmem_cache *s, struct page *page) { if (unlikely(s->flags & SLAB_DESTROY_BY_RCU)) { - /* - * RCU free overloads the RCU head over the LRU - */ - struct rcu_head *head = (void *)&page->lru; + struct rcu_head *head; + + if (need_reserve_slab_rcu) { + int order = compound_order(page); + int offset = (PAGE_SIZE << order) - s->reserved; + + VM_BUG_ON(s->reserved != sizeof(*head)); + 
head = page_address(page) + offset; + } else { + /* + * RCU free overloads the RCU head over the LRU + */ + head = (void *)&page->lru; + } call_rcu(head, rcu_free_slab); } else @@ -1485,6 +1540,78 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail) } } +#ifdef CONFIG_CMPXCHG_LOCAL +#ifdef CONFIG_PREEMPT +/* + * Calculate the next globally unique transaction for disambiguation + * during cmpxchg. The transactions start with the cpu number and are then + * incremented by CONFIG_NR_CPUS. + */ +#define TID_STEP roundup_pow_of_two(CONFIG_NR_CPUS) +#else +/* + * No preemption supported therefore also no need to check for + * different cpus. + */ +#define TID_STEP 1 +#endif + +static inline unsigned long next_tid(unsigned long tid) +{ + return tid + TID_STEP; +} + +static inline unsigned int tid_to_cpu(unsigned long tid) +{ + return tid % TID_STEP; +} + +static inline unsigned long tid_to_event(unsigned long tid) +{ + return tid / TID_STEP; +} + +static inline unsigned int init_tid(int cpu) +{ + return cpu; +} + +static inline void note_cmpxchg_failure(const char *n, + const struct kmem_cache *s, unsigned long tid) +{ +#ifdef SLUB_DEBUG_CMPXCHG + unsigned long actual_tid = __this_cpu_read(s->cpu_slab->tid); + + printk(KERN_INFO "%s %s: cmpxchg redo ", n, s->name); + +#ifdef CONFIG_PREEMPT + if (tid_to_cpu(tid) != tid_to_cpu(actual_tid)) + printk("due to cpu change %d -> %d\n", + tid_to_cpu(tid), tid_to_cpu(actual_tid)); + else +#endif + if (tid_to_event(tid) != tid_to_event(actual_tid)) + printk("due to cpu running other code. 
Event %ld->%ld\n", + tid_to_event(tid), tid_to_event(actual_tid)); + else + printk("for unknown reason: actual=%lx was=%lx target=%lx\n", + actual_tid, tid, next_tid(tid)); +#endif + stat(s, CMPXCHG_DOUBLE_CPU_FAIL); +} + +#endif + +void init_kmem_cache_cpus(struct kmem_cache *s) +{ +#ifdef CONFIG_CMPXCHG_LOCAL + int cpu; + + for_each_possible_cpu(cpu) + per_cpu_ptr(s->cpu_slab, cpu)->tid = init_tid(cpu); +#endif + +} /* * Remove the cpu slab */ @@ -1516,6 +1643,9 @@ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) page->inuse--; } c->page = NULL; +#ifdef CONFIG_CMPXCHG_LOCAL + c->tid = next_tid(c->tid); +#endif unfreeze_slab(s, page, tail); } @@ -1650,6 +1780,19 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, { void **object; struct page *new; +#ifdef CONFIG_CMPXCHG_LOCAL + unsigned long flags; + + local_irq_save(flags); +#ifdef CONFIG_PREEMPT + /* + * We may have been preempted and rescheduled on a different + * cpu before disabling interrupts. Need to reload cpu area + * pointer. 
+ */ + c = this_cpu_ptr(s->cpu_slab); +#endif +#endif /* We handle __GFP_ZERO in the caller */ gfpflags &= ~__GFP_ZERO; @@ -1676,6 +1819,10 @@ load_freelist: c->node = page_to_nid(c->page); unlock_out: slab_unlock(c->page); +#ifdef CONFIG_CMPXCHG_LOCAL + c->tid = next_tid(c->tid); + local_irq_restore(flags); +#endif stat(s, ALLOC_SLOWPATH); return object; @@ -1711,6 +1858,9 @@ new_slab: } if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit()) slab_out_of_memory(s, gfpflags, node); +#ifdef CONFIG_CMPXCHG_LOCAL + local_irq_restore(flags); +#endif return NULL; debug: if (!alloc_debug_processing(s, c->page, object, addr)) @@ -1737,23 +1887,76 @@ static __always_inline void *slab_alloc(struct kmem_cache *s, { void **object; struct kmem_cache_cpu *c; +#ifdef CONFIG_CMPXCHG_LOCAL + unsigned long tid; +#else unsigned long flags; +#endif if (slab_pre_alloc_hook(s, gfpflags)) return NULL; +#ifndef CONFIG_CMPXCHG_LOCAL local_irq_save(flags); +#else +redo: +#endif + + /* + * Must read kmem_cache cpu data via this cpu ptr. Preemption is + * enabled. We may switch back and forth between cpus while + * reading from one cpu area. That does not matter as long + * as we end up on the original cpu again when doing the cmpxchg. + */ c = __this_cpu_ptr(s->cpu_slab); + +#ifdef CONFIG_CMPXCHG_LOCAL + /* + * The transaction ids are globally unique per cpu and per operation on + * a per cpu queue. Thus they can guarantee that the cmpxchg_double + * occurs on the right processor and that there was no operation on the + * linked list in between. + */ + tid = c->tid; + barrier(); +#endif + object = c->freelist; if (unlikely(!object || !node_match(c, node))) object = __slab_alloc(s, gfpflags, node, addr, c); else { +#ifdef CONFIG_CMPXCHG_LOCAL + /* + * The cmpxchg will only match if there was no additional + * operation and if we are on the right processor. + * + * The cmpxchg does the following atomically (without lock semantics!) + * 1. Relocate first pointer to the current per cpu area. 
+ * 2. Verify that tid and freelist have not been changed + * 3. If they were not changed replace tid and freelist + * + * Since this is without lock semantics the protection is only against + * code executing on this cpu *not* from access by other cpus. + */ + if (unlikely(!irqsafe_cpu_cmpxchg_double( + s->cpu_slab->freelist, s->cpu_slab->tid, + object, tid, + get_freepointer(s, object), next_tid(tid)))) { + + note_cmpxchg_failure("slab_alloc", s, tid); + goto redo; + } +#else c->freelist = get_freepointer(s, object); +#endif stat(s, ALLOC_FASTPATH); } + +#ifndef CONFIG_CMPXCHG_LOCAL local_irq_restore(flags); +#endif if (unlikely(gfpflags & __GFP_ZERO) && object) memset(object, 0, s->objsize); @@ -1774,11 +1977,21 @@ void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags) EXPORT_SYMBOL(kmem_cache_alloc); #ifdef CONFIG_TRACING -void *kmem_cache_alloc_notrace(struct kmem_cache *s, gfp_t gfpflags) +void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size) { - return slab_alloc(s, gfpflags, NUMA_NO_NODE, _RET_IP_); + void *ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, _RET_IP_); + trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags); + return ret; } -EXPORT_SYMBOL(kmem_cache_alloc_notrace); +EXPORT_SYMBOL(kmem_cache_alloc_trace); + +void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order) +{ + void *ret = kmalloc_order(size, flags, order); + trace_kmalloc(_RET_IP_, ret, size, PAGE_SIZE << order, flags); + return ret; +} +EXPORT_SYMBOL(kmalloc_order_trace); #endif #ifdef CONFIG_NUMA @@ -1794,13 +2007,17 @@ void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node) EXPORT_SYMBOL(kmem_cache_alloc_node); #ifdef CONFIG_TRACING -void *kmem_cache_alloc_node_notrace(struct kmem_cache *s, +void *kmem_cache_alloc_node_trace(struct kmem_cache *s, gfp_t gfpflags, - int node) + int node, size_t size) { - return slab_alloc(s, gfpflags, node, _RET_IP_); + void *ret = slab_alloc(s, gfpflags, node, _RET_IP_); + + 
trace_kmalloc_node(_RET_IP_, ret, + size, s->size, gfpflags, node); + return ret; } -EXPORT_SYMBOL(kmem_cache_alloc_node_notrace); +EXPORT_SYMBOL(kmem_cache_alloc_node_trace); #endif #endif @@ -1817,9 +2034,13 @@ static void __slab_free(struct kmem_cache *s, struct page *page, { void *prior; void **object = (void *)x; +#ifdef CONFIG_CMPXCHG_LOCAL + unsigned long flags; - stat(s, FREE_SLOWPATH); + local_irq_save(flags); +#endif slab_lock(page); + stat(s, FREE_SLOWPATH); if (kmem_cache_debug(s)) goto debug; @@ -1849,6 +2070,9 @@ checks_ok: out_unlock: slab_unlock(page); +#ifdef CONFIG_CMPXCHG_LOCAL + local_irq_restore(flags); +#endif return; slab_empty: @@ -1860,6 +2084,9 @@ slab_empty: stat(s, FREE_REMOVE_PARTIAL); } slab_unlock(page); +#ifdef CONFIG_CMPXCHG_LOCAL + local_irq_restore(flags); +#endif stat(s, FREE_SLAB); discard_slab(s, page); return; @@ -1886,23 +2113,56 @@ static __always_inline void slab_free(struct kmem_cache *s, { void **object = (void *)x; struct kmem_cache_cpu *c; +#ifdef CONFIG_CMPXCHG_LOCAL + unsigned long tid; +#else unsigned long flags; +#endif slab_free_hook(s, x); +#ifndef CONFIG_CMPXCHG_LOCAL local_irq_save(flags); + +#else +redo: +#endif + + /* + * Determine the current cpu's per cpu slab. + * The cpu may change afterward. However that does not matter since + * data is retrieved via this pointer. If we are on the same cpu + * during the cmpxchg then the free will succeed. 
+ */ c = __this_cpu_ptr(s->cpu_slab); - slab_free_hook_irq(s, x); +#ifdef CONFIG_CMPXCHG_LOCAL + tid = c->tid; + barrier(); +#endif if (likely(page == c->page && c->node != NUMA_NO_NODE)) { set_freepointer(s, object, c->freelist); + +#ifdef CONFIG_CMPXCHG_LOCAL + if (unlikely(!irqsafe_cpu_cmpxchg_double( + s->cpu_slab->freelist, s->cpu_slab->tid, + c->freelist, tid, + object, next_tid(tid)))) { + + note_cmpxchg_failure("slab_free", s, tid); + goto redo; + } +#else c->freelist = object; +#endif stat(s, FREE_FASTPATH); } else __slab_free(s, page, x, addr); +#ifndef CONFIG_CMPXCHG_LOCAL local_irq_restore(flags); +#endif } void kmem_cache_free(struct kmem_cache *s, void *x) @@ -1917,17 +2177,6 @@ void kmem_cache_free(struct kmem_cache *s, void *x) } EXPORT_SYMBOL(kmem_cache_free); -/* Figure out on which slab page the object resides */ -static struct page *get_object_page(const void *x) -{ - struct page *page = virt_to_head_page(x); - - if (!PageSlab(page)) - return NULL; - - return page; -} - /* * Object placement in a slab is made very easy because we always start at * offset 0. If we tune the size of the object to the alignment then we can @@ -1983,13 +2232,13 @@ static int slub_nomerge; * the smallest order which will fit the object. 
*/ static inline int slab_order(int size, int min_objects, - int max_order, int fract_leftover) + int max_order, int fract_leftover, int reserved) { int order; int rem; int min_order = slub_min_order; - if ((PAGE_SIZE << min_order) / size > MAX_OBJS_PER_PAGE) + if (order_objects(min_order, size, reserved) > MAX_OBJS_PER_PAGE) return get_order(size * MAX_OBJS_PER_PAGE) - 1; for (order = max(min_order, @@ -1998,10 +2247,10 @@ static inline int slab_order(int size, int min_objects, unsigned long slab_size = PAGE_SIZE << order; - if (slab_size < min_objects * size) + if (slab_size < min_objects * size + reserved) continue; - rem = slab_size % size; + rem = (slab_size - reserved) % size; if (rem <= slab_size / fract_leftover) break; @@ -2011,7 +2260,7 @@ static inline int slab_order(int size, int min_objects, return order; } -static inline int calculate_order(int size) +static inline int calculate_order(int size, int reserved) { int order; int min_objects; @@ -2029,14 +2278,14 @@ static inline int calculate_order(int size) min_objects = slub_min_objects; if (!min_objects) min_objects = 4 * (fls(nr_cpu_ids) + 1); - max_objects = (PAGE_SIZE << slub_max_order)/size; + max_objects = order_objects(slub_max_order, size, reserved); min_objects = min(min_objects, max_objects); while (min_objects > 1) { fraction = 16; while (fraction >= 4) { order = slab_order(size, min_objects, - slub_max_order, fraction); + slub_max_order, fraction, reserved); if (order <= slub_max_order) return order; fraction /= 2; @@ -2048,14 +2297,14 @@ static inline int calculate_order(int size) * We were unable to place multiple objects in a slab. Now * lets see if we can place a single object there. */ - order = slab_order(size, 1, slub_max_order, 1); + order = slab_order(size, 1, slub_max_order, 1, reserved); if (order <= slub_max_order) return order; /* * Doh this slab cannot be placed using slub_max_order. 
*/ - order = slab_order(size, 1, MAX_ORDER, 1); + order = slab_order(size, 1, MAX_ORDER, 1, reserved); if (order < MAX_ORDER) return order; return -ENOSYS; @@ -2105,9 +2354,23 @@ static inline int alloc_kmem_cache_cpus(struct kmem_cache *s) BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE < SLUB_PAGE_SHIFT * sizeof(struct kmem_cache_cpu)); +#ifdef CONFIG_CMPXCHG_LOCAL + /* + * Must align to double word boundary for the double cmpxchg instructions + * to work. + */ + s->cpu_slab = __alloc_percpu(sizeof(struct kmem_cache_cpu), 2 * sizeof(void *)); +#else + /* Regular alignment is sufficient */ s->cpu_slab = alloc_percpu(struct kmem_cache_cpu); +#endif + + if (!s->cpu_slab) + return 0; + + init_kmem_cache_cpus(s); - return s->cpu_slab != NULL; + return 1; } static struct kmem_cache *kmem_cache_node; @@ -2306,7 +2569,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) if (forced_order >= 0) order = forced_order; else - order = calculate_order(size); + order = calculate_order(size, s->reserved); if (order < 0) return 0; @@ -2324,8 +2587,8 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) /* * Determine the number of objects per slab */ - s->oo = oo_make(order, size); - s->min = oo_make(get_order(size), size); + s->oo = oo_make(order, size, s->reserved); + s->min = oo_make(get_order(size), size, s->reserved); if (oo_objects(s->oo) > oo_objects(s->max)) s->max = s->oo; @@ -2344,6 +2607,10 @@ static int kmem_cache_open(struct kmem_cache *s, s->objsize = size; s->align = align; s->flags = kmem_cache_flags(size, flags, name, ctor); + s->reserved = 0; + + if (need_reserve_slab_rcu && (s->flags & SLAB_DESTROY_BY_RCU)) + s->reserved = sizeof(struct rcu_head); if (!calculate_sizes(s, -1)) goto error; @@ -2386,35 +2653,6 @@ error: } /* - * Check if a given pointer is valid - */ -int kmem_ptr_validate(struct kmem_cache *s, const void *object) -{ - struct page *page; - - if (!kern_ptr_validate(object, s->size)) - return 0; - - page = 
get_object_page(object); - - if (!page || s != page->slab) - /* No slab or wrong slab */ - return 0; - - if (!check_valid_pointer(s, page, object)) - return 0; - - /* - * We could also check if the object is on the slabs freelist. - * But this would be too expensive and it seems that the main - * purpose of kmem_ptr_valid() is to check if the object belongs - * to a certain slab. - */ - return 1; -} -EXPORT_SYMBOL(kmem_ptr_validate); - -/* * Determine the size of a slab object */ unsigned int kmem_cache_size(struct kmem_cache *s) @@ -2423,12 +2661,6 @@ unsigned int kmem_cache_size(struct kmem_cache *s) } EXPORT_SYMBOL(kmem_cache_size); -const char *kmem_cache_name(struct kmem_cache *s) -{ - return s->name; -} -EXPORT_SYMBOL(kmem_cache_name); - static void list_slab_objects(struct kmem_cache *s, struct page *page, const char *text) { @@ -2720,7 +2952,6 @@ EXPORT_SYMBOL(__kmalloc_node); size_t ksize(const void *object) { struct page *page; - struct kmem_cache *s; if (unlikely(object == ZERO_SIZE_PTR)) return 0; @@ -2731,28 +2962,8 @@ size_t ksize(const void *object) WARN_ON(!PageCompound(page)); return PAGE_SIZE << compound_order(page); } - s = page->slab; - -#ifdef CONFIG_SLUB_DEBUG - /* - * Debugging requires use of the padding between object - * and whatever may come after it. - */ - if (s->flags & (SLAB_RED_ZONE | SLAB_POISON)) - return s->objsize; -#endif - /* - * If we have the need to store the freelist pointer - * back there or track user information then we can - * only use the space before that information. - */ - if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER)) - return s->inuse; - /* - * Else we can use all the padding etc for the allocation - */ - return s->size; + return slab_ksize(page->slab); } EXPORT_SYMBOL(ksize); @@ -3336,7 +3547,7 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller) ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, caller); - /* Honor the call site pointer we recieved. 
*/ + /* Honor the call site pointer we received. */ trace_kmalloc(caller, ret, size, s->size, gfpflags); return ret; @@ -3366,7 +3577,7 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, ret = slab_alloc(s, gfpflags, node, caller); - /* Honor the call site pointer we recieved. */ + /* Honor the call site pointer we received. */ trace_kmalloc_node(caller, ret, size, s->size, gfpflags, node); return ret; @@ -3660,7 +3871,7 @@ static int list_locations(struct kmem_cache *s, char *buf, len += sprintf(buf + len, "%7ld ", l->count); if (l->addr) - len += sprint_symbol(buf + len, (unsigned long)l->addr); + len += sprintf(buf + len, "%pS", (void *)l->addr); else len += sprintf(buf + len, "<not-available>"); @@ -3821,7 +4032,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s, } } - down_read(&slub_lock); + lock_memory_hotplug(); #ifdef CONFIG_SLUB_DEBUG if (flags & SO_ALL) { for_each_node_state(node, N_NORMAL_MEMORY) { @@ -3862,7 +4073,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s, x += sprintf(buf + x, " N%d=%lu", node, nodes[node]); #endif - up_read(&slub_lock); + unlock_memory_hotplug(); kfree(nodes); return x + sprintf(buf + x, "\n"); } @@ -3970,12 +4181,9 @@ SLAB_ATTR(min_partial); static ssize_t ctor_show(struct kmem_cache *s, char *buf) { - if (s->ctor) { - int n = sprint_symbol(buf, (unsigned long)s->ctor); - - return n + sprintf(buf + n, "\n"); - } - return 0; + if (!s->ctor) + return 0; + return sprintf(buf, "%pS\n", s->ctor); } SLAB_ATTR_RO(ctor); @@ -4044,6 +4252,12 @@ static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf) } SLAB_ATTR_RO(destroy_by_rcu); +static ssize_t reserved_show(struct kmem_cache *s, char *buf) +{ + return sprintf(buf, "%d\n", s->reserved); +} +SLAB_ATTR_RO(reserved); + #ifdef CONFIG_SLUB_DEBUG static ssize_t slabs_show(struct kmem_cache *s, char *buf) { @@ -4330,6 +4544,7 @@ static struct attribute *slab_attrs[] = { &reclaim_account_attr.attr, &destroy_by_rcu_attr.attr, &shrink_attr.attr, + 
&reserved_attr.attr, #ifdef CONFIG_SLUB_DEBUG &total_objects_attr.attr, &slabs_attr.attr, diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index 29d6cbffb283..64b984091edb 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -9,7 +9,7 @@ * * However, virtual mappings need a page table and TLBs. Many Linux * architectures already map their physical space using 1-1 mappings - * via TLBs. For those arches the virtual memmory map is essentially + * via TLBs. For those arches the virtual memory map is essentially * for free if we use the same page size as the 1-1 mappings. In that * case the overhead consists of a few additional pages that are * allocated to create a view of memory for vmemmap. diff --git a/mm/sparse.c b/mm/sparse.c index 95ac219af379..aa64b12831a2 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -500,7 +500,7 @@ void __init sparse_init(void) * so alloc 2M (with 2M align) and 24 bytes in turn will * make next 2M slip to one more 2M later. * then in big system, the memory will have a lot of holes... - * here try to allocate 2M pages continously. + * here try to allocate 2M pages continuously. * * powerpc need to call sparse_init_one_section right after each * sparse_early_mem_map_alloc, so allocate usemap_map at first. 
@@ -671,10 +671,10 @@ static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages) static void free_map_bootmem(struct page *page, unsigned long nr_pages) { unsigned long maps_section_nr, removing_section_nr, i; - int magic; + unsigned long magic; for (i = 0; i < nr_pages; i++, page++) { - magic = atomic_read(&page->_mapcount); + magic = (unsigned long) page->lru.next; BUG_ON(magic == NODE_INFO); diff --git a/mm/swap.c b/mm/swap.c index 3f4854205b16..a448db377cb0 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -39,6 +39,7 @@ int page_cluster; static DEFINE_PER_CPU(struct pagevec[NR_LRU_LISTS], lru_add_pvecs); static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs); +static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs); /* * This path almost never happens for VM activity - pages are normally @@ -56,17 +57,97 @@ static void __page_cache_release(struct page *page) del_page_from_lru(zone, page); spin_unlock_irqrestore(&zone->lru_lock, flags); } +} + +static void __put_single_page(struct page *page) +{ + __page_cache_release(page); free_hot_cold_page(page, 0); } -static void put_compound_page(struct page *page) +static void __put_compound_page(struct page *page) { - page = compound_head(page); - if (put_page_testzero(page)) { - compound_page_dtor *dtor; + compound_page_dtor *dtor; - dtor = get_compound_page_dtor(page); - (*dtor)(page); + __page_cache_release(page); + dtor = get_compound_page_dtor(page); + (*dtor)(page); +} + +static void put_compound_page(struct page *page) +{ + if (unlikely(PageTail(page))) { + /* __split_huge_page_refcount can run under us */ + struct page *page_head = page->first_page; + smp_rmb(); + /* + * If PageTail is still set after smp_rmb() we can be sure + * that the page->first_page we read wasn't a dangling pointer. + * See __split_huge_page_refcount() smp_wmb(). 
+ */ + if (likely(PageTail(page) && get_page_unless_zero(page_head))) { + unsigned long flags; + /* + * Verify that our page_head wasn't converted + * to a a regular page before we got a + * reference on it. + */ + if (unlikely(!PageHead(page_head))) { + /* PageHead is cleared after PageTail */ + smp_rmb(); + VM_BUG_ON(PageTail(page)); + goto out_put_head; + } + /* + * Only run compound_lock on a valid PageHead, + * after having it pinned with + * get_page_unless_zero() above. + */ + smp_mb(); + /* page_head wasn't a dangling pointer */ + flags = compound_lock_irqsave(page_head); + if (unlikely(!PageTail(page))) { + /* __split_huge_page_refcount run before us */ + compound_unlock_irqrestore(page_head, flags); + VM_BUG_ON(PageHead(page_head)); + out_put_head: + if (put_page_testzero(page_head)) + __put_single_page(page_head); + out_put_single: + if (put_page_testzero(page)) + __put_single_page(page); + return; + } + VM_BUG_ON(page_head != page->first_page); + /* + * We can release the refcount taken by + * get_page_unless_zero now that + * split_huge_page_refcount is blocked on the + * compound_lock. 
+ */ + if (put_page_testzero(page_head)) + VM_BUG_ON(1); + /* __split_huge_page_refcount will wait now */ + VM_BUG_ON(atomic_read(&page->_count) <= 0); + atomic_dec(&page->_count); + VM_BUG_ON(atomic_read(&page_head->_count) <= 0); + compound_unlock_irqrestore(page_head, flags); + if (put_page_testzero(page_head)) { + if (PageHead(page_head)) + __put_compound_page(page_head); + else + __put_single_page(page_head); + } + } else { + /* page_head is a dangling pointer */ + VM_BUG_ON(PageTail(page)); + goto out_put_single; + } + } else if (put_page_testzero(page)) { + if (PageHead(page)) + __put_compound_page(page); + else + __put_single_page(page); } } @@ -75,7 +156,7 @@ void put_page(struct page *page) if (unlikely(PageCompound(page))) put_compound_page(page); else if (put_page_testzero(page)) - __page_cache_release(page); + __put_single_page(page); } EXPORT_SYMBOL(put_page); @@ -98,15 +179,13 @@ void put_pages_list(struct list_head *pages) } EXPORT_SYMBOL(put_pages_list); -/* - * pagevec_move_tail() must be called with IRQ disabled. - * Otherwise this may cause nasty races. 
- */ -static void pagevec_move_tail(struct pagevec *pvec) +static void pagevec_lru_move_fn(struct pagevec *pvec, + void (*move_fn)(struct page *page, void *arg), + void *arg) { int i; - int pgmoved = 0; struct zone *zone = NULL; + unsigned long flags = 0; for (i = 0; i < pagevec_count(pvec); i++) { struct page *page = pvec->pages[i]; @@ -114,29 +193,50 @@ static void pagevec_move_tail(struct pagevec *pvec) if (pagezone != zone) { if (zone) - spin_unlock(&zone->lru_lock); + spin_unlock_irqrestore(&zone->lru_lock, flags); zone = pagezone; - spin_lock(&zone->lru_lock); - } - if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { - int lru = page_lru_base_type(page); - list_move_tail(&page->lru, &zone->lru[lru].list); - pgmoved++; + spin_lock_irqsave(&zone->lru_lock, flags); } + + (*move_fn)(page, arg); } if (zone) - spin_unlock(&zone->lru_lock); - __count_vm_events(PGROTATED, pgmoved); + spin_unlock_irqrestore(&zone->lru_lock, flags); release_pages(pvec->pages, pvec->nr, pvec->cold); pagevec_reinit(pvec); } +static void pagevec_move_tail_fn(struct page *page, void *arg) +{ + int *pgmoved = arg; + struct zone *zone = page_zone(page); + + if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { + enum lru_list lru = page_lru_base_type(page); + list_move_tail(&page->lru, &zone->lru[lru].list); + mem_cgroup_rotate_reclaimable_page(page); + (*pgmoved)++; + } +} + +/* + * pagevec_move_tail() must be called with IRQ disabled. + * Otherwise this may cause nasty races. + */ +static void pagevec_move_tail(struct pagevec *pvec) +{ + int pgmoved = 0; + + pagevec_lru_move_fn(pvec, pagevec_move_tail_fn, &pgmoved); + __count_vm_events(PGROTATED, pgmoved); +} + /* * Writeback is about to end against a page which has been marked for immediate * reclaim. If it still appears to be reclaimable, move it to the tail of the * inactive list. 
*/ -void rotate_reclaimable_page(struct page *page) +void rotate_reclaimable_page(struct page *page) { if (!PageLocked(page) && !PageDirty(page) && !PageActive(page) && !PageUnevictable(page) && PageLRU(page)) { @@ -267,6 +367,71 @@ void add_page_to_unevictable_list(struct page *page) } /* + * If the page can not be invalidated, it is moved to the + * inactive list to speed up its reclaim. It is moved to the + * head of the list, rather than the tail, to give the flusher + * threads some time to write it out, as this is much more + * effective than the single-page writeout from reclaim. + * + * If the page isn't page_mapped and dirty/writeback, the page + * could reclaim asap using PG_reclaim. + * + * 1. active, mapped page -> none + * 2. active, dirty/writeback page -> inactive, head, PG_reclaim + * 3. inactive, mapped page -> none + * 4. inactive, dirty/writeback page -> inactive, head, PG_reclaim + * 5. inactive, clean -> inactive, tail + * 6. Others -> none + * + * In 4, why it moves inactive's head, the VM expects the page would + * be write it out by flusher threads as this is much more effective + * than the single-page writeout from reclaim. + */ +static void lru_deactivate_fn(struct page *page, void *arg) +{ + int lru, file; + bool active; + struct zone *zone = page_zone(page); + + if (!PageLRU(page)) + return; + + /* Some processes are using the page */ + if (page_mapped(page)) + return; + + active = PageActive(page); + + file = page_is_file_cache(page); + lru = page_lru_base_type(page); + del_page_from_lru_list(zone, page, lru + active); + ClearPageActive(page); + ClearPageReferenced(page); + add_page_to_lru_list(zone, page, lru); + + if (PageWriteback(page) || PageDirty(page)) { + /* + * PG_reclaim could be raced with end_page_writeback + * It can make readahead confusing. But race window + * is _really_ small and it's non-critical problem. 
+ */ + SetPageReclaim(page); + } else { + /* + * The page's writeback ends up during pagevec + * We moves tha page into tail of inactive. + */ + list_move_tail(&page->lru, &zone->lru[lru].list); + mem_cgroup_rotate_reclaimable_page(page); + __count_vm_event(PGROTATED); + } + + if (active) + __count_vm_event(PGDEACTIVATE); + update_page_reclaim_stat(zone, page, file, 0); +} + +/* * Drain pages out of the cpu's pagevecs. * Either "cpu" is the current CPU, and preemption has already been * disabled; or "cpu" is being hot-unplugged, and is already dead. @@ -292,6 +457,29 @@ static void drain_cpu_pagevecs(int cpu) pagevec_move_tail(pvec); local_irq_restore(flags); } + + pvec = &per_cpu(lru_deactivate_pvecs, cpu); + if (pagevec_count(pvec)) + pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL); +} + +/** + * deactivate_page - forcefully deactivate a page + * @page: page to deactivate + * + * This function hints the VM that @page is a good reclaim candidate, + * for example if its invalidation fails due to the page being dirty + * or under writeback. 
+ */ +void deactivate_page(struct page *page) +{ + if (likely(get_page_unless_zero(page))) { + struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs); + + if (!pagevec_add(pvec, page)) + pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL); + put_cpu_var(lru_deactivate_pvecs); + } } void lru_add_drain(void) @@ -399,44 +587,70 @@ void __pagevec_release(struct pagevec *pvec) EXPORT_SYMBOL(__pagevec_release); +/* used by __split_huge_page_refcount() */ +void lru_add_page_tail(struct zone* zone, + struct page *page, struct page *page_tail) +{ + int active; + enum lru_list lru; + const int file = 0; + struct list_head *head; + + VM_BUG_ON(!PageHead(page)); + VM_BUG_ON(PageCompound(page_tail)); + VM_BUG_ON(PageLRU(page_tail)); + VM_BUG_ON(!spin_is_locked(&zone->lru_lock)); + + SetPageLRU(page_tail); + + if (page_evictable(page_tail, NULL)) { + if (PageActive(page)) { + SetPageActive(page_tail); + active = 1; + lru = LRU_ACTIVE_ANON; + } else { + active = 0; + lru = LRU_INACTIVE_ANON; + } + update_page_reclaim_stat(zone, page_tail, file, active); + if (likely(PageLRU(page))) + head = page->lru.prev; + else + head = &zone->lru[lru].list; + __add_page_to_lru_list(zone, page_tail, lru, head); + } else { + SetPageUnevictable(page_tail); + add_page_to_lru_list(zone, page_tail, LRU_UNEVICTABLE); + } +} + +static void ____pagevec_lru_add_fn(struct page *page, void *arg) +{ + enum lru_list lru = (enum lru_list)arg; + struct zone *zone = page_zone(page); + int file = is_file_lru(lru); + int active = is_active_lru(lru); + + VM_BUG_ON(PageActive(page)); + VM_BUG_ON(PageUnevictable(page)); + VM_BUG_ON(PageLRU(page)); + + SetPageLRU(page); + if (active) + SetPageActive(page); + update_page_reclaim_stat(zone, page, file, active); + add_page_to_lru_list(zone, page, lru); +} + /* * Add the passed pages to the LRU, then drop the caller's refcount * on them. Reinitialises the caller's pagevec. 
*/ void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru) { - int i; - struct zone *zone = NULL; - VM_BUG_ON(is_unevictable_lru(lru)); - for (i = 0; i < pagevec_count(pvec); i++) { - struct page *page = pvec->pages[i]; - struct zone *pagezone = page_zone(page); - int file; - int active; - - if (pagezone != zone) { - if (zone) - spin_unlock_irq(&zone->lru_lock); - zone = pagezone; - spin_lock_irq(&zone->lru_lock); - } - VM_BUG_ON(PageActive(page)); - VM_BUG_ON(PageUnevictable(page)); - VM_BUG_ON(PageLRU(page)); - SetPageLRU(page); - active = is_active_lru(lru); - file = is_file_lru(lru); - if (active) - SetPageActive(page); - update_page_reclaim_stat(zone, page, file, active); - add_page_to_lru_list(zone, page, lru); - } - if (zone) - spin_unlock_irq(&zone->lru_lock); - release_pages(pvec->pages, pvec->nr, pvec->cold); - pagevec_reinit(pvec); + pagevec_lru_move_fn(pvec, ____pagevec_lru_add_fn, (void *)lru); } EXPORT_SYMBOL(____pagevec_lru_add); diff --git a/mm/swap_state.c b/mm/swap_state.c index e10f5833167f..46680461785b 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -24,12 +24,10 @@ /* * swapper_space is a fiction, retained to simplify the path through - * vmscan's shrink_page_list, to make sync_page look nicer, and to allow - * future use of radix_tree tags in the swap cache. + * vmscan's shrink_page_list. 
*/ static const struct address_space_operations swap_aops = { .writepage = swap_writepage, - .sync_page = block_sync_page, .set_page_dirty = __set_page_dirty_nobuffers, .migratepage = migrate_page, }; @@ -37,7 +35,6 @@ static const struct address_space_operations swap_aops = { static struct backing_dev_info swap_backing_dev_info = { .name = "swap", .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED, - .unplug_io_fn = swap_unplug_io_fn, }; struct address_space swapper_space = { @@ -157,6 +154,12 @@ int add_to_swap(struct page *page) if (!entry.val) return 0; + if (unlikely(PageTransHuge(page))) + if (unlikely(split_huge_page(page))) { + swapcache_free(entry, NULL); + return 0; + } + /* * Radix-tree node allocations from PF_MEMALLOC contexts could * completely exhaust the page allocator. __GFP_NOMEMALLOC diff --git a/mm/swapfile.c b/mm/swapfile.c index 67ddaaf98c74..8c6b3ce38f09 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -95,39 +95,6 @@ __try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset) } /* - * We need this because the bdev->unplug_fn can sleep and we cannot - * hold swap_lock while calling the unplug_fn. And swap_lock - * cannot be turned into a mutex. - */ -static DECLARE_RWSEM(swap_unplug_sem); - -void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page) -{ - swp_entry_t entry; - - down_read(&swap_unplug_sem); - entry.val = page_private(page); - if (PageSwapCache(page)) { - struct block_device *bdev = swap_info[swp_type(entry)]->bdev; - struct backing_dev_info *bdi; - - /* - * If the page is removed from swapcache from under us (with a - * racy try_to_unuse/swapoff) we need an additional reference - * count to avoid reading garbage from page_private(page) above. - * If the WARN_ON triggers during a swapoff it maybe the race - * condition and it's harmless. However if it triggers without - * swapoff it signals a problem. 
- */ - WARN_ON(page_count(page) <= 1); - - bdi = bdev->bd_inode->i_mapping->backing_dev_info; - blk_run_backing_dev(bdi, page); - } - up_read(&swap_unplug_sem); -} - -/* * swapon tell device that all the old swap contents can be discarded, * to allow the swap device to optimize its wear-levelling. */ @@ -212,8 +179,8 @@ static int wait_for_discard(void *word) #define SWAPFILE_CLUSTER 256 #define LATENCY_LIMIT 256 -static inline unsigned long scan_swap_map(struct swap_info_struct *si, - unsigned char usage) +static unsigned long scan_swap_map(struct swap_info_struct *si, + unsigned char usage) { unsigned long offset; unsigned long scan_base; @@ -880,7 +847,7 @@ unsigned int count_swap_pages(int type, int free) static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, swp_entry_t entry, struct page *page) { - struct mem_cgroup *ptr = NULL; + struct mem_cgroup *ptr; spinlock_t *ptl; pte_t *pte; int ret = 1; @@ -964,6 +931,8 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud, pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); + if (unlikely(pmd_trans_huge(*pmd))) + continue; if (pmd_none_or_clear_bad(pmd)) continue; ret = unuse_pte_range(vma, pmd, addr, next, entry, page); @@ -1548,6 +1517,36 @@ bad_bmap: goto out; } +static void enable_swap_info(struct swap_info_struct *p, int prio, + unsigned char *swap_map) +{ + int i, prev; + + spin_lock(&swap_lock); + if (prio >= 0) + p->prio = prio; + else + p->prio = --least_priority; + p->swap_map = swap_map; + p->flags |= SWP_WRITEOK; + nr_swap_pages += p->pages; + total_swap_pages += p->pages; + + /* insert swap space into swap_list: */ + prev = -1; + for (i = swap_list.head; i >= 0; i = swap_info[i]->next) { + if (p->prio >= swap_info[i]->prio) + break; + prev = i; + } + p->next = i; + if (prev < 0) + swap_list.head = swap_list.next = p->type; + else + swap_info[prev]->next = p->type; + spin_unlock(&swap_lock); +} + SYSCALL_DEFINE1(swapoff, const char __user *, 
specialfile) { struct swap_info_struct *p = NULL; @@ -1619,32 +1618,17 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) current->flags &= ~PF_OOM_ORIGIN; if (err) { + /* + * reading p->prio and p->swap_map outside the lock is + * safe here because only sys_swapon and sys_swapoff + * change them, and there can be no other sys_swapon or + * sys_swapoff for this swap_info_struct at this point. + */ /* re-insert swap space back into swap_list */ - spin_lock(&swap_lock); - if (p->prio < 0) - p->prio = --least_priority; - prev = -1; - for (i = swap_list.head; i >= 0; i = swap_info[i]->next) { - if (p->prio >= swap_info[i]->prio) - break; - prev = i; - } - p->next = i; - if (prev < 0) - swap_list.head = swap_list.next = type; - else - swap_info[prev]->next = type; - nr_swap_pages += p->pages; - total_swap_pages += p->pages; - p->flags |= SWP_WRITEOK; - spin_unlock(&swap_lock); + enable_swap_info(p, p->prio, p->swap_map); goto out_dput; } - /* wait for any unplug function to finish */ - down_write(&swap_unplug_sem); - up_write(&swap_unplug_sem); - destroy_swap_extents(p); if (p->flags & SWP_CONTINUED) free_swap_count_continuations(p); @@ -1677,7 +1661,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) if (S_ISBLK(inode->i_mode)) { struct block_device *bdev = I_BDEV(inode); set_blocksize(bdev, p->old_block_size); - bd_release(bdev); + blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); } else { mutex_lock(&inode->i_mutex); inode->i_flags &= ~S_SWAPFILE; @@ -1842,49 +1826,24 @@ static int __init max_swapfiles_check(void) late_initcall(max_swapfiles_check); #endif -/* - * Written 01/25/92 by Simmule Turner, heavily changed by Linus. 
- * - * The swapon system call - */ -SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) +static struct swap_info_struct *alloc_swap_info(void) { struct swap_info_struct *p; - char *name = NULL; - struct block_device *bdev = NULL; - struct file *swap_file = NULL; - struct address_space *mapping; unsigned int type; - int i, prev; - int error; - union swap_header *swap_header; - unsigned int nr_good_pages; - int nr_extents = 0; - sector_t span; - unsigned long maxpages; - unsigned long swapfilepages; - unsigned char *swap_map = NULL; - struct page *page = NULL; - struct inode *inode = NULL; - int did_down = 0; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; p = kzalloc(sizeof(*p), GFP_KERNEL); if (!p) - return -ENOMEM; + return ERR_PTR(-ENOMEM); spin_lock(&swap_lock); for (type = 0; type < nr_swapfiles; type++) { if (!(swap_info[type]->flags & SWP_USED)) break; } - error = -EPERM; if (type >= MAX_SWAPFILES) { spin_unlock(&swap_lock); kfree(p); - goto out; + return ERR_PTR(-EPERM); } if (type >= nr_swapfiles) { p->type = type; @@ -1909,80 +1868,49 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) p->next = -1; spin_unlock(&swap_lock); - name = getname(specialfile); - error = PTR_ERR(name); - if (IS_ERR(name)) { - name = NULL; - goto bad_swap_2; - } - swap_file = filp_open(name, O_RDWR|O_LARGEFILE, 0); - error = PTR_ERR(swap_file); - if (IS_ERR(swap_file)) { - swap_file = NULL; - goto bad_swap_2; - } - - p->swap_file = swap_file; - mapping = swap_file->f_mapping; - inode = mapping->host; - - error = -EBUSY; - for (i = 0; i < nr_swapfiles; i++) { - struct swap_info_struct *q = swap_info[i]; + return p; +} - if (i == type || !q->swap_file) - continue; - if (mapping == q->swap_file->f_mapping) - goto bad_swap; - } +static int claim_swapfile(struct swap_info_struct *p, struct inode *inode) +{ + int error; - error = -EINVAL; if (S_ISBLK(inode->i_mode)) { - bdev = I_BDEV(inode); - error = bd_claim(bdev, sys_swapon); + 
p->bdev = bdgrab(I_BDEV(inode)); + error = blkdev_get(p->bdev, + FMODE_READ | FMODE_WRITE | FMODE_EXCL, + sys_swapon); if (error < 0) { - bdev = NULL; - error = -EINVAL; - goto bad_swap; + p->bdev = NULL; + return -EINVAL; } - p->old_block_size = block_size(bdev); - error = set_blocksize(bdev, PAGE_SIZE); + p->old_block_size = block_size(p->bdev); + error = set_blocksize(p->bdev, PAGE_SIZE); if (error < 0) - goto bad_swap; - p->bdev = bdev; + return error; p->flags |= SWP_BLKDEV; } else if (S_ISREG(inode->i_mode)) { p->bdev = inode->i_sb->s_bdev; mutex_lock(&inode->i_mutex); - did_down = 1; - if (IS_SWAPFILE(inode)) { - error = -EBUSY; - goto bad_swap; - } - } else { - goto bad_swap; - } + if (IS_SWAPFILE(inode)) + return -EBUSY; + } else + return -EINVAL; - swapfilepages = i_size_read(inode) >> PAGE_SHIFT; + return 0; +} - /* - * Read the swap header. - */ - if (!mapping->a_ops->readpage) { - error = -EINVAL; - goto bad_swap; - } - page = read_mapping_page(mapping, 0, swap_file); - if (IS_ERR(page)) { - error = PTR_ERR(page); - goto bad_swap; - } - swap_header = kmap(page); +static unsigned long read_swap_header(struct swap_info_struct *p, + union swap_header *swap_header, + struct inode *inode) +{ + int i; + unsigned long maxpages; + unsigned long swapfilepages; if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) { printk(KERN_ERR "Unable to find swap-space signature\n"); - error = -EINVAL; - goto bad_swap; + return 0; } /* swap partition endianess hack... 
*/ @@ -1998,8 +1926,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) printk(KERN_WARNING "Unable to handle swap header version %d\n", swap_header->info.version); - error = -EINVAL; - goto bad_swap; + return 0; } p->lowest_bit = 1; @@ -2030,61 +1957,155 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) } p->highest_bit = maxpages - 1; - error = -EINVAL; if (!maxpages) - goto bad_swap; + return 0; + swapfilepages = i_size_read(inode) >> PAGE_SHIFT; if (swapfilepages && maxpages > swapfilepages) { printk(KERN_WARNING "Swap area shorter than signature indicates\n"); - goto bad_swap; + return 0; } if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode)) - goto bad_swap; + return 0; if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) - goto bad_swap; + return 0; - /* OK, set up the swap map and apply the bad block list */ - swap_map = vmalloc(maxpages); - if (!swap_map) { - error = -ENOMEM; - goto bad_swap; - } + return maxpages; +} + +static int setup_swap_map_and_extents(struct swap_info_struct *p, + union swap_header *swap_header, + unsigned char *swap_map, + unsigned long maxpages, + sector_t *span) +{ + int i; + unsigned int nr_good_pages; + int nr_extents; - memset(swap_map, 0, maxpages); nr_good_pages = maxpages - 1; /* omit header page */ for (i = 0; i < swap_header->info.nr_badpages; i++) { unsigned int page_nr = swap_header->info.badpages[i]; - if (page_nr == 0 || page_nr > swap_header->info.last_page) { - error = -EINVAL; - goto bad_swap; - } + if (page_nr == 0 || page_nr > swap_header->info.last_page) + return -EINVAL; if (page_nr < maxpages) { swap_map[page_nr] = SWAP_MAP_BAD; nr_good_pages--; } } - error = swap_cgroup_swapon(type, maxpages); - if (error) - goto bad_swap; - if (nr_good_pages) { swap_map[0] = SWAP_MAP_BAD; p->max = maxpages; p->pages = nr_good_pages; - nr_extents = setup_swap_extents(p, &span); - if (nr_extents < 0) { - error = nr_extents; - goto bad_swap; - } + nr_extents = 
setup_swap_extents(p, span); + if (nr_extents < 0) + return nr_extents; nr_good_pages = p->pages; } if (!nr_good_pages) { printk(KERN_WARNING "Empty swap-file\n"); + return -EINVAL; + } + + return nr_extents; +} + +SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) +{ + struct swap_info_struct *p; + char *name; + struct file *swap_file = NULL; + struct address_space *mapping; + int i; + int prio; + int error; + union swap_header *swap_header; + int nr_extents; + sector_t span; + unsigned long maxpages; + unsigned char *swap_map = NULL; + struct page *page = NULL; + struct inode *inode = NULL; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + p = alloc_swap_info(); + if (IS_ERR(p)) + return PTR_ERR(p); + + name = getname(specialfile); + if (IS_ERR(name)) { + error = PTR_ERR(name); + name = NULL; + goto bad_swap; + } + swap_file = filp_open(name, O_RDWR|O_LARGEFILE, 0); + if (IS_ERR(swap_file)) { + error = PTR_ERR(swap_file); + swap_file = NULL; + goto bad_swap; + } + + p->swap_file = swap_file; + mapping = swap_file->f_mapping; + + for (i = 0; i < nr_swapfiles; i++) { + struct swap_info_struct *q = swap_info[i]; + + if (q == p || !q->swap_file) + continue; + if (mapping == q->swap_file->f_mapping) { + error = -EBUSY; + goto bad_swap; + } + } + + inode = mapping->host; + /* If S_ISREG(inode->i_mode) will do mutex_lock(&inode->i_mutex); */ + error = claim_swapfile(p, inode); + if (unlikely(error)) + goto bad_swap; + + /* + * Read the swap header. 
+ */ + if (!mapping->a_ops->readpage) { error = -EINVAL; goto bad_swap; } + page = read_mapping_page(mapping, 0, swap_file); + if (IS_ERR(page)) { + error = PTR_ERR(page); + goto bad_swap; + } + swap_header = kmap(page); + + maxpages = read_swap_header(p, swap_header, inode); + if (unlikely(!maxpages)) { + error = -EINVAL; + goto bad_swap; + } + + /* OK, set up the swap map and apply the bad block list */ + swap_map = vzalloc(maxpages); + if (!swap_map) { + error = -ENOMEM; + goto bad_swap; + } + + error = swap_cgroup_swapon(p->type, maxpages); + if (error) + goto bad_swap; + + nr_extents = setup_swap_map_and_extents(p, swap_header, swap_map, + maxpages, &span); + if (unlikely(nr_extents < 0)) { + error = nr_extents; + goto bad_swap; + } if (p->bdev) { if (blk_queue_nonrot(bdev_get_queue(p->bdev))) { @@ -2096,58 +2117,46 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) } mutex_lock(&swapon_mutex); - spin_lock(&swap_lock); + prio = -1; if (swap_flags & SWAP_FLAG_PREFER) - p->prio = + prio = (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT; - else - p->prio = --least_priority; - p->swap_map = swap_map; - p->flags |= SWP_WRITEOK; - nr_swap_pages += nr_good_pages; - total_swap_pages += nr_good_pages; + enable_swap_info(p, prio, swap_map); printk(KERN_INFO "Adding %uk swap on %s. " "Priority:%d extents:%d across:%lluk %s%s\n", - nr_good_pages<<(PAGE_SHIFT-10), name, p->prio, + p->pages<<(PAGE_SHIFT-10), name, p->prio, nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10), (p->flags & SWP_SOLIDSTATE) ? "SS" : "", (p->flags & SWP_DISCARDABLE) ? 
"D" : ""); - /* insert swap space into swap_list: */ - prev = -1; - for (i = swap_list.head; i >= 0; i = swap_info[i]->next) { - if (p->prio >= swap_info[i]->prio) - break; - prev = i; - } - p->next = i; - if (prev < 0) - swap_list.head = swap_list.next = type; - else - swap_info[prev]->next = type; - spin_unlock(&swap_lock); mutex_unlock(&swapon_mutex); atomic_inc(&proc_poll_event); wake_up_interruptible(&proc_poll_wait); + if (S_ISREG(inode->i_mode)) + inode->i_flags |= S_SWAPFILE; error = 0; goto out; bad_swap: - if (bdev) { - set_blocksize(bdev, p->old_block_size); - bd_release(bdev); + if (inode && S_ISBLK(inode->i_mode) && p->bdev) { + set_blocksize(p->bdev, p->old_block_size); + blkdev_put(p->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); } destroy_swap_extents(p); - swap_cgroup_swapoff(type); -bad_swap_2: + swap_cgroup_swapoff(p->type); spin_lock(&swap_lock); p->swap_file = NULL; p->flags = 0; spin_unlock(&swap_lock); vfree(swap_map); - if (swap_file) + if (swap_file) { + if (inode && S_ISREG(inode->i_mode)) { + mutex_unlock(&inode->i_mutex); + inode = NULL; + } filp_close(swap_file, NULL); + } out: if (page && !IS_ERR(page)) { kunmap(page); @@ -2155,11 +2164,8 @@ out: } if (name) putname(name); - if (did_down) { - if (!error) - inode->i_flags |= S_SWAPFILE; + if (inode && S_ISREG(inode->i_mode)) mutex_unlock(&inode->i_mutex); - } return error; } diff --git a/mm/truncate.c b/mm/truncate.c index ba887bff48c5..a95667529135 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -106,9 +106,8 @@ truncate_complete_page(struct address_space *mapping, struct page *page) cancel_dirty_page(page, PAGE_CACHE_SIZE); clear_page_mlock(page); - remove_from_page_cache(page); ClearPageMappedToDisk(page); - page_cache_release(page); /* pagecache ref */ + delete_from_page_cache(page); return 0; } @@ -225,6 +224,7 @@ void truncate_inode_pages_range(struct address_space *mapping, next = start; while (next <= end && pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { + 
mem_cgroup_uncharge_start(); for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; pgoff_t page_index = page->index; @@ -247,6 +247,7 @@ void truncate_inode_pages_range(struct address_space *mapping, unlock_page(page); } pagevec_release(&pvec); + mem_cgroup_uncharge_end(); cond_resched(); } @@ -320,11 +321,12 @@ EXPORT_SYMBOL(truncate_inode_pages); * pagetables. */ unsigned long invalidate_mapping_pages(struct address_space *mapping, - pgoff_t start, pgoff_t end) + pgoff_t start, pgoff_t end) { struct pagevec pvec; pgoff_t next = start; - unsigned long ret = 0; + unsigned long ret; + unsigned long count = 0; int i; pagevec_init(&pvec, 0); @@ -351,9 +353,15 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, if (lock_failed) continue; - ret += invalidate_inode_page(page); - + ret = invalidate_inode_page(page); unlock_page(page); + /* + * Invalidation is a hint that the page is no longer + * of interest and try to speed up its reclaim. + */ + if (!ret) + deactivate_page(page); + count += ret; if (next > end) break; } @@ -361,7 +369,7 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, mem_cgroup_uncharge_end(); cond_resched(); } - return ret; + return count; } EXPORT_SYMBOL(invalidate_mapping_pages); @@ -387,9 +395,13 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page) clear_page_mlock(page); BUG_ON(page_has_private(page)); - __remove_from_page_cache(page); + __delete_from_page_cache(page); spin_unlock_irq(&mapping->tree_lock); mem_cgroup_uncharge_cache_page(page); + + if (mapping->a_ops->freepage) + mapping->a_ops->freepage(page); + page_cache_release(page); /* pagecache ref */ return 1; failed: @@ -545,13 +557,12 @@ EXPORT_SYMBOL(truncate_pagecache); * @inode: inode * @newsize: new file size * - * truncate_setsize updastes i_size update and performs pagecache - * truncation (if necessary) for a file size updates. 
It will be - * typically be called from the filesystem's setattr function when - * ATTR_SIZE is passed in. + * truncate_setsize updates i_size and performs pagecache truncation (if + * necessary) to @newsize. It will be typically be called from the filesystem's + * setattr function when ATTR_SIZE is passed in. * - * Must be called with inode_mutex held and after all filesystem - * specific block truncation has been performed. + * Must be called with inode_mutex held and before all filesystem specific + * block truncation has been performed. */ void truncate_setsize(struct inode *inode, loff_t newsize) { diff --git a/mm/util.c b/mm/util.c index 73dac81e9f78..e7b103a6fd21 100644 --- a/mm/util.c +++ b/mm/util.c @@ -186,27 +186,6 @@ void kzfree(const void *p) } EXPORT_SYMBOL(kzfree); -int kern_ptr_validate(const void *ptr, unsigned long size) -{ - unsigned long addr = (unsigned long)ptr; - unsigned long min_addr = PAGE_OFFSET; - unsigned long align_mask = sizeof(void *) - 1; - - if (unlikely(addr < min_addr)) - goto out; - if (unlikely(addr > (unsigned long)high_memory - size)) - goto out; - if (unlikely(addr & align_mask)) - goto out; - if (unlikely(!kern_addr_valid(addr))) - goto out; - if (unlikely(!kern_addr_valid(addr + size - 1))) - goto out; - return 1; -out: - return 0; -} - /* * strndup_user - duplicate an existing string from user space * @s: The string to duplicate @@ -248,7 +227,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm) /* * Like get_user_pages_fast() except its IRQ-safe in that it won't fall * back to the regular GUP. 
- * If the architecture not support this fucntion, simply return with no + * If the architecture not support this function, simply return with no * page pinned */ int __attribute__((weak)) __get_user_pages_fast(unsigned long start, diff --git a/mm/vmalloc.c b/mm/vmalloc.c index eb5cc7d00c5a..5d6030235d7a 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -261,8 +261,15 @@ struct vmap_area { }; static DEFINE_SPINLOCK(vmap_area_lock); -static struct rb_root vmap_area_root = RB_ROOT; static LIST_HEAD(vmap_area_list); +static struct rb_root vmap_area_root = RB_ROOT; + +/* The vmap cache globals are protected by vmap_area_lock */ +static struct rb_node *free_vmap_cache; +static unsigned long cached_hole_size; +static unsigned long cached_vstart; +static unsigned long cached_align; + static unsigned long vmap_area_pcpu_hole; static struct vmap_area *__find_vmap_area(unsigned long addr) @@ -331,9 +338,11 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, struct rb_node *n; unsigned long addr; int purged = 0; + struct vmap_area *first; BUG_ON(!size); BUG_ON(size & ~PAGE_MASK); + BUG_ON(!is_power_of_2(align)); va = kmalloc_node(sizeof(struct vmap_area), gfp_mask & GFP_RECLAIM_MASK, node); @@ -341,79 +350,106 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, return ERR_PTR(-ENOMEM); retry: - addr = ALIGN(vstart, align); - spin_lock(&vmap_area_lock); - if (addr + size - 1 < addr) - goto overflow; + /* + * Invalidate cache if we have more permissive parameters. + * cached_hole_size notes the largest hole noticed _below_ + * the vmap_area cached in free_vmap_cache: if size fits + * into that hole, we want to scan from vstart to reuse + * the hole instead of allocating above free_vmap_cache. + * Note that __free_vmap_area may update free_vmap_cache + * without updating cached_hole_size or cached_align. 
+ */ + if (!free_vmap_cache || + size < cached_hole_size || + vstart < cached_vstart || + align < cached_align) { +nocache: + cached_hole_size = 0; + free_vmap_cache = NULL; + } + /* record if we encounter less permissive parameters */ + cached_vstart = vstart; + cached_align = align; + + /* find starting point for our search */ + if (free_vmap_cache) { + first = rb_entry(free_vmap_cache, struct vmap_area, rb_node); + addr = ALIGN(first->va_end + PAGE_SIZE, align); + if (addr < vstart) + goto nocache; + if (addr + size - 1 < addr) + goto overflow; + + } else { + addr = ALIGN(vstart, align); + if (addr + size - 1 < addr) + goto overflow; - /* XXX: could have a last_hole cache */ - n = vmap_area_root.rb_node; - if (n) { - struct vmap_area *first = NULL; + n = vmap_area_root.rb_node; + first = NULL; - do { + while (n) { struct vmap_area *tmp; tmp = rb_entry(n, struct vmap_area, rb_node); if (tmp->va_end >= addr) { - if (!first && tmp->va_start < addr + size) - first = tmp; - n = n->rb_left; - } else { first = tmp; + if (tmp->va_start <= addr) + break; + n = n->rb_left; + } else n = n->rb_right; - } - } while (n); + } if (!first) goto found; - - if (first->va_end < addr) { - n = rb_next(&first->rb_node); - if (n) - first = rb_entry(n, struct vmap_area, rb_node); - else - goto found; - } - - while (addr + size > first->va_start && addr + size <= vend) { - addr = ALIGN(first->va_end + PAGE_SIZE, align); - if (addr + size - 1 < addr) - goto overflow; - - n = rb_next(&first->rb_node); - if (n) - first = rb_entry(n, struct vmap_area, rb_node); - else - goto found; - } } -found: - if (addr + size > vend) { -overflow: - spin_unlock(&vmap_area_lock); - if (!purged) { - purge_vmap_area_lazy(); - purged = 1; - goto retry; - } - if (printk_ratelimit()) - printk(KERN_WARNING - "vmap allocation for size %lu failed: " - "use vmalloc=<size> to increase size.\n", size); - kfree(va); - return ERR_PTR(-EBUSY); + + /* from the starting point, walk areas until a suitable hole is found */ 
+ while (addr + size >= first->va_start && addr + size <= vend) { + if (addr + cached_hole_size < first->va_start) + cached_hole_size = first->va_start - addr; + addr = ALIGN(first->va_end + PAGE_SIZE, align); + if (addr + size - 1 < addr) + goto overflow; + + n = rb_next(&first->rb_node); + if (n) + first = rb_entry(n, struct vmap_area, rb_node); + else + goto found; } - BUG_ON(addr & (align-1)); +found: + if (addr + size > vend) + goto overflow; va->va_start = addr; va->va_end = addr + size; va->flags = 0; __insert_vmap_area(va); + free_vmap_cache = &va->rb_node; spin_unlock(&vmap_area_lock); + BUG_ON(va->va_start & (align-1)); + BUG_ON(va->va_start < vstart); + BUG_ON(va->va_end > vend); + return va; + +overflow: + spin_unlock(&vmap_area_lock); + if (!purged) { + purge_vmap_area_lazy(); + purged = 1; + goto retry; + } + if (printk_ratelimit()) + printk(KERN_WARNING + "vmap allocation for size %lu failed: " + "use vmalloc=<size> to increase size.\n", size); + kfree(va); + return ERR_PTR(-EBUSY); } static void rcu_free_va(struct rcu_head *head) @@ -426,6 +462,22 @@ static void rcu_free_va(struct rcu_head *head) static void __free_vmap_area(struct vmap_area *va) { BUG_ON(RB_EMPTY_NODE(&va->rb_node)); + + if (free_vmap_cache) { + if (va->va_end < cached_vstart) { + free_vmap_cache = NULL; + } else { + struct vmap_area *cache; + cache = rb_entry(free_vmap_cache, struct vmap_area, rb_node); + if (va->va_start <= cache->va_start) { + free_vmap_cache = rb_prev(&va->rb_node); + /* + * We don't try to update cached_hole_size or + * cached_align, but it won't go very wrong. 
+ */ + } + } + } rb_erase(&va->rb_node, &vmap_area_root); RB_CLEAR_NODE(&va->rb_node); list_del_rcu(&va->list); @@ -748,7 +800,7 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask) va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE, VMALLOC_START, VMALLOC_END, node, gfp_mask); - if (unlikely(IS_ERR(va))) { + if (IS_ERR(va)) { kfree(vb); return ERR_CAST(va); } @@ -1175,6 +1227,7 @@ void unmap_kernel_range_noflush(unsigned long addr, unsigned long size) { vunmap_page_range(addr, addr + size); } +EXPORT_SYMBOL_GPL(unmap_kernel_range_noflush); /** * unmap_kernel_range - unmap kernel VM area and flush cache and TLB @@ -1315,13 +1368,6 @@ struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags, -1, GFP_KERNEL, caller); } -struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags, - int node, gfp_t gfp_mask) -{ - return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END, - node, gfp_mask, __builtin_return_address(0)); -} - static struct vm_struct *find_vm_area(const void *addr) { struct vmap_area *va; @@ -1537,25 +1583,12 @@ fail: return NULL; } -void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot) -{ - void *addr = __vmalloc_area_node(area, gfp_mask, prot, -1, - __builtin_return_address(0)); - - /* - * A ref_count = 3 is needed because the vm_struct and vmap_area - * structures allocated in the __get_vm_area_node() function contain - * references to the virtual address of the vmalloc'ed block. 
- */ - kmemleak_alloc(addr, area->size - PAGE_SIZE, 3, gfp_mask); - - return addr; -} - /** - * __vmalloc_node - allocate virtually contiguous memory + * __vmalloc_node_range - allocate virtually contiguous memory * @size: allocation size * @align: desired alignment + * @start: vm area range start + * @end: vm area range end * @gfp_mask: flags for the page level allocator * @prot: protection mask for the allocated pages * @node: node to use for allocation or -1 @@ -1565,9 +1598,9 @@ void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot) * allocator with @gfp_mask flags. Map them into contiguous * kernel virtual space, using a pagetable protection of @prot. */ -static void *__vmalloc_node(unsigned long size, unsigned long align, - gfp_t gfp_mask, pgprot_t prot, - int node, void *caller) +void *__vmalloc_node_range(unsigned long size, unsigned long align, + unsigned long start, unsigned long end, gfp_t gfp_mask, + pgprot_t prot, int node, void *caller) { struct vm_struct *area; void *addr; @@ -1577,8 +1610,8 @@ static void *__vmalloc_node(unsigned long size, unsigned long align, if (!size || (size >> PAGE_SHIFT) > totalram_pages) return NULL; - area = __get_vm_area_node(size, align, VM_ALLOC, VMALLOC_START, - VMALLOC_END, node, gfp_mask, caller); + area = __get_vm_area_node(size, align, VM_ALLOC, start, end, node, + gfp_mask, caller); if (!area) return NULL; @@ -1595,6 +1628,27 @@ static void *__vmalloc_node(unsigned long size, unsigned long align, return addr; } +/** + * __vmalloc_node - allocate virtually contiguous memory + * @size: allocation size + * @align: desired alignment + * @gfp_mask: flags for the page level allocator + * @prot: protection mask for the allocated pages + * @node: node to use for allocation or -1 + * @caller: caller's return address + * + * Allocate enough pages to cover @size from the page level + * allocator with @gfp_mask flags. Map them into contiguous + * kernel virtual space, using a pagetable protection of @prot. 
+ */ +static void *__vmalloc_node(unsigned long size, unsigned long align, + gfp_t gfp_mask, pgprot_t prot, + int node, void *caller) +{ + return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END, + gfp_mask, prot, node, caller); +} + void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) { return __vmalloc_node(size, 1, gfp_mask, prot, -1, @@ -1949,8 +2003,6 @@ finished: * should know vmalloc() area is valid and can use memcpy(). * This is for routines which have to access vmalloc area without * any informaion, as /dev/kmem. - * - * The caller should guarantee KM_USER1 is not used. */ long vwrite(char *buf, char *addr, unsigned long count) @@ -2203,17 +2255,16 @@ static unsigned long pvm_determine_end(struct vmap_area **pnext, * @sizes: array containing size of each area * @nr_vms: the number of areas to allocate * @align: alignment, all entries in @offsets and @sizes must be aligned to this - * @gfp_mask: allocation mask * * Returns: kmalloc'd vm_struct pointer array pointing to allocated * vm_structs on success, %NULL on failure * * Percpu allocator wants to use congruent vm areas so that it can * maintain the offsets among percpu areas. This function allocates - * congruent vmalloc areas for it. These areas tend to be scattered - * pretty far, distance between two areas easily going up to - * gigabytes. To avoid interacting with regular vmallocs, these areas - * are allocated from top. + * congruent vmalloc areas for it with GFP_KERNEL. These areas tend to + * be scattered pretty far, distance between two areas easily going up + * to gigabytes. To avoid interacting with regular vmallocs, these + * areas are allocated from top. * * Despite its complicated look, this allocator is rather simple. 
It * does everything top-down and scans areas from the end looking for @@ -2224,7 +2275,7 @@ static unsigned long pvm_determine_end(struct vmap_area **pnext, */ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, const size_t *sizes, int nr_vms, - size_t align, gfp_t gfp_mask) + size_t align) { const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align); const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1); @@ -2234,8 +2285,6 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, unsigned long base, start, end, last_end; bool purged = false; - gfp_mask &= GFP_RECLAIM_MASK; - /* verify parameters and allocate data structures */ BUG_ON(align & ~PAGE_MASK || !is_power_of_2(align)); for (last_area = 0, area = 0; area < nr_vms; area++) { @@ -2268,14 +2317,14 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, return NULL; } - vms = kzalloc(sizeof(vms[0]) * nr_vms, gfp_mask); - vas = kzalloc(sizeof(vas[0]) * nr_vms, gfp_mask); + vms = kzalloc(sizeof(vms[0]) * nr_vms, GFP_KERNEL); + vas = kzalloc(sizeof(vas[0]) * nr_vms, GFP_KERNEL); if (!vas || !vms) goto err_free; for (area = 0; area < nr_vms; area++) { - vas[area] = kzalloc(sizeof(struct vmap_area), gfp_mask); - vms[area] = kzalloc(sizeof(struct vm_struct), gfp_mask); + vas[area] = kzalloc(sizeof(struct vmap_area), GFP_KERNEL); + vms[area] = kzalloc(sizeof(struct vm_struct), GFP_KERNEL); if (!vas[area] || !vms[area]) goto err_free; } @@ -2456,13 +2505,8 @@ static int s_show(struct seq_file *m, void *p) seq_printf(m, "0x%p-0x%p %7ld", v->addr, v->addr + v->size, v->size); - if (v->caller) { - char buff[KSYM_SYMBOL_LEN]; - - seq_putc(m, ' '); - sprint_symbol(buff, (unsigned long)v->caller); - seq_puts(m, buff); - } + if (v->caller) + seq_printf(m, " %pS", v->caller); if (v->nr_pages) seq_printf(m, " pages=%d", v->nr_pages); diff --git a/mm/vmscan.c b/mm/vmscan.c index d31d7ce52c0e..f6b435c80079 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -32,6 +32,7 @@ 
#include <linux/topology.h> #include <linux/cpu.h> #include <linux/cpuset.h> +#include <linux/compaction.h> #include <linux/notifier.h> #include <linux/rwsem.h> #include <linux/delay.h> @@ -40,6 +41,7 @@ #include <linux/memcontrol.h> #include <linux/delayacct.h> #include <linux/sysctl.h> +#include <linux/oom.h> #include <asm/tlbflush.h> #include <asm/div64.h> @@ -51,11 +53,23 @@ #define CREATE_TRACE_POINTS #include <trace/events/vmscan.h> -enum lumpy_mode { - LUMPY_MODE_NONE, - LUMPY_MODE_ASYNC, - LUMPY_MODE_SYNC, -}; +/* + * reclaim_mode determines how the inactive list is shrunk + * RECLAIM_MODE_SINGLE: Reclaim only order-0 pages + * RECLAIM_MODE_ASYNC: Do not block + * RECLAIM_MODE_SYNC: Allow blocking e.g. call wait_on_page_writeback + * RECLAIM_MODE_LUMPYRECLAIM: For high-order allocations, take a reference + * page from the LRU and reclaim all pages within a + * naturally aligned range + * RECLAIM_MODE_COMPACTION: For high-order allocations, reclaim a number of + * order-0 pages and then compact the zone + */ +typedef unsigned __bitwise__ reclaim_mode_t; +#define RECLAIM_MODE_SINGLE ((__force reclaim_mode_t)0x01u) +#define RECLAIM_MODE_ASYNC ((__force reclaim_mode_t)0x02u) +#define RECLAIM_MODE_SYNC ((__force reclaim_mode_t)0x04u) +#define RECLAIM_MODE_LUMPYRECLAIM ((__force reclaim_mode_t)0x08u) +#define RECLAIM_MODE_COMPACTION ((__force reclaim_mode_t)0x10u) struct scan_control { /* Incremented by the number of inactive pages that were scanned */ @@ -88,7 +102,7 @@ struct scan_control { * Intend to reclaim enough continuous memory rather than reclaim * enough amount of memory. i.e, mode for high order allocation. 
*/ - enum lumpy_mode lumpy_reclaim_mode; + reclaim_mode_t reclaim_mode; /* Which cgroup do we reclaim from */ struct mem_cgroup *mem_cgroup; @@ -271,34 +285,37 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask, return ret; } -static void set_lumpy_reclaim_mode(int priority, struct scan_control *sc, +static void set_reclaim_mode(int priority, struct scan_control *sc, bool sync) { - enum lumpy_mode mode = sync ? LUMPY_MODE_SYNC : LUMPY_MODE_ASYNC; + reclaim_mode_t syncmode = sync ? RECLAIM_MODE_SYNC : RECLAIM_MODE_ASYNC; /* - * Some reclaim have alredy been failed. No worth to try synchronous - * lumpy reclaim. + * Initially assume we are entering either lumpy reclaim or + * reclaim/compaction.Depending on the order, we will either set the + * sync mode or just reclaim order-0 pages later. */ - if (sync && sc->lumpy_reclaim_mode == LUMPY_MODE_NONE) - return; + if (COMPACTION_BUILD) + sc->reclaim_mode = RECLAIM_MODE_COMPACTION; + else + sc->reclaim_mode = RECLAIM_MODE_LUMPYRECLAIM; /* - * If we need a large contiguous chunk of memory, or have - * trouble getting a small set of contiguous pages, we - * will reclaim both active and inactive pages. 
+ * Avoid using lumpy reclaim or reclaim/compaction if possible by + * restricting when its set to either costly allocations or when + * under memory pressure */ if (sc->order > PAGE_ALLOC_COSTLY_ORDER) - sc->lumpy_reclaim_mode = mode; + sc->reclaim_mode |= syncmode; else if (sc->order && priority < DEF_PRIORITY - 2) - sc->lumpy_reclaim_mode = mode; + sc->reclaim_mode |= syncmode; else - sc->lumpy_reclaim_mode = LUMPY_MODE_NONE; + sc->reclaim_mode = RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC; } -static void disable_lumpy_reclaim_mode(struct scan_control *sc) +static void reset_reclaim_mode(struct scan_control *sc) { - sc->lumpy_reclaim_mode = LUMPY_MODE_NONE; + sc->reclaim_mode = RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC; } static inline int is_page_cache_freeable(struct page *page) @@ -342,7 +359,7 @@ static int may_write_to_queue(struct backing_dev_info *bdi, static void handle_write_error(struct address_space *mapping, struct page *page, int error) { - lock_page_nosync(page); + lock_page(page); if (page_mapping(page) == mapping) mapping_set_error(mapping, error); unlock_page(page); @@ -429,7 +446,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, * first attempt to free a range of pages fails. 
*/ if (PageWriteback(page) && - sc->lumpy_reclaim_mode == LUMPY_MODE_SYNC) + (sc->reclaim_mode & RECLAIM_MODE_SYNC)) wait_on_page_writeback(page); if (!PageWriteback(page)) { @@ -437,7 +454,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, ClearPageReclaim(page); } trace_mm_vmscan_writepage(page, - trace_reclaim_flags(page, sc->lumpy_reclaim_mode)); + trace_reclaim_flags(page, sc->reclaim_mode)); inc_zone_page_state(page, NR_VMSCAN_WRITE); return PAGE_SUCCESS; } @@ -494,9 +511,16 @@ static int __remove_mapping(struct address_space *mapping, struct page *page) spin_unlock_irq(&mapping->tree_lock); swapcache_free(swap, page); } else { - __remove_from_page_cache(page); + void (*freepage)(struct page *); + + freepage = mapping->a_ops->freepage; + + __delete_from_page_cache(page); spin_unlock_irq(&mapping->tree_lock); mem_cgroup_uncharge_cache_page(page); + + if (freepage != NULL) + freepage(page); } return 1; @@ -615,7 +639,7 @@ static enum page_references page_check_references(struct page *page, referenced_page = TestClearPageReferenced(page); /* Lumpy reclaim - ignore references */ - if (sc->lumpy_reclaim_mode != LUMPY_MODE_NONE) + if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM) return PAGEREF_RECLAIM; /* @@ -732,7 +756,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, * for any page for which writeback has already * started. 
*/ - if (sc->lumpy_reclaim_mode == LUMPY_MODE_SYNC && + if ((sc->reclaim_mode & RECLAIM_MODE_SYNC) && may_enter_fs) wait_on_page_writeback(page); else { @@ -888,7 +912,7 @@ cull_mlocked: try_to_free_swap(page); unlock_page(page); putback_lru_page(page); - disable_lumpy_reclaim_mode(sc); + reset_reclaim_mode(sc); continue; activate_locked: @@ -901,7 +925,7 @@ activate_locked: keep_locked: unlock_page(page); keep: - disable_lumpy_reclaim_mode(sc); + reset_reclaim_mode(sc); keep_lumpy: list_add(&page->lru, &ret_pages); VM_BUG_ON(PageLRU(page) || PageUnevictable(page)); @@ -1021,7 +1045,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, case 0: list_move(&page->lru, dst); mem_cgroup_del_lru(page); - nr_taken++; + nr_taken += hpage_nr_pages(page); break; case -EBUSY: @@ -1042,7 +1066,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, * surrounding the tag page. Only take those pages of * the same active state as that tag page. We may safely * round the target page pfn down to the requested order - * as the mem_map is guarenteed valid out to MAX_ORDER, + * as the mem_map is guaranteed valid out to MAX_ORDER, * where that page is in a different zone we will detect * it from its zone id and abort this block scan. 
*/ @@ -1079,7 +1103,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, if (__isolate_lru_page(cursor_page, mode, file) == 0) { list_move(&cursor_page->lru, dst); mem_cgroup_del_lru(cursor_page); - nr_taken++; + nr_taken += hpage_nr_pages(page); nr_lumpy_taken++; if (PageDirty(cursor_page)) nr_lumpy_dirty++; @@ -1134,14 +1158,15 @@ static unsigned long clear_active_flags(struct list_head *page_list, struct page *page; list_for_each_entry(page, page_list, lru) { + int numpages = hpage_nr_pages(page); lru = page_lru_base_type(page); if (PageActive(page)) { lru += LRU_ACTIVE; ClearPageActive(page); - nr_active++; + nr_active += numpages; } if (count) - count[lru]++; + count[lru] += numpages; } return nr_active; @@ -1251,7 +1276,8 @@ putback_lru_pages(struct zone *zone, struct scan_control *sc, add_page_to_lru_list(zone, page, lru); if (is_active_lru(lru)) { int file = is_file_lru(lru); - reclaim_stat->recent_rotated[file]++; + int numpages = hpage_nr_pages(page); + reclaim_stat->recent_rotated[file] += numpages; } if (!pagevec_add(&pvec, page)) { spin_unlock_irq(&zone->lru_lock); @@ -1317,7 +1343,7 @@ static inline bool should_reclaim_stall(unsigned long nr_taken, return false; /* Only stall on lumpy reclaim */ - if (sc->lumpy_reclaim_mode == LUMPY_MODE_NONE) + if (sc->reclaim_mode & RECLAIM_MODE_SINGLE) return false; /* If we have relaimed everything on the isolated list, no stall */ @@ -1361,15 +1387,15 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, return SWAP_CLUSTER_MAX; } - set_lumpy_reclaim_mode(priority, sc, false); + set_reclaim_mode(priority, sc, false); lru_add_drain(); spin_lock_irq(&zone->lru_lock); if (scanning_global_lru(sc)) { nr_taken = isolate_pages_global(nr_to_scan, &page_list, &nr_scanned, sc->order, - sc->lumpy_reclaim_mode == LUMPY_MODE_NONE ? - ISOLATE_INACTIVE : ISOLATE_BOTH, + sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ? 
+ ISOLATE_BOTH : ISOLATE_INACTIVE, zone, 0, file); zone->pages_scanned += nr_scanned; if (current_is_kswapd()) @@ -1381,8 +1407,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, } else { nr_taken = mem_cgroup_isolate_pages(nr_to_scan, &page_list, &nr_scanned, sc->order, - sc->lumpy_reclaim_mode == LUMPY_MODE_NONE ? - ISOLATE_INACTIVE : ISOLATE_BOTH, + sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ? + ISOLATE_BOTH : ISOLATE_INACTIVE, zone, sc->mem_cgroup, 0, file); /* @@ -1404,7 +1430,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, /* Check if we should syncronously wait for writeback */ if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) { - set_lumpy_reclaim_mode(priority, sc, true); + set_reclaim_mode(priority, sc, true); nr_reclaimed += shrink_page_list(&page_list, zone, sc); } @@ -1419,7 +1445,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, zone_idx(zone), nr_scanned, nr_reclaimed, priority, - trace_shrink_flags(file, sc->lumpy_reclaim_mode)); + trace_shrink_flags(file, sc->reclaim_mode)); return nr_reclaimed; } @@ -1459,7 +1485,7 @@ static void move_active_pages_to_lru(struct zone *zone, list_move(&page->lru, &zone->lru[lru].list); mem_cgroup_add_lru_list(page, lru); - pgmoved++; + pgmoved += hpage_nr_pages(page); if (!pagevec_add(&pvec, page) || list_empty(list)) { spin_unlock_irq(&zone->lru_lock); @@ -1527,7 +1553,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, } if (page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) { - nr_rotated++; + nr_rotated += hpage_nr_pages(page); /* * Identify referenced, file-backed active pages and * give them one more trip around the active list. So @@ -1798,6 +1824,69 @@ out: } /* + * Reclaim/compaction depends on a number of pages being freed. To avoid + * disruption to the system, a small number of order-0 pages continue to be + * rotated and reclaimed in the normal fashion. 
However, by the time we get + * back to the allocator and call try_to_compact_zone(), we ensure that + * there are enough free pages for it to be likely successful + */ +static inline bool should_continue_reclaim(struct zone *zone, + unsigned long nr_reclaimed, + unsigned long nr_scanned, + struct scan_control *sc) +{ + unsigned long pages_for_compaction; + unsigned long inactive_lru_pages; + + /* If not in reclaim/compaction mode, stop */ + if (!(sc->reclaim_mode & RECLAIM_MODE_COMPACTION)) + return false; + + /* Consider stopping depending on scan and reclaim activity */ + if (sc->gfp_mask & __GFP_REPEAT) { + /* + * For __GFP_REPEAT allocations, stop reclaiming if the + * full LRU list has been scanned and we are still failing + * to reclaim pages. This full LRU scan is potentially + * expensive but a __GFP_REPEAT caller really wants to succeed + */ + if (!nr_reclaimed && !nr_scanned) + return false; + } else { + /* + * For non-__GFP_REPEAT allocations which can presumably + * fail without consequence, stop if we failed to reclaim + * any pages from the last SWAP_CLUSTER_MAX number of + * pages that were scanned. This will return to the + * caller faster at the risk reclaim/compaction and + * the resulting allocation attempt fails + */ + if (!nr_reclaimed) + return false; + } + + /* + * If we have not reclaimed enough pages for compaction and the + * inactive lists are large enough, continue reclaiming + */ + pages_for_compaction = (2UL << sc->order); + inactive_lru_pages = zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON) + + zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE); + if (sc->nr_reclaimed < pages_for_compaction && + inactive_lru_pages > pages_for_compaction) + return true; + + /* If compaction would go ahead or the allocation would succeed, stop */ + switch (compaction_suitable(zone, sc->order)) { + case COMPACT_PARTIAL: + case COMPACT_CONTINUE: + return false; + default: + return true; + } +} + +/* * This is a basic per-zone page freer. 
Used by both kswapd and direct reclaim. */ static void shrink_zone(int priority, struct zone *zone, @@ -1806,9 +1895,12 @@ static void shrink_zone(int priority, struct zone *zone, unsigned long nr[NR_LRU_LISTS]; unsigned long nr_to_scan; enum lru_list l; - unsigned long nr_reclaimed = sc->nr_reclaimed; + unsigned long nr_reclaimed, nr_scanned; unsigned long nr_to_reclaim = sc->nr_to_reclaim; +restart: + nr_reclaimed = 0; + nr_scanned = sc->nr_scanned; get_scan_count(zone, sc, nr, priority); while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || @@ -1834,8 +1926,7 @@ static void shrink_zone(int priority, struct zone *zone, if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY) break; } - - sc->nr_reclaimed = nr_reclaimed; + sc->nr_reclaimed += nr_reclaimed; /* * Even if we did not try to evict anon pages at all, we want to @@ -1844,6 +1935,11 @@ static void shrink_zone(int priority, struct zone *zone, if (inactive_anon_is_low(zone, sc)) shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0); + /* reclaim/compaction might need reclaim to continue */ + if (should_continue_reclaim(zone, nr_reclaimed, + sc->nr_scanned - nr_scanned, sc)) + goto restart; + throttle_vm_writeout(sc->gfp_mask); } @@ -1893,17 +1989,12 @@ static bool zone_reclaimable(struct zone *zone) return zone->pages_scanned < zone_reclaimable_pages(zone) * 6; } -/* - * As hibernation is going on, kswapd is freezed so that it can't mark - * the zone into all_unreclaimable. It can't handle OOM during hibernation. - * So let's check zone's unreclaimable in direct reclaim as well as kswapd. - */ +/* All zones in zonelist are unreclaimable? 
*/ static bool all_unreclaimable(struct zonelist *zonelist, struct scan_control *sc) { struct zoneref *z; struct zone *zone; - bool all_unreclaimable = true; for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(sc->gfp_mask), sc->nodemask) { @@ -1911,13 +2002,11 @@ static bool all_unreclaimable(struct zonelist *zonelist, continue; if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) continue; - if (zone_reclaimable(zone)) { - all_unreclaimable = false; - break; - } + if (!zone->all_unreclaimable) + return false; } - return all_unreclaimable; + return true; } /* @@ -2000,7 +2089,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, struct zone *preferred_zone; first_zones_zonelist(zonelist, gfp_zone(sc->gfp_mask), - NULL, &preferred_zone); + &cpuset_current_mems_allowed, + &preferred_zone); wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/10); } } @@ -2012,6 +2102,14 @@ out: if (sc->nr_reclaimed) return sc->nr_reclaimed; + /* + * As hibernation is going on, kswapd is freezed so that it can't mark + * the zone into all_unreclaimable. Thus bypassing all_unreclaimable + * check. + */ + if (oom_killer_disabled) + return 0; + /* top priority shrink_zones still had more to do? don't OOM, then */ if (scanning_global_lru(sc) && !all_unreclaimable(zonelist, sc)) return 1; @@ -2117,38 +2215,87 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, } #endif +/* + * pgdat_balanced is used when checking if a node is balanced for high-order + * allocations. Only zones that meet watermarks and are in a zone allowed + * by the callers classzone_idx are added to balanced_pages. The total of + * balanced pages must be at least 25% of the zones allowed by classzone_idx + * for the node to be considered balanced. Forcing all zones to be balanced + * for high orders can cause excessive reclaim when there are imbalanced zones. 
+ * The choice of 25% is due to + * o a 16M DMA zone that is balanced will not balance a zone on any + * reasonable sized machine + * o On all other machines, the top zone must be at least a reasonable + * percentage of the middle zones. For example, on 32-bit x86, highmem + * would need to be at least 256M for it to be balance a whole node. + * Similarly, on x86-64 the Normal zone would need to be at least 1G + * to balance a node on its own. These seemed like reasonable ratios. + */ +static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages, + int classzone_idx) +{ + unsigned long present_pages = 0; + int i; + + for (i = 0; i <= classzone_idx; i++) + present_pages += pgdat->node_zones[i].present_pages; + + return balanced_pages > (present_pages >> 2); +} + /* is kswapd sleeping prematurely? */ -static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining) +static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining, + int classzone_idx) { int i; + unsigned long balanced = 0; + bool all_zones_ok = true; /* If a direct reclaimer woke kswapd within HZ/10, it's premature */ if (remaining) - return 1; + return true; - /* If after HZ/10, a zone is below the high mark, it's premature */ + /* Check the watermark levels */ for (i = 0; i < pgdat->nr_zones; i++) { struct zone *zone = pgdat->node_zones + i; if (!populated_zone(zone)) continue; - if (zone->all_unreclaimable) + /* + * balance_pgdat() skips over all_unreclaimable after + * DEF_PRIORITY. 
Effectively, it considers them balanced so + * they must be considered balanced here as well if kswapd + * is to sleep + */ + if (zone->all_unreclaimable) { + balanced += zone->present_pages; continue; + } - if (!zone_watermark_ok(zone, order, high_wmark_pages(zone), - 0, 0)) - return 1; + if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone), + classzone_idx, 0)) + all_zones_ok = false; + else + balanced += zone->present_pages; } - return 0; + /* + * For high-order requests, the balanced zones must contain at least + * 25% of the nodes pages for kswapd to sleep. For order-0, all zones + * must be balanced + */ + if (order) + return pgdat_balanced(pgdat, balanced, classzone_idx); + else + return !all_zones_ok; } /* * For kswapd, balance_pgdat() will work across all this node's zones until * they are all at high_wmark_pages(zone). * - * Returns the number of pages which were actually freed. + * Returns the final order kswapd was reclaiming at * * There is special handling here for zones which are full of pinned pages. * This can happen if the pages are all mlocked, or if they are all used by @@ -2165,11 +2312,14 @@ static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining) * interoperates with the page allocator fallback scheme to ensure that aging * of pages is balanced across the zones. */ -static unsigned long balance_pgdat(pg_data_t *pgdat, int order) +static unsigned long balance_pgdat(pg_data_t *pgdat, int order, + int *classzone_idx) { int all_zones_ok; + unsigned long balanced; int priority; int i; + int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ unsigned long total_scanned; struct reclaim_state *reclaim_state = current->reclaim_state; struct scan_control sc = { @@ -2192,7 +2342,6 @@ loop_again: count_vm_event(PAGEOUTRUN); for (priority = DEF_PRIORITY; priority >= 0; priority--) { - int end_zone = 0; /* Inclusive. 
0 = ZONE_DMA */ unsigned long lru_pages = 0; int has_under_min_watermark_zone = 0; @@ -2201,6 +2350,7 @@ loop_again: disable_swap_token(); all_zones_ok = 1; + balanced = 0; /* * Scan in the highmem->dma direction for the highest @@ -2223,9 +2373,10 @@ loop_again: shrink_active_list(SWAP_CLUSTER_MAX, zone, &sc, priority, 0); - if (!zone_watermark_ok(zone, order, + if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone), 0, 0)) { end_zone = i; + *classzone_idx = i; break; } } @@ -2250,6 +2401,7 @@ loop_again: for (i = 0; i <= end_zone; i++) { struct zone *zone = pgdat->node_zones + i; int nr_slab; + unsigned long balance_gap; if (!populated_zone(zone)) continue; @@ -2266,20 +2418,31 @@ loop_again: mem_cgroup_soft_limit_reclaim(zone, order, sc.gfp_mask); /* - * We put equal pressure on every zone, unless one - * zone has way too many pages free already. + * We put equal pressure on every zone, unless + * one zone has way too many pages free + * already. The "too many pages" is defined + * as the high wmark plus a "gap" where the + * gap is either the low watermark or 1% + * of the zone, whichever is smaller. 
*/ - if (!zone_watermark_ok(zone, order, - 8*high_wmark_pages(zone), end_zone, 0)) + balance_gap = min(low_wmark_pages(zone), + (zone->present_pages + + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) / + KSWAPD_ZONE_BALANCE_GAP_RATIO); + if (!zone_watermark_ok_safe(zone, order, + high_wmark_pages(zone) + balance_gap, + end_zone, 0)) shrink_zone(priority, zone, &sc); reclaim_state->reclaimed_slab = 0; nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, lru_pages); sc.nr_reclaimed += reclaim_state->reclaimed_slab; total_scanned += sc.nr_scanned; + if (zone->all_unreclaimable) continue; - if (nr_slab == 0 && !zone_reclaimable(zone)) + if (nr_slab == 0 && + !zone_reclaimable(zone)) zone->all_unreclaimable = 1; /* * If we've done a decent amount of scanning and @@ -2290,7 +2453,7 @@ loop_again: total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2) sc.may_writepage = 1; - if (!zone_watermark_ok(zone, order, + if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone), end_zone, 0)) { all_zones_ok = 0; /* @@ -2298,7 +2461,7 @@ loop_again: * means that we have a GFP_ATOMIC allocation * failure risk. Hurry up! */ - if (!zone_watermark_ok(zone, order, + if (!zone_watermark_ok_safe(zone, order, min_wmark_pages(zone), end_zone, 0)) has_under_min_watermark_zone = 1; } else { @@ -2310,10 +2473,12 @@ loop_again: * spectulatively avoid congestion waits */ zone_clear_flag(zone, ZONE_CONGESTED); + if (i <= *classzone_idx) + balanced += zone->present_pages; } } - if (all_zones_ok) + if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx))) break; /* kswapd: all done */ /* * OK, kswapd is getting into trouble. 
Take a nap, then take @@ -2336,7 +2501,13 @@ loop_again: break; } out: - if (!all_zones_ok) { + + /* + * order-0: All zones must meet high watermark for a balanced node + * high-order: Balanced zones must make up at least 25% of the node + * for the node to be balanced + */ + if (!(all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))) { cond_resched(); try_to_freeze(); @@ -2361,7 +2532,88 @@ out: goto loop_again; } - return sc.nr_reclaimed; + /* + * If kswapd was reclaiming at a higher order, it has the option of + * sleeping without all zones being balanced. Before it does, it must + * ensure that the watermarks for order-0 on *all* zones are met and + * that the congestion flags are cleared. The congestion flag must + * be cleared as kswapd is the only mechanism that clears the flag + * and it is potentially going to sleep here. + */ + if (order) { + for (i = 0; i <= end_zone; i++) { + struct zone *zone = pgdat->node_zones + i; + + if (!populated_zone(zone)) + continue; + + if (zone->all_unreclaimable && priority != DEF_PRIORITY) + continue; + + /* Confirm the zone is balanced for order-0 */ + if (!zone_watermark_ok(zone, 0, + high_wmark_pages(zone), 0, 0)) { + order = sc.order = 0; + goto loop_again; + } + + /* If balanced, clear the congested flag */ + zone_clear_flag(zone, ZONE_CONGESTED); + } + } + + /* + * Return the order we were reclaiming at so sleeping_prematurely() + * makes a decision on the order we were last reclaiming at. 
However, + * if another caller entered the allocator slow path while kswapd + * was awake, order will remain at the higher level + */ + *classzone_idx = end_zone; + return order; +} + +static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx) +{ + long remaining = 0; + DEFINE_WAIT(wait); + + if (freezing(current) || kthread_should_stop()) + return; + + prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); + + /* Try to sleep for a short interval */ + if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) { + remaining = schedule_timeout(HZ/10); + finish_wait(&pgdat->kswapd_wait, &wait); + prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); + } + + /* + * After a short sleep, check if it was a premature sleep. If not, then + * go fully to sleep until explicitly woken up. + */ + if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) { + trace_mm_vmscan_kswapd_sleep(pgdat->node_id); + + /* + * vmstat counters are not perfectly accurate and the estimated + * value for counters such as NR_FREE_PAGES can deviate from the + * true value by nr_online_cpus * threshold. To avoid the zone + * watermarks being breached while under pressure, we reduce the + * per-cpu vmstat threshold while kswapd is awake and restore + * them before going back to sleep. 
+ */ + set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold); + schedule(); + set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold); + } else { + if (remaining) + count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY); + else + count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY); + } + finish_wait(&pgdat->kswapd_wait, &wait); } /* @@ -2380,9 +2632,10 @@ out: static int kswapd(void *p) { unsigned long order; + int classzone_idx; pg_data_t *pgdat = (pg_data_t*)p; struct task_struct *tsk = current; - DEFINE_WAIT(wait); + struct reclaim_state reclaim_state = { .reclaimed_slab = 0, }; @@ -2410,49 +2663,30 @@ static int kswapd(void *p) set_freezable(); order = 0; + classzone_idx = MAX_NR_ZONES - 1; for ( ; ; ) { unsigned long new_order; + int new_classzone_idx; int ret; - prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); new_order = pgdat->kswapd_max_order; + new_classzone_idx = pgdat->classzone_idx; pgdat->kswapd_max_order = 0; - if (order < new_order) { + pgdat->classzone_idx = MAX_NR_ZONES - 1; + if (order < new_order || classzone_idx > new_classzone_idx) { /* * Don't sleep if someone wants a larger 'order' - * allocation + * allocation or has tighter zone constraints */ order = new_order; + classzone_idx = new_classzone_idx; } else { - if (!freezing(current) && !kthread_should_stop()) { - long remaining = 0; - - /* Try to sleep for a short interval */ - if (!sleeping_prematurely(pgdat, order, remaining)) { - remaining = schedule_timeout(HZ/10); - finish_wait(&pgdat->kswapd_wait, &wait); - prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); - } - - /* - * After a short sleep, check if it was a - * premature sleep. 
If not, then go fully - * to sleep until explicitly woken up - */ - if (!sleeping_prematurely(pgdat, order, remaining)) { - trace_mm_vmscan_kswapd_sleep(pgdat->node_id); - schedule(); - } else { - if (remaining) - count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY); - else - count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY); - } - } - + kswapd_try_to_sleep(pgdat, order, classzone_idx); order = pgdat->kswapd_max_order; + classzone_idx = pgdat->classzone_idx; + pgdat->kswapd_max_order = 0; + pgdat->classzone_idx = MAX_NR_ZONES - 1; } - finish_wait(&pgdat->kswapd_wait, &wait); ret = try_to_freeze(); if (kthread_should_stop()) @@ -2464,7 +2698,7 @@ static int kswapd(void *p) */ if (!ret) { trace_mm_vmscan_kswapd_wake(pgdat->node_id, order); - balance_pgdat(pgdat, order); + order = balance_pgdat(pgdat, order, &classzone_idx); } } return 0; @@ -2473,23 +2707,26 @@ static int kswapd(void *p) /* * A zone is low on free memory, so wake its kswapd task to service it. */ -void wakeup_kswapd(struct zone *zone, int order) +void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx) { pg_data_t *pgdat; if (!populated_zone(zone)) return; - pgdat = zone->zone_pgdat; - if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0)) - return; - if (pgdat->kswapd_max_order < order) - pgdat->kswapd_max_order = order; - trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order); if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) return; + pgdat = zone->zone_pgdat; + if (pgdat->kswapd_max_order < order) { + pgdat->kswapd_max_order = order; + pgdat->classzone_idx = min(pgdat->classzone_idx, classzone_idx); + } if (!waitqueue_active(&pgdat->kswapd_wait)) return; + if (zone_watermark_ok_safe(zone, order, low_wmark_pages(zone), 0, 0)) + return; + + trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order); wake_up_interruptible(&pgdat->kswapd_wait); } diff --git a/mm/vmstat.c b/mm/vmstat.c index 8f62f17ee1c7..897ea9e88238 100644 --- a/mm/vmstat.c +++ 
b/mm/vmstat.c @@ -83,7 +83,31 @@ EXPORT_SYMBOL(vm_stat); #ifdef CONFIG_SMP -static int calculate_threshold(struct zone *zone) +int calculate_pressure_threshold(struct zone *zone) +{ + int threshold; + int watermark_distance; + + /* + * As vmstats are not up to date, there is drift between the estimated + * and real values. For high thresholds and a high number of CPUs, it + * is possible for the min watermark to be breached while the estimated + * value looks fine. The pressure threshold is a reduced value such + * that even the maximum amount of drift will not accidentally breach + * the min watermark + */ + watermark_distance = low_wmark_pages(zone) - min_wmark_pages(zone); + threshold = max(1, (int)(watermark_distance / num_online_cpus())); + + /* + * Maximum threshold is 125 + */ + threshold = min(125, threshold); + + return threshold; +} + +int calculate_normal_threshold(struct zone *zone) { int threshold; int mem; /* memory in 128 MB units */ @@ -142,7 +166,7 @@ static void refresh_zone_stat_thresholds(void) for_each_populated_zone(zone) { unsigned long max_drift, tolerate_drift; - threshold = calculate_threshold(zone); + threshold = calculate_normal_threshold(zone); for_each_online_cpu(cpu) per_cpu_ptr(zone->pageset, cpu)->stat_threshold @@ -161,42 +185,50 @@ static void refresh_zone_stat_thresholds(void) } } +void set_pgdat_percpu_threshold(pg_data_t *pgdat, + int (*calculate_pressure)(struct zone *)) +{ + struct zone *zone; + int cpu; + int threshold; + int i; + + for (i = 0; i < pgdat->nr_zones; i++) { + zone = &pgdat->node_zones[i]; + if (!zone->percpu_drift_mark) + continue; + + threshold = (*calculate_pressure)(zone); + for_each_possible_cpu(cpu) + per_cpu_ptr(zone->pageset, cpu)->stat_threshold + = threshold; + } +} + /* * For use when we know that interrupts are disabled. 
*/ void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, int delta) { - struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset); - - s8 *p = pcp->vm_stat_diff + item; + struct per_cpu_pageset __percpu *pcp = zone->pageset; + s8 __percpu *p = pcp->vm_stat_diff + item; long x; + long t; - x = delta + *p; + x = delta + __this_cpu_read(*p); - if (unlikely(x > pcp->stat_threshold || x < -pcp->stat_threshold)) { + t = __this_cpu_read(pcp->stat_threshold); + + if (unlikely(x > t || x < -t)) { zone_page_state_add(x, zone, item); x = 0; } - *p = x; + __this_cpu_write(*p, x); } EXPORT_SYMBOL(__mod_zone_page_state); /* - * For an unknown interrupt state - */ -void mod_zone_page_state(struct zone *zone, enum zone_stat_item item, - int delta) -{ - unsigned long flags; - - local_irq_save(flags); - __mod_zone_page_state(zone, item, delta); - local_irq_restore(flags); -} -EXPORT_SYMBOL(mod_zone_page_state); - -/* * Optimized increment and decrement functions. * * These are only for a single page and therefore can take a struct page * @@ -221,16 +253,17 @@ EXPORT_SYMBOL(mod_zone_page_state); */ void __inc_zone_state(struct zone *zone, enum zone_stat_item item) { - struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset); - s8 *p = pcp->vm_stat_diff + item; - - (*p)++; + struct per_cpu_pageset __percpu *pcp = zone->pageset; + s8 __percpu *p = pcp->vm_stat_diff + item; + s8 v, t; - if (unlikely(*p > pcp->stat_threshold)) { - int overstep = pcp->stat_threshold / 2; + v = __this_cpu_inc_return(*p); + t = __this_cpu_read(pcp->stat_threshold); + if (unlikely(v > t)) { + s8 overstep = t >> 1; - zone_page_state_add(*p + overstep, zone, item); - *p = -overstep; + zone_page_state_add(v + overstep, zone, item); + __this_cpu_write(*p, -overstep); } } @@ -242,16 +275,17 @@ EXPORT_SYMBOL(__inc_zone_page_state); void __dec_zone_state(struct zone *zone, enum zone_stat_item item) { - struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset); - s8 *p = pcp->vm_stat_diff + item; 
+ struct per_cpu_pageset __percpu *pcp = zone->pageset; + s8 __percpu *p = pcp->vm_stat_diff + item; + s8 v, t; - (*p)--; + v = __this_cpu_dec_return(*p); + t = __this_cpu_read(pcp->stat_threshold); + if (unlikely(v < - t)) { + s8 overstep = t >> 1; - if (unlikely(*p < - pcp->stat_threshold)) { - int overstep = pcp->stat_threshold / 2; - - zone_page_state_add(*p - overstep, zone, item); - *p = overstep; + zone_page_state_add(v - overstep, zone, item); + __this_cpu_write(*p, overstep); } } @@ -261,6 +295,95 @@ void __dec_zone_page_state(struct page *page, enum zone_stat_item item) } EXPORT_SYMBOL(__dec_zone_page_state); +#ifdef CONFIG_CMPXCHG_LOCAL +/* + * If we have cmpxchg_local support then we do not need to incur the overhead + * that comes with local_irq_save/restore if we use this_cpu_cmpxchg. + * + * mod_state() modifies the zone counter state through atomic per cpu + * operations. + * + * Overstep mode specifies how overstep should be handled: + * 0 No overstepping + * 1 Overstepping half of threshold + * -1 Overstepping minus half of threshold +*/ +static inline void mod_state(struct zone *zone, + enum zone_stat_item item, int delta, int overstep_mode) +{ + struct per_cpu_pageset __percpu *pcp = zone->pageset; + s8 __percpu *p = pcp->vm_stat_diff + item; + long o, n, t, z; + + do { + z = 0; /* overflow to zone counters */ + + /* + * The fetching of the stat_threshold is racy. We may apply + * a counter threshold to the wrong cpu if we get + * rescheduled while executing here. However, the next + * counter update will apply the threshold again and + * therefore bring the counter under the threshold again. + * + * Most of the time the thresholds are the same anyways + * for all cpus in a zone. 
+ */ + t = this_cpu_read(pcp->stat_threshold); + + o = this_cpu_read(*p); + n = delta + o; + + if (n > t || n < -t) { + int os = overstep_mode * (t >> 1) ; + + /* Overflow must be added to zone counters */ + z = n + os; + n = -os; + } + } while (this_cpu_cmpxchg(*p, o, n) != o); + + if (z) + zone_page_state_add(z, zone, item); +} + +void mod_zone_page_state(struct zone *zone, enum zone_stat_item item, + int delta) +{ + mod_state(zone, item, delta, 0); +} +EXPORT_SYMBOL(mod_zone_page_state); + +void inc_zone_state(struct zone *zone, enum zone_stat_item item) +{ + mod_state(zone, item, 1, 1); +} + +void inc_zone_page_state(struct page *page, enum zone_stat_item item) +{ + mod_state(page_zone(page), item, 1, 1); +} +EXPORT_SYMBOL(inc_zone_page_state); + +void dec_zone_page_state(struct page *page, enum zone_stat_item item) +{ + mod_state(page_zone(page), item, -1, -1); +} +EXPORT_SYMBOL(dec_zone_page_state); +#else +/* + * Use interrupt disable to serialize counter updates + */ +void mod_zone_page_state(struct zone *zone, enum zone_stat_item item, + int delta) +{ + unsigned long flags; + + local_irq_save(flags); + __mod_zone_page_state(zone, item, delta); + local_irq_restore(flags); +} +EXPORT_SYMBOL(mod_zone_page_state); + void inc_zone_state(struct zone *zone, enum zone_stat_item item) { unsigned long flags; @@ -291,6 +414,7 @@ void dec_zone_page_state(struct page *page, enum zone_stat_item item) local_irq_restore(flags); } EXPORT_SYMBOL(dec_zone_page_state); +#endif /* * Update the zone counters for one cpu. @@ -379,8 +503,12 @@ void refresh_cpu_vm_stats(int cpu) * z = the zone from which the allocation occurred. * * Must be called with interrupts disabled. + * + * When __GFP_OTHER_NODE is set assume the node of the preferred + * zone is the local node. This is useful for daemons who allocate + * memory on behalf of other processes. 
*/ -void zone_statistics(struct zone *preferred_zone, struct zone *z) +void zone_statistics(struct zone *preferred_zone, struct zone *z, gfp_t flags) { if (z->zone_pgdat == preferred_zone->zone_pgdat) { __inc_zone_state(z, NUMA_HIT); @@ -388,7 +516,8 @@ void zone_statistics(struct zone *preferred_zone, struct zone *z) __inc_zone_state(z, NUMA_MISS); __inc_zone_state(preferred_zone, NUMA_FOREIGN); } - if (z->node == numa_node_id()) + if (z->node == ((flags & __GFP_OTHER_NODE) ? + preferred_zone->node : numa_node_id())) __inc_zone_state(z, NUMA_LOCAL); else __inc_zone_state(z, NUMA_OTHER); @@ -759,6 +888,7 @@ static const char * const vmstat_text[] = { "numa_local", "numa_other", #endif + "nr_anon_transparent_hugepages", "nr_dirty_threshold", "nr_dirty_background_threshold", @@ -818,7 +948,16 @@ static const char * const vmstat_text[] = { "unevictable_pgs_cleared", "unevictable_pgs_stranded", "unevictable_pgs_mlockfreed", + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + "thp_fault_alloc", + "thp_fault_fallback", + "thp_collapse_alloc", + "thp_collapse_alloc_failed", + "thp_split", #endif + +#endif /* CONFIG_VM_EVENTS_COUNTERS */ }; static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, @@ -834,7 +973,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, "\n scanned %lu" "\n spanned %lu" "\n present %lu", - zone_nr_free_pages(zone), + zone_page_state(zone, NR_FREE_PAGES), min_wmark_pages(zone), low_wmark_pages(zone), high_wmark_pages(zone), @@ -1033,7 +1172,7 @@ static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb, break; case CPU_DOWN_PREPARE: case CPU_DOWN_PREPARE_FROZEN: - cancel_rearming_delayed_work(&per_cpu(vmstat_work, cpu)); + cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu)); per_cpu(vmstat_work, cpu).work.func = NULL; break; case CPU_DOWN_FAILED: |