diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2015-04-14 16:49:17 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2015-04-14 16:49:17 -0700 |
commit | 1dcf58d6e6e6eb7ec10e9abc56887b040205b06f (patch) | |
tree | c03e7a25ef13eea62f1547914a76e5c68f3f4c28 /mm | |
parent | 80dcc31fbe55932ac9204daee5f2ebc0c49b6da3 (diff) | |
parent | e4b0db72be2487bae0e3251c22f82c104f7c1cfd (diff) | |
download | linux-1dcf58d6e6e6eb7ec10e9abc56887b040205b06f.tar.bz2 |
Merge branch 'akpm' (patches from Andrew)
Merge first patchbomb from Andrew Morton:
- arch/sh updates
- ocfs2 updates
- kernel/watchdog feature
- about half of mm/
* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (122 commits)
Documentation: update arch list in the 'memtest' entry
Kconfig: memtest: update number of test patterns up to 17
arm: add support for memtest
arm64: add support for memtest
memtest: use phys_addr_t for physical addresses
mm: move memtest under mm
mm, hugetlb: abort __get_user_pages if current has been oom killed
mm, mempool: do not allow atomic resizing
memcg: print cgroup information when system panics due to panic_on_oom
mm: numa: remove migrate_ratelimited
mm: fold arch_randomize_brk into ARCH_HAS_ELF_RANDOMIZE
mm: split ET_DYN ASLR from mmap ASLR
s390: redefine randomize_et_dyn for ELF_ET_DYN_BASE
mm: expose arch_mmap_rnd when available
s390: standardize mmap_rnd() usage
powerpc: standardize mmap_rnd() usage
mips: extract logic for mmap_rnd()
arm64: standardize mmap_rnd() usage
x86: standardize mmap_rnd() usage
arm: factor out mmap ASLR into mmap_rnd
...
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 6 | ||||
-rw-r--r-- | mm/Makefile | 2 | ||||
-rw-r--r-- | mm/cleancache.c | 276 | ||||
-rw-r--r-- | mm/cma.c | 49 | ||||
-rw-r--r-- | mm/cma.h | 24 | ||||
-rw-r--r-- | mm/cma_debug.c | 170 | ||||
-rw-r--r-- | mm/compaction.c | 15 | ||||
-rw-r--r-- | mm/filemap.c | 15 | ||||
-rw-r--r-- | mm/gup.c | 124 | ||||
-rw-r--r-- | mm/huge_memory.c | 39 | ||||
-rw-r--r-- | mm/hugetlb.c | 12 | ||||
-rw-r--r-- | mm/internal.h | 4 | ||||
-rw-r--r-- | mm/memblock.c | 4 | ||||
-rw-r--r-- | mm/memcontrol.c | 194 | ||||
-rw-r--r-- | mm/memory.c | 371 | ||||
-rw-r--r-- | mm/memory_hotplug.c | 35 | ||||
-rw-r--r-- | mm/mempolicy.c | 6 | ||||
-rw-r--r-- | mm/mempool.c | 10 | ||||
-rw-r--r-- | mm/memtest.c | 118 | ||||
-rw-r--r-- | mm/migrate.c | 37 | ||||
-rw-r--r-- | mm/mlock.c | 131 | ||||
-rw-r--r-- | mm/mmap.c | 4 | ||||
-rw-r--r-- | mm/oom_kill.c | 7 | ||||
-rw-r--r-- | mm/page-writeback.c | 19 | ||||
-rw-r--r-- | mm/page_alloc.c | 247 | ||||
-rw-r--r-- | mm/slab.c | 22 | ||||
-rw-r--r-- | mm/slob.c | 3 | ||||
-rw-r--r-- | mm/slub.c | 28 | ||||
-rw-r--r-- | mm/truncate.c | 37 | ||||
-rw-r--r-- | mm/vmalloc.c | 8 |
30 files changed, 1199 insertions, 818 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index a03131b6ba8e..390214da4546 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -517,6 +517,12 @@ config CMA_DEBUG processing calls such as dma_alloc_from_contiguous(). This option does not affect warning and error messages. +config CMA_DEBUGFS + bool "CMA debugfs interface" + depends on CMA && DEBUG_FS + help + Turns on the DebugFS interface for CMA. + config CMA_AREAS int "Maximum count of the CMA areas" depends on CMA diff --git a/mm/Makefile b/mm/Makefile index 15dbe9903c27..98c4eaeabdcb 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -55,6 +55,7 @@ obj-$(CONFIG_KMEMCHECK) += kmemcheck.o obj-$(CONFIG_KASAN) += kasan/ obj-$(CONFIG_FAILSLAB) += failslab.o obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o +obj-$(CONFIG_MEMTEST) += memtest.o obj-$(CONFIG_MIGRATION) += migrate.o obj-$(CONFIG_QUICKLIST) += quicklist.o obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o @@ -76,3 +77,4 @@ obj-$(CONFIG_GENERIC_EARLY_IOREMAP) += early_ioremap.o obj-$(CONFIG_CMA) += cma.o obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o +obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o diff --git a/mm/cleancache.c b/mm/cleancache.c index 053bcd8f12fb..8fc50811119b 100644 --- a/mm/cleancache.c +++ b/mm/cleancache.c @@ -19,7 +19,7 @@ #include <linux/cleancache.h> /* - * cleancache_ops is set by cleancache_ops_register to contain the pointers + * cleancache_ops is set by cleancache_register_ops to contain the pointers * to the cleancache "backend" implementation functions. */ static struct cleancache_ops *cleancache_ops __read_mostly; @@ -34,145 +34,107 @@ static u64 cleancache_failed_gets; static u64 cleancache_puts; static u64 cleancache_invalidates; -/* - * When no backend is registered all calls to init_fs and init_shared_fs - * are registered and fake poolids (FAKE_FS_POOLID_OFFSET or - * FAKE_SHARED_FS_POOLID_OFFSET, plus offset in the respective array - * [shared_|]fs_poolid_map) are given to the respective super block - * (sb->cleancache_poolid) and no tmem_pools are created. When a backend - * registers with cleancache the previous calls to init_fs and init_shared_fs - * are executed to create tmem_pools and set the respective poolids. While no - * backend is registered all "puts", "gets" and "flushes" are ignored or failed. - */ -#define MAX_INITIALIZABLE_FS 32 -#define FAKE_FS_POOLID_OFFSET 1000 -#define FAKE_SHARED_FS_POOLID_OFFSET 2000 - -#define FS_NO_BACKEND (-1) -#define FS_UNKNOWN (-2) -static int fs_poolid_map[MAX_INITIALIZABLE_FS]; -static int shared_fs_poolid_map[MAX_INITIALIZABLE_FS]; -static char *uuids[MAX_INITIALIZABLE_FS]; -/* - * Mutex for the [shared_|]fs_poolid_map to guard against multiple threads - * invoking umount (and ending in __cleancache_invalidate_fs) and also multiple - * threads calling mount (and ending up in __cleancache_init_[shared|]fs). - */ -static DEFINE_MUTEX(poolid_mutex); -/* - * When set to false (default) all calls to the cleancache functions, except - * the __cleancache_invalidate_fs and __cleancache_init_[shared|]fs are guarded - * by the if (!cleancache_ops) return. This means multiple threads (from - * different filesystems) will be checking cleancache_ops. The usage of a - * bool instead of a atomic_t or a bool guarded by a spinlock is OK - we are - * OK if the time between the backend's have been initialized (and - * cleancache_ops has been set to not NULL) and when the filesystems start - * actually calling the backends. The inverse (when unloading) is obviously - * not good - but this shim does not do that (yet). - */ - -/* - * The backends and filesystems work all asynchronously. This is b/c the - * backends can be built as modules. - * The usual sequence of events is: - * a) mount / -> __cleancache_init_fs is called. We set the - * [shared_|]fs_poolid_map and uuids for. - * - * b). user does I/Os -> we call the rest of __cleancache_* functions - * which return immediately as cleancache_ops is false. - * - * c). modprobe zcache -> cleancache_register_ops. We init the backend - * and set cleancache_ops to true, and for any fs_poolid_map - * (which is set by __cleancache_init_fs) we initialize the poolid. - * - * d). user does I/Os -> now that cleancache_ops is true all the - * __cleancache_* functions can call the backend. They all check - * that fs_poolid_map is valid and if so invoke the backend. - * - * e). umount / -> __cleancache_invalidate_fs, the fs_poolid_map is - * reset (which is the second check in the __cleancache_* ops - * to call the backend). - * - * The sequence of event could also be c), followed by a), and d). and e). The - * c) would not happen anymore. There is also the chance of c), and one thread - * doing a) + d), and another doing e). For that case we depend on the - * filesystem calling __cleancache_invalidate_fs in the proper sequence (so - * that it handles all I/Os before it invalidates the fs (which is last part - * of unmounting process). - * - * Note: The acute reader will notice that there is no "rmmod zcache" case. - * This is b/c the functionality for that is not yet implemented and when - * done, will require some extra locking not yet devised. - */ +static void cleancache_register_ops_sb(struct super_block *sb, void *unused) +{ + switch (sb->cleancache_poolid) { + case CLEANCACHE_NO_BACKEND: + __cleancache_init_fs(sb); + break; + case CLEANCACHE_NO_BACKEND_SHARED: + __cleancache_init_shared_fs(sb); + break; + } +} /* - * Register operations for cleancache, returning previous thus allowing - * detection of multiple backends and possible nesting. + * Register operations for cleancache. Returns 0 on success. */ -struct cleancache_ops *cleancache_register_ops(struct cleancache_ops *ops) +int cleancache_register_ops(struct cleancache_ops *ops) { - struct cleancache_ops *old = cleancache_ops; - int i; + if (cmpxchg(&cleancache_ops, NULL, ops)) + return -EBUSY; - mutex_lock(&poolid_mutex); - for (i = 0; i < MAX_INITIALIZABLE_FS; i++) { - if (fs_poolid_map[i] == FS_NO_BACKEND) - fs_poolid_map[i] = ops->init_fs(PAGE_SIZE); - if (shared_fs_poolid_map[i] == FS_NO_BACKEND) - shared_fs_poolid_map[i] = ops->init_shared_fs - (uuids[i], PAGE_SIZE); - } /* - * We MUST set cleancache_ops _after_ we have called the backends - * init_fs or init_shared_fs functions. Otherwise the compiler might - * re-order where cleancache_ops is set in this function. + * A cleancache backend can be built as a module and hence loaded after + * a cleancache enabled filesystem has called cleancache_init_fs. To + * handle such a scenario, here we call ->init_fs or ->init_shared_fs + * for each active super block. To differentiate between local and + * shared filesystems, we temporarily initialize sb->cleancache_poolid + * to CLEANCACHE_NO_BACKEND or CLEANCACHE_NO_BACKEND_SHARED + * respectively in case there is no backend registered at the time + * cleancache_init_fs or cleancache_init_shared_fs is called. + * + * Since filesystems can be mounted concurrently with cleancache + * backend registration, we have to be careful to guarantee that all + * cleancache enabled filesystems that has been mounted by the time + * cleancache_register_ops is called has got and all mounted later will + * get cleancache_poolid. This is assured by the following statements + * tied together: + * + * a) iterate_supers skips only those super blocks that has started + * ->kill_sb + * + * b) if iterate_supers encounters a super block that has not finished + * ->mount yet, it waits until it is finished + * + * c) cleancache_init_fs is called from ->mount and + * cleancache_invalidate_fs is called from ->kill_sb + * + * d) we call iterate_supers after cleancache_ops has been set + * + * From a) it follows that if iterate_supers skips a super block, then + * either the super block is already dead, in which case we do not need + * to bother initializing cleancache for it, or it was mounted after we + * initiated iterate_supers. In the latter case, it must have seen + * cleancache_ops set according to d) and initialized cleancache from + * ->mount by itself according to c). This proves that we call + * ->init_fs at least once for each active super block. + * + * From b) and c) it follows that if iterate_supers encounters a super + * block that has already started ->init_fs, it will wait until ->mount + * and hence ->init_fs has finished, then check cleancache_poolid, see + * that it has already been set and therefore do nothing. This proves + * that we call ->init_fs no more than once for each super block. + * + * Combined together, the last two paragraphs prove the function + * correctness. + * + * Note that various cleancache callbacks may proceed before this + * function is called or even concurrently with it, but since + * CLEANCACHE_NO_BACKEND is negative, they will all result in a noop + * until the corresponding ->init_fs has been actually called and + * cleancache_ops has been set. */ - barrier(); - cleancache_ops = ops; - mutex_unlock(&poolid_mutex); - return old; + iterate_supers(cleancache_register_ops_sb, NULL); + return 0; } EXPORT_SYMBOL(cleancache_register_ops); /* Called by a cleancache-enabled filesystem at time of mount */ void __cleancache_init_fs(struct super_block *sb) { - int i; + int pool_id = CLEANCACHE_NO_BACKEND; - mutex_lock(&poolid_mutex); - for (i = 0; i < MAX_INITIALIZABLE_FS; i++) { - if (fs_poolid_map[i] == FS_UNKNOWN) { - sb->cleancache_poolid = i + FAKE_FS_POOLID_OFFSET; - if (cleancache_ops) - fs_poolid_map[i] = cleancache_ops->init_fs(PAGE_SIZE); - else - fs_poolid_map[i] = FS_NO_BACKEND; - break; - } + if (cleancache_ops) { + pool_id = cleancache_ops->init_fs(PAGE_SIZE); + if (pool_id < 0) + pool_id = CLEANCACHE_NO_POOL; } - mutex_unlock(&poolid_mutex); + sb->cleancache_poolid = pool_id; } EXPORT_SYMBOL(__cleancache_init_fs); /* Called by a cleancache-enabled clustered filesystem at time of mount */ -void __cleancache_init_shared_fs(char *uuid, struct super_block *sb) +void __cleancache_init_shared_fs(struct super_block *sb) { - int i; + int pool_id = CLEANCACHE_NO_BACKEND_SHARED; - mutex_lock(&poolid_mutex); - for (i = 0; i < MAX_INITIALIZABLE_FS; i++) { - if (shared_fs_poolid_map[i] == FS_UNKNOWN) { - sb->cleancache_poolid = i + FAKE_SHARED_FS_POOLID_OFFSET; - uuids[i] = uuid; - if (cleancache_ops) - shared_fs_poolid_map[i] = cleancache_ops->init_shared_fs - (uuid, PAGE_SIZE); - else - shared_fs_poolid_map[i] = FS_NO_BACKEND; - break; - } + if (cleancache_ops) { + pool_id = cleancache_ops->init_shared_fs(sb->s_uuid, PAGE_SIZE); + if (pool_id < 0) + pool_id = CLEANCACHE_NO_POOL; } - mutex_unlock(&poolid_mutex); + sb->cleancache_poolid = pool_id; } EXPORT_SYMBOL(__cleancache_init_shared_fs); @@ -202,19 +164,6 @@ static int cleancache_get_key(struct inode *inode, } /* - * Returns a pool_id that is associated with a given fake poolid. - */ -static int get_poolid_from_fake(int fake_pool_id) -{ - if (fake_pool_id >= FAKE_SHARED_FS_POOLID_OFFSET) - return shared_fs_poolid_map[fake_pool_id - - FAKE_SHARED_FS_POOLID_OFFSET]; - else if (fake_pool_id >= FAKE_FS_POOLID_OFFSET) - return fs_poolid_map[fake_pool_id - FAKE_FS_POOLID_OFFSET]; - return FS_NO_BACKEND; -} - -/* * "Get" data from cleancache associated with the poolid/inode/index * that were specified when the data was put to cleanache and, if * successful, use it to fill the specified page with data and return 0. @@ -229,7 +178,6 @@ int __cleancache_get_page(struct page *page) { int ret = -1; int pool_id; - int fake_pool_id; struct cleancache_filekey key = { .u.key = { 0 } }; if (!cleancache_ops) { @@ -238,17 +186,14 @@ int __cleancache_get_page(struct page *page) } VM_BUG_ON_PAGE(!PageLocked(page), page); - fake_pool_id = page->mapping->host->i_sb->cleancache_poolid; - if (fake_pool_id < 0) + pool_id = page->mapping->host->i_sb->cleancache_poolid; + if (pool_id < 0) goto out; - pool_id = get_poolid_from_fake(fake_pool_id); if (cleancache_get_key(page->mapping->host, &key) < 0) goto out; - if (pool_id >= 0) - ret = cleancache_ops->get_page(pool_id, - key, page->index, page); + ret = cleancache_ops->get_page(pool_id, key, page->index, page); if (ret == 0) cleancache_succ_gets++; else @@ -271,7 +216,6 @@ EXPORT_SYMBOL(__cleancache_get_page); void __cleancache_put_page(struct page *page) { int pool_id; - int fake_pool_id; struct cleancache_filekey key = { .u.key = { 0 } }; if (!cleancache_ops) { @@ -280,12 +224,7 @@ void __cleancache_put_page(struct page *page) } VM_BUG_ON_PAGE(!PageLocked(page), page); - fake_pool_id = page->mapping->host->i_sb->cleancache_poolid; - if (fake_pool_id < 0) - return; - - pool_id = get_poolid_from_fake(fake_pool_id); - + pool_id = page->mapping->host->i_sb->cleancache_poolid; if (pool_id >= 0 && cleancache_get_key(page->mapping->host, &key) >= 0) { cleancache_ops->put_page(pool_id, key, page->index, page); @@ -306,18 +245,13 @@ void __cleancache_invalidate_page(struct address_space *mapping, struct page *page) { /* careful... page->mapping is NULL sometimes when this is called */ - int pool_id; - int fake_pool_id = mapping->host->i_sb->cleancache_poolid; + int pool_id = mapping->host->i_sb->cleancache_poolid; struct cleancache_filekey key = { .u.key = { 0 } }; if (!cleancache_ops) return; - if (fake_pool_id >= 0) { - pool_id = get_poolid_from_fake(fake_pool_id); - if (pool_id < 0) - return; - + if (pool_id >= 0) { VM_BUG_ON_PAGE(!PageLocked(page), page); if (cleancache_get_key(mapping->host, &key) >= 0) { cleancache_ops->invalidate_page(pool_id, @@ -339,18 +273,12 @@ EXPORT_SYMBOL(__cleancache_invalidate_page); */ void __cleancache_invalidate_inode(struct address_space *mapping) { - int pool_id; - int fake_pool_id = mapping->host->i_sb->cleancache_poolid; + int pool_id = mapping->host->i_sb->cleancache_poolid; struct cleancache_filekey key = { .u.key = { 0 } }; if (!cleancache_ops) return; - if (fake_pool_id < 0) - return; - - pool_id = get_poolid_from_fake(fake_pool_id); - if (pool_id >= 0 && cleancache_get_key(mapping->host, &key) >= 0) cleancache_ops->invalidate_inode(pool_id, key); } @@ -363,32 +291,18 @@ EXPORT_SYMBOL(__cleancache_invalidate_inode); */ void __cleancache_invalidate_fs(struct super_block *sb) { - int index; - int fake_pool_id = sb->cleancache_poolid; - int old_poolid = fake_pool_id; + int pool_id; - mutex_lock(&poolid_mutex); - if (fake_pool_id >= FAKE_SHARED_FS_POOLID_OFFSET) { - index = fake_pool_id - FAKE_SHARED_FS_POOLID_OFFSET; - old_poolid = shared_fs_poolid_map[index]; - shared_fs_poolid_map[index] = FS_UNKNOWN; - uuids[index] = NULL; - } else if (fake_pool_id >= FAKE_FS_POOLID_OFFSET) { - index = fake_pool_id - FAKE_FS_POOLID_OFFSET; - old_poolid = fs_poolid_map[index]; - fs_poolid_map[index] = FS_UNKNOWN; - } - sb->cleancache_poolid = -1; - if (cleancache_ops) - cleancache_ops->invalidate_fs(old_poolid); - mutex_unlock(&poolid_mutex); + pool_id = sb->cleancache_poolid; + sb->cleancache_poolid = CLEANCACHE_NO_POOL; + + if (cleancache_ops && pool_id >= 0) + cleancache_ops->invalidate_fs(pool_id); } EXPORT_SYMBOL(__cleancache_invalidate_fs); static int __init init_cleancache(void) { - int i; - #ifdef CONFIG_DEBUG_FS struct dentry *root = debugfs_create_dir("cleancache", NULL); if (root == NULL) @@ -400,10 +314,6 @@ static int __init init_cleancache(void) debugfs_create_u64("invalidates", S_IRUGO, root, &cleancache_invalidates); #endif - for (i = 0; i < MAX_INITIALIZABLE_FS; i++) { - fs_poolid_map[i] = FS_UNKNOWN; - shared_fs_poolid_map[i] = FS_UNKNOWN; - } return 0; } module_init(init_cleancache) @@ -35,29 +35,24 @@ #include <linux/highmem.h> #include <linux/io.h> -struct cma { - unsigned long base_pfn; - unsigned long count; - unsigned long *bitmap; - unsigned int order_per_bit; /* Order of pages represented by one bit */ - struct mutex lock; -}; - -static struct cma cma_areas[MAX_CMA_AREAS]; -static unsigned cma_area_count; +#include "cma.h" + +struct cma cma_areas[MAX_CMA_AREAS]; +unsigned cma_area_count; static DEFINE_MUTEX(cma_mutex); -phys_addr_t cma_get_base(struct cma *cma) +phys_addr_t cma_get_base(const struct cma *cma) { return PFN_PHYS(cma->base_pfn); } -unsigned long cma_get_size(struct cma *cma) +unsigned long cma_get_size(const struct cma *cma) { return cma->count << PAGE_SHIFT; } -static unsigned long cma_bitmap_aligned_mask(struct cma *cma, int align_order) +static unsigned long cma_bitmap_aligned_mask(const struct cma *cma, + int align_order) { if (align_order <= cma->order_per_bit) return 0; @@ -68,7 +63,8 @@ static unsigned long cma_bitmap_aligned_mask(struct cma *cma, int align_order) * Find a PFN aligned to the specified order and return an offset represented in * order_per_bits. */ -static unsigned long cma_bitmap_aligned_offset(struct cma *cma, int align_order) +static unsigned long cma_bitmap_aligned_offset(const struct cma *cma, + int align_order) { if (align_order <= cma->order_per_bit) return 0; @@ -77,18 +73,14 @@ static unsigned long cma_bitmap_aligned_offset(struct cma *cma, int align_order) - cma->base_pfn) >> cma->order_per_bit; } -static unsigned long cma_bitmap_maxno(struct cma *cma) -{ - return cma->count >> cma->order_per_bit; -} - -static unsigned long cma_bitmap_pages_to_bits(struct cma *cma, - unsigned long pages) +static unsigned long cma_bitmap_pages_to_bits(const struct cma *cma, + unsigned long pages) { return ALIGN(pages, 1UL << cma->order_per_bit) >> cma->order_per_bit; } -static void cma_clear_bitmap(struct cma *cma, unsigned long pfn, int count) +static void cma_clear_bitmap(struct cma *cma, unsigned long pfn, + unsigned int count) { unsigned long bitmap_no, bitmap_count; @@ -134,6 +126,12 @@ static int __init cma_activate_area(struct cma *cma) } while (--i); mutex_init(&cma->lock); + +#ifdef CONFIG_CMA_DEBUGFS + INIT_HLIST_HEAD(&cma->mem_head); + spin_lock_init(&cma->mem_head_lock); +#endif + return 0; err: @@ -167,7 +165,8 @@ core_initcall(cma_init_reserved_areas); * This function creates custom contiguous area from already reserved memory. */ int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size, - int order_per_bit, struct cma **res_cma) + unsigned int order_per_bit, + struct cma **res_cma) { struct cma *cma; phys_addr_t alignment; @@ -358,7 +357,7 @@ err: * This function allocates part of contiguous memory on specific * contiguous memory area. */ -struct page *cma_alloc(struct cma *cma, int count, unsigned int align) +struct page *cma_alloc(struct cma *cma, unsigned int count, unsigned int align) { unsigned long mask, offset, pfn, start = 0; unsigned long bitmap_maxno, bitmap_no, bitmap_count; @@ -429,7 +428,7 @@ struct page *cma_alloc(struct cma *cma, int count, unsigned int align) * It returns false when provided pages do not belong to contiguous area and * true otherwise. */ -bool cma_release(struct cma *cma, struct page *pages, int count) +bool cma_release(struct cma *cma, const struct page *pages, unsigned int count) { unsigned long pfn; diff --git a/mm/cma.h b/mm/cma.h new file mode 100644 index 000000000000..1132d733556d --- /dev/null +++ b/mm/cma.h @@ -0,0 +1,24 @@ +#ifndef __MM_CMA_H__ +#define __MM_CMA_H__ + +struct cma { + unsigned long base_pfn; + unsigned long count; + unsigned long *bitmap; + unsigned int order_per_bit; /* Order of pages represented by one bit */ + struct mutex lock; +#ifdef CONFIG_CMA_DEBUGFS + struct hlist_head mem_head; + spinlock_t mem_head_lock; +#endif +}; + +extern struct cma cma_areas[MAX_CMA_AREAS]; +extern unsigned cma_area_count; + +static unsigned long cma_bitmap_maxno(struct cma *cma) +{ + return cma->count >> cma->order_per_bit; +} + +#endif diff --git a/mm/cma_debug.c b/mm/cma_debug.c new file mode 100644 index 000000000000..0b377536ccde --- /dev/null +++ b/mm/cma_debug.c @@ -0,0 +1,170 @@ +/* + * CMA DebugFS Interface + * + * Copyright (c) 2015 Sasha Levin <sasha.levin@oracle.com> + */ + + +#include <linux/debugfs.h> +#include <linux/cma.h> +#include <linux/list.h> +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/mm_types.h> + +#include "cma.h" + +struct cma_mem { + struct hlist_node node; + struct page *p; + unsigned long n; +}; + +static struct dentry *cma_debugfs_root; + +static int cma_debugfs_get(void *data, u64 *val) +{ + unsigned long *p = data; + + *val = *p; + + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(cma_debugfs_fops, cma_debugfs_get, NULL, "%llu\n"); + +static void cma_add_to_cma_mem_list(struct cma *cma, struct cma_mem *mem) +{ + spin_lock(&cma->mem_head_lock); + hlist_add_head(&mem->node, &cma->mem_head); + spin_unlock(&cma->mem_head_lock); +} + +static struct cma_mem *cma_get_entry_from_list(struct cma *cma) +{ + struct cma_mem *mem = NULL; + + spin_lock(&cma->mem_head_lock); + if (!hlist_empty(&cma->mem_head)) { + mem = hlist_entry(cma->mem_head.first, struct cma_mem, node); + hlist_del_init(&mem->node); + } + spin_unlock(&cma->mem_head_lock); + + return mem; +} + +static int cma_free_mem(struct cma *cma, int count) +{ + struct cma_mem *mem = NULL; + + while (count) { + mem = cma_get_entry_from_list(cma); + if (mem == NULL) + return 0; + + if (mem->n <= count) { + cma_release(cma, mem->p, mem->n); + count -= mem->n; + kfree(mem); + } else if (cma->order_per_bit == 0) { + cma_release(cma, mem->p, count); + mem->p += count; + mem->n -= count; + count = 0; + cma_add_to_cma_mem_list(cma, mem); + } else { + pr_debug("cma: cannot release partial block when order_per_bit != 0\n"); + cma_add_to_cma_mem_list(cma, mem); + break; + } + } + + return 0; + +} + +static int cma_free_write(void *data, u64 val) +{ + int pages = val; + struct cma *cma = data; + + return cma_free_mem(cma, pages); +} + +DEFINE_SIMPLE_ATTRIBUTE(cma_free_fops, NULL, cma_free_write, "%llu\n"); + +static int cma_alloc_mem(struct cma *cma, int count) +{ + struct cma_mem *mem; + struct page *p; + + mem = kzalloc(sizeof(*mem), GFP_KERNEL); + if (!mem) + return -ENOMEM; + + p = cma_alloc(cma, count, 0); + if (!p) { + kfree(mem); + return -ENOMEM; + } + + mem->p = p; + mem->n = count; + + cma_add_to_cma_mem_list(cma, mem); + + return 0; +} + +static int cma_alloc_write(void *data, u64 val) +{ + int pages = val; + struct cma *cma = data; + + return cma_alloc_mem(cma, pages); +} + +DEFINE_SIMPLE_ATTRIBUTE(cma_alloc_fops, NULL, cma_alloc_write, "%llu\n"); + +static void cma_debugfs_add_one(struct cma *cma, int idx) +{ + struct dentry *tmp; + char name[16]; + int u32s; + + sprintf(name, "cma-%d", idx); + + tmp = debugfs_create_dir(name, cma_debugfs_root); + + debugfs_create_file("alloc", S_IWUSR, cma_debugfs_root, cma, + &cma_alloc_fops); + + debugfs_create_file("free", S_IWUSR, cma_debugfs_root, cma, + &cma_free_fops); + + debugfs_create_file("base_pfn", S_IRUGO, tmp, + &cma->base_pfn, &cma_debugfs_fops); + debugfs_create_file("count", S_IRUGO, tmp, + &cma->count, &cma_debugfs_fops); + debugfs_create_file("order_per_bit", S_IRUGO, tmp, + &cma->order_per_bit, &cma_debugfs_fops); + + u32s = DIV_ROUND_UP(cma_bitmap_maxno(cma), BITS_PER_BYTE * sizeof(u32)); + debugfs_create_u32_array("bitmap", S_IRUGO, tmp, (u32*)cma->bitmap, u32s); +} + +static int __init cma_debugfs_init(void) +{ + int i; + + cma_debugfs_root = debugfs_create_dir("cma", NULL); + if (!cma_debugfs_root) + return -ENOMEM; + + for (i = 0; i < cma_area_count; i++) + cma_debugfs_add_one(&cma_areas[i], i); + + return 0; +} +late_initcall(cma_debugfs_init); diff --git a/mm/compaction.c b/mm/compaction.c index 8c0d9459b54a..a18201a8124e 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1174,13 +1174,24 @@ static int __compact_finished(struct zone *zone, struct compact_control *cc, /* Direct compactor: Is a suitable page free? */ for (order = cc->order; order < MAX_ORDER; order++) { struct free_area *area = &zone->free_area[order]; + bool can_steal; /* Job done if page is free of the right migratetype */ if (!list_empty(&area->free_list[migratetype])) return COMPACT_PARTIAL; - /* Job done if allocation would set block type */ - if (order >= pageblock_order && area->nr_free) +#ifdef CONFIG_CMA + /* MIGRATE_MOVABLE can fallback on MIGRATE_CMA */ + if (migratetype == MIGRATE_MOVABLE && + !list_empty(&area->free_list[MIGRATE_CMA])) + return COMPACT_PARTIAL; +#endif + /* + * Job done if allocation would steal freepages from + * other migratetype buddy lists. + */ + if (find_suitable_fallback(area, order, migratetype, + true, &can_steal) != -1) return COMPACT_PARTIAL; } diff --git a/mm/filemap.c b/mm/filemap.c index 876f4e6f3ed6..12548d03c11d 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -202,16 +202,15 @@ void __delete_from_page_cache(struct page *page, void *shadow) BUG_ON(page_mapped(page)); /* - * Some filesystems seem to re-dirty the page even after - * the VM has canceled the dirty bit (eg ext3 journaling). + * At this point page must be either written or cleaned by truncate. + * Dirty page here signals a bug and loss of unwritten data. * - * Fix it up by doing a final dirty accounting check after - * having removed the page entirely. + * This fixes dirty accounting after removing the page entirely but + * leaves PageDirty set: it has no effect for truncated page and + * anyway will be cleared before returning page into buddy allocator. */ - if (PageDirty(page) && mapping_cap_account_dirty(mapping)) { - dec_zone_page_state(page, NR_FILE_DIRTY); - dec_bdi_stat(inode_to_bdi(mapping->host), BDI_RECLAIMABLE); - } + if (WARN_ON_ONCE(PageDirty(page))) + account_page_cleaned(page, mapping); } /** @@ -92,7 +92,7 @@ retry: */ mark_page_accessed(page); } - if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { + if ((flags & FOLL_POPULATE) && (vma->vm_flags & VM_LOCKED)) { /* * The preliminary mapping check is mainly to avoid the * pointless overhead of lock_page on the ZERO_PAGE @@ -265,8 +265,8 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma, unsigned int fault_flags = 0; int ret; - /* For mlock, just skip the stack guard page. */ - if ((*flags & FOLL_MLOCK) && + /* For mm_populate(), just skip the stack guard page. */ + if ((*flags & FOLL_POPULATE) && (stack_guard_page_start(vma, address) || stack_guard_page_end(vma, address + PAGE_SIZE))) return -ENOENT; @@ -819,6 +819,124 @@ long get_user_pages(struct task_struct *tsk, struct mm_struct *mm, EXPORT_SYMBOL(get_user_pages); /** + * populate_vma_page_range() - populate a range of pages in the vma. + * @vma: target vma + * @start: start address + * @end: end address + * @nonblocking: + * + * This takes care of mlocking the pages too if VM_LOCKED is set. + * + * return 0 on success, negative error code on error. + * + * vma->vm_mm->mmap_sem must be held. + * + * If @nonblocking is NULL, it may be held for read or write and will + * be unperturbed. + * + * If @nonblocking is non-NULL, it must held for read only and may be + * released. If it's released, *@nonblocking will be set to 0. + */ +long populate_vma_page_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end, int *nonblocking) +{ + struct mm_struct *mm = vma->vm_mm; + unsigned long nr_pages = (end - start) / PAGE_SIZE; + int gup_flags; + + VM_BUG_ON(start & ~PAGE_MASK); + VM_BUG_ON(end & ~PAGE_MASK); + VM_BUG_ON_VMA(start < vma->vm_start, vma); + VM_BUG_ON_VMA(end > vma->vm_end, vma); + VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_sem), mm); + + gup_flags = FOLL_TOUCH | FOLL_POPULATE; + /* + * We want to touch writable mappings with a write fault in order + * to break COW, except for shared mappings because these don't COW + * and we would not want to dirty them for nothing. + */ + if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE) + gup_flags |= FOLL_WRITE; + + /* + * We want mlock to succeed for regions that have any permissions + * other than PROT_NONE. + */ + if (vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC)) + gup_flags |= FOLL_FORCE; + + /* + * We made sure addr is within a VMA, so the following will + * not result in a stack expansion that recurses back here. + */ + return __get_user_pages(current, mm, start, nr_pages, gup_flags, + NULL, NULL, nonblocking); +} + +/* + * __mm_populate - populate and/or mlock pages within a range of address space. + * + * This is used to implement mlock() and the MAP_POPULATE / MAP_LOCKED mmap + * flags. VMAs must be already marked with the desired vm_flags, and + * mmap_sem must not be held. + */ +int __mm_populate(unsigned long start, unsigned long len, int ignore_errors) +{ + struct mm_struct *mm = current->mm; + unsigned long end, nstart, nend; + struct vm_area_struct *vma = NULL; + int locked = 0; + long ret = 0; + + VM_BUG_ON(start & ~PAGE_MASK); + VM_BUG_ON(len != PAGE_ALIGN(len)); + end = start + len; + + for (nstart = start; nstart < end; nstart = nend) { + /* + * We want to fault in pages for [nstart; end) address range. + * Find first corresponding VMA. + */ + if (!locked) { + locked = 1; + down_read(&mm->mmap_sem); + vma = find_vma(mm, nstart); + } else if (nstart >= vma->vm_end) + vma = vma->vm_next; + if (!vma || vma->vm_start >= end) + break; + /* + * Set [nstart; nend) to intersection of desired address + * range with the first VMA. Also, skip undesirable VMA types. + */ + nend = min(end, vma->vm_end); + if (vma->vm_flags & (VM_IO | VM_PFNMAP)) + continue; + if (nstart < vma->vm_start) + nstart = vma->vm_start; + /* + * Now fault in a range of pages. populate_vma_page_range() + * double checks the vma flags, so that it won't mlock pages + * if the vma was already munlocked. + */ + ret = populate_vma_page_range(vma, nstart, nend, &locked); + if (ret < 0) { + if (ignore_errors) { + ret = 0; + continue; /* continue at next VMA */ + } + break; + } + nend = nstart + ret * PAGE_SIZE; + ret = 0; + } + if (locked) + up_read(&mm->mmap_sem); + return ret; /* 0 or negative error code */ +} + +/** * get_dump_page() - pin user page in memory while writing it to core dump * @addr: user address * diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 6817b0350c71..3afb5cbe1312 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1231,7 +1231,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, pmd, _pmd, 1)) update_mmu_cache_pmd(vma, addr, pmd); } - if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { + if ((flags & FOLL_POPULATE) && (vma->vm_flags & VM_LOCKED)) { if (page->mapping && trylock_page(page)) { lru_add_drain(); if (page->mapping) @@ -2109,7 +2109,7 @@ static void release_pte_pages(pte_t *pte, pte_t *_pte) { while (--_pte >= pte) { pte_t pteval = *_pte; - if (!pte_none(pteval)) + if (!pte_none(pteval) && !is_zero_pfn(pte_pfn(pteval))) release_pte_page(pte_page(pteval)); } } @@ -2120,13 +2120,13 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, { struct page *page; pte_t *_pte; - int none = 0; + int none_or_zero = 0; bool referenced = false, writable = false; for (_pte = pte; _pte < pte+HPAGE_PMD_NR; _pte++, address += PAGE_SIZE) { pte_t pteval = *_pte; - if (pte_none(pteval)) { - if (++none <= khugepaged_max_ptes_none) + if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { + if (++none_or_zero <= khugepaged_max_ptes_none) continue; else goto out; @@ -2207,9 +2207,21 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page, pte_t pteval = *_pte; struct page *src_page; - if (pte_none(pteval)) { + if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { clear_user_highpage(page, address); add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1); + if (is_zero_pfn(pte_pfn(pteval))) { + /* + * ptl mostly unnecessary. + */ + spin_lock(ptl); + /* + * paravirt calls inside pte_clear here are + * superfluous. + */ + pte_clear(vma->vm_mm, address, _pte); + spin_unlock(ptl); + } } else { src_page = pte_page(pteval); copy_user_highpage(page, src_page, address, vma); @@ -2316,8 +2328,14 @@ static struct page struct vm_area_struct *vma, unsigned long address, int node) { + gfp_t flags; + VM_BUG_ON_PAGE(*hpage, *hpage); + /* Only allocate from the target node */ + flags = alloc_hugepage_gfpmask(khugepaged_defrag(), __GFP_OTHER_NODE) | + __GFP_THISNODE; + /* * Before allocating the hugepage, release the mmap_sem read lock. * The allocation can take potentially a long time if it involves @@ -2326,8 +2344,7 @@ static struct page */ up_read(&mm->mmap_sem); - *hpage = alloc_pages_exact_node(node, alloc_hugepage_gfpmask( - khugepaged_defrag(), __GFP_OTHER_NODE), HPAGE_PMD_ORDER); + *hpage = alloc_pages_exact_node(node, flags, HPAGE_PMD_ORDER); if (unlikely(!*hpage)) { count_vm_event(THP_COLLAPSE_ALLOC_FAILED); *hpage = ERR_PTR(-ENOMEM); @@ -2543,7 +2560,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, { pmd_t *pmd; pte_t *pte, *_pte; - int ret = 0, none = 0; + int ret = 0, none_or_zero = 0; struct page *page; unsigned long _address; spinlock_t *ptl; @@ -2561,8 +2578,8 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR; _pte++, _address += PAGE_SIZE) { pte_t pteval = *_pte; - if (pte_none(pteval)) { - if (++none <= khugepaged_max_ptes_none) + if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { + if (++none_or_zero <= khugepaged_max_ptes_none) continue; else goto out_unmap; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index c41b2a0ee273..8874c8ad55aa 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3278,6 +3278,15 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, struct page *page; /* + * If we have a pending SIGKILL, don't keep faulting pages and + * potentially allocating memory. + */ + if (unlikely(fatal_signal_pending(current))) { + remainder = 0; + break; + } + + /* * Some archs (sparc64, sh*) have multiple pte_ts to * each hugepage. We have to make sure we get the * first, for the page indexing below to work. @@ -3735,8 +3744,7 @@ retry: if (!pmd_huge(*pmd)) goto out; if (pmd_present(*pmd)) { - page = pte_page(*(pte_t *)pmd) + - ((address & ~PMD_MASK) >> PAGE_SHIFT); + page = pmd_page(*pmd) + ((address & ~PMD_MASK) >> PAGE_SHIFT); if (flags & FOLL_GET) get_page(page); } else { diff --git a/mm/internal.h b/mm/internal.h index a96da5b0029d..edaab69a9c35 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -200,6 +200,8 @@ isolate_freepages_range(struct compact_control *cc, unsigned long isolate_migratepages_range(struct compact_control *cc, unsigned long low_pfn, unsigned long end_pfn); +int find_suitable_fallback(struct free_area *area, unsigned int order, + int migratetype, bool only_stealable, bool *can_steal); #endif @@ -240,7 +242,7 @@ void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma, struct vm_area_struct *prev, struct rb_node *rb_parent); #ifdef CONFIG_MMU -extern long __mlock_vma_pages_range(struct vm_area_struct *vma, +extern long populate_vma_page_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, int *nonblocking); extern void munlock_vma_pages_range(struct vm_area_struct *vma, unsigned long start, unsigned long end); diff --git a/mm/memblock.c b/mm/memblock.c index 252b77bdf65e..3f37a0bca5d5 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -699,14 +699,14 @@ static int __init_memblock memblock_reserve_region(phys_addr_t base, int nid, unsigned long flags) { - struct memblock_type *_rgn = &memblock.reserved; + struct memblock_type *type = &memblock.reserved; memblock_dbg("memblock_reserve: [%#016llx-%#016llx] flags %#02lx %pF\n", (unsigned long long)base, (unsigned long long)base + size - 1, flags, (void *)_RET_IP_); - return memblock_add_range(_rgn, base, size, nid, flags); + return memblock_add_range(type, base, size, nid, flags); } int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index b34ef4a32a3b..c3f09b2dda5f 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -14,6 +14,12 @@ * Copyright (C) 2012 Parallels Inc. and Google Inc. * Authors: Glauber Costa and Suleiman Souhlal * + * Native page reclaim + * Charge lifetime sanitation + * Lockless page tracking & accounting + * Unified hierarchy configuration model + * Copyright (C) 2015 Red Hat, Inc., Johannes Weiner + * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or @@ -1436,15 +1442,17 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) struct mem_cgroup *iter; unsigned int i; - if (!p) - return; - mutex_lock(&oom_info_lock); rcu_read_lock(); - pr_info("Task in "); - pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id)); - pr_cont(" killed as a result of limit of "); + if (p) { + pr_info("Task in "); + pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id)); + pr_cont(" killed as a result of limit of "); + } else { + pr_info("Memory limit reached of cgroup "); + } + pr_cont_cgroup_path(memcg->css.cgroup); pr_cont("\n"); @@ -1531,7 +1539,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, return; } - check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL); + check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL, memcg); totalpages = mem_cgroup_get_limit(memcg) ? : 1; for_each_mem_cgroup_tree(iter, memcg) { struct css_task_iter it; @@ -2779,92 +2787,6 @@ void mem_cgroup_split_huge_fixup(struct page *head) } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ -/** - * mem_cgroup_move_account - move account of the page - * @page: the page - * @nr_pages: number of regular pages (>1 for huge pages) - * @from: mem_cgroup which the page is moved from. - * @to: mem_cgroup which the page is moved to. @from != @to. - * - * The caller must confirm following. - * - page is not on LRU (isolate_page() is useful.) - * - compound_lock is held when nr_pages > 1 - * - * This function doesn't do "charge" to new cgroup and doesn't do "uncharge" - * from old cgroup. - */ -static int mem_cgroup_move_account(struct page *page, - unsigned int nr_pages, - struct mem_cgroup *from, - struct mem_cgroup *to) -{ - unsigned long flags; - int ret; - - VM_BUG_ON(from == to); - VM_BUG_ON_PAGE(PageLRU(page), page); - /* - * The page is isolated from LRU. So, collapse function - * will not handle this page. But page splitting can happen. - * Do this check under compound_page_lock(). The caller should - * hold it. - */ - ret = -EBUSY; - if (nr_pages > 1 && !PageTransHuge(page)) - goto out; - - /* - * Prevent mem_cgroup_migrate() from looking at page->mem_cgroup - * of its source page while we change it: page migration takes - * both pages off the LRU, but page cache replacement doesn't. - */ - if (!trylock_page(page)) - goto out; - - ret = -EINVAL; - if (page->mem_cgroup != from) - goto out_unlock; - - spin_lock_irqsave(&from->move_lock, flags); - - if (!PageAnon(page) && page_mapped(page)) { - __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], - nr_pages); - __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], - nr_pages); - } - - if (PageWriteback(page)) { - __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_WRITEBACK], - nr_pages); - __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_WRITEBACK], - nr_pages); - } - - /* - * It is safe to change page->mem_cgroup here because the page - * is referenced, charged, and isolated - we can't race with - * uncharging, charging, migration, or LRU putback. - */ - - /* caller should have done css_get */ - page->mem_cgroup = to; - spin_unlock_irqrestore(&from->move_lock, flags); - - ret = 0; - - local_irq_disable(); - mem_cgroup_charge_statistics(to, page, nr_pages); - memcg_check_events(to, page); - mem_cgroup_charge_statistics(from, page, -nr_pages); - memcg_check_events(from, page); - local_irq_enable(); -out_unlock: - unlock_page(page); -out: - return ret; -} - #ifdef CONFIG_MEMCG_SWAP static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg, bool charge) @@ -4816,6 +4738,92 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma, return page; } +/** + * mem_cgroup_move_account - move account of the page + * @page: the page + * @nr_pages: number of regular pages (>1 for huge pages) + * @from: mem_cgroup which the page is moved from. + * @to: mem_cgroup which the page is moved to. @from != @to. + * + * The caller must confirm following. + * - page is not on LRU (isolate_page() is useful.) + * - compound_lock is held when nr_pages > 1 + * + * This function doesn't do "charge" to new cgroup and doesn't do "uncharge" + * from old cgroup. + */ +static int mem_cgroup_move_account(struct page *page, + unsigned int nr_pages, + struct mem_cgroup *from, + struct mem_cgroup *to) +{ + unsigned long flags; + int ret; + + VM_BUG_ON(from == to); + VM_BUG_ON_PAGE(PageLRU(page), page); + /* + * The page is isolated from LRU. So, collapse function + * will not handle this page. But page splitting can happen. + * Do this check under compound_page_lock(). The caller should + * hold it. + */ + ret = -EBUSY; + if (nr_pages > 1 && !PageTransHuge(page)) + goto out; + + /* + * Prevent mem_cgroup_migrate() from looking at page->mem_cgroup + * of its source page while we change it: page migration takes + * both pages off the LRU, but page cache replacement doesn't. + */ + if (!trylock_page(page)) + goto out; + + ret = -EINVAL; + if (page->mem_cgroup != from) + goto out_unlock; + + spin_lock_irqsave(&from->move_lock, flags); + + if (!PageAnon(page) && page_mapped(page)) { + __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], + nr_pages); + __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], + nr_pages); + } + + if (PageWriteback(page)) { + __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_WRITEBACK], + nr_pages); + __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_WRITEBACK], + nr_pages); + } + + /* + * It is safe to change page->mem_cgroup here because the page + * is referenced, charged, and isolated - we can't race with + * uncharging, charging, migration, or LRU putback. + */ + + /* caller should have done css_get */ + page->mem_cgroup = to; + spin_unlock_irqrestore(&from->move_lock, flags); + + ret = 0; + + local_irq_disable(); + mem_cgroup_charge_statistics(to, page, nr_pages); + memcg_check_events(to, page); + mem_cgroup_charge_statistics(from, page, -nr_pages); + memcg_check_events(from, page); + local_irq_enable(); +out_unlock: + unlock_page(page); +out: + return ret; +} + static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, unsigned long addr, pte_t ptent, union mc_target *target) { diff --git a/mm/memory.c b/mm/memory.c index 97839f5c8c30..ac20b2a6a0c3 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1983,167 +1983,91 @@ static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page, } /* - * This routine handles present pages, when users try to write - * to a shared page. It is done by copying the page to a new address - * and decrementing the shared-page counter for the old page. - * - * Note that this routine assumes that the protection checks have been - * done by the caller (the low-level page fault routine in most cases). - * Thus we can safely just mark it writable once we've done any necessary - * COW. + * Handle write page faults for pages that can be reused in the current vma * - * We also mark the page dirty at this point even though the page will - * change only once the write actually happens. This avoids a few races, - * and potentially makes it more efficient. - * - * We enter with non-exclusive mmap_sem (to exclude vma changes, - * but allow concurrent faults), with pte both mapped and locked. - * We return with mmap_sem still held, but pte unmapped and unlocked. + * This can happen either due to the mapping being with the VM_SHARED flag, + * or due to us being the last reference standing to the page. In either + * case, all we need to do here is to mark the page as writable and update + * any related book-keeping. */ -static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, pte_t *page_table, pmd_t *pmd, - spinlock_t *ptl, pte_t orig_pte) +static inline int wp_page_reuse(struct mm_struct *mm, + struct vm_area_struct *vma, unsigned long address, + pte_t *page_table, spinlock_t *ptl, pte_t orig_pte, + struct page *page, int page_mkwrite, + int dirty_shared) __releases(ptl) { - struct page *old_page, *new_page = NULL; pte_t entry; - int ret = 0; - int page_mkwrite = 0; - bool dirty_shared = false; - unsigned long mmun_start = 0; /* For mmu_notifiers */ - unsigned long mmun_end = 0; /* For mmu_notifiers */ - struct mem_cgroup *memcg; - - old_page = vm_normal_page(vma, address, orig_pte); - if (!old_page) { - /* - * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a - * VM_PFNMAP VMA. - * - * We should not cow pages in a shared writeable mapping. - * Just mark the pages writable as we can't do any dirty - * accounting on raw pfn maps. - */ - if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) == - (VM_WRITE|VM_SHARED)) - goto reuse; - goto gotten; - } - /* - * Take out anonymous pages first, anonymous shared vmas are - * not dirty accountable. + * Clear the pages cpupid information as the existing + * information potentially belongs to a now completely + * unrelated process. */ - if (PageAnon(old_page) && !PageKsm(old_page)) { - if (!trylock_page(old_page)) { - page_cache_get(old_page); - pte_unmap_unlock(page_table, ptl); - lock_page(old_page); - page_table = pte_offset_map_lock(mm, pmd, address, - &ptl); - if (!pte_same(*page_table, orig_pte)) { - unlock_page(old_page); - goto unlock; - } - page_cache_release(old_page); - } - if (reuse_swap_page(old_page)) { - /* - * The page is all ours. Move it to our anon_vma so - * the rmap code will not search our parent or siblings. - * Protected against the rmap code by the page lock. - */ - page_move_anon_rmap(old_page, vma, address); - unlock_page(old_page); - goto reuse; - } - unlock_page(old_page); - } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == - (VM_WRITE|VM_SHARED))) { - page_cache_get(old_page); - /* - * Only catch write-faults on shared writable pages, - * read-only shared pages can get COWed by - * get_user_pages(.write=1, .force=1). - */ - if (vma->vm_ops && vma->vm_ops->page_mkwrite) { - int tmp; - - pte_unmap_unlock(page_table, ptl); - tmp = do_page_mkwrite(vma, old_page, address); - if (unlikely(!tmp || (tmp & - (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { - page_cache_release(old_page); - return tmp; - } - /* - * Since we dropped the lock we need to revalidate - * the PTE as someone else may have changed it. If - * they did, we just return, as we can count on the - * MMU to tell us if they didn't also make it writable. - */ - page_table = pte_offset_map_lock(mm, pmd, address, - &ptl); - if (!pte_same(*page_table, orig_pte)) { - unlock_page(old_page); - goto unlock; - } - page_mkwrite = 1; - } - - dirty_shared = true; - -reuse: - /* - * Clear the pages cpupid information as the existing - * information potentially belongs to a now completely - * unrelated process. - */ - if (old_page) - page_cpupid_xchg_last(old_page, (1 << LAST_CPUPID_SHIFT) - 1); - - flush_cache_page(vma, address, pte_pfn(orig_pte)); - entry = pte_mkyoung(orig_pte); - entry = maybe_mkwrite(pte_mkdirty(entry), vma); - if (ptep_set_access_flags(vma, address, page_table, entry,1)) - update_mmu_cache(vma, address, page_table); - pte_unmap_unlock(page_table, ptl); - ret |= VM_FAULT_WRITE; + if (page) + page_cpupid_xchg_last(page, (1 << LAST_CPUPID_SHIFT) - 1); - if (dirty_shared) { - struct address_space *mapping; - int dirtied; + flush_cache_page(vma, address, pte_pfn(orig_pte)); + entry = pte_mkyoung(orig_pte); + entry = maybe_mkwrite(pte_mkdirty(entry), vma); + if (ptep_set_access_flags(vma, address, page_table, entry, 1)) + update_mmu_cache(vma, address, page_table); + pte_unmap_unlock(page_table, ptl); - if (!page_mkwrite) - lock_page(old_page); + if (dirty_shared) { + struct address_space *mapping; + int dirtied; - dirtied = set_page_dirty(old_page); - VM_BUG_ON_PAGE(PageAnon(old_page), old_page); - mapping = old_page->mapping; - unlock_page(old_page); - page_cache_release(old_page); + if (!page_mkwrite) + lock_page(page); - if ((dirtied || page_mkwrite) && mapping) { - /* - * Some device drivers do not set page.mapping - * but still dirty their pages - */ - balance_dirty_pages_ratelimited(mapping); - } + dirtied = set_page_dirty(page); + VM_BUG_ON_PAGE(PageAnon(page), page); + mapping = page->mapping; + unlock_page(page); + page_cache_release(page); - if (!page_mkwrite) - file_update_time(vma->vm_file); + if ((dirtied || page_mkwrite) && mapping) { + /* + * Some device drivers do not set page.mapping + * but still dirty their pages + */ + balance_dirty_pages_ratelimited(mapping); } - return ret; + if (!page_mkwrite) + file_update_time(vma->vm_file); } - /* - * Ok, we need to copy. Oh, well.. - */ - page_cache_get(old_page); -gotten: - pte_unmap_unlock(page_table, ptl); + return VM_FAULT_WRITE; +} + +/* + * Handle the case of a page which we actually need to copy to a new page. + * + * Called with mmap_sem locked and the old page referenced, but + * without the ptl held. + * + * High level logic flow: + * + * - Allocate a page, copy the content of the old page to the new one. + * - Handle book keeping and accounting - cgroups, mmu-notifiers, etc. + * - Take the PTL. If the pte changed, bail out and release the allocated page + * - If the pte is still the way we remember it, update the page table and all + * relevant references. This includes dropping the reference the page-table + * held to the old page, as well as updating the rmap. + * - In any case, unlock the PTL and drop the reference we took to the old page. + */ +static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pte_t *page_table, pmd_t *pmd, + pte_t orig_pte, struct page *old_page) +{ + struct page *new_page = NULL; + spinlock_t *ptl = NULL; + pte_t entry; + int page_copied = 0; + const unsigned long mmun_start = address & PAGE_MASK; /* For mmu_notifiers */ + const unsigned long mmun_end = mmun_start + PAGE_SIZE; /* For mmu_notifiers */ + struct mem_cgroup *memcg; if (unlikely(anon_vma_prepare(vma))) goto oom; @@ -2163,8 +2087,6 @@ gotten: if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg)) goto oom_free_new; - mmun_start = address & PAGE_MASK; - mmun_end = mmun_start + PAGE_SIZE; mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); /* @@ -2177,8 +2099,9 @@ gotten: dec_mm_counter_fast(mm, MM_FILEPAGES); inc_mm_counter_fast(mm, MM_ANONPAGES); } - } else + } else { inc_mm_counter_fast(mm, MM_ANONPAGES); + } flush_cache_page(vma, address, pte_pfn(orig_pte)); entry = mk_pte(new_page, vma->vm_page_prot); entry = maybe_mkwrite(pte_mkdirty(entry), vma); @@ -2227,29 +2150,29 @@ gotten: /* Free the old page.. */ new_page = old_page; - ret |= VM_FAULT_WRITE; - } else + page_copied = 1; + } else { mem_cgroup_cancel_charge(new_page, memcg); + } if (new_page) page_cache_release(new_page); -unlock: + pte_unmap_unlock(page_table, ptl); - if (mmun_end > mmun_start) - mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); + mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); if (old_page) { /* * Don't let another task, with possibly unlocked vma, * keep the mlocked page. */ - if ((ret & VM_FAULT_WRITE) && (vma->vm_flags & VM_LOCKED)) { + if (page_copied && (vma->vm_flags & VM_LOCKED)) { lock_page(old_page); /* LRU manipulation */ munlock_vma_page(old_page); unlock_page(old_page); } page_cache_release(old_page); } - return ret; + return page_copied ? VM_FAULT_WRITE : 0; oom_free_new: page_cache_release(new_page); oom: @@ -2258,6 +2181,144 @@ oom: return VM_FAULT_OOM; } +static int wp_page_shared(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pte_t *page_table, + pmd_t *pmd, spinlock_t *ptl, pte_t orig_pte, + struct page *old_page) + __releases(ptl) +{ + int page_mkwrite = 0; + + page_cache_get(old_page); + + /* + * Only catch write-faults on shared writable pages, + * read-only shared pages can get COWed by + * get_user_pages(.write=1, .force=1). + */ + if (vma->vm_ops && vma->vm_ops->page_mkwrite) { + int tmp; + + pte_unmap_unlock(page_table, ptl); + tmp = do_page_mkwrite(vma, old_page, address); + if (unlikely(!tmp || (tmp & + (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { + page_cache_release(old_page); + return tmp; + } + /* + * Since we dropped the lock we need to revalidate + * the PTE as someone else may have changed it. If + * they did, we just return, as we can count on the + * MMU to tell us if they didn't also make it writable. + */ + page_table = pte_offset_map_lock(mm, pmd, address, + &ptl); + if (!pte_same(*page_table, orig_pte)) { + unlock_page(old_page); + pte_unmap_unlock(page_table, ptl); + page_cache_release(old_page); + return 0; + } + page_mkwrite = 1; + } + + return wp_page_reuse(mm, vma, address, page_table, ptl, + orig_pte, old_page, page_mkwrite, 1); +} + +/* + * This routine handles present pages, when users try to write + * to a shared page. It is done by copying the page to a new address + * and decrementing the shared-page counter for the old page. + * + * Note that this routine assumes that the protection checks have been + * done by the caller (the low-level page fault routine in most cases). + * Thus we can safely just mark it writable once we've done any necessary + * COW. + * + * We also mark the page dirty at this point even though the page will + * change only once the write actually happens. This avoids a few races, + * and potentially makes it more efficient. + * + * We enter with non-exclusive mmap_sem (to exclude vma changes, + * but allow concurrent faults), with pte both mapped and locked. + * We return with mmap_sem still held, but pte unmapped and unlocked. + */ +static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pte_t *page_table, pmd_t *pmd, + spinlock_t *ptl, pte_t orig_pte) + __releases(ptl) +{ + struct page *old_page; + + old_page = vm_normal_page(vma, address, orig_pte); + if (!old_page) { + /* + * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a + * VM_PFNMAP VMA. + * + * We should not cow pages in a shared writeable mapping. + * Just mark the pages writable as we can't do any dirty + * accounting on raw pfn maps. + */ + if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) == + (VM_WRITE|VM_SHARED)) + return wp_page_reuse(mm, vma, address, page_table, ptl, + orig_pte, old_page, 0, 0); + + pte_unmap_unlock(page_table, ptl); + return wp_page_copy(mm, vma, address, page_table, pmd, + orig_pte, old_page); + } + + /* + * Take out anonymous pages first, anonymous shared vmas are + * not dirty accountable. + */ + if (PageAnon(old_page) && !PageKsm(old_page)) { + if (!trylock_page(old_page)) { + page_cache_get(old_page); + pte_unmap_unlock(page_table, ptl); + lock_page(old_page); + page_table = pte_offset_map_lock(mm, pmd, address, + &ptl); + if (!pte_same(*page_table, orig_pte)) { + unlock_page(old_page); + pte_unmap_unlock(page_table, ptl); + page_cache_release(old_page); + return 0; + } + page_cache_release(old_page); + } + if (reuse_swap_page(old_page)) { + /* + * The page is all ours. Move it to our anon_vma so + * the rmap code will not search our parent or siblings. + * Protected against the rmap code by the page lock. + */ + page_move_anon_rmap(old_page, vma, address); + unlock_page(old_page); + return wp_page_reuse(mm, vma, address, page_table, ptl, + orig_pte, old_page, 0, 0); + } + unlock_page(old_page); + } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == + (VM_WRITE|VM_SHARED))) { + return wp_page_shared(mm, vma, address, page_table, pmd, + ptl, orig_pte, old_page); + } + + /* + * Ok, we need to copy. Oh, well.. + */ + page_cache_get(old_page); + + pte_unmap_unlock(page_table, ptl); + return wp_page_copy(mm, vma, address, page_table, pmd, + orig_pte, old_page); +} + static void unmap_mapping_range_vma(struct vm_area_struct *vma, unsigned long start_addr, unsigned long end_addr, struct zap_details *details) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 65842d688b7c..e2e8014fb755 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -104,7 +104,7 @@ void put_online_mems(void) } -static void mem_hotplug_begin(void) +void mem_hotplug_begin(void) { mem_hotplug.active_writer = current; @@ -119,7 +119,7 @@ static void mem_hotplug_begin(void) } } -static void mem_hotplug_done(void) +void mem_hotplug_done(void) { mem_hotplug.active_writer = NULL; mutex_unlock(&mem_hotplug.lock); @@ -502,7 +502,7 @@ int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn, end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1); for (i = start_sec; i <= end_sec; i++) { - err = __add_section(nid, zone, i << PFN_SECTION_SHIFT); + err = __add_section(nid, zone, section_nr_to_pfn(i)); /* * EEXIST is finally dealt with by ioresource collision @@ -959,6 +959,7 @@ static void node_states_set_node(int node, struct memory_notify *arg) } +/* Must be protected by mem_hotplug_begin() */ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type) { unsigned long flags; @@ -969,7 +970,6 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ int ret; struct memory_notify arg; - mem_hotplug_begin(); /* * This doesn't need a lock to do pfn_to_page(). * The section can't be removed here because of the @@ -977,21 +977,20 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ */ zone = page_zone(pfn_to_page(pfn)); - ret = -EINVAL; if ((zone_idx(zone) > ZONE_NORMAL || online_type == MMOP_ONLINE_MOVABLE) && !can_online_high_movable(zone)) - goto out; + return -EINVAL; if (online_type == MMOP_ONLINE_KERNEL && zone_idx(zone) == ZONE_MOVABLE) { if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages)) - goto out; + return -EINVAL; } if (online_type == MMOP_ONLINE_MOVABLE && zone_idx(zone) == ZONE_MOVABLE - 1) { if (move_pfn_range_right(zone, zone + 1, pfn, pfn + nr_pages)) - goto out; + return -EINVAL; } /* Previous code may changed the zone of the pfn range */ @@ -1007,7 +1006,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ ret = notifier_to_errno(ret); if (ret) { memory_notify(MEM_CANCEL_ONLINE, &arg); - goto out; + return ret; } /* * If this zone is not populated, then it is not in zonelist. @@ -1031,7 +1030,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ (((unsigned long long) pfn + nr_pages) << PAGE_SHIFT) - 1); memory_notify(MEM_CANCEL_ONLINE, &arg); - goto out; + return ret; } zone->present_pages += onlined_pages; @@ -1061,9 +1060,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ if (onlined_pages) memory_notify(MEM_ONLINE, &arg); -out: - mem_hotplug_done(); - return ret; + return 0; } #endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */ @@ -1688,21 +1685,18 @@ static int __ref __offline_pages(unsigned long start_pfn, if (!test_pages_in_a_zone(start_pfn, end_pfn)) return -EINVAL; - mem_hotplug_begin(); - zone = page_zone(pfn_to_page(start_pfn)); node = zone_to_nid(zone); nr_pages = end_pfn - start_pfn; - ret = -EINVAL; if (zone_idx(zone) <= ZONE_NORMAL && !can_offline_normal(zone, nr_pages)) - goto out; + return -EINVAL; /* set above range as isolated */ ret = start_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE, true); if (ret) - goto out; + return ret; arg.start_pfn = start_pfn; arg.nr_pages = nr_pages; @@ -1795,7 +1789,6 @@ repeat: writeback_set_ratelimit(); memory_notify(MEM_OFFLINE, &arg); - mem_hotplug_done(); return 0; failed_removal: @@ -1805,12 +1798,10 @@ failed_removal: memory_notify(MEM_CANCEL_OFFLINE, &arg); /* pushback to free area */ undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); - -out: - mem_hotplug_done(); return ret; } +/* Must be protected by mem_hotplug_begin() */ int offline_pages(unsigned long start_pfn, unsigned long nr_pages) { return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 4721046a134a..ede26291d4aa 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -945,7 +945,8 @@ static struct page *new_node_page(struct page *page, unsigned long node, int **x return alloc_huge_page_node(page_hstate(compound_head(page)), node); else - return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0); + return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE | + __GFP_THISNODE, 0); } /* @@ -1985,7 +1986,8 @@ retry_cpuset: nmask = policy_nodemask(gfp, pol); if (!nmask || node_isset(node, *nmask)) { mpol_cond_put(pol); - page = alloc_pages_exact_node(node, gfp, order); + page = alloc_pages_exact_node(node, + gfp | __GFP_THISNODE, order); goto out; } } diff --git a/mm/mempool.c b/mm/mempool.c index e209c98c7203..949970db2874 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -113,23 +113,24 @@ EXPORT_SYMBOL(mempool_create_node); * mempool_create(). * @new_min_nr: the new minimum number of elements guaranteed to be * allocated for this pool. - * @gfp_mask: the usual allocation bitmask. * * This function shrinks/grows the pool. In the case of growing, * it cannot be guaranteed that the pool will be grown to the new * size immediately, but new mempool_free() calls will refill it. + * This function may sleep. * * Note, the caller must guarantee that no mempool_destroy is called * while this function is running. mempool_alloc() & mempool_free() * might be called (eg. from IRQ contexts) while this function executes. */ -int mempool_resize(mempool_t *pool, int new_min_nr, gfp_t gfp_mask) +int mempool_resize(mempool_t *pool, int new_min_nr) { void *element; void **new_elements; unsigned long flags; BUG_ON(new_min_nr <= 0); + might_sleep(); spin_lock_irqsave(&pool->lock, flags); if (new_min_nr <= pool->min_nr) { @@ -145,7 +146,8 @@ int mempool_resize(mempool_t *pool, int new_min_nr, gfp_t gfp_mask) spin_unlock_irqrestore(&pool->lock, flags); /* Grow the pool */ - new_elements = kmalloc(new_min_nr * sizeof(*new_elements), gfp_mask); + new_elements = kmalloc_array(new_min_nr, sizeof(*new_elements), + GFP_KERNEL); if (!new_elements) return -ENOMEM; @@ -164,7 +166,7 @@ int mempool_resize(mempool_t *pool, int new_min_nr, gfp_t gfp_mask) while (pool->curr_nr < pool->min_nr) { spin_unlock_irqrestore(&pool->lock, flags); - element = pool->alloc(gfp_mask, pool->pool_data); + element = pool->alloc(GFP_KERNEL, pool->pool_data); if (!element) goto out; spin_lock_irqsave(&pool->lock, flags); diff --git a/mm/memtest.c b/mm/memtest.c new file mode 100644 index 000000000000..1997d934b13b --- /dev/null +++ b/mm/memtest.c @@ -0,0 +1,118 @@ +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/string.h> +#include <linux/types.h> +#include <linux/mm.h> +#include <linux/smp.h> +#include <linux/init.h> +#include <linux/pfn.h> +#include <linux/memblock.h> + +static u64 patterns[] __initdata = { + /* The first entry has to be 0 to leave memtest with zeroed memory */ + 0, + 0xffffffffffffffffULL, + 0x5555555555555555ULL, + 0xaaaaaaaaaaaaaaaaULL, + 0x1111111111111111ULL, + 0x2222222222222222ULL, + 0x4444444444444444ULL, + 0x8888888888888888ULL, + 0x3333333333333333ULL, + 0x6666666666666666ULL, + 0x9999999999999999ULL, + 0xccccccccccccccccULL, + 0x7777777777777777ULL, + 0xbbbbbbbbbbbbbbbbULL, + 0xddddddddddddddddULL, + 0xeeeeeeeeeeeeeeeeULL, + 0x7a6c7258554e494cULL, /* yeah ;-) */ +}; + +static void __init reserve_bad_mem(u64 pattern, phys_addr_t start_bad, phys_addr_t end_bad) +{ + printk(KERN_INFO " %016llx bad mem addr %010llx - %010llx reserved\n", + (unsigned long long) pattern, + (unsigned long long) start_bad, + (unsigned long long) end_bad); + memblock_reserve(start_bad, end_bad - start_bad); +} + +static void __init memtest(u64 pattern, phys_addr_t start_phys, phys_addr_t size) +{ + u64 *p, *start, *end; + phys_addr_t start_bad, last_bad; + phys_addr_t start_phys_aligned; + const size_t incr = sizeof(pattern); + + start_phys_aligned = ALIGN(start_phys, incr); + start = __va(start_phys_aligned); + end = start + (size - (start_phys_aligned - start_phys)) / incr; + start_bad = 0; + last_bad = 0; + + for (p = start; p < end; p++) + *p = pattern; + + for (p = start; p < end; p++, start_phys_aligned += incr) { + if (*p == pattern) + continue; + if (start_phys_aligned == last_bad + incr) { + last_bad += incr; + continue; + } + if (start_bad) + reserve_bad_mem(pattern, start_bad, last_bad + incr); + start_bad = last_bad = start_phys_aligned; + } + if (start_bad) + reserve_bad_mem(pattern, start_bad, last_bad + incr); +} + +static void __init do_one_pass(u64 pattern, phys_addr_t start, phys_addr_t end) +{ + u64 i; + phys_addr_t this_start, this_end; + + for_each_free_mem_range(i, NUMA_NO_NODE, &this_start, &this_end, NULL) { + this_start = clamp(this_start, start, end); + this_end = clamp(this_end, start, end); + if (this_start < this_end) { + printk(KERN_INFO " %010llx - %010llx pattern %016llx\n", + (unsigned long long)this_start, + (unsigned long long)this_end, + (unsigned long long)cpu_to_be64(pattern)); + memtest(pattern, this_start, this_end - this_start); + } + } +} + +/* default is disabled */ +static int memtest_pattern __initdata; + +static int __init parse_memtest(char *arg) +{ + if (arg) + memtest_pattern = simple_strtoul(arg, NULL, 0); + else + memtest_pattern = ARRAY_SIZE(patterns); + + return 0; +} + +early_param("memtest", parse_memtest); + +void __init early_memtest(phys_addr_t start, phys_addr_t end) +{ + unsigned int i; + unsigned int idx = 0; + + if (!memtest_pattern) + return; + + printk(KERN_INFO "early_memtest: # of tests: %d\n", memtest_pattern); + for (i = memtest_pattern-1; i < UINT_MAX; --i) { + idx = i % ARRAY_SIZE(patterns); + do_one_pass(patterns[idx], start, end); + } +} diff --git a/mm/migrate.c b/mm/migrate.c index 85e042686031..a65ff72ab739 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -901,12 +901,23 @@ out: } /* + * gcc 4.7 and 4.8 on arm get an ICEs when inlining unmap_and_move(). Work + * around it. + */ +#if (GCC_VERSION >= 40700 && GCC_VERSION < 40900) && defined(CONFIG_ARM) +#define ICE_noinline noinline +#else +#define ICE_noinline +#endif + +/* * Obtain the lock on page, remove all ptes and migrate the page * to the newly allocated page in newpage. */ -static int unmap_and_move(new_page_t get_new_page, free_page_t put_new_page, - unsigned long private, struct page *page, int force, - enum migrate_mode mode) +static ICE_noinline int unmap_and_move(new_page_t get_new_page, + free_page_t put_new_page, + unsigned long private, struct page *page, + int force, enum migrate_mode mode) { int rc = 0; int *result = NULL; @@ -1554,30 +1565,10 @@ static struct page *alloc_misplaced_dst_page(struct page *page, * page migration rate limiting control. * Do not migrate more than @pages_to_migrate in a @migrate_interval_millisecs * window of time. Default here says do not migrate more than 1280M per second. - * If a node is rate-limited then PTE NUMA updates are also rate-limited. However - * as it is faults that reset the window, pte updates will happen unconditionally - * if there has not been a fault since @pteupdate_interval_millisecs after the - * throttle window closed. */ static unsigned int migrate_interval_millisecs __read_mostly = 100; -static unsigned int pteupdate_interval_millisecs __read_mostly = 1000; static unsigned int ratelimit_pages __read_mostly = 128 << (20 - PAGE_SHIFT); -/* Returns true if NUMA migration is currently rate limited */ -bool migrate_ratelimited(int node) -{ - pg_data_t *pgdat = NODE_DATA(node); - - if (time_after(jiffies, pgdat->numabalancing_migrate_next_window + - msecs_to_jiffies(pteupdate_interval_millisecs))) - return false; - - if (pgdat->numabalancing_migrate_nr_pages < ratelimit_pages) - return false; - - return true; -} - /* Returns true if the node is migrate rate-limited after the update */ static bool numamigrate_update_ratelimit(pg_data_t *pgdat, unsigned long nr_pages) diff --git a/mm/mlock.c b/mm/mlock.c index 8a54cd214925..6fd2cf15e868 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -205,62 +205,6 @@ out: return nr_pages - 1; } -/** - * __mlock_vma_pages_range() - mlock a range of pages in the vma. - * @vma: target vma - * @start: start address - * @end: end address - * @nonblocking: - * - * This takes care of making the pages present too. - * - * return 0 on success, negative error code on error. - * - * vma->vm_mm->mmap_sem must be held. - * - * If @nonblocking is NULL, it may be held for read or write and will - * be unperturbed. - * - * If @nonblocking is non-NULL, it must held for read only and may be - * released. If it's released, *@nonblocking will be set to 0. - */ -long __mlock_vma_pages_range(struct vm_area_struct *vma, - unsigned long start, unsigned long end, int *nonblocking) -{ - struct mm_struct *mm = vma->vm_mm; - unsigned long nr_pages = (end - start) / PAGE_SIZE; - int gup_flags; - - VM_BUG_ON(start & ~PAGE_MASK); - VM_BUG_ON(end & ~PAGE_MASK); - VM_BUG_ON_VMA(start < vma->vm_start, vma); - VM_BUG_ON_VMA(end > vma->vm_end, vma); - VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_sem), mm); - - gup_flags = FOLL_TOUCH | FOLL_MLOCK; - /* - * We want to touch writable mappings with a write fault in order - * to break COW, except for shared mappings because these don't COW - * and we would not want to dirty them for nothing. - */ - if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE) - gup_flags |= FOLL_WRITE; - - /* - * We want mlock to succeed for regions that have any permissions - * other than PROT_NONE. - */ - if (vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC)) - gup_flags |= FOLL_FORCE; - - /* - * We made sure addr is within a VMA, so the following will - * not result in a stack expansion that recurses back here. - */ - return __get_user_pages(current, mm, start, nr_pages, gup_flags, - NULL, NULL, nonblocking); -} - /* * convert get_user_pages() return value to posix mlock() error */ @@ -596,7 +540,7 @@ success: /* * vm_flags is protected by the mmap_sem held in write mode. * It's okay if try_to_unmap_one unmaps a page just after we - * set VM_LOCKED, __mlock_vma_pages_range will bring it back. + * set VM_LOCKED, populate_vma_page_range will bring it back. */ if (lock) @@ -660,69 +604,6 @@ static int do_mlock(unsigned long start, size_t len, int on) return error; } -/* - * __mm_populate - populate and/or mlock pages within a range of address space. - * - * This is used to implement mlock() and the MAP_POPULATE / MAP_LOCKED mmap - * flags. VMAs must be already marked with the desired vm_flags, and - * mmap_sem must not be held. - */ -int __mm_populate(unsigned long start, unsigned long len, int ignore_errors) -{ - struct mm_struct *mm = current->mm; - unsigned long end, nstart, nend; - struct vm_area_struct *vma = NULL; - int locked = 0; - long ret = 0; - - VM_BUG_ON(start & ~PAGE_MASK); - VM_BUG_ON(len != PAGE_ALIGN(len)); - end = start + len; - - for (nstart = start; nstart < end; nstart = nend) { - /* - * We want to fault in pages for [nstart; end) address range. - * Find first corresponding VMA. - */ - if (!locked) { - locked = 1; - down_read(&mm->mmap_sem); - vma = find_vma(mm, nstart); - } else if (nstart >= vma->vm_end) - vma = vma->vm_next; - if (!vma || vma->vm_start >= end) - break; - /* - * Set [nstart; nend) to intersection of desired address - * range with the first VMA. Also, skip undesirable VMA types. - */ - nend = min(end, vma->vm_end); - if (vma->vm_flags & (VM_IO | VM_PFNMAP)) - continue; - if (nstart < vma->vm_start) - nstart = vma->vm_start; - /* - * Now fault in a range of pages. __mlock_vma_pages_range() - * double checks the vma flags, so that it won't mlock pages - * if the vma was already munlocked. - */ - ret = __mlock_vma_pages_range(vma, nstart, nend, &locked); - if (ret < 0) { - if (ignore_errors) { - ret = 0; - continue; /* continue at next VMA */ - } - ret = __mlock_posix_error_return(ret); - break; - } - nend = nstart + ret * PAGE_SIZE; - ret = 0; - } - if (locked) - up_read(&mm->mmap_sem); - return ret; /* 0 or negative error code */ -} - SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) { unsigned long locked; @@ -750,9 +631,13 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) error = do_mlock(start, len, 1); up_write(¤t->mm->mmap_sem); - if (!error) - error = __mm_populate(start, len, 0); - return error; + if (error) + return error; + + error = __mm_populate(start, len, 0); + if (error) + return __mlock_posix_error_return(error); + return 0; } SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len) diff --git a/mm/mmap.c b/mm/mmap.c index 9ec50a368634..06a6076c92e5 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2316,7 +2316,7 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr) if (!prev || expand_stack(prev, addr)) return NULL; if (prev->vm_flags & VM_LOCKED) - __mlock_vma_pages_range(prev, addr, prev->vm_end, NULL); + populate_vma_page_range(prev, addr, prev->vm_end, NULL); return prev; } #else @@ -2351,7 +2351,7 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr) if (expand_stack(vma, addr)) return NULL; if (vma->vm_flags & VM_LOCKED) - __mlock_vma_pages_range(vma, addr, start, NULL); + populate_vma_page_range(vma, addr, start, NULL); return vma; } #endif diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 642f38cb175a..52628c819bf7 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -612,7 +612,8 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, * Determines whether the kernel must panic because of the panic_on_oom sysctl. */ void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask, - int order, const nodemask_t *nodemask) + int order, const nodemask_t *nodemask, + struct mem_cgroup *memcg) { if (likely(!sysctl_panic_on_oom)) return; @@ -625,7 +626,7 @@ void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask, if (constraint != CONSTRAINT_NONE) return; } - dump_header(NULL, gfp_mask, order, NULL, nodemask); + dump_header(NULL, gfp_mask, order, memcg, nodemask); panic("Out of memory: %s panic_on_oom is enabled\n", sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide"); } @@ -740,7 +741,7 @@ static void __out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, constraint = constrained_alloc(zonelist, gfp_mask, nodemask, &totalpages); mpol_mask = (constraint == CONSTRAINT_MEMORY_POLICY) ? nodemask : NULL; - check_panic_on_oom(constraint, gfp_mask, order, mpol_mask); + check_panic_on_oom(constraint, gfp_mask, order, mpol_mask, NULL); if (sysctl_oom_kill_allocating_task && current->mm && !oom_unkillable_task(current, NULL, nodemask) && diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 644bcb665773..0372411f38fc 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2111,6 +2111,25 @@ void account_page_dirtied(struct page *page, struct address_space *mapping) EXPORT_SYMBOL(account_page_dirtied); /* + * Helper function for deaccounting dirty page without writeback. + * + * Doing this should *normally* only ever be done when a page + * is truncated, and is not actually mapped anywhere at all. However, + * fs/buffer.c does this when it notices that somebody has cleaned + * out all the buffers on a page without actually doing it through + * the VM. Can you say "ext3 is horribly ugly"? Thought you could. + */ +void account_page_cleaned(struct page *page, struct address_space *mapping) +{ + if (mapping_cap_account_dirty(mapping)) { + dec_zone_page_state(page, NR_FILE_DIRTY); + dec_bdi_stat(inode_to_bdi(mapping->host), BDI_RECLAIMABLE); + task_io_account_cancelled_write(PAGE_CACHE_SIZE); + } +} +EXPORT_SYMBOL(account_page_cleaned); + +/* * For address_spaces which do not use buffers. Just tag the page as dirty in * its radix tree. * diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 40e29429e7b0..1b849500640c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1032,11 +1032,9 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, static int fallbacks[MIGRATE_TYPES][4] = { [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, + [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, #ifdef CONFIG_CMA - [MIGRATE_MOVABLE] = { MIGRATE_CMA, MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, [MIGRATE_CMA] = { MIGRATE_RESERVE }, /* Never used */ -#else - [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, #endif [MIGRATE_RESERVE] = { MIGRATE_RESERVE }, /* Never used */ #ifdef CONFIG_MEMORY_ISOLATION @@ -1044,6 +1042,17 @@ static int fallbacks[MIGRATE_TYPES][4] = { #endif }; +#ifdef CONFIG_CMA +static struct page *__rmqueue_cma_fallback(struct zone *zone, + unsigned int order) +{ + return __rmqueue_smallest(zone, order, MIGRATE_CMA); +} +#else +static inline struct page *__rmqueue_cma_fallback(struct zone *zone, + unsigned int order) { return NULL; } +#endif + /* * Move the free pages in a range to the free lists of the requested type. * Note that start_page and end_pages are not aligned on a pageblock @@ -1136,14 +1145,40 @@ static void change_pageblock_range(struct page *pageblock_page, * as fragmentation caused by those allocations polluting movable pageblocks * is worse than movable allocations stealing from unmovable and reclaimable * pageblocks. - * - * If we claim more than half of the pageblock, change pageblock's migratetype - * as well. */ -static void try_to_steal_freepages(struct zone *zone, struct page *page, - int start_type, int fallback_type) +static bool can_steal_fallback(unsigned int order, int start_mt) +{ + /* + * Leaving this order check is intended, although there is + * relaxed order check in next check. The reason is that + * we can actually steal whole pageblock if this condition met, + * but, below check doesn't guarantee it and that is just heuristic + * so could be changed anytime. + */ + if (order >= pageblock_order) + return true; + + if (order >= pageblock_order / 2 || + start_mt == MIGRATE_RECLAIMABLE || + start_mt == MIGRATE_UNMOVABLE || + page_group_by_mobility_disabled) + return true; + + return false; +} + +/* + * This function implements actual steal behaviour. If order is large enough, + * we can steal whole pageblock. If not, we first move freepages in this + * pageblock and check whether half of pages are moved or not. If half of + * pages are moved, we can change migratetype of pageblock and permanently + * use it's pages as requested migratetype in the future. + */ +static void steal_suitable_fallback(struct zone *zone, struct page *page, + int start_type) { int current_order = page_order(page); + int pages; /* Take ownership for orders >= pageblock_order */ if (current_order >= pageblock_order) { @@ -1151,19 +1186,49 @@ static void try_to_steal_freepages(struct zone *zone, struct page *page, return; } - if (current_order >= pageblock_order / 2 || - start_type == MIGRATE_RECLAIMABLE || - start_type == MIGRATE_UNMOVABLE || - page_group_by_mobility_disabled) { - int pages; + pages = move_freepages_block(zone, page, start_type); + + /* Claim the whole block if over half of it is free */ + if (pages >= (1 << (pageblock_order-1)) || + page_group_by_mobility_disabled) + set_pageblock_migratetype(page, start_type); +} + +/* + * Check whether there is a suitable fallback freepage with requested order. + * If only_stealable is true, this function returns fallback_mt only if + * we can steal other freepages all together. This would help to reduce + * fragmentation due to mixed migratetype pages in one pageblock. + */ +int find_suitable_fallback(struct free_area *area, unsigned int order, + int migratetype, bool only_stealable, bool *can_steal) +{ + int i; + int fallback_mt; + + if (area->nr_free == 0) + return -1; + + *can_steal = false; + for (i = 0;; i++) { + fallback_mt = fallbacks[migratetype][i]; + if (fallback_mt == MIGRATE_RESERVE) + break; + + if (list_empty(&area->free_list[fallback_mt])) + continue; - pages = move_freepages_block(zone, page, start_type); + if (can_steal_fallback(order, migratetype)) + *can_steal = true; - /* Claim the whole block if over half of it is free */ - if (pages >= (1 << (pageblock_order-1)) || - page_group_by_mobility_disabled) - set_pageblock_migratetype(page, start_type); + if (!only_stealable) + return fallback_mt; + + if (*can_steal) + return fallback_mt; } + + return -1; } /* Remove an element from the buddy allocator from the fallback list */ @@ -1173,64 +1238,45 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype) struct free_area *area; unsigned int current_order; struct page *page; + int fallback_mt; + bool can_steal; /* Find the largest possible block of pages in the other list */ for (current_order = MAX_ORDER-1; current_order >= order && current_order <= MAX_ORDER-1; --current_order) { - int i; - for (i = 0;; i++) { - int migratetype = fallbacks[start_migratetype][i]; - int buddy_type = start_migratetype; - - /* MIGRATE_RESERVE handled later if necessary */ - if (migratetype == MIGRATE_RESERVE) - break; - - area = &(zone->free_area[current_order]); - if (list_empty(&area->free_list[migratetype])) - continue; - - page = list_entry(area->free_list[migratetype].next, - struct page, lru); - area->nr_free--; - - if (!is_migrate_cma(migratetype)) { - try_to_steal_freepages(zone, page, - start_migratetype, - migratetype); - } else { - /* - * When borrowing from MIGRATE_CMA, we need to - * release the excess buddy pages to CMA - * itself, and we do not try to steal extra - * free pages. - */ - buddy_type = migratetype; - } + area = &(zone->free_area[current_order]); + fallback_mt = find_suitable_fallback(area, current_order, + start_migratetype, false, &can_steal); + if (fallback_mt == -1) + continue; - /* Remove the page from the freelists */ - list_del(&page->lru); - rmv_page_order(page); + page = list_entry(area->free_list[fallback_mt].next, + struct page, lru); + if (can_steal) + steal_suitable_fallback(zone, page, start_migratetype); - expand(zone, page, order, current_order, area, - buddy_type); + /* Remove the page from the freelists */ + area->nr_free--; + list_del(&page->lru); + rmv_page_order(page); - /* - * The freepage_migratetype may differ from pageblock's - * migratetype depending on the decisions in - * try_to_steal_freepages(). This is OK as long as it - * does not differ for MIGRATE_CMA pageblocks. For CMA - * we need to make sure unallocated pages flushed from - * pcp lists are returned to the correct freelist. - */ - set_freepage_migratetype(page, buddy_type); + expand(zone, page, order, current_order, area, + start_migratetype); + /* + * The freepage_migratetype may differ from pageblock's + * migratetype depending on the decisions in + * try_to_steal_freepages(). This is OK as long as it + * does not differ for MIGRATE_CMA pageblocks. For CMA + * we need to make sure unallocated pages flushed from + * pcp lists are returned to the correct freelist. + */ + set_freepage_migratetype(page, start_migratetype); - trace_mm_page_alloc_extfrag(page, order, current_order, - start_migratetype, migratetype); + trace_mm_page_alloc_extfrag(page, order, current_order, + start_migratetype, fallback_mt); - return page; - } + return page; } return NULL; @@ -1249,7 +1295,11 @@ retry_reserve: page = __rmqueue_smallest(zone, order, migratetype); if (unlikely(!page) && migratetype != MIGRATE_RESERVE) { - page = __rmqueue_fallback(zone, order, migratetype); + if (migratetype == MIGRATE_MOVABLE) + page = __rmqueue_cma_fallback(zone, order); + + if (!page) + page = __rmqueue_fallback(zone, order, migratetype); /* * Use MIGRATE_RESERVE rather than fail an allocation. goto @@ -2362,13 +2412,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, *did_some_progress = 1; goto out; } - /* - * GFP_THISNODE contains __GFP_NORETRY and we never hit this. - * Sanity check for bare calls of __GFP_THISNODE, not real OOM. - * The caller should handle page allocation failure by itself if - * it specifies __GFP_THISNODE. - * Note: Hugepage uses it but will hit PAGE_ALLOC_COSTLY_ORDER. - */ + /* The OOM killer may not free memory on a specific node */ if (gfp_mask & __GFP_THISNODE) goto out; } @@ -2623,15 +2667,11 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, } /* - * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and - * __GFP_NOWARN set) should not cause reclaim since the subsystem - * (f.e. slab) using GFP_THISNODE may choose to trigger reclaim - * using a larger set of nodes after it has established that the - * allowed per node queues are empty and that nodes are - * over allocated. + * If this allocation cannot block and it is for a specific node, then + * fail early. There's no need to wakeup kswapd or retry for a + * speculative node-specific allocation. */ - if (IS_ENABLED(CONFIG_NUMA) && - (gfp_mask & GFP_THISNODE) == GFP_THISNODE) + if (IS_ENABLED(CONFIG_NUMA) && (gfp_mask & __GFP_THISNODE) && !wait) goto nopage; retry: @@ -2824,7 +2864,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, /* * Check the zones suitable for the gfp_mask contain at least one * valid zone. It's possible to have an empty zonelist as a result - * of GFP_THISNODE and a memoryless node + * of __GFP_THISNODE and a memoryless node */ if (unlikely(!zonelist->_zonerefs->zone)) return NULL; @@ -3201,38 +3241,31 @@ static void show_migration_types(unsigned char type) * Show free area list (used inside shift_scroll-lock stuff) * We also calculate the percentage fragmentation. We do this by counting the * memory on each free list with the exception of the first item on the list. - * Suppresses nodes that are not allowed by current's cpuset if - * SHOW_MEM_FILTER_NODES is passed. + * + * Bits in @filter: + * SHOW_MEM_FILTER_NODES: suppress nodes that are not allowed by current's + * cpuset. */ void show_free_areas(unsigned int filter) { + unsigned long free_pcp = 0; int cpu; struct zone *zone; for_each_populated_zone(zone) { if (skip_free_areas_node(filter, zone_to_nid(zone))) continue; - show_node(zone); - printk("%s per-cpu:\n", zone->name); - for_each_online_cpu(cpu) { - struct per_cpu_pageset *pageset; - - pageset = per_cpu_ptr(zone->pageset, cpu); - - printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n", - cpu, pageset->pcp.high, - pageset->pcp.batch, pageset->pcp.count); - } + for_each_online_cpu(cpu) + free_pcp += per_cpu_ptr(zone->pageset, cpu)->pcp.count; } printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n" " active_file:%lu inactive_file:%lu isolated_file:%lu\n" - " unevictable:%lu" - " dirty:%lu writeback:%lu unstable:%lu\n" - " free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n" + " unevictable:%lu dirty:%lu writeback:%lu unstable:%lu\n" + " slab_reclaimable:%lu slab_unreclaimable:%lu\n" " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n" - " free_cma:%lu\n", + " free:%lu free_pcp:%lu free_cma:%lu\n", global_page_state(NR_ACTIVE_ANON), global_page_state(NR_INACTIVE_ANON), global_page_state(NR_ISOLATED_ANON), @@ -3243,13 +3276,14 @@ void show_free_areas(unsigned int filter) global_page_state(NR_FILE_DIRTY), global_page_state(NR_WRITEBACK), global_page_state(NR_UNSTABLE_NFS), - global_page_state(NR_FREE_PAGES), global_page_state(NR_SLAB_RECLAIMABLE), global_page_state(NR_SLAB_UNRECLAIMABLE), global_page_state(NR_FILE_MAPPED), global_page_state(NR_SHMEM), global_page_state(NR_PAGETABLE), global_page_state(NR_BOUNCE), + global_page_state(NR_FREE_PAGES), + free_pcp, global_page_state(NR_FREE_CMA_PAGES)); for_each_populated_zone(zone) { @@ -3257,6 +3291,11 @@ void show_free_areas(unsigned int filter) if (skip_free_areas_node(filter, zone_to_nid(zone))) continue; + + free_pcp = 0; + for_each_online_cpu(cpu) + free_pcp += per_cpu_ptr(zone->pageset, cpu)->pcp.count; + show_node(zone); printk("%s" " free:%lukB" @@ -3283,6 +3322,8 @@ void show_free_areas(unsigned int filter) " pagetables:%lukB" " unstable:%lukB" " bounce:%lukB" + " free_pcp:%lukB" + " local_pcp:%ukB" " free_cma:%lukB" " writeback_tmp:%lukB" " pages_scanned:%lu" @@ -3314,6 +3355,8 @@ void show_free_areas(unsigned int filter) K(zone_page_state(zone, NR_PAGETABLE)), K(zone_page_state(zone, NR_UNSTABLE_NFS)), K(zone_page_state(zone, NR_BOUNCE)), + K(free_pcp), + K(this_cpu_read(zone->pageset->pcp.count)), K(zone_page_state(zone, NR_FREE_CMA_PAGES)), K(zone_page_state(zone, NR_WRITEBACK_TEMP)), K(zone_page_state(zone, NR_PAGES_SCANNED)), @@ -5717,7 +5760,7 @@ static void __setup_per_zone_wmarks(void) * value here. * * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN) - * deltas controls asynch page reclaim, and so should + * deltas control asynch page reclaim, and so should * not be capped for highmem. */ unsigned long min_pages; diff --git a/mm/slab.c b/mm/slab.c index c4b89eaf4c96..7eb38dd1cefa 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -857,6 +857,11 @@ static inline void *____cache_alloc_node(struct kmem_cache *cachep, return NULL; } +static inline gfp_t gfp_exact_node(gfp_t flags) +{ + return flags; +} + #else /* CONFIG_NUMA */ static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int); @@ -1023,6 +1028,15 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) return __cache_free_alien(cachep, objp, node, page_node); } + +/* + * Construct gfp mask to allocate from a specific node but do not invoke reclaim + * or warn about failures. + */ +static inline gfp_t gfp_exact_node(gfp_t flags) +{ + return (flags | __GFP_THISNODE | __GFP_NOWARN) & ~__GFP_WAIT; +} #endif /* @@ -2825,7 +2839,7 @@ alloc_done: if (unlikely(!ac->avail)) { int x; force_grow: - x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL); + x = cache_grow(cachep, gfp_exact_node(flags), node, NULL); /* cache_grow can reenable interrupts, then ac could change. */ ac = cpu_cache_get(cachep); @@ -3019,7 +3033,7 @@ retry: get_node(cache, nid) && get_node(cache, nid)->free_objects) { obj = ____cache_alloc_node(cache, - flags | GFP_THISNODE, nid); + gfp_exact_node(flags), nid); if (obj) break; } @@ -3047,7 +3061,7 @@ retry: nid = page_to_nid(page); if (cache_grow(cache, flags, nid, page)) { obj = ____cache_alloc_node(cache, - flags | GFP_THISNODE, nid); + gfp_exact_node(flags), nid); if (!obj) /* * Another processor may allocate the @@ -3118,7 +3132,7 @@ retry: must_grow: spin_unlock(&n->list_lock); - x = cache_grow(cachep, flags | GFP_THISNODE, nodeid, NULL); + x = cache_grow(cachep, gfp_exact_node(flags), nodeid, NULL); if (x) goto retry; diff --git a/mm/slob.c b/mm/slob.c index 94a7fede6d48..4765f65019c7 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -532,7 +532,7 @@ int __kmem_cache_create(struct kmem_cache *c, unsigned long flags) return 0; } -void *slob_alloc_node(struct kmem_cache *c, gfp_t flags, int node) +static void *slob_alloc_node(struct kmem_cache *c, gfp_t flags, int node) { void *b; @@ -558,7 +558,6 @@ void *slob_alloc_node(struct kmem_cache *c, gfp_t flags, int node) kmemleak_alloc_recursive(b, c->size, 1, c->flags, flags); return b; } -EXPORT_SYMBOL(slob_alloc_node); void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) { diff --git a/mm/slub.c b/mm/slub.c index 82c473780c91..0fdd6c1e1f82 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -374,7 +374,7 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page if (cmpxchg_double(&page->freelist, &page->counters, freelist_old, counters_old, freelist_new, counters_new)) - return 1; + return true; } else #endif { @@ -384,7 +384,7 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page page->freelist = freelist_new; set_page_slub_counters(page, counters_new); slab_unlock(page); - return 1; + return true; } slab_unlock(page); } @@ -396,7 +396,7 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page pr_info("%s %s: cmpxchg double redo ", n, s->name); #endif - return 0; + return false; } static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page, @@ -410,7 +410,7 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page, if (cmpxchg_double(&page->freelist, &page->counters, freelist_old, counters_old, freelist_new, counters_new)) - return 1; + return true; } else #endif { @@ -424,7 +424,7 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page, set_page_slub_counters(page, counters_new); slab_unlock(page); local_irq_restore(flags); - return 1; + return true; } slab_unlock(page); local_irq_restore(flags); @@ -437,7 +437,7 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page, pr_info("%s %s: cmpxchg double redo ", n, s->name); #endif - return 0; + return false; } #ifdef CONFIG_SLUB_DEBUG @@ -1137,15 +1137,6 @@ static int __init setup_slub_debug(char *str) */ goto check_slabs; - if (tolower(*str) == 'o') { - /* - * Avoid enabling debugging on caches if its minimum order - * would increase as a result. - */ - disable_higher_order_debug = 1; - goto out; - } - slub_debug = 0; if (*str == '-') /* @@ -1176,6 +1167,13 @@ static int __init setup_slub_debug(char *str) case 'a': slub_debug |= SLAB_FAILSLAB; break; + case 'o': + /* + * Avoid enabling debugging on caches if its minimum + * order would increase as a result. + */ + disable_higher_order_debug = 1; + break; default: pr_err("slub_debug option '%c' unknown. skipped\n", *str); diff --git a/mm/truncate.c b/mm/truncate.c index ddec5a5966d7..7a9d8a3cb143 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -93,35 +93,6 @@ void do_invalidatepage(struct page *page, unsigned int offset, } /* - * This cancels just the dirty bit on the kernel page itself, it - * does NOT actually remove dirty bits on any mmap's that may be - * around. It also leaves the page tagged dirty, so any sync - * activity will still find it on the dirty lists, and in particular, - * clear_page_dirty_for_io() will still look at the dirty bits in - * the VM. - * - * Doing this should *normally* only ever be done when a page - * is truncated, and is not actually mapped anywhere at all. However, - * fs/buffer.c does this when it notices that somebody has cleaned - * out all the buffers on a page without actually doing it through - * the VM. Can you say "ext3 is horribly ugly"? Tought you could. - */ -void cancel_dirty_page(struct page *page, unsigned int account_size) -{ - if (TestClearPageDirty(page)) { - struct address_space *mapping = page->mapping; - if (mapping && mapping_cap_account_dirty(mapping)) { - dec_zone_page_state(page, NR_FILE_DIRTY); - dec_bdi_stat(inode_to_bdi(mapping->host), - BDI_RECLAIMABLE); - if (account_size) - task_io_account_cancelled_write(account_size); - } - } -} -EXPORT_SYMBOL(cancel_dirty_page); - -/* * If truncate cannot remove the fs-private metadata from the page, the page * becomes orphaned. It will be left on the LRU and may even be mapped into * user pagetables if we're racing with filemap_fault(). @@ -140,7 +111,13 @@ truncate_complete_page(struct address_space *mapping, struct page *page) if (page_has_private(page)) do_invalidatepage(page, 0, PAGE_CACHE_SIZE); - cancel_dirty_page(page, PAGE_CACHE_SIZE); + /* + * Some filesystems seem to re-dirty the page even after + * the VM has canceled the dirty bit (eg ext3 journaling). + * Hence dirty accounting check is placed after invalidation. + */ + if (TestClearPageDirty(page)) + account_page_cleaned(page, mapping); ClearPageMappedToDisk(page); delete_from_page_cache(page); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 49abccf29a29..a5bbdd3b5d67 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -29,6 +29,7 @@ #include <linux/atomic.h> #include <linux/compiler.h> #include <linux/llist.h> +#include <linux/bitops.h> #include <asm/uaccess.h> #include <asm/tlbflush.h> @@ -74,6 +75,8 @@ static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end) pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); + if (pmd_clear_huge(pmd)) + continue; if (pmd_none_or_clear_bad(pmd)) continue; vunmap_pte_range(pmd, addr, next); @@ -88,6 +91,8 @@ static void vunmap_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end) pud = pud_offset(pgd, addr); do { next = pud_addr_end(addr, end); + if (pud_clear_huge(pud)) + continue; if (pud_none_or_clear_bad(pud)) continue; vunmap_pmd_range(pud, addr, next); @@ -1314,7 +1319,8 @@ static struct vm_struct *__get_vm_area_node(unsigned long size, BUG_ON(in_interrupt()); if (flags & VM_IOREMAP) - align = 1ul << clamp(fls(size), PAGE_SHIFT, IOREMAP_MAX_ORDER); + align = 1ul << clamp_t(int, fls_long(size), + PAGE_SHIFT, IOREMAP_MAX_ORDER); size = PAGE_ALIGN(size); if (unlikely(!size)) |