summaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig22
-rw-r--r--mm/Makefile8
-rw-r--r--mm/backing-dev.c26
-rw-r--r--mm/bootmem.c6
-rw-r--r--mm/bounce.c8
-rw-r--r--mm/compaction.c68
-rw-r--r--mm/fadvise.c18
-rw-r--r--mm/frontswap.c344
-rw-r--r--mm/highmem.c12
-rw-r--r--mm/hugetlb.c195
-rw-r--r--mm/hugetlb_cgroup.c418
-rw-r--r--mm/hwpoison-inject.c2
-rw-r--r--mm/internal.h8
-rw-r--r--mm/madvise.c18
-rw-r--r--mm/memblock.c146
-rw-r--r--mm/memcontrol.c396
-rw-r--r--mm/memory-failure.c35
-rw-r--r--mm/memory.c30
-rw-r--r--mm/memory_hotplug.c22
-rw-r--r--mm/mempolicy.c10
-rw-r--r--mm/migrate.c81
-rw-r--r--mm/mmap.c11
-rw-r--r--mm/mmu_notifier.c45
-rw-r--r--mm/mmzone.c2
-rw-r--r--mm/mremap.c2
-rw-r--r--mm/nobootmem.c40
-rw-r--r--mm/nommu.c2
-rw-r--r--mm/oom_kill.c244
-rw-r--r--mm/page-writeback.c107
-rw-r--r--mm/page_alloc.c325
-rw-r--r--mm/page_cgroup.c6
-rw-r--r--mm/page_io.c157
-rw-r--r--mm/page_isolation.c93
-rw-r--r--mm/pagewalk.c1
-rw-r--r--mm/percpu-vm.c1
-rw-r--r--mm/shmem.c256
-rw-r--r--mm/slab.c622
-rw-r--r--mm/slab.h33
-rw-r--r--mm/slab_common.c120
-rw-r--r--mm/slob.c152
-rw-r--r--mm/slub.c464
-rw-r--r--mm/sparse.c49
-rw-r--r--mm/swap.c52
-rw-r--r--mm/swap_state.c7
-rw-r--r--mm/swapfile.c211
-rw-r--r--mm/vmalloc.c52
-rw-r--r--mm/vmscan.c192
-rw-r--r--mm/vmstat.c1
48 files changed, 3410 insertions, 1710 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index b2176374b98e..d5c8019c6627 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -140,9 +140,13 @@ config ARCH_DISCARD_MEMBLOCK
config NO_BOOTMEM
boolean
+config MEMORY_ISOLATION
+ boolean
+
# eventually, we can have this option just 'select SPARSEMEM'
config MEMORY_HOTPLUG
bool "Allow for memory hot-add"
+ select MEMORY_ISOLATION
depends on SPARSEMEM || X86_64_ACPI_NUMA
depends on HOTPLUG && ARCH_ENABLE_MEMORY_HOTPLUG
depends on (IA64 || X86 || PPC_BOOK3S_64 || SUPERH || S390)
@@ -272,6 +276,7 @@ config MEMORY_FAILURE
depends on MMU
depends on ARCH_SUPPORTS_MEMORY_FAILURE
bool "Enable recovery from hardware memory errors"
+ select MEMORY_ISOLATION
help
Enables code to recover from some memory failures on systems
with MCA recovery. This allows a system to continue running
@@ -389,3 +394,20 @@ config CLEANCACHE
in a negligible performance hit.
If unsure, say Y to enable cleancache
+
+config FRONTSWAP
+ bool "Enable frontswap to cache swap pages if tmem is present"
+ depends on SWAP
+ default n
+ help
+ Frontswap is so named because it can be thought of as the opposite
+ of a "backing" store for a swap device. The data is stored into
+ "transcendent memory", memory that is not directly accessible or
+ addressable by the kernel and is of unknown and possibly
+ time-varying size. When space in transcendent memory is available,
+ a significant swap I/O reduction may be achieved. When none is
+ available, all frontswap calls are reduced to a single pointer-
+ compare-against-NULL resulting in a negligible performance hit
+ and swap data is stored as normal on the matching swap device.
+
+ If unsure, say Y to enable frontswap.
diff --git a/mm/Makefile b/mm/Makefile
index a156285ce88d..92753e2d82da 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -15,8 +15,9 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
maccess.o page_alloc.o page-writeback.o \
readahead.o swap.o truncate.o vmscan.o shmem.o \
prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
- page_isolation.o mm_init.o mmu_context.o percpu.o \
+ mm_init.o mmu_context.o percpu.o slab_common.o \
compaction.o $(mmu-y)
+
obj-y += init-mm.o
ifdef CONFIG_NO_BOOTMEM
@@ -29,6 +30,7 @@ obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o
obj-$(CONFIG_BOUNCE) += bounce.o
obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o
+obj-$(CONFIG_FRONTSWAP) += frontswap.o
obj-$(CONFIG_HAS_DMA) += dmapool.o
obj-$(CONFIG_HUGETLBFS) += hugetlb.o
obj-$(CONFIG_NUMA) += mempolicy.o
@@ -47,9 +49,11 @@ obj-$(CONFIG_FS_XIP) += filemap_xip.o
obj-$(CONFIG_MIGRATION) += migrate.o
obj-$(CONFIG_QUICKLIST) += quicklist.o
obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o
-obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o
+obj-$(CONFIG_MEMCG) += memcontrol.o page_cgroup.o
+obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o
obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o
obj-$(CONFIG_CLEANCACHE) += cleancache.o
+obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index dd8e2aafb07e..6b4718e2ee34 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -677,7 +677,7 @@ int bdi_init(struct backing_dev_info *bdi)
bdi->min_ratio = 0;
bdi->max_ratio = 100;
- bdi->max_prop_frac = PROP_FRAC_BASE;
+ bdi->max_prop_frac = FPROP_FRAC_BASE;
spin_lock_init(&bdi->wb_lock);
INIT_LIST_HEAD(&bdi->bdi_list);
INIT_LIST_HEAD(&bdi->work_list);
@@ -700,7 +700,7 @@ int bdi_init(struct backing_dev_info *bdi)
bdi->write_bandwidth = INIT_BW;
bdi->avg_write_bandwidth = INIT_BW;
- err = prop_local_init_percpu(&bdi->completions);
+ err = fprop_local_init_percpu(&bdi->completions);
if (err) {
err:
@@ -744,7 +744,7 @@ void bdi_destroy(struct backing_dev_info *bdi)
for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
percpu_counter_destroy(&bdi->bdi_stat[i]);
- prop_local_destroy_percpu(&bdi->completions);
+ fprop_local_destroy_percpu(&bdi->completions);
}
EXPORT_SYMBOL(bdi_destroy);
@@ -886,3 +886,23 @@ out:
return ret;
}
EXPORT_SYMBOL(wait_iff_congested);
+
+int pdflush_proc_obsolete(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ char kbuf[] = "0\n";
+
+ if (*ppos) {
+ *lenp = 0;
+ return 0;
+ }
+
+ if (copy_to_user(buffer, kbuf, sizeof(kbuf)))
+ return -EFAULT;
+ printk_once(KERN_WARNING "%s exported in /proc is scheduled for removal\n",
+ table->procname);
+
+ *lenp = 2;
+ *ppos += *lenp;
+ return 2;
+}
diff --git a/mm/bootmem.c b/mm/bootmem.c
index ec4fcb7a56c8..bcb63ac48cc5 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -698,7 +698,7 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align,
return ___alloc_bootmem(size, align, goal, limit);
}
-static void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
+void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
unsigned long size, unsigned long align,
unsigned long goal, unsigned long limit)
{
@@ -710,6 +710,10 @@ again:
if (ptr)
return ptr;
+ /* do not panic in alloc_bootmem_bdata() */
+ if (limit && goal + size > limit)
+ limit = 0;
+
ptr = alloc_bootmem_bdata(pgdat->bdata, size, align, goal, limit);
if (ptr)
return ptr;
diff --git a/mm/bounce.c b/mm/bounce.c
index d1be02ca1889..042086775561 100644
--- a/mm/bounce.c
+++ b/mm/bounce.c
@@ -24,23 +24,25 @@
static mempool_t *page_pool, *isa_page_pool;
-#ifdef CONFIG_HIGHMEM
+#if defined(CONFIG_HIGHMEM) || defined(CONFIG_NEED_BOUNCE_POOL)
static __init int init_emergency_pool(void)
{
-#ifndef CONFIG_MEMORY_HOTPLUG
+#if defined(CONFIG_HIGHMEM) && !defined(CONFIG_MEMORY_HOTPLUG)
if (max_pfn <= max_low_pfn)
return 0;
#endif
page_pool = mempool_create_page_pool(POOL_SIZE, 0);
BUG_ON(!page_pool);
- printk("highmem bounce pool size: %d pages\n", POOL_SIZE);
+ printk("bounce pool size: %d pages\n", POOL_SIZE);
return 0;
}
__initcall(init_emergency_pool);
+#endif
+#ifdef CONFIG_HIGHMEM
/*
* highmem version, map in to vec
*/
diff --git a/mm/compaction.c b/mm/compaction.c
index 7ea259d82a99..e78cb9688421 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -422,6 +422,17 @@ static void isolate_freepages(struct zone *zone,
pfn -= pageblock_nr_pages) {
unsigned long isolated;
+ /*
+ * Skip ahead if another thread is compacting in the area
+ * simultaneously. If we wrapped around, we can only skip
+ * ahead if zone->compact_cached_free_pfn also wrapped to
+ * above our starting point.
+ */
+ if (cc->order > 0 && (!cc->wrapped ||
+ zone->compact_cached_free_pfn >
+ cc->start_free_pfn))
+ pfn = min(pfn, zone->compact_cached_free_pfn);
+
if (!pfn_valid(pfn))
continue;
@@ -461,8 +472,11 @@ static void isolate_freepages(struct zone *zone,
* looking for free pages, the search will restart here as
* page migration may have returned some pages to the allocator
*/
- if (isolated)
+ if (isolated) {
high_pfn = max(high_pfn, pfn);
+ if (cc->order > 0)
+ zone->compact_cached_free_pfn = high_pfn;
+ }
}
/* split_free_page does not map the pages */
@@ -556,6 +570,20 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
return ISOLATE_SUCCESS;
}
+/*
+ * Returns the start pfn of the last page block in a zone. This is the starting
+ * point for full compaction of a zone. Compaction searches for free pages from
+ * the end of each zone, while isolate_freepages_block scans forward inside each
+ * page block.
+ */
+static unsigned long start_free_pfn(struct zone *zone)
+{
+ unsigned long free_pfn;
+ free_pfn = zone->zone_start_pfn + zone->spanned_pages;
+ free_pfn &= ~(pageblock_nr_pages-1);
+ return free_pfn;
+}
+
static int compact_finished(struct zone *zone,
struct compact_control *cc)
{
@@ -565,8 +593,26 @@ static int compact_finished(struct zone *zone,
if (fatal_signal_pending(current))
return COMPACT_PARTIAL;
- /* Compaction run completes if the migrate and free scanner meet */
- if (cc->free_pfn <= cc->migrate_pfn)
+ /*
+ * A full (order == -1) compaction run starts at the beginning and
+ * end of a zone; it completes when the migrate and free scanner meet.
+ * A partial (order > 0) compaction can start with the free scanner
+ * at a random point in the zone, and may have to restart.
+ */
+ if (cc->free_pfn <= cc->migrate_pfn) {
+ if (cc->order > 0 && !cc->wrapped) {
+ /* We started partway through; restart at the end. */
+ unsigned long free_pfn = start_free_pfn(zone);
+ zone->compact_cached_free_pfn = free_pfn;
+ cc->free_pfn = free_pfn;
+ cc->wrapped = 1;
+ return COMPACT_CONTINUE;
+ }
+ return COMPACT_COMPLETE;
+ }
+
+ /* We wrapped around and ended up where we started. */
+ if (cc->wrapped && cc->free_pfn <= cc->start_free_pfn)
return COMPACT_COMPLETE;
/*
@@ -664,8 +710,15 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
/* Setup to move all movable pages to the end of the zone */
cc->migrate_pfn = zone->zone_start_pfn;
- cc->free_pfn = cc->migrate_pfn + zone->spanned_pages;
- cc->free_pfn &= ~(pageblock_nr_pages-1);
+
+ if (cc->order > 0) {
+ /* Incremental compaction. Start where the last one stopped. */
+ cc->free_pfn = zone->compact_cached_free_pfn;
+ cc->start_free_pfn = cc->free_pfn;
+ } else {
+ /* Order == -1 starts at the end of the zone. */
+ cc->free_pfn = start_free_pfn(zone);
+ }
migrate_prep_local();
@@ -701,8 +754,11 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
if (err) {
putback_lru_pages(&cc->migratepages);
cc->nr_migratepages = 0;
+ if (err == -ENOMEM) {
+ ret = COMPACT_PARTIAL;
+ goto out;
+ }
}
-
}
out:
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 469491e0af79..9b75a045dbf4 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -93,11 +93,6 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice)
spin_unlock(&file->f_lock);
break;
case POSIX_FADV_WILLNEED:
- if (!mapping->a_ops->readpage) {
- ret = -EINVAL;
- break;
- }
-
/* First and last PARTIAL page! */
start_index = offset >> PAGE_CACHE_SHIFT;
end_index = endbyte >> PAGE_CACHE_SHIFT;
@@ -106,12 +101,13 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice)
nrpages = end_index - start_index + 1;
if (!nrpages)
nrpages = ~0UL;
-
- ret = force_page_cache_readahead(mapping, file,
- start_index,
- nrpages);
- if (ret > 0)
- ret = 0;
+
+ /*
+ * Ignore return value because fadvise() shall return
+ * success even if filesystem can't retrieve a hint,
+ */
+ force_page_cache_readahead(mapping, file, start_index,
+ nrpages);
break;
case POSIX_FADV_NOREUSE:
break;
diff --git a/mm/frontswap.c b/mm/frontswap.c
new file mode 100644
index 000000000000..6b3e71a2cd48
--- /dev/null
+++ b/mm/frontswap.c
@@ -0,0 +1,344 @@
+/*
+ * Frontswap frontend
+ *
+ * This code provides the generic "frontend" layer to call a matching
+ * "backend" driver implementation of frontswap. See
+ * Documentation/vm/frontswap.txt for more information.
+ *
+ * Copyright (C) 2009-2012 Oracle Corp. All rights reserved.
+ * Author: Dan Magenheimer
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ */
+
+#include <linux/mman.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
+#include <linux/security.h>
+#include <linux/module.h>
+#include <linux/debugfs.h>
+#include <linux/frontswap.h>
+#include <linux/swapfile.h>
+
+/*
+ * frontswap_ops is set by frontswap_register_ops to contain the pointers
+ * to the frontswap "backend" implementation functions.
+ */
+static struct frontswap_ops frontswap_ops __read_mostly;
+
+/*
+ * This global enablement flag reduces overhead on systems where frontswap_ops
+ * has not been registered, so is preferred to the slower alternative: a
+ * function call that checks a non-global.
+ */
+bool frontswap_enabled __read_mostly;
+EXPORT_SYMBOL(frontswap_enabled);
+
+/*
+ * If enabled, frontswap_store will return failure even on success. As
+ * a result, the swap subsystem will always write the page to swap, in
+ * effect converting frontswap into a writethrough cache. In this mode,
+ * there is no direct reduction in swap writes, but a frontswap backend
+ * can unilaterally "reclaim" any pages in use with no data loss, thus
+ * providing increases control over maximum memory usage due to frontswap.
+ */
+static bool frontswap_writethrough_enabled __read_mostly;
+
+#ifdef CONFIG_DEBUG_FS
+/*
+ * Counters available via /sys/kernel/debug/frontswap (if debugfs is
+ * properly configured). These are for information only so are not protected
+ * against increment races.
+ */
+static u64 frontswap_loads;
+static u64 frontswap_succ_stores;
+static u64 frontswap_failed_stores;
+static u64 frontswap_invalidates;
+
+static inline void inc_frontswap_loads(void) {
+ frontswap_loads++;
+}
+static inline void inc_frontswap_succ_stores(void) {
+ frontswap_succ_stores++;
+}
+static inline void inc_frontswap_failed_stores(void) {
+ frontswap_failed_stores++;
+}
+static inline void inc_frontswap_invalidates(void) {
+ frontswap_invalidates++;
+}
+#else
+static inline void inc_frontswap_loads(void) { }
+static inline void inc_frontswap_succ_stores(void) { }
+static inline void inc_frontswap_failed_stores(void) { }
+static inline void inc_frontswap_invalidates(void) { }
+#endif
+/*
+ * Register operations for frontswap, returning previous thus allowing
+ * detection of multiple backends and possible nesting.
+ */
+struct frontswap_ops frontswap_register_ops(struct frontswap_ops *ops)
+{
+ struct frontswap_ops old = frontswap_ops;
+
+ frontswap_ops = *ops;
+ frontswap_enabled = true;
+ return old;
+}
+EXPORT_SYMBOL(frontswap_register_ops);
+
+/*
+ * Enable/disable frontswap writethrough (see above).
+ */
+void frontswap_writethrough(bool enable)
+{
+ frontswap_writethrough_enabled = enable;
+}
+EXPORT_SYMBOL(frontswap_writethrough);
+
+/*
+ * Called when a swap device is swapon'd.
+ */
+void __frontswap_init(unsigned type)
+{
+ struct swap_info_struct *sis = swap_info[type];
+
+ BUG_ON(sis == NULL);
+ if (sis->frontswap_map == NULL)
+ return;
+ frontswap_ops.init(type);
+}
+EXPORT_SYMBOL(__frontswap_init);
+
+static inline void __frontswap_clear(struct swap_info_struct *sis, pgoff_t offset)
+{
+ frontswap_clear(sis, offset);
+ atomic_dec(&sis->frontswap_pages);
+}
+
+/*
+ * "Store" data from a page to frontswap and associate it with the page's
+ * swaptype and offset. Page must be locked and in the swap cache.
+ * If frontswap already contains a page with matching swaptype and
+ * offset, the frontswap implementation may either overwrite the data and
+ * return success or invalidate the page from frontswap and return failure.
+ */
+int __frontswap_store(struct page *page)
+{
+ int ret = -1, dup = 0;
+ swp_entry_t entry = { .val = page_private(page), };
+ int type = swp_type(entry);
+ struct swap_info_struct *sis = swap_info[type];
+ pgoff_t offset = swp_offset(entry);
+
+ BUG_ON(!PageLocked(page));
+ BUG_ON(sis == NULL);
+ if (frontswap_test(sis, offset))
+ dup = 1;
+ ret = frontswap_ops.store(type, offset, page);
+ if (ret == 0) {
+ frontswap_set(sis, offset);
+ inc_frontswap_succ_stores();
+ if (!dup)
+ atomic_inc(&sis->frontswap_pages);
+ } else {
+ /*
+ failed dup always results in automatic invalidate of
+ the (older) page from frontswap
+ */
+ inc_frontswap_failed_stores();
+ if (dup)
+ __frontswap_clear(sis, offset);
+ }
+ if (frontswap_writethrough_enabled)
+ /* report failure so swap also writes to swap device */
+ ret = -1;
+ return ret;
+}
+EXPORT_SYMBOL(__frontswap_store);
+
+/*
+ * "Get" data from frontswap associated with swaptype and offset that were
+ * specified when the data was put to frontswap and use it to fill the
+ * specified page with data. Page must be locked and in the swap cache.
+ */
+int __frontswap_load(struct page *page)
+{
+ int ret = -1;
+ swp_entry_t entry = { .val = page_private(page), };
+ int type = swp_type(entry);
+ struct swap_info_struct *sis = swap_info[type];
+ pgoff_t offset = swp_offset(entry);
+
+ BUG_ON(!PageLocked(page));
+ BUG_ON(sis == NULL);
+ if (frontswap_test(sis, offset))
+ ret = frontswap_ops.load(type, offset, page);
+ if (ret == 0)
+ inc_frontswap_loads();
+ return ret;
+}
+EXPORT_SYMBOL(__frontswap_load);
+
+/*
+ * Invalidate any data from frontswap associated with the specified swaptype
+ * and offset so that a subsequent "get" will fail.
+ */
+void __frontswap_invalidate_page(unsigned type, pgoff_t offset)
+{
+ struct swap_info_struct *sis = swap_info[type];
+
+ BUG_ON(sis == NULL);
+ if (frontswap_test(sis, offset)) {
+ frontswap_ops.invalidate_page(type, offset);
+ __frontswap_clear(sis, offset);
+ inc_frontswap_invalidates();
+ }
+}
+EXPORT_SYMBOL(__frontswap_invalidate_page);
+
+/*
+ * Invalidate all data from frontswap associated with all offsets for the
+ * specified swaptype.
+ */
+void __frontswap_invalidate_area(unsigned type)
+{
+ struct swap_info_struct *sis = swap_info[type];
+
+ BUG_ON(sis == NULL);
+ if (sis->frontswap_map == NULL)
+ return;
+ frontswap_ops.invalidate_area(type);
+ atomic_set(&sis->frontswap_pages, 0);
+ memset(sis->frontswap_map, 0, sis->max / sizeof(long));
+}
+EXPORT_SYMBOL(__frontswap_invalidate_area);
+
+static unsigned long __frontswap_curr_pages(void)
+{
+ int type;
+ unsigned long totalpages = 0;
+ struct swap_info_struct *si = NULL;
+
+ assert_spin_locked(&swap_lock);
+ for (type = swap_list.head; type >= 0; type = si->next) {
+ si = swap_info[type];
+ totalpages += atomic_read(&si->frontswap_pages);
+ }
+ return totalpages;
+}
+
+static int __frontswap_unuse_pages(unsigned long total, unsigned long *unused,
+ int *swapid)
+{
+ int ret = -EINVAL;
+ struct swap_info_struct *si = NULL;
+ int si_frontswap_pages;
+ unsigned long total_pages_to_unuse = total;
+ unsigned long pages = 0, pages_to_unuse = 0;
+ int type;
+
+ assert_spin_locked(&swap_lock);
+ for (type = swap_list.head; type >= 0; type = si->next) {
+ si = swap_info[type];
+ si_frontswap_pages = atomic_read(&si->frontswap_pages);
+ if (total_pages_to_unuse < si_frontswap_pages) {
+ pages = pages_to_unuse = total_pages_to_unuse;
+ } else {
+ pages = si_frontswap_pages;
+ pages_to_unuse = 0; /* unuse all */
+ }
+ /* ensure there is enough RAM to fetch pages from frontswap */
+ if (security_vm_enough_memory_mm(current->mm, pages)) {
+ ret = -ENOMEM;
+ continue;
+ }
+ vm_unacct_memory(pages);
+ *unused = pages_to_unuse;
+ *swapid = type;
+ ret = 0;
+ break;
+ }
+
+ return ret;
+}
+
+static int __frontswap_shrink(unsigned long target_pages,
+ unsigned long *pages_to_unuse,
+ int *type)
+{
+ unsigned long total_pages = 0, total_pages_to_unuse;
+
+ assert_spin_locked(&swap_lock);
+
+ total_pages = __frontswap_curr_pages();
+ if (total_pages <= target_pages) {
+ /* Nothing to do */
+ *pages_to_unuse = 0;
+ return 0;
+ }
+ total_pages_to_unuse = total_pages - target_pages;
+ return __frontswap_unuse_pages(total_pages_to_unuse, pages_to_unuse, type);
+}
+
+/*
+ * Frontswap, like a true swap device, may unnecessarily retain pages
+ * under certain circumstances; "shrink" frontswap is essentially a
+ * "partial swapoff" and works by calling try_to_unuse to attempt to
+ * unuse enough frontswap pages to attempt to -- subject to memory
+ * constraints -- reduce the number of pages in frontswap to the
+ * number given in the parameter target_pages.
+ */
+void frontswap_shrink(unsigned long target_pages)
+{
+ unsigned long pages_to_unuse = 0;
+ int type, ret;
+
+ /*
+ * we don't want to hold swap_lock while doing a very
+ * lengthy try_to_unuse, but swap_list may change
+ * so restart scan from swap_list.head each time
+ */
+ spin_lock(&swap_lock);
+ ret = __frontswap_shrink(target_pages, &pages_to_unuse, &type);
+ spin_unlock(&swap_lock);
+ if (ret == 0 && pages_to_unuse)
+ try_to_unuse(type, true, pages_to_unuse);
+ return;
+}
+EXPORT_SYMBOL(frontswap_shrink);
+
+/*
+ * Count and return the number of frontswap pages across all
+ * swap devices. This is exported so that backend drivers can
+ * determine current usage without reading debugfs.
+ */
+unsigned long frontswap_curr_pages(void)
+{
+ unsigned long totalpages = 0;
+
+ spin_lock(&swap_lock);
+ totalpages = __frontswap_curr_pages();
+ spin_unlock(&swap_lock);
+
+ return totalpages;
+}
+EXPORT_SYMBOL(frontswap_curr_pages);
+
+static int __init init_frontswap(void)
+{
+#ifdef CONFIG_DEBUG_FS
+ struct dentry *root = debugfs_create_dir("frontswap", NULL);
+ if (root == NULL)
+ return -ENXIO;
+ debugfs_create_u64("loads", S_IRUGO, root, &frontswap_loads);
+ debugfs_create_u64("succ_stores", S_IRUGO, root, &frontswap_succ_stores);
+ debugfs_create_u64("failed_stores", S_IRUGO, root,
+ &frontswap_failed_stores);
+ debugfs_create_u64("invalidates", S_IRUGO,
+ root, &frontswap_invalidates);
+#endif
+ return 0;
+}
+
+module_init(init_frontswap);
diff --git a/mm/highmem.c b/mm/highmem.c
index 57d82c6250c3..d517cd16a6eb 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -94,6 +94,18 @@ static DECLARE_WAIT_QUEUE_HEAD(pkmap_map_wait);
do { spin_unlock(&kmap_lock); (void)(flags); } while (0)
#endif
+struct page *kmap_to_page(void *vaddr)
+{
+ unsigned long addr = (unsigned long)vaddr;
+
+ if (addr >= PKMAP_ADDR(0) && addr <= PKMAP_ADDR(LAST_PKMAP)) {
+ int i = (addr - PKMAP_ADDR(0)) >> PAGE_SHIFT;
+ return pte_page(pkmap_page_table[i]);
+ }
+
+ return virt_to_page(addr);
+}
+
static void flush_all_zero_pkmaps(void)
{
int i;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index e198831276a3..bc727122dd44 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -24,17 +24,20 @@
#include <asm/page.h>
#include <asm/pgtable.h>
-#include <linux/io.h>
+#include <asm/tlb.h>
+#include <linux/io.h>
#include <linux/hugetlb.h>
+#include <linux/hugetlb_cgroup.h>
#include <linux/node.h>
+#include <linux/hugetlb_cgroup.h>
#include "internal.h"
const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
static gfp_t htlb_alloc_mask = GFP_HIGHUSER;
unsigned long hugepages_treat_as_movable;
-static int max_hstate;
+int hugetlb_max_hstate __read_mostly;
unsigned int default_hstate_idx;
struct hstate hstates[HUGE_MAX_HSTATE];
@@ -45,13 +48,10 @@ static struct hstate * __initdata parsed_hstate;
static unsigned long __initdata default_hstate_max_huge_pages;
static unsigned long __initdata default_hstate_size;
-#define for_each_hstate(h) \
- for ((h) = hstates; (h) < &hstates[max_hstate]; (h)++)
-
/*
* Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
*/
-static DEFINE_SPINLOCK(hugetlb_lock);
+DEFINE_SPINLOCK(hugetlb_lock);
static inline void unlock_or_release_subpool(struct hugepage_subpool *spool)
{
@@ -509,7 +509,7 @@ void copy_huge_page(struct page *dst, struct page *src)
static void enqueue_huge_page(struct hstate *h, struct page *page)
{
int nid = page_to_nid(page);
- list_add(&page->lru, &h->hugepage_freelists[nid]);
+ list_move(&page->lru, &h->hugepage_freelists[nid]);
h->free_huge_pages++;
h->free_huge_pages_node[nid]++;
}
@@ -521,7 +521,7 @@ static struct page *dequeue_huge_page_node(struct hstate *h, int nid)
if (list_empty(&h->hugepage_freelists[nid]))
return NULL;
page = list_entry(h->hugepage_freelists[nid].next, struct page, lru);
- list_del(&page->lru);
+ list_move(&page->lru, &h->hugepage_activelist);
set_page_refcounted(page);
h->free_huge_pages--;
h->free_huge_pages_node[nid]--;
@@ -593,6 +593,7 @@ static void update_and_free_page(struct hstate *h, struct page *page)
1 << PG_active | 1 << PG_reserved |
1 << PG_private | 1 << PG_writeback);
}
+ VM_BUG_ON(hugetlb_cgroup_from_page(page));
set_compound_page_dtor(page, NULL);
set_page_refcounted(page);
arch_release_hugepage(page);
@@ -625,10 +626,13 @@ static void free_huge_page(struct page *page)
page->mapping = NULL;
BUG_ON(page_count(page));
BUG_ON(page_mapcount(page));
- INIT_LIST_HEAD(&page->lru);
spin_lock(&hugetlb_lock);
+ hugetlb_cgroup_uncharge_page(hstate_index(h),
+ pages_per_huge_page(h), page);
if (h->surplus_huge_pages_node[nid] && huge_page_order(h) < MAX_ORDER) {
+ /* remove the page from active list */
+ list_del(&page->lru);
update_and_free_page(h, page);
h->surplus_huge_pages--;
h->surplus_huge_pages_node[nid]--;
@@ -641,8 +645,10 @@ static void free_huge_page(struct page *page)
static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
{
+ INIT_LIST_HEAD(&page->lru);
set_compound_page_dtor(page, free_huge_page);
spin_lock(&hugetlb_lock);
+ set_hugetlb_cgroup(page, NULL);
h->nr_huge_pages++;
h->nr_huge_pages_node[nid]++;
spin_unlock(&hugetlb_lock);
@@ -889,8 +895,10 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
spin_lock(&hugetlb_lock);
if (page) {
+ INIT_LIST_HEAD(&page->lru);
r_nid = page_to_nid(page);
set_compound_page_dtor(page, free_huge_page);
+ set_hugetlb_cgroup(page, NULL);
/*
* We incremented the global counters already
*/
@@ -993,7 +1001,6 @@ retry:
list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
if ((--needed) < 0)
break;
- list_del(&page->lru);
/*
* This page is now managed by the hugetlb allocator and has
* no users -- drop the buddy allocator's reference.
@@ -1008,7 +1015,6 @@ free:
/* Free unnecessary surplus pages to the buddy allocator */
if (!list_empty(&surplus_list)) {
list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
- list_del(&page->lru);
put_page(page);
}
}
@@ -1112,7 +1118,10 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
struct hstate *h = hstate_vma(vma);
struct page *page;
long chg;
+ int ret, idx;
+ struct hugetlb_cgroup *h_cg;
+ idx = hstate_index(h);
/*
* Processes that did not create the mapping will have no
* reserves and will not have accounted against subpool
@@ -1123,27 +1132,43 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
*/
chg = vma_needs_reservation(h, vma, addr);
if (chg < 0)
- return ERR_PTR(-VM_FAULT_OOM);
+ return ERR_PTR(-ENOMEM);
if (chg)
if (hugepage_subpool_get_pages(spool, chg))
- return ERR_PTR(-VM_FAULT_SIGBUS);
+ return ERR_PTR(-ENOSPC);
+ ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg);
+ if (ret) {
+ hugepage_subpool_put_pages(spool, chg);
+ return ERR_PTR(-ENOSPC);
+ }
spin_lock(&hugetlb_lock);
page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve);
- spin_unlock(&hugetlb_lock);
-
- if (!page) {
+ if (page) {
+ /* update page cgroup details */
+ hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h),
+ h_cg, page);
+ spin_unlock(&hugetlb_lock);
+ } else {
+ spin_unlock(&hugetlb_lock);
page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
if (!page) {
+ hugetlb_cgroup_uncharge_cgroup(idx,
+ pages_per_huge_page(h),
+ h_cg);
hugepage_subpool_put_pages(spool, chg);
- return ERR_PTR(-VM_FAULT_SIGBUS);
+ return ERR_PTR(-ENOSPC);
}
+ spin_lock(&hugetlb_lock);
+ hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h),
+ h_cg, page);
+ list_move(&page->lru, &h->hugepage_activelist);
+ spin_unlock(&hugetlb_lock);
}
set_page_private(page, (unsigned long)spool);
vma_commit_reservation(h, vma, addr);
-
return page;
}
@@ -1646,7 +1671,7 @@ static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent,
struct attribute_group *hstate_attr_group)
{
int retval;
- int hi = h - hstates;
+ int hi = hstate_index(h);
hstate_kobjs[hi] = kobject_create_and_add(h->name, parent);
if (!hstate_kobjs[hi])
@@ -1741,11 +1766,13 @@ void hugetlb_unregister_node(struct node *node)
if (!nhs->hugepages_kobj)
return; /* no hstate attributes */
- for_each_hstate(h)
- if (nhs->hstate_kobjs[h - hstates]) {
- kobject_put(nhs->hstate_kobjs[h - hstates]);
- nhs->hstate_kobjs[h - hstates] = NULL;
+ for_each_hstate(h) {
+ int idx = hstate_index(h);
+ if (nhs->hstate_kobjs[idx]) {
+ kobject_put(nhs->hstate_kobjs[idx]);
+ nhs->hstate_kobjs[idx] = NULL;
}
+ }
kobject_put(nhs->hugepages_kobj);
nhs->hugepages_kobj = NULL;
@@ -1848,7 +1875,7 @@ static void __exit hugetlb_exit(void)
hugetlb_unregister_all_nodes();
for_each_hstate(h) {
- kobject_put(hstate_kobjs[h - hstates]);
+ kobject_put(hstate_kobjs[hstate_index(h)]);
}
kobject_put(hugepages_kobj);
@@ -1869,7 +1896,7 @@ static int __init hugetlb_init(void)
if (!size_to_hstate(default_hstate_size))
hugetlb_add_hstate(HUGETLB_PAGE_ORDER);
}
- default_hstate_idx = size_to_hstate(default_hstate_size) - hstates;
+ default_hstate_idx = hstate_index(size_to_hstate(default_hstate_size));
if (default_hstate_max_huge_pages)
default_hstate.max_huge_pages = default_hstate_max_huge_pages;
@@ -1897,19 +1924,27 @@ void __init hugetlb_add_hstate(unsigned order)
printk(KERN_WARNING "hugepagesz= specified twice, ignoring\n");
return;
}
- BUG_ON(max_hstate >= HUGE_MAX_HSTATE);
+ BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE);
BUG_ON(order == 0);
- h = &hstates[max_hstate++];
+ h = &hstates[hugetlb_max_hstate++];
h->order = order;
h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1);
h->nr_huge_pages = 0;
h->free_huge_pages = 0;
for (i = 0; i < MAX_NUMNODES; ++i)
INIT_LIST_HEAD(&h->hugepage_freelists[i]);
+ INIT_LIST_HEAD(&h->hugepage_activelist);
h->next_nid_to_alloc = first_node(node_states[N_HIGH_MEMORY]);
h->next_nid_to_free = first_node(node_states[N_HIGH_MEMORY]);
snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
huge_page_size(h)/1024);
+ /*
+ * Add cgroup control files only if the huge page consists
+ * of more than two normal pages. This is because we use
+ * page[2].lru.next for storing cgoup details.
+ */
+ if (order >= HUGETLB_CGROUP_MIN_ORDER)
+ hugetlb_cgroup_file_init(hugetlb_max_hstate - 1);
parsed_hstate = h;
}
@@ -1920,10 +1955,10 @@ static int __init hugetlb_nrpages_setup(char *s)
static unsigned long *last_mhp;
/*
- * !max_hstate means we haven't parsed a hugepagesz= parameter yet,
+ * !hugetlb_max_hstate means we haven't parsed a hugepagesz= parameter yet,
* so this hugepages= parameter goes to the "default hstate".
*/
- if (!max_hstate)
+ if (!hugetlb_max_hstate)
mhp = &default_hstate_max_huge_pages;
else
mhp = &parsed_hstate->max_huge_pages;
@@ -1942,7 +1977,7 @@ static int __init hugetlb_nrpages_setup(char *s)
* But we need to allocate >= MAX_ORDER hstates here early to still
* use the bootmem allocator.
*/
- if (max_hstate && parsed_hstate->order >= MAX_ORDER)
+ if (hugetlb_max_hstate && parsed_hstate->order >= MAX_ORDER)
hugetlb_hstate_alloc_pages(parsed_hstate);
last_mhp = mhp;
@@ -2308,30 +2343,26 @@ static int is_hugetlb_entry_hwpoisoned(pte_t pte)
return 0;
}
-void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
- unsigned long end, struct page *ref_page)
+void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
+ unsigned long start, unsigned long end,
+ struct page *ref_page)
{
+ int force_flush = 0;
struct mm_struct *mm = vma->vm_mm;
unsigned long address;
pte_t *ptep;
pte_t pte;
struct page *page;
- struct page *tmp;
struct hstate *h = hstate_vma(vma);
unsigned long sz = huge_page_size(h);
- /*
- * A page gathering list, protected by per file i_mmap_mutex. The
- * lock is used to avoid list corruption from multiple unmapping
- * of the same page since we are using page->lru.
- */
- LIST_HEAD(page_list);
-
WARN_ON(!is_vm_hugetlb_page(vma));
BUG_ON(start & ~huge_page_mask(h));
BUG_ON(end & ~huge_page_mask(h));
+ tlb_start_vma(tlb, vma);
mmu_notifier_invalidate_range_start(mm, start, end);
+again:
spin_lock(&mm->page_table_lock);
for (address = start; address < end; address += sz) {
ptep = huge_pte_offset(mm, address);
@@ -2370,30 +2401,64 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
}
pte = huge_ptep_get_and_clear(mm, address, ptep);
+ tlb_remove_tlb_entry(tlb, ptep, address);
if (pte_dirty(pte))
set_page_dirty(page);
- list_add(&page->lru, &page_list);
+ page_remove_rmap(page);
+ force_flush = !__tlb_remove_page(tlb, page);
+ if (force_flush)
+ break;
/* Bail out after unmapping reference page if supplied */
if (ref_page)
break;
}
- flush_tlb_range(vma, start, end);
spin_unlock(&mm->page_table_lock);
- mmu_notifier_invalidate_range_end(mm, start, end);
- list_for_each_entry_safe(page, tmp, &page_list, lru) {
- page_remove_rmap(page);
- list_del(&page->lru);
- put_page(page);
+ /*
+ * mmu_gather ran out of room to batch pages, we break out of
+ * the PTE lock to avoid doing the potential expensive TLB invalidate
+ * and page-free while holding it.
+ */
+ if (force_flush) {
+ force_flush = 0;
+ tlb_flush_mmu(tlb);
+ if (address < end && !ref_page)
+ goto again;
}
+ mmu_notifier_invalidate_range_end(mm, start, end);
+ tlb_end_vma(tlb, vma);
+}
+
+void __unmap_hugepage_range_final(struct mmu_gather *tlb,
+ struct vm_area_struct *vma, unsigned long start,
+ unsigned long end, struct page *ref_page)
+{
+ __unmap_hugepage_range(tlb, vma, start, end, ref_page);
+
+ /*
+ * Clear this flag so that x86's huge_pmd_share page_table_shareable
+ * test will fail on a vma being torn down, and not grab a page table
+ * on its way out. We're lucky that the flag has such an appropriate
+ * name, and can in fact be safely cleared here. We could clear it
+ * before the __unmap_hugepage_range above, but all that's necessary
+ * is to clear it before releasing the i_mmap_mutex. This works
+ * because in the context this is called, the VMA is about to be
+ * destroyed and the i_mmap_mutex is held.
+ */
+ vma->vm_flags &= ~VM_MAYSHARE;
}
void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
unsigned long end, struct page *ref_page)
{
- mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex);
- __unmap_hugepage_range(vma, start, end, ref_page);
- mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
+ struct mm_struct *mm;
+ struct mmu_gather tlb;
+
+ mm = vma->vm_mm;
+
+ tlb_gather_mmu(&tlb, mm, 0);
+ __unmap_hugepage_range(&tlb, vma, start, end, ref_page);
+ tlb_finish_mmu(&tlb, start, end);
}
/*
@@ -2438,9 +2503,8 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
* from the time of fork. This would look like data corruption
*/
if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER))
- __unmap_hugepage_range(iter_vma,
- address, address + huge_page_size(h),
- page);
+ unmap_hugepage_range(iter_vma, address,
+ address + huge_page_size(h), page);
}
mutex_unlock(&mapping->i_mmap_mutex);
@@ -2496,6 +2560,7 @@ retry_avoidcopy:
new_page = alloc_huge_page(vma, address, outside_reserve);
if (IS_ERR(new_page)) {
+ long err = PTR_ERR(new_page);
page_cache_release(old_page);
/*
@@ -2524,7 +2589,10 @@ retry_avoidcopy:
/* Caller expects lock to be held */
spin_lock(&mm->page_table_lock);
- return -PTR_ERR(new_page);
+ if (err == -ENOMEM)
+ return VM_FAULT_OOM;
+ else
+ return VM_FAULT_SIGBUS;
}
/*
@@ -2642,7 +2710,11 @@ retry:
goto out;
page = alloc_huge_page(vma, address, 0);
if (IS_ERR(page)) {
- ret = -PTR_ERR(page);
+ ret = PTR_ERR(page);
+ if (ret == -ENOMEM)
+ ret = VM_FAULT_OOM;
+ else
+ ret = VM_FAULT_SIGBUS;
goto out;
}
clear_huge_page(page, address, pages_per_huge_page(h));
@@ -2679,7 +2751,7 @@ retry:
*/
if (unlikely(PageHWPoison(page))) {
ret = VM_FAULT_HWPOISON |
- VM_FAULT_SET_HINDEX(h - hstates);
+ VM_FAULT_SET_HINDEX(hstate_index(h));
goto backout_unlocked;
}
}
@@ -2752,7 +2824,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
return 0;
} else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
return VM_FAULT_HWPOISON_LARGE |
- VM_FAULT_SET_HINDEX(h - hstates);
+ VM_FAULT_SET_HINDEX(hstate_index(h));
}
ptep = huge_pte_alloc(mm, address, huge_page_size(h));
@@ -2959,9 +3031,14 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
}
}
spin_unlock(&mm->page_table_lock);
- mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
-
+ /*
+ * Must flush TLB before releasing i_mmap_mutex: x86's huge_pmd_unshare
+ * may have cleared our pud entry and done put_page on the page table:
+ * once we release i_mmap_mutex, another task can do the final put_page
+ * and that page table be reused and filled with junk.
+ */
flush_tlb_range(vma, start, end);
+ mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
}
int hugetlb_reserve_pages(struct inode *inode,
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
new file mode 100644
index 000000000000..a3f358fb8a0c
--- /dev/null
+++ b/mm/hugetlb_cgroup.c
@@ -0,0 +1,418 @@
+/*
+ *
+ * Copyright IBM Corporation, 2012
+ * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2.1 of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ */
+
+#include <linux/cgroup.h>
+#include <linux/slab.h>
+#include <linux/hugetlb.h>
+#include <linux/hugetlb_cgroup.h>
+
+struct hugetlb_cgroup {
+ struct cgroup_subsys_state css;
+ /*
+ * the counter to account for hugepages from hugetlb.
+ */
+ struct res_counter hugepage[HUGE_MAX_HSTATE];
+};
+
+#define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val))
+#define MEMFILE_IDX(val) (((val) >> 16) & 0xffff)
+#define MEMFILE_ATTR(val) ((val) & 0xffff)
+
+struct cgroup_subsys hugetlb_subsys __read_mostly;
+static struct hugetlb_cgroup *root_h_cgroup __read_mostly;
+
+static inline
+struct hugetlb_cgroup *hugetlb_cgroup_from_css(struct cgroup_subsys_state *s)
+{
+ return container_of(s, struct hugetlb_cgroup, css);
+}
+
+static inline
+struct hugetlb_cgroup *hugetlb_cgroup_from_cgroup(struct cgroup *cgroup)
+{
+ return hugetlb_cgroup_from_css(cgroup_subsys_state(cgroup,
+ hugetlb_subsys_id));
+}
+
+static inline
+struct hugetlb_cgroup *hugetlb_cgroup_from_task(struct task_struct *task)
+{
+ return hugetlb_cgroup_from_css(task_subsys_state(task,
+ hugetlb_subsys_id));
+}
+
+static inline bool hugetlb_cgroup_is_root(struct hugetlb_cgroup *h_cg)
+{
+ return (h_cg == root_h_cgroup);
+}
+
+static inline struct hugetlb_cgroup *parent_hugetlb_cgroup(struct cgroup *cg)
+{
+ if (!cg->parent)
+ return NULL;
+ return hugetlb_cgroup_from_cgroup(cg->parent);
+}
+
+static inline bool hugetlb_cgroup_have_usage(struct cgroup *cg)
+{
+ int idx;
+ struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_cgroup(cg);
+
+ for (idx = 0; idx < hugetlb_max_hstate; idx++) {
+ if ((res_counter_read_u64(&h_cg->hugepage[idx], RES_USAGE)) > 0)
+ return true;
+ }
+ return false;
+}
+
+static struct cgroup_subsys_state *hugetlb_cgroup_create(struct cgroup *cgroup)
+{
+ int idx;
+ struct cgroup *parent_cgroup;
+ struct hugetlb_cgroup *h_cgroup, *parent_h_cgroup;
+
+ h_cgroup = kzalloc(sizeof(*h_cgroup), GFP_KERNEL);
+ if (!h_cgroup)
+ return ERR_PTR(-ENOMEM);
+
+ parent_cgroup = cgroup->parent;
+ if (parent_cgroup) {
+ parent_h_cgroup = hugetlb_cgroup_from_cgroup(parent_cgroup);
+ for (idx = 0; idx < HUGE_MAX_HSTATE; idx++)
+ res_counter_init(&h_cgroup->hugepage[idx],
+ &parent_h_cgroup->hugepage[idx]);
+ } else {
+ root_h_cgroup = h_cgroup;
+ for (idx = 0; idx < HUGE_MAX_HSTATE; idx++)
+ res_counter_init(&h_cgroup->hugepage[idx], NULL);
+ }
+ return &h_cgroup->css;
+}
+
+static void hugetlb_cgroup_destroy(struct cgroup *cgroup)
+{
+ struct hugetlb_cgroup *h_cgroup;
+
+ h_cgroup = hugetlb_cgroup_from_cgroup(cgroup);
+ kfree(h_cgroup);
+}
+
+
+/*
+ * Should be called with hugetlb_lock held.
+ * Since we are holding hugetlb_lock, pages cannot get moved from
+ * active list or uncharged from the cgroup, So no need to get
+ * page reference and test for page active here. This function
+ * cannot fail.
+ */
+static void hugetlb_cgroup_move_parent(int idx, struct cgroup *cgroup,
+ struct page *page)
+{
+ int csize;
+ struct res_counter *counter;
+ struct res_counter *fail_res;
+ struct hugetlb_cgroup *page_hcg;
+ struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_cgroup(cgroup);
+ struct hugetlb_cgroup *parent = parent_hugetlb_cgroup(cgroup);
+
+ page_hcg = hugetlb_cgroup_from_page(page);
+ /*
+ * We can have pages in active list without any cgroup
+ * ie, hugepage with less than 3 pages. We can safely
+ * ignore those pages.
+ */
+ if (!page_hcg || page_hcg != h_cg)
+ goto out;
+
+ csize = PAGE_SIZE << compound_order(page);
+ if (!parent) {
+ parent = root_h_cgroup;
+ /* root has no limit */
+ res_counter_charge_nofail(&parent->hugepage[idx],
+ csize, &fail_res);
+ }
+ counter = &h_cg->hugepage[idx];
+ res_counter_uncharge_until(counter, counter->parent, csize);
+
+ set_hugetlb_cgroup(page, parent);
+out:
+ return;
+}
+
+/*
+ * Force the hugetlb cgroup to empty the hugetlb resources by moving them to
+ * the parent cgroup.
+ */
+static int hugetlb_cgroup_pre_destroy(struct cgroup *cgroup)
+{
+ struct hstate *h;
+ struct page *page;
+ int ret = 0, idx = 0;
+
+ do {
+ if (cgroup_task_count(cgroup) ||
+ !list_empty(&cgroup->children)) {
+ ret = -EBUSY;
+ goto out;
+ }
+ for_each_hstate(h) {
+ spin_lock(&hugetlb_lock);
+ list_for_each_entry(page, &h->hugepage_activelist, lru)
+ hugetlb_cgroup_move_parent(idx, cgroup, page);
+
+ spin_unlock(&hugetlb_lock);
+ idx++;
+ }
+ cond_resched();
+ } while (hugetlb_cgroup_have_usage(cgroup));
+out:
+ return ret;
+}
+
+int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
+ struct hugetlb_cgroup **ptr)
+{
+ int ret = 0;
+ struct res_counter *fail_res;
+ struct hugetlb_cgroup *h_cg = NULL;
+ unsigned long csize = nr_pages * PAGE_SIZE;
+
+ if (hugetlb_cgroup_disabled())
+ goto done;
+ /*
+ * We don't charge any cgroup if the compound page have less
+ * than 3 pages.
+ */
+ if (huge_page_order(&hstates[idx]) < HUGETLB_CGROUP_MIN_ORDER)
+ goto done;
+again:
+ rcu_read_lock();
+ h_cg = hugetlb_cgroup_from_task(current);
+ if (!css_tryget(&h_cg->css)) {
+ rcu_read_unlock();
+ goto again;
+ }
+ rcu_read_unlock();
+
+ ret = res_counter_charge(&h_cg->hugepage[idx], csize, &fail_res);
+ css_put(&h_cg->css);
+done:
+ *ptr = h_cg;
+ return ret;
+}
+
+/* Should be called with hugetlb_lock held */
+void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages,
+ struct hugetlb_cgroup *h_cg,
+ struct page *page)
+{
+ if (hugetlb_cgroup_disabled() || !h_cg)
+ return;
+
+ set_hugetlb_cgroup(page, h_cg);
+ return;
+}
+
+/*
+ * Should be called with hugetlb_lock held
+ */
+void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages,
+ struct page *page)
+{
+ struct hugetlb_cgroup *h_cg;
+ unsigned long csize = nr_pages * PAGE_SIZE;
+
+ if (hugetlb_cgroup_disabled())
+ return;
+ VM_BUG_ON(!spin_is_locked(&hugetlb_lock));
+ h_cg = hugetlb_cgroup_from_page(page);
+ if (unlikely(!h_cg))
+ return;
+ set_hugetlb_cgroup(page, NULL);
+ res_counter_uncharge(&h_cg->hugepage[idx], csize);
+ return;
+}
+
+void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages,
+ struct hugetlb_cgroup *h_cg)
+{
+ unsigned long csize = nr_pages * PAGE_SIZE;
+
+ if (hugetlb_cgroup_disabled() || !h_cg)
+ return;
+
+ if (huge_page_order(&hstates[idx]) < HUGETLB_CGROUP_MIN_ORDER)
+ return;
+
+ res_counter_uncharge(&h_cg->hugepage[idx], csize);
+ return;
+}
+
+static ssize_t hugetlb_cgroup_read(struct cgroup *cgroup, struct cftype *cft,
+ struct file *file, char __user *buf,
+ size_t nbytes, loff_t *ppos)
+{
+ u64 val;
+ char str[64];
+ int idx, name, len;
+ struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_cgroup(cgroup);
+
+ idx = MEMFILE_IDX(cft->private);
+ name = MEMFILE_ATTR(cft->private);
+
+ val = res_counter_read_u64(&h_cg->hugepage[idx], name);
+ len = scnprintf(str, sizeof(str), "%llu\n", (unsigned long long)val);
+ return simple_read_from_buffer(buf, nbytes, ppos, str, len);
+}
+
+static int hugetlb_cgroup_write(struct cgroup *cgroup, struct cftype *cft,
+ const char *buffer)
+{
+ int idx, name, ret;
+ unsigned long long val;
+ struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_cgroup(cgroup);
+
+ idx = MEMFILE_IDX(cft->private);
+ name = MEMFILE_ATTR(cft->private);
+
+ switch (name) {
+ case RES_LIMIT:
+ if (hugetlb_cgroup_is_root(h_cg)) {
+ /* Can't set limit on root */
+ ret = -EINVAL;
+ break;
+ }
+ /* This function does all necessary parse...reuse it */
+ ret = res_counter_memparse_write_strategy(buffer, &val);
+ if (ret)
+ break;
+ ret = res_counter_set_limit(&h_cg->hugepage[idx], val);
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+ return ret;
+}
+
+static int hugetlb_cgroup_reset(struct cgroup *cgroup, unsigned int event)
+{
+ int idx, name, ret = 0;
+ struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_cgroup(cgroup);
+
+ idx = MEMFILE_IDX(event);
+ name = MEMFILE_ATTR(event);
+
+ switch (name) {
+ case RES_MAX_USAGE:
+ res_counter_reset_max(&h_cg->hugepage[idx]);
+ break;
+ case RES_FAILCNT:
+ res_counter_reset_failcnt(&h_cg->hugepage[idx]);
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+ return ret;
+}
+
+static char *mem_fmt(char *buf, int size, unsigned long hsize)
+{
+ if (hsize >= (1UL << 30))
+ snprintf(buf, size, "%luGB", hsize >> 30);
+ else if (hsize >= (1UL << 20))
+ snprintf(buf, size, "%luMB", hsize >> 20);
+ else
+ snprintf(buf, size, "%luKB", hsize >> 10);
+ return buf;
+}
+
+int __init hugetlb_cgroup_file_init(int idx)
+{
+ char buf[32];
+ struct cftype *cft;
+ struct hstate *h = &hstates[idx];
+
+ /* format the size */
+ mem_fmt(buf, 32, huge_page_size(h));
+
+ /* Add the limit file */
+ cft = &h->cgroup_files[0];
+ snprintf(cft->name, MAX_CFTYPE_NAME, "%s.limit_in_bytes", buf);
+ cft->private = MEMFILE_PRIVATE(idx, RES_LIMIT);
+ cft->read = hugetlb_cgroup_read;
+ cft->write_string = hugetlb_cgroup_write;
+
+ /* Add the usage file */
+ cft = &h->cgroup_files[1];
+ snprintf(cft->name, MAX_CFTYPE_NAME, "%s.usage_in_bytes", buf);
+ cft->private = MEMFILE_PRIVATE(idx, RES_USAGE);
+ cft->read = hugetlb_cgroup_read;
+
+ /* Add the MAX usage file */
+ cft = &h->cgroup_files[2];
+ snprintf(cft->name, MAX_CFTYPE_NAME, "%s.max_usage_in_bytes", buf);
+ cft->private = MEMFILE_PRIVATE(idx, RES_MAX_USAGE);
+ cft->trigger = hugetlb_cgroup_reset;
+ cft->read = hugetlb_cgroup_read;
+
+ /* Add the failcntfile */
+ cft = &h->cgroup_files[3];
+ snprintf(cft->name, MAX_CFTYPE_NAME, "%s.failcnt", buf);
+ cft->private = MEMFILE_PRIVATE(idx, RES_FAILCNT);
+ cft->trigger = hugetlb_cgroup_reset;
+ cft->read = hugetlb_cgroup_read;
+
+ /* NULL terminate the last cft */
+ cft = &h->cgroup_files[4];
+ memset(cft, 0, sizeof(*cft));
+
+ WARN_ON(cgroup_add_cftypes(&hugetlb_subsys, h->cgroup_files));
+
+ return 0;
+}
+
+/*
+ * hugetlb_lock will make sure a parallel cgroup rmdir won't happen
+ * when we migrate hugepages
+ */
+void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage)
+{
+ struct hugetlb_cgroup *h_cg;
+ struct hstate *h = page_hstate(oldhpage);
+
+ if (hugetlb_cgroup_disabled())
+ return;
+
+ VM_BUG_ON(!PageHuge(oldhpage));
+ spin_lock(&hugetlb_lock);
+ h_cg = hugetlb_cgroup_from_page(oldhpage);
+ set_hugetlb_cgroup(oldhpage, NULL);
+
+ /* move the h_cg details to new cgroup */
+ set_hugetlb_cgroup(newhpage, h_cg);
+ list_move(&newhpage->lru, &h->hugepage_activelist);
+ spin_unlock(&hugetlb_lock);
+ return;
+}
+
+struct cgroup_subsys hugetlb_subsys = {
+ .name = "hugetlb",
+ .create = hugetlb_cgroup_create,
+ .pre_destroy = hugetlb_cgroup_pre_destroy,
+ .destroy = hugetlb_cgroup_destroy,
+ .subsys_id = hugetlb_subsys_id,
+};
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c
index cc448bb983ba..3a61efc518d5 100644
--- a/mm/hwpoison-inject.c
+++ b/mm/hwpoison-inject.c
@@ -123,7 +123,7 @@ static int pfn_inject_init(void)
if (!dentry)
goto fail;
-#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
+#ifdef CONFIG_MEMCG_SWAP
dentry = debugfs_create_u64("corrupt-filter-memcg", 0600,
hwpoison_dir, &hwpoison_filter_memcg);
if (!dentry)
diff --git a/mm/internal.h b/mm/internal.h
index 2ba87fbfb75b..3314f79d775a 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -118,8 +118,14 @@ struct compact_control {
unsigned long nr_freepages; /* Number of isolated free pages */
unsigned long nr_migratepages; /* Number of pages to migrate */
unsigned long free_pfn; /* isolate_freepages search base */
+ unsigned long start_free_pfn; /* where we started the search */
unsigned long migrate_pfn; /* isolate_migratepages search base */
bool sync; /* Synchronous migration */
+ bool wrapped; /* Order > 0 compactions are
+ incremental, once free_pfn
+ and migrate_pfn meet, we restart
+ from the top of the zone;
+ remember we wrapped around. */
int order; /* order a direct compactor needs */
int migratetype; /* MOVABLE, RECLAIMABLE etc */
@@ -347,3 +353,5 @@ extern u32 hwpoison_filter_enable;
extern unsigned long vm_mmap_pgoff(struct file *, unsigned long,
unsigned long, unsigned long,
unsigned long, unsigned long);
+
+extern void set_pageblock_order(void);
diff --git a/mm/madvise.c b/mm/madvise.c
index deff1b64a08c..14d260fa0d17 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -15,6 +15,7 @@
#include <linux/sched.h>
#include <linux/ksm.h>
#include <linux/fs.h>
+#include <linux/file.h>
/*
* Any behaviour which results in changes to the vma->vm_flags needs to
@@ -204,14 +205,16 @@ static long madvise_remove(struct vm_area_struct *vma,
{
loff_t offset;
int error;
+ struct file *f;
*prev = NULL; /* tell sys_madvise we drop mmap_sem */
if (vma->vm_flags & (VM_LOCKED|VM_NONLINEAR|VM_HUGETLB))
return -EINVAL;
- if (!vma->vm_file || !vma->vm_file->f_mapping
- || !vma->vm_file->f_mapping->host) {
+ f = vma->vm_file;
+
+ if (!f || !f->f_mapping || !f->f_mapping->host) {
return -EINVAL;
}
@@ -221,11 +224,18 @@ static long madvise_remove(struct vm_area_struct *vma,
offset = (loff_t)(start - vma->vm_start)
+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
- /* filesystem's fallocate may need to take i_mutex */
+ /*
+ * Filesystem's fallocate may need to take i_mutex. We need to
+ * explicitly grab a reference because the vma (and hence the
+ * vma's reference to the file) can go away as soon as we drop
+ * mmap_sem.
+ */
+ get_file(f);
up_read(&current->mm->mmap_sem);
- error = do_fallocate(vma->vm_file,
+ error = do_fallocate(f,
FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
offset, end - start);
+ fput(f);
down_read(&current->mm->mmap_sem);
return error;
}
diff --git a/mm/memblock.c b/mm/memblock.c
index 952123eba433..4d9393c7edc9 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -143,30 +143,6 @@ phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start,
MAX_NUMNODES);
}
-/*
- * Free memblock.reserved.regions
- */
-int __init_memblock memblock_free_reserved_regions(void)
-{
- if (memblock.reserved.regions == memblock_reserved_init_regions)
- return 0;
-
- return memblock_free(__pa(memblock.reserved.regions),
- sizeof(struct memblock_region) * memblock.reserved.max);
-}
-
-/*
- * Reserve memblock.reserved.regions
- */
-int __init_memblock memblock_reserve_reserved_regions(void)
-{
- if (memblock.reserved.regions == memblock_reserved_init_regions)
- return 0;
-
- return memblock_reserve(__pa(memblock.reserved.regions),
- sizeof(struct memblock_region) * memblock.reserved.max);
-}
-
static void __init_memblock memblock_remove_region(struct memblock_type *type, unsigned long r)
{
type->total_size -= type->regions[r].size;
@@ -184,9 +160,39 @@ static void __init_memblock memblock_remove_region(struct memblock_type *type, u
}
}
-static int __init_memblock memblock_double_array(struct memblock_type *type)
+phys_addr_t __init_memblock get_allocated_memblock_reserved_regions_info(
+ phys_addr_t *addr)
+{
+ if (memblock.reserved.regions == memblock_reserved_init_regions)
+ return 0;
+
+ *addr = __pa(memblock.reserved.regions);
+
+ return PAGE_ALIGN(sizeof(struct memblock_region) *
+ memblock.reserved.max);
+}
+
+/**
+ * memblock_double_array - double the size of the memblock regions array
+ * @type: memblock type of the regions array being doubled
+ * @new_area_start: starting address of memory range to avoid overlap with
+ * @new_area_size: size of memory range to avoid overlap with
+ *
+ * Double the size of the @type regions array. If memblock is being used to
+ * allocate memory for a new reserved regions array and there is a previously
+ * allocated memory range [@new_area_start,@new_area_start+@new_area_size]
+ * waiting to be reserved, ensure the memory used by the new array does
+ * not overlap.
+ *
+ * RETURNS:
+ * 0 on success, -1 on failure.
+ */
+static int __init_memblock memblock_double_array(struct memblock_type *type,
+ phys_addr_t new_area_start,
+ phys_addr_t new_area_size)
{
struct memblock_region *new_array, *old_array;
+ phys_addr_t old_alloc_size, new_alloc_size;
phys_addr_t old_size, new_size, addr;
int use_slab = slab_is_available();
int *in_slab;
@@ -200,6 +206,12 @@ static int __init_memblock memblock_double_array(struct memblock_type *type)
/* Calculate new doubled size */
old_size = type->max * sizeof(struct memblock_region);
new_size = old_size << 1;
+ /*
+ * We need to allocated new one align to PAGE_SIZE,
+ * so we can free them completely later.
+ */
+ old_alloc_size = PAGE_ALIGN(old_size);
+ new_alloc_size = PAGE_ALIGN(new_size);
/* Retrieve the slab flag */
if (type == &memblock.memory)
@@ -210,19 +222,30 @@ static int __init_memblock memblock_double_array(struct memblock_type *type)
/* Try to find some space for it.
*
* WARNING: We assume that either slab_is_available() and we use it or
- * we use MEMBLOCK for allocations. That means that this is unsafe to use
- * when bootmem is currently active (unless bootmem itself is implemented
- * on top of MEMBLOCK which isn't the case yet)
+ * we use MEMBLOCK for allocations. That means that this is unsafe to
+ * use when bootmem is currently active (unless bootmem itself is
+ * implemented on top of MEMBLOCK which isn't the case yet)
*
* This should however not be an issue for now, as we currently only
- * call into MEMBLOCK while it's still active, or much later when slab is
- * active for memory hotplug operations
+ * call into MEMBLOCK while it's still active, or much later when slab
+ * is active for memory hotplug operations
*/
if (use_slab) {
new_array = kmalloc(new_size, GFP_KERNEL);
addr = new_array ? __pa(new_array) : 0;
} else {
- addr = memblock_find_in_range(0, MEMBLOCK_ALLOC_ACCESSIBLE, new_size, sizeof(phys_addr_t));
+ /* only exclude range when trying to double reserved.regions */
+ if (type != &memblock.reserved)
+ new_area_start = new_area_size = 0;
+
+ addr = memblock_find_in_range(new_area_start + new_area_size,
+ memblock.current_limit,
+ new_alloc_size, PAGE_SIZE);
+ if (!addr && new_area_size)
+ addr = memblock_find_in_range(0,
+ min(new_area_start, memblock.current_limit),
+ new_alloc_size, PAGE_SIZE);
+
new_array = addr ? __va(addr) : 0;
}
if (!addr) {
@@ -231,12 +254,14 @@ static int __init_memblock memblock_double_array(struct memblock_type *type)
return -1;
}
- memblock_dbg("memblock: %s array is doubled to %ld at [%#010llx-%#010llx]",
- memblock_type_name(type), type->max * 2, (u64)addr, (u64)addr + new_size - 1);
+ memblock_dbg("memblock: %s is doubled to %ld at [%#010llx-%#010llx]",
+ memblock_type_name(type), type->max * 2, (u64)addr,
+ (u64)addr + new_size - 1);
- /* Found space, we now need to move the array over before
- * we add the reserved region since it may be our reserved
- * array itself that is full.
+ /*
+ * Found space, we now need to move the array over before we add the
+ * reserved region since it may be our reserved array itself that is
+ * full.
*/
memcpy(new_array, type->regions, old_size);
memset(new_array + type->max, 0, old_size);
@@ -244,20 +269,19 @@ static int __init_memblock memblock_double_array(struct memblock_type *type)
type->regions = new_array;
type->max <<= 1;
- /* Free old array. We needn't free it if the array is the
- * static one
- */
+ /* Free old array. We needn't free it if the array is the static one */
if (*in_slab)
kfree(old_array);
else if (old_array != memblock_memory_init_regions &&
old_array != memblock_reserved_init_regions)
- memblock_free(__pa(old_array), old_size);
+ memblock_free(__pa(old_array), old_alloc_size);
- /* Reserve the new array if that comes from the memblock.
- * Otherwise, we needn't do it
+ /*
+ * Reserve the new array if that comes from the memblock. Otherwise, we
+ * needn't do it
*/
if (!use_slab)
- BUG_ON(memblock_reserve(addr, new_size));
+ BUG_ON(memblock_reserve(addr, new_alloc_size));
/* Update slab flag */
*in_slab = use_slab;
@@ -399,7 +423,7 @@ repeat:
*/
if (!insert) {
while (type->cnt + nr_new > type->max)
- if (memblock_double_array(type) < 0)
+ if (memblock_double_array(type, obase, size) < 0)
return -ENOMEM;
insert = true;
goto repeat;
@@ -450,7 +474,7 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type,
/* we'll create at most two more regions */
while (type->cnt + 2 > type->max)
- if (memblock_double_array(type) < 0)
+ if (memblock_double_array(type, base, size) < 0)
return -ENOMEM;
for (i = 0; i < type->cnt; i++) {
@@ -540,9 +564,9 @@ int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size)
* __next_free_mem_range - next function for for_each_free_mem_range()
* @idx: pointer to u64 loop variable
* @nid: nid: node selector, %MAX_NUMNODES for all nodes
- * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL
- * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
- * @p_nid: ptr to int for nid of the range, can be %NULL
+ * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL
+ * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL
+ * @out_nid: ptr to int for nid of the range, can be %NULL
*
* Find the first free area from *@idx which matches @nid, fill the out
* parameters, and update *@idx for the next iteration. The lower 32bit of
@@ -616,9 +640,9 @@ void __init_memblock __next_free_mem_range(u64 *idx, int nid,
* __next_free_mem_range_rev - next function for for_each_free_mem_range_reverse()
* @idx: pointer to u64 loop variable
* @nid: nid: node selector, %MAX_NUMNODES for all nodes
- * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL
- * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
- * @p_nid: ptr to int for nid of the range, can be %NULL
+ * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL
+ * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL
+ * @out_nid: ptr to int for nid of the range, can be %NULL
*
* Reverse of __next_free_mem_range().
*/
@@ -867,6 +891,16 @@ int __init_memblock memblock_is_memory(phys_addr_t addr)
return memblock_search(&memblock.memory, addr) != -1;
}
+/**
+ * memblock_is_region_memory - check if a region is a subset of memory
+ * @base: base of region to check
+ * @size: size of region to check
+ *
+ * Check if the region [@base, @base+@size) is a subset of a memory block.
+ *
+ * RETURNS:
+ * 0 if false, non-zero if true
+ */
int __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size)
{
int idx = memblock_search(&memblock.memory, base);
@@ -879,6 +913,16 @@ int __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size
memblock.memory.regions[idx].size) >= end;
}
+/**
+ * memblock_is_region_reserved - check if a region intersects reserved memory
+ * @base: base of region to check
+ * @size: size of region to check
+ *
+ * Check if the region [@base, @base+@size) intersects a reserved memory block.
+ *
+ * RETURNS:
+ * 0 if false, non-zero if true
+ */
int __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t size)
{
memblock_cap_size(base, &size);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index ac35bccadb7b..795e525afaba 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -61,12 +61,12 @@ struct cgroup_subsys mem_cgroup_subsys __read_mostly;
#define MEM_CGROUP_RECLAIM_RETRIES 5
static struct mem_cgroup *root_mem_cgroup __read_mostly;
-#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
+#ifdef CONFIG_MEMCG_SWAP
/* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
int do_swap_account __read_mostly;
/* for remember boot option*/
-#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP_ENABLED
+#ifdef CONFIG_MEMCG_SWAP_ENABLED
static int really_do_swap_account __initdata = 1;
#else
static int really_do_swap_account __initdata = 0;
@@ -87,7 +87,7 @@ enum mem_cgroup_stat_index {
MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */
MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */
MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */
- MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
+ MEM_CGROUP_STAT_SWAP, /* # of pages, swapped out */
MEM_CGROUP_STAT_NSTATS,
};
@@ -378,9 +378,7 @@ static bool move_file(void)
enum charge_type {
MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
- MEM_CGROUP_CHARGE_TYPE_MAPPED,
- MEM_CGROUP_CHARGE_TYPE_SHMEM, /* used by page migration of shmem */
- MEM_CGROUP_CHARGE_TYPE_FORCE, /* used by force_empty */
+ MEM_CGROUP_CHARGE_TYPE_ANON,
MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */
MEM_CGROUP_CHARGE_TYPE_DROP, /* a page was unused swap cache */
NR_CHARGE_TYPE,
@@ -407,8 +405,14 @@ enum charge_type {
static void mem_cgroup_get(struct mem_cgroup *memcg);
static void mem_cgroup_put(struct mem_cgroup *memcg);
+static inline
+struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s)
+{
+ return container_of(s, struct mem_cgroup, css);
+}
+
/* Writing them here to avoid exposing memcg's inner layout */
-#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
+#ifdef CONFIG_MEMCG_KMEM
#include <net/sock.h>
#include <net/ip.h>
@@ -467,9 +471,9 @@ struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg)
}
EXPORT_SYMBOL(tcp_proto_cgroup);
#endif /* CONFIG_INET */
-#endif /* CONFIG_CGROUP_MEM_RES_CTLR_KMEM */
+#endif /* CONFIG_MEMCG_KMEM */
-#if defined(CONFIG_INET) && defined(CONFIG_CGROUP_MEM_RES_CTLR_KMEM)
+#if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM)
static void disarm_sock_keys(struct mem_cgroup *memcg)
{
if (!memcg_proto_activated(&memcg->tcp_mem.cg_proto))
@@ -703,7 +707,7 @@ static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
bool charge)
{
int val = (charge) ? 1 : -1;
- this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAPOUT], val);
+ this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val);
}
static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
@@ -864,9 +868,8 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
{
- return container_of(cgroup_subsys_state(cont,
- mem_cgroup_subsys_id), struct mem_cgroup,
- css);
+ return mem_cgroup_from_css(
+ cgroup_subsys_state(cont, mem_cgroup_subsys_id));
}
struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
@@ -879,8 +882,7 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
if (unlikely(!p))
return NULL;
- return container_of(task_subsys_state(p, mem_cgroup_subsys_id),
- struct mem_cgroup, css);
+ return mem_cgroup_from_css(task_subsys_state(p, mem_cgroup_subsys_id));
}
struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
@@ -966,8 +968,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
css = css_get_next(&mem_cgroup_subsys, id + 1, &root->css, &id);
if (css) {
if (css == &root->css || css_tryget(css))
- memcg = container_of(css,
- struct mem_cgroup, css);
+ memcg = mem_cgroup_from_css(css);
} else
id = 0;
rcu_read_unlock();
@@ -1148,7 +1149,7 @@ bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
{
if (root_memcg == memcg)
return true;
- if (!root_memcg->use_hierarchy)
+ if (!root_memcg->use_hierarchy || !memcg)
return false;
return css_is_ancestor(&memcg->css, &root_memcg->css);
}
@@ -1234,7 +1235,7 @@ int mem_cgroup_inactive_file_is_low(struct lruvec *lruvec)
/**
* mem_cgroup_margin - calculate chargeable space of a memory cgroup
- * @mem: the memory cgroup
+ * @memcg: the memory cgroup
*
* Returns the maximum amount of memory @mem can be charged with, in
* pages.
@@ -1454,7 +1455,7 @@ static int mem_cgroup_count_children(struct mem_cgroup *memcg)
/*
* Return the memory (and swap, if configured) limit for a memcg.
*/
-u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
+static u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
{
u64 limit;
u64 memsw;
@@ -1470,6 +1471,73 @@ u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
return min(limit, memsw);
}
+void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
+ int order)
+{
+ struct mem_cgroup *iter;
+ unsigned long chosen_points = 0;
+ unsigned long totalpages;
+ unsigned int points = 0;
+ struct task_struct *chosen = NULL;
+
+ /*
+ * If current has a pending SIGKILL, then automatically select it. The
+ * goal is to allow it to allocate so that it may quickly exit and free
+ * its memory.
+ */
+ if (fatal_signal_pending(current)) {
+ set_thread_flag(TIF_MEMDIE);
+ return;
+ }
+
+ check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL);
+ totalpages = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT ? : 1;
+ for_each_mem_cgroup_tree(iter, memcg) {
+ struct cgroup *cgroup = iter->css.cgroup;
+ struct cgroup_iter it;
+ struct task_struct *task;
+
+ cgroup_iter_start(cgroup, &it);
+ while ((task = cgroup_iter_next(cgroup, &it))) {
+ switch (oom_scan_process_thread(task, totalpages, NULL,
+ false)) {
+ case OOM_SCAN_SELECT:
+ if (chosen)
+ put_task_struct(chosen);
+ chosen = task;
+ chosen_points = ULONG_MAX;
+ get_task_struct(chosen);
+ /* fall through */
+ case OOM_SCAN_CONTINUE:
+ continue;
+ case OOM_SCAN_ABORT:
+ cgroup_iter_end(cgroup, &it);
+ mem_cgroup_iter_break(memcg, iter);
+ if (chosen)
+ put_task_struct(chosen);
+ return;
+ case OOM_SCAN_OK:
+ break;
+ };
+ points = oom_badness(task, memcg, NULL, totalpages);
+ if (points > chosen_points) {
+ if (chosen)
+ put_task_struct(chosen);
+ chosen = task;
+ chosen_points = points;
+ get_task_struct(chosen);
+ }
+ }
+ cgroup_iter_end(cgroup, &it);
+ }
+
+ if (!chosen)
+ return;
+ points = chosen_points * 1000 / totalpages;
+ oom_kill_process(chosen, gfp_mask, order, points, totalpages, memcg,
+ NULL, "Memory cgroup out of memory");
+}
+
static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,
gfp_t gfp_mask,
unsigned long flags)
@@ -1508,7 +1576,7 @@ static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,
/**
* test_mem_cgroup_node_reclaimable
- * @mem: the target memcg
+ * @memcg: the target memcg
* @nid: the node ID to be checked.
* @noswap : specify true here if the user wants flle only information.
*
@@ -1899,7 +1967,7 @@ again:
return;
/*
* If this memory cgroup is not under account moving, we don't
- * need to take move_lock_page_cgroup(). Because we already hold
+ * need to take move_lock_mem_cgroup(). Because we already hold
* rcu_read_lock(), any calls to move_account will be delayed until
* rcu_read_unlock() if mem_cgroup_stolen() == true.
*/
@@ -1921,7 +1989,7 @@ void __mem_cgroup_end_update_page_stat(struct page *page, unsigned long *flags)
/*
* It's guaranteed that pc->mem_cgroup never changes while
* lock is held because a routine modifies pc->mem_cgroup
- * should take move_lock_page_cgroup().
+ * should take move_lock_mem_cgroup().
*/
move_unlock_mem_cgroup(pc->mem_cgroup, flags);
}
@@ -2268,7 +2336,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
* We always charge the cgroup the mm_struct belongs to.
* The mm_struct's mem_cgroup changes on task migration if the
* thread group leader migrates. It's possible that mm is not
- * set, if so charge the init_mm (happens for pagecache usage).
+ * set, if so charge the root memcg (happens for pagecache usage).
*/
if (!*ptr && !mm)
*ptr = root_mem_cgroup;
@@ -2429,7 +2497,7 @@ static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
css = css_lookup(&mem_cgroup_subsys, id);
if (!css)
return NULL;
- return container_of(css, struct mem_cgroup, css);
+ return mem_cgroup_from_css(css);
}
struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
@@ -2473,11 +2541,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
bool anon;
lock_page_cgroup(pc);
- if (unlikely(PageCgroupUsed(pc))) {
- unlock_page_cgroup(pc);
- __mem_cgroup_cancel_charge(memcg, nr_pages);
- return;
- }
+ VM_BUG_ON(PageCgroupUsed(pc));
/*
* we don't need page_cgroup_lock about tail pages, becase they are not
* accessed by any other context at this point.
@@ -2519,7 +2583,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
spin_unlock_irq(&zone->lru_lock);
}
- if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED)
+ if (ctype == MEM_CGROUP_CHARGE_TYPE_ANON)
anon = true;
else
anon = false;
@@ -2644,8 +2708,7 @@ out:
static int mem_cgroup_move_parent(struct page *page,
struct page_cgroup *pc,
- struct mem_cgroup *child,
- gfp_t gfp_mask)
+ struct mem_cgroup *child)
{
struct mem_cgroup *parent;
unsigned int nr_pages;
@@ -2728,38 +2791,7 @@ int mem_cgroup_newpage_charge(struct page *page,
VM_BUG_ON(page->mapping && !PageAnon(page));
VM_BUG_ON(!mm);
return mem_cgroup_charge_common(page, mm, gfp_mask,
- MEM_CGROUP_CHARGE_TYPE_MAPPED);
-}
-
-static void
-__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
- enum charge_type ctype);
-
-int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
- gfp_t gfp_mask)
-{
- struct mem_cgroup *memcg = NULL;
- enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
- int ret;
-
- if (mem_cgroup_disabled())
- return 0;
- if (PageCompound(page))
- return 0;
-
- if (unlikely(!mm))
- mm = &init_mm;
- if (!page_is_file_cache(page))
- type = MEM_CGROUP_CHARGE_TYPE_SHMEM;
-
- if (!PageSwapCache(page))
- ret = mem_cgroup_charge_common(page, mm, gfp_mask, type);
- else { /* page is swapcache/shmem */
- ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &memcg);
- if (!ret)
- __mem_cgroup_commit_charge_swapin(page, memcg, type);
- }
- return ret;
+ MEM_CGROUP_CHARGE_TYPE_ANON);
}
/*
@@ -2768,27 +2800,26 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
* struct page_cgroup is acquired. This refcnt will be consumed by
* "commit()" or removed by "cancel()"
*/
-int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
- struct page *page,
- gfp_t mask, struct mem_cgroup **memcgp)
+static int __mem_cgroup_try_charge_swapin(struct mm_struct *mm,
+ struct page *page,
+ gfp_t mask,
+ struct mem_cgroup **memcgp)
{
struct mem_cgroup *memcg;
+ struct page_cgroup *pc;
int ret;
- *memcgp = NULL;
-
- if (mem_cgroup_disabled())
- return 0;
-
- if (!do_swap_account)
- goto charge_cur_mm;
+ pc = lookup_page_cgroup(page);
/*
- * A racing thread's fault, or swapoff, may have already updated
- * the pte, and even removed page from swap cache: in those cases
- * do_swap_page()'s pte_same() test will fail; but there's also a
- * KSM case which does need to charge the page.
+ * Every swap fault against a single page tries to charge the
+ * page, bail as early as possible. shmem_unuse() encounters
+ * already charged pages, too. The USED bit is protected by
+ * the page lock, which serializes swap cache removal, which
+ * in turn serializes uncharging.
*/
- if (!PageSwapCache(page))
+ if (PageCgroupUsed(pc))
+ return 0;
+ if (!do_swap_account)
goto charge_cur_mm;
memcg = try_get_mem_cgroup_from_page(page);
if (!memcg)
@@ -2800,14 +2831,44 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
ret = 0;
return ret;
charge_cur_mm:
- if (unlikely(!mm))
- mm = &init_mm;
ret = __mem_cgroup_try_charge(mm, mask, 1, memcgp, true);
if (ret == -EINTR)
ret = 0;
return ret;
}
+int mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct page *page,
+ gfp_t gfp_mask, struct mem_cgroup **memcgp)
+{
+ *memcgp = NULL;
+ if (mem_cgroup_disabled())
+ return 0;
+ /*
+ * A racing thread's fault, or swapoff, may have already
+ * updated the pte, and even removed page from swap cache: in
+ * those cases unuse_pte()'s pte_same() test will fail; but
+ * there's also a KSM case which does need to charge the page.
+ */
+ if (!PageSwapCache(page)) {
+ int ret;
+
+ ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, memcgp, true);
+ if (ret == -EINTR)
+ ret = 0;
+ return ret;
+ }
+ return __mem_cgroup_try_charge_swapin(mm, page, gfp_mask, memcgp);
+}
+
+void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg)
+{
+ if (mem_cgroup_disabled())
+ return;
+ if (!memcg)
+ return;
+ __mem_cgroup_cancel_charge(memcg, 1);
+}
+
static void
__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg,
enum charge_type ctype)
@@ -2842,16 +2903,30 @@ void mem_cgroup_commit_charge_swapin(struct page *page,
struct mem_cgroup *memcg)
{
__mem_cgroup_commit_charge_swapin(page, memcg,
- MEM_CGROUP_CHARGE_TYPE_MAPPED);
+ MEM_CGROUP_CHARGE_TYPE_ANON);
}
-void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg)
+int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
+ gfp_t gfp_mask)
{
+ struct mem_cgroup *memcg = NULL;
+ enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
+ int ret;
+
if (mem_cgroup_disabled())
- return;
- if (!memcg)
- return;
- __mem_cgroup_cancel_charge(memcg, 1);
+ return 0;
+ if (PageCompound(page))
+ return 0;
+
+ if (!PageSwapCache(page))
+ ret = mem_cgroup_charge_common(page, mm, gfp_mask, type);
+ else { /* page is swapcache/shmem */
+ ret = __mem_cgroup_try_charge_swapin(mm, page,
+ gfp_mask, &memcg);
+ if (!ret)
+ __mem_cgroup_commit_charge_swapin(page, memcg, type);
+ }
+ return ret;
}
static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg,
@@ -2911,7 +2986,8 @@ direct_uncharge:
* uncharge if !page_mapped(page)
*/
static struct mem_cgroup *
-__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
+__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype,
+ bool end_migration)
{
struct mem_cgroup *memcg = NULL;
unsigned int nr_pages = 1;
@@ -2921,8 +2997,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
if (mem_cgroup_disabled())
return NULL;
- if (PageSwapCache(page))
- return NULL;
+ VM_BUG_ON(PageSwapCache(page));
if (PageTransHuge(page)) {
nr_pages <<= compound_order(page);
@@ -2945,7 +3020,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
anon = PageAnon(page);
switch (ctype) {
- case MEM_CGROUP_CHARGE_TYPE_MAPPED:
+ case MEM_CGROUP_CHARGE_TYPE_ANON:
/*
* Generally PageAnon tells if it's the anon statistics to be
* updated; but sometimes e.g. mem_cgroup_uncharge_page() is
@@ -2955,7 +3030,16 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
/* fallthrough */
case MEM_CGROUP_CHARGE_TYPE_DROP:
/* See mem_cgroup_prepare_migration() */
- if (page_mapped(page) || PageCgroupMigration(pc))
+ if (page_mapped(page))
+ goto unlock_out;
+ /*
+ * Pages under migration may not be uncharged. But
+ * end_migration() /must/ be the one uncharging the
+ * unused post-migration page and so it has to call
+ * here with the migration bit still set. See the
+ * res_counter handling below.
+ */
+ if (!end_migration && PageCgroupMigration(pc))
goto unlock_out;
break;
case MEM_CGROUP_CHARGE_TYPE_SWAPOUT:
@@ -2989,7 +3073,12 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
mem_cgroup_swap_statistics(memcg, true);
mem_cgroup_get(memcg);
}
- if (!mem_cgroup_is_root(memcg))
+ /*
+ * Migration does not charge the res_counter for the
+ * replacement page, so leave it alone when phasing out the
+ * page that is unused after the migration.
+ */
+ if (!end_migration && !mem_cgroup_is_root(memcg))
mem_cgroup_do_uncharge(memcg, nr_pages, ctype);
return memcg;
@@ -3005,14 +3094,16 @@ void mem_cgroup_uncharge_page(struct page *page)
if (page_mapped(page))
return;
VM_BUG_ON(page->mapping && !PageAnon(page));
- __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED);
+ if (PageSwapCache(page))
+ return;
+ __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_ANON, false);
}
void mem_cgroup_uncharge_cache_page(struct page *page)
{
VM_BUG_ON(page_mapped(page));
VM_BUG_ON(page->mapping);
- __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE);
+ __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE, false);
}
/*
@@ -3076,7 +3167,7 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
if (!swapout) /* this was a swap cache but the swap is unused ! */
ctype = MEM_CGROUP_CHARGE_TYPE_DROP;
- memcg = __mem_cgroup_uncharge_common(page, ctype);
+ memcg = __mem_cgroup_uncharge_common(page, ctype, false);
/*
* record memcg information, if swapout && memcg != NULL,
@@ -3087,7 +3178,7 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
}
#endif
-#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
+#ifdef CONFIG_MEMCG_SWAP
/*
* called from swap_entry_free(). remove record in swap_cgroup and
* uncharge "memsw" account.
@@ -3166,19 +3257,18 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
* Before starting migration, account PAGE_SIZE to mem_cgroup that the old
* page belongs to.
*/
-int mem_cgroup_prepare_migration(struct page *page,
- struct page *newpage, struct mem_cgroup **memcgp, gfp_t gfp_mask)
+void mem_cgroup_prepare_migration(struct page *page, struct page *newpage,
+ struct mem_cgroup **memcgp)
{
struct mem_cgroup *memcg = NULL;
struct page_cgroup *pc;
enum charge_type ctype;
- int ret = 0;
*memcgp = NULL;
VM_BUG_ON(PageTransHuge(page));
if (mem_cgroup_disabled())
- return 0;
+ return;
pc = lookup_page_cgroup(page);
lock_page_cgroup(pc);
@@ -3223,24 +3313,9 @@ int mem_cgroup_prepare_migration(struct page *page,
* we return here.
*/
if (!memcg)
- return 0;
+ return;
*memcgp = memcg;
- ret = __mem_cgroup_try_charge(NULL, gfp_mask, 1, memcgp, false);
- css_put(&memcg->css);/* drop extra refcnt */
- if (ret) {
- if (PageAnon(page)) {
- lock_page_cgroup(pc);
- ClearPageCgroupMigration(pc);
- unlock_page_cgroup(pc);
- /*
- * The old page may be fully unmapped while we kept it.
- */
- mem_cgroup_uncharge_page(page);
- }
- /* we'll need to revisit this error code (we have -EINTR) */
- return -ENOMEM;
- }
/*
* We charge new page before it's used/mapped. So, even if unlock_page()
* is called before end_migration, we can catch all events on this new
@@ -3248,13 +3323,15 @@ int mem_cgroup_prepare_migration(struct page *page,
* mapcount will be finally 0 and we call uncharge in end_migration().
*/
if (PageAnon(page))
- ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED;
- else if (page_is_file_cache(page))
- ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
+ ctype = MEM_CGROUP_CHARGE_TYPE_ANON;
else
- ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
+ ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
+ /*
+ * The page is committed to the memcg, but it's not actually
+ * charged to the res_counter since we plan on replacing the
+ * old one and only one page is going to be left afterwards.
+ */
__mem_cgroup_commit_charge(memcg, newpage, 1, ctype, false);
- return ret;
}
/* remove redundant charge if migration failed*/
@@ -3276,6 +3353,12 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg,
used = newpage;
unused = oldpage;
}
+ anon = PageAnon(used);
+ __mem_cgroup_uncharge_common(unused,
+ anon ? MEM_CGROUP_CHARGE_TYPE_ANON
+ : MEM_CGROUP_CHARGE_TYPE_CACHE,
+ true);
+ css_put(&memcg->css);
/*
* We disallowed uncharge of pages under migration because mapcount
* of the page goes down to zero, temporarly.
@@ -3285,10 +3368,6 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg,
lock_page_cgroup(pc);
ClearPageCgroupMigration(pc);
unlock_page_cgroup(pc);
- anon = PageAnon(used);
- __mem_cgroup_uncharge_common(unused,
- anon ? MEM_CGROUP_CHARGE_TYPE_MAPPED
- : MEM_CGROUP_CHARGE_TYPE_CACHE);
/*
* If a page is a file cache, radix-tree replacement is very atomic
@@ -3340,10 +3419,6 @@ void mem_cgroup_replace_page_cache(struct page *oldpage,
*/
if (!memcg)
return;
-
- if (PageSwapBacked(oldpage))
- type = MEM_CGROUP_CHARGE_TYPE_SHMEM;
-
/*
* Even if newpage->mapping was NULL before starting replacement,
* the newpage may be on LRU(or pagevec for LRU) already. We lock
@@ -3418,7 +3493,7 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
/*
* Rather than hide all in some function, I do this in
* open coded manner. You see what this really does.
- * We have to guarantee memcg->res.limit < memcg->memsw.limit.
+ * We have to guarantee memcg->res.limit <= memcg->memsw.limit.
*/
mutex_lock(&set_limit_mutex);
memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
@@ -3479,7 +3554,7 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
/*
* Rather than hide all in some function, I do this in
* open coded manner. You see what this really does.
- * We have to guarantee memcg->res.limit < memcg->memsw.limit.
+ * We have to guarantee memcg->res.limit <= memcg->memsw.limit.
*/
mutex_lock(&set_limit_mutex);
memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
@@ -3611,10 +3686,12 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
}
/*
- * This routine traverse page_cgroup in given list and drop them all.
- * *And* this routine doesn't reclaim page itself, just removes page_cgroup.
+ * Traverse a specified page_cgroup list and try to drop them all. This doesn't
+ * reclaim the pages page themselves - it just removes the page_cgroups.
+ * Returns true if some page_cgroups were not freed, indicating that the caller
+ * must retry this operation.
*/
-static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
+static bool mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
int node, int zid, enum lru_list lru)
{
struct mem_cgroup_per_zone *mz;
@@ -3622,7 +3699,6 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
struct list_head *list;
struct page *busy;
struct zone *zone;
- int ret = 0;
zone = &NODE_DATA(node)->node_zones[zid];
mz = mem_cgroup_zoneinfo(memcg, node, zid);
@@ -3636,7 +3712,6 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
struct page_cgroup *pc;
struct page *page;
- ret = 0;
spin_lock_irqsave(&zone->lru_lock, flags);
if (list_empty(list)) {
spin_unlock_irqrestore(&zone->lru_lock, flags);
@@ -3653,21 +3728,14 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
pc = lookup_page_cgroup(page);
- ret = mem_cgroup_move_parent(page, pc, memcg, GFP_KERNEL);
- if (ret == -ENOMEM || ret == -EINTR)
- break;
-
- if (ret == -EBUSY || ret == -EINVAL) {
+ if (mem_cgroup_move_parent(page, pc, memcg)) {
/* found lock contention or "pc" is obsolete. */
busy = page;
cond_resched();
} else
busy = NULL;
}
-
- if (!ret && !list_empty(list))
- return -EBUSY;
- return ret;
+ return !list_empty(list);
}
/*
@@ -3692,9 +3760,6 @@ move_account:
ret = -EBUSY;
if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
goto out;
- ret = -EINTR;
- if (signal_pending(current))
- goto out;
/* This is for making all *used* pages to be on LRU. */
lru_add_drain_all();
drain_all_stock_sync(memcg);
@@ -3715,9 +3780,6 @@ move_account:
}
mem_cgroup_end_move(memcg);
memcg_oom_recover(memcg);
- /* it seems parent cgroup doesn't have enough mem */
- if (ret == -ENOMEM)
- goto try_to_free;
cond_resched();
/* "ret" should also be checked to ensure all lists are empty. */
} while (res_counter_read_u64(&memcg->res, RES_USAGE) > 0 || ret);
@@ -3779,6 +3841,10 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
parent_memcg = mem_cgroup_from_cont(parent);
cgroup_lock();
+
+ if (memcg->use_hierarchy == val)
+ goto out;
+
/*
* If parent's use_hierarchy is set, we can't make any modifications
* in the child subtrees. If it is unset, then the change can
@@ -3795,6 +3861,8 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
retval = -EBUSY;
} else
retval = -EINVAL;
+
+out:
cgroup_unlock();
return retval;
@@ -3831,7 +3899,7 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS);
if (swap)
- val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAPOUT);
+ val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAP);
return val << PAGE_SHIFT;
}
@@ -4015,7 +4083,7 @@ static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
#endif
#ifdef CONFIG_NUMA
-static int mem_control_numa_stat_show(struct cgroup *cont, struct cftype *cft,
+static int memcg_numa_stat_show(struct cgroup *cont, struct cftype *cft,
struct seq_file *m)
{
int nid;
@@ -4074,7 +4142,7 @@ static inline void mem_cgroup_lru_names_not_uptodate(void)
BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
}
-static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
+static int memcg_stat_show(struct cgroup *cont, struct cftype *cft,
struct seq_file *m)
{
struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
@@ -4082,7 +4150,7 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
unsigned int i;
for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
- if (i == MEM_CGROUP_STAT_SWAPOUT && !do_swap_account)
+ if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
continue;
seq_printf(m, "%s %ld\n", mem_cgroup_stat_names[i],
mem_cgroup_read_stat(memcg, i) * PAGE_SIZE);
@@ -4109,7 +4177,7 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
long long val = 0;
- if (i == MEM_CGROUP_STAT_SWAPOUT && !do_swap_account)
+ if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
continue;
for_each_mem_cgroup_tree(mi, memcg)
val += mem_cgroup_read_stat(mi, i) * PAGE_SIZE;
@@ -4533,7 +4601,7 @@ static int mem_cgroup_oom_control_write(struct cgroup *cgrp,
return 0;
}
-#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
+#ifdef CONFIG_MEMCG_KMEM
static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
{
return mem_cgroup_sockets_init(memcg, ss);
@@ -4588,7 +4656,7 @@ static struct cftype mem_cgroup_files[] = {
},
{
.name = "stat",
- .read_seq_string = mem_control_stat_show,
+ .read_seq_string = memcg_stat_show,
},
{
.name = "force_empty",
@@ -4620,10 +4688,10 @@ static struct cftype mem_cgroup_files[] = {
#ifdef CONFIG_NUMA
{
.name = "numa_stat",
- .read_seq_string = mem_control_numa_stat_show,
+ .read_seq_string = memcg_numa_stat_show,
},
#endif
-#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
+#ifdef CONFIG_MEMCG_SWAP
{
.name = "memsw.usage_in_bytes",
.private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
@@ -4810,7 +4878,7 @@ struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
}
EXPORT_SYMBOL(parent_mem_cgroup);
-#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
+#ifdef CONFIG_MEMCG_SWAP
static void __init enable_swap_cgroup(void)
{
if (!mem_cgroup_disabled() && really_do_swap_account)
@@ -5541,7 +5609,7 @@ struct cgroup_subsys mem_cgroup_subsys = {
.__DEPRECATED_clear_css_refs = true,
};
-#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
+#ifdef CONFIG_MEMCG_SWAP
static int __init enable_swap_account(char *s)
{
/* consider enabled if no parameter or 1 is given */
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index ab1e7145e290..a6e2141a6610 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -128,7 +128,7 @@ static int hwpoison_filter_flags(struct page *p)
* can only guarantee that the page either belongs to the memcg tasks, or is
* a freed page.
*/
-#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
+#ifdef CONFIG_MEMCG_SWAP
u64 hwpoison_filter_memcg;
EXPORT_SYMBOL_GPL(hwpoison_filter_memcg);
static int hwpoison_filter_task(struct page *p)
@@ -345,14 +345,14 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,
* Also when FAIL is set do a force kill because something went
* wrong earlier.
*/
-static void kill_procs(struct list_head *to_kill, int doit, int trapno,
+static void kill_procs(struct list_head *to_kill, int forcekill, int trapno,
int fail, struct page *page, unsigned long pfn,
int flags)
{
struct to_kill *tk, *next;
list_for_each_entry_safe (tk, next, to_kill, nd) {
- if (doit) {
+ if (forcekill) {
/*
* In case something went wrong with munmapping
* make sure the process doesn't catch the
@@ -858,7 +858,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
struct address_space *mapping;
LIST_HEAD(tokill);
int ret;
- int kill = 1;
+ int kill = 1, forcekill;
struct page *hpage = compound_head(p);
struct page *ppage;
@@ -888,7 +888,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
* be called inside page lock (it's recommended but not enforced).
*/
mapping = page_mapping(hpage);
- if (!PageDirty(hpage) && mapping &&
+ if (!(flags & MF_MUST_KILL) && !PageDirty(hpage) && mapping &&
mapping_cap_writeback_dirty(mapping)) {
if (page_mkclean(hpage)) {
SetPageDirty(hpage);
@@ -965,12 +965,14 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
* Now that the dirty bit has been propagated to the
* struct page and all unmaps done we can decide if
* killing is needed or not. Only kill when the page
- * was dirty, otherwise the tokill list is merely
+ * was dirty or the process is not restartable,
+ * otherwise the tokill list is merely
* freed. When there was a problem unmapping earlier
* use a more force-full uncatchable kill to prevent
* any accesses to the poisoned memory.
*/
- kill_procs(&tokill, !!PageDirty(ppage), trapno,
+ forcekill = PageDirty(ppage) || (flags & MF_MUST_KILL);
+ kill_procs(&tokill, forcekill, trapno,
ret != SWAP_SUCCESS, p, pfn, flags);
return ret;
@@ -1414,7 +1416,6 @@ static int soft_offline_huge_page(struct page *page, int flags)
int ret;
unsigned long pfn = page_to_pfn(page);
struct page *hpage = compound_head(page);
- LIST_HEAD(pagelist);
ret = get_any_page(page, pfn, flags);
if (ret < 0)
@@ -1429,24 +1430,18 @@ static int soft_offline_huge_page(struct page *page, int flags)
}
/* Keep page count to indicate a given hugepage is isolated. */
-
- list_add(&hpage->lru, &pagelist);
- ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0,
- true);
+ ret = migrate_huge_page(hpage, new_page, MPOL_MF_MOVE_ALL, false,
+ MIGRATE_SYNC);
+ put_page(hpage);
if (ret) {
- struct page *page1, *page2;
- list_for_each_entry_safe(page1, page2, &pagelist, lru)
- put_page(page1);
-
pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
pfn, ret, page->flags);
- if (ret > 0)
- ret = -EIO;
return ret;
}
done:
if (!PageHWPoison(hpage))
- atomic_long_add(1 << compound_trans_order(hpage), &mce_bad_pages);
+ atomic_long_add(1 << compound_trans_order(hpage),
+ &mce_bad_pages);
set_page_hwpoison_huge_page(hpage);
dequeue_hwpoisoned_huge_page(hpage);
/* keep elevated page count for bad page */
@@ -1561,7 +1556,7 @@ int soft_offline_page(struct page *page, int flags)
page_is_file_cache(page));
list_add(&page->lru, &pagelist);
ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
- 0, MIGRATE_SYNC);
+ false, MIGRATE_SYNC);
if (ret) {
putback_lru_pages(&pagelist);
pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
diff --git a/mm/memory.c b/mm/memory.c
index 1b7dc662bf9f..482f089765ff 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -206,6 +206,8 @@ void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, bool fullmm)
tlb->mm = mm;
tlb->fullmm = fullmm;
+ tlb->start = -1UL;
+ tlb->end = 0;
tlb->need_flush = 0;
tlb->fast_mode = (num_possible_cpus() == 1);
tlb->local.next = NULL;
@@ -248,6 +250,8 @@ void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long e
{
struct mmu_gather_batch *batch, *next;
+ tlb->start = start;
+ tlb->end = end;
tlb_flush_mmu(tlb);
/* keep the page table cache within bounds */
@@ -1204,6 +1208,11 @@ again:
*/
if (force_flush) {
force_flush = 0;
+
+#ifdef HAVE_GENERIC_MMU_GATHER
+ tlb->start = addr;
+ tlb->end = end;
+#endif
tlb_flush_mmu(tlb);
if (addr != end)
goto again;
@@ -1225,7 +1234,15 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
next = pmd_addr_end(addr, end);
if (pmd_trans_huge(*pmd)) {
if (next - addr != HPAGE_PMD_SIZE) {
- VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem));
+#ifdef CONFIG_DEBUG_VM
+ if (!rwsem_is_locked(&tlb->mm->mmap_sem)) {
+ pr_err("%s: mmap_sem is unlocked! addr=0x%lx end=0x%lx vma->vm_start=0x%lx vma->vm_end=0x%lx\n",
+ __func__, addr, end,
+ vma->vm_start,
+ vma->vm_end);
+ BUG();
+ }
+#endif
split_huge_page_pmd(vma->vm_mm, pmd);
} else if (zap_huge_pmd(tlb, vma, pmd, addr))
goto next;
@@ -1326,8 +1343,11 @@ static void unmap_single_vma(struct mmu_gather *tlb,
* Since no pte has actually been setup, it is
* safe to do nothing in this case.
*/
- if (vma->vm_file)
- unmap_hugepage_range(vma, start, end, NULL);
+ if (vma->vm_file) {
+ mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex);
+ __unmap_hugepage_range_final(tlb, vma, start, end, NULL);
+ mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
+ }
} else
unmap_page_range(tlb, vma, start, end, details);
}
@@ -1366,7 +1386,7 @@ void unmap_vmas(struct mmu_gather *tlb,
/**
* zap_page_range - remove user pages in a given range
* @vma: vm_area_struct holding the applicable pages
- * @address: starting address of pages to zap
+ * @start: starting address of pages to zap
* @size: number of bytes to zap
* @details: details of nonlinear truncation or shared cache invalidation
*
@@ -3921,7 +3941,7 @@ void print_vma_addr(char *prefix, unsigned long ip)
free_page((unsigned long)buf);
}
}
- up_read(&current->mm->mmap_sem);
+ up_read(&mm->mmap_sem);
}
#ifdef CONFIG_PROVE_LOCKING
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 0d7e3ec8e0f3..3ad25f9d1fc1 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -512,19 +512,20 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages)
zone->present_pages += onlined_pages;
zone->zone_pgdat->node_present_pages += onlined_pages;
- if (need_zonelists_rebuild)
- build_all_zonelists(zone);
- else
- zone_pcp_update(zone);
+ if (onlined_pages) {
+ node_set_state(zone_to_nid(zone), N_HIGH_MEMORY);
+ if (need_zonelists_rebuild)
+ build_all_zonelists(NULL, zone);
+ else
+ zone_pcp_update(zone);
+ }
mutex_unlock(&zonelists_mutex);
init_per_zone_wmark_min();
- if (onlined_pages) {
+ if (onlined_pages)
kswapd_run(zone_to_nid(zone));
- node_set_state(zone_to_nid(zone), N_HIGH_MEMORY);
- }
vm_total_pages = nr_free_pagecache_pages();
@@ -562,7 +563,7 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
* to access not-initialized zonelist, build here.
*/
mutex_lock(&zonelists_mutex);
- build_all_zonelists(NULL);
+ build_all_zonelists(pgdat, NULL);
mutex_unlock(&zonelists_mutex);
return pgdat;
@@ -618,7 +619,7 @@ int __ref add_memory(int nid, u64 start, u64 size)
pgdat = hotadd_new_pgdat(nid, start);
ret = -ENOMEM;
if (!pgdat)
- goto out;
+ goto error;
new_pgdat = 1;
}
@@ -965,6 +966,9 @@ repeat:
init_per_zone_wmark_min();
+ if (!populated_zone(zone))
+ zone_pcp_reset(zone);
+
if (!node_present_pages(node)) {
node_clear_state(node, N_HIGH_MEMORY);
kswapd_stop(node);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index f15c1b24ca18..bd92431d4c49 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1177,7 +1177,7 @@ static long do_mbind(unsigned long start, unsigned long len,
if (!list_empty(&pagelist)) {
nr_failed = migrate_pages(&pagelist, new_vma_page,
(unsigned long)vma,
- false, true);
+ false, MIGRATE_SYNC);
if (nr_failed)
putback_lru_pages(&pagelist);
}
@@ -1602,8 +1602,14 @@ static unsigned interleave_nodes(struct mempolicy *policy)
* task can change it's policy. The system default policy requires no
* such protection.
*/
-unsigned slab_node(struct mempolicy *policy)
+unsigned slab_node(void)
{
+ struct mempolicy *policy;
+
+ if (in_interrupt())
+ return numa_node_id();
+
+ policy = current->mempolicy;
if (!policy || policy->flags & MPOL_F_LOCAL)
return numa_node_id();
diff --git a/mm/migrate.c b/mm/migrate.c
index be26d5cbe56b..77ed2d773705 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -33,6 +33,7 @@
#include <linux/memcontrol.h>
#include <linux/syscalls.h>
#include <linux/hugetlb.h>
+#include <linux/hugetlb_cgroup.h>
#include <linux/gfp.h>
#include <asm/tlbflush.h>
@@ -682,7 +683,6 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
{
int rc = -EAGAIN;
int remap_swapcache = 1;
- int charge = 0;
struct mem_cgroup *mem;
struct anon_vma *anon_vma = NULL;
@@ -724,12 +724,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
}
/* charge against new page */
- charge = mem_cgroup_prepare_migration(page, newpage, &mem, GFP_KERNEL);
- if (charge == -ENOMEM) {
- rc = -ENOMEM;
- goto unlock;
- }
- BUG_ON(charge);
+ mem_cgroup_prepare_migration(page, newpage, &mem);
if (PageWriteback(page)) {
/*
@@ -819,8 +814,7 @@ skip_unmap:
put_anon_vma(anon_vma);
uncharge:
- if (!charge)
- mem_cgroup_end_migration(mem, page, newpage, rc == 0);
+ mem_cgroup_end_migration(mem, page, newpage, rc == 0);
unlock:
unlock_page(page);
out:
@@ -931,16 +925,13 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
if (anon_vma)
put_anon_vma(anon_vma);
- unlock_page(hpage);
-out:
- if (rc != -EAGAIN) {
- list_del(&hpage->lru);
- put_page(hpage);
- }
+ if (!rc)
+ hugetlb_cgroup_migrate(hpage, new_hpage);
+ unlock_page(hpage);
+out:
put_page(new_hpage);
-
if (result) {
if (rc)
*result = rc;
@@ -1016,48 +1007,32 @@ out:
return nr_failed + retry;
}
-int migrate_huge_pages(struct list_head *from,
- new_page_t get_new_page, unsigned long private, bool offlining,
- enum migrate_mode mode)
+int migrate_huge_page(struct page *hpage, new_page_t get_new_page,
+ unsigned long private, bool offlining,
+ enum migrate_mode mode)
{
- int retry = 1;
- int nr_failed = 0;
- int pass = 0;
- struct page *page;
- struct page *page2;
- int rc;
-
- for (pass = 0; pass < 10 && retry; pass++) {
- retry = 0;
-
- list_for_each_entry_safe(page, page2, from, lru) {
+ int pass, rc;
+
+ for (pass = 0; pass < 10; pass++) {
+ rc = unmap_and_move_huge_page(get_new_page,
+ private, hpage, pass > 2, offlining,
+ mode);
+ switch (rc) {
+ case -ENOMEM:
+ goto out;
+ case -EAGAIN:
+ /* try again */
cond_resched();
-
- rc = unmap_and_move_huge_page(get_new_page,
- private, page, pass > 2, offlining,
- mode);
-
- switch(rc) {
- case -ENOMEM:
- goto out;
- case -EAGAIN:
- retry++;
- break;
- case 0:
- break;
- default:
- /* Permanent failure */
- nr_failed++;
- break;
- }
+ break;
+ case 0:
+ goto out;
+ default:
+ rc = -EIO;
+ goto out;
}
}
- rc = 0;
out:
- if (rc)
- return rc;
-
- return nr_failed + retry;
+ return rc;
}
#ifdef CONFIG_NUMA
diff --git a/mm/mmap.c b/mm/mmap.c
index 3edfcdfa42d9..e3e86914f11a 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -943,6 +943,8 @@ void vm_stat_account(struct mm_struct *mm, unsigned long flags,
const unsigned long stack_flags
= VM_STACK_FLAGS & (VM_GROWSUP|VM_GROWSDOWN);
+ mm->total_vm += pages;
+
if (file) {
mm->shared_vm += pages;
if ((flags & (VM_EXEC|VM_WRITE)) == VM_EXEC)
@@ -1347,7 +1349,6 @@ munmap_back:
out:
perf_event_mmap(vma);
- mm->total_vm += len >> PAGE_SHIFT;
vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
if (vm_flags & VM_LOCKED) {
if (!mlock_vma_pages_range(vma, addr, addr + len))
@@ -1707,7 +1708,6 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns
return -ENOMEM;
/* Ok, everything looks good - let it rip */
- mm->total_vm += grow;
if (vma->vm_flags & VM_LOCKED)
mm->locked_vm += grow;
vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow);
@@ -1889,7 +1889,6 @@ static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma)
if (vma->vm_flags & VM_ACCOUNT)
nr_accounted += nrpages;
- mm->total_vm -= nrpages;
vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages);
vma = remove_vma(vma);
} while (vma);
@@ -2345,9 +2344,6 @@ int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
security_vm_enough_memory_mm(mm, vma_pages(vma)))
return -ENOMEM;
- if (vma->vm_file && uprobe_mmap(vma))
- return -EINVAL;
-
vma_link(mm, vma, prev, rb_link, rb_parent);
return 0;
}
@@ -2418,9 +2414,6 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
if (new_vma->vm_file) {
get_file(new_vma->vm_file);
- if (uprobe_mmap(new_vma))
- goto out_free_mempol;
-
if (vma->vm_flags & VM_EXECUTABLE)
added_exe_file_vma(mm);
}
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 9a611d3a1848..862b60822d9f 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -33,6 +33,24 @@
void __mmu_notifier_release(struct mm_struct *mm)
{
struct mmu_notifier *mn;
+ struct hlist_node *n;
+
+ /*
+ * RCU here will block mmu_notifier_unregister until
+ * ->release returns.
+ */
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist)
+ /*
+ * if ->release runs before mmu_notifier_unregister it
+ * must be handled as it's the only way for the driver
+ * to flush all existing sptes and stop the driver
+ * from establishing any more sptes before all the
+ * pages in the mm are freed.
+ */
+ if (mn->ops->release)
+ mn->ops->release(mn, mm);
+ rcu_read_unlock();
spin_lock(&mm->mmu_notifier_mm->lock);
while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) {
@@ -46,23 +64,6 @@ void __mmu_notifier_release(struct mm_struct *mm)
* mmu_notifier_unregister to return.
*/
hlist_del_init_rcu(&mn->hlist);
- /*
- * RCU here will block mmu_notifier_unregister until
- * ->release returns.
- */
- rcu_read_lock();
- spin_unlock(&mm->mmu_notifier_mm->lock);
- /*
- * if ->release runs before mmu_notifier_unregister it
- * must be handled as it's the only way for the driver
- * to flush all existing sptes and stop the driver
- * from establishing any more sptes before all the
- * pages in the mm are freed.
- */
- if (mn->ops->release)
- mn->ops->release(mn, mm);
- rcu_read_unlock();
- spin_lock(&mm->mmu_notifier_mm->lock);
}
spin_unlock(&mm->mmu_notifier_mm->lock);
@@ -284,16 +285,13 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
{
BUG_ON(atomic_read(&mm->mm_count) <= 0);
- spin_lock(&mm->mmu_notifier_mm->lock);
if (!hlist_unhashed(&mn->hlist)) {
- hlist_del_rcu(&mn->hlist);
-
/*
* RCU here will force exit_mmap to wait ->release to finish
* before freeing the pages.
*/
rcu_read_lock();
- spin_unlock(&mm->mmu_notifier_mm->lock);
+
/*
* exit_mmap will block in mmu_notifier_release to
* guarantee ->release is called before freeing the
@@ -302,8 +300,11 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
if (mn->ops->release)
mn->ops->release(mn, mm);
rcu_read_unlock();
- } else
+
+ spin_lock(&mm->mmu_notifier_mm->lock);
+ hlist_del_rcu(&mn->hlist);
spin_unlock(&mm->mmu_notifier_mm->lock);
+ }
/*
* Wait any running method to finish, of course including
diff --git a/mm/mmzone.c b/mm/mmzone.c
index 6830eab5bf09..3cef80f6ac79 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -96,7 +96,7 @@ void lruvec_init(struct lruvec *lruvec, struct zone *zone)
for_each_lru(lru)
INIT_LIST_HEAD(&lruvec->lists[lru]);
-#ifdef CONFIG_CGROUP_MEM_RES_CTLR
+#ifdef CONFIG_MEMCG
lruvec->zone = zone;
#endif
}
diff --git a/mm/mremap.c b/mm/mremap.c
index 21fed202ddad..cc06d0e48d05 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -260,7 +260,6 @@ static unsigned long move_vma(struct vm_area_struct *vma,
* If this were a serious issue, we'd add a flag to do_munmap().
*/
hiwater_vm = mm->hiwater_vm;
- mm->total_vm += new_len >> PAGE_SHIFT;
vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT);
if (do_munmap(mm, old_addr, old_len) < 0) {
@@ -497,7 +496,6 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
goto out;
}
- mm->total_vm += pages;
vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages);
if (vma->vm_flags & VM_LOCKED) {
mm->locked_vm += pages;
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index d23415c001bc..405573010f99 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -105,27 +105,35 @@ static void __init __free_pages_memory(unsigned long start, unsigned long end)
__free_pages_bootmem(pfn_to_page(i), 0);
}
+static unsigned long __init __free_memory_core(phys_addr_t start,
+ phys_addr_t end)
+{
+ unsigned long start_pfn = PFN_UP(start);
+ unsigned long end_pfn = min_t(unsigned long,
+ PFN_DOWN(end), max_low_pfn);
+
+ if (start_pfn > end_pfn)
+ return 0;
+
+ __free_pages_memory(start_pfn, end_pfn);
+
+ return end_pfn - start_pfn;
+}
+
unsigned long __init free_low_memory_core_early(int nodeid)
{
unsigned long count = 0;
- phys_addr_t start, end;
+ phys_addr_t start, end, size;
u64 i;
- /* free reserved array temporarily so that it's treated as free area */
- memblock_free_reserved_regions();
-
- for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL) {
- unsigned long start_pfn = PFN_UP(start);
- unsigned long end_pfn = min_t(unsigned long,
- PFN_DOWN(end), max_low_pfn);
- if (start_pfn < end_pfn) {
- __free_pages_memory(start_pfn, end_pfn);
- count += end_pfn - start_pfn;
- }
- }
+ for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL)
+ count += __free_memory_core(start, end);
+
+ /* free range that is used for reserved array if we allocate it */
+ size = get_allocated_memblock_reserved_regions_info(&start);
+ if (size)
+ count += __free_memory_core(start, start + size);
- /* put region array back? */
- memblock_reserve_reserved_regions();
return count;
}
@@ -274,7 +282,7 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align,
return ___alloc_bootmem(size, align, goal, limit);
}
-static void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
+void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
unsigned long size,
unsigned long align,
unsigned long goal,
diff --git a/mm/nommu.c b/mm/nommu.c
index c4acfbc09972..d4b0c10872de 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1486,7 +1486,7 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
- ret = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff);
+ retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff);
if (file)
fput(file);
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index ed0e19677360..198600861638 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -183,7 +183,8 @@ static bool oom_unkillable_task(struct task_struct *p,
unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
const nodemask_t *nodemask, unsigned long totalpages)
{
- unsigned long points;
+ long points;
+ long adj;
if (oom_unkillable_task(p, memcg, nodemask))
return 0;
@@ -192,7 +193,8 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
if (!p)
return 0;
- if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) {
+ adj = p->signal->oom_score_adj;
+ if (adj == OOM_SCORE_ADJ_MIN) {
task_unlock(p);
return 0;
}
@@ -210,20 +212,17 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
* implementation used by LSMs.
*/
if (has_capability_noaudit(p, CAP_SYS_ADMIN))
- points -= 30 * totalpages / 1000;
+ adj -= 30;
- /*
- * /proc/pid/oom_score_adj ranges from -1000 to +1000 such that it may
- * either completely disable oom killing or always prefer a certain
- * task.
- */
- points += p->signal->oom_score_adj * totalpages / 1000;
+ /* Normalize to oom_score_adj units */
+ adj *= totalpages / 1000;
+ points += adj;
/*
* Never return 0 for an eligible task regardless of the root bonus and
* oom_score_adj (oom_score_adj can't be OOM_SCORE_ADJ_MIN here).
*/
- return points ? points : 1;
+ return points > 0 ? points : 1;
}
/*
@@ -289,76 +288,93 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
}
#endif
+enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
+ unsigned long totalpages, const nodemask_t *nodemask,
+ bool force_kill)
+{
+ if (task->exit_state)
+ return OOM_SCAN_CONTINUE;
+ if (oom_unkillable_task(task, NULL, nodemask))
+ return OOM_SCAN_CONTINUE;
+
+ /*
+ * This task already has access to memory reserves and is being killed.
+ * Don't allow any other task to have access to the reserves.
+ */
+ if (test_tsk_thread_flag(task, TIF_MEMDIE)) {
+ if (unlikely(frozen(task)))
+ __thaw_task(task);
+ if (!force_kill)
+ return OOM_SCAN_ABORT;
+ }
+ if (!task->mm)
+ return OOM_SCAN_CONTINUE;
+
+ if (task->flags & PF_EXITING) {
+ /*
+ * If task is current and is in the process of releasing memory,
+ * allow the "kill" to set TIF_MEMDIE, which will allow it to
+ * access memory reserves. Otherwise, it may stall forever.
+ *
+ * The iteration isn't broken here, however, in case other
+ * threads are found to have already been oom killed.
+ */
+ if (task == current)
+ return OOM_SCAN_SELECT;
+ else if (!force_kill) {
+ /*
+ * If this task is not being ptraced on exit, then wait
+ * for it to finish before killing some other task
+ * unnecessarily.
+ */
+ if (!(task->group_leader->ptrace & PT_TRACE_EXIT))
+ return OOM_SCAN_ABORT;
+ }
+ }
+ return OOM_SCAN_OK;
+}
+
/*
* Simple selection loop. We chose the process with the highest
- * number of 'points'. We expect the caller will lock the tasklist.
+ * number of 'points'.
*
* (not docbooked, we don't want this one cluttering up the manual)
*/
static struct task_struct *select_bad_process(unsigned int *ppoints,
- unsigned long totalpages, struct mem_cgroup *memcg,
- const nodemask_t *nodemask, bool force_kill)
+ unsigned long totalpages, const nodemask_t *nodemask,
+ bool force_kill)
{
struct task_struct *g, *p;
struct task_struct *chosen = NULL;
unsigned long chosen_points = 0;
+ rcu_read_lock();
do_each_thread(g, p) {
unsigned int points;
- if (p->exit_state)
- continue;
- if (oom_unkillable_task(p, memcg, nodemask))
- continue;
-
- /*
- * This task already has access to memory reserves and is
- * being killed. Don't allow any other task access to the
- * memory reserve.
- *
- * Note: this may have a chance of deadlock if it gets
- * blocked waiting for another task which itself is waiting
- * for memory. Is there a better alternative?
- */
- if (test_tsk_thread_flag(p, TIF_MEMDIE)) {
- if (unlikely(frozen(p)))
- __thaw_task(p);
- if (!force_kill)
- return ERR_PTR(-1UL);
- }
- if (!p->mm)
+ switch (oom_scan_process_thread(p, totalpages, nodemask,
+ force_kill)) {
+ case OOM_SCAN_SELECT:
+ chosen = p;
+ chosen_points = ULONG_MAX;
+ /* fall through */
+ case OOM_SCAN_CONTINUE:
continue;
-
- if (p->flags & PF_EXITING) {
- /*
- * If p is the current task and is in the process of
- * releasing memory, we allow the "kill" to set
- * TIF_MEMDIE, which will allow it to gain access to
- * memory reserves. Otherwise, it may stall forever.
- *
- * The loop isn't broken here, however, in case other
- * threads are found to have already been oom killed.
- */
- if (p == current) {
- chosen = p;
- chosen_points = ULONG_MAX;
- } else if (!force_kill) {
- /*
- * If this task is not being ptraced on exit,
- * then wait for it to finish before killing
- * some other task unnecessarily.
- */
- if (!(p->group_leader->ptrace & PT_TRACE_EXIT))
- return ERR_PTR(-1UL);
- }
- }
-
- points = oom_badness(p, memcg, nodemask, totalpages);
+ case OOM_SCAN_ABORT:
+ rcu_read_unlock();
+ return ERR_PTR(-1UL);
+ case OOM_SCAN_OK:
+ break;
+ };
+ points = oom_badness(p, NULL, nodemask, totalpages);
if (points > chosen_points) {
chosen = p;
chosen_points = points;
}
} while_each_thread(g, p);
+ if (chosen)
+ get_task_struct(chosen);
+ rcu_read_unlock();
*ppoints = chosen_points * 1000 / totalpages;
return chosen;
@@ -366,23 +382,22 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
/**
* dump_tasks - dump current memory state of all system tasks
- * @mem: current's memory controller, if constrained
+ * @memcg: current's memory controller, if constrained
* @nodemask: nodemask passed to page allocator for mempolicy ooms
*
* Dumps the current memory state of all eligible tasks. Tasks not in the same
* memcg, not in the same cpuset, or bound to a disjoint set of mempolicy nodes
* are not shown.
- * State information includes task's pid, uid, tgid, vm size, rss, cpu, oom_adj
- * value, oom_score_adj value, and name.
- *
- * Call with tasklist_lock read-locked.
+ * State information includes task's pid, uid, tgid, vm size, rss, nr_ptes,
+ * swapents, oom_score_adj value, and name.
*/
static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t *nodemask)
{
struct task_struct *p;
struct task_struct *task;
- pr_info("[ pid ] uid tgid total_vm rss cpu oom_adj oom_score_adj name\n");
+ pr_info("[ pid ] uid tgid total_vm rss nr_ptes swapents oom_score_adj name\n");
+ rcu_read_lock();
for_each_process(p) {
if (oom_unkillable_task(p, memcg, nodemask))
continue;
@@ -397,13 +412,15 @@ static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t *nodemas
continue;
}
- pr_info("[%5d] %5d %5d %8lu %8lu %3u %3d %5d %s\n",
+ pr_info("[%5d] %5d %5d %8lu %8lu %7lu %8lu %5d %s\n",
task->pid, from_kuid(&init_user_ns, task_uid(task)),
task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
- task_cpu(task), task->signal->oom_adj,
+ task->mm->nr_ptes,
+ get_mm_counter(task->mm, MM_SWAPENTS),
task->signal->oom_score_adj, task->comm);
task_unlock(task);
}
+ rcu_read_unlock();
}
static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
@@ -424,10 +441,14 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
}
#define K(x) ((x) << (PAGE_SHIFT-10))
-static void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
- unsigned int points, unsigned long totalpages,
- struct mem_cgroup *memcg, nodemask_t *nodemask,
- const char *message)
+/*
+ * Must be called while holding a reference to p, which will be released upon
+ * returning.
+ */
+void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
+ unsigned int points, unsigned long totalpages,
+ struct mem_cgroup *memcg, nodemask_t *nodemask,
+ const char *message)
{
struct task_struct *victim = p;
struct task_struct *child;
@@ -443,6 +464,7 @@ static void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
*/
if (p->flags & PF_EXITING) {
set_tsk_thread_flag(p, TIF_MEMDIE);
+ put_task_struct(p);
return;
}
@@ -460,6 +482,7 @@ static void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
* parent. This attempts to lose the minimal amount of work done while
* still freeing memory.
*/
+ read_lock(&tasklist_lock);
do {
list_for_each_entry(child, &t->children, sibling) {
unsigned int child_points;
@@ -472,15 +495,26 @@ static void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
child_points = oom_badness(child, memcg, nodemask,
totalpages);
if (child_points > victim_points) {
+ put_task_struct(victim);
victim = child;
victim_points = child_points;
+ get_task_struct(victim);
}
}
} while_each_thread(p, t);
+ read_unlock(&tasklist_lock);
- victim = find_lock_task_mm(victim);
- if (!victim)
+ rcu_read_lock();
+ p = find_lock_task_mm(victim);
+ if (!p) {
+ rcu_read_unlock();
+ put_task_struct(victim);
return;
+ } else if (victim != p) {
+ get_task_struct(p);
+ put_task_struct(victim);
+ victim = p;
+ }
/* mm cannot safely be dereferenced after task_unlock(victim) */
mm = victim->mm;
@@ -511,17 +545,19 @@ static void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
task_unlock(p);
do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true);
}
+ rcu_read_unlock();
set_tsk_thread_flag(victim, TIF_MEMDIE);
do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true);
+ put_task_struct(victim);
}
#undef K
/*
* Determines whether the kernel must panic because of the panic_on_oom sysctl.
*/
-static void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
- int order, const nodemask_t *nodemask)
+void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
+ int order, const nodemask_t *nodemask)
{
if (likely(!sysctl_panic_on_oom))
return;
@@ -534,42 +570,11 @@ static void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
if (constraint != CONSTRAINT_NONE)
return;
}
- read_lock(&tasklist_lock);
dump_header(NULL, gfp_mask, order, NULL, nodemask);
- read_unlock(&tasklist_lock);
panic("Out of memory: %s panic_on_oom is enabled\n",
sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide");
}
-#ifdef CONFIG_CGROUP_MEM_RES_CTLR
-void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
- int order)
-{
- unsigned long limit;
- unsigned int points = 0;
- struct task_struct *p;
-
- /*
- * If current has a pending SIGKILL, then automatically select it. The
- * goal is to allow it to allocate so that it may quickly exit and free
- * its memory.
- */
- if (fatal_signal_pending(current)) {
- set_thread_flag(TIF_MEMDIE);
- return;
- }
-
- check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL);
- limit = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT ? : 1;
- read_lock(&tasklist_lock);
- p = select_bad_process(&points, limit, memcg, NULL, false);
- if (p && PTR_ERR(p) != -1UL)
- oom_kill_process(p, gfp_mask, order, points, limit, memcg, NULL,
- "Memory cgroup out of memory");
- read_unlock(&tasklist_lock);
-}
-#endif
-
static BLOCKING_NOTIFIER_HEAD(oom_notify_list);
int register_oom_notifier(struct notifier_block *nb)
@@ -691,7 +696,7 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
struct task_struct *p;
unsigned long totalpages;
unsigned long freed = 0;
- unsigned int points;
+ unsigned int uninitialized_var(points);
enum oom_constraint constraint = CONSTRAINT_NONE;
int killed = 0;
@@ -719,22 +724,20 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
mpol_mask = (constraint == CONSTRAINT_MEMORY_POLICY) ? nodemask : NULL;
check_panic_on_oom(constraint, gfp_mask, order, mpol_mask);
- read_lock(&tasklist_lock);
- if (sysctl_oom_kill_allocating_task &&
+ if (sysctl_oom_kill_allocating_task && current->mm &&
!oom_unkillable_task(current, NULL, nodemask) &&
- current->mm) {
+ current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
+ get_task_struct(current);
oom_kill_process(current, gfp_mask, order, 0, totalpages, NULL,
nodemask,
"Out of memory (oom_kill_allocating_task)");
goto out;
}
- p = select_bad_process(&points, totalpages, NULL, mpol_mask,
- force_kill);
+ p = select_bad_process(&points, totalpages, mpol_mask, force_kill);
/* Found nothing?!?! Either we hang forever, or we panic. */
if (!p) {
dump_header(NULL, gfp_mask, order, NULL, mpol_mask);
- read_unlock(&tasklist_lock);
panic("Out of memory and no killable processes...\n");
}
if (PTR_ERR(p) != -1UL) {
@@ -743,14 +746,12 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
killed = 1;
}
out:
- read_unlock(&tasklist_lock);
-
/*
- * Give "p" a good chance of killing itself before we
- * retry to allocate memory unless "p" is current
+ * Give the killed threads a good chance of exiting before trying to
+ * allocate memory again.
*/
- if (killed && !test_thread_flag(TIF_MEMDIE))
- schedule_timeout_uninterruptible(1);
+ if (killed)
+ schedule_timeout_killable(1);
}
/*
@@ -765,6 +766,5 @@ void pagefault_out_of_memory(void)
out_of_memory(NULL, 0, 0, NULL, false);
clear_system_oom();
}
- if (!test_thread_flag(TIF_MEMDIE))
- schedule_timeout_uninterruptible(1);
+ schedule_timeout_killable(1);
}
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 93d8d2f7108c..e5363f34e025 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -34,6 +34,7 @@
#include <linux/syscalls.h>
#include <linux/buffer_head.h> /* __set_page_dirty_buffers */
#include <linux/pagevec.h>
+#include <linux/timer.h>
#include <trace/events/writeback.h>
/*
@@ -135,7 +136,20 @@ unsigned long global_dirty_limit;
* measured in page writeback completions.
*
*/
-static struct prop_descriptor vm_completions;
+static struct fprop_global writeout_completions;
+
+static void writeout_period(unsigned long t);
+/* Timer for aging of writeout_completions */
+static struct timer_list writeout_period_timer =
+ TIMER_DEFERRED_INITIALIZER(writeout_period, 0, 0);
+static unsigned long writeout_period_time = 0;
+
+/*
+ * Length of period for aging writeout fractions of bdis. This is an
+ * arbitrarily chosen number. The longer the period, the slower fractions will
+ * reflect changes in current writeout rate.
+ */
+#define VM_COMPLETIONS_PERIOD_LEN (3*HZ)
/*
* Work out the current dirty-memory clamping and background writeout
@@ -322,34 +336,6 @@ bool zone_dirty_ok(struct zone *zone)
zone_page_state(zone, NR_WRITEBACK) <= limit;
}
-/*
- * couple the period to the dirty_ratio:
- *
- * period/2 ~ roundup_pow_of_two(dirty limit)
- */
-static int calc_period_shift(void)
-{
- unsigned long dirty_total;
-
- if (vm_dirty_bytes)
- dirty_total = vm_dirty_bytes / PAGE_SIZE;
- else
- dirty_total = (vm_dirty_ratio * global_dirtyable_memory()) /
- 100;
- return 2 + ilog2(dirty_total - 1);
-}
-
-/*
- * update the period when the dirty threshold changes.
- */
-static void update_completion_period(void)
-{
- int shift = calc_period_shift();
- prop_change_shift(&vm_completions, shift);
-
- writeback_set_ratelimit();
-}
-
int dirty_background_ratio_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos)
@@ -383,7 +369,7 @@ int dirty_ratio_handler(struct ctl_table *table, int write,
ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
if (ret == 0 && write && vm_dirty_ratio != old_ratio) {
- update_completion_period();
+ writeback_set_ratelimit();
vm_dirty_bytes = 0;
}
return ret;
@@ -398,12 +384,21 @@ int dirty_bytes_handler(struct ctl_table *table, int write,
ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
if (ret == 0 && write && vm_dirty_bytes != old_bytes) {
- update_completion_period();
+ writeback_set_ratelimit();
vm_dirty_ratio = 0;
}
return ret;
}
+static unsigned long wp_next_time(unsigned long cur_time)
+{
+ cur_time += VM_COMPLETIONS_PERIOD_LEN;
+ /* 0 has a special meaning... */
+ if (!cur_time)
+ return 1;
+ return cur_time;
+}
+
/*
* Increment the BDI's writeout completion count and the global writeout
* completion count. Called from test_clear_page_writeback().
@@ -411,8 +406,19 @@ int dirty_bytes_handler(struct ctl_table *table, int write,
static inline void __bdi_writeout_inc(struct backing_dev_info *bdi)
{
__inc_bdi_stat(bdi, BDI_WRITTEN);
- __prop_inc_percpu_max(&vm_completions, &bdi->completions,
- bdi->max_prop_frac);
+ __fprop_inc_percpu_max(&writeout_completions, &bdi->completions,
+ bdi->max_prop_frac);
+ /* First event after period switching was turned off? */
+ if (!unlikely(writeout_period_time)) {
+ /*
+ * We can race with other __bdi_writeout_inc calls here but
+ * it does not cause any harm since the resulting time when
+ * timer will fire and what is in writeout_period_time will be
+ * roughly the same.
+ */
+ writeout_period_time = wp_next_time(jiffies);
+ mod_timer(&writeout_period_timer, writeout_period_time);
+ }
}
void bdi_writeout_inc(struct backing_dev_info *bdi)
@@ -431,11 +437,33 @@ EXPORT_SYMBOL_GPL(bdi_writeout_inc);
static void bdi_writeout_fraction(struct backing_dev_info *bdi,
long *numerator, long *denominator)
{
- prop_fraction_percpu(&vm_completions, &bdi->completions,
+ fprop_fraction_percpu(&writeout_completions, &bdi->completions,
numerator, denominator);
}
/*
+ * On idle system, we can be called long after we scheduled because we use
+ * deferred timers so count with missed periods.
+ */
+static void writeout_period(unsigned long t)
+{
+ int miss_periods = (jiffies - writeout_period_time) /
+ VM_COMPLETIONS_PERIOD_LEN;
+
+ if (fprop_new_period(&writeout_completions, miss_periods + 1)) {
+ writeout_period_time = wp_next_time(writeout_period_time +
+ miss_periods * VM_COMPLETIONS_PERIOD_LEN);
+ mod_timer(&writeout_period_timer, writeout_period_time);
+ } else {
+ /*
+ * Aging has zeroed all fractions. Stop wasting CPU on period
+ * updates.
+ */
+ writeout_period_time = 0;
+ }
+}
+
+/*
* bdi_min_ratio keeps the sum of the minimum dirty shares of all
* registered backing devices, which, for obvious reasons, can not
* exceed 100%.
@@ -475,7 +503,7 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio)
ret = -EINVAL;
} else {
bdi->max_ratio = max_ratio;
- bdi->max_prop_frac = (PROP_FRAC_BASE * max_ratio) / 100;
+ bdi->max_prop_frac = (FPROP_FRAC_BASE * max_ratio) / 100;
}
spin_unlock_bh(&bdi_lock);
@@ -918,7 +946,7 @@ static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi,
* bdi->dirty_ratelimit = balanced_dirty_ratelimit;
*
* However to get a more stable dirty_ratelimit, the below elaborated
- * code makes use of task_ratelimit to filter out sigular points and
+ * code makes use of task_ratelimit to filter out singular points and
* limit the step size.
*
* The below code essentially only uses the relative value of
@@ -941,7 +969,7 @@ static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi,
* feel and care are stable dirty rate and small position error.
*
* |task_ratelimit - dirty_ratelimit| is used to limit the step size
- * and filter out the sigular points of balanced_dirty_ratelimit. Which
+ * and filter out the singular points of balanced_dirty_ratelimit. Which
* keeps jumping around randomly and can even leap far away at times
* due to the small 200ms estimation period of dirty_rate (we want to
* keep that period small to reduce time lags).
@@ -1606,13 +1634,10 @@ static struct notifier_block __cpuinitdata ratelimit_nb = {
*/
void __init page_writeback_init(void)
{
- int shift;
-
writeback_set_ratelimit();
register_cpu_notifier(&ratelimit_nb);
- shift = calc_period_shift();
- prop_descriptor_init(&vm_completions, shift);
+ fprop_global_init(&writeout_completions);
}
/**
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 44030096da63..889532b8e6c1 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -51,7 +51,6 @@
#include <linux/page_cgroup.h>
#include <linux/debugobjects.h>
#include <linux/kmemleak.h>
-#include <linux/memory.h>
#include <linux/compaction.h>
#include <trace/events/kmem.h>
#include <linux/ftrace_event.h>
@@ -219,7 +218,12 @@ EXPORT_SYMBOL(nr_online_nodes);
int page_group_by_mobility_disabled __read_mostly;
-static void set_pageblock_migratetype(struct page *page, int migratetype)
+/*
+ * NOTE:
+ * Don't use set_pageblock_migratetype(page, MIGRATE_ISOLATE) directly.
+ * Instead, use {un}set_pageblock_isolate.
+ */
+void set_pageblock_migratetype(struct page *page, int migratetype)
{
if (unlikely(page_group_by_mobility_disabled))
@@ -954,7 +958,7 @@ static int move_freepages(struct zone *zone,
return pages_moved;
}
-static int move_freepages_block(struct zone *zone, struct page *page,
+int move_freepages_block(struct zone *zone, struct page *page,
int migratetype)
{
unsigned long start_pfn, end_pfn;
@@ -1158,8 +1162,10 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
to_drain = pcp->batch;
else
to_drain = pcp->count;
- free_pcppages_bulk(zone, to_drain, pcp);
- pcp->count -= to_drain;
+ if (to_drain > 0) {
+ free_pcppages_bulk(zone, to_drain, pcp);
+ pcp->count -= to_drain;
+ }
local_irq_restore(flags);
}
#endif
@@ -1529,16 +1535,16 @@ static int __init setup_fail_page_alloc(char *str)
}
__setup("fail_page_alloc=", setup_fail_page_alloc);
-static int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
+static bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
{
if (order < fail_page_alloc.min_order)
- return 0;
+ return false;
if (gfp_mask & __GFP_NOFAIL)
- return 0;
+ return false;
if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM))
- return 0;
+ return false;
if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT))
- return 0;
+ return false;
return should_fail(&fail_page_alloc.attr, 1 << order);
}
@@ -1578,9 +1584,9 @@ late_initcall(fail_page_alloc_debugfs);
#else /* CONFIG_FAIL_PAGE_ALLOC */
-static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
+static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
{
- return 0;
+ return false;
}
#endif /* CONFIG_FAIL_PAGE_ALLOC */
@@ -1594,6 +1600,7 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
{
/* free_pages my go negative - that's OK */
long min = mark;
+ long lowmem_reserve = z->lowmem_reserve[classzone_idx];
int o;
free_pages -= (1 << order) - 1;
@@ -1602,7 +1609,7 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
if (alloc_flags & ALLOC_HARDER)
min -= min / 4;
- if (free_pages <= min + z->lowmem_reserve[classzone_idx])
+ if (free_pages <= min + lowmem_reserve)
return false;
for (o = 0; o < order; o++) {
/* At the next order, this order's pages become unavailable */
@@ -1617,6 +1624,20 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
return true;
}
+#ifdef CONFIG_MEMORY_ISOLATION
+static inline unsigned long nr_zone_isolate_freepages(struct zone *zone)
+{
+ if (unlikely(zone->nr_pageblock_isolate))
+ return zone->nr_pageblock_isolate * pageblock_nr_pages;
+ return 0;
+}
+#else
+static inline unsigned long nr_zone_isolate_freepages(struct zone *zone)
+{
+ return 0;
+}
+#endif
+
bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
int classzone_idx, int alloc_flags)
{
@@ -1632,6 +1653,14 @@ bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
+ /*
+ * If the zone has MIGRATE_ISOLATE type free pages, we should consider
+ * it. nr_zone_isolate_freepages is never accurate so kswapd might not
+ * sleep although it could do so. But this is more desirable for memory
+ * hotplug than sleeping which can cause a livelock in the direct
+ * reclaim path.
+ */
+ free_pages -= nr_zone_isolate_freepages(z);
return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
free_pages);
}
@@ -2087,8 +2116,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
page = get_page_from_freelist(gfp_mask, nodemask,
order, zonelist, high_zoneidx,
- alloc_flags, preferred_zone,
- migratetype);
+ alloc_flags & ~ALLOC_NO_WATERMARKS,
+ preferred_zone, migratetype);
if (page) {
preferred_zone->compact_considered = 0;
preferred_zone->compact_defer_shift = 0;
@@ -2180,8 +2209,8 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
retry:
page = get_page_from_freelist(gfp_mask, nodemask, order,
zonelist, high_zoneidx,
- alloc_flags, preferred_zone,
- migratetype);
+ alloc_flags & ~ALLOC_NO_WATERMARKS,
+ preferred_zone, migratetype);
/*
* If an allocation failed after direct reclaim, it could be because
@@ -2265,15 +2294,24 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
alloc_flags |= ALLOC_HARDER;
if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) {
- if (!in_interrupt() &&
- ((current->flags & PF_MEMALLOC) ||
- unlikely(test_thread_flag(TIF_MEMDIE))))
+ if (gfp_mask & __GFP_MEMALLOC)
+ alloc_flags |= ALLOC_NO_WATERMARKS;
+ else if (in_serving_softirq() && (current->flags & PF_MEMALLOC))
+ alloc_flags |= ALLOC_NO_WATERMARKS;
+ else if (!in_interrupt() &&
+ ((current->flags & PF_MEMALLOC) ||
+ unlikely(test_thread_flag(TIF_MEMDIE))))
alloc_flags |= ALLOC_NO_WATERMARKS;
}
return alloc_flags;
}
+bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
+{
+ return !!(gfp_to_alloc_flags(gfp_mask) & ALLOC_NO_WATERMARKS);
+}
+
static inline struct page *
__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist, enum zone_type high_zoneidx,
@@ -2340,11 +2378,27 @@ rebalance:
/* Allocate without watermarks if the context allows */
if (alloc_flags & ALLOC_NO_WATERMARKS) {
+ /*
+ * Ignore mempolicies if ALLOC_NO_WATERMARKS on the grounds
+ * the allocation is high priority and these type of
+ * allocations are system rather than user orientated
+ */
+ zonelist = node_zonelist(numa_node_id(), gfp_mask);
+
page = __alloc_pages_high_priority(gfp_mask, order,
zonelist, high_zoneidx, nodemask,
preferred_zone, migratetype);
- if (page)
+ if (page) {
+ /*
+ * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was
+ * necessary to allocate the page. The expectation is
+ * that the caller is taking steps that will free more
+ * memory. The caller should avoid the page being used
+ * for !PFMEMALLOC purposes.
+ */
+ page->pfmemalloc = true;
goto got_pg;
+ }
}
/* Atomic allocations - we can't balance anything */
@@ -2463,8 +2517,8 @@ nopage:
got_pg:
if (kmemcheck_enabled)
kmemcheck_pagealloc_alloc(page, order, gfp_mask);
- return page;
+ return page;
}
/*
@@ -2515,6 +2569,8 @@ retry_cpuset:
page = __alloc_pages_slowpath(gfp_mask, order,
zonelist, high_zoneidx, nodemask,
preferred_zone, migratetype);
+ else
+ page->pfmemalloc = false;
trace_mm_page_alloc(page, order, gfp_mask, migratetype);
@@ -3030,7 +3086,7 @@ int numa_zonelist_order_handler(ctl_table *table, int write,
user_zonelist_order = oldval;
} else if (oldval != user_zonelist_order) {
mutex_lock(&zonelists_mutex);
- build_all_zonelists(NULL);
+ build_all_zonelists(NULL, NULL);
mutex_unlock(&zonelists_mutex);
}
}
@@ -3409,14 +3465,21 @@ static void setup_zone_pageset(struct zone *zone);
DEFINE_MUTEX(zonelists_mutex);
/* return values int ....just for stop_machine() */
-static __init_refok int __build_all_zonelists(void *data)
+static int __build_all_zonelists(void *data)
{
int nid;
int cpu;
+ pg_data_t *self = data;
#ifdef CONFIG_NUMA
memset(node_load, 0, sizeof(node_load));
#endif
+
+ if (self && !node_online(self->node_id)) {
+ build_zonelists(self);
+ build_zonelist_cache(self);
+ }
+
for_each_online_node(nid) {
pg_data_t *pgdat = NODE_DATA(nid);
@@ -3461,7 +3524,7 @@ static __init_refok int __build_all_zonelists(void *data)
* Called with zonelists_mutex held always
* unless system_state == SYSTEM_BOOTING.
*/
-void __ref build_all_zonelists(void *data)
+void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone)
{
set_zonelist_order();
@@ -3473,10 +3536,10 @@ void __ref build_all_zonelists(void *data)
/* we have to stop all cpus to guarantee there is no user
of zonelist */
#ifdef CONFIG_MEMORY_HOTPLUG
- if (data)
- setup_zone_pageset((struct zone *)data);
+ if (zone)
+ setup_zone_pageset(zone);
#endif
- stop_machine(__build_all_zonelists, NULL, NULL);
+ stop_machine(__build_all_zonelists, pgdat, NULL);
/* cpuset refresh routine should be here */
}
vm_total_pages = nr_free_pagecache_pages();
@@ -3746,7 +3809,7 @@ static void __meminit zone_init_free_lists(struct zone *zone)
memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY)
#endif
-static int zone_batchsize(struct zone *zone)
+static int __meminit zone_batchsize(struct zone *zone)
{
#ifdef CONFIG_MMU
int batch;
@@ -3828,7 +3891,7 @@ static void setup_pagelist_highmark(struct per_cpu_pageset *p,
pcp->batch = PAGE_SHIFT * 8;
}
-static void setup_zone_pageset(struct zone *zone)
+static void __meminit setup_zone_pageset(struct zone *zone)
{
int cpu;
@@ -3901,32 +3964,6 @@ int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
return 0;
}
-static int __zone_pcp_update(void *data)
-{
- struct zone *zone = data;
- int cpu;
- unsigned long batch = zone_batchsize(zone), flags;
-
- for_each_possible_cpu(cpu) {
- struct per_cpu_pageset *pset;
- struct per_cpu_pages *pcp;
-
- pset = per_cpu_ptr(zone->pageset, cpu);
- pcp = &pset->pcp;
-
- local_irq_save(flags);
- free_pcppages_bulk(zone, pcp->count, pcp);
- setup_pageset(pset, batch);
- local_irq_restore(flags);
- }
- return 0;
-}
-
-void zone_pcp_update(struct zone *zone)
-{
- stop_machine(__zone_pcp_update, zone, NULL);
-}
-
static __meminit void zone_pcp_init(struct zone *zone)
{
/*
@@ -3942,7 +3979,7 @@ static __meminit void zone_pcp_init(struct zone *zone)
zone_batchsize(zone));
}
-__meminit int init_currently_empty_zone(struct zone *zone,
+int __meminit init_currently_empty_zone(struct zone *zone,
unsigned long zone_start_pfn,
unsigned long size,
enum memmap_context context)
@@ -4301,7 +4338,7 @@ static inline void setup_usemap(struct pglist_data *pgdat,
#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
/* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */
-static inline void __init set_pageblock_order(void)
+void __init set_pageblock_order(void)
{
unsigned int order;
@@ -4329,7 +4366,7 @@ static inline void __init set_pageblock_order(void)
* include/linux/pageblock-flags.h for the values of pageblock_order based on
* the kernel config
*/
-static inline void set_pageblock_order(void)
+void __init set_pageblock_order(void)
{
}
@@ -4340,6 +4377,8 @@ static inline void set_pageblock_order(void)
* - mark all pages reserved
* - mark all memory queues empty
* - clear the memory bitmaps
+ *
+ * NOTE: pgdat should get zeroed by caller.
*/
static void __paginginit free_area_init_core(struct pglist_data *pgdat,
unsigned long *zones_size, unsigned long *zholes_size)
@@ -4350,9 +4389,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
int ret;
pgdat_resize_init(pgdat);
- pgdat->nr_zones = 0;
init_waitqueue_head(&pgdat->kswapd_wait);
- pgdat->kswapd_max_order = 0;
+ init_waitqueue_head(&pgdat->pfmemalloc_wait);
pgdat_page_cgroup_init(pgdat);
for (j = 0; j < MAX_NR_ZONES; j++) {
@@ -4394,6 +4432,11 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
zone->spanned_pages = size;
zone->present_pages = realsize;
+#if defined CONFIG_COMPACTION || defined CONFIG_CMA
+ zone->compact_cached_free_pfn = zone->zone_start_pfn +
+ zone->spanned_pages;
+ zone->compact_cached_free_pfn &= ~(pageblock_nr_pages-1);
+#endif
#ifdef CONFIG_NUMA
zone->node = nid;
zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio)
@@ -4408,8 +4451,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
zone_pcp_init(zone);
lruvec_init(&zone->lruvec, zone);
- zap_zone_vm_stats(zone);
- zone->flags = 0;
if (!size)
continue;
@@ -4469,6 +4510,9 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
{
pg_data_t *pgdat = NODE_DATA(nid);
+ /* pg_data_t should be reset to zero when it's allocated */
+ WARN_ON(pgdat->nr_zones || pgdat->node_start_pfn || pgdat->classzone_idx);
+
pgdat->node_id = nid;
pgdat->node_start_pfn = node_start_pfn;
calculate_node_totalpages(pgdat, zones_size, zholes_size);
@@ -4750,7 +4794,7 @@ out:
}
/* Any regular memory on that node ? */
-static void check_for_regular_memory(pg_data_t *pgdat)
+static void __init check_for_regular_memory(pg_data_t *pgdat)
{
#ifdef CONFIG_HIGHMEM
enum zone_type zone_type;
@@ -5468,26 +5512,27 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags,
}
/*
- * This is designed as sub function...plz see page_isolation.c also.
- * set/clear page block's type to be ISOLATE.
- * page allocater never alloc memory from ISOLATE block.
+ * This function checks whether pageblock includes unmovable pages or not.
+ * If @count is not zero, it is okay to include less @count unmovable pages
+ *
+ * PageLRU check wihtout isolation or lru_lock could race so that
+ * MIGRATE_MOVABLE block might include unmovable pages. It means you can't
+ * expect this function should be exact.
*/
-
-static int
-__count_immobile_pages(struct zone *zone, struct page *page, int count)
+bool has_unmovable_pages(struct zone *zone, struct page *page, int count)
{
unsigned long pfn, iter, found;
int mt;
/*
* For avoiding noise data, lru_add_drain_all() should be called
- * If ZONE_MOVABLE, the zone never contains immobile pages
+ * If ZONE_MOVABLE, the zone never contains unmovable pages
*/
if (zone_idx(zone) == ZONE_MOVABLE)
- return true;
+ return false;
mt = get_pageblock_migratetype(page);
if (mt == MIGRATE_MOVABLE || is_migrate_cma(mt))
- return true;
+ return false;
pfn = page_to_pfn(page);
for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) {
@@ -5497,11 +5542,18 @@ __count_immobile_pages(struct zone *zone, struct page *page, int count)
continue;
page = pfn_to_page(check);
- if (!page_count(page)) {
+ /*
+ * We can't use page_count without pin a page
+ * because another CPU can free compound page.
+ * This check already skips compound tails of THP
+ * because their page->_count is zero at all time.
+ */
+ if (!atomic_read(&page->_count)) {
if (PageBuddy(page))
iter += (1 << page_order(page)) - 1;
continue;
}
+
if (!PageLRU(page))
found++;
/*
@@ -5518,9 +5570,9 @@ __count_immobile_pages(struct zone *zone, struct page *page, int count)
* page at boot.
*/
if (found > count)
- return false;
+ return true;
}
- return true;
+ return false;
}
bool is_pageblock_removable_nolock(struct page *page)
@@ -5544,77 +5596,7 @@ bool is_pageblock_removable_nolock(struct page *page)
zone->zone_start_pfn + zone->spanned_pages <= pfn)
return false;
- return __count_immobile_pages(zone, page, 0);
-}
-
-int set_migratetype_isolate(struct page *page)
-{
- struct zone *zone;
- unsigned long flags, pfn;
- struct memory_isolate_notify arg;
- int notifier_ret;
- int ret = -EBUSY;
-
- zone = page_zone(page);
-
- spin_lock_irqsave(&zone->lock, flags);
-
- pfn = page_to_pfn(page);
- arg.start_pfn = pfn;
- arg.nr_pages = pageblock_nr_pages;
- arg.pages_found = 0;
-
- /*
- * It may be possible to isolate a pageblock even if the
- * migratetype is not MIGRATE_MOVABLE. The memory isolation
- * notifier chain is used by balloon drivers to return the
- * number of pages in a range that are held by the balloon
- * driver to shrink memory. If all the pages are accounted for
- * by balloons, are free, or on the LRU, isolation can continue.
- * Later, for example, when memory hotplug notifier runs, these
- * pages reported as "can be isolated" should be isolated(freed)
- * by the balloon driver through the memory notifier chain.
- */
- notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg);
- notifier_ret = notifier_to_errno(notifier_ret);
- if (notifier_ret)
- goto out;
- /*
- * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself.
- * We just check MOVABLE pages.
- */
- if (__count_immobile_pages(zone, page, arg.pages_found))
- ret = 0;
-
- /*
- * immobile means "not-on-lru" paes. If immobile is larger than
- * removable-by-driver pages reported by notifier, we'll fail.
- */
-
-out:
- if (!ret) {
- set_pageblock_migratetype(page, MIGRATE_ISOLATE);
- move_freepages_block(zone, page, MIGRATE_ISOLATE);
- }
-
- spin_unlock_irqrestore(&zone->lock, flags);
- if (!ret)
- drain_all_pages();
- return ret;
-}
-
-void unset_migratetype_isolate(struct page *page, unsigned migratetype)
-{
- struct zone *zone;
- unsigned long flags;
- zone = page_zone(page);
- spin_lock_irqsave(&zone->lock, flags);
- if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE)
- goto out;
- set_pageblock_migratetype(page, migratetype);
- move_freepages_block(zone, page, migratetype);
-out:
- spin_unlock_irqrestore(&zone->lock, flags);
+ return !has_unmovable_pages(zone, page, 0);
}
#ifdef CONFIG_CMA
@@ -5635,7 +5617,12 @@ static struct page *
__alloc_contig_migrate_alloc(struct page *page, unsigned long private,
int **resultp)
{
- return alloc_page(GFP_HIGHUSER_MOVABLE);
+ gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE;
+
+ if (PageHighMem(page))
+ gfp_mask |= __GFP_HIGHMEM;
+
+ return alloc_page(gfp_mask);
}
/* [start, end) must belong to a single zone. */
@@ -5864,7 +5851,49 @@ void free_contig_range(unsigned long pfn, unsigned nr_pages)
}
#endif
+#ifdef CONFIG_MEMORY_HOTPLUG
+static int __meminit __zone_pcp_update(void *data)
+{
+ struct zone *zone = data;
+ int cpu;
+ unsigned long batch = zone_batchsize(zone), flags;
+
+ for_each_possible_cpu(cpu) {
+ struct per_cpu_pageset *pset;
+ struct per_cpu_pages *pcp;
+
+ pset = per_cpu_ptr(zone->pageset, cpu);
+ pcp = &pset->pcp;
+
+ local_irq_save(flags);
+ if (pcp->count > 0)
+ free_pcppages_bulk(zone, pcp->count, pcp);
+ setup_pageset(pset, batch);
+ local_irq_restore(flags);
+ }
+ return 0;
+}
+
+void __meminit zone_pcp_update(struct zone *zone)
+{
+ stop_machine(__zone_pcp_update, zone, NULL);
+}
+#endif
+
#ifdef CONFIG_MEMORY_HOTREMOVE
+void zone_pcp_reset(struct zone *zone)
+{
+ unsigned long flags;
+
+ /* avoid races with drain_pages() */
+ local_irq_save(flags);
+ if (zone->pageset != &boot_pageset) {
+ free_percpu(zone->pageset);
+ zone->pageset = &boot_pageset;
+ }
+ local_irq_restore(flags);
+}
+
/*
* All pages in the range must be isolated before calling this.
*/
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index 1ccbd714059c..5ddad0c6daa6 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -317,7 +317,7 @@ void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
#endif
-#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
+#ifdef CONFIG_MEMCG_SWAP
static DEFINE_MUTEX(swap_cgroup_mutex);
struct swap_cgroup_ctrl {
@@ -392,7 +392,7 @@ static struct swap_cgroup *lookup_swap_cgroup(swp_entry_t ent,
/**
* swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry.
- * @end: swap entry to be cmpxchged
+ * @ent: swap entry to be cmpxchged
* @old: old id
* @new: new id
*
@@ -422,7 +422,7 @@ unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
/**
* swap_cgroup_record - record mem_cgroup for this swp_entry.
* @ent: swap entry to be recorded into
- * @mem: mem_cgroup to be recorded
+ * @id: mem_cgroup to be recorded
*
* Returns old value at success, 0 at failure.
* (Of course, old value can be 0.)
diff --git a/mm/page_io.c b/mm/page_io.c
index dc76b4d0611e..78eee32ee486 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -17,7 +17,9 @@
#include <linux/swap.h>
#include <linux/bio.h>
#include <linux/swapops.h>
+#include <linux/buffer_head.h>
#include <linux/writeback.h>
+#include <linux/frontswap.h>
#include <asm/pgtable.h>
static struct bio *get_swap_bio(gfp_t gfp_flags,
@@ -85,6 +87,98 @@ void end_swap_bio_read(struct bio *bio, int err)
bio_put(bio);
}
+int generic_swapfile_activate(struct swap_info_struct *sis,
+ struct file *swap_file,
+ sector_t *span)
+{
+ struct address_space *mapping = swap_file->f_mapping;
+ struct inode *inode = mapping->host;
+ unsigned blocks_per_page;
+ unsigned long page_no;
+ unsigned blkbits;
+ sector_t probe_block;
+ sector_t last_block;
+ sector_t lowest_block = -1;
+ sector_t highest_block = 0;
+ int nr_extents = 0;
+ int ret;
+
+ blkbits = inode->i_blkbits;
+ blocks_per_page = PAGE_SIZE >> blkbits;
+
+ /*
+ * Map all the blocks into the extent list. This code doesn't try
+ * to be very smart.
+ */
+ probe_block = 0;
+ page_no = 0;
+ last_block = i_size_read(inode) >> blkbits;
+ while ((probe_block + blocks_per_page) <= last_block &&
+ page_no < sis->max) {
+ unsigned block_in_page;
+ sector_t first_block;
+
+ first_block = bmap(inode, probe_block);
+ if (first_block == 0)
+ goto bad_bmap;
+
+ /*
+ * It must be PAGE_SIZE aligned on-disk
+ */
+ if (first_block & (blocks_per_page - 1)) {
+ probe_block++;
+ goto reprobe;
+ }
+
+ for (block_in_page = 1; block_in_page < blocks_per_page;
+ block_in_page++) {
+ sector_t block;
+
+ block = bmap(inode, probe_block + block_in_page);
+ if (block == 0)
+ goto bad_bmap;
+ if (block != first_block + block_in_page) {
+ /* Discontiguity */
+ probe_block++;
+ goto reprobe;
+ }
+ }
+
+ first_block >>= (PAGE_SHIFT - blkbits);
+ if (page_no) { /* exclude the header page */
+ if (first_block < lowest_block)
+ lowest_block = first_block;
+ if (first_block > highest_block)
+ highest_block = first_block;
+ }
+
+ /*
+ * We found a PAGE_SIZE-length, PAGE_SIZE-aligned run of blocks
+ */
+ ret = add_swap_extent(sis, page_no, 1, first_block);
+ if (ret < 0)
+ goto out;
+ nr_extents += ret;
+ page_no++;
+ probe_block += blocks_per_page;
+reprobe:
+ continue;
+ }
+ ret = nr_extents;
+ *span = 1 + highest_block - lowest_block;
+ if (page_no == 0)
+ page_no = 1; /* force Empty message */
+ sis->max = page_no;
+ sis->pages = page_no - 1;
+ sis->highest_bit = page_no - 1;
+out:
+ return ret;
+bad_bmap:
+ printk(KERN_ERR "swapon: swapfile has holes\n");
+ ret = -EINVAL;
+ goto out;
+}
+
/*
* We may have stale swap cache pages in memory: notice
* them here and get rid of the unnecessary final write.
@@ -93,11 +187,45 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
{
struct bio *bio;
int ret = 0, rw = WRITE;
+ struct swap_info_struct *sis = page_swap_info(page);
if (try_to_free_swap(page)) {
unlock_page(page);
goto out;
}
+ if (frontswap_store(page) == 0) {
+ set_page_writeback(page);
+ unlock_page(page);
+ end_page_writeback(page);
+ goto out;
+ }
+
+ if (sis->flags & SWP_FILE) {
+ struct kiocb kiocb;
+ struct file *swap_file = sis->swap_file;
+ struct address_space *mapping = swap_file->f_mapping;
+ struct iovec iov = {
+ .iov_base = kmap(page),
+ .iov_len = PAGE_SIZE,
+ };
+
+ init_sync_kiocb(&kiocb, swap_file);
+ kiocb.ki_pos = page_file_offset(page);
+ kiocb.ki_left = PAGE_SIZE;
+ kiocb.ki_nbytes = PAGE_SIZE;
+
+ unlock_page(page);
+ ret = mapping->a_ops->direct_IO(KERNEL_WRITE,
+ &kiocb, &iov,
+ kiocb.ki_pos, 1);
+ kunmap(page);
+ if (ret == PAGE_SIZE) {
+ count_vm_event(PSWPOUT);
+ ret = 0;
+ }
+ return ret;
+ }
+
bio = get_swap_bio(GFP_NOIO, page, end_swap_bio_write);
if (bio == NULL) {
set_page_dirty(page);
@@ -119,9 +247,26 @@ int swap_readpage(struct page *page)
{
struct bio *bio;
int ret = 0;
+ struct swap_info_struct *sis = page_swap_info(page);
VM_BUG_ON(!PageLocked(page));
VM_BUG_ON(PageUptodate(page));
+ if (frontswap_load(page) == 0) {
+ SetPageUptodate(page);
+ unlock_page(page);
+ goto out;
+ }
+
+ if (sis->flags & SWP_FILE) {
+ struct file *swap_file = sis->swap_file;
+ struct address_space *mapping = swap_file->f_mapping;
+
+ ret = mapping->a_ops->readpage(swap_file, page);
+ if (!ret)
+ count_vm_event(PSWPIN);
+ return ret;
+ }
+
bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read);
if (bio == NULL) {
unlock_page(page);
@@ -133,3 +278,15 @@ int swap_readpage(struct page *page)
out:
return ret;
}
+
+int swap_set_page_dirty(struct page *page)
+{
+ struct swap_info_struct *sis = page_swap_info(page);
+
+ if (sis->flags & SWP_FILE) {
+ struct address_space *mapping = sis->swap_file->f_mapping;
+ return mapping->a_ops->set_page_dirty(page);
+ } else {
+ return __set_page_dirty_no_writeback(page);
+ }
+}
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index c9f04774f2b8..247d1f175739 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -5,8 +5,101 @@
#include <linux/mm.h>
#include <linux/page-isolation.h>
#include <linux/pageblock-flags.h>
+#include <linux/memory.h>
#include "internal.h"
+/* called while holding zone->lock */
+static void set_pageblock_isolate(struct page *page)
+{
+ if (get_pageblock_migratetype(page) == MIGRATE_ISOLATE)
+ return;
+
+ set_pageblock_migratetype(page, MIGRATE_ISOLATE);
+ page_zone(page)->nr_pageblock_isolate++;
+}
+
+/* called while holding zone->lock */
+static void restore_pageblock_isolate(struct page *page, int migratetype)
+{
+ struct zone *zone = page_zone(page);
+ if (WARN_ON(get_pageblock_migratetype(page) != MIGRATE_ISOLATE))
+ return;
+
+ BUG_ON(zone->nr_pageblock_isolate <= 0);
+ set_pageblock_migratetype(page, migratetype);
+ zone->nr_pageblock_isolate--;
+}
+
+int set_migratetype_isolate(struct page *page)
+{
+ struct zone *zone;
+ unsigned long flags, pfn;
+ struct memory_isolate_notify arg;
+ int notifier_ret;
+ int ret = -EBUSY;
+
+ zone = page_zone(page);
+
+ spin_lock_irqsave(&zone->lock, flags);
+
+ pfn = page_to_pfn(page);
+ arg.start_pfn = pfn;
+ arg.nr_pages = pageblock_nr_pages;
+ arg.pages_found = 0;
+
+ /*
+ * It may be possible to isolate a pageblock even if the
+ * migratetype is not MIGRATE_MOVABLE. The memory isolation
+ * notifier chain is used by balloon drivers to return the
+ * number of pages in a range that are held by the balloon
+ * driver to shrink memory. If all the pages are accounted for
+ * by balloons, are free, or on the LRU, isolation can continue.
+ * Later, for example, when memory hotplug notifier runs, these
+ * pages reported as "can be isolated" should be isolated(freed)
+ * by the balloon driver through the memory notifier chain.
+ */
+ notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg);
+ notifier_ret = notifier_to_errno(notifier_ret);
+ if (notifier_ret)
+ goto out;
+ /*
+ * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself.
+ * We just check MOVABLE pages.
+ */
+ if (!has_unmovable_pages(zone, page, arg.pages_found))
+ ret = 0;
+
+ /*
+ * immobile means "not-on-lru" paes. If immobile is larger than
+ * removable-by-driver pages reported by notifier, we'll fail.
+ */
+
+out:
+ if (!ret) {
+ set_pageblock_isolate(page);
+ move_freepages_block(zone, page, MIGRATE_ISOLATE);
+ }
+
+ spin_unlock_irqrestore(&zone->lock, flags);
+ if (!ret)
+ drain_all_pages();
+ return ret;
+}
+
+void unset_migratetype_isolate(struct page *page, unsigned migratetype)
+{
+ struct zone *zone;
+ unsigned long flags;
+ zone = page_zone(page);
+ spin_lock_irqsave(&zone->lock, flags);
+ if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE)
+ goto out;
+ move_freepages_block(zone, page, migratetype);
+ restore_pageblock_isolate(page, migratetype);
+out:
+ spin_unlock_irqrestore(&zone->lock, flags);
+}
+
static inline struct page *
__first_valid_page(unsigned long pfn, unsigned long nr_pages)
{
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index aa9701e12714..6c118d012bb5 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -162,7 +162,6 @@ static int walk_hugetlb_range(struct vm_area_struct *vma,
/**
* walk_page_range - walk a memory map's page tables with a callback
- * @mm: memory map to walk
* @addr: starting address
* @end: ending address
* @walk: set of callbacks to invoke for each level of the tree
diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c
index 405d331804c3..3707c71ae4cd 100644
--- a/mm/percpu-vm.c
+++ b/mm/percpu-vm.c
@@ -360,7 +360,6 @@ err_free:
* @chunk: chunk to depopulate
* @off: offset to the area to depopulate
* @size: size of the area to depopulate in bytes
- * @flush: whether to flush cache and tlb or not
*
* For each cpu, depopulate and unmap pages [@page_start,@page_end)
* from @chunk. If @flush is true, vcache is flushed before unmapping
diff --git a/mm/shmem.c b/mm/shmem.c
index c244e93a70fa..d4e184e2a38e 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -264,46 +264,55 @@ static int shmem_radix_tree_replace(struct address_space *mapping,
}
/*
+ * Sometimes, before we decide whether to proceed or to fail, we must check
+ * that an entry was not already brought back from swap by a racing thread.
+ *
+ * Checking page is not enough: by the time a SwapCache page is locked, it
+ * might be reused, and again be SwapCache, using the same swap as before.
+ */
+static bool shmem_confirm_swap(struct address_space *mapping,
+ pgoff_t index, swp_entry_t swap)
+{
+ void *item;
+
+ rcu_read_lock();
+ item = radix_tree_lookup(&mapping->page_tree, index);
+ rcu_read_unlock();
+ return item == swp_to_radix_entry(swap);
+}
+
+/*
* Like add_to_page_cache_locked, but error if expected item has gone.
*/
static int shmem_add_to_page_cache(struct page *page,
struct address_space *mapping,
pgoff_t index, gfp_t gfp, void *expected)
{
- int error = 0;
+ int error;
VM_BUG_ON(!PageLocked(page));
VM_BUG_ON(!PageSwapBacked(page));
+ page_cache_get(page);
+ page->mapping = mapping;
+ page->index = index;
+
+ spin_lock_irq(&mapping->tree_lock);
if (!expected)
- error = radix_tree_preload(gfp & GFP_RECLAIM_MASK);
+ error = radix_tree_insert(&mapping->page_tree, index, page);
+ else
+ error = shmem_radix_tree_replace(mapping, index, expected,
+ page);
if (!error) {
- page_cache_get(page);
- page->mapping = mapping;
- page->index = index;
-
- spin_lock_irq(&mapping->tree_lock);
- if (!expected)
- error = radix_tree_insert(&mapping->page_tree,
- index, page);
- else
- error = shmem_radix_tree_replace(mapping, index,
- expected, page);
- if (!error) {
- mapping->nrpages++;
- __inc_zone_page_state(page, NR_FILE_PAGES);
- __inc_zone_page_state(page, NR_SHMEM);
- spin_unlock_irq(&mapping->tree_lock);
- } else {
- page->mapping = NULL;
- spin_unlock_irq(&mapping->tree_lock);
- page_cache_release(page);
- }
- if (!expected)
- radix_tree_preload_end();
+ mapping->nrpages++;
+ __inc_zone_page_state(page, NR_FILE_PAGES);
+ __inc_zone_page_state(page, NR_SHMEM);
+ spin_unlock_irq(&mapping->tree_lock);
+ } else {
+ page->mapping = NULL;
+ spin_unlock_irq(&mapping->tree_lock);
+ page_cache_release(page);
}
- if (error)
- mem_cgroup_uncharge_cache_page(page);
return error;
}
@@ -683,10 +692,21 @@ static int shmem_unuse_inode(struct shmem_inode_info *info,
mutex_lock(&shmem_swaplist_mutex);
/*
* We needed to drop mutex to make that restrictive page
- * allocation; but the inode might already be freed by now,
- * and we cannot refer to inode or mapping or info to check.
- * However, we do hold page lock on the PageSwapCache page,
- * so can check if that still has our reference remaining.
+ * allocation, but the inode might have been freed while we
+ * dropped it: although a racing shmem_evict_inode() cannot
+ * complete without emptying the radix_tree, our page lock
+ * on this swapcache page is not enough to prevent that -
+ * free_swap_and_cache() of our swap entry will only
+ * trylock_page(), removing swap from radix_tree whatever.
+ *
+ * We must not proceed to shmem_add_to_page_cache() if the
+ * inode has been freed, but of course we cannot rely on
+ * inode or mapping or info to check that. However, we can
+ * safely check if our swap entry is still in use (and here
+ * it can't have got reused for another page): if it's still
+ * in use, then the inode cannot have been freed yet, and we
+ * can safely proceed (if it's no longer in use, that tells
+ * nothing about the inode, but we don't need to unuse swap).
*/
if (!page_swapcount(*pagep))
error = -ENOENT;
@@ -730,9 +750,9 @@ int shmem_unuse(swp_entry_t swap, struct page *page)
/*
* There's a faint possibility that swap page was replaced before
- * caller locked it: it will come back later with the right page.
+ * caller locked it: caller will come back later with the right page.
*/
- if (unlikely(!PageSwapCache(page)))
+ if (unlikely(!PageSwapCache(page) || page_private(page) != swap.val))
goto out;
/*
@@ -909,7 +929,8 @@ static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
/* Create a pseudo vma that just contains the policy */
pvma.vm_start = 0;
- pvma.vm_pgoff = index;
+ /* Bias interleave by inode number to distribute better across nodes */
+ pvma.vm_pgoff = index + info->vfs_inode.i_ino;
pvma.vm_ops = NULL;
pvma.vm_policy = spol;
return swapin_readahead(swap, gfp, &pvma, 0);
@@ -922,7 +943,8 @@ static struct page *shmem_alloc_page(gfp_t gfp,
/* Create a pseudo vma that just contains the policy */
pvma.vm_start = 0;
- pvma.vm_pgoff = index;
+ /* Bias interleave by inode number to distribute better across nodes */
+ pvma.vm_pgoff = index + info->vfs_inode.i_ino;
pvma.vm_ops = NULL;
pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index);
@@ -995,21 +1017,15 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
newpage = shmem_alloc_page(gfp, info, index);
if (!newpage)
return -ENOMEM;
- VM_BUG_ON(shmem_should_replace_page(newpage, gfp));
- *pagep = newpage;
page_cache_get(newpage);
copy_highpage(newpage, oldpage);
+ flush_dcache_page(newpage);
- VM_BUG_ON(!PageLocked(oldpage));
__set_page_locked(newpage);
- VM_BUG_ON(!PageUptodate(oldpage));
SetPageUptodate(newpage);
- VM_BUG_ON(!PageSwapBacked(oldpage));
SetPageSwapBacked(newpage);
- VM_BUG_ON(!swap_index);
set_page_private(newpage, swap_index);
- VM_BUG_ON(!PageSwapCache(oldpage));
SetPageSwapCache(newpage);
/*
@@ -1019,13 +1035,24 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
spin_lock_irq(&swap_mapping->tree_lock);
error = shmem_radix_tree_replace(swap_mapping, swap_index, oldpage,
newpage);
- __inc_zone_page_state(newpage, NR_FILE_PAGES);
- __dec_zone_page_state(oldpage, NR_FILE_PAGES);
+ if (!error) {
+ __inc_zone_page_state(newpage, NR_FILE_PAGES);
+ __dec_zone_page_state(oldpage, NR_FILE_PAGES);
+ }
spin_unlock_irq(&swap_mapping->tree_lock);
- BUG_ON(error);
- mem_cgroup_replace_page_cache(oldpage, newpage);
- lru_cache_add_anon(newpage);
+ if (unlikely(error)) {
+ /*
+ * Is this possible? I think not, now that our callers check
+ * both PageSwapCache and page_private after getting page lock;
+ * but be defensive. Reverse old to newpage for clear and free.
+ */
+ oldpage = newpage;
+ } else {
+ mem_cgroup_replace_page_cache(oldpage, newpage);
+ lru_cache_add_anon(newpage);
+ *pagep = newpage;
+ }
ClearPageSwapCache(oldpage);
set_page_private(oldpage, 0);
@@ -1033,7 +1060,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
unlock_page(oldpage);
page_cache_release(oldpage);
page_cache_release(oldpage);
- return 0;
+ return error;
}
/*
@@ -1107,9 +1134,10 @@ repeat:
/* We have to do this with page locked to prevent races */
lock_page(page);
- if (!PageSwapCache(page) || page->mapping) {
+ if (!PageSwapCache(page) || page_private(page) != swap.val ||
+ !shmem_confirm_swap(mapping, index, swap)) {
error = -EEXIST; /* try again */
- goto failed;
+ goto unlock;
}
if (!PageUptodate(page)) {
error = -EIO;
@@ -1125,9 +1153,12 @@ repeat:
error = mem_cgroup_cache_charge(page, current->mm,
gfp & GFP_RECLAIM_MASK);
- if (!error)
+ if (!error) {
error = shmem_add_to_page_cache(page, mapping, index,
gfp, swp_to_radix_entry(swap));
+ /* We already confirmed swap, and make no allocation */
+ VM_BUG_ON(error);
+ }
if (error)
goto failed;
@@ -1164,11 +1195,18 @@ repeat:
__set_page_locked(page);
error = mem_cgroup_cache_charge(page, current->mm,
gfp & GFP_RECLAIM_MASK);
- if (!error)
- error = shmem_add_to_page_cache(page, mapping, index,
- gfp, NULL);
if (error)
goto decused;
+ error = radix_tree_preload(gfp & GFP_RECLAIM_MASK);
+ if (!error) {
+ error = shmem_add_to_page_cache(page, mapping, index,
+ gfp, NULL);
+ radix_tree_preload_end();
+ }
+ if (error) {
+ mem_cgroup_uncharge_cache_page(page);
+ goto decused;
+ }
lru_cache_add_anon(page);
spin_lock(&info->lock);
@@ -1228,14 +1266,10 @@ decused:
unacct:
shmem_unacct_blocks(info->flags, 1);
failed:
- if (swap.val && error != -EINVAL) {
- struct page *test = find_get_page(mapping, index);
- if (test && !radix_tree_exceptional_entry(test))
- page_cache_release(test);
- /* Have another try if the entry has changed */
- if (test != swp_to_radix_entry(swap))
- error = -EEXIST;
- }
+ if (swap.val && error != -EINVAL &&
+ !shmem_confirm_swap(mapping, index, swap))
+ error = -EEXIST;
+unlock:
if (page) {
unlock_page(page);
page_cache_release(page);
@@ -1247,7 +1281,7 @@ failed:
spin_unlock(&info->lock);
goto repeat;
}
- if (error == -EEXIST)
+ if (error == -EEXIST) /* from above or from radix_tree_insert */
goto repeat;
return error;
}
@@ -1675,98 +1709,6 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
return error;
}
-/*
- * llseek SEEK_DATA or SEEK_HOLE through the radix_tree.
- */
-static pgoff_t shmem_seek_hole_data(struct address_space *mapping,
- pgoff_t index, pgoff_t end, int origin)
-{
- struct page *page;
- struct pagevec pvec;
- pgoff_t indices[PAGEVEC_SIZE];
- bool done = false;
- int i;
-
- pagevec_init(&pvec, 0);
- pvec.nr = 1; /* start small: we may be there already */
- while (!done) {
- pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
- pvec.nr, pvec.pages, indices);
- if (!pvec.nr) {
- if (origin == SEEK_DATA)
- index = end;
- break;
- }
- for (i = 0; i < pvec.nr; i++, index++) {
- if (index < indices[i]) {
- if (origin == SEEK_HOLE) {
- done = true;
- break;
- }
- index = indices[i];
- }
- page = pvec.pages[i];
- if (page && !radix_tree_exceptional_entry(page)) {
- if (!PageUptodate(page))
- page = NULL;
- }
- if (index >= end ||
- (page && origin == SEEK_DATA) ||
- (!page && origin == SEEK_HOLE)) {
- done = true;
- break;
- }
- }
- shmem_deswap_pagevec(&pvec);
- pagevec_release(&pvec);
- pvec.nr = PAGEVEC_SIZE;
- cond_resched();
- }
- return index;
-}
-
-static loff_t shmem_file_llseek(struct file *file, loff_t offset, int origin)
-{
- struct address_space *mapping;
- struct inode *inode;
- pgoff_t start, end;
- loff_t new_offset;
-
- if (origin != SEEK_DATA && origin != SEEK_HOLE)
- return generic_file_llseek_size(file, offset, origin,
- MAX_LFS_FILESIZE);
- mapping = file->f_mapping;
- inode = mapping->host;
- mutex_lock(&inode->i_mutex);
- /* We're holding i_mutex so we can access i_size directly */
-
- if (offset < 0)
- offset = -EINVAL;
- else if (offset >= inode->i_size)
- offset = -ENXIO;
- else {
- start = offset >> PAGE_CACHE_SHIFT;
- end = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
- new_offset = shmem_seek_hole_data(mapping, start, end, origin);
- new_offset <<= PAGE_CACHE_SHIFT;
- if (new_offset > offset) {
- if (new_offset < inode->i_size)
- offset = new_offset;
- else if (origin == SEEK_DATA)
- offset = -ENXIO;
- else
- offset = inode->i_size;
- }
- }
-
- if (offset >= 0 && offset != file->f_pos) {
- file->f_pos = offset;
- file->f_version = 0;
- }
- mutex_unlock(&inode->i_mutex);
- return offset;
-}
-
static long shmem_fallocate(struct file *file, int mode, loff_t offset,
loff_t len)
{
@@ -1937,7 +1879,7 @@ static int shmem_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
}
static int shmem_create(struct inode *dir, struct dentry *dentry, umode_t mode,
- struct nameidata *nd)
+ bool excl)
{
return shmem_mknod(dir, dentry, mode | S_IFREG, 0);
}
@@ -2770,7 +2712,7 @@ static const struct address_space_operations shmem_aops = {
static const struct file_operations shmem_file_operations = {
.mmap = shmem_mmap,
#ifdef CONFIG_TMPFS
- .llseek = shmem_file_llseek,
+ .llseek = generic_file_llseek,
.read = do_sync_read,
.write = do_sync_write,
.aio_read = shmem_file_aio_read,
diff --git a/mm/slab.c b/mm/slab.c
index e901a36e2520..f8b0d539b482 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -68,7 +68,7 @@
* Further notes from the original documentation:
*
* 11 April '97. Started multi-threading - markhe
- * The global cache-chain is protected by the mutex 'cache_chain_mutex'.
+ * The global cache-chain is protected by the mutex 'slab_mutex'.
* The sem is only needed when accessing/extending the cache-chain, which
* can never happen inside an interrupt (kmem_cache_create(),
* kmem_cache_shrink() and kmem_cache_reap()).
@@ -87,6 +87,7 @@
*/
#include <linux/slab.h>
+#include "slab.h"
#include <linux/mm.h>
#include <linux/poison.h>
#include <linux/swap.h>
@@ -117,12 +118,16 @@
#include <linux/memory.h>
#include <linux/prefetch.h>
+#include <net/sock.h>
+
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
#include <asm/page.h>
#include <trace/events/kmem.h>
+#include "internal.h"
+
/*
* DEBUG - 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON.
* 0 for faster, smaller code (especially in the critical paths).
@@ -151,6 +156,12 @@
#define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN
#endif
+/*
+ * true if a page was allocated from pfmemalloc reserves for network-based
+ * swap
+ */
+static bool pfmemalloc_active __read_mostly;
+
/* Legal flag mask for kmem_cache_create(). */
#if DEBUG
# define CREATE_MASK (SLAB_RED_ZONE | \
@@ -256,9 +267,30 @@ struct array_cache {
* Must have this definition in here for the proper
* alignment of array_cache. Also simplifies accessing
* the entries.
+ *
+ * Entries should not be directly dereferenced as
+ * entries belonging to slabs marked pfmemalloc will
+ * have the lower bits set SLAB_OBJ_PFMEMALLOC
*/
};
+#define SLAB_OBJ_PFMEMALLOC 1
+static inline bool is_obj_pfmemalloc(void *objp)
+{
+ return (unsigned long)objp & SLAB_OBJ_PFMEMALLOC;
+}
+
+static inline void set_obj_pfmemalloc(void **objp)
+{
+ *objp = (void *)((unsigned long)*objp | SLAB_OBJ_PFMEMALLOC);
+ return;
+}
+
+static inline void clear_obj_pfmemalloc(void **objp)
+{
+ *objp = (void *)((unsigned long)*objp & ~SLAB_OBJ_PFMEMALLOC);
+}
+
/*
* bootstrap: The caches do not work without cpuarrays anymore, but the
* cpuarrays are allocated from the generic caches...
@@ -424,8 +456,8 @@ static void kmem_list3_init(struct kmem_list3 *parent)
* cachep->obj_offset - BYTES_PER_WORD .. cachep->obj_offset - 1:
* redzone word.
* cachep->obj_offset: The real object.
- * cachep->buffer_size - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long]
- * cachep->buffer_size - 1* BYTES_PER_WORD: last caller address
+ * cachep->size - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long]
+ * cachep->size - 1* BYTES_PER_WORD: last caller address
* [BYTES_PER_WORD long]
*/
static int obj_offset(struct kmem_cache *cachep)
@@ -433,11 +465,6 @@ static int obj_offset(struct kmem_cache *cachep)
return cachep->obj_offset;
}
-static int obj_size(struct kmem_cache *cachep)
-{
- return cachep->obj_size;
-}
-
static unsigned long long *dbg_redzone1(struct kmem_cache *cachep, void *objp)
{
BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
@@ -449,23 +476,22 @@ static unsigned long long *dbg_redzone2(struct kmem_cache *cachep, void *objp)
{
BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
if (cachep->flags & SLAB_STORE_USER)
- return (unsigned long long *)(objp + cachep->buffer_size -
+ return (unsigned long long *)(objp + cachep->size -
sizeof(unsigned long long) -
REDZONE_ALIGN);
- return (unsigned long long *) (objp + cachep->buffer_size -
+ return (unsigned long long *) (objp + cachep->size -
sizeof(unsigned long long));
}
static void **dbg_userword(struct kmem_cache *cachep, void *objp)
{
BUG_ON(!(cachep->flags & SLAB_STORE_USER));
- return (void **)(objp + cachep->buffer_size - BYTES_PER_WORD);
+ return (void **)(objp + cachep->size - BYTES_PER_WORD);
}
#else
#define obj_offset(x) 0
-#define obj_size(cachep) (cachep->buffer_size)
#define dbg_redzone1(cachep, objp) ({BUG(); (unsigned long long *)NULL;})
#define dbg_redzone2(cachep, objp) ({BUG(); (unsigned long long *)NULL;})
#define dbg_userword(cachep, objp) ({BUG(); (void **)NULL;})
@@ -475,7 +501,7 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp)
#ifdef CONFIG_TRACING
size_t slab_buffer_size(struct kmem_cache *cachep)
{
- return cachep->buffer_size;
+ return cachep->size;
}
EXPORT_SYMBOL(slab_buffer_size);
#endif
@@ -489,56 +515,37 @@ EXPORT_SYMBOL(slab_buffer_size);
static int slab_max_order = SLAB_MAX_ORDER_LO;
static bool slab_max_order_set __initdata;
-/*
- * Functions for storing/retrieving the cachep and or slab from the page
- * allocator. These are used to find the slab an obj belongs to. With kfree(),
- * these are used to find the cache which an obj belongs to.
- */
-static inline void page_set_cache(struct page *page, struct kmem_cache *cache)
-{
- page->lru.next = (struct list_head *)cache;
-}
-
static inline struct kmem_cache *page_get_cache(struct page *page)
{
page = compound_head(page);
BUG_ON(!PageSlab(page));
- return (struct kmem_cache *)page->lru.next;
-}
-
-static inline void page_set_slab(struct page *page, struct slab *slab)
-{
- page->lru.prev = (struct list_head *)slab;
-}
-
-static inline struct slab *page_get_slab(struct page *page)
-{
- BUG_ON(!PageSlab(page));
- return (struct slab *)page->lru.prev;
+ return page->slab_cache;
}
static inline struct kmem_cache *virt_to_cache(const void *obj)
{
struct page *page = virt_to_head_page(obj);
- return page_get_cache(page);
+ return page->slab_cache;
}
static inline struct slab *virt_to_slab(const void *obj)
{
struct page *page = virt_to_head_page(obj);
- return page_get_slab(page);
+
+ VM_BUG_ON(!PageSlab(page));
+ return page->slab_page;
}
static inline void *index_to_obj(struct kmem_cache *cache, struct slab *slab,
unsigned int idx)
{
- return slab->s_mem + cache->buffer_size * idx;
+ return slab->s_mem + cache->size * idx;
}
/*
- * We want to avoid an expensive divide : (offset / cache->buffer_size)
- * Using the fact that buffer_size is a constant for a particular cache,
- * we can replace (offset / cache->buffer_size) by
+ * We want to avoid an expensive divide : (offset / cache->size)
+ * Using the fact that size is a constant for a particular cache,
+ * we can replace (offset / cache->size) by
* reciprocal_divide(offset, cache->reciprocal_buffer_size)
*/
static inline unsigned int obj_to_index(const struct kmem_cache *cache,
@@ -584,33 +591,12 @@ static struct kmem_cache cache_cache = {
.batchcount = 1,
.limit = BOOT_CPUCACHE_ENTRIES,
.shared = 1,
- .buffer_size = sizeof(struct kmem_cache),
+ .size = sizeof(struct kmem_cache),
.name = "kmem_cache",
};
#define BAD_ALIEN_MAGIC 0x01020304ul
-/*
- * chicken and egg problem: delay the per-cpu array allocation
- * until the general caches are up.
- */
-static enum {
- NONE,
- PARTIAL_AC,
- PARTIAL_L3,
- EARLY,
- LATE,
- FULL
-} g_cpucache_up;
-
-/*
- * used by boot code to determine if it can use slab based allocator
- */
-int slab_is_available(void)
-{
- return g_cpucache_up >= EARLY;
-}
-
#ifdef CONFIG_LOCKDEP
/*
@@ -676,7 +662,7 @@ static void init_node_lock_keys(int q)
{
struct cache_sizes *s = malloc_sizes;
- if (g_cpucache_up < LATE)
+ if (slab_state < UP)
return;
for (s = malloc_sizes; s->cs_size != ULONG_MAX; s++) {
@@ -716,12 +702,6 @@ static void slab_set_debugobj_lock_classes(struct kmem_cache *cachep)
}
#endif
-/*
- * Guard access to the cache-chain.
- */
-static DEFINE_MUTEX(cache_chain_mutex);
-static struct list_head cache_chain;
-
static DEFINE_PER_CPU(struct delayed_work, slab_reap_work);
static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep)
@@ -951,6 +931,124 @@ static struct array_cache *alloc_arraycache(int node, int entries,
return nc;
}
+static inline bool is_slab_pfmemalloc(struct slab *slabp)
+{
+ struct page *page = virt_to_page(slabp->s_mem);
+
+ return PageSlabPfmemalloc(page);
+}
+
+/* Clears pfmemalloc_active if no slabs have pfmalloc set */
+static void recheck_pfmemalloc_active(struct kmem_cache *cachep,
+ struct array_cache *ac)
+{
+ struct kmem_list3 *l3 = cachep->nodelists[numa_mem_id()];
+ struct slab *slabp;
+ unsigned long flags;
+
+ if (!pfmemalloc_active)
+ return;
+
+ spin_lock_irqsave(&l3->list_lock, flags);
+ list_for_each_entry(slabp, &l3->slabs_full, list)
+ if (is_slab_pfmemalloc(slabp))
+ goto out;
+
+ list_for_each_entry(slabp, &l3->slabs_partial, list)
+ if (is_slab_pfmemalloc(slabp))
+ goto out;
+
+ list_for_each_entry(slabp, &l3->slabs_free, list)
+ if (is_slab_pfmemalloc(slabp))
+ goto out;
+
+ pfmemalloc_active = false;
+out:
+ spin_unlock_irqrestore(&l3->list_lock, flags);
+}
+
+static void *__ac_get_obj(struct kmem_cache *cachep, struct array_cache *ac,
+ gfp_t flags, bool force_refill)
+{
+ int i;
+ void *objp = ac->entry[--ac->avail];
+
+ /* Ensure the caller is allowed to use objects from PFMEMALLOC slab */
+ if (unlikely(is_obj_pfmemalloc(objp))) {
+ struct kmem_list3 *l3;
+
+ if (gfp_pfmemalloc_allowed(flags)) {
+ clear_obj_pfmemalloc(&objp);
+ return objp;
+ }
+
+ /* The caller cannot use PFMEMALLOC objects, find another one */
+ for (i = 1; i < ac->avail; i++) {
+ /* If a !PFMEMALLOC object is found, swap them */
+ if (!is_obj_pfmemalloc(ac->entry[i])) {
+ objp = ac->entry[i];
+ ac->entry[i] = ac->entry[ac->avail];
+ ac->entry[ac->avail] = objp;
+ return objp;
+ }
+ }
+
+ /*
+ * If there are empty slabs on the slabs_free list and we are
+ * being forced to refill the cache, mark this one !pfmemalloc.
+ */
+ l3 = cachep->nodelists[numa_mem_id()];
+ if (!list_empty(&l3->slabs_free) && force_refill) {
+ struct slab *slabp = virt_to_slab(objp);
+ ClearPageSlabPfmemalloc(virt_to_page(slabp->s_mem));
+ clear_obj_pfmemalloc(&objp);
+ recheck_pfmemalloc_active(cachep, ac);
+ return objp;
+ }
+
+ /* No !PFMEMALLOC objects available */
+ ac->avail++;
+ objp = NULL;
+ }
+
+ return objp;
+}
+
+static inline void *ac_get_obj(struct kmem_cache *cachep,
+ struct array_cache *ac, gfp_t flags, bool force_refill)
+{
+ void *objp;
+
+ if (unlikely(sk_memalloc_socks()))
+ objp = __ac_get_obj(cachep, ac, flags, force_refill);
+ else
+ objp = ac->entry[--ac->avail];
+
+ return objp;
+}
+
+static void *__ac_put_obj(struct kmem_cache *cachep, struct array_cache *ac,
+ void *objp)
+{
+ if (unlikely(pfmemalloc_active)) {
+ /* Some pfmemalloc slabs exist, check if this is one */
+ struct page *page = virt_to_page(objp);
+ if (PageSlabPfmemalloc(page))
+ set_obj_pfmemalloc(&objp);
+ }
+
+ return objp;
+}
+
+static inline void ac_put_obj(struct kmem_cache *cachep, struct array_cache *ac,
+ void *objp)
+{
+ if (unlikely(sk_memalloc_socks()))
+ objp = __ac_put_obj(cachep, ac, objp);
+
+ ac->entry[ac->avail++] = objp;
+}
+
/*
* Transfer objects in one arraycache to another.
* Locking must be handled by the caller.
@@ -1127,7 +1225,7 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
STATS_INC_ACOVERFLOW(cachep);
__drain_alien_cache(cachep, alien, nodeid);
}
- alien->entry[alien->avail++] = objp;
+ ac_put_obj(cachep, alien, objp);
spin_unlock(&alien->lock);
} else {
spin_lock(&(cachep->nodelists[nodeid])->list_lock);
@@ -1145,7 +1243,7 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
* When hotplugging memory or a cpu, existing nodelists are not replaced if
* already in use.
*
- * Must hold cache_chain_mutex.
+ * Must hold slab_mutex.
*/
static int init_cache_nodelists_node(int node)
{
@@ -1153,7 +1251,7 @@ static int init_cache_nodelists_node(int node)
struct kmem_list3 *l3;
const int memsize = sizeof(struct kmem_list3);
- list_for_each_entry(cachep, &cache_chain, next) {
+ list_for_each_entry(cachep, &slab_caches, list) {
/*
* Set up the size64 kmemlist for cpu before we can
* begin anything. Make sure some other cpu on this
@@ -1169,7 +1267,7 @@ static int init_cache_nodelists_node(int node)
/*
* The l3s don't come and go as CPUs come and
- * go. cache_chain_mutex is sufficient
+ * go. slab_mutex is sufficient
* protection here.
*/
cachep->nodelists[node] = l3;
@@ -1191,7 +1289,7 @@ static void __cpuinit cpuup_canceled(long cpu)
int node = cpu_to_mem(cpu);
const struct cpumask *mask = cpumask_of_node(node);
- list_for_each_entry(cachep, &cache_chain, next) {
+ list_for_each_entry(cachep, &slab_caches, list) {
struct array_cache *nc;
struct array_cache *shared;
struct array_cache **alien;
@@ -1241,7 +1339,7 @@ free_array_cache:
* the respective cache's slabs, now we can go ahead and
* shrink each nodelist to its limit.
*/
- list_for_each_entry(cachep, &cache_chain, next) {
+ list_for_each_entry(cachep, &slab_caches, list) {
l3 = cachep->nodelists[node];
if (!l3)
continue;
@@ -1270,7 +1368,7 @@ static int __cpuinit cpuup_prepare(long cpu)
* Now we can go ahead with allocating the shared arrays and
* array caches
*/
- list_for_each_entry(cachep, &cache_chain, next) {
+ list_for_each_entry(cachep, &slab_caches, list) {
struct array_cache *nc;
struct array_cache *shared = NULL;
struct array_cache **alien = NULL;
@@ -1338,9 +1436,9 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb,
switch (action) {
case CPU_UP_PREPARE:
case CPU_UP_PREPARE_FROZEN:
- mutex_lock(&cache_chain_mutex);
+ mutex_lock(&slab_mutex);
err = cpuup_prepare(cpu);
- mutex_unlock(&cache_chain_mutex);
+ mutex_unlock(&slab_mutex);
break;
case CPU_ONLINE:
case CPU_ONLINE_FROZEN:
@@ -1350,7 +1448,7 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb,
case CPU_DOWN_PREPARE:
case CPU_DOWN_PREPARE_FROZEN:
/*
- * Shutdown cache reaper. Note that the cache_chain_mutex is
+ * Shutdown cache reaper. Note that the slab_mutex is
* held so that if cache_reap() is invoked it cannot do
* anything expensive but will only modify reap_work
* and reschedule the timer.
@@ -1377,9 +1475,9 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb,
#endif
case CPU_UP_CANCELED:
case CPU_UP_CANCELED_FROZEN:
- mutex_lock(&cache_chain_mutex);
+ mutex_lock(&slab_mutex);
cpuup_canceled(cpu);
- mutex_unlock(&cache_chain_mutex);
+ mutex_unlock(&slab_mutex);
break;
}
return notifier_from_errno(err);
@@ -1395,14 +1493,14 @@ static struct notifier_block __cpuinitdata cpucache_notifier = {
* Returns -EBUSY if all objects cannot be drained so that the node is not
* removed.
*
- * Must hold cache_chain_mutex.
+ * Must hold slab_mutex.
*/
static int __meminit drain_cache_nodelists_node(int node)
{
struct kmem_cache *cachep;
int ret = 0;
- list_for_each_entry(cachep, &cache_chain, next) {
+ list_for_each_entry(cachep, &slab_caches, list) {
struct kmem_list3 *l3;
l3 = cachep->nodelists[node];
@@ -1433,14 +1531,14 @@ static int __meminit slab_memory_callback(struct notifier_block *self,
switch (action) {
case MEM_GOING_ONLINE:
- mutex_lock(&cache_chain_mutex);
+ mutex_lock(&slab_mutex);
ret = init_cache_nodelists_node(nid);
- mutex_unlock(&cache_chain_mutex);
+ mutex_unlock(&slab_mutex);
break;
case MEM_GOING_OFFLINE:
- mutex_lock(&cache_chain_mutex);
+ mutex_lock(&slab_mutex);
ret = drain_cache_nodelists_node(nid);
- mutex_unlock(&cache_chain_mutex);
+ mutex_unlock(&slab_mutex);
break;
case MEM_ONLINE:
case MEM_OFFLINE:
@@ -1544,8 +1642,8 @@ void __init kmem_cache_init(void)
node = numa_mem_id();
/* 1) create the cache_cache */
- INIT_LIST_HEAD(&cache_chain);
- list_add(&cache_cache.next, &cache_chain);
+ INIT_LIST_HEAD(&slab_caches);
+ list_add(&cache_cache.list, &slab_caches);
cache_cache.colour_off = cache_line_size();
cache_cache.array[smp_processor_id()] = &initarray_cache.cache;
cache_cache.nodelists[node] = &initkmem_list3[CACHE_CACHE + node];
@@ -1553,18 +1651,16 @@ void __init kmem_cache_init(void)
/*
* struct kmem_cache size depends on nr_node_ids & nr_cpu_ids
*/
- cache_cache.buffer_size = offsetof(struct kmem_cache, array[nr_cpu_ids]) +
+ cache_cache.size = offsetof(struct kmem_cache, array[nr_cpu_ids]) +
nr_node_ids * sizeof(struct kmem_list3 *);
-#if DEBUG
- cache_cache.obj_size = cache_cache.buffer_size;
-#endif
- cache_cache.buffer_size = ALIGN(cache_cache.buffer_size,
+ cache_cache.object_size = cache_cache.size;
+ cache_cache.size = ALIGN(cache_cache.size,
cache_line_size());
cache_cache.reciprocal_buffer_size =
- reciprocal_value(cache_cache.buffer_size);
+ reciprocal_value(cache_cache.size);
for (order = 0; order < MAX_ORDER; order++) {
- cache_estimate(order, cache_cache.buffer_size,
+ cache_estimate(order, cache_cache.size,
cache_line_size(), 0, &left_over, &cache_cache.num);
if (cache_cache.num)
break;
@@ -1585,7 +1681,7 @@ void __init kmem_cache_init(void)
* bug.
*/
- sizes[INDEX_AC].cs_cachep = kmem_cache_create(names[INDEX_AC].name,
+ sizes[INDEX_AC].cs_cachep = __kmem_cache_create(names[INDEX_AC].name,
sizes[INDEX_AC].cs_size,
ARCH_KMALLOC_MINALIGN,
ARCH_KMALLOC_FLAGS|SLAB_PANIC,
@@ -1593,7 +1689,7 @@ void __init kmem_cache_init(void)
if (INDEX_AC != INDEX_L3) {
sizes[INDEX_L3].cs_cachep =
- kmem_cache_create(names[INDEX_L3].name,
+ __kmem_cache_create(names[INDEX_L3].name,
sizes[INDEX_L3].cs_size,
ARCH_KMALLOC_MINALIGN,
ARCH_KMALLOC_FLAGS|SLAB_PANIC,
@@ -1611,14 +1707,14 @@ void __init kmem_cache_init(void)
* allow tighter packing of the smaller caches.
*/
if (!sizes->cs_cachep) {
- sizes->cs_cachep = kmem_cache_create(names->name,
+ sizes->cs_cachep = __kmem_cache_create(names->name,
sizes->cs_size,
ARCH_KMALLOC_MINALIGN,
ARCH_KMALLOC_FLAGS|SLAB_PANIC,
NULL);
}
#ifdef CONFIG_ZONE_DMA
- sizes->cs_dmacachep = kmem_cache_create(
+ sizes->cs_dmacachep = __kmem_cache_create(
names->name_dma,
sizes->cs_size,
ARCH_KMALLOC_MINALIGN,
@@ -1676,27 +1772,27 @@ void __init kmem_cache_init(void)
}
}
- g_cpucache_up = EARLY;
+ slab_state = UP;
}
void __init kmem_cache_init_late(void)
{
struct kmem_cache *cachep;
- g_cpucache_up = LATE;
+ slab_state = UP;
/* Annotate slab for lockdep -- annotate the malloc caches */
init_lock_keys();
/* 6) resize the head arrays to their final sizes */
- mutex_lock(&cache_chain_mutex);
- list_for_each_entry(cachep, &cache_chain, next)
+ mutex_lock(&slab_mutex);
+ list_for_each_entry(cachep, &slab_caches, list)
if (enable_cpucache(cachep, GFP_NOWAIT))
BUG();
- mutex_unlock(&cache_chain_mutex);
+ mutex_unlock(&slab_mutex);
/* Done! */
- g_cpucache_up = FULL;
+ slab_state = FULL;
/*
* Register a cpu startup notifier callback that initializes
@@ -1727,6 +1823,9 @@ static int __init cpucache_init(void)
*/
for_each_online_cpu(cpu)
start_cpu_timer(cpu);
+
+ /* Done! */
+ slab_state = FULL;
return 0;
}
__initcall(cpucache_init);
@@ -1743,7 +1842,7 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid)
"SLAB: Unable to allocate memory on node %d (gfp=0x%x)\n",
nodeid, gfpflags);
printk(KERN_WARNING " cache: %s, object size: %d, order: %d\n",
- cachep->name, cachep->buffer_size, cachep->gfporder);
+ cachep->name, cachep->size, cachep->gfporder);
for_each_online_node(node) {
unsigned long active_objs = 0, num_objs = 0, free_objects = 0;
@@ -1798,7 +1897,7 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
flags |= __GFP_COMP;
#endif
- flags |= cachep->gfpflags;
+ flags |= cachep->allocflags;
if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
flags |= __GFP_RECLAIMABLE;
@@ -1809,6 +1908,10 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
return NULL;
}
+ /* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */
+ if (unlikely(page->pfmemalloc))
+ pfmemalloc_active = true;
+
nr_pages = (1 << cachep->gfporder);
if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
add_zone_page_state(page_zone(page),
@@ -1816,9 +1919,13 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
else
add_zone_page_state(page_zone(page),
NR_SLAB_UNRECLAIMABLE, nr_pages);
- for (i = 0; i < nr_pages; i++)
+ for (i = 0; i < nr_pages; i++) {
__SetPageSlab(page + i);
+ if (page->pfmemalloc)
+ SetPageSlabPfmemalloc(page + i);
+ }
+
if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) {
kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid);
@@ -1850,6 +1957,7 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr)
NR_SLAB_UNRECLAIMABLE, nr_freed);
while (i--) {
BUG_ON(!PageSlab(page));
+ __ClearPageSlabPfmemalloc(page);
__ClearPageSlab(page);
page++;
}
@@ -1874,7 +1982,7 @@ static void kmem_rcu_free(struct rcu_head *head)
static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr,
unsigned long caller)
{
- int size = obj_size(cachep);
+ int size = cachep->object_size;
addr = (unsigned long *)&((char *)addr)[obj_offset(cachep)];
@@ -1906,7 +2014,7 @@ static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr,
static void poison_obj(struct kmem_cache *cachep, void *addr, unsigned char val)
{
- int size = obj_size(cachep);
+ int size = cachep->object_size;
addr = &((char *)addr)[obj_offset(cachep)];
memset(addr, val, size);
@@ -1966,7 +2074,7 @@ static void print_objinfo(struct kmem_cache *cachep, void *objp, int lines)
printk("\n");
}
realobj = (char *)objp + obj_offset(cachep);
- size = obj_size(cachep);
+ size = cachep->object_size;
for (i = 0; i < size && lines; i += 16, lines--) {
int limit;
limit = 16;
@@ -1983,7 +2091,7 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp)
int lines = 0;
realobj = (char *)objp + obj_offset(cachep);
- size = obj_size(cachep);
+ size = cachep->object_size;
for (i = 0; i < size; i++) {
char exp = POISON_FREE;
@@ -2047,10 +2155,10 @@ static void slab_destroy_debugcheck(struct kmem_cache *cachep, struct slab *slab
if (cachep->flags & SLAB_POISON) {
#ifdef CONFIG_DEBUG_PAGEALLOC
- if (cachep->buffer_size % PAGE_SIZE == 0 &&
+ if (cachep->size % PAGE_SIZE == 0 &&
OFF_SLAB(cachep))
kernel_map_pages(virt_to_page(objp),
- cachep->buffer_size / PAGE_SIZE, 1);
+ cachep->size / PAGE_SIZE, 1);
else
check_poison_obj(cachep, objp);
#else
@@ -2194,10 +2302,10 @@ static size_t calculate_slab_order(struct kmem_cache *cachep,
static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
{
- if (g_cpucache_up == FULL)
+ if (slab_state >= FULL)
return enable_cpucache(cachep, gfp);
- if (g_cpucache_up == NONE) {
+ if (slab_state == DOWN) {
/*
* Note: the first kmem_cache_create must create the cache
* that's used by kmalloc(24), otherwise the creation of
@@ -2212,16 +2320,16 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
*/
set_up_list3s(cachep, SIZE_AC);
if (INDEX_AC == INDEX_L3)
- g_cpucache_up = PARTIAL_L3;
+ slab_state = PARTIAL_L3;
else
- g_cpucache_up = PARTIAL_AC;
+ slab_state = PARTIAL_ARRAYCACHE;
} else {
cachep->array[smp_processor_id()] =
kmalloc(sizeof(struct arraycache_init), gfp);
- if (g_cpucache_up == PARTIAL_AC) {
+ if (slab_state == PARTIAL_ARRAYCACHE) {
set_up_list3s(cachep, SIZE_L3);
- g_cpucache_up = PARTIAL_L3;
+ slab_state = PARTIAL_L3;
} else {
int node;
for_each_online_node(node) {
@@ -2247,7 +2355,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
}
/**
- * kmem_cache_create - Create a cache.
+ * __kmem_cache_create - Create a cache.
* @name: A string which is used in /proc/slabinfo to identify this cache.
* @size: The size of objects to be created in this cache.
* @align: The required alignment for the objects.
@@ -2274,59 +2382,14 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
* as davem.
*/
struct kmem_cache *
-kmem_cache_create (const char *name, size_t size, size_t align,
+__kmem_cache_create (const char *name, size_t size, size_t align,
unsigned long flags, void (*ctor)(void *))
{
size_t left_over, slab_size, ralign;
- struct kmem_cache *cachep = NULL, *pc;
+ struct kmem_cache *cachep = NULL;
gfp_t gfp;
- /*
- * Sanity checks... these are all serious usage bugs.
- */
- if (!name || in_interrupt() || (size < BYTES_PER_WORD) ||
- size > KMALLOC_MAX_SIZE) {
- printk(KERN_ERR "%s: Early error in slab %s\n", __func__,
- name);
- BUG();
- }
-
- /*
- * We use cache_chain_mutex to ensure a consistent view of
- * cpu_online_mask as well. Please see cpuup_callback
- */
- if (slab_is_available()) {
- get_online_cpus();
- mutex_lock(&cache_chain_mutex);
- }
-
- list_for_each_entry(pc, &cache_chain, next) {
- char tmp;
- int res;
-
- /*
- * This happens when the module gets unloaded and doesn't
- * destroy its slab cache and no-one else reuses the vmalloc
- * area of the module. Print a warning.
- */
- res = probe_kernel_address(pc->name, tmp);
- if (res) {
- printk(KERN_ERR
- "SLAB: cache with size %d has lost its name\n",
- pc->buffer_size);
- continue;
- }
-
- if (!strcmp(pc->name, name)) {
- printk(KERN_ERR
- "kmem_cache_create: duplicate cache %s\n", name);
- dump_stack();
- goto oops;
- }
- }
-
#if DEBUG
- WARN_ON(strchr(name, ' ')); /* It confuses parsers */
#if FORCED_DEBUG
/*
* Enable redzoning and last user accounting, except for caches with
@@ -2415,11 +2478,12 @@ kmem_cache_create (const char *name, size_t size, size_t align,
/* Get cache's description obj. */
cachep = kmem_cache_zalloc(&cache_cache, gfp);
if (!cachep)
- goto oops;
+ return NULL;
cachep->nodelists = (struct kmem_list3 **)&cachep->array[nr_cpu_ids];
+ cachep->object_size = size;
+ cachep->align = align;
#if DEBUG
- cachep->obj_size = size;
/*
* Both debugging options require word-alignment which is calculated
@@ -2442,7 +2506,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
}
#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC)
if (size >= malloc_sizes[INDEX_L3 + 1].cs_size
- && cachep->obj_size > cache_line_size() && ALIGN(size, align) < PAGE_SIZE) {
+ && cachep->object_size > cache_line_size() && ALIGN(size, align) < PAGE_SIZE) {
cachep->obj_offset += PAGE_SIZE - ALIGN(size, align);
size = PAGE_SIZE;
}
@@ -2471,8 +2535,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
printk(KERN_ERR
"kmem_cache_create: couldn't create cache %s.\n", name);
kmem_cache_free(&cache_cache, cachep);
- cachep = NULL;
- goto oops;
+ return NULL;
}
slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t)
+ sizeof(struct slab), align);
@@ -2508,10 +2571,10 @@ kmem_cache_create (const char *name, size_t size, size_t align,
cachep->colour = left_over / cachep->colour_off;
cachep->slab_size = slab_size;
cachep->flags = flags;
- cachep->gfpflags = 0;
+ cachep->allocflags = 0;
if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA))
- cachep->gfpflags |= GFP_DMA;
- cachep->buffer_size = size;
+ cachep->allocflags |= GFP_DMA;
+ cachep->size = size;
cachep->reciprocal_buffer_size = reciprocal_value(size);
if (flags & CFLGS_OFF_SLAB) {
@@ -2530,8 +2593,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
if (setup_cpu_cache(cachep, gfp)) {
__kmem_cache_destroy(cachep);
- cachep = NULL;
- goto oops;
+ return NULL;
}
if (flags & SLAB_DEBUG_OBJECTS) {
@@ -2545,18 +2607,9 @@ kmem_cache_create (const char *name, size_t size, size_t align,
}
/* cache setup completed, link it into the list */
- list_add(&cachep->next, &cache_chain);
-oops:
- if (!cachep && (flags & SLAB_PANIC))
- panic("kmem_cache_create(): failed to create slab `%s'\n",
- name);
- if (slab_is_available()) {
- mutex_unlock(&cache_chain_mutex);
- put_online_cpus();
- }
+ list_add(&cachep->list, &slab_caches);
return cachep;
}
-EXPORT_SYMBOL(kmem_cache_create);
#if DEBUG
static void check_irq_off(void)
@@ -2671,7 +2724,7 @@ out:
return nr_freed;
}
-/* Called with cache_chain_mutex held to protect against cpu hotplug */
+/* Called with slab_mutex held to protect against cpu hotplug */
static int __cache_shrink(struct kmem_cache *cachep)
{
int ret = 0, i = 0;
@@ -2706,9 +2759,9 @@ int kmem_cache_shrink(struct kmem_cache *cachep)
BUG_ON(!cachep || in_interrupt());
get_online_cpus();
- mutex_lock(&cache_chain_mutex);
+ mutex_lock(&slab_mutex);
ret = __cache_shrink(cachep);
- mutex_unlock(&cache_chain_mutex);
+ mutex_unlock(&slab_mutex);
put_online_cpus();
return ret;
}
@@ -2736,15 +2789,15 @@ void kmem_cache_destroy(struct kmem_cache *cachep)
/* Find the cache in the chain of caches. */
get_online_cpus();
- mutex_lock(&cache_chain_mutex);
+ mutex_lock(&slab_mutex);
/*
* the chain is never empty, cache_cache is never destroyed
*/
- list_del(&cachep->next);
+ list_del(&cachep->list);
if (__cache_shrink(cachep)) {
slab_error(cachep, "Can't free all objects");
- list_add(&cachep->next, &cache_chain);
- mutex_unlock(&cache_chain_mutex);
+ list_add(&cachep->list, &slab_caches);
+ mutex_unlock(&slab_mutex);
put_online_cpus();
return;
}
@@ -2753,7 +2806,7 @@ void kmem_cache_destroy(struct kmem_cache *cachep)
rcu_barrier();
__kmem_cache_destroy(cachep);
- mutex_unlock(&cache_chain_mutex);
+ mutex_unlock(&slab_mutex);
put_online_cpus();
}
EXPORT_SYMBOL(kmem_cache_destroy);
@@ -2840,10 +2893,10 @@ static void cache_init_objs(struct kmem_cache *cachep,
slab_error(cachep, "constructor overwrote the"
" start of an object");
}
- if ((cachep->buffer_size % PAGE_SIZE) == 0 &&
+ if ((cachep->size % PAGE_SIZE) == 0 &&
OFF_SLAB(cachep) && cachep->flags & SLAB_POISON)
kernel_map_pages(virt_to_page(objp),
- cachep->buffer_size / PAGE_SIZE, 0);
+ cachep->size / PAGE_SIZE, 0);
#else
if (cachep->ctor)
cachep->ctor(objp);
@@ -2857,9 +2910,9 @@ static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags)
{
if (CONFIG_ZONE_DMA_FLAG) {
if (flags & GFP_DMA)
- BUG_ON(!(cachep->gfpflags & GFP_DMA));
+ BUG_ON(!(cachep->allocflags & GFP_DMA));
else
- BUG_ON(cachep->gfpflags & GFP_DMA);
+ BUG_ON(cachep->allocflags & GFP_DMA);
}
}
@@ -2918,8 +2971,8 @@ static void slab_map_pages(struct kmem_cache *cache, struct slab *slab,
nr_pages <<= cache->gfporder;
do {
- page_set_cache(page, cache);
- page_set_slab(page, slab);
+ page->slab_cache = cache;
+ page->slab_page = slab;
page++;
} while (--nr_pages);
}
@@ -3057,7 +3110,7 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
kfree_debugcheck(objp);
page = virt_to_head_page(objp);
- slabp = page_get_slab(page);
+ slabp = page->slab_page;
if (cachep->flags & SLAB_RED_ZONE) {
verify_redzone_free(cachep, objp);
@@ -3077,10 +3130,10 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
#endif
if (cachep->flags & SLAB_POISON) {
#ifdef CONFIG_DEBUG_PAGEALLOC
- if ((cachep->buffer_size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) {
+ if ((cachep->size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) {
store_stackinfo(cachep, objp, (unsigned long)caller);
kernel_map_pages(virt_to_page(objp),
- cachep->buffer_size / PAGE_SIZE, 0);
+ cachep->size / PAGE_SIZE, 0);
} else {
poison_obj(cachep, objp, POISON_FREE);
}
@@ -3120,16 +3173,19 @@ bad:
#define check_slabp(x,y) do { } while(0)
#endif
-static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
+static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags,
+ bool force_refill)
{
int batchcount;
struct kmem_list3 *l3;
struct array_cache *ac;
int node;
-retry:
check_irq_off();
node = numa_mem_id();
+ if (unlikely(force_refill))
+ goto force_grow;
+retry:
ac = cpu_cache_get(cachep);
batchcount = ac->batchcount;
if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
@@ -3179,8 +3235,8 @@ retry:
STATS_INC_ACTIVE(cachep);
STATS_SET_HIGH(cachep);
- ac->entry[ac->avail++] = slab_get_obj(cachep, slabp,
- node);
+ ac_put_obj(cachep, ac, slab_get_obj(cachep, slabp,
+ node));
}
check_slabp(cachep, slabp);
@@ -3199,18 +3255,22 @@ alloc_done:
if (unlikely(!ac->avail)) {
int x;
+force_grow:
x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL);
/* cache_grow can reenable interrupts, then ac could change. */
ac = cpu_cache_get(cachep);
- if (!x && ac->avail == 0) /* no objects in sight? abort */
+
+ /* no objects in sight? abort */
+ if (!x && (ac->avail == 0 || force_refill))
return NULL;
if (!ac->avail) /* objects refilled by interrupt? */
goto retry;
}
ac->touched = 1;
- return ac->entry[--ac->avail];
+
+ return ac_get_obj(cachep, ac, flags, force_refill);
}
static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep,
@@ -3230,9 +3290,9 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
return objp;
if (cachep->flags & SLAB_POISON) {
#ifdef CONFIG_DEBUG_PAGEALLOC
- if ((cachep->buffer_size % PAGE_SIZE) == 0 && OFF_SLAB(cachep))
+ if ((cachep->size % PAGE_SIZE) == 0 && OFF_SLAB(cachep))
kernel_map_pages(virt_to_page(objp),
- cachep->buffer_size / PAGE_SIZE, 1);
+ cachep->size / PAGE_SIZE, 1);
else
check_poison_obj(cachep, objp);
#else
@@ -3261,8 +3321,8 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
struct slab *slabp;
unsigned objnr;
- slabp = page_get_slab(virt_to_head_page(objp));
- objnr = (unsigned)(objp - slabp->s_mem) / cachep->buffer_size;
+ slabp = virt_to_head_page(objp)->slab_page;
+ objnr = (unsigned)(objp - slabp->s_mem) / cachep->size;
slab_bufctl(slabp)[objnr] = BUFCTL_ACTIVE;
}
#endif
@@ -3285,30 +3345,42 @@ static bool slab_should_failslab(struct kmem_cache *cachep, gfp_t flags)
if (cachep == &cache_cache)
return false;
- return should_failslab(obj_size(cachep), flags, cachep->flags);
+ return should_failslab(cachep->object_size, flags, cachep->flags);
}
static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
{
void *objp;
struct array_cache *ac;
+ bool force_refill = false;
check_irq_off();
ac = cpu_cache_get(cachep);
if (likely(ac->avail)) {
- STATS_INC_ALLOCHIT(cachep);
ac->touched = 1;
- objp = ac->entry[--ac->avail];
- } else {
- STATS_INC_ALLOCMISS(cachep);
- objp = cache_alloc_refill(cachep, flags);
+ objp = ac_get_obj(cachep, ac, flags, false);
+
/*
- * the 'ac' may be updated by cache_alloc_refill(),
- * and kmemleak_erase() requires its correct value.
+ * Allow for the possibility all avail objects are not allowed
+ * by the current flags
*/
- ac = cpu_cache_get(cachep);
+ if (objp) {
+ STATS_INC_ALLOCHIT(cachep);
+ goto out;
+ }
+ force_refill = true;
}
+
+ STATS_INC_ALLOCMISS(cachep);
+ objp = cache_alloc_refill(cachep, flags, force_refill);
+ /*
+ * the 'ac' may be updated by cache_alloc_refill(),
+ * and kmemleak_erase() requires its correct value.
+ */
+ ac = cpu_cache_get(cachep);
+
+out:
/*
* To avoid a false negative, if an object that is in one of the
* per-CPU caches is leaked, we need to make sure kmemleak doesn't
@@ -3336,7 +3408,7 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD))
nid_alloc = cpuset_slab_spread_node();
else if (current->mempolicy)
- nid_alloc = slab_node(current->mempolicy);
+ nid_alloc = slab_node();
if (nid_alloc != nid_here)
return ____cache_alloc_node(cachep, flags, nid_alloc);
return NULL;
@@ -3368,7 +3440,7 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
retry_cpuset:
cpuset_mems_cookie = get_mems_allowed();
- zonelist = node_zonelist(slab_node(current->mempolicy), flags);
+ zonelist = node_zonelist(slab_node(), flags);
retry:
/*
@@ -3545,14 +3617,14 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
out:
local_irq_restore(save_flags);
ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller);
- kmemleak_alloc_recursive(ptr, obj_size(cachep), 1, cachep->flags,
+ kmemleak_alloc_recursive(ptr, cachep->object_size, 1, cachep->flags,
flags);
if (likely(ptr))
- kmemcheck_slab_alloc(cachep, flags, ptr, obj_size(cachep));
+ kmemcheck_slab_alloc(cachep, flags, ptr, cachep->object_size);
if (unlikely((flags & __GFP_ZERO) && ptr))
- memset(ptr, 0, obj_size(cachep));
+ memset(ptr, 0, cachep->object_size);
return ptr;
}
@@ -3607,15 +3679,15 @@ __cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller)
objp = __do_cache_alloc(cachep, flags);
local_irq_restore(save_flags);
objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller);
- kmemleak_alloc_recursive(objp, obj_size(cachep), 1, cachep->flags,
+ kmemleak_alloc_recursive(objp, cachep->object_size, 1, cachep->flags,
flags);
prefetchw(objp);
if (likely(objp))
- kmemcheck_slab_alloc(cachep, flags, objp, obj_size(cachep));
+ kmemcheck_slab_alloc(cachep, flags, objp, cachep->object_size);
if (unlikely((flags & __GFP_ZERO) && objp))
- memset(objp, 0, obj_size(cachep));
+ memset(objp, 0, cachep->object_size);
return objp;
}
@@ -3630,9 +3702,12 @@ static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects,
struct kmem_list3 *l3;
for (i = 0; i < nr_objects; i++) {
- void *objp = objpp[i];
+ void *objp;
struct slab *slabp;
+ clear_obj_pfmemalloc(&objpp[i]);
+ objp = objpp[i];
+
slabp = virt_to_slab(objp);
l3 = cachep->nodelists[node];
list_del(&slabp->list);
@@ -3731,7 +3806,7 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp,
kmemleak_free_recursive(objp, cachep->flags);
objp = cache_free_debugcheck(cachep, objp, caller);
- kmemcheck_slab_free(cachep, objp, obj_size(cachep));
+ kmemcheck_slab_free(cachep, objp, cachep->object_size);
/*
* Skip calling cache_free_alien() when the platform is not numa.
@@ -3750,7 +3825,7 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp,
cache_flusharray(cachep, ac);
}
- ac->entry[ac->avail++] = objp;
+ ac_put_obj(cachep, ac, objp);
}
/**
@@ -3766,7 +3841,7 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
void *ret = __cache_alloc(cachep, flags, __builtin_return_address(0));
trace_kmem_cache_alloc(_RET_IP_, ret,
- obj_size(cachep), cachep->buffer_size, flags);
+ cachep->object_size, cachep->size, flags);
return ret;
}
@@ -3794,7 +3869,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
__builtin_return_address(0));
trace_kmem_cache_alloc_node(_RET_IP_, ret,
- obj_size(cachep), cachep->buffer_size,
+ cachep->object_size, cachep->size,
flags, nodeid);
return ret;
@@ -3876,7 +3951,7 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
ret = __cache_alloc(cachep, flags, caller);
trace_kmalloc((unsigned long) caller, ret,
- size, cachep->buffer_size, flags);
+ size, cachep->size, flags);
return ret;
}
@@ -3916,9 +3991,9 @@ void kmem_cache_free(struct kmem_cache *cachep, void *objp)
unsigned long flags;
local_irq_save(flags);
- debug_check_no_locks_freed(objp, obj_size(cachep));
+ debug_check_no_locks_freed(objp, cachep->object_size);
if (!(cachep->flags & SLAB_DEBUG_OBJECTS))
- debug_check_no_obj_freed(objp, obj_size(cachep));
+ debug_check_no_obj_freed(objp, cachep->object_size);
__cache_free(cachep, objp, __builtin_return_address(0));
local_irq_restore(flags);
@@ -3947,8 +4022,9 @@ void kfree(const void *objp)
local_irq_save(flags);
kfree_debugcheck(objp);
c = virt_to_cache(objp);
- debug_check_no_locks_freed(objp, obj_size(c));
- debug_check_no_obj_freed(objp, obj_size(c));
+ debug_check_no_locks_freed(objp, c->object_size);
+
+ debug_check_no_obj_freed(objp, c->object_size);
__cache_free(c, (void *)objp, __builtin_return_address(0));
local_irq_restore(flags);
}
@@ -3956,7 +4032,7 @@ EXPORT_SYMBOL(kfree);
unsigned int kmem_cache_size(struct kmem_cache *cachep)
{
- return obj_size(cachep);
+ return cachep->object_size;
}
EXPORT_SYMBOL(kmem_cache_size);
@@ -4030,7 +4106,7 @@ static int alloc_kmemlist(struct kmem_cache *cachep, gfp_t gfp)
return 0;
fail:
- if (!cachep->next.next) {
+ if (!cachep->list.next) {
/* Cache is not active yet. Roll back what we did */
node--;
while (node >= 0) {
@@ -4065,7 +4141,7 @@ static void do_ccupdate_local(void *info)
new->new[smp_processor_id()] = old;
}
-/* Always called with the cache_chain_mutex held */
+/* Always called with the slab_mutex held */
static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
int batchcount, int shared, gfp_t gfp)
{
@@ -4109,7 +4185,7 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
return alloc_kmemlist(cachep, gfp);
}
-/* Called with cache_chain_mutex held always */
+/* Called with slab_mutex held always */
static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
{
int err;
@@ -4124,13 +4200,13 @@ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
* The numbers are guessed, we should auto-tune as described by
* Bonwick.
*/
- if (cachep->buffer_size > 131072)
+ if (cachep->size > 131072)
limit = 1;
- else if (cachep->buffer_size > PAGE_SIZE)
+ else if (cachep->size > PAGE_SIZE)
limit = 8;
- else if (cachep->buffer_size > 1024)
+ else if (cachep->size > 1024)
limit = 24;
- else if (cachep->buffer_size > 256)
+ else if (cachep->size > 256)
limit = 54;
else
limit = 120;
@@ -4145,7 +4221,7 @@ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
* to a larger limit. Thus disabled by default.
*/
shared = 0;
- if (cachep->buffer_size <= PAGE_SIZE && num_possible_cpus() > 1)
+ if (cachep->size <= PAGE_SIZE && num_possible_cpus() > 1)
shared = 8;
#if DEBUG
@@ -4211,11 +4287,11 @@ static void cache_reap(struct work_struct *w)
int node = numa_mem_id();
struct delayed_work *work = to_delayed_work(w);
- if (!mutex_trylock(&cache_chain_mutex))
+ if (!mutex_trylock(&slab_mutex))
/* Give up. Setup the next iteration. */
goto out;
- list_for_each_entry(searchp, &cache_chain, next) {
+ list_for_each_entry(searchp, &slab_caches, list) {
check_irq_on();
/*
@@ -4253,7 +4329,7 @@ next:
cond_resched();
}
check_irq_on();
- mutex_unlock(&cache_chain_mutex);
+ mutex_unlock(&slab_mutex);
next_reap_node();
out:
/* Set up the next iteration */
@@ -4289,26 +4365,26 @@ static void *s_start(struct seq_file *m, loff_t *pos)
{
loff_t n = *pos;
- mutex_lock(&cache_chain_mutex);
+ mutex_lock(&slab_mutex);
if (!n)
print_slabinfo_header(m);
- return seq_list_start(&cache_chain, *pos);
+ return seq_list_start(&slab_caches, *pos);
}
static void *s_next(struct seq_file *m, void *p, loff_t *pos)
{
- return seq_list_next(p, &cache_chain, pos);
+ return seq_list_next(p, &slab_caches, pos);
}
static void s_stop(struct seq_file *m, void *p)
{
- mutex_unlock(&cache_chain_mutex);
+ mutex_unlock(&slab_mutex);
}
static int s_show(struct seq_file *m, void *p)
{
- struct kmem_cache *cachep = list_entry(p, struct kmem_cache, next);
+ struct kmem_cache *cachep = list_entry(p, struct kmem_cache, list);
struct slab *slabp;
unsigned long active_objs;
unsigned long num_objs;
@@ -4364,7 +4440,7 @@ static int s_show(struct seq_file *m, void *p)
printk(KERN_ERR "slab: cache %s error: %s\n", name, error);
seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d",
- name, active_objs, num_objs, cachep->buffer_size,
+ name, active_objs, num_objs, cachep->size,
cachep->num, (1 << cachep->gfporder));
seq_printf(m, " : tunables %4u %4u %4u",
cachep->limit, cachep->batchcount, cachep->shared);
@@ -4454,9 +4530,9 @@ static ssize_t slabinfo_write(struct file *file, const char __user *buffer,
return -EINVAL;
/* Find the cache in the chain of caches. */
- mutex_lock(&cache_chain_mutex);
+ mutex_lock(&slab_mutex);
res = -EINVAL;
- list_for_each_entry(cachep, &cache_chain, next) {
+ list_for_each_entry(cachep, &slab_caches, list) {
if (!strcmp(cachep->name, kbuf)) {
if (limit < 1 || batchcount < 1 ||
batchcount > limit || shared < 0) {
@@ -4469,7 +4545,7 @@ static ssize_t slabinfo_write(struct file *file, const char __user *buffer,
break;
}
}
- mutex_unlock(&cache_chain_mutex);
+ mutex_unlock(&slab_mutex);
if (res >= 0)
res = count;
return res;
@@ -4492,8 +4568,8 @@ static const struct file_operations proc_slabinfo_operations = {
static void *leaks_start(struct seq_file *m, loff_t *pos)
{
- mutex_lock(&cache_chain_mutex);
- return seq_list_start(&cache_chain, *pos);
+ mutex_lock(&slab_mutex);
+ return seq_list_start(&slab_caches, *pos);
}
static inline int add_caller(unsigned long *n, unsigned long v)
@@ -4532,7 +4608,7 @@ static void handle_slab(unsigned long *n, struct kmem_cache *c, struct slab *s)
int i;
if (n[0] == n[1])
return;
- for (i = 0, p = s->s_mem; i < c->num; i++, p += c->buffer_size) {
+ for (i = 0, p = s->s_mem; i < c->num; i++, p += c->size) {
if (slab_bufctl(s)[i] != BUFCTL_ACTIVE)
continue;
if (!add_caller(n, (unsigned long)*dbg_userword(c, p)))
@@ -4558,7 +4634,7 @@ static void show_symbol(struct seq_file *m, unsigned long address)
static int leaks_show(struct seq_file *m, void *p)
{
- struct kmem_cache *cachep = list_entry(p, struct kmem_cache, next);
+ struct kmem_cache *cachep = list_entry(p, struct kmem_cache, list);
struct slab *slabp;
struct kmem_list3 *l3;
const char *name;
@@ -4592,17 +4668,17 @@ static int leaks_show(struct seq_file *m, void *p)
name = cachep->name;
if (n[0] == n[1]) {
/* Increase the buffer size */
- mutex_unlock(&cache_chain_mutex);
+ mutex_unlock(&slab_mutex);
m->private = kzalloc(n[0] * 4 * sizeof(unsigned long), GFP_KERNEL);
if (!m->private) {
/* Too bad, we are really out */
m->private = n;
- mutex_lock(&cache_chain_mutex);
+ mutex_lock(&slab_mutex);
return -ENOMEM;
}
*(unsigned long *)m->private = n[0] * 2;
kfree(n);
- mutex_lock(&cache_chain_mutex);
+ mutex_lock(&slab_mutex);
/* Now make sure this entry will be retried */
m->count = m->size;
return 0;
@@ -4677,6 +4753,6 @@ size_t ksize(const void *objp)
if (unlikely(objp == ZERO_SIZE_PTR))
return 0;
- return obj_size(virt_to_cache(objp));
+ return virt_to_cache(objp)->object_size;
}
EXPORT_SYMBOL(ksize);
diff --git a/mm/slab.h b/mm/slab.h
new file mode 100644
index 000000000000..db7848caaa25
--- /dev/null
+++ b/mm/slab.h
@@ -0,0 +1,33 @@
+#ifndef MM_SLAB_H
+#define MM_SLAB_H
+/*
+ * Internal slab definitions
+ */
+
+/*
+ * State of the slab allocator.
+ *
+ * This is used to describe the states of the allocator during bootup.
+ * Allocators use this to gradually bootstrap themselves. Most allocators
+ * have the problem that the structures used for managing slab caches are
+ * allocated from slab caches themselves.
+ */
+enum slab_state {
+ DOWN, /* No slab functionality yet */
+ PARTIAL, /* SLUB: kmem_cache_node available */
+ PARTIAL_ARRAYCACHE, /* SLAB: kmalloc size for arraycache available */
+ PARTIAL_L3, /* SLAB: kmalloc size for l3 struct available */
+ UP, /* Slab caches usable but not all extras yet */
+ FULL /* Everything is working */
+};
+
+extern enum slab_state slab_state;
+
+/* The slab cache mutex protects the management structures during changes */
+extern struct mutex slab_mutex;
+extern struct list_head slab_caches;
+
+struct kmem_cache *__kmem_cache_create(const char *name, size_t size,
+ size_t align, unsigned long flags, void (*ctor)(void *));
+
+#endif
diff --git a/mm/slab_common.c b/mm/slab_common.c
new file mode 100644
index 000000000000..aa3ca5bb01b5
--- /dev/null
+++ b/mm/slab_common.c
@@ -0,0 +1,120 @@
+/*
+ * Slab allocator functions that are independent of the allocator strategy
+ *
+ * (C) 2012 Christoph Lameter <cl@linux.com>
+ */
+#include <linux/slab.h>
+
+#include <linux/mm.h>
+#include <linux/poison.h>
+#include <linux/interrupt.h>
+#include <linux/memory.h>
+#include <linux/compiler.h>
+#include <linux/module.h>
+#include <linux/cpu.h>
+#include <linux/uaccess.h>
+#include <asm/cacheflush.h>
+#include <asm/tlbflush.h>
+#include <asm/page.h>
+
+#include "slab.h"
+
+enum slab_state slab_state;
+LIST_HEAD(slab_caches);
+DEFINE_MUTEX(slab_mutex);
+
+/*
+ * kmem_cache_create - Create a cache.
+ * @name: A string which is used in /proc/slabinfo to identify this cache.
+ * @size: The size of objects to be created in this cache.
+ * @align: The required alignment for the objects.
+ * @flags: SLAB flags
+ * @ctor: A constructor for the objects.
+ *
+ * Returns a ptr to the cache on success, NULL on failure.
+ * Cannot be called within a interrupt, but can be interrupted.
+ * The @ctor is run when new pages are allocated by the cache.
+ *
+ * The flags are
+ *
+ * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
+ * to catch references to uninitialised memory.
+ *
+ * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check
+ * for buffer overruns.
+ *
+ * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
+ * cacheline. This can be beneficial if you're counting cycles as closely
+ * as davem.
+ */
+
+struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align,
+ unsigned long flags, void (*ctor)(void *))
+{
+ struct kmem_cache *s = NULL;
+
+#ifdef CONFIG_DEBUG_VM
+ if (!name || in_interrupt() || size < sizeof(void *) ||
+ size > KMALLOC_MAX_SIZE) {
+ printk(KERN_ERR "kmem_cache_create(%s) integrity check"
+ " failed\n", name);
+ goto out;
+ }
+#endif
+
+ get_online_cpus();
+ mutex_lock(&slab_mutex);
+
+#ifdef CONFIG_DEBUG_VM
+ list_for_each_entry(s, &slab_caches, list) {
+ char tmp;
+ int res;
+
+ /*
+ * This happens when the module gets unloaded and doesn't
+ * destroy its slab cache and no-one else reuses the vmalloc
+ * area of the module. Print a warning.
+ */
+ res = probe_kernel_address(s->name, tmp);
+ if (res) {
+ printk(KERN_ERR
+ "Slab cache with size %d has lost its name\n",
+ s->object_size);
+ continue;
+ }
+
+ if (!strcmp(s->name, name)) {
+ printk(KERN_ERR "kmem_cache_create(%s): Cache name"
+ " already exists.\n",
+ name);
+ dump_stack();
+ s = NULL;
+ goto oops;
+ }
+ }
+
+ WARN_ON(strchr(name, ' ')); /* It confuses parsers */
+#endif
+
+ s = __kmem_cache_create(name, size, align, flags, ctor);
+
+#ifdef CONFIG_DEBUG_VM
+oops:
+#endif
+ mutex_unlock(&slab_mutex);
+ put_online_cpus();
+
+#ifdef CONFIG_DEBUG_VM
+out:
+#endif
+ if (!s && (flags & SLAB_PANIC))
+ panic("kmem_cache_create: Failed to create slab '%s'\n", name);
+
+ return s;
+}
+EXPORT_SYMBOL(kmem_cache_create);
+
+int slab_is_available(void)
+{
+ return slab_state >= UP;
+}
diff --git a/mm/slob.c b/mm/slob.c
index 8105be42cad1..45d4ca79933a 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -59,6 +59,8 @@
#include <linux/kernel.h>
#include <linux/slab.h>
+#include "slab.h"
+
#include <linux/mm.h>
#include <linux/swap.h> /* struct reclaim_state */
#include <linux/cache.h>
@@ -92,36 +94,6 @@ struct slob_block {
typedef struct slob_block slob_t;
/*
- * We use struct page fields to manage some slob allocation aspects,
- * however to avoid the horrible mess in include/linux/mm_types.h, we'll
- * just define our own struct page type variant here.
- */
-struct slob_page {
- union {
- struct {
- unsigned long flags; /* mandatory */
- atomic_t _count; /* mandatory */
- slobidx_t units; /* free units left in page */
- unsigned long pad[2];
- slob_t *free; /* first free slob_t in page */
- struct list_head list; /* linked list of free pages */
- };
- struct page page;
- };
-};
-static inline void struct_slob_page_wrong_size(void)
-{ BUILD_BUG_ON(sizeof(struct slob_page) != sizeof(struct page)); }
-
-/*
- * free_slob_page: call before a slob_page is returned to the page allocator.
- */
-static inline void free_slob_page(struct slob_page *sp)
-{
- reset_page_mapcount(&sp->page);
- sp->page.mapping = NULL;
-}
-
-/*
* All partially free slob pages go on these lists.
*/
#define SLOB_BREAK1 256
@@ -131,46 +103,23 @@ static LIST_HEAD(free_slob_medium);
static LIST_HEAD(free_slob_large);
/*
- * is_slob_page: True for all slob pages (false for bigblock pages)
- */
-static inline int is_slob_page(struct slob_page *sp)
-{
- return PageSlab((struct page *)sp);
-}
-
-static inline void set_slob_page(struct slob_page *sp)
-{
- __SetPageSlab((struct page *)sp);
-}
-
-static inline void clear_slob_page(struct slob_page *sp)
-{
- __ClearPageSlab((struct page *)sp);
-}
-
-static inline struct slob_page *slob_page(const void *addr)
-{
- return (struct slob_page *)virt_to_page(addr);
-}
-
-/*
* slob_page_free: true for pages on free_slob_pages list.
*/
-static inline int slob_page_free(struct slob_page *sp)
+static inline int slob_page_free(struct page *sp)
{
- return PageSlobFree((struct page *)sp);
+ return PageSlobFree(sp);
}
-static void set_slob_page_free(struct slob_page *sp, struct list_head *list)
+static void set_slob_page_free(struct page *sp, struct list_head *list)
{
list_add(&sp->list, list);
- __SetPageSlobFree((struct page *)sp);
+ __SetPageSlobFree(sp);
}
-static inline void clear_slob_page_free(struct slob_page *sp)
+static inline void clear_slob_page_free(struct page *sp)
{
list_del(&sp->list);
- __ClearPageSlobFree((struct page *)sp);
+ __ClearPageSlobFree(sp);
}
#define SLOB_UNIT sizeof(slob_t)
@@ -267,12 +216,12 @@ static void slob_free_pages(void *b, int order)
/*
* Allocate a slob block within a given slob_page sp.
*/
-static void *slob_page_alloc(struct slob_page *sp, size_t size, int align)
+static void *slob_page_alloc(struct page *sp, size_t size, int align)
{
slob_t *prev, *cur, *aligned = NULL;
int delta = 0, units = SLOB_UNITS(size);
- for (prev = NULL, cur = sp->free; ; prev = cur, cur = slob_next(cur)) {
+ for (prev = NULL, cur = sp->freelist; ; prev = cur, cur = slob_next(cur)) {
slobidx_t avail = slob_units(cur);
if (align) {
@@ -296,12 +245,12 @@ static void *slob_page_alloc(struct slob_page *sp, size_t size, int align)
if (prev)
set_slob(prev, slob_units(prev), next);
else
- sp->free = next;
+ sp->freelist = next;
} else { /* fragment */
if (prev)
set_slob(prev, slob_units(prev), cur + units);
else
- sp->free = cur + units;
+ sp->freelist = cur + units;
set_slob(cur + units, avail - units, next);
}
@@ -320,7 +269,7 @@ static void *slob_page_alloc(struct slob_page *sp, size_t size, int align)
*/
static void *slob_alloc(size_t size, gfp_t gfp, int align, int node)
{
- struct slob_page *sp;
+ struct page *sp;
struct list_head *prev;
struct list_head *slob_list;
slob_t *b = NULL;
@@ -341,7 +290,7 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node)
* If there's a node specification, search for a partial
* page with a matching node id in the freelist.
*/
- if (node != -1 && page_to_nid(&sp->page) != node)
+ if (node != -1 && page_to_nid(sp) != node)
continue;
#endif
/* Enough room on this page? */
@@ -369,12 +318,12 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node)
b = slob_new_pages(gfp & ~__GFP_ZERO, 0, node);
if (!b)
return NULL;
- sp = slob_page(b);
- set_slob_page(sp);
+ sp = virt_to_page(b);
+ __SetPageSlab(sp);
spin_lock_irqsave(&slob_lock, flags);
sp->units = SLOB_UNITS(PAGE_SIZE);
- sp->free = b;
+ sp->freelist = b;
INIT_LIST_HEAD(&sp->list);
set_slob(b, SLOB_UNITS(PAGE_SIZE), b + SLOB_UNITS(PAGE_SIZE));
set_slob_page_free(sp, slob_list);
@@ -392,7 +341,7 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node)
*/
static void slob_free(void *block, int size)
{
- struct slob_page *sp;
+ struct page *sp;
slob_t *prev, *next, *b = (slob_t *)block;
slobidx_t units;
unsigned long flags;
@@ -402,7 +351,7 @@ static void slob_free(void *block, int size)
return;
BUG_ON(!size);
- sp = slob_page(block);
+ sp = virt_to_page(block);
units = SLOB_UNITS(size);
spin_lock_irqsave(&slob_lock, flags);
@@ -412,8 +361,8 @@ static void slob_free(void *block, int size)
if (slob_page_free(sp))
clear_slob_page_free(sp);
spin_unlock_irqrestore(&slob_lock, flags);
- clear_slob_page(sp);
- free_slob_page(sp);
+ __ClearPageSlab(sp);
+ reset_page_mapcount(sp);
slob_free_pages(b, 0);
return;
}
@@ -421,7 +370,7 @@ static void slob_free(void *block, int size)
if (!slob_page_free(sp)) {
/* This slob page is about to become partially free. Easy! */
sp->units = units;
- sp->free = b;
+ sp->freelist = b;
set_slob(b, units,
(void *)((unsigned long)(b +
SLOB_UNITS(PAGE_SIZE)) & PAGE_MASK));
@@ -441,15 +390,15 @@ static void slob_free(void *block, int size)
*/
sp->units += units;
- if (b < sp->free) {
- if (b + units == sp->free) {
- units += slob_units(sp->free);
- sp->free = slob_next(sp->free);
+ if (b < (slob_t *)sp->freelist) {
+ if (b + units == sp->freelist) {
+ units += slob_units(sp->freelist);
+ sp->freelist = slob_next(sp->freelist);
}
- set_slob(b, units, sp->free);
- sp->free = b;
+ set_slob(b, units, sp->freelist);
+ sp->freelist = b;
} else {
- prev = sp->free;
+ prev = sp->freelist;
next = slob_next(prev);
while (b > next) {
prev = next;
@@ -522,7 +471,7 @@ EXPORT_SYMBOL(__kmalloc_node);
void kfree(const void *block)
{
- struct slob_page *sp;
+ struct page *sp;
trace_kfree(_RET_IP_, block);
@@ -530,43 +479,36 @@ void kfree(const void *block)
return;
kmemleak_free(block);
- sp = slob_page(block);
- if (is_slob_page(sp)) {
+ sp = virt_to_page(block);
+ if (PageSlab(sp)) {
int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
unsigned int *m = (unsigned int *)(block - align);
slob_free(m, *m + align);
} else
- put_page(&sp->page);
+ put_page(sp);
}
EXPORT_SYMBOL(kfree);
/* can't use ksize for kmem_cache_alloc memory, only kmalloc */
size_t ksize(const void *block)
{
- struct slob_page *sp;
+ struct page *sp;
BUG_ON(!block);
if (unlikely(block == ZERO_SIZE_PTR))
return 0;
- sp = slob_page(block);
- if (is_slob_page(sp)) {
+ sp = virt_to_page(block);
+ if (PageSlab(sp)) {
int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
unsigned int *m = (unsigned int *)(block - align);
return SLOB_UNITS(*m) * SLOB_UNIT;
} else
- return sp->page.private;
+ return sp->private;
}
EXPORT_SYMBOL(ksize);
-struct kmem_cache {
- unsigned int size, align;
- unsigned long flags;
- const char *name;
- void (*ctor)(void *);
-};
-
-struct kmem_cache *kmem_cache_create(const char *name, size_t size,
+struct kmem_cache *__kmem_cache_create(const char *name, size_t size,
size_t align, unsigned long flags, void (*ctor)(void *))
{
struct kmem_cache *c;
@@ -589,13 +531,12 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size,
c->align = ARCH_SLAB_MINALIGN;
if (c->align < align)
c->align = align;
- } else if (flags & SLAB_PANIC)
- panic("Cannot create slab cache %s\n", name);
- kmemleak_alloc(c, sizeof(struct kmem_cache), 1, GFP_KERNEL);
+ kmemleak_alloc(c, sizeof(struct kmem_cache), 1, GFP_KERNEL);
+ c->refcount = 1;
+ }
return c;
}
-EXPORT_SYMBOL(kmem_cache_create);
void kmem_cache_destroy(struct kmem_cache *c)
{
@@ -678,19 +619,12 @@ int kmem_cache_shrink(struct kmem_cache *d)
}
EXPORT_SYMBOL(kmem_cache_shrink);
-static unsigned int slob_ready __read_mostly;
-
-int slab_is_available(void)
-{
- return slob_ready;
-}
-
void __init kmem_cache_init(void)
{
- slob_ready = 1;
+ slab_state = UP;
}
void __init kmem_cache_init_late(void)
{
- /* Nothing to do */
+ slab_state = FULL;
}
diff --git a/mm/slub.c b/mm/slub.c
index 8c691fa1cf3c..8f78e2577031 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -16,6 +16,7 @@
#include <linux/interrupt.h>
#include <linux/bitops.h>
#include <linux/slab.h>
+#include "slab.h"
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/kmemcheck.h>
@@ -33,15 +34,17 @@
#include <trace/events/kmem.h>
+#include "internal.h"
+
/*
* Lock order:
- * 1. slub_lock (Global Semaphore)
+ * 1. slab_mutex (Global Mutex)
* 2. node->list_lock
* 3. slab_lock(page) (Only on some arches and for debugging)
*
- * slub_lock
+ * slab_mutex
*
- * The role of the slub_lock is to protect the list of all the slabs
+ * The role of the slab_mutex is to protect the list of all the slabs
* and to synchronize major metadata changes to slab cache structures.
*
* The slab_lock is only used for debugging and on arches that do not
@@ -182,17 +185,6 @@ static int kmem_size = sizeof(struct kmem_cache);
static struct notifier_block slab_notifier;
#endif
-static enum {
- DOWN, /* No slab functionality available */
- PARTIAL, /* Kmem_cache_node works */
- UP, /* Everything works but does not show up in sysfs */
- SYSFS /* Sysfs up */
-} slab_state = DOWN;
-
-/* A list of all slab caches on the system */
-static DECLARE_RWSEM(slub_lock);
-static LIST_HEAD(slab_caches);
-
/*
* Tracking user of a slab.
*/
@@ -237,11 +229,6 @@ static inline void stat(const struct kmem_cache *s, enum stat_item si)
* Core slab cache functions
*******************************************************************/
-int slab_is_available(void)
-{
- return slab_state >= UP;
-}
-
static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node)
{
return s->node[node];
@@ -311,7 +298,7 @@ static inline size_t slab_ksize(const struct kmem_cache *s)
* and whatever may come after it.
*/
if (s->flags & (SLAB_RED_ZONE | SLAB_POISON))
- return s->objsize;
+ return s->object_size;
#endif
/*
@@ -609,11 +596,11 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
if (p > addr + 16)
print_section("Bytes b4 ", p - 16, 16);
- print_section("Object ", p, min_t(unsigned long, s->objsize,
+ print_section("Object ", p, min_t(unsigned long, s->object_size,
PAGE_SIZE));
if (s->flags & SLAB_RED_ZONE)
- print_section("Redzone ", p + s->objsize,
- s->inuse - s->objsize);
+ print_section("Redzone ", p + s->object_size,
+ s->inuse - s->object_size);
if (s->offset)
off = s->offset + sizeof(void *);
@@ -655,12 +642,12 @@ static void init_object(struct kmem_cache *s, void *object, u8 val)
u8 *p = object;
if (s->flags & __OBJECT_POISON) {
- memset(p, POISON_FREE, s->objsize - 1);
- p[s->objsize - 1] = POISON_END;
+ memset(p, POISON_FREE, s->object_size - 1);
+ p[s->object_size - 1] = POISON_END;
}
if (s->flags & SLAB_RED_ZONE)
- memset(p + s->objsize, val, s->inuse - s->objsize);
+ memset(p + s->object_size, val, s->inuse - s->object_size);
}
static void restore_bytes(struct kmem_cache *s, char *message, u8 data,
@@ -705,10 +692,10 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page,
* Poisoning uses 0x6b (POISON_FREE) and the last byte is
* 0xa5 (POISON_END)
*
- * object + s->objsize
+ * object + s->object_size
* Padding to reach word boundary. This is also used for Redzoning.
* Padding is extended by another word if Redzoning is enabled and
- * objsize == inuse.
+ * object_size == inuse.
*
* We fill with 0xbb (RED_INACTIVE) for inactive objects and with
* 0xcc (RED_ACTIVE) for objects in use.
@@ -727,7 +714,7 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page,
* object + s->size
* Nothing is used beyond s->size.
*
- * If slabcaches are merged then the objsize and inuse boundaries are mostly
+ * If slabcaches are merged then the object_size and inuse boundaries are mostly
* ignored. And therefore no slab options that rely on these boundaries
* may be used with merged slabcaches.
*/
@@ -787,25 +774,25 @@ static int check_object(struct kmem_cache *s, struct page *page,
void *object, u8 val)
{
u8 *p = object;
- u8 *endobject = object + s->objsize;
+ u8 *endobject = object + s->object_size;
if (s->flags & SLAB_RED_ZONE) {
if (!check_bytes_and_report(s, page, object, "Redzone",
- endobject, val, s->inuse - s->objsize))
+ endobject, val, s->inuse - s->object_size))
return 0;
} else {
- if ((s->flags & SLAB_POISON) && s->objsize < s->inuse) {
+ if ((s->flags & SLAB_POISON) && s->object_size < s->inuse) {
check_bytes_and_report(s, page, p, "Alignment padding",
- endobject, POISON_INUSE, s->inuse - s->objsize);
+ endobject, POISON_INUSE, s->inuse - s->object_size);
}
}
if (s->flags & SLAB_POISON) {
if (val != SLUB_RED_ACTIVE && (s->flags & __OBJECT_POISON) &&
(!check_bytes_and_report(s, page, p, "Poison", p,
- POISON_FREE, s->objsize - 1) ||
+ POISON_FREE, s->object_size - 1) ||
!check_bytes_and_report(s, page, p, "Poison",
- p + s->objsize - 1, POISON_END, 1)))
+ p + s->object_size - 1, POISON_END, 1)))
return 0;
/*
* check_pad_bytes cleans up on its own.
@@ -926,7 +913,7 @@ static void trace(struct kmem_cache *s, struct page *page, void *object,
page->freelist);
if (!alloc)
- print_section("Object ", (void *)object, s->objsize);
+ print_section("Object ", (void *)object, s->object_size);
dump_stack();
}
@@ -942,14 +929,14 @@ static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags)
lockdep_trace_alloc(flags);
might_sleep_if(flags & __GFP_WAIT);
- return should_failslab(s->objsize, flags, s->flags);
+ return should_failslab(s->object_size, flags, s->flags);
}
static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, void *object)
{
flags &= gfp_allowed_mask;
kmemcheck_slab_alloc(s, flags, object, slab_ksize(s));
- kmemleak_alloc_recursive(object, s->objsize, 1, s->flags, flags);
+ kmemleak_alloc_recursive(object, s->object_size, 1, s->flags, flags);
}
static inline void slab_free_hook(struct kmem_cache *s, void *x)
@@ -966,13 +953,13 @@ static inline void slab_free_hook(struct kmem_cache *s, void *x)
unsigned long flags;
local_irq_save(flags);
- kmemcheck_slab_free(s, x, s->objsize);
- debug_check_no_locks_freed(x, s->objsize);
+ kmemcheck_slab_free(s, x, s->object_size);
+ debug_check_no_locks_freed(x, s->object_size);
local_irq_restore(flags);
}
#endif
if (!(s->flags & SLAB_DEBUG_OBJECTS))
- debug_check_no_obj_freed(x, s->objsize);
+ debug_check_no_obj_freed(x, s->object_size);
}
/*
@@ -1207,7 +1194,7 @@ out:
__setup("slub_debug", setup_slub_debug);
-static unsigned long kmem_cache_flags(unsigned long objsize,
+static unsigned long kmem_cache_flags(unsigned long object_size,
unsigned long flags, const char *name,
void (*ctor)(void *))
{
@@ -1237,7 +1224,7 @@ static inline int check_object(struct kmem_cache *s, struct page *page,
static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n,
struct page *page) {}
static inline void remove_full(struct kmem_cache *s, struct page *page) {}
-static inline unsigned long kmem_cache_flags(unsigned long objsize,
+static inline unsigned long kmem_cache_flags(unsigned long object_size,
unsigned long flags, const char *name,
void (*ctor)(void *))
{
@@ -1314,13 +1301,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
stat(s, ORDER_FALLBACK);
}
- if (flags & __GFP_WAIT)
- local_irq_disable();
-
- if (!page)
- return NULL;
-
- if (kmemcheck_enabled
+ if (kmemcheck_enabled && page
&& !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) {
int pages = 1 << oo_order(oo);
@@ -1336,6 +1317,11 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
kmemcheck_mark_unallocated_pages(page, pages);
}
+ if (flags & __GFP_WAIT)
+ local_irq_disable();
+ if (!page)
+ return NULL;
+
page->objects = oo_objects(oo);
mod_zone_page_state(page_zone(page),
(s->flags & SLAB_RECLAIM_ACCOUNT) ?
@@ -1370,6 +1356,8 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
inc_slabs_node(s, page_to_nid(page), page->objects);
page->slab = s;
__SetPageSlab(page);
+ if (page->pfmemalloc)
+ SetPageSlabPfmemalloc(page);
start = page_address(page);
@@ -1413,6 +1401,7 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
-pages);
+ __ClearPageSlabPfmemalloc(page);
__ClearPageSlab(page);
reset_page_mapcount(page);
if (current->reclaim_state)
@@ -1490,12 +1479,12 @@ static inline void remove_partial(struct kmem_cache_node *n,
}
/*
- * Lock slab, remove from the partial list and put the object into the
- * per cpu freelist.
+ * Remove slab from the partial list, freeze it and
+ * return the pointer to the freelist.
*
* Returns a list of objects or NULL if it fails.
*
- * Must hold list_lock.
+ * Must hold list_lock since we modify the partial list.
*/
static inline void *acquire_slab(struct kmem_cache *s,
struct kmem_cache_node *n, struct page *page,
@@ -1510,26 +1499,27 @@ static inline void *acquire_slab(struct kmem_cache *s,
* The old freelist is the list of objects for the
* per cpu allocation list.
*/
- do {
- freelist = page->freelist;
- counters = page->counters;
- new.counters = counters;
- if (mode) {
- new.inuse = page->objects;
- new.freelist = NULL;
- } else {
- new.freelist = freelist;
- }
+ freelist = page->freelist;
+ counters = page->counters;
+ new.counters = counters;
+ if (mode) {
+ new.inuse = page->objects;
+ new.freelist = NULL;
+ } else {
+ new.freelist = freelist;
+ }
- VM_BUG_ON(new.frozen);
- new.frozen = 1;
+ VM_BUG_ON(new.frozen);
+ new.frozen = 1;
- } while (!__cmpxchg_double_slab(s, page,
+ if (!__cmpxchg_double_slab(s, page,
freelist, counters,
new.freelist, new.counters,
- "lock and freeze"));
+ "acquire_slab"))
+ return NULL;
remove_partial(n, page);
+ WARN_ON(!freelist);
return freelist;
}
@@ -1563,7 +1553,6 @@ static void *get_partial_node(struct kmem_cache *s,
if (!object) {
c->page = page;
- c->node = page_to_nid(page);
stat(s, ALLOC_FROM_PARTIAL);
object = t;
available = page->objects - page->inuse;
@@ -1617,7 +1606,7 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags,
do {
cpuset_mems_cookie = get_mems_allowed();
- zonelist = node_zonelist(slab_node(current->mempolicy), flags);
+ zonelist = node_zonelist(slab_node(), flags);
for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
struct kmem_cache_node *n;
@@ -1731,14 +1720,12 @@ void init_kmem_cache_cpus(struct kmem_cache *s)
/*
* Remove the cpu slab
*/
-static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
+static void deactivate_slab(struct kmem_cache *s, struct page *page, void *freelist)
{
enum slab_modes { M_NONE, M_PARTIAL, M_FULL, M_FREE };
- struct page *page = c->page;
struct kmem_cache_node *n = get_node(s, page_to_nid(page));
int lock = 0;
enum slab_modes l = M_NONE, m = M_NONE;
- void *freelist;
void *nextfree;
int tail = DEACTIVATE_TO_HEAD;
struct page new;
@@ -1749,11 +1736,6 @@ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
tail = DEACTIVATE_TO_TAIL;
}
- c->tid = next_tid(c->tid);
- c->page = NULL;
- freelist = c->freelist;
- c->freelist = NULL;
-
/*
* Stage one: Free all available per cpu objects back
* to the page freelist while it is still frozen. Leave the
@@ -1879,21 +1861,31 @@ redo:
}
}
-/* Unfreeze all the cpu partial slabs */
+/*
+ * Unfreeze all the cpu partial slabs.
+ *
+ * This function must be called with interrupt disabled.
+ */
static void unfreeze_partials(struct kmem_cache *s)
{
- struct kmem_cache_node *n = NULL;
+ struct kmem_cache_node *n = NULL, *n2 = NULL;
struct kmem_cache_cpu *c = this_cpu_ptr(s->cpu_slab);
struct page *page, *discard_page = NULL;
while ((page = c->partial)) {
- enum slab_modes { M_PARTIAL, M_FREE };
- enum slab_modes l, m;
struct page new;
struct page old;
c->partial = page->next;
- l = M_FREE;
+
+ n2 = get_node(s, page_to_nid(page));
+ if (n != n2) {
+ if (n)
+ spin_unlock(&n->list_lock);
+
+ n = n2;
+ spin_lock(&n->list_lock);
+ }
do {
@@ -1906,43 +1898,17 @@ static void unfreeze_partials(struct kmem_cache *s)
new.frozen = 0;
- if (!new.inuse && (!n || n->nr_partial > s->min_partial))
- m = M_FREE;
- else {
- struct kmem_cache_node *n2 = get_node(s,
- page_to_nid(page));
-
- m = M_PARTIAL;
- if (n != n2) {
- if (n)
- spin_unlock(&n->list_lock);
-
- n = n2;
- spin_lock(&n->list_lock);
- }
- }
-
- if (l != m) {
- if (l == M_PARTIAL) {
- remove_partial(n, page);
- stat(s, FREE_REMOVE_PARTIAL);
- } else {
- add_partial(n, page,
- DEACTIVATE_TO_TAIL);
- stat(s, FREE_ADD_PARTIAL);
- }
-
- l = m;
- }
-
- } while (!cmpxchg_double_slab(s, page,
+ } while (!__cmpxchg_double_slab(s, page,
old.freelist, old.counters,
new.freelist, new.counters,
"unfreezing slab"));
- if (m == M_FREE) {
+ if (unlikely(!new.inuse && n->nr_partial > s->min_partial)) {
page->next = discard_page;
discard_page = page;
+ } else {
+ add_partial(n, page, DEACTIVATE_TO_TAIL);
+ stat(s, FREE_ADD_PARTIAL);
}
}
@@ -2011,7 +1977,11 @@ int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
{
stat(s, CPUSLAB_FLUSH);
- deactivate_slab(s, c);
+ deactivate_slab(s, c->page, c->freelist);
+
+ c->tid = next_tid(c->tid);
+ c->page = NULL;
+ c->freelist = NULL;
}
/*
@@ -2055,10 +2025,10 @@ static void flush_all(struct kmem_cache *s)
* Check if the objects in a per cpu structure fit numa
* locality expectations.
*/
-static inline int node_match(struct kmem_cache_cpu *c, int node)
+static inline int node_match(struct page *page, int node)
{
#ifdef CONFIG_NUMA
- if (node != NUMA_NO_NODE && c->node != node)
+ if (node != NUMA_NO_NODE && page_to_nid(page) != node)
return 0;
#endif
return 1;
@@ -2101,10 +2071,10 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid)
"SLUB: Unable to allocate memory on node %d (gfp=0x%x)\n",
nid, gfpflags);
printk(KERN_WARNING " cache: %s, object size: %d, buffer size: %d, "
- "default order: %d, min order: %d\n", s->name, s->objsize,
+ "default order: %d, min order: %d\n", s->name, s->object_size,
s->size, oo_order(s->oo), oo_order(s->min));
- if (oo_order(s->min) > get_order(s->objsize))
+ if (oo_order(s->min) > get_order(s->object_size))
printk(KERN_WARNING " %s debugging increased min order, use "
"slub_debug=O to disable.\n", s->name);
@@ -2130,10 +2100,16 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid)
static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags,
int node, struct kmem_cache_cpu **pc)
{
- void *object;
- struct kmem_cache_cpu *c;
- struct page *page = new_slab(s, flags, node);
+ void *freelist;
+ struct kmem_cache_cpu *c = *pc;
+ struct page *page;
+ freelist = get_partial(s, flags, node, c);
+
+ if (freelist)
+ return freelist;
+
+ page = new_slab(s, flags, node);
if (page) {
c = __this_cpu_ptr(s->cpu_slab);
if (c->page)
@@ -2143,17 +2119,24 @@ static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags,
* No other reference to the page yet so we can
* muck around with it freely without cmpxchg
*/
- object = page->freelist;
+ freelist = page->freelist;
page->freelist = NULL;
stat(s, ALLOC_SLAB);
- c->node = page_to_nid(page);
c->page = page;
*pc = c;
} else
- object = NULL;
+ freelist = NULL;
- return object;
+ return freelist;
+}
+
+static inline bool pfmemalloc_match(struct page *page, gfp_t gfpflags)
+{
+ if (unlikely(PageSlabPfmemalloc(page)))
+ return gfp_pfmemalloc_allowed(gfpflags);
+
+ return true;
}
/*
@@ -2163,6 +2146,8 @@ static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags,
* The page is still frozen if the return value is not NULL.
*
* If this function returns NULL then the page has been unfrozen.
+ *
+ * This function must be called with interrupt disabled.
*/
static inline void *get_freelist(struct kmem_cache *s, struct page *page)
{
@@ -2173,13 +2158,14 @@ static inline void *get_freelist(struct kmem_cache *s, struct page *page)
do {
freelist = page->freelist;
counters = page->counters;
+
new.counters = counters;
VM_BUG_ON(!new.frozen);
new.inuse = page->objects;
new.frozen = freelist != NULL;
- } while (!cmpxchg_double_slab(s, page,
+ } while (!__cmpxchg_double_slab(s, page,
freelist, counters,
NULL, new.counters,
"get_freelist"));
@@ -2206,7 +2192,8 @@ static inline void *get_freelist(struct kmem_cache *s, struct page *page)
static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
unsigned long addr, struct kmem_cache_cpu *c)
{
- void **object;
+ void *freelist;
+ struct page *page;
unsigned long flags;
local_irq_save(flags);
@@ -2219,25 +2206,41 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
c = this_cpu_ptr(s->cpu_slab);
#endif
- if (!c->page)
+ page = c->page;
+ if (!page)
goto new_slab;
redo:
- if (unlikely(!node_match(c, node))) {
+
+ if (unlikely(!node_match(page, node))) {
stat(s, ALLOC_NODE_MISMATCH);
- deactivate_slab(s, c);
+ deactivate_slab(s, page, c->freelist);
+ c->page = NULL;
+ c->freelist = NULL;
+ goto new_slab;
+ }
+
+ /*
+ * By rights, we should be searching for a slab page that was
+ * PFMEMALLOC but right now, we are losing the pfmemalloc
+ * information when the page leaves the per-cpu allocator
+ */
+ if (unlikely(!pfmemalloc_match(page, gfpflags))) {
+ deactivate_slab(s, page, c->freelist);
+ c->page = NULL;
+ c->freelist = NULL;
goto new_slab;
}
/* must check again c->freelist in case of cpu migration or IRQ */
- object = c->freelist;
- if (object)
+ freelist = c->freelist;
+ if (freelist)
goto load_freelist;
stat(s, ALLOC_SLOWPATH);
- object = get_freelist(s, c->page);
+ freelist = get_freelist(s, page);
- if (!object) {
+ if (!freelist) {
c->page = NULL;
stat(s, DEACTIVATE_BYPASS);
goto new_slab;
@@ -2246,50 +2249,50 @@ redo:
stat(s, ALLOC_REFILL);
load_freelist:
- c->freelist = get_freepointer(s, object);
+ /*
+ * freelist is pointing to the list of objects to be used.
+ * page is pointing to the page from which the objects are obtained.
+ * That page must be frozen for per cpu allocations to work.
+ */
+ VM_BUG_ON(!c->page->frozen);
+ c->freelist = get_freepointer(s, freelist);
c->tid = next_tid(c->tid);
local_irq_restore(flags);
- return object;
+ return freelist;
new_slab:
if (c->partial) {
- c->page = c->partial;
- c->partial = c->page->next;
- c->node = page_to_nid(c->page);
+ page = c->page = c->partial;
+ c->partial = page->next;
stat(s, CPU_PARTIAL_ALLOC);
c->freelist = NULL;
goto redo;
}
- /* Then do expensive stuff like retrieving pages from the partial lists */
- object = get_partial(s, gfpflags, node, c);
-
- if (unlikely(!object)) {
+ freelist = new_slab_objects(s, gfpflags, node, &c);
- object = new_slab_objects(s, gfpflags, node, &c);
+ if (unlikely(!freelist)) {
+ if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit())
+ slab_out_of_memory(s, gfpflags, node);
- if (unlikely(!object)) {
- if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit())
- slab_out_of_memory(s, gfpflags, node);
-
- local_irq_restore(flags);
- return NULL;
- }
+ local_irq_restore(flags);
+ return NULL;
}
- if (likely(!kmem_cache_debug(s)))
+ page = c->page;
+ if (likely(!kmem_cache_debug(s) && pfmemalloc_match(page, gfpflags)))
goto load_freelist;
/* Only entered in the debug case */
- if (!alloc_debug_processing(s, c->page, object, addr))
+ if (kmem_cache_debug(s) && !alloc_debug_processing(s, page, freelist, addr))
goto new_slab; /* Slab failed checks. Next slab needed */
- c->freelist = get_freepointer(s, object);
- deactivate_slab(s, c);
- c->node = NUMA_NO_NODE;
+ deactivate_slab(s, page, get_freepointer(s, freelist));
+ c->page = NULL;
+ c->freelist = NULL;
local_irq_restore(flags);
- return object;
+ return freelist;
}
/*
@@ -2307,6 +2310,7 @@ static __always_inline void *slab_alloc(struct kmem_cache *s,
{
void **object;
struct kmem_cache_cpu *c;
+ struct page *page;
unsigned long tid;
if (slab_pre_alloc_hook(s, gfpflags))
@@ -2332,8 +2336,8 @@ redo:
barrier();
object = c->freelist;
- if (unlikely(!object || !node_match(c, node)))
-
+ page = c->page;
+ if (unlikely(!object || !node_match(page, node)))
object = __slab_alloc(s, gfpflags, node, addr, c);
else {
@@ -2364,7 +2368,7 @@ redo:
}
if (unlikely(gfpflags & __GFP_ZERO) && object)
- memset(object, 0, s->objsize);
+ memset(object, 0, s->object_size);
slab_post_alloc_hook(s, gfpflags, object);
@@ -2375,7 +2379,7 @@ void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
{
void *ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, _RET_IP_);
- trace_kmem_cache_alloc(_RET_IP_, ret, s->objsize, s->size, gfpflags);
+ trace_kmem_cache_alloc(_RET_IP_, ret, s->object_size, s->size, gfpflags);
return ret;
}
@@ -2405,7 +2409,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node)
void *ret = slab_alloc(s, gfpflags, node, _RET_IP_);
trace_kmem_cache_alloc_node(_RET_IP_, ret,
- s->objsize, s->size, gfpflags, node);
+ s->object_size, s->size, gfpflags, node);
return ret;
}
@@ -2900,7 +2904,7 @@ static void set_min_partial(struct kmem_cache *s, unsigned long min)
static int calculate_sizes(struct kmem_cache *s, int forced_order)
{
unsigned long flags = s->flags;
- unsigned long size = s->objsize;
+ unsigned long size = s->object_size;
unsigned long align = s->align;
int order;
@@ -2929,7 +2933,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
* end of the object and the free pointer. If not then add an
* additional word to have some bytes to store Redzone information.
*/
- if ((flags & SLAB_RED_ZONE) && size == s->objsize)
+ if ((flags & SLAB_RED_ZONE) && size == s->object_size)
size += sizeof(void *);
#endif
@@ -2977,7 +2981,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
* user specified and the dynamic determination of cache line size
* on bootup.
*/
- align = calculate_alignment(flags, align, s->objsize);
+ align = calculate_alignment(flags, align, s->object_size);
s->align = align;
/*
@@ -3025,7 +3029,7 @@ static int kmem_cache_open(struct kmem_cache *s,
memset(s, 0, kmem_size);
s->name = name;
s->ctor = ctor;
- s->objsize = size;
+ s->object_size = size;
s->align = align;
s->flags = kmem_cache_flags(size, flags, name, ctor);
s->reserved = 0;
@@ -3040,7 +3044,7 @@ static int kmem_cache_open(struct kmem_cache *s,
* Disable debugging flags that store metadata if the min slab
* order increased.
*/
- if (get_order(s->size) > get_order(s->objsize)) {
+ if (get_order(s->size) > get_order(s->object_size)) {
s->flags &= ~DEBUG_METADATA_FLAGS;
s->offset = 0;
if (!calculate_sizes(s, -1))
@@ -3114,7 +3118,7 @@ error:
*/
unsigned int kmem_cache_size(struct kmem_cache *s)
{
- return s->objsize;
+ return s->object_size;
}
EXPORT_SYMBOL(kmem_cache_size);
@@ -3192,11 +3196,11 @@ static inline int kmem_cache_close(struct kmem_cache *s)
*/
void kmem_cache_destroy(struct kmem_cache *s)
{
- down_write(&slub_lock);
+ mutex_lock(&slab_mutex);
s->refcount--;
if (!s->refcount) {
list_del(&s->list);
- up_write(&slub_lock);
+ mutex_unlock(&slab_mutex);
if (kmem_cache_close(s)) {
printk(KERN_ERR "SLUB %s: %s called for cache that "
"still has objects.\n", s->name, __func__);
@@ -3206,7 +3210,7 @@ void kmem_cache_destroy(struct kmem_cache *s)
rcu_barrier();
sysfs_slab_remove(s);
} else
- up_write(&slub_lock);
+ mutex_unlock(&slab_mutex);
}
EXPORT_SYMBOL(kmem_cache_destroy);
@@ -3268,7 +3272,7 @@ static struct kmem_cache *__init create_kmalloc_cache(const char *name,
/*
* This function is called with IRQs disabled during early-boot on
- * single CPU so there's no need to take slub_lock here.
+ * single CPU so there's no need to take slab_mutex here.
*/
if (!kmem_cache_open(s, name, size, ARCH_KMALLOC_MINALIGN,
flags, NULL))
@@ -3553,10 +3557,10 @@ static int slab_mem_going_offline_callback(void *arg)
{
struct kmem_cache *s;
- down_read(&slub_lock);
+ mutex_lock(&slab_mutex);
list_for_each_entry(s, &slab_caches, list)
kmem_cache_shrink(s);
- up_read(&slub_lock);
+ mutex_unlock(&slab_mutex);
return 0;
}
@@ -3577,7 +3581,7 @@ static void slab_mem_offline_callback(void *arg)
if (offline_node < 0)
return;
- down_read(&slub_lock);
+ mutex_lock(&slab_mutex);
list_for_each_entry(s, &slab_caches, list) {
n = get_node(s, offline_node);
if (n) {
@@ -3593,7 +3597,7 @@ static void slab_mem_offline_callback(void *arg)
kmem_cache_free(kmem_cache_node, n);
}
}
- up_read(&slub_lock);
+ mutex_unlock(&slab_mutex);
}
static int slab_mem_going_online_callback(void *arg)
@@ -3616,7 +3620,7 @@ static int slab_mem_going_online_callback(void *arg)
* allocate a kmem_cache_node structure in order to bring the node
* online.
*/
- down_read(&slub_lock);
+ mutex_lock(&slab_mutex);
list_for_each_entry(s, &slab_caches, list) {
/*
* XXX: kmem_cache_alloc_node will fallback to other nodes
@@ -3632,7 +3636,7 @@ static int slab_mem_going_online_callback(void *arg)
s->node[nid] = n;
}
out:
- up_read(&slub_lock);
+ mutex_unlock(&slab_mutex);
return ret;
}
@@ -3843,11 +3847,11 @@ void __init kmem_cache_init(void)
if (s && s->size) {
char *name = kasprintf(GFP_NOWAIT,
- "dma-kmalloc-%d", s->objsize);
+ "dma-kmalloc-%d", s->object_size);
BUG_ON(!name);
kmalloc_dma_caches[i] = create_kmalloc_cache(name,
- s->objsize, SLAB_CACHE_DMA);
+ s->object_size, SLAB_CACHE_DMA);
}
}
#endif
@@ -3924,16 +3928,12 @@ static struct kmem_cache *find_mergeable(size_t size,
return NULL;
}
-struct kmem_cache *kmem_cache_create(const char *name, size_t size,
+struct kmem_cache *__kmem_cache_create(const char *name, size_t size,
size_t align, unsigned long flags, void (*ctor)(void *))
{
struct kmem_cache *s;
char *n;
- if (WARN_ON(!name))
- return NULL;
-
- down_write(&slub_lock);
s = find_mergeable(size, align, flags, name, ctor);
if (s) {
s->refcount++;
@@ -3941,49 +3941,42 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size,
* Adjust the object sizes so that we clear
* the complete object on kzalloc.
*/
- s->objsize = max(s->objsize, (int)size);
+ s->object_size = max(s->object_size, (int)size);
s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *)));
if (sysfs_slab_alias(s, name)) {
s->refcount--;
- goto err;
+ return NULL;
}
- up_write(&slub_lock);
return s;
}
n = kstrdup(name, GFP_KERNEL);
if (!n)
- goto err;
+ return NULL;
s = kmalloc(kmem_size, GFP_KERNEL);
if (s) {
if (kmem_cache_open(s, n,
size, align, flags, ctor)) {
+ int r;
+
list_add(&s->list, &slab_caches);
- up_write(&slub_lock);
- if (sysfs_slab_add(s)) {
- down_write(&slub_lock);
- list_del(&s->list);
- kfree(n);
- kfree(s);
- goto err;
- }
- return s;
+ mutex_unlock(&slab_mutex);
+ r = sysfs_slab_add(s);
+ mutex_lock(&slab_mutex);
+
+ if (!r)
+ return s;
+
+ list_del(&s->list);
+ kmem_cache_close(s);
}
kfree(s);
}
kfree(n);
-err:
- up_write(&slub_lock);
-
- if (flags & SLAB_PANIC)
- panic("Cannot create slabcache %s\n", name);
- else
- s = NULL;
- return s;
+ return NULL;
}
-EXPORT_SYMBOL(kmem_cache_create);
#ifdef CONFIG_SMP
/*
@@ -4002,13 +3995,13 @@ static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb,
case CPU_UP_CANCELED_FROZEN:
case CPU_DEAD:
case CPU_DEAD_FROZEN:
- down_read(&slub_lock);
+ mutex_lock(&slab_mutex);
list_for_each_entry(s, &slab_caches, list) {
local_irq_save(flags);
__flush_cpu_slab(s, cpu);
local_irq_restore(flags);
}
- up_read(&slub_lock);
+ mutex_unlock(&slab_mutex);
break;
default:
break;
@@ -4500,30 +4493,31 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
for_each_possible_cpu(cpu) {
struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
- int node = ACCESS_ONCE(c->node);
+ int node;
struct page *page;
- if (node < 0)
- continue;
page = ACCESS_ONCE(c->page);
- if (page) {
- if (flags & SO_TOTAL)
- x = page->objects;
- else if (flags & SO_OBJECTS)
- x = page->inuse;
- else
- x = 1;
+ if (!page)
+ continue;
- total += x;
- nodes[node] += x;
- }
- page = c->partial;
+ node = page_to_nid(page);
+ if (flags & SO_TOTAL)
+ x = page->objects;
+ else if (flags & SO_OBJECTS)
+ x = page->inuse;
+ else
+ x = 1;
+ total += x;
+ nodes[node] += x;
+
+ page = ACCESS_ONCE(c->partial);
if (page) {
x = page->pobjects;
total += x;
nodes[node] += x;
}
+
per_cpu[node]++;
}
}
@@ -4623,7 +4617,7 @@ SLAB_ATTR_RO(align);
static ssize_t object_size_show(struct kmem_cache *s, char *buf)
{
- return sprintf(buf, "%d\n", s->objsize);
+ return sprintf(buf, "%d\n", s->object_size);
}
SLAB_ATTR_RO(object_size);
@@ -5286,7 +5280,7 @@ static int sysfs_slab_add(struct kmem_cache *s)
const char *name;
int unmergeable;
- if (slab_state < SYSFS)
+ if (slab_state < FULL)
/* Defer until later */
return 0;
@@ -5331,7 +5325,7 @@ static int sysfs_slab_add(struct kmem_cache *s)
static void sysfs_slab_remove(struct kmem_cache *s)
{
- if (slab_state < SYSFS)
+ if (slab_state < FULL)
/*
* Sysfs has not been setup yet so no need to remove the
* cache from sysfs.
@@ -5359,7 +5353,7 @@ static int sysfs_slab_alias(struct kmem_cache *s, const char *name)
{
struct saved_alias *al;
- if (slab_state == SYSFS) {
+ if (slab_state == FULL) {
/*
* If we have a leftover link then remove it.
*/
@@ -5383,16 +5377,16 @@ static int __init slab_sysfs_init(void)
struct kmem_cache *s;
int err;
- down_write(&slub_lock);
+ mutex_lock(&slab_mutex);
slab_kset = kset_create_and_add("slab", &slab_uevent_ops, kernel_kobj);
if (!slab_kset) {
- up_write(&slub_lock);
+ mutex_unlock(&slab_mutex);
printk(KERN_ERR "Cannot register slab subsystem.\n");
return -ENOSYS;
}
- slab_state = SYSFS;
+ slab_state = FULL;
list_for_each_entry(s, &slab_caches, list) {
err = sysfs_slab_add(s);
@@ -5408,11 +5402,11 @@ static int __init slab_sysfs_init(void)
err = sysfs_slab_alias(al->s, al->name);
if (err)
printk(KERN_ERR "SLUB: Unable to add boot slab alias"
- " %s to sysfs\n", s->name);
+ " %s to sysfs\n", al->name);
kfree(al);
}
- up_write(&slub_lock);
+ mutex_unlock(&slab_mutex);
resiliency_test();
return 0;
}
@@ -5427,7 +5421,7 @@ __initcall(slab_sysfs_init);
static void print_slabinfo_header(struct seq_file *m)
{
seq_puts(m, "slabinfo - version: 2.1\n");
- seq_puts(m, "# name <active_objs> <num_objs> <objsize> "
+ seq_puts(m, "# name <active_objs> <num_objs> <object_size> "
"<objperslab> <pagesperslab>");
seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>");
seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
@@ -5438,7 +5432,7 @@ static void *s_start(struct seq_file *m, loff_t *pos)
{
loff_t n = *pos;
- down_read(&slub_lock);
+ mutex_lock(&slab_mutex);
if (!n)
print_slabinfo_header(m);
@@ -5452,7 +5446,7 @@ static void *s_next(struct seq_file *m, void *p, loff_t *pos)
static void s_stop(struct seq_file *m, void *p)
{
- up_read(&slub_lock);
+ mutex_unlock(&slab_mutex);
}
static int s_show(struct seq_file *m, void *p)
diff --git a/mm/sparse.c b/mm/sparse.c
index 6a4bf9160e85..fac95f2888f2 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -65,21 +65,18 @@ static struct mem_section noinline __init_refok *sparse_index_alloc(int nid)
if (slab_is_available()) {
if (node_state(nid, N_HIGH_MEMORY))
- section = kmalloc_node(array_size, GFP_KERNEL, nid);
+ section = kzalloc_node(array_size, GFP_KERNEL, nid);
else
- section = kmalloc(array_size, GFP_KERNEL);
- } else
+ section = kzalloc(array_size, GFP_KERNEL);
+ } else {
section = alloc_bootmem_node(NODE_DATA(nid), array_size);
-
- if (section)
- memset(section, 0, array_size);
+ }
return section;
}
static int __meminit sparse_index_init(unsigned long section_nr, int nid)
{
- static DEFINE_SPINLOCK(index_init_lock);
unsigned long root = SECTION_NR_TO_ROOT(section_nr);
struct mem_section *section;
int ret = 0;
@@ -90,20 +87,9 @@ static int __meminit sparse_index_init(unsigned long section_nr, int nid)
section = sparse_index_alloc(nid);
if (!section)
return -ENOMEM;
- /*
- * This lock keeps two different sections from
- * reallocating for the same index
- */
- spin_lock(&index_init_lock);
-
- if (mem_section[root]) {
- ret = -EEXIST;
- goto out;
- }
mem_section[root] = section;
-out:
- spin_unlock(&index_init_lock);
+
return ret;
}
#else /* !SPARSEMEM_EXTREME */
@@ -132,6 +118,8 @@ int __section_nr(struct mem_section* ms)
break;
}
+ VM_BUG_ON(root_nr == NR_SECTION_ROOTS);
+
return (root_nr * SECTIONS_PER_ROOT) + (ms - root);
}
@@ -275,8 +263,9 @@ static unsigned long * __init
sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
unsigned long size)
{
- pg_data_t *host_pgdat;
- unsigned long goal;
+ unsigned long goal, limit;
+ unsigned long *p;
+ int nid;
/*
* A page may contain usemaps for other sections preventing the
* page being freed and making a section unremovable while
@@ -287,10 +276,17 @@ sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
* from the same section as the pgdat where possible to avoid
* this problem.
*/
- goal = __pa(pgdat) & PAGE_SECTION_MASK;
- host_pgdat = NODE_DATA(early_pfn_to_nid(goal >> PAGE_SHIFT));
- return __alloc_bootmem_node_nopanic(host_pgdat, size,
- SMP_CACHE_BYTES, goal);
+ goal = __pa(pgdat) & (PAGE_SECTION_MASK << PAGE_SHIFT);
+ limit = goal + (1UL << PA_SECTION_SHIFT);
+ nid = early_pfn_to_nid(goal >> PAGE_SHIFT);
+again:
+ p = ___alloc_bootmem_node_nopanic(NODE_DATA(nid), size,
+ SMP_CACHE_BYTES, goal, limit);
+ if (!p && limit) {
+ limit = 0;
+ goto again;
+ }
+ return p;
}
static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
@@ -485,6 +481,9 @@ void __init sparse_init(void)
struct page **map_map;
#endif
+ /* Setup pageblock_order for HUGETLB_PAGE_SIZE_VARIABLE */
+ set_pageblock_order();
+
/*
* map is using big page (aka 2M in x86 64 bit)
* usemap is less one page (aka 24 bytes)
diff --git a/mm/swap.c b/mm/swap.c
index 4e7e2ec67078..77825883298f 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -236,6 +236,58 @@ void put_pages_list(struct list_head *pages)
}
EXPORT_SYMBOL(put_pages_list);
+/*
+ * get_kernel_pages() - pin kernel pages in memory
+ * @kiov: An array of struct kvec structures
+ * @nr_segs: number of segments to pin
+ * @write: pinning for read/write, currently ignored
+ * @pages: array that receives pointers to the pages pinned.
+ * Should be at least nr_segs long.
+ *
+ * Returns number of pages pinned. This may be fewer than the number
+ * requested. If nr_pages is 0 or negative, returns 0. If no pages
+ * were pinned, returns -errno. Each page returned must be released
+ * with a put_page() call when it is finished with.
+ */
+int get_kernel_pages(const struct kvec *kiov, int nr_segs, int write,
+ struct page **pages)
+{
+ int seg;
+
+ for (seg = 0; seg < nr_segs; seg++) {
+ if (WARN_ON(kiov[seg].iov_len != PAGE_SIZE))
+ return seg;
+
+ pages[seg] = kmap_to_page(kiov[seg].iov_base);
+ page_cache_get(pages[seg]);
+ }
+
+ return seg;
+}
+EXPORT_SYMBOL_GPL(get_kernel_pages);
+
+/*
+ * get_kernel_page() - pin a kernel page in memory
+ * @start: starting kernel address
+ * @write: pinning for read/write, currently ignored
+ * @pages: array that receives pointer to the page pinned.
+ * Must be at least nr_segs long.
+ *
+ * Returns 1 if page is pinned. If the page was not pinned, returns
+ * -errno. The page returned must be released with a put_page() call
+ * when it is finished with.
+ */
+int get_kernel_page(unsigned long start, int write, struct page **pages)
+{
+ const struct kvec kiov = {
+ .iov_base = (void *)start,
+ .iov_len = PAGE_SIZE
+ };
+
+ return get_kernel_pages(&kiov, 1, write, pages);
+}
+EXPORT_SYMBOL_GPL(get_kernel_page);
+
static void pagevec_lru_move_fn(struct pagevec *pvec,
void (*move_fn)(struct page *page, struct lruvec *lruvec, void *arg),
void *arg)
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 4c5ff7f284d9..0cb36fb1f61c 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -14,6 +14,7 @@
#include <linux/init.h>
#include <linux/pagemap.h>
#include <linux/backing-dev.h>
+#include <linux/blkdev.h>
#include <linux/pagevec.h>
#include <linux/migrate.h>
#include <linux/page_cgroup.h>
@@ -26,7 +27,7 @@
*/
static const struct address_space_operations swap_aops = {
.writepage = swap_writepage,
- .set_page_dirty = __set_page_dirty_no_writeback,
+ .set_page_dirty = swap_set_page_dirty,
.migratepage = migrate_page,
};
@@ -376,6 +377,7 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
unsigned long offset = swp_offset(entry);
unsigned long start_offset, end_offset;
unsigned long mask = (1UL << page_cluster) - 1;
+ struct blk_plug plug;
/* Read a page_cluster sized and aligned cluster around offset. */
start_offset = offset & ~mask;
@@ -383,6 +385,7 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
if (!start_offset) /* First page is swap header. */
start_offset++;
+ blk_start_plug(&plug);
for (offset = start_offset; offset <= end_offset ; offset++) {
/* Ok, do the async read-ahead now */
page = read_swap_cache_async(swp_entry(swp_type(entry), offset),
@@ -391,6 +394,8 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
continue;
page_cache_release(page);
}
+ blk_finish_plug(&plug);
+
lru_add_drain(); /* Push any new pages onto the LRU now */
return read_swap_cache_async(entry, gfp_mask, vma, addr);
}
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 457b10baef59..14e254c768fc 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -31,6 +31,9 @@
#include <linux/memcontrol.h>
#include <linux/poll.h>
#include <linux/oom.h>
+#include <linux/frontswap.h>
+#include <linux/swapfile.h>
+#include <linux/export.h>
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
@@ -42,7 +45,7 @@ static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
static void free_swap_count_continuations(struct swap_info_struct *);
static sector_t map_swap_entry(swp_entry_t, struct block_device**);
-static DEFINE_SPINLOCK(swap_lock);
+DEFINE_SPINLOCK(swap_lock);
static unsigned int nr_swapfiles;
long nr_swap_pages;
long total_swap_pages;
@@ -53,9 +56,9 @@ static const char Unused_file[] = "Unused swap file entry ";
static const char Bad_offset[] = "Bad swap offset entry ";
static const char Unused_offset[] = "Unused swap offset entry ";
-static struct swap_list_t swap_list = {-1, -1};
+struct swap_list_t swap_list = {-1, -1};
-static struct swap_info_struct *swap_info[MAX_SWAPFILES];
+struct swap_info_struct *swap_info[MAX_SWAPFILES];
static DEFINE_MUTEX(swapon_mutex);
@@ -546,7 +549,6 @@ static unsigned char swap_entry_free(struct swap_info_struct *p,
/* free if no reference */
if (!usage) {
- struct gendisk *disk = p->bdev->bd_disk;
if (offset < p->lowest_bit)
p->lowest_bit = offset;
if (offset > p->highest_bit)
@@ -556,9 +558,13 @@ static unsigned char swap_entry_free(struct swap_info_struct *p,
swap_list.next = p->type;
nr_swap_pages++;
p->inuse_pages--;
- if ((p->flags & SWP_BLKDEV) &&
- disk->fops->swap_slot_free_notify)
- disk->fops->swap_slot_free_notify(p->bdev, offset);
+ frontswap_invalidate_page(p->type, offset);
+ if (p->flags & SWP_BLKDEV) {
+ struct gendisk *disk = p->bdev->bd_disk;
+ if (disk->fops->swap_slot_free_notify)
+ disk->fops->swap_slot_free_notify(p->bdev,
+ offset);
+ }
}
return usage;
@@ -829,8 +835,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) {
- if (ret > 0)
- mem_cgroup_cancel_charge_swapin(memcg);
+ mem_cgroup_cancel_charge_swapin(memcg);
ret = 0;
goto out;
}
@@ -985,11 +990,12 @@ static int unuse_mm(struct mm_struct *mm,
}
/*
- * Scan swap_map from current position to next entry still in use.
+ * Scan swap_map (or frontswap_map if frontswap parameter is true)
+ * from current position to next entry still in use.
* Recycle to start on reaching the end, returning 0 when empty.
*/
static unsigned int find_next_to_unuse(struct swap_info_struct *si,
- unsigned int prev)
+ unsigned int prev, bool frontswap)
{
unsigned int max = si->max;
unsigned int i = prev;
@@ -1015,6 +1021,12 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si,
prev = 0;
i = 1;
}
+ if (frontswap) {
+ if (frontswap_test(si, i))
+ break;
+ else
+ continue;
+ }
count = si->swap_map[i];
if (count && swap_count(count) != SWAP_MAP_BAD)
break;
@@ -1026,8 +1038,12 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si,
* We completely avoid races by reading each swap page in advance,
* and then search for the process using it. All the necessary
* page table adjustments can then be made atomically.
+ *
+ * if the boolean frontswap is true, only unuse pages_to_unuse pages;
+ * pages_to_unuse==0 means all pages; ignored if frontswap is false
*/
-static int try_to_unuse(unsigned int type)
+int try_to_unuse(unsigned int type, bool frontswap,
+ unsigned long pages_to_unuse)
{
struct swap_info_struct *si = swap_info[type];
struct mm_struct *start_mm;
@@ -1060,7 +1076,7 @@ static int try_to_unuse(unsigned int type)
* one pass through swap_map is enough, but not necessarily:
* there are races when an instance of an entry might be missed.
*/
- while ((i = find_next_to_unuse(si, i)) != 0) {
+ while ((i = find_next_to_unuse(si, i, frontswap)) != 0) {
if (signal_pending(current)) {
retval = -EINTR;
break;
@@ -1227,6 +1243,10 @@ static int try_to_unuse(unsigned int type)
* interactive performance.
*/
cond_resched();
+ if (frontswap && pages_to_unuse > 0) {
+ if (!--pages_to_unuse)
+ break;
+ }
}
mmput(start_mm);
@@ -1310,6 +1330,14 @@ static void destroy_swap_extents(struct swap_info_struct *sis)
list_del(&se->list);
kfree(se);
}
+
+ if (sis->flags & SWP_FILE) {
+ struct file *swap_file = sis->swap_file;
+ struct address_space *mapping = swap_file->f_mapping;
+
+ sis->flags &= ~SWP_FILE;
+ mapping->a_ops->swap_deactivate(swap_file);
+ }
}
/*
@@ -1318,7 +1346,7 @@ static void destroy_swap_extents(struct swap_info_struct *sis)
*
* This function rather assumes that it is called in ascending page order.
*/
-static int
+int
add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
unsigned long nr_pages, sector_t start_block)
{
@@ -1391,102 +1419,33 @@ add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
*/
static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
{
- struct inode *inode;
- unsigned blocks_per_page;
- unsigned long page_no;
- unsigned blkbits;
- sector_t probe_block;
- sector_t last_block;
- sector_t lowest_block = -1;
- sector_t highest_block = 0;
- int nr_extents = 0;
+ struct file *swap_file = sis->swap_file;
+ struct address_space *mapping = swap_file->f_mapping;
+ struct inode *inode = mapping->host;
int ret;
- inode = sis->swap_file->f_mapping->host;
if (S_ISBLK(inode->i_mode)) {
ret = add_swap_extent(sis, 0, sis->max, 0);
*span = sis->pages;
- goto out;
+ return ret;
}
- blkbits = inode->i_blkbits;
- blocks_per_page = PAGE_SIZE >> blkbits;
-
- /*
- * Map all the blocks into the extent list. This code doesn't try
- * to be very smart.
- */
- probe_block = 0;
- page_no = 0;
- last_block = i_size_read(inode) >> blkbits;
- while ((probe_block + blocks_per_page) <= last_block &&
- page_no < sis->max) {
- unsigned block_in_page;
- sector_t first_block;
-
- first_block = bmap(inode, probe_block);
- if (first_block == 0)
- goto bad_bmap;
-
- /*
- * It must be PAGE_SIZE aligned on-disk
- */
- if (first_block & (blocks_per_page - 1)) {
- probe_block++;
- goto reprobe;
- }
-
- for (block_in_page = 1; block_in_page < blocks_per_page;
- block_in_page++) {
- sector_t block;
-
- block = bmap(inode, probe_block + block_in_page);
- if (block == 0)
- goto bad_bmap;
- if (block != first_block + block_in_page) {
- /* Discontiguity */
- probe_block++;
- goto reprobe;
- }
- }
-
- first_block >>= (PAGE_SHIFT - blkbits);
- if (page_no) { /* exclude the header page */
- if (first_block < lowest_block)
- lowest_block = first_block;
- if (first_block > highest_block)
- highest_block = first_block;
+ if (mapping->a_ops->swap_activate) {
+ ret = mapping->a_ops->swap_activate(sis, swap_file, span);
+ if (!ret) {
+ sis->flags |= SWP_FILE;
+ ret = add_swap_extent(sis, 0, sis->max, 0);
+ *span = sis->pages;
}
+ return ret;
+ }
- /*
- * We found a PAGE_SIZE-length, PAGE_SIZE-aligned run of blocks
- */
- ret = add_swap_extent(sis, page_no, 1, first_block);
- if (ret < 0)
- goto out;
- nr_extents += ret;
- page_no++;
- probe_block += blocks_per_page;
-reprobe:
- continue;
- }
- ret = nr_extents;
- *span = 1 + highest_block - lowest_block;
- if (page_no == 0)
- page_no = 1; /* force Empty message */
- sis->max = page_no;
- sis->pages = page_no - 1;
- sis->highest_bit = page_no - 1;
-out:
- return ret;
-bad_bmap:
- printk(KERN_ERR "swapon: swapfile has holes\n");
- ret = -EINVAL;
- goto out;
+ return generic_swapfile_activate(sis, swap_file, span);
}
static void enable_swap_info(struct swap_info_struct *p, int prio,
- unsigned char *swap_map)
+ unsigned char *swap_map,
+ unsigned long *frontswap_map)
{
int i, prev;
@@ -1496,6 +1455,7 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
else
p->prio = --least_priority;
p->swap_map = swap_map;
+ frontswap_map_set(p, frontswap_map);
p->flags |= SWP_WRITEOK;
nr_swap_pages += p->pages;
total_swap_pages += p->pages;
@@ -1512,6 +1472,7 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
swap_list.head = swap_list.next = p->type;
else
swap_info[prev]->next = p->type;
+ frontswap_init(p->type);
spin_unlock(&swap_lock);
}
@@ -1585,7 +1546,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
spin_unlock(&swap_lock);
oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX);
- err = try_to_unuse(type);
+ err = try_to_unuse(type, false, 0); /* force all pages to be unused */
compare_swap_oom_score_adj(OOM_SCORE_ADJ_MAX, oom_score_adj);
if (err) {
@@ -1596,7 +1557,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
* sys_swapoff for this swap_info_struct at this point.
*/
/* re-insert swap space back into swap_list */
- enable_swap_info(p, p->prio, p->swap_map);
+ enable_swap_info(p, p->prio, p->swap_map, frontswap_map_get(p));
goto out_dput;
}
@@ -1622,9 +1583,11 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
swap_map = p->swap_map;
p->swap_map = NULL;
p->flags = 0;
+ frontswap_invalidate_area(type);
spin_unlock(&swap_lock);
mutex_unlock(&swapon_mutex);
vfree(swap_map);
+ vfree(frontswap_map_get(p));
/* Destroy swap account informatin */
swap_cgroup_swapoff(type);
@@ -1893,24 +1856,20 @@ static unsigned long read_swap_header(struct swap_info_struct *p,
/*
* Find out how many pages are allowed for a single swap
- * device. There are three limiting factors: 1) the number
+ * device. There are two limiting factors: 1) the number
* of bits for the swap offset in the swp_entry_t type, and
* 2) the number of bits in the swap pte as defined by the
- * the different architectures, and 3) the number of free bits
- * in an exceptional radix_tree entry. In order to find the
+ * different architectures. In order to find the
* largest possible bit mask, a swap entry with swap type 0
* and swap offset ~0UL is created, encoded to a swap pte,
* decoded to a swp_entry_t again, and finally the swap
* offset is extracted. This will mask all the bits from
* the initial ~0UL mask that can't be encoded in either
* the swp_entry_t or the architecture definition of a
- * swap pte. Then the same is done for a radix_tree entry.
+ * swap pte.
*/
maxpages = swp_offset(pte_to_swp_entry(
- swp_entry_to_pte(swp_entry(0, ~0UL))));
- maxpages = swp_offset(radix_to_swp_entry(
- swp_to_radix_entry(swp_entry(0, maxpages)))) + 1;
-
+ swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1;
if (maxpages > swap_header->info.last_page) {
maxpages = swap_header->info.last_page + 1;
/* p->max is an unsigned int: don't overflow it */
@@ -1988,6 +1947,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
sector_t span;
unsigned long maxpages;
unsigned char *swap_map = NULL;
+ unsigned long *frontswap_map = NULL;
struct page *page = NULL;
struct inode *inode = NULL;
@@ -2071,6 +2031,9 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
error = nr_extents;
goto bad_swap;
}
+ /* frontswap enabled? set up bit-per-page map for frontswap */
+ if (frontswap_enabled)
+ frontswap_map = vzalloc(maxpages / sizeof(long));
if (p->bdev) {
if (blk_queue_nonrot(bdev_get_queue(p->bdev))) {
@@ -2086,14 +2049,15 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
if (swap_flags & SWAP_FLAG_PREFER)
prio =
(swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT;
- enable_swap_info(p, prio, swap_map);
+ enable_swap_info(p, prio, swap_map, frontswap_map);
printk(KERN_INFO "Adding %uk swap on %s. "
- "Priority:%d extents:%d across:%lluk %s%s\n",
+ "Priority:%d extents:%d across:%lluk %s%s%s\n",
p->pages<<(PAGE_SHIFT-10), name, p->prio,
nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10),
(p->flags & SWP_SOLIDSTATE) ? "SS" : "",
- (p->flags & SWP_DISCARDABLE) ? "D" : "");
+ (p->flags & SWP_DISCARDABLE) ? "D" : "",
+ (frontswap_map) ? "FS" : "");
mutex_unlock(&swapon_mutex);
atomic_inc(&proc_poll_event);
@@ -2261,6 +2225,31 @@ int swapcache_prepare(swp_entry_t entry)
return __swap_duplicate(entry, SWAP_HAS_CACHE);
}
+struct swap_info_struct *page_swap_info(struct page *page)
+{
+ swp_entry_t swap = { .val = page_private(page) };
+ BUG_ON(!PageSwapCache(page));
+ return swap_info[swp_type(swap)];
+}
+
+/*
+ * out-of-line __page_file_ methods to avoid include hell.
+ */
+struct address_space *__page_file_mapping(struct page *page)
+{
+ VM_BUG_ON(!PageSwapCache(page));
+ return page_swap_info(page)->swap_file->f_mapping;
+}
+EXPORT_SYMBOL_GPL(__page_file_mapping);
+
+pgoff_t __page_file_index(struct page *page)
+{
+ swp_entry_t swap = { .val = page_private(page) };
+ VM_BUG_ON(!PageSwapCache(page));
+ return swp_offset(swap);
+}
+EXPORT_SYMBOL_GPL(__page_file_index);
+
/*
* add_swap_count_continuation - called when a swap count is duplicated
* beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 2aad49981b57..2bb90b1d241c 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -413,11 +413,11 @@ nocache:
if (addr + size - 1 < addr)
goto overflow;
- n = rb_next(&first->rb_node);
- if (n)
- first = rb_entry(n, struct vmap_area, rb_node);
- else
+ if (list_is_last(&first->list, &vmap_area_list))
goto found;
+
+ first = list_entry(first->list.next,
+ struct vmap_area, list);
}
found:
@@ -904,6 +904,14 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
BUG_ON(size & ~PAGE_MASK);
BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
+ if (WARN_ON(size == 0)) {
+ /*
+ * Allocating 0 bytes isn't what caller wants since
+ * get_order(0) returns funny result. Just warn and terminate
+ * early.
+ */
+ return NULL;
+ }
order = get_order(size);
again:
@@ -1280,7 +1288,7 @@ DEFINE_RWLOCK(vmlist_lock);
struct vm_struct *vmlist;
static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
- unsigned long flags, void *caller)
+ unsigned long flags, const void *caller)
{
vm->flags = flags;
vm->addr = (void *)va->va_start;
@@ -1306,7 +1314,7 @@ static void insert_vmalloc_vmlist(struct vm_struct *vm)
}
static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
- unsigned long flags, void *caller)
+ unsigned long flags, const void *caller)
{
setup_vmalloc_vm(vm, va, flags, caller);
insert_vmalloc_vmlist(vm);
@@ -1314,7 +1322,7 @@ static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
static struct vm_struct *__get_vm_area_node(unsigned long size,
unsigned long align, unsigned long flags, unsigned long start,
- unsigned long end, int node, gfp_t gfp_mask, void *caller)
+ unsigned long end, int node, gfp_t gfp_mask, const void *caller)
{
struct vmap_area *va;
struct vm_struct *area;
@@ -1375,7 +1383,7 @@ EXPORT_SYMBOL_GPL(__get_vm_area);
struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags,
unsigned long start, unsigned long end,
- void *caller)
+ const void *caller)
{
return __get_vm_area_node(size, 1, flags, start, end, -1, GFP_KERNEL,
caller);
@@ -1397,13 +1405,21 @@ struct vm_struct *get_vm_area(unsigned long size, unsigned long flags)
}
struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags,
- void *caller)
+ const void *caller)
{
return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END,
-1, GFP_KERNEL, caller);
}
-static struct vm_struct *find_vm_area(const void *addr)
+/**
+ * find_vm_area - find a continuous kernel virtual area
+ * @addr: base address
+ *
+ * Search for the kernel VM area starting at @addr, and return it.
+ * It is up to the caller to do all required locking to keep the returned
+ * pointer valid.
+ */
+struct vm_struct *find_vm_area(const void *addr)
{
struct vmap_area *va;
@@ -1568,9 +1584,9 @@ EXPORT_SYMBOL(vmap);
static void *__vmalloc_node(unsigned long size, unsigned long align,
gfp_t gfp_mask, pgprot_t prot,
- int node, void *caller);
+ int node, const void *caller);
static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
- pgprot_t prot, int node, void *caller)
+ pgprot_t prot, int node, const void *caller)
{
const int order = 0;
struct page **pages;
@@ -1643,7 +1659,7 @@ fail:
*/
void *__vmalloc_node_range(unsigned long size, unsigned long align,
unsigned long start, unsigned long end, gfp_t gfp_mask,
- pgprot_t prot, int node, void *caller)
+ pgprot_t prot, int node, const void *caller)
{
struct vm_struct *area;
void *addr;
@@ -1699,7 +1715,7 @@ fail:
*/
static void *__vmalloc_node(unsigned long size, unsigned long align,
gfp_t gfp_mask, pgprot_t prot,
- int node, void *caller)
+ int node, const void *caller)
{
return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END,
gfp_mask, prot, node, caller);
@@ -1975,9 +1991,7 @@ static int aligned_vwrite(char *buf, char *addr, unsigned long count)
* IOREMAP area is treated as memory hole and no copy is done.
*
* If [addr...addr+count) doesn't includes any intersects with alive
- * vm_struct area, returns 0.
- * @buf should be kernel's buffer. Because this function uses KM_USER0,
- * the caller should guarantee KM_USER0 is not used.
+ * vm_struct area, returns 0. @buf should be kernel's buffer.
*
* Note: In usual ops, vread() is never necessary because the caller
* should know vmalloc() area is valid and can use memcpy().
@@ -2051,9 +2065,7 @@ finished:
* IOREMAP area is treated as memory hole and no copy is done.
*
* If [addr...addr+count) doesn't includes any intersects with alive
- * vm_struct area, returns 0.
- * @buf should be kernel's buffer. Because this function uses KM_USER0,
- * the caller should guarantee KM_USER0 is not used.
+ * vm_struct area, returns 0. @buf should be kernel's buffer.
*
* Note: In usual ops, vwrite() is never necessary because the caller
* should know vmalloc() area is valid and can use memcpy().
diff --git a/mm/vmscan.c b/mm/vmscan.c
index eeb3bc9d1d36..8d01243d9560 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -133,7 +133,7 @@ long vm_total_pages; /* The total number of pages which the VM controls */
static LIST_HEAD(shrinker_list);
static DECLARE_RWSEM(shrinker_rwsem);
-#ifdef CONFIG_CGROUP_MEM_RES_CTLR
+#ifdef CONFIG_MEMCG
static bool global_reclaim(struct scan_control *sc)
{
return !sc->target_mem_cgroup;
@@ -687,6 +687,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
cond_resched();
+ mem_cgroup_uncharge_start();
while (!list_empty(page_list)) {
enum page_references references;
struct address_space *mapping;
@@ -720,9 +721,41 @@ static unsigned long shrink_page_list(struct list_head *page_list,
(PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
if (PageWriteback(page)) {
- nr_writeback++;
- unlock_page(page);
- goto keep;
+ /*
+ * memcg doesn't have any dirty pages throttling so we
+ * could easily OOM just because too many pages are in
+ * writeback and there is nothing else to reclaim.
+ *
+ * Check __GFP_IO, certainly because a loop driver
+ * thread might enter reclaim, and deadlock if it waits
+ * on a page for which it is needed to do the write
+ * (loop masks off __GFP_IO|__GFP_FS for this reason);
+ * but more thought would probably show more reasons.
+ *
+ * Don't require __GFP_FS, since we're not going into
+ * the FS, just waiting on its writeback completion.
+ * Worryingly, ext4 gfs2 and xfs allocate pages with
+ * grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so
+ * testing may_enter_fs here is liable to OOM on them.
+ */
+ if (global_reclaim(sc) ||
+ !PageReclaim(page) || !(sc->gfp_mask & __GFP_IO)) {
+ /*
+ * This is slightly racy - end_page_writeback()
+ * might have just cleared PageReclaim, then
+ * setting PageReclaim here end up interpreted
+ * as PageReadahead - but that does not matter
+ * enough to care. What we do want is for this
+ * page to have PageReclaim set next time memcg
+ * reclaim reaches the tests above, so it will
+ * then wait_on_page_writeback() to avoid OOM;
+ * and it's also appropriate in global reclaim.
+ */
+ SetPageReclaim(page);
+ nr_writeback++;
+ goto keep_locked;
+ }
+ wait_on_page_writeback(page);
}
references = page_check_references(page, sc);
@@ -921,6 +954,7 @@ keep:
list_splice(&ret_pages, page_list);
count_vm_events(PGACTIVATE, pgactivate);
+ mem_cgroup_uncharge_end();
*ret_nr_dirty += nr_dirty;
*ret_nr_writeback += nr_writeback;
return nr_reclaimed;
@@ -1567,7 +1601,8 @@ static int vmscan_swappiness(struct scan_control *sc)
* by looking at the fraction of the pages scanned we did rotate back
* onto the active list instead of evict.
*
- * nr[0] = anon pages to scan; nr[1] = file pages to scan
+ * nr[0] = anon inactive pages to scan; nr[1] = anon active pages to scan
+ * nr[2] = file inactive pages to scan; nr[3] = file active pages to scan
*/
static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
unsigned long *nr)
@@ -2111,6 +2146,83 @@ out:
return 0;
}
+static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
+{
+ struct zone *zone;
+ unsigned long pfmemalloc_reserve = 0;
+ unsigned long free_pages = 0;
+ int i;
+ bool wmark_ok;
+
+ for (i = 0; i <= ZONE_NORMAL; i++) {
+ zone = &pgdat->node_zones[i];
+ pfmemalloc_reserve += min_wmark_pages(zone);
+ free_pages += zone_page_state(zone, NR_FREE_PAGES);
+ }
+
+ wmark_ok = free_pages > pfmemalloc_reserve / 2;
+
+ /* kswapd must be awake if processes are being throttled */
+ if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) {
+ pgdat->classzone_idx = min(pgdat->classzone_idx,
+ (enum zone_type)ZONE_NORMAL);
+ wake_up_interruptible(&pgdat->kswapd_wait);
+ }
+
+ return wmark_ok;
+}
+
+/*
+ * Throttle direct reclaimers if backing storage is backed by the network
+ * and the PFMEMALLOC reserve for the preferred node is getting dangerously
+ * depleted. kswapd will continue to make progress and wake the processes
+ * when the low watermark is reached
+ */
+static void throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
+ nodemask_t *nodemask)
+{
+ struct zone *zone;
+ int high_zoneidx = gfp_zone(gfp_mask);
+ pg_data_t *pgdat;
+
+ /*
+ * Kernel threads should not be throttled as they may be indirectly
+ * responsible for cleaning pages necessary for reclaim to make forward
+ * progress. kjournald for example may enter direct reclaim while
+ * committing a transaction where throttling it could forcing other
+ * processes to block on log_wait_commit().
+ */
+ if (current->flags & PF_KTHREAD)
+ return;
+
+ /* Check if the pfmemalloc reserves are ok */
+ first_zones_zonelist(zonelist, high_zoneidx, NULL, &zone);
+ pgdat = zone->zone_pgdat;
+ if (pfmemalloc_watermark_ok(pgdat))
+ return;
+
+ /* Account for the throttling */
+ count_vm_event(PGSCAN_DIRECT_THROTTLE);
+
+ /*
+ * If the caller cannot enter the filesystem, it's possible that it
+ * is due to the caller holding an FS lock or performing a journal
+ * transaction in the case of a filesystem like ext[3|4]. In this case,
+ * it is not safe to block on pfmemalloc_wait as kswapd could be
+ * blocked waiting on the same lock. Instead, throttle for up to a
+ * second before continuing.
+ */
+ if (!(gfp_mask & __GFP_FS)) {
+ wait_event_interruptible_timeout(pgdat->pfmemalloc_wait,
+ pfmemalloc_watermark_ok(pgdat), HZ);
+ return;
+ }
+
+ /* Throttle until kswapd wakes the process */
+ wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
+ pfmemalloc_watermark_ok(pgdat));
+}
+
unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
gfp_t gfp_mask, nodemask_t *nodemask)
{
@@ -2130,6 +2242,15 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
.gfp_mask = sc.gfp_mask,
};
+ throttle_direct_reclaim(gfp_mask, zonelist, nodemask);
+
+ /*
+ * Do not enter reclaim if fatal signal is pending. 1 is returned so
+ * that the page allocator does not consider triggering OOM
+ */
+ if (fatal_signal_pending(current))
+ return 1;
+
trace_mm_vmscan_direct_reclaim_begin(order,
sc.may_writepage,
gfp_mask);
@@ -2141,7 +2262,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
return nr_reclaimed;
}
-#ifdef CONFIG_CGROUP_MEM_RES_CTLR
+#ifdef CONFIG_MEMCG
unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
gfp_t gfp_mask, bool noswap,
@@ -2274,8 +2395,13 @@ static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages,
return balanced_pages >= (present_pages >> 2);
}
-/* is kswapd sleeping prematurely? */
-static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
+/*
+ * Prepare kswapd for sleeping. This verifies that there are no processes
+ * waiting in throttle_direct_reclaim() and that watermarks have been met.
+ *
+ * Returns true if kswapd is ready to sleep
+ */
+static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
int classzone_idx)
{
int i;
@@ -2284,7 +2410,21 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
/* If a direct reclaimer woke kswapd within HZ/10, it's premature */
if (remaining)
- return true;
+ return false;
+
+ /*
+ * There is a potential race between when kswapd checks its watermarks
+ * and a process gets throttled. There is also a potential race if
+ * processes get throttled, kswapd wakes, a large process exits therby
+ * balancing the zones that causes kswapd to miss a wakeup. If kswapd
+ * is going to sleep, no process should be sleeping on pfmemalloc_wait
+ * so wake them now if necessary. If necessary, processes will wake
+ * kswapd and get throttled again
+ */
+ if (waitqueue_active(&pgdat->pfmemalloc_wait)) {
+ wake_up(&pgdat->pfmemalloc_wait);
+ return false;
+ }
/* Check the watermark levels */
for (i = 0; i <= classzone_idx; i++) {
@@ -2317,9 +2457,9 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
* must be balanced
*/
if (order)
- return !pgdat_balanced(pgdat, balanced, classzone_idx);
+ return pgdat_balanced(pgdat, balanced, classzone_idx);
else
- return !all_zones_ok;
+ return all_zones_ok;
}
/*
@@ -2537,7 +2677,7 @@ loop_again:
* consider it to be no longer congested. It's
* possible there are dirty pages backed by
* congested BDIs but as pressure is relieved,
- * spectulatively avoid congestion waits
+ * speculatively avoid congestion waits
*/
zone_clear_flag(zone, ZONE_CONGESTED);
if (i <= *classzone_idx)
@@ -2545,6 +2685,16 @@ loop_again:
}
}
+
+ /*
+ * If the low watermark is met there is no need for processes
+ * to be throttled on pfmemalloc_wait as they should not be
+ * able to safely make forward progress. Wake them
+ */
+ if (waitqueue_active(&pgdat->pfmemalloc_wait) &&
+ pfmemalloc_watermark_ok(pgdat))
+ wake_up(&pgdat->pfmemalloc_wait);
+
if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))
break; /* kswapd: all done */
/*
@@ -2646,7 +2796,7 @@ out:
}
/*
- * Return the order we were reclaiming at so sleeping_prematurely()
+ * Return the order we were reclaiming at so prepare_kswapd_sleep()
* makes a decision on the order we were last reclaiming at. However,
* if another caller entered the allocator slow path while kswapd
* was awake, order will remain at the higher level
@@ -2666,7 +2816,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
/* Try to sleep for a short interval */
- if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) {
+ if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) {
remaining = schedule_timeout(HZ/10);
finish_wait(&pgdat->kswapd_wait, &wait);
prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
@@ -2676,7 +2826,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
* After a short sleep, check if it was a premature sleep. If not, then
* go fully to sleep until explicitly woken up.
*/
- if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) {
+ if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) {
trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
/*
@@ -2688,7 +2838,10 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
* them before going back to sleep.
*/
set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);
- schedule();
+
+ if (!kthread_should_stop())
+ schedule();
+
set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold);
} else {
if (remaining)
@@ -2955,14 +3108,17 @@ int kswapd_run(int nid)
}
/*
- * Called by memory hotplug when all memory in a node is offlined.
+ * Called by memory hotplug when all memory in a node is offlined. Caller must
+ * hold lock_memory_hotplug().
*/
void kswapd_stop(int nid)
{
struct task_struct *kswapd = NODE_DATA(nid)->kswapd;
- if (kswapd)
+ if (kswapd) {
kthread_stop(kswapd);
+ NODE_DATA(nid)->kswapd = NULL;
+ }
}
static int __init kswapd_init(void)
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 1bbbbd9776ad..df7a6748231d 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -745,6 +745,7 @@ const char * const vmstat_text[] = {
TEXTS_FOR_ZONES("pgsteal_direct")
TEXTS_FOR_ZONES("pgscan_kswapd")
TEXTS_FOR_ZONES("pgscan_direct")
+ "pgscan_direct_throttle",
#ifdef CONFIG_NUMA
"zone_reclaim_failed",