summaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig25
-rw-r--r--mm/Makefile2
-rw-r--r--mm/backing-dev.c651
-rw-r--r--mm/bootmem.c13
-rw-r--r--mm/cleancache.c276
-rw-r--r--mm/cma.c62
-rw-r--r--mm/cma.h24
-rw-r--r--mm/cma_debug.c206
-rw-r--r--mm/compaction.c75
-rw-r--r--mm/debug.c2
-rw-r--r--mm/fadvise.c2
-rw-r--r--mm/filemap.c187
-rw-r--r--mm/frontswap.c215
-rw-r--r--mm/gup.c128
-rw-r--r--mm/huge_memory.c140
-rw-r--r--mm/hugetlb.c443
-rw-r--r--mm/hwpoison-inject.c15
-rw-r--r--mm/internal.h19
-rw-r--r--mm/kasan/kasan.c13
-rw-r--r--mm/kasan/kasan.h1
-rw-r--r--mm/kmemleak.c171
-rw-r--r--mm/ksm.c10
-rw-r--r--mm/madvise.c1
-rw-r--r--mm/memblock.c179
-rw-r--r--mm/memcontrol.c527
-rw-r--r--mm/memory-failure.c351
-rw-r--r--mm/memory.c469
-rw-r--r--mm/memory_hotplug.c52
-rw-r--r--mm/mempolicy.c44
-rw-r--r--mm/mempool.c127
-rw-r--r--mm/memtest.c119
-rw-r--r--mm/migrate.c55
-rw-r--r--mm/mlock.c131
-rw-r--r--mm/mm_init.c9
-rw-r--r--mm/mmap.c31
-rw-r--r--mm/mprotect.c11
-rw-r--r--mm/mremap.c40
-rw-r--r--mm/nobootmem.c21
-rw-r--r--mm/nommu.c120
-rw-r--r--mm/oom_kill.c165
-rw-r--r--mm/page-writeback.c1241
-rw-r--r--mm/page_alloc.c993
-rw-r--r--mm/page_io.c9
-rw-r--r--mm/page_isolation.c3
-rw-r--r--mm/page_owner.c9
-rw-r--r--mm/percpu.c6
-rw-r--r--mm/pgtable-generic.c29
-rw-r--r--mm/process_vm_access.c35
-rw-r--r--mm/readahead.c2
-rw-r--r--mm/rmap.c17
-rw-r--r--mm/shmem.c78
-rw-r--r--mm/slab.c23
-rw-r--r--mm/slab.h1
-rw-r--r--mm/slab_common.c93
-rw-r--r--mm/slob.c3
-rw-r--r--mm/slub.c33
-rw-r--r--mm/swap.c35
-rw-r--r--mm/swap_state.c2
-rw-r--r--mm/swapfile.c4
-rw-r--r--mm/truncate.c53
-rw-r--r--mm/util.c41
-rw-r--r--mm/vmalloc.c103
-rw-r--r--mm/vmscan.c106
-rw-r--r--mm/zbud.c23
-rw-r--r--mm/zpool.c35
-rw-r--r--mm/zsmalloc.c979
-rw-r--r--mm/zswap.c12
67 files changed, 5926 insertions, 3174 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index a03131b6ba8e..e79de2bd12cd 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -368,6 +368,7 @@ config MEMORY_FAILURE
depends on ARCH_SUPPORTS_MEMORY_FAILURE
bool "Enable recovery from hardware memory errors"
select MEMORY_ISOLATION
+ select RAS
help
Enables code to recover from some memory failures on systems
with MCA recovery. This allows a system to continue running
@@ -517,6 +518,12 @@ config CMA_DEBUG
processing calls such as dma_alloc_from_contiguous().
This option does not affect warning and error messages.
+config CMA_DEBUGFS
+ bool "CMA debugfs interface"
+ depends on CMA && DEBUG_FS
+ help
+ Turns on the DebugFS interface for CMA.
+
config CMA_AREAS
int "Maximum count of the CMA areas"
depends on CMA
@@ -629,3 +636,21 @@ config MAX_STACK_SIZE_MB
changed to a smaller value in which case that is used.
A sane initial value is 80 MB.
+
+# For architectures that support deferred memory initialisation
+config ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT
+ bool
+
+config DEFERRED_STRUCT_PAGE_INIT
+ bool "Defer initialisation of struct pages to kswapd"
+ default n
+ depends on ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT
+ depends on MEMORY_HOTPLUG
+ help
+ Ordinarily all struct pages are initialised during early boot in a
+ single thread. On very large machines this can take a considerable
+ amount of time. If this option is set, large machines will bring up
+ a subset of memmap at boot and then initialise the rest in parallel
+ when kswapd starts. This has a potential performance impact on
+ processes running early in the lifetime of the systemm until kswapd
+ finishes the initialisation.
diff --git a/mm/Makefile b/mm/Makefile
index 15dbe9903c27..98c4eaeabdcb 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -55,6 +55,7 @@ obj-$(CONFIG_KMEMCHECK) += kmemcheck.o
obj-$(CONFIG_KASAN) += kasan/
obj-$(CONFIG_FAILSLAB) += failslab.o
obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
+obj-$(CONFIG_MEMTEST) += memtest.o
obj-$(CONFIG_MIGRATION) += migrate.o
obj-$(CONFIG_QUICKLIST) += quicklist.o
obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o
@@ -76,3 +77,4 @@ obj-$(CONFIG_GENERIC_EARLY_IOREMAP) += early_ioremap.o
obj-$(CONFIG_CMA) += cma.o
obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o
obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o
+obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 6dc4580df2af..dac5bf59309d 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -18,6 +18,7 @@ struct backing_dev_info noop_backing_dev_info = {
.name = "noop",
.capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
};
+EXPORT_SYMBOL_GPL(noop_backing_dev_info);
static struct class *bdi_class;
@@ -48,7 +49,7 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
struct bdi_writeback *wb = &bdi->wb;
unsigned long background_thresh;
unsigned long dirty_thresh;
- unsigned long bdi_thresh;
+ unsigned long wb_thresh;
unsigned long nr_dirty, nr_io, nr_more_io, nr_dirty_time;
struct inode *inode;
@@ -66,7 +67,7 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
spin_unlock(&wb->list_lock);
global_dirty_limits(&background_thresh, &dirty_thresh);
- bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
+ wb_thresh = wb_calc_thresh(wb, dirty_thresh);
#define K(x) ((x) << (PAGE_SHIFT - 10))
seq_printf(m,
@@ -84,19 +85,19 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
"b_dirty_time: %10lu\n"
"bdi_list: %10u\n"
"state: %10lx\n",
- (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)),
- (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)),
- K(bdi_thresh),
+ (unsigned long) K(wb_stat(wb, WB_WRITEBACK)),
+ (unsigned long) K(wb_stat(wb, WB_RECLAIMABLE)),
+ K(wb_thresh),
K(dirty_thresh),
K(background_thresh),
- (unsigned long) K(bdi_stat(bdi, BDI_DIRTIED)),
- (unsigned long) K(bdi_stat(bdi, BDI_WRITTEN)),
- (unsigned long) K(bdi->write_bandwidth),
+ (unsigned long) K(wb_stat(wb, WB_DIRTIED)),
+ (unsigned long) K(wb_stat(wb, WB_WRITTEN)),
+ (unsigned long) K(wb->write_bandwidth),
nr_dirty,
nr_io,
nr_more_io,
nr_dirty_time,
- !list_empty(&bdi->bdi_list), bdi->state);
+ !list_empty(&bdi->bdi_list), bdi->wb.state);
#undef K
return 0;
@@ -255,13 +256,8 @@ static int __init default_bdi_init(void)
}
subsys_initcall(default_bdi_init);
-int bdi_has_dirty_io(struct backing_dev_info *bdi)
-{
- return wb_has_dirty_io(&bdi->wb);
-}
-
/*
- * This function is used when the first inode for this bdi is marked dirty. It
+ * This function is used when the first inode for this wb is marked dirty. It
* wakes-up the corresponding bdi thread which should then take care of the
* periodic background write-out of dirty inodes. Since the write-out would
* starts only 'dirty_writeback_interval' centisecs from now anyway, we just
@@ -274,178 +270,565 @@ int bdi_has_dirty_io(struct backing_dev_info *bdi)
* We have to be careful not to postpone flush work if it is scheduled for
* earlier. Thus we use queue_delayed_work().
*/
-void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi)
+void wb_wakeup_delayed(struct bdi_writeback *wb)
{
unsigned long timeout;
timeout = msecs_to_jiffies(dirty_writeback_interval * 10);
- spin_lock_bh(&bdi->wb_lock);
- if (test_bit(BDI_registered, &bdi->state))
- queue_delayed_work(bdi_wq, &bdi->wb.dwork, timeout);
- spin_unlock_bh(&bdi->wb_lock);
+ spin_lock_bh(&wb->work_lock);
+ if (test_bit(WB_registered, &wb->state))
+ queue_delayed_work(bdi_wq, &wb->dwork, timeout);
+ spin_unlock_bh(&wb->work_lock);
}
/*
- * Remove bdi from bdi_list, and ensure that it is no longer visible
+ * Initial write bandwidth: 100 MB/s
*/
-static void bdi_remove_from_list(struct backing_dev_info *bdi)
+#define INIT_BW (100 << (20 - PAGE_SHIFT))
+
+static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
+ int blkcg_id, gfp_t gfp)
{
- spin_lock_bh(&bdi_lock);
- list_del_rcu(&bdi->bdi_list);
- spin_unlock_bh(&bdi_lock);
+ int i, err;
- synchronize_rcu_expedited();
-}
+ memset(wb, 0, sizeof(*wb));
-int bdi_register(struct backing_dev_info *bdi, struct device *parent,
- const char *fmt, ...)
-{
- va_list args;
- struct device *dev;
+ wb->bdi = bdi;
+ wb->last_old_flush = jiffies;
+ INIT_LIST_HEAD(&wb->b_dirty);
+ INIT_LIST_HEAD(&wb->b_io);
+ INIT_LIST_HEAD(&wb->b_more_io);
+ INIT_LIST_HEAD(&wb->b_dirty_time);
+ spin_lock_init(&wb->list_lock);
- if (bdi->dev) /* The driver needs to use separate queues per device */
- return 0;
+ wb->bw_time_stamp = jiffies;
+ wb->balanced_dirty_ratelimit = INIT_BW;
+ wb->dirty_ratelimit = INIT_BW;
+ wb->write_bandwidth = INIT_BW;
+ wb->avg_write_bandwidth = INIT_BW;
- va_start(args, fmt);
- dev = device_create_vargs(bdi_class, parent, MKDEV(0, 0), bdi, fmt, args);
- va_end(args);
- if (IS_ERR(dev))
- return PTR_ERR(dev);
+ spin_lock_init(&wb->work_lock);
+ INIT_LIST_HEAD(&wb->work_list);
+ INIT_DELAYED_WORK(&wb->dwork, wb_workfn);
- bdi->dev = dev;
+ wb->congested = wb_congested_get_create(bdi, blkcg_id, gfp);
+ if (!wb->congested)
+ return -ENOMEM;
- bdi_debug_register(bdi, dev_name(dev));
- set_bit(BDI_registered, &bdi->state);
+ err = fprop_local_init_percpu(&wb->completions, gfp);
+ if (err)
+ goto out_put_cong;
- spin_lock_bh(&bdi_lock);
- list_add_tail_rcu(&bdi->bdi_list, &bdi_list);
- spin_unlock_bh(&bdi_lock);
+ for (i = 0; i < NR_WB_STAT_ITEMS; i++) {
+ err = percpu_counter_init(&wb->stat[i], 0, gfp);
+ if (err)
+ goto out_destroy_stat;
+ }
- trace_writeback_bdi_register(bdi);
return 0;
-}
-EXPORT_SYMBOL(bdi_register);
-int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev)
-{
- return bdi_register(bdi, NULL, "%u:%u", MAJOR(dev), MINOR(dev));
+out_destroy_stat:
+ while (--i)
+ percpu_counter_destroy(&wb->stat[i]);
+ fprop_local_destroy_percpu(&wb->completions);
+out_put_cong:
+ wb_congested_put(wb->congested);
+ return err;
}
-EXPORT_SYMBOL(bdi_register_dev);
/*
* Remove bdi from the global list and shutdown any threads we have running
*/
-static void bdi_wb_shutdown(struct backing_dev_info *bdi)
+static void wb_shutdown(struct bdi_writeback *wb)
{
/* Make sure nobody queues further work */
- spin_lock_bh(&bdi->wb_lock);
- if (!test_and_clear_bit(BDI_registered, &bdi->state)) {
- spin_unlock_bh(&bdi->wb_lock);
+ spin_lock_bh(&wb->work_lock);
+ if (!test_and_clear_bit(WB_registered, &wb->state)) {
+ spin_unlock_bh(&wb->work_lock);
return;
}
- spin_unlock_bh(&bdi->wb_lock);
+ spin_unlock_bh(&wb->work_lock);
/*
- * Make sure nobody finds us on the bdi_list anymore
+ * Drain work list and shutdown the delayed_work. !WB_registered
+ * tells wb_workfn() that @wb is dying and its work_list needs to
+ * be drained no matter what.
*/
- bdi_remove_from_list(bdi);
+ mod_delayed_work(bdi_wq, &wb->dwork, 0);
+ flush_delayed_work(&wb->dwork);
+ WARN_ON(!list_empty(&wb->work_list));
+}
- /*
- * Drain work list and shutdown the delayed_work. At this point,
- * @bdi->bdi_list is empty telling bdi_Writeback_workfn() that @bdi
- * is dying and its work_list needs to be drained no matter what.
- */
- mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0);
- flush_delayed_work(&bdi->wb.dwork);
+static void wb_exit(struct bdi_writeback *wb)
+{
+ int i;
+
+ WARN_ON(delayed_work_pending(&wb->dwork));
+
+ for (i = 0; i < NR_WB_STAT_ITEMS; i++)
+ percpu_counter_destroy(&wb->stat[i]);
+
+ fprop_local_destroy_percpu(&wb->completions);
+ wb_congested_put(wb->congested);
}
+#ifdef CONFIG_CGROUP_WRITEBACK
+
+#include <linux/memcontrol.h>
+
/*
- * Called when the device behind @bdi has been removed or ejected.
+ * cgwb_lock protects bdi->cgwb_tree, bdi->cgwb_congested_tree,
+ * blkcg->cgwb_list, and memcg->cgwb_list. bdi->cgwb_tree is also RCU
+ * protected. cgwb_release_wait is used to wait for the completion of cgwb
+ * releases from bdi destruction path.
+ */
+static DEFINE_SPINLOCK(cgwb_lock);
+static DECLARE_WAIT_QUEUE_HEAD(cgwb_release_wait);
+
+/**
+ * wb_congested_get_create - get or create a wb_congested
+ * @bdi: associated bdi
+ * @blkcg_id: ID of the associated blkcg
+ * @gfp: allocation mask
+ *
+ * Look up the wb_congested for @blkcg_id on @bdi. If missing, create one.
+ * The returned wb_congested has its reference count incremented. Returns
+ * NULL on failure.
+ */
+struct bdi_writeback_congested *
+wb_congested_get_create(struct backing_dev_info *bdi, int blkcg_id, gfp_t gfp)
+{
+ struct bdi_writeback_congested *new_congested = NULL, *congested;
+ struct rb_node **node, *parent;
+ unsigned long flags;
+retry:
+ spin_lock_irqsave(&cgwb_lock, flags);
+
+ node = &bdi->cgwb_congested_tree.rb_node;
+ parent = NULL;
+
+ while (*node != NULL) {
+ parent = *node;
+ congested = container_of(parent, struct bdi_writeback_congested,
+ rb_node);
+ if (congested->blkcg_id < blkcg_id)
+ node = &parent->rb_left;
+ else if (congested->blkcg_id > blkcg_id)
+ node = &parent->rb_right;
+ else
+ goto found;
+ }
+
+ if (new_congested) {
+ /* !found and storage for new one already allocated, insert */
+ congested = new_congested;
+ new_congested = NULL;
+ rb_link_node(&congested->rb_node, parent, node);
+ rb_insert_color(&congested->rb_node, &bdi->cgwb_congested_tree);
+ goto found;
+ }
+
+ spin_unlock_irqrestore(&cgwb_lock, flags);
+
+ /* allocate storage for new one and retry */
+ new_congested = kzalloc(sizeof(*new_congested), gfp);
+ if (!new_congested)
+ return NULL;
+
+ atomic_set(&new_congested->refcnt, 0);
+ new_congested->bdi = bdi;
+ new_congested->blkcg_id = blkcg_id;
+ goto retry;
+
+found:
+ atomic_inc(&congested->refcnt);
+ spin_unlock_irqrestore(&cgwb_lock, flags);
+ kfree(new_congested);
+ return congested;
+}
+
+/**
+ * wb_congested_put - put a wb_congested
+ * @congested: wb_congested to put
*
- * We can't really do much here except for reducing the dirty ratio at
- * the moment. In the future we should be able to set a flag so that
- * the filesystem can handle errors at mark_inode_dirty time instead
- * of only at writeback time.
+ * Put @congested and destroy it if the refcnt reaches zero.
*/
-void bdi_unregister(struct backing_dev_info *bdi)
+void wb_congested_put(struct bdi_writeback_congested *congested)
{
- if (WARN_ON_ONCE(!bdi->dev))
+ unsigned long flags;
+
+ local_irq_save(flags);
+ if (!atomic_dec_and_lock(&congested->refcnt, &cgwb_lock)) {
+ local_irq_restore(flags);
return;
+ }
- bdi_set_min_ratio(bdi, 0);
+ /* bdi might already have been destroyed leaving @congested unlinked */
+ if (congested->bdi) {
+ rb_erase(&congested->rb_node,
+ &congested->bdi->cgwb_congested_tree);
+ congested->bdi = NULL;
+ }
+
+ spin_unlock_irqrestore(&cgwb_lock, flags);
+ kfree(congested);
}
-EXPORT_SYMBOL(bdi_unregister);
-static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi)
+static void cgwb_release_workfn(struct work_struct *work)
{
- memset(wb, 0, sizeof(*wb));
+ struct bdi_writeback *wb = container_of(work, struct bdi_writeback,
+ release_work);
+ struct backing_dev_info *bdi = wb->bdi;
- wb->bdi = bdi;
- wb->last_old_flush = jiffies;
- INIT_LIST_HEAD(&wb->b_dirty);
- INIT_LIST_HEAD(&wb->b_io);
- INIT_LIST_HEAD(&wb->b_more_io);
- INIT_LIST_HEAD(&wb->b_dirty_time);
- spin_lock_init(&wb->list_lock);
- INIT_DELAYED_WORK(&wb->dwork, bdi_writeback_workfn);
+ wb_shutdown(wb);
+
+ css_put(wb->memcg_css);
+ css_put(wb->blkcg_css);
+
+ fprop_local_destroy_percpu(&wb->memcg_completions);
+ percpu_ref_exit(&wb->refcnt);
+ wb_exit(wb);
+ kfree_rcu(wb, rcu);
+
+ if (atomic_dec_and_test(&bdi->usage_cnt))
+ wake_up_all(&cgwb_release_wait);
}
-/*
- * Initial write bandwidth: 100 MB/s
+static void cgwb_release(struct percpu_ref *refcnt)
+{
+ struct bdi_writeback *wb = container_of(refcnt, struct bdi_writeback,
+ refcnt);
+ schedule_work(&wb->release_work);
+}
+
+static void cgwb_kill(struct bdi_writeback *wb)
+{
+ lockdep_assert_held(&cgwb_lock);
+
+ WARN_ON(!radix_tree_delete(&wb->bdi->cgwb_tree, wb->memcg_css->id));
+ list_del(&wb->memcg_node);
+ list_del(&wb->blkcg_node);
+ percpu_ref_kill(&wb->refcnt);
+}
+
+static int cgwb_create(struct backing_dev_info *bdi,
+ struct cgroup_subsys_state *memcg_css, gfp_t gfp)
+{
+ struct mem_cgroup *memcg;
+ struct cgroup_subsys_state *blkcg_css;
+ struct blkcg *blkcg;
+ struct list_head *memcg_cgwb_list, *blkcg_cgwb_list;
+ struct bdi_writeback *wb;
+ unsigned long flags;
+ int ret = 0;
+
+ memcg = mem_cgroup_from_css(memcg_css);
+ blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &blkio_cgrp_subsys);
+ blkcg = css_to_blkcg(blkcg_css);
+ memcg_cgwb_list = mem_cgroup_cgwb_list(memcg);
+ blkcg_cgwb_list = &blkcg->cgwb_list;
+
+ /* look up again under lock and discard on blkcg mismatch */
+ spin_lock_irqsave(&cgwb_lock, flags);
+ wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id);
+ if (wb && wb->blkcg_css != blkcg_css) {
+ cgwb_kill(wb);
+ wb = NULL;
+ }
+ spin_unlock_irqrestore(&cgwb_lock, flags);
+ if (wb)
+ goto out_put;
+
+ /* need to create a new one */
+ wb = kmalloc(sizeof(*wb), gfp);
+ if (!wb)
+ return -ENOMEM;
+
+ ret = wb_init(wb, bdi, blkcg_css->id, gfp);
+ if (ret)
+ goto err_free;
+
+ ret = percpu_ref_init(&wb->refcnt, cgwb_release, 0, gfp);
+ if (ret)
+ goto err_wb_exit;
+
+ ret = fprop_local_init_percpu(&wb->memcg_completions, gfp);
+ if (ret)
+ goto err_ref_exit;
+
+ wb->memcg_css = memcg_css;
+ wb->blkcg_css = blkcg_css;
+ INIT_WORK(&wb->release_work, cgwb_release_workfn);
+ set_bit(WB_registered, &wb->state);
+
+ /*
+ * The root wb determines the registered state of the whole bdi and
+ * memcg_cgwb_list and blkcg_cgwb_list's next pointers indicate
+ * whether they're still online. Don't link @wb if any is dead.
+ * See wb_memcg_offline() and wb_blkcg_offline().
+ */
+ ret = -ENODEV;
+ spin_lock_irqsave(&cgwb_lock, flags);
+ if (test_bit(WB_registered, &bdi->wb.state) &&
+ blkcg_cgwb_list->next && memcg_cgwb_list->next) {
+ /* we might have raced another instance of this function */
+ ret = radix_tree_insert(&bdi->cgwb_tree, memcg_css->id, wb);
+ if (!ret) {
+ atomic_inc(&bdi->usage_cnt);
+ list_add(&wb->memcg_node, memcg_cgwb_list);
+ list_add(&wb->blkcg_node, blkcg_cgwb_list);
+ css_get(memcg_css);
+ css_get(blkcg_css);
+ }
+ }
+ spin_unlock_irqrestore(&cgwb_lock, flags);
+ if (ret) {
+ if (ret == -EEXIST)
+ ret = 0;
+ goto err_fprop_exit;
+ }
+ goto out_put;
+
+err_fprop_exit:
+ fprop_local_destroy_percpu(&wb->memcg_completions);
+err_ref_exit:
+ percpu_ref_exit(&wb->refcnt);
+err_wb_exit:
+ wb_exit(wb);
+err_free:
+ kfree(wb);
+out_put:
+ css_put(blkcg_css);
+ return ret;
+}
+
+/**
+ * wb_get_create - get wb for a given memcg, create if necessary
+ * @bdi: target bdi
+ * @memcg_css: cgroup_subsys_state of the target memcg (must have positive ref)
+ * @gfp: allocation mask to use
+ *
+ * Try to get the wb for @memcg_css on @bdi. If it doesn't exist, try to
+ * create one. The returned wb has its refcount incremented.
+ *
+ * This function uses css_get() on @memcg_css and thus expects its refcnt
+ * to be positive on invocation. IOW, rcu_read_lock() protection on
+ * @memcg_css isn't enough. try_get it before calling this function.
+ *
+ * A wb is keyed by its associated memcg. As blkcg implicitly enables
+ * memcg on the default hierarchy, memcg association is guaranteed to be
+ * more specific (equal or descendant to the associated blkcg) and thus can
+ * identify both the memcg and blkcg associations.
+ *
+ * Because the blkcg associated with a memcg may change as blkcg is enabled
+ * and disabled closer to root in the hierarchy, each wb keeps track of
+ * both the memcg and blkcg associated with it and verifies the blkcg on
+ * each lookup. On mismatch, the existing wb is discarded and a new one is
+ * created.
*/
-#define INIT_BW (100 << (20 - PAGE_SHIFT))
+struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi,
+ struct cgroup_subsys_state *memcg_css,
+ gfp_t gfp)
+{
+ struct bdi_writeback *wb;
+
+ might_sleep_if(gfp & __GFP_WAIT);
+
+ if (!memcg_css->parent)
+ return &bdi->wb;
+
+ do {
+ rcu_read_lock();
+ wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id);
+ if (wb) {
+ struct cgroup_subsys_state *blkcg_css;
+
+ /* see whether the blkcg association has changed */
+ blkcg_css = cgroup_get_e_css(memcg_css->cgroup,
+ &blkio_cgrp_subsys);
+ if (unlikely(wb->blkcg_css != blkcg_css ||
+ !wb_tryget(wb)))
+ wb = NULL;
+ css_put(blkcg_css);
+ }
+ rcu_read_unlock();
+ } while (!wb && !cgwb_create(bdi, memcg_css, gfp));
+
+ return wb;
+}
-int bdi_init(struct backing_dev_info *bdi)
+static int cgwb_bdi_init(struct backing_dev_info *bdi)
{
- int i, err;
+ int ret;
+
+ INIT_RADIX_TREE(&bdi->cgwb_tree, GFP_ATOMIC);
+ bdi->cgwb_congested_tree = RB_ROOT;
+ atomic_set(&bdi->usage_cnt, 1);
+
+ ret = wb_init(&bdi->wb, bdi, 1, GFP_KERNEL);
+ if (!ret) {
+ bdi->wb.memcg_css = mem_cgroup_root_css;
+ bdi->wb.blkcg_css = blkcg_root_css;
+ }
+ return ret;
+}
+
+static void cgwb_bdi_destroy(struct backing_dev_info *bdi)
+{
+ struct radix_tree_iter iter;
+ struct bdi_writeback_congested *congested, *congested_n;
+ void **slot;
+
+ WARN_ON(test_bit(WB_registered, &bdi->wb.state));
+
+ spin_lock_irq(&cgwb_lock);
+
+ radix_tree_for_each_slot(slot, &bdi->cgwb_tree, &iter, 0)
+ cgwb_kill(*slot);
+ rbtree_postorder_for_each_entry_safe(congested, congested_n,
+ &bdi->cgwb_congested_tree, rb_node) {
+ rb_erase(&congested->rb_node, &bdi->cgwb_congested_tree);
+ congested->bdi = NULL; /* mark @congested unlinked */
+ }
+
+ spin_unlock_irq(&cgwb_lock);
+
+ /*
+ * All cgwb's and their congested states must be shutdown and
+ * released before returning. Drain the usage counter to wait for
+ * all cgwb's and cgwb_congested's ever created on @bdi.
+ */
+ atomic_dec(&bdi->usage_cnt);
+ wait_event(cgwb_release_wait, !atomic_read(&bdi->usage_cnt));
+}
+
+/**
+ * wb_memcg_offline - kill all wb's associated with a memcg being offlined
+ * @memcg: memcg being offlined
+ *
+ * Also prevents creation of any new wb's associated with @memcg.
+ */
+void wb_memcg_offline(struct mem_cgroup *memcg)
+{
+ LIST_HEAD(to_destroy);
+ struct list_head *memcg_cgwb_list = mem_cgroup_cgwb_list(memcg);
+ struct bdi_writeback *wb, *next;
+
+ spin_lock_irq(&cgwb_lock);
+ list_for_each_entry_safe(wb, next, memcg_cgwb_list, memcg_node)
+ cgwb_kill(wb);
+ memcg_cgwb_list->next = NULL; /* prevent new wb's */
+ spin_unlock_irq(&cgwb_lock);
+}
+
+/**
+ * wb_blkcg_offline - kill all wb's associated with a blkcg being offlined
+ * @blkcg: blkcg being offlined
+ *
+ * Also prevents creation of any new wb's associated with @blkcg.
+ */
+void wb_blkcg_offline(struct blkcg *blkcg)
+{
+ LIST_HEAD(to_destroy);
+ struct bdi_writeback *wb, *next;
+
+ spin_lock_irq(&cgwb_lock);
+ list_for_each_entry_safe(wb, next, &blkcg->cgwb_list, blkcg_node)
+ cgwb_kill(wb);
+ blkcg->cgwb_list.next = NULL; /* prevent new wb's */
+ spin_unlock_irq(&cgwb_lock);
+}
+
+#else /* CONFIG_CGROUP_WRITEBACK */
+
+static int cgwb_bdi_init(struct backing_dev_info *bdi)
+{
+ int err;
+
+ bdi->wb_congested = kzalloc(sizeof(*bdi->wb_congested), GFP_KERNEL);
+ if (!bdi->wb_congested)
+ return -ENOMEM;
+
+ err = wb_init(&bdi->wb, bdi, 1, GFP_KERNEL);
+ if (err) {
+ kfree(bdi->wb_congested);
+ return err;
+ }
+ return 0;
+}
+
+static void cgwb_bdi_destroy(struct backing_dev_info *bdi) { }
+
+#endif /* CONFIG_CGROUP_WRITEBACK */
+
+int bdi_init(struct backing_dev_info *bdi)
+{
bdi->dev = NULL;
bdi->min_ratio = 0;
bdi->max_ratio = 100;
bdi->max_prop_frac = FPROP_FRAC_BASE;
- spin_lock_init(&bdi->wb_lock);
INIT_LIST_HEAD(&bdi->bdi_list);
- INIT_LIST_HEAD(&bdi->work_list);
+ init_waitqueue_head(&bdi->wb_waitq);
- bdi_wb_init(&bdi->wb, bdi);
+ return cgwb_bdi_init(bdi);
+}
+EXPORT_SYMBOL(bdi_init);
- for (i = 0; i < NR_BDI_STAT_ITEMS; i++) {
- err = percpu_counter_init(&bdi->bdi_stat[i], 0, GFP_KERNEL);
- if (err)
- goto err;
- }
+int bdi_register(struct backing_dev_info *bdi, struct device *parent,
+ const char *fmt, ...)
+{
+ va_list args;
+ struct device *dev;
- bdi->dirty_exceeded = 0;
+ if (bdi->dev) /* The driver needs to use separate queues per device */
+ return 0;
- bdi->bw_time_stamp = jiffies;
- bdi->written_stamp = 0;
+ va_start(args, fmt);
+ dev = device_create_vargs(bdi_class, parent, MKDEV(0, 0), bdi, fmt, args);
+ va_end(args);
+ if (IS_ERR(dev))
+ return PTR_ERR(dev);
- bdi->balanced_dirty_ratelimit = INIT_BW;
- bdi->dirty_ratelimit = INIT_BW;
- bdi->write_bandwidth = INIT_BW;
- bdi->avg_write_bandwidth = INIT_BW;
+ bdi->dev = dev;
- err = fprop_local_init_percpu(&bdi->completions, GFP_KERNEL);
+ bdi_debug_register(bdi, dev_name(dev));
+ set_bit(WB_registered, &bdi->wb.state);
- if (err) {
-err:
- while (i--)
- percpu_counter_destroy(&bdi->bdi_stat[i]);
- }
+ spin_lock_bh(&bdi_lock);
+ list_add_tail_rcu(&bdi->bdi_list, &bdi_list);
+ spin_unlock_bh(&bdi_lock);
- return err;
+ trace_writeback_bdi_register(bdi);
+ return 0;
}
-EXPORT_SYMBOL(bdi_init);
+EXPORT_SYMBOL(bdi_register);
-void bdi_destroy(struct backing_dev_info *bdi)
+int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev)
{
- int i;
+ return bdi_register(bdi, NULL, "%u:%u", MAJOR(dev), MINOR(dev));
+}
+EXPORT_SYMBOL(bdi_register_dev);
- bdi_wb_shutdown(bdi);
+/*
+ * Remove bdi from bdi_list, and ensure that it is no longer visible
+ */
+static void bdi_remove_from_list(struct backing_dev_info *bdi)
+{
+ spin_lock_bh(&bdi_lock);
+ list_del_rcu(&bdi->bdi_list);
+ spin_unlock_bh(&bdi_lock);
- WARN_ON(!list_empty(&bdi->work_list));
- WARN_ON(delayed_work_pending(&bdi->wb.dwork));
+ synchronize_rcu_expedited();
+}
+
+void bdi_destroy(struct backing_dev_info *bdi)
+{
+ /* make sure nobody finds us on the bdi_list anymore */
+ bdi_remove_from_list(bdi);
+ wb_shutdown(&bdi->wb);
+ cgwb_bdi_destroy(bdi);
if (bdi->dev) {
bdi_debug_unregister(bdi);
@@ -453,9 +836,7 @@ void bdi_destroy(struct backing_dev_info *bdi)
bdi->dev = NULL;
}
- for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
- percpu_counter_destroy(&bdi->bdi_stat[i]);
- fprop_local_destroy_percpu(&bdi->completions);
+ wb_exit(&bdi->wb);
}
EXPORT_SYMBOL(bdi_destroy);
@@ -488,31 +869,31 @@ static wait_queue_head_t congestion_wqh[2] = {
__WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]),
__WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1])
};
-static atomic_t nr_bdi_congested[2];
+static atomic_t nr_wb_congested[2];
-void clear_bdi_congested(struct backing_dev_info *bdi, int sync)
+void clear_wb_congested(struct bdi_writeback_congested *congested, int sync)
{
- enum bdi_state bit;
wait_queue_head_t *wqh = &congestion_wqh[sync];
+ enum wb_state bit;
- bit = sync ? BDI_sync_congested : BDI_async_congested;
- if (test_and_clear_bit(bit, &bdi->state))
- atomic_dec(&nr_bdi_congested[sync]);
+ bit = sync ? WB_sync_congested : WB_async_congested;
+ if (test_and_clear_bit(bit, &congested->state))
+ atomic_dec(&nr_wb_congested[sync]);
smp_mb__after_atomic();
if (waitqueue_active(wqh))
wake_up(wqh);
}
-EXPORT_SYMBOL(clear_bdi_congested);
+EXPORT_SYMBOL(clear_wb_congested);
-void set_bdi_congested(struct backing_dev_info *bdi, int sync)
+void set_wb_congested(struct bdi_writeback_congested *congested, int sync)
{
- enum bdi_state bit;
+ enum wb_state bit;
- bit = sync ? BDI_sync_congested : BDI_async_congested;
- if (!test_and_set_bit(bit, &bdi->state))
- atomic_inc(&nr_bdi_congested[sync]);
+ bit = sync ? WB_sync_congested : WB_async_congested;
+ if (!test_and_set_bit(bit, &congested->state))
+ atomic_inc(&nr_wb_congested[sync]);
}
-EXPORT_SYMBOL(set_bdi_congested);
+EXPORT_SYMBOL(set_wb_congested);
/**
* congestion_wait - wait for a backing_dev to become uncongested
@@ -571,7 +952,7 @@ long wait_iff_congested(struct zone *zone, int sync, long timeout)
* encountered in the current zone, yield if necessary instead
* of sleeping on the congestion queue
*/
- if (atomic_read(&nr_bdi_congested[sync]) == 0 ||
+ if (atomic_read(&nr_wb_congested[sync]) == 0 ||
!test_bit(ZONE_CONGESTED, &zone->flags)) {
cond_resched();
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 477be696511d..a23dd1934654 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -164,7 +164,7 @@ void __init free_bootmem_late(unsigned long physaddr, unsigned long size)
end = PFN_DOWN(physaddr + size);
for (; cursor < end; cursor++) {
- __free_pages_bootmem(pfn_to_page(cursor), 0);
+ __free_pages_bootmem(pfn_to_page(cursor), cursor, 0);
totalram_pages++;
}
}
@@ -172,7 +172,7 @@ void __init free_bootmem_late(unsigned long physaddr, unsigned long size)
static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
{
struct page *page;
- unsigned long *map, start, end, pages, count = 0;
+ unsigned long *map, start, end, pages, cur, count = 0;
if (!bdata->node_bootmem_map)
return 0;
@@ -210,17 +210,17 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
if (IS_ALIGNED(start, BITS_PER_LONG) && vec == ~0UL) {
int order = ilog2(BITS_PER_LONG);
- __free_pages_bootmem(pfn_to_page(start), order);
+ __free_pages_bootmem(pfn_to_page(start), start, order);
count += BITS_PER_LONG;
start += BITS_PER_LONG;
} else {
- unsigned long cur = start;
+ cur = start;
start = ALIGN(start + 1, BITS_PER_LONG);
while (vec && cur != start) {
if (vec & 1) {
page = pfn_to_page(cur);
- __free_pages_bootmem(page, 0);
+ __free_pages_bootmem(page, cur, 0);
count++;
}
vec >>= 1;
@@ -229,12 +229,13 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
}
}
+ cur = bdata->node_min_pfn;
page = virt_to_page(bdata->node_bootmem_map);
pages = bdata->node_low_pfn - bdata->node_min_pfn;
pages = bootmem_bootmap_pages(pages);
count += pages;
while (pages--)
- __free_pages_bootmem(page++, 0);
+ __free_pages_bootmem(page++, cur++, 0);
bdebug("nid=%td released=%lx\n", bdata - bootmem_node_data, count);
diff --git a/mm/cleancache.c b/mm/cleancache.c
index 053bcd8f12fb..8fc50811119b 100644
--- a/mm/cleancache.c
+++ b/mm/cleancache.c
@@ -19,7 +19,7 @@
#include <linux/cleancache.h>
/*
- * cleancache_ops is set by cleancache_ops_register to contain the pointers
+ * cleancache_ops is set by cleancache_register_ops to contain the pointers
* to the cleancache "backend" implementation functions.
*/
static struct cleancache_ops *cleancache_ops __read_mostly;
@@ -34,145 +34,107 @@ static u64 cleancache_failed_gets;
static u64 cleancache_puts;
static u64 cleancache_invalidates;
-/*
- * When no backend is registered all calls to init_fs and init_shared_fs
- * are registered and fake poolids (FAKE_FS_POOLID_OFFSET or
- * FAKE_SHARED_FS_POOLID_OFFSET, plus offset in the respective array
- * [shared_|]fs_poolid_map) are given to the respective super block
- * (sb->cleancache_poolid) and no tmem_pools are created. When a backend
- * registers with cleancache the previous calls to init_fs and init_shared_fs
- * are executed to create tmem_pools and set the respective poolids. While no
- * backend is registered all "puts", "gets" and "flushes" are ignored or failed.
- */
-#define MAX_INITIALIZABLE_FS 32
-#define FAKE_FS_POOLID_OFFSET 1000
-#define FAKE_SHARED_FS_POOLID_OFFSET 2000
-
-#define FS_NO_BACKEND (-1)
-#define FS_UNKNOWN (-2)
-static int fs_poolid_map[MAX_INITIALIZABLE_FS];
-static int shared_fs_poolid_map[MAX_INITIALIZABLE_FS];
-static char *uuids[MAX_INITIALIZABLE_FS];
-/*
- * Mutex for the [shared_|]fs_poolid_map to guard against multiple threads
- * invoking umount (and ending in __cleancache_invalidate_fs) and also multiple
- * threads calling mount (and ending up in __cleancache_init_[shared|]fs).
- */
-static DEFINE_MUTEX(poolid_mutex);
-/*
- * When set to false (default) all calls to the cleancache functions, except
- * the __cleancache_invalidate_fs and __cleancache_init_[shared|]fs are guarded
- * by the if (!cleancache_ops) return. This means multiple threads (from
- * different filesystems) will be checking cleancache_ops. The usage of a
- * bool instead of a atomic_t or a bool guarded by a spinlock is OK - we are
- * OK if the time between the backend's have been initialized (and
- * cleancache_ops has been set to not NULL) and when the filesystems start
- * actually calling the backends. The inverse (when unloading) is obviously
- * not good - but this shim does not do that (yet).
- */
-
-/*
- * The backends and filesystems work all asynchronously. This is b/c the
- * backends can be built as modules.
- * The usual sequence of events is:
- * a) mount / -> __cleancache_init_fs is called. We set the
- * [shared_|]fs_poolid_map and uuids for.
- *
- * b). user does I/Os -> we call the rest of __cleancache_* functions
- * which return immediately as cleancache_ops is false.
- *
- * c). modprobe zcache -> cleancache_register_ops. We init the backend
- * and set cleancache_ops to true, and for any fs_poolid_map
- * (which is set by __cleancache_init_fs) we initialize the poolid.
- *
- * d). user does I/Os -> now that cleancache_ops is true all the
- * __cleancache_* functions can call the backend. They all check
- * that fs_poolid_map is valid and if so invoke the backend.
- *
- * e). umount / -> __cleancache_invalidate_fs, the fs_poolid_map is
- * reset (which is the second check in the __cleancache_* ops
- * to call the backend).
- *
- * The sequence of event could also be c), followed by a), and d). and e). The
- * c) would not happen anymore. There is also the chance of c), and one thread
- * doing a) + d), and another doing e). For that case we depend on the
- * filesystem calling __cleancache_invalidate_fs in the proper sequence (so
- * that it handles all I/Os before it invalidates the fs (which is last part
- * of unmounting process).
- *
- * Note: The acute reader will notice that there is no "rmmod zcache" case.
- * This is b/c the functionality for that is not yet implemented and when
- * done, will require some extra locking not yet devised.
- */
+static void cleancache_register_ops_sb(struct super_block *sb, void *unused)
+{
+ switch (sb->cleancache_poolid) {
+ case CLEANCACHE_NO_BACKEND:
+ __cleancache_init_fs(sb);
+ break;
+ case CLEANCACHE_NO_BACKEND_SHARED:
+ __cleancache_init_shared_fs(sb);
+ break;
+ }
+}
/*
- * Register operations for cleancache, returning previous thus allowing
- * detection of multiple backends and possible nesting.
+ * Register operations for cleancache. Returns 0 on success.
*/
-struct cleancache_ops *cleancache_register_ops(struct cleancache_ops *ops)
+int cleancache_register_ops(struct cleancache_ops *ops)
{
- struct cleancache_ops *old = cleancache_ops;
- int i;
+ if (cmpxchg(&cleancache_ops, NULL, ops))
+ return -EBUSY;
- mutex_lock(&poolid_mutex);
- for (i = 0; i < MAX_INITIALIZABLE_FS; i++) {
- if (fs_poolid_map[i] == FS_NO_BACKEND)
- fs_poolid_map[i] = ops->init_fs(PAGE_SIZE);
- if (shared_fs_poolid_map[i] == FS_NO_BACKEND)
- shared_fs_poolid_map[i] = ops->init_shared_fs
- (uuids[i], PAGE_SIZE);
- }
/*
- * We MUST set cleancache_ops _after_ we have called the backends
- * init_fs or init_shared_fs functions. Otherwise the compiler might
- * re-order where cleancache_ops is set in this function.
+ * A cleancache backend can be built as a module and hence loaded after
+ * a cleancache enabled filesystem has called cleancache_init_fs. To
+ * handle such a scenario, here we call ->init_fs or ->init_shared_fs
+ * for each active super block. To differentiate between local and
+ * shared filesystems, we temporarily initialize sb->cleancache_poolid
+ * to CLEANCACHE_NO_BACKEND or CLEANCACHE_NO_BACKEND_SHARED
+ * respectively in case there is no backend registered at the time
+ * cleancache_init_fs or cleancache_init_shared_fs is called.
+ *
+ * Since filesystems can be mounted concurrently with cleancache
+ * backend registration, we have to be careful to guarantee that all
+ * cleancache enabled filesystems that has been mounted by the time
+ * cleancache_register_ops is called has got and all mounted later will
+ * get cleancache_poolid. This is assured by the following statements
+ * tied together:
+ *
+ * a) iterate_supers skips only those super blocks that has started
+ * ->kill_sb
+ *
+ * b) if iterate_supers encounters a super block that has not finished
+ * ->mount yet, it waits until it is finished
+ *
+ * c) cleancache_init_fs is called from ->mount and
+ * cleancache_invalidate_fs is called from ->kill_sb
+ *
+ * d) we call iterate_supers after cleancache_ops has been set
+ *
+ * From a) it follows that if iterate_supers skips a super block, then
+ * either the super block is already dead, in which case we do not need
+ * to bother initializing cleancache for it, or it was mounted after we
+ * initiated iterate_supers. In the latter case, it must have seen
+ * cleancache_ops set according to d) and initialized cleancache from
+ * ->mount by itself according to c). This proves that we call
+ * ->init_fs at least once for each active super block.
+ *
+ * From b) and c) it follows that if iterate_supers encounters a super
+ * block that has already started ->init_fs, it will wait until ->mount
+ * and hence ->init_fs has finished, then check cleancache_poolid, see
+ * that it has already been set and therefore do nothing. This proves
+ * that we call ->init_fs no more than once for each super block.
+ *
+ * Combined together, the last two paragraphs prove the function
+ * correctness.
+ *
+ * Note that various cleancache callbacks may proceed before this
+ * function is called or even concurrently with it, but since
+ * CLEANCACHE_NO_BACKEND is negative, they will all result in a noop
+ * until the corresponding ->init_fs has been actually called and
+ * cleancache_ops has been set.
*/
- barrier();
- cleancache_ops = ops;
- mutex_unlock(&poolid_mutex);
- return old;
+ iterate_supers(cleancache_register_ops_sb, NULL);
+ return 0;
}
EXPORT_SYMBOL(cleancache_register_ops);
/* Called by a cleancache-enabled filesystem at time of mount */
void __cleancache_init_fs(struct super_block *sb)
{
- int i;
+ int pool_id = CLEANCACHE_NO_BACKEND;
- mutex_lock(&poolid_mutex);
- for (i = 0; i < MAX_INITIALIZABLE_FS; i++) {
- if (fs_poolid_map[i] == FS_UNKNOWN) {
- sb->cleancache_poolid = i + FAKE_FS_POOLID_OFFSET;
- if (cleancache_ops)
- fs_poolid_map[i] = cleancache_ops->init_fs(PAGE_SIZE);
- else
- fs_poolid_map[i] = FS_NO_BACKEND;
- break;
- }
+ if (cleancache_ops) {
+ pool_id = cleancache_ops->init_fs(PAGE_SIZE);
+ if (pool_id < 0)
+ pool_id = CLEANCACHE_NO_POOL;
}
- mutex_unlock(&poolid_mutex);
+ sb->cleancache_poolid = pool_id;
}
EXPORT_SYMBOL(__cleancache_init_fs);
/* Called by a cleancache-enabled clustered filesystem at time of mount */
-void __cleancache_init_shared_fs(char *uuid, struct super_block *sb)
+void __cleancache_init_shared_fs(struct super_block *sb)
{
- int i;
+ int pool_id = CLEANCACHE_NO_BACKEND_SHARED;
- mutex_lock(&poolid_mutex);
- for (i = 0; i < MAX_INITIALIZABLE_FS; i++) {
- if (shared_fs_poolid_map[i] == FS_UNKNOWN) {
- sb->cleancache_poolid = i + FAKE_SHARED_FS_POOLID_OFFSET;
- uuids[i] = uuid;
- if (cleancache_ops)
- shared_fs_poolid_map[i] = cleancache_ops->init_shared_fs
- (uuid, PAGE_SIZE);
- else
- shared_fs_poolid_map[i] = FS_NO_BACKEND;
- break;
- }
+ if (cleancache_ops) {
+ pool_id = cleancache_ops->init_shared_fs(sb->s_uuid, PAGE_SIZE);
+ if (pool_id < 0)
+ pool_id = CLEANCACHE_NO_POOL;
}
- mutex_unlock(&poolid_mutex);
+ sb->cleancache_poolid = pool_id;
}
EXPORT_SYMBOL(__cleancache_init_shared_fs);
@@ -202,19 +164,6 @@ static int cleancache_get_key(struct inode *inode,
}
/*
- * Returns a pool_id that is associated with a given fake poolid.
- */
-static int get_poolid_from_fake(int fake_pool_id)
-{
- if (fake_pool_id >= FAKE_SHARED_FS_POOLID_OFFSET)
- return shared_fs_poolid_map[fake_pool_id -
- FAKE_SHARED_FS_POOLID_OFFSET];
- else if (fake_pool_id >= FAKE_FS_POOLID_OFFSET)
- return fs_poolid_map[fake_pool_id - FAKE_FS_POOLID_OFFSET];
- return FS_NO_BACKEND;
-}
-
-/*
* "Get" data from cleancache associated with the poolid/inode/index
* that were specified when the data was put to cleanache and, if
* successful, use it to fill the specified page with data and return 0.
@@ -229,7 +178,6 @@ int __cleancache_get_page(struct page *page)
{
int ret = -1;
int pool_id;
- int fake_pool_id;
struct cleancache_filekey key = { .u.key = { 0 } };
if (!cleancache_ops) {
@@ -238,17 +186,14 @@ int __cleancache_get_page(struct page *page)
}
VM_BUG_ON_PAGE(!PageLocked(page), page);
- fake_pool_id = page->mapping->host->i_sb->cleancache_poolid;
- if (fake_pool_id < 0)
+ pool_id = page->mapping->host->i_sb->cleancache_poolid;
+ if (pool_id < 0)
goto out;
- pool_id = get_poolid_from_fake(fake_pool_id);
if (cleancache_get_key(page->mapping->host, &key) < 0)
goto out;
- if (pool_id >= 0)
- ret = cleancache_ops->get_page(pool_id,
- key, page->index, page);
+ ret = cleancache_ops->get_page(pool_id, key, page->index, page);
if (ret == 0)
cleancache_succ_gets++;
else
@@ -271,7 +216,6 @@ EXPORT_SYMBOL(__cleancache_get_page);
void __cleancache_put_page(struct page *page)
{
int pool_id;
- int fake_pool_id;
struct cleancache_filekey key = { .u.key = { 0 } };
if (!cleancache_ops) {
@@ -280,12 +224,7 @@ void __cleancache_put_page(struct page *page)
}
VM_BUG_ON_PAGE(!PageLocked(page), page);
- fake_pool_id = page->mapping->host->i_sb->cleancache_poolid;
- if (fake_pool_id < 0)
- return;
-
- pool_id = get_poolid_from_fake(fake_pool_id);
-
+ pool_id = page->mapping->host->i_sb->cleancache_poolid;
if (pool_id >= 0 &&
cleancache_get_key(page->mapping->host, &key) >= 0) {
cleancache_ops->put_page(pool_id, key, page->index, page);
@@ -306,18 +245,13 @@ void __cleancache_invalidate_page(struct address_space *mapping,
struct page *page)
{
/* careful... page->mapping is NULL sometimes when this is called */
- int pool_id;
- int fake_pool_id = mapping->host->i_sb->cleancache_poolid;
+ int pool_id = mapping->host->i_sb->cleancache_poolid;
struct cleancache_filekey key = { .u.key = { 0 } };
if (!cleancache_ops)
return;
- if (fake_pool_id >= 0) {
- pool_id = get_poolid_from_fake(fake_pool_id);
- if (pool_id < 0)
- return;
-
+ if (pool_id >= 0) {
VM_BUG_ON_PAGE(!PageLocked(page), page);
if (cleancache_get_key(mapping->host, &key) >= 0) {
cleancache_ops->invalidate_page(pool_id,
@@ -339,18 +273,12 @@ EXPORT_SYMBOL(__cleancache_invalidate_page);
*/
void __cleancache_invalidate_inode(struct address_space *mapping)
{
- int pool_id;
- int fake_pool_id = mapping->host->i_sb->cleancache_poolid;
+ int pool_id = mapping->host->i_sb->cleancache_poolid;
struct cleancache_filekey key = { .u.key = { 0 } };
if (!cleancache_ops)
return;
- if (fake_pool_id < 0)
- return;
-
- pool_id = get_poolid_from_fake(fake_pool_id);
-
if (pool_id >= 0 && cleancache_get_key(mapping->host, &key) >= 0)
cleancache_ops->invalidate_inode(pool_id, key);
}
@@ -363,32 +291,18 @@ EXPORT_SYMBOL(__cleancache_invalidate_inode);
*/
void __cleancache_invalidate_fs(struct super_block *sb)
{
- int index;
- int fake_pool_id = sb->cleancache_poolid;
- int old_poolid = fake_pool_id;
+ int pool_id;
- mutex_lock(&poolid_mutex);
- if (fake_pool_id >= FAKE_SHARED_FS_POOLID_OFFSET) {
- index = fake_pool_id - FAKE_SHARED_FS_POOLID_OFFSET;
- old_poolid = shared_fs_poolid_map[index];
- shared_fs_poolid_map[index] = FS_UNKNOWN;
- uuids[index] = NULL;
- } else if (fake_pool_id >= FAKE_FS_POOLID_OFFSET) {
- index = fake_pool_id - FAKE_FS_POOLID_OFFSET;
- old_poolid = fs_poolid_map[index];
- fs_poolid_map[index] = FS_UNKNOWN;
- }
- sb->cleancache_poolid = -1;
- if (cleancache_ops)
- cleancache_ops->invalidate_fs(old_poolid);
- mutex_unlock(&poolid_mutex);
+ pool_id = sb->cleancache_poolid;
+ sb->cleancache_poolid = CLEANCACHE_NO_POOL;
+
+ if (cleancache_ops && pool_id >= 0)
+ cleancache_ops->invalidate_fs(pool_id);
}
EXPORT_SYMBOL(__cleancache_invalidate_fs);
static int __init init_cleancache(void)
{
- int i;
-
#ifdef CONFIG_DEBUG_FS
struct dentry *root = debugfs_create_dir("cleancache", NULL);
if (root == NULL)
@@ -400,10 +314,6 @@ static int __init init_cleancache(void)
debugfs_create_u64("invalidates", S_IRUGO,
root, &cleancache_invalidates);
#endif
- for (i = 0; i < MAX_INITIALIZABLE_FS; i++) {
- fs_poolid_map[i] = FS_UNKNOWN;
- shared_fs_poolid_map[i] = FS_UNKNOWN;
- }
return 0;
}
module_init(init_cleancache)
diff --git a/mm/cma.c b/mm/cma.c
index 68ecb7a42983..e7d1db533025 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -23,6 +23,7 @@
# define DEBUG
#endif
#endif
+#define CREATE_TRACE_POINTS
#include <linux/memblock.h>
#include <linux/err.h>
@@ -34,30 +35,26 @@
#include <linux/cma.h>
#include <linux/highmem.h>
#include <linux/io.h>
+#include <trace/events/cma.h>
-struct cma {
- unsigned long base_pfn;
- unsigned long count;
- unsigned long *bitmap;
- unsigned int order_per_bit; /* Order of pages represented by one bit */
- struct mutex lock;
-};
+#include "cma.h"
-static struct cma cma_areas[MAX_CMA_AREAS];
-static unsigned cma_area_count;
+struct cma cma_areas[MAX_CMA_AREAS];
+unsigned cma_area_count;
static DEFINE_MUTEX(cma_mutex);
-phys_addr_t cma_get_base(struct cma *cma)
+phys_addr_t cma_get_base(const struct cma *cma)
{
return PFN_PHYS(cma->base_pfn);
}
-unsigned long cma_get_size(struct cma *cma)
+unsigned long cma_get_size(const struct cma *cma)
{
return cma->count << PAGE_SHIFT;
}
-static unsigned long cma_bitmap_aligned_mask(struct cma *cma, int align_order)
+static unsigned long cma_bitmap_aligned_mask(const struct cma *cma,
+ int align_order)
{
if (align_order <= cma->order_per_bit)
return 0;
@@ -68,7 +65,8 @@ static unsigned long cma_bitmap_aligned_mask(struct cma *cma, int align_order)
* Find a PFN aligned to the specified order and return an offset represented in
* order_per_bits.
*/
-static unsigned long cma_bitmap_aligned_offset(struct cma *cma, int align_order)
+static unsigned long cma_bitmap_aligned_offset(const struct cma *cma,
+ int align_order)
{
if (align_order <= cma->order_per_bit)
return 0;
@@ -77,18 +75,14 @@ static unsigned long cma_bitmap_aligned_offset(struct cma *cma, int align_order)
- cma->base_pfn) >> cma->order_per_bit;
}
-static unsigned long cma_bitmap_maxno(struct cma *cma)
-{
- return cma->count >> cma->order_per_bit;
-}
-
-static unsigned long cma_bitmap_pages_to_bits(struct cma *cma,
- unsigned long pages)
+static unsigned long cma_bitmap_pages_to_bits(const struct cma *cma,
+ unsigned long pages)
{
return ALIGN(pages, 1UL << cma->order_per_bit) >> cma->order_per_bit;
}
-static void cma_clear_bitmap(struct cma *cma, unsigned long pfn, int count)
+static void cma_clear_bitmap(struct cma *cma, unsigned long pfn,
+ unsigned int count)
{
unsigned long bitmap_no, bitmap_count;
@@ -134,6 +128,12 @@ static int __init cma_activate_area(struct cma *cma)
} while (--i);
mutex_init(&cma->lock);
+
+#ifdef CONFIG_CMA_DEBUGFS
+ INIT_HLIST_HEAD(&cma->mem_head);
+ spin_lock_init(&cma->mem_head_lock);
+#endif
+
return 0;
err:
@@ -167,7 +167,8 @@ core_initcall(cma_init_reserved_areas);
* This function creates custom contiguous area from already reserved memory.
*/
int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size,
- int order_per_bit, struct cma **res_cma)
+ unsigned int order_per_bit,
+ struct cma **res_cma)
{
struct cma *cma;
phys_addr_t alignment;
@@ -181,7 +182,7 @@ int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size,
if (!size || !memblock_is_region_reserved(base, size))
return -EINVAL;
- /* ensure minimal alignment requied by mm core */
+ /* ensure minimal alignment required by mm core */
alignment = PAGE_SIZE << max(MAX_ORDER - 1, pageblock_order);
/* alignment should be aligned with order_per_bit */
@@ -237,7 +238,7 @@ int __init cma_declare_contiguous(phys_addr_t base,
/*
* high_memory isn't direct mapped memory so retrieving its physical
* address isn't appropriate. But it would be useful to check the
- * physical address of the highmem boundary so it's justfiable to get
+ * physical address of the highmem boundary so it's justifiable to get
* the physical address from it. On x86 there is a validation check for
* this case, so the following workaround is needed to avoid it.
*/
@@ -315,13 +316,15 @@ int __init cma_declare_contiguous(phys_addr_t base,
*/
if (base < highmem_start && limit > highmem_start) {
addr = memblock_alloc_range(size, alignment,
- highmem_start, limit);
+ highmem_start, limit,
+ MEMBLOCK_NONE);
limit = highmem_start;
}
if (!addr) {
addr = memblock_alloc_range(size, alignment, base,
- limit);
+ limit,
+ MEMBLOCK_NONE);
if (!addr) {
ret = -ENOMEM;
goto err;
@@ -358,7 +361,7 @@ err:
* This function allocates part of contiguous memory on specific
* contiguous memory area.
*/
-struct page *cma_alloc(struct cma *cma, int count, unsigned int align)
+struct page *cma_alloc(struct cma *cma, unsigned int count, unsigned int align)
{
unsigned long mask, offset, pfn, start = 0;
unsigned long bitmap_maxno, bitmap_no, bitmap_count;
@@ -415,6 +418,8 @@ struct page *cma_alloc(struct cma *cma, int count, unsigned int align)
start = bitmap_no + mask + 1;
}
+ trace_cma_alloc(page ? pfn : -1UL, page, count, align);
+
pr_debug("%s(): returned %p\n", __func__, page);
return page;
}
@@ -429,7 +434,7 @@ struct page *cma_alloc(struct cma *cma, int count, unsigned int align)
* It returns false when provided pages do not belong to contiguous area and
* true otherwise.
*/
-bool cma_release(struct cma *cma, struct page *pages, int count)
+bool cma_release(struct cma *cma, const struct page *pages, unsigned int count)
{
unsigned long pfn;
@@ -447,6 +452,7 @@ bool cma_release(struct cma *cma, struct page *pages, int count)
free_contig_range(pfn, count);
cma_clear_bitmap(cma, pfn, count);
+ trace_cma_release(pfn, pages, count);
return true;
}
diff --git a/mm/cma.h b/mm/cma.h
new file mode 100644
index 000000000000..1132d733556d
--- /dev/null
+++ b/mm/cma.h
@@ -0,0 +1,24 @@
+#ifndef __MM_CMA_H__
+#define __MM_CMA_H__
+
+struct cma {
+ unsigned long base_pfn;
+ unsigned long count;
+ unsigned long *bitmap;
+ unsigned int order_per_bit; /* Order of pages represented by one bit */
+ struct mutex lock;
+#ifdef CONFIG_CMA_DEBUGFS
+ struct hlist_head mem_head;
+ spinlock_t mem_head_lock;
+#endif
+};
+
+extern struct cma cma_areas[MAX_CMA_AREAS];
+extern unsigned cma_area_count;
+
+static unsigned long cma_bitmap_maxno(struct cma *cma)
+{
+ return cma->count >> cma->order_per_bit;
+}
+
+#endif
diff --git a/mm/cma_debug.c b/mm/cma_debug.c
new file mode 100644
index 000000000000..f8e4b60db167
--- /dev/null
+++ b/mm/cma_debug.c
@@ -0,0 +1,206 @@
+/*
+ * CMA DebugFS Interface
+ *
+ * Copyright (c) 2015 Sasha Levin <sasha.levin@oracle.com>
+ */
+
+
+#include <linux/debugfs.h>
+#include <linux/cma.h>
+#include <linux/list.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/mm_types.h>
+
+#include "cma.h"
+
+struct cma_mem {
+ struct hlist_node node;
+ struct page *p;
+ unsigned long n;
+};
+
+static struct dentry *cma_debugfs_root;
+
+static int cma_debugfs_get(void *data, u64 *val)
+{
+ unsigned long *p = data;
+
+ *val = *p;
+
+ return 0;
+}
+DEFINE_SIMPLE_ATTRIBUTE(cma_debugfs_fops, cma_debugfs_get, NULL, "%llu\n");
+
+static int cma_used_get(void *data, u64 *val)
+{
+ struct cma *cma = data;
+ unsigned long used;
+
+ mutex_lock(&cma->lock);
+ /* pages counter is smaller than sizeof(int) */
+ used = bitmap_weight(cma->bitmap, (int)cma_bitmap_maxno(cma));
+ mutex_unlock(&cma->lock);
+ *val = (u64)used << cma->order_per_bit;
+
+ return 0;
+}
+DEFINE_SIMPLE_ATTRIBUTE(cma_used_fops, cma_used_get, NULL, "%llu\n");
+
+static int cma_maxchunk_get(void *data, u64 *val)
+{
+ struct cma *cma = data;
+ unsigned long maxchunk = 0;
+ unsigned long start, end = 0;
+ unsigned long bitmap_maxno = cma_bitmap_maxno(cma);
+
+ mutex_lock(&cma->lock);
+ for (;;) {
+ start = find_next_zero_bit(cma->bitmap, bitmap_maxno, end);
+ if (start >= cma->count)
+ break;
+ end = find_next_bit(cma->bitmap, bitmap_maxno, start);
+ maxchunk = max(end - start, maxchunk);
+ }
+ mutex_unlock(&cma->lock);
+ *val = (u64)maxchunk << cma->order_per_bit;
+
+ return 0;
+}
+DEFINE_SIMPLE_ATTRIBUTE(cma_maxchunk_fops, cma_maxchunk_get, NULL, "%llu\n");
+
+static void cma_add_to_cma_mem_list(struct cma *cma, struct cma_mem *mem)
+{
+ spin_lock(&cma->mem_head_lock);
+ hlist_add_head(&mem->node, &cma->mem_head);
+ spin_unlock(&cma->mem_head_lock);
+}
+
+static struct cma_mem *cma_get_entry_from_list(struct cma *cma)
+{
+ struct cma_mem *mem = NULL;
+
+ spin_lock(&cma->mem_head_lock);
+ if (!hlist_empty(&cma->mem_head)) {
+ mem = hlist_entry(cma->mem_head.first, struct cma_mem, node);
+ hlist_del_init(&mem->node);
+ }
+ spin_unlock(&cma->mem_head_lock);
+
+ return mem;
+}
+
+static int cma_free_mem(struct cma *cma, int count)
+{
+ struct cma_mem *mem = NULL;
+
+ while (count) {
+ mem = cma_get_entry_from_list(cma);
+ if (mem == NULL)
+ return 0;
+
+ if (mem->n <= count) {
+ cma_release(cma, mem->p, mem->n);
+ count -= mem->n;
+ kfree(mem);
+ } else if (cma->order_per_bit == 0) {
+ cma_release(cma, mem->p, count);
+ mem->p += count;
+ mem->n -= count;
+ count = 0;
+ cma_add_to_cma_mem_list(cma, mem);
+ } else {
+ pr_debug("cma: cannot release partial block when order_per_bit != 0\n");
+ cma_add_to_cma_mem_list(cma, mem);
+ break;
+ }
+ }
+
+ return 0;
+
+}
+
+static int cma_free_write(void *data, u64 val)
+{
+ int pages = val;
+ struct cma *cma = data;
+
+ return cma_free_mem(cma, pages);
+}
+DEFINE_SIMPLE_ATTRIBUTE(cma_free_fops, NULL, cma_free_write, "%llu\n");
+
+static int cma_alloc_mem(struct cma *cma, int count)
+{
+ struct cma_mem *mem;
+ struct page *p;
+
+ mem = kzalloc(sizeof(*mem), GFP_KERNEL);
+ if (!mem)
+ return -ENOMEM;
+
+ p = cma_alloc(cma, count, 0);
+ if (!p) {
+ kfree(mem);
+ return -ENOMEM;
+ }
+
+ mem->p = p;
+ mem->n = count;
+
+ cma_add_to_cma_mem_list(cma, mem);
+
+ return 0;
+}
+
+static int cma_alloc_write(void *data, u64 val)
+{
+ int pages = val;
+ struct cma *cma = data;
+
+ return cma_alloc_mem(cma, pages);
+}
+DEFINE_SIMPLE_ATTRIBUTE(cma_alloc_fops, NULL, cma_alloc_write, "%llu\n");
+
+static void cma_debugfs_add_one(struct cma *cma, int idx)
+{
+ struct dentry *tmp;
+ char name[16];
+ int u32s;
+
+ sprintf(name, "cma-%d", idx);
+
+ tmp = debugfs_create_dir(name, cma_debugfs_root);
+
+ debugfs_create_file("alloc", S_IWUSR, tmp, cma,
+ &cma_alloc_fops);
+
+ debugfs_create_file("free", S_IWUSR, tmp, cma,
+ &cma_free_fops);
+
+ debugfs_create_file("base_pfn", S_IRUGO, tmp,
+ &cma->base_pfn, &cma_debugfs_fops);
+ debugfs_create_file("count", S_IRUGO, tmp,
+ &cma->count, &cma_debugfs_fops);
+ debugfs_create_file("order_per_bit", S_IRUGO, tmp,
+ &cma->order_per_bit, &cma_debugfs_fops);
+ debugfs_create_file("used", S_IRUGO, tmp, cma, &cma_used_fops);
+ debugfs_create_file("maxchunk", S_IRUGO, tmp, cma, &cma_maxchunk_fops);
+
+ u32s = DIV_ROUND_UP(cma_bitmap_maxno(cma), BITS_PER_BYTE * sizeof(u32));
+ debugfs_create_u32_array("bitmap", S_IRUGO, tmp, (u32*)cma->bitmap, u32s);
+}
+
+static int __init cma_debugfs_init(void)
+{
+ int i;
+
+ cma_debugfs_root = debugfs_create_dir("cma", NULL);
+ if (!cma_debugfs_root)
+ return -ENOMEM;
+
+ for (i = 0; i < cma_area_count; i++)
+ cma_debugfs_add_one(&cma_areas[i], i);
+
+ return 0;
+}
+late_initcall(cma_debugfs_init);
diff --git a/mm/compaction.c b/mm/compaction.c
index 8c0d9459b54a..018f08da99a2 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -391,28 +391,6 @@ static inline bool compact_should_abort(struct compact_control *cc)
return false;
}
-/* Returns true if the page is within a block suitable for migration to */
-static bool suitable_migration_target(struct page *page)
-{
- /* If the page is a large free page, then disallow migration */
- if (PageBuddy(page)) {
- /*
- * We are checking page_order without zone->lock taken. But
- * the only small danger is that we skip a potentially suitable
- * pageblock, so it's not worth to check order for valid range.
- */
- if (page_order_unsafe(page) >= pageblock_order)
- return false;
- }
-
- /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */
- if (migrate_async_suitable(get_pageblock_migratetype(page)))
- return true;
-
- /* Otherwise skip the block */
- return false;
-}
-
/*
* Isolate free pages onto a private freelist. If @strict is true, will abort
* returning 0 on any invalid PFNs or non-free pages inside of the pageblock
@@ -896,6 +874,29 @@ isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn,
#endif /* CONFIG_COMPACTION || CONFIG_CMA */
#ifdef CONFIG_COMPACTION
+
+/* Returns true if the page is within a block suitable for migration to */
+static bool suitable_migration_target(struct page *page)
+{
+ /* If the page is a large free page, then disallow migration */
+ if (PageBuddy(page)) {
+ /*
+ * We are checking page_order without zone->lock taken. But
+ * the only small danger is that we skip a potentially suitable
+ * pageblock, so it's not worth to check order for valid range.
+ */
+ if (page_order_unsafe(page) >= pageblock_order)
+ return false;
+ }
+
+ /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */
+ if (migrate_async_suitable(get_pageblock_migratetype(page)))
+ return true;
+
+ /* Otherwise skip the block */
+ return false;
+}
+
/*
* Based on information in the current compact_control, find blocks
* suitable for isolating free pages from and then isolate them.
@@ -1047,6 +1048,12 @@ typedef enum {
} isolate_migrate_t;
/*
+ * Allow userspace to control policy on scanning the unevictable LRU for
+ * compactable pages.
+ */
+int sysctl_compact_unevictable_allowed __read_mostly = 1;
+
+/*
* Isolate all pages that can be migrated from the first suitable block,
* starting at the block pointed to by the migrate scanner pfn within
* compact_control.
@@ -1057,6 +1064,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
unsigned long low_pfn, end_pfn;
struct page *page;
const isolate_mode_t isolate_mode =
+ (sysctl_compact_unevictable_allowed ? ISOLATE_UNEVICTABLE : 0) |
(cc->mode == MIGRATE_ASYNC ? ISOLATE_ASYNC_MIGRATE : 0);
/*
@@ -1174,13 +1182,24 @@ static int __compact_finished(struct zone *zone, struct compact_control *cc,
/* Direct compactor: Is a suitable page free? */
for (order = cc->order; order < MAX_ORDER; order++) {
struct free_area *area = &zone->free_area[order];
+ bool can_steal;
/* Job done if page is free of the right migratetype */
if (!list_empty(&area->free_list[migratetype]))
return COMPACT_PARTIAL;
- /* Job done if allocation would set block type */
- if (order >= pageblock_order && area->nr_free)
+#ifdef CONFIG_CMA
+ /* MIGRATE_MOVABLE can fallback on MIGRATE_CMA */
+ if (migratetype == MIGRATE_MOVABLE &&
+ !list_empty(&area->free_list[MIGRATE_CMA]))
+ return COMPACT_PARTIAL;
+#endif
+ /*
+ * Job done if allocation would steal freepages from
+ * other migratetype buddy lists.
+ */
+ if (find_suitable_fallback(area, order, migratetype,
+ true, &can_steal) != -1)
return COMPACT_PARTIAL;
}
@@ -1587,6 +1606,14 @@ static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc)
INIT_LIST_HEAD(&cc->freepages);
INIT_LIST_HEAD(&cc->migratepages);
+ /*
+ * When called via /proc/sys/vm/compact_memory
+ * this makes sure we compact the whole zone regardless of
+ * cached scanner positions.
+ */
+ if (cc->order == -1)
+ __reset_isolation_suitable(zone);
+
if (cc->order == -1 || !compaction_deferred(zone, cc->order))
compact_zone(zone, cc);
diff --git a/mm/debug.c b/mm/debug.c
index 3eb3ac2fcee7..76089ddf99ea 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -7,7 +7,7 @@
#include <linux/kernel.h>
#include <linux/mm.h>
-#include <linux/ftrace_event.h>
+#include <linux/trace_events.h>
#include <linux/memcontrol.h>
static const struct trace_print_flags pageflag_names[] = {
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 4a3907cf79f8..b8a5bc66b0c0 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -115,7 +115,7 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice)
case POSIX_FADV_NOREUSE:
break;
case POSIX_FADV_DONTNEED:
- if (!bdi_write_congested(bdi))
+ if (!inode_write_congested(mapping->host))
__filemap_fdatawrite_range(mapping, offset, endbyte,
WB_SYNC_NONE);
diff --git a/mm/filemap.c b/mm/filemap.c
index ad7242043bdb..1283fc825458 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -13,7 +13,6 @@
#include <linux/compiler.h>
#include <linux/fs.h>
#include <linux/uaccess.h>
-#include <linux/aio.h>
#include <linux/capability.h>
#include <linux/kernel_stat.h>
#include <linux/gfp.h>
@@ -101,6 +100,7 @@
* ->tree_lock (page_remove_rmap->set_page_dirty)
* bdi.wb->list_lock (page_remove_rmap->set_page_dirty)
* ->inode->i_lock (page_remove_rmap->set_page_dirty)
+ * ->memcg->move_lock (page_remove_rmap->mem_cgroup_begin_page_stat)
* bdi.wb->list_lock (zap_pte_range->set_page_dirty)
* ->inode->i_lock (zap_pte_range->set_page_dirty)
* ->private_lock (zap_pte_range->__set_page_dirty_buffers)
@@ -175,9 +175,11 @@ static void page_cache_tree_delete(struct address_space *mapping,
/*
* Delete a page from the page cache and free it. Caller has to make
* sure the page is locked and that nobody else uses it - or that usage
- * is safe. The caller must hold the mapping's tree_lock.
+ * is safe. The caller must hold the mapping's tree_lock and
+ * mem_cgroup_begin_page_stat().
*/
-void __delete_from_page_cache(struct page *page, void *shadow)
+void __delete_from_page_cache(struct page *page, void *shadow,
+ struct mem_cgroup *memcg)
{
struct address_space *mapping = page->mapping;
@@ -197,22 +199,24 @@ void __delete_from_page_cache(struct page *page, void *shadow)
page->mapping = NULL;
/* Leave page->index set: truncation lookup relies upon it */
- __dec_zone_page_state(page, NR_FILE_PAGES);
+ /* hugetlb pages do not participate in page cache accounting. */
+ if (!PageHuge(page))
+ __dec_zone_page_state(page, NR_FILE_PAGES);
if (PageSwapBacked(page))
__dec_zone_page_state(page, NR_SHMEM);
BUG_ON(page_mapped(page));
/*
- * Some filesystems seem to re-dirty the page even after
- * the VM has canceled the dirty bit (eg ext3 journaling).
+ * At this point page must be either written or cleaned by truncate.
+ * Dirty page here signals a bug and loss of unwritten data.
*
- * Fix it up by doing a final dirty accounting check after
- * having removed the page entirely.
+ * This fixes dirty accounting after removing the page entirely but
+ * leaves PageDirty set: it has no effect for truncated page and
+ * anyway will be cleared before returning page into buddy allocator.
*/
- if (PageDirty(page) && mapping_cap_account_dirty(mapping)) {
- dec_zone_page_state(page, NR_FILE_DIRTY);
- dec_bdi_stat(inode_to_bdi(mapping->host), BDI_RECLAIMABLE);
- }
+ if (WARN_ON_ONCE(PageDirty(page)))
+ account_page_cleaned(page, mapping, memcg,
+ inode_to_wb(mapping->host));
}
/**
@@ -226,14 +230,20 @@ void __delete_from_page_cache(struct page *page, void *shadow)
void delete_from_page_cache(struct page *page)
{
struct address_space *mapping = page->mapping;
+ struct mem_cgroup *memcg;
+ unsigned long flags;
+
void (*freepage)(struct page *);
BUG_ON(!PageLocked(page));
freepage = mapping->a_ops->freepage;
- spin_lock_irq(&mapping->tree_lock);
- __delete_from_page_cache(page, NULL);
- spin_unlock_irq(&mapping->tree_lock);
+
+ memcg = mem_cgroup_begin_page_stat(page);
+ spin_lock_irqsave(&mapping->tree_lock, flags);
+ __delete_from_page_cache(page, NULL, memcg);
+ spin_unlock_irqrestore(&mapping->tree_lock, flags);
+ mem_cgroup_end_page_stat(memcg);
if (freepage)
freepage(page);
@@ -283,7 +293,9 @@ int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
if (!mapping_cap_writeback_dirty(mapping))
return 0;
+ wbc_attach_fdatawrite_inode(&wbc, mapping->host);
ret = do_writepages(mapping, &wbc);
+ wbc_detach_inode(&wbc);
return ret;
}
@@ -472,6 +484,8 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
if (!error) {
struct address_space *mapping = old->mapping;
void (*freepage)(struct page *);
+ struct mem_cgroup *memcg;
+ unsigned long flags;
pgoff_t offset = old->index;
freepage = mapping->a_ops->freepage;
@@ -480,15 +494,22 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
new->mapping = mapping;
new->index = offset;
- spin_lock_irq(&mapping->tree_lock);
- __delete_from_page_cache(old, NULL);
+ memcg = mem_cgroup_begin_page_stat(old);
+ spin_lock_irqsave(&mapping->tree_lock, flags);
+ __delete_from_page_cache(old, NULL, memcg);
error = radix_tree_insert(&mapping->page_tree, offset, new);
BUG_ON(error);
mapping->nrpages++;
- __inc_zone_page_state(new, NR_FILE_PAGES);
+
+ /*
+ * hugetlb pages do not participate in page cache accounting.
+ */
+ if (!PageHuge(new))
+ __inc_zone_page_state(new, NR_FILE_PAGES);
if (PageSwapBacked(new))
__inc_zone_page_state(new, NR_SHMEM);
- spin_unlock_irq(&mapping->tree_lock);
+ spin_unlock_irqrestore(&mapping->tree_lock, flags);
+ mem_cgroup_end_page_stat(memcg);
mem_cgroup_migrate(old, new, true);
radix_tree_preload_end();
if (freepage)
@@ -577,7 +598,10 @@ static int __add_to_page_cache_locked(struct page *page,
radix_tree_preload_end();
if (unlikely(error))
goto err_insert;
- __inc_zone_page_state(page, NR_FILE_PAGES);
+
+ /* hugetlb pages do not participate in page cache accounting. */
+ if (!huge)
+ __inc_zone_page_state(page, NR_FILE_PAGES);
spin_unlock_irq(&mapping->tree_lock);
if (!huge)
mem_cgroup_commit_charge(page, memcg, false);
@@ -1656,8 +1680,8 @@ no_cached_page:
error = -ENOMEM;
goto out;
}
- error = add_to_page_cache_lru(page, mapping,
- index, GFP_KERNEL);
+ error = add_to_page_cache_lru(page, mapping, index,
+ GFP_KERNEL & mapping_gfp_mask(mapping));
if (error) {
page_cache_release(page);
if (error == -EEXIST) {
@@ -1695,7 +1719,7 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
loff_t *ppos = &iocb->ki_pos;
loff_t pos = *ppos;
- if (io_is_direct(file)) {
+ if (iocb->ki_flags & IOCB_DIRECT) {
struct address_space *mapping = file->f_mapping;
struct inode *inode = mapping->host;
size_t count = iov_iter_count(iter);
@@ -1708,7 +1732,7 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
pos + count - 1);
if (!retval) {
struct iov_iter data = *iter;
- retval = mapping->a_ops->direct_IO(READ, iocb, &data, pos);
+ retval = mapping->a_ops->direct_IO(iocb, &data, pos);
}
if (retval > 0) {
@@ -1758,7 +1782,8 @@ static int page_cache_read(struct file *file, pgoff_t offset)
if (!page)
return -ENOMEM;
- ret = add_to_page_cache_lru(page, mapping, offset, GFP_KERNEL);
+ ret = add_to_page_cache_lru(page, mapping, offset,
+ GFP_KERNEL & mapping_gfp_mask(mapping));
if (ret == 0)
ret = mapping->a_ops->readpage(file, page);
else if (ret == -EEXIST)
@@ -2261,41 +2286,38 @@ EXPORT_SYMBOL(read_cache_page_gfp);
* Returns appropriate error code that caller should return or
* zero in case that write should be allowed.
*/
-inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk)
+inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from)
{
+ struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
unsigned long limit = rlimit(RLIMIT_FSIZE);
+ loff_t pos;
- if (unlikely(*pos < 0))
- return -EINVAL;
+ if (!iov_iter_count(from))
+ return 0;
- if (!isblk) {
- /* FIXME: this is for backwards compatibility with 2.4 */
- if (file->f_flags & O_APPEND)
- *pos = i_size_read(inode);
+ /* FIXME: this is for backwards compatibility with 2.4 */
+ if (iocb->ki_flags & IOCB_APPEND)
+ iocb->ki_pos = i_size_read(inode);
- if (limit != RLIM_INFINITY) {
- if (*pos >= limit) {
- send_sig(SIGXFSZ, current, 0);
- return -EFBIG;
- }
- if (*count > limit - (typeof(limit))*pos) {
- *count = limit - (typeof(limit))*pos;
- }
+ pos = iocb->ki_pos;
+
+ if (limit != RLIM_INFINITY) {
+ if (iocb->ki_pos >= limit) {
+ send_sig(SIGXFSZ, current, 0);
+ return -EFBIG;
}
+ iov_iter_truncate(from, limit - (unsigned long)pos);
}
/*
* LFS rule
*/
- if (unlikely(*pos + *count > MAX_NON_LFS &&
+ if (unlikely(pos + iov_iter_count(from) > MAX_NON_LFS &&
!(file->f_flags & O_LARGEFILE))) {
- if (*pos >= MAX_NON_LFS) {
+ if (pos >= MAX_NON_LFS)
return -EFBIG;
- }
- if (*count > MAX_NON_LFS - (unsigned long)*pos) {
- *count = MAX_NON_LFS - (unsigned long)*pos;
- }
+ iov_iter_truncate(from, MAX_NON_LFS - (unsigned long)pos);
}
/*
@@ -2305,34 +2327,11 @@ inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, i
* exceeded without writing data we send a signal and return EFBIG.
* Linus frestrict idea will clean these up nicely..
*/
- if (likely(!isblk)) {
- if (unlikely(*pos >= inode->i_sb->s_maxbytes)) {
- if (*count || *pos > inode->i_sb->s_maxbytes) {
- return -EFBIG;
- }
- /* zero-length writes at ->s_maxbytes are OK */
- }
-
- if (unlikely(*pos + *count > inode->i_sb->s_maxbytes))
- *count = inode->i_sb->s_maxbytes - *pos;
- } else {
-#ifdef CONFIG_BLOCK
- loff_t isize;
- if (bdev_read_only(I_BDEV(inode)))
- return -EPERM;
- isize = i_size_read(inode);
- if (*pos >= isize) {
- if (*count || *pos > isize)
- return -ENOSPC;
- }
+ if (unlikely(pos >= inode->i_sb->s_maxbytes))
+ return -EFBIG;
- if (*pos + *count > isize)
- *count = isize - *pos;
-#else
- return -EPERM;
-#endif
- }
- return 0;
+ iov_iter_truncate(from, inode->i_sb->s_maxbytes - pos);
+ return iov_iter_count(from);
}
EXPORT_SYMBOL(generic_write_checks);
@@ -2396,7 +2395,7 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos)
}
data = *from;
- written = mapping->a_ops->direct_IO(WRITE, iocb, &data, pos);
+ written = mapping->a_ops->direct_IO(iocb, &data, pos);
/*
* Finally, try again to invalidate clean pages which might have been
@@ -2558,24 +2557,13 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
struct file *file = iocb->ki_filp;
struct address_space * mapping = file->f_mapping;
struct inode *inode = mapping->host;
- loff_t pos = iocb->ki_pos;
ssize_t written = 0;
ssize_t err;
ssize_t status;
- size_t count = iov_iter_count(from);
/* We can write back this queue in page reclaim */
current->backing_dev_info = inode_to_bdi(inode);
- err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
- if (err)
- goto out;
-
- if (count == 0)
- goto out;
-
- iov_iter_truncate(from, count);
-
- err = file_remove_suid(file);
+ err = file_remove_privs(file);
if (err)
goto out;
@@ -2583,10 +2571,10 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
if (err)
goto out;
- if (io_is_direct(file)) {
- loff_t endbyte;
+ if (iocb->ki_flags & IOCB_DIRECT) {
+ loff_t pos, endbyte;
- written = generic_file_direct_write(iocb, from, pos);
+ written = generic_file_direct_write(iocb, from, iocb->ki_pos);
/*
* If the write stopped short of completing, fall back to
* buffered writes. Some filesystems do this for writes to
@@ -2594,13 +2582,10 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
* not succeed (even if it did, DAX does not handle dirty
* page-cache pages correctly).
*/
- if (written < 0 || written == count || IS_DAX(inode))
+ if (written < 0 || !iov_iter_count(from) || IS_DAX(inode))
goto out;
- pos += written;
- count -= written;
-
- status = generic_perform_write(file, from, pos);
+ status = generic_perform_write(file, from, pos = iocb->ki_pos);
/*
* If generic_perform_write() returned a synchronous error
* then we want to return the number of bytes which were
@@ -2612,15 +2597,15 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
err = status;
goto out;
}
- iocb->ki_pos = pos + status;
/*
* We need to ensure that the page cache pages are written to
* disk and invalidated to preserve the expected O_DIRECT
* semantics.
*/
endbyte = pos + status - 1;
- err = filemap_write_and_wait_range(file->f_mapping, pos, endbyte);
+ err = filemap_write_and_wait_range(mapping, pos, endbyte);
if (err == 0) {
+ iocb->ki_pos = endbyte + 1;
written += status;
invalidate_mapping_pages(mapping,
pos >> PAGE_CACHE_SHIFT,
@@ -2632,9 +2617,9 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
*/
}
} else {
- written = generic_perform_write(file, from, pos);
- if (likely(written >= 0))
- iocb->ki_pos = pos + written;
+ written = generic_perform_write(file, from, iocb->ki_pos);
+ if (likely(written > 0))
+ iocb->ki_pos += written;
}
out:
current->backing_dev_info = NULL;
@@ -2658,7 +2643,9 @@ ssize_t generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
ssize_t ret;
mutex_lock(&inode->i_mutex);
- ret = __generic_file_write_iter(iocb, from);
+ ret = generic_write_checks(iocb, from);
+ if (ret > 0)
+ ret = __generic_file_write_iter(iocb, from);
mutex_unlock(&inode->i_mutex);
if (ret > 0) {
diff --git a/mm/frontswap.c b/mm/frontswap.c
index 8d82809eb085..27a9924caf61 100644
--- a/mm/frontswap.c
+++ b/mm/frontswap.c
@@ -21,11 +21,16 @@
#include <linux/swapfile.h>
/*
- * frontswap_ops is set by frontswap_register_ops to contain the pointers
- * to the frontswap "backend" implementation functions.
+ * frontswap_ops are added by frontswap_register_ops, and provide the
+ * frontswap "backend" implementation functions. Multiple implementations
+ * may be registered, but implementations can never deregister. This
+ * is a simple singly-linked list of all registered implementations.
*/
static struct frontswap_ops *frontswap_ops __read_mostly;
+#define for_each_frontswap_ops(ops) \
+ for ((ops) = frontswap_ops; (ops); (ops) = (ops)->next)
+
/*
* If enabled, frontswap_store will return failure even on success. As
* a result, the swap subsystem will always write the page to swap, in
@@ -79,15 +84,6 @@ static inline void inc_frontswap_invalidates(void) { }
* on all frontswap functions to not call the backend until the backend
* has registered.
*
- * Specifically when no backend is registered (nobody called
- * frontswap_register_ops) all calls to frontswap_init (which is done via
- * swapon -> enable_swap_info -> frontswap_init) are registered and remembered
- * (via the setting of need_init bitmap) but fail to create tmem_pools. When a
- * backend registers with frontswap at some later point the previous
- * calls to frontswap_init are executed (by iterating over the need_init
- * bitmap) to create tmem_pools and set the respective poolids. All of that is
- * guarded by us using atomic bit operations on the 'need_init' bitmap.
- *
* This would not guards us against the user deciding to call swapoff right as
* we are calling the backend to initialize (so swapon is in action).
* Fortunatly for us, the swapon_mutex has been taked by the callee so we are
@@ -106,37 +102,64 @@ static inline void inc_frontswap_invalidates(void) { }
*
* Obviously the opposite (unloading the backend) must be done after all
* the frontswap_[store|load|invalidate_area|invalidate_page] start
- * ignorning or failing the requests - at which point frontswap_ops
- * would have to be made in some fashion atomic.
+ * ignoring or failing the requests. However, there is currently no way
+ * to unload a backend once it is registered.
*/
-static DECLARE_BITMAP(need_init, MAX_SWAPFILES);
/*
- * Register operations for frontswap, returning previous thus allowing
- * detection of multiple backends and possible nesting.
+ * Register operations for frontswap
*/
-struct frontswap_ops *frontswap_register_ops(struct frontswap_ops *ops)
+void frontswap_register_ops(struct frontswap_ops *ops)
{
- struct frontswap_ops *old = frontswap_ops;
- int i;
-
- for (i = 0; i < MAX_SWAPFILES; i++) {
- if (test_and_clear_bit(i, need_init)) {
- struct swap_info_struct *sis = swap_info[i];
- /* __frontswap_init _should_ have set it! */
- if (!sis->frontswap_map)
- return ERR_PTR(-EINVAL);
- ops->init(i);
- }
+ DECLARE_BITMAP(a, MAX_SWAPFILES);
+ DECLARE_BITMAP(b, MAX_SWAPFILES);
+ struct swap_info_struct *si;
+ unsigned int i;
+
+ bitmap_zero(a, MAX_SWAPFILES);
+ bitmap_zero(b, MAX_SWAPFILES);
+
+ spin_lock(&swap_lock);
+ plist_for_each_entry(si, &swap_active_head, list) {
+ if (!WARN_ON(!si->frontswap_map))
+ set_bit(si->type, a);
}
+ spin_unlock(&swap_lock);
+
+ /* the new ops needs to know the currently active swap devices */
+ for_each_set_bit(i, a, MAX_SWAPFILES)
+ ops->init(i);
+
/*
- * We MUST have frontswap_ops set _after_ the frontswap_init's
- * have been called. Otherwise __frontswap_store might fail. Hence
- * the barrier to make sure compiler does not re-order us.
+ * Setting frontswap_ops must happen after the ops->init() calls
+ * above; cmpxchg implies smp_mb() which will ensure the init is
+ * complete at this point.
*/
- barrier();
- frontswap_ops = ops;
- return old;
+ do {
+ ops->next = frontswap_ops;
+ } while (cmpxchg(&frontswap_ops, ops->next, ops) != ops->next);
+
+ spin_lock(&swap_lock);
+ plist_for_each_entry(si, &swap_active_head, list) {
+ if (si->frontswap_map)
+ set_bit(si->type, b);
+ }
+ spin_unlock(&swap_lock);
+
+ /*
+ * On the very unlikely chance that a swap device was added or
+ * removed between setting the "a" list bits and the ops init
+ * calls, we re-check and do init or invalidate for any changed
+ * bits.
+ */
+ if (unlikely(!bitmap_equal(a, b, MAX_SWAPFILES))) {
+ for (i = 0; i < MAX_SWAPFILES; i++) {
+ if (!test_bit(i, a) && test_bit(i, b))
+ ops->init(i);
+ else if (test_bit(i, a) && !test_bit(i, b))
+ ops->invalidate_area(i);
+ }
+ }
}
EXPORT_SYMBOL(frontswap_register_ops);
@@ -164,6 +187,7 @@ EXPORT_SYMBOL(frontswap_tmem_exclusive_gets);
void __frontswap_init(unsigned type, unsigned long *map)
{
struct swap_info_struct *sis = swap_info[type];
+ struct frontswap_ops *ops;
BUG_ON(sis == NULL);
@@ -179,28 +203,30 @@ void __frontswap_init(unsigned type, unsigned long *map)
* p->frontswap set to something valid to work properly.
*/
frontswap_map_set(sis, map);
- if (frontswap_ops)
- frontswap_ops->init(type);
- else {
- BUG_ON(type >= MAX_SWAPFILES);
- set_bit(type, need_init);
- }
+
+ for_each_frontswap_ops(ops)
+ ops->init(type);
}
EXPORT_SYMBOL(__frontswap_init);
bool __frontswap_test(struct swap_info_struct *sis,
pgoff_t offset)
{
- bool ret = false;
-
- if (frontswap_ops && sis->frontswap_map)
- ret = test_bit(offset, sis->frontswap_map);
- return ret;
+ if (sis->frontswap_map)
+ return test_bit(offset, sis->frontswap_map);
+ return false;
}
EXPORT_SYMBOL(__frontswap_test);
+static inline void __frontswap_set(struct swap_info_struct *sis,
+ pgoff_t offset)
+{
+ set_bit(offset, sis->frontswap_map);
+ atomic_inc(&sis->frontswap_pages);
+}
+
static inline void __frontswap_clear(struct swap_info_struct *sis,
- pgoff_t offset)
+ pgoff_t offset)
{
clear_bit(offset, sis->frontswap_map);
atomic_dec(&sis->frontswap_pages);
@@ -215,39 +241,46 @@ static inline void __frontswap_clear(struct swap_info_struct *sis,
*/
int __frontswap_store(struct page *page)
{
- int ret = -1, dup = 0;
+ int ret = -1;
swp_entry_t entry = { .val = page_private(page), };
int type = swp_type(entry);
struct swap_info_struct *sis = swap_info[type];
pgoff_t offset = swp_offset(entry);
+ struct frontswap_ops *ops;
/*
* Return if no backend registed.
* Don't need to inc frontswap_failed_stores here.
*/
if (!frontswap_ops)
- return ret;
+ return -1;
BUG_ON(!PageLocked(page));
BUG_ON(sis == NULL);
- if (__frontswap_test(sis, offset))
- dup = 1;
- ret = frontswap_ops->store(type, offset, page);
+
+ /*
+ * If a dup, we must remove the old page first; we can't leave the
+ * old page no matter if the store of the new page succeeds or fails,
+ * and we can't rely on the new page replacing the old page as we may
+ * not store to the same implementation that contains the old page.
+ */
+ if (__frontswap_test(sis, offset)) {
+ __frontswap_clear(sis, offset);
+ for_each_frontswap_ops(ops)
+ ops->invalidate_page(type, offset);
+ }
+
+ /* Try to store in each implementation, until one succeeds. */
+ for_each_frontswap_ops(ops) {
+ ret = ops->store(type, offset, page);
+ if (!ret) /* successful store */
+ break;
+ }
if (ret == 0) {
- set_bit(offset, sis->frontswap_map);
+ __frontswap_set(sis, offset);
inc_frontswap_succ_stores();
- if (!dup)
- atomic_inc(&sis->frontswap_pages);
} else {
- /*
- failed dup always results in automatic invalidate of
- the (older) page from frontswap
- */
inc_frontswap_failed_stores();
- if (dup) {
- __frontswap_clear(sis, offset);
- frontswap_ops->invalidate_page(type, offset);
- }
}
if (frontswap_writethrough_enabled)
/* report failure so swap also writes to swap device */
@@ -268,14 +301,22 @@ int __frontswap_load(struct page *page)
int type = swp_type(entry);
struct swap_info_struct *sis = swap_info[type];
pgoff_t offset = swp_offset(entry);
+ struct frontswap_ops *ops;
+
+ if (!frontswap_ops)
+ return -1;
BUG_ON(!PageLocked(page));
BUG_ON(sis == NULL);
- /*
- * __frontswap_test() will check whether there is backend registered
- */
- if (__frontswap_test(sis, offset))
- ret = frontswap_ops->load(type, offset, page);
+ if (!__frontswap_test(sis, offset))
+ return -1;
+
+ /* Try loading from each implementation, until one succeeds. */
+ for_each_frontswap_ops(ops) {
+ ret = ops->load(type, offset, page);
+ if (!ret) /* successful load */
+ break;
+ }
if (ret == 0) {
inc_frontswap_loads();
if (frontswap_tmem_exclusive_gets_enabled) {
@@ -294,16 +335,19 @@ EXPORT_SYMBOL(__frontswap_load);
void __frontswap_invalidate_page(unsigned type, pgoff_t offset)
{
struct swap_info_struct *sis = swap_info[type];
+ struct frontswap_ops *ops;
+
+ if (!frontswap_ops)
+ return;
BUG_ON(sis == NULL);
- /*
- * __frontswap_test() will check whether there is backend registered
- */
- if (__frontswap_test(sis, offset)) {
- frontswap_ops->invalidate_page(type, offset);
- __frontswap_clear(sis, offset);
- inc_frontswap_invalidates();
- }
+ if (!__frontswap_test(sis, offset))
+ return;
+
+ for_each_frontswap_ops(ops)
+ ops->invalidate_page(type, offset);
+ __frontswap_clear(sis, offset);
+ inc_frontswap_invalidates();
}
EXPORT_SYMBOL(__frontswap_invalidate_page);
@@ -314,16 +358,19 @@ EXPORT_SYMBOL(__frontswap_invalidate_page);
void __frontswap_invalidate_area(unsigned type)
{
struct swap_info_struct *sis = swap_info[type];
+ struct frontswap_ops *ops;
- if (frontswap_ops) {
- BUG_ON(sis == NULL);
- if (sis->frontswap_map == NULL)
- return;
- frontswap_ops->invalidate_area(type);
- atomic_set(&sis->frontswap_pages, 0);
- bitmap_zero(sis->frontswap_map, sis->max);
- }
- clear_bit(type, need_init);
+ if (!frontswap_ops)
+ return;
+
+ BUG_ON(sis == NULL);
+ if (sis->frontswap_map == NULL)
+ return;
+
+ for_each_frontswap_ops(ops)
+ ops->invalidate_area(type);
+ atomic_set(&sis->frontswap_pages, 0);
+ bitmap_zero(sis->frontswap_map, sis->max);
}
EXPORT_SYMBOL(__frontswap_invalidate_area);
diff --git a/mm/gup.c b/mm/gup.c
index a6e24e246f86..6297f6bccfb1 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -92,7 +92,7 @@ retry:
*/
mark_page_accessed(page);
}
- if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
+ if ((flags & FOLL_POPULATE) && (vma->vm_flags & VM_LOCKED)) {
/*
* The preliminary mapping check is mainly to avoid the
* pointless overhead of lock_page on the ZERO_PAGE
@@ -265,8 +265,8 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma,
unsigned int fault_flags = 0;
int ret;
- /* For mlock, just skip the stack guard page. */
- if ((*flags & FOLL_MLOCK) &&
+ /* For mm_populate(), just skip the stack guard page. */
+ if ((*flags & FOLL_POPULATE) &&
(stack_guard_page_start(vma, address) ||
stack_guard_page_end(vma, address + PAGE_SIZE)))
return -ENOENT;
@@ -819,6 +819,124 @@ long get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
EXPORT_SYMBOL(get_user_pages);
/**
+ * populate_vma_page_range() - populate a range of pages in the vma.
+ * @vma: target vma
+ * @start: start address
+ * @end: end address
+ * @nonblocking:
+ *
+ * This takes care of mlocking the pages too if VM_LOCKED is set.
+ *
+ * return 0 on success, negative error code on error.
+ *
+ * vma->vm_mm->mmap_sem must be held.
+ *
+ * If @nonblocking is NULL, it may be held for read or write and will
+ * be unperturbed.
+ *
+ * If @nonblocking is non-NULL, it must held for read only and may be
+ * released. If it's released, *@nonblocking will be set to 0.
+ */
+long populate_vma_page_range(struct vm_area_struct *vma,
+ unsigned long start, unsigned long end, int *nonblocking)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned long nr_pages = (end - start) / PAGE_SIZE;
+ int gup_flags;
+
+ VM_BUG_ON(start & ~PAGE_MASK);
+ VM_BUG_ON(end & ~PAGE_MASK);
+ VM_BUG_ON_VMA(start < vma->vm_start, vma);
+ VM_BUG_ON_VMA(end > vma->vm_end, vma);
+ VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_sem), mm);
+
+ gup_flags = FOLL_TOUCH | FOLL_POPULATE;
+ /*
+ * We want to touch writable mappings with a write fault in order
+ * to break COW, except for shared mappings because these don't COW
+ * and we would not want to dirty them for nothing.
+ */
+ if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE)
+ gup_flags |= FOLL_WRITE;
+
+ /*
+ * We want mlock to succeed for regions that have any permissions
+ * other than PROT_NONE.
+ */
+ if (vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC))
+ gup_flags |= FOLL_FORCE;
+
+ /*
+ * We made sure addr is within a VMA, so the following will
+ * not result in a stack expansion that recurses back here.
+ */
+ return __get_user_pages(current, mm, start, nr_pages, gup_flags,
+ NULL, NULL, nonblocking);
+}
+
+/*
+ * __mm_populate - populate and/or mlock pages within a range of address space.
+ *
+ * This is used to implement mlock() and the MAP_POPULATE / MAP_LOCKED mmap
+ * flags. VMAs must be already marked with the desired vm_flags, and
+ * mmap_sem must not be held.
+ */
+int __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
+{
+ struct mm_struct *mm = current->mm;
+ unsigned long end, nstart, nend;
+ struct vm_area_struct *vma = NULL;
+ int locked = 0;
+ long ret = 0;
+
+ VM_BUG_ON(start & ~PAGE_MASK);
+ VM_BUG_ON(len != PAGE_ALIGN(len));
+ end = start + len;
+
+ for (nstart = start; nstart < end; nstart = nend) {
+ /*
+ * We want to fault in pages for [nstart; end) address range.
+ * Find first corresponding VMA.
+ */
+ if (!locked) {
+ locked = 1;
+ down_read(&mm->mmap_sem);
+ vma = find_vma(mm, nstart);
+ } else if (nstart >= vma->vm_end)
+ vma = vma->vm_next;
+ if (!vma || vma->vm_start >= end)
+ break;
+ /*
+ * Set [nstart; nend) to intersection of desired address
+ * range with the first VMA. Also, skip undesirable VMA types.
+ */
+ nend = min(end, vma->vm_end);
+ if (vma->vm_flags & (VM_IO | VM_PFNMAP))
+ continue;
+ if (nstart < vma->vm_start)
+ nstart = vma->vm_start;
+ /*
+ * Now fault in a range of pages. populate_vma_page_range()
+ * double checks the vma flags, so that it won't mlock pages
+ * if the vma was already munlocked.
+ */
+ ret = populate_vma_page_range(vma, nstart, nend, &locked);
+ if (ret < 0) {
+ if (ignore_errors) {
+ ret = 0;
+ continue; /* continue at next VMA */
+ }
+ break;
+ }
+ nend = nstart + ret * PAGE_SIZE;
+ ret = 0;
+ }
+ if (locked)
+ up_read(&mm->mmap_sem);
+ return ret; /* 0 or negative error code */
+}
+
+/**
* get_dump_page() - pin user page in memory while writing it to core dump
* @addr: user address
*
@@ -901,7 +1019,7 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
*
* for an example see gup_get_pte in arch/x86/mm/gup.c
*/
- pte_t pte = ACCESS_ONCE(*ptep);
+ pte_t pte = READ_ONCE(*ptep);
struct page *page;
/*
@@ -1191,7 +1309,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
local_irq_save(flags);
pgdp = pgd_offset(mm, addr);
do {
- pgd_t pgd = ACCESS_ONCE(*pgdp);
+ pgd_t pgd = READ_ONCE(*pgdp);
next = pgd_addr_end(addr, end);
if (pgd_none(pgd))
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 6817b0350c71..097c7a4bfbd9 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -67,6 +67,7 @@ static unsigned int khugepaged_max_ptes_none __read_mostly = HPAGE_PMD_NR-1;
static int khugepaged(void *none);
static int khugepaged_slab_init(void);
+static void khugepaged_slab_exit(void);
#define MM_SLOTS_HASH_BITS 10
static __read_mostly DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
@@ -109,9 +110,6 @@ static int set_recommended_min_free_kbytes(void)
int nr_zones = 0;
unsigned long recommended_min;
- if (!khugepaged_enabled())
- return 0;
-
for_each_populated_zone(zone)
nr_zones++;
@@ -143,9 +141,8 @@ static int set_recommended_min_free_kbytes(void)
setup_per_zone_wmarks();
return 0;
}
-late_initcall(set_recommended_min_free_kbytes);
-static int start_khugepaged(void)
+static int start_stop_khugepaged(void)
{
int err = 0;
if (khugepaged_enabled()) {
@@ -156,6 +153,7 @@ static int start_khugepaged(void)
pr_err("khugepaged: kthread_run(khugepaged) failed\n");
err = PTR_ERR(khugepaged_thread);
khugepaged_thread = NULL;
+ goto fail;
}
if (!list_empty(&khugepaged_scan.mm_head))
@@ -166,7 +164,7 @@ static int start_khugepaged(void)
kthread_stop(khugepaged_thread);
khugepaged_thread = NULL;
}
-
+fail:
return err;
}
@@ -183,7 +181,7 @@ static struct page *get_huge_zero_page(void)
struct page *zero_page;
retry:
if (likely(atomic_inc_not_zero(&huge_zero_refcount)))
- return ACCESS_ONCE(huge_zero_page);
+ return READ_ONCE(huge_zero_page);
zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
HPAGE_PMD_ORDER);
@@ -202,7 +200,7 @@ retry:
/* We take additional reference here. It will be put back by shrinker */
atomic_set(&huge_zero_refcount, 2);
preempt_enable();
- return ACCESS_ONCE(huge_zero_page);
+ return READ_ONCE(huge_zero_page);
}
static void put_huge_zero_page(void)
@@ -300,7 +298,7 @@ static ssize_t enabled_store(struct kobject *kobj,
int err;
mutex_lock(&khugepaged_mutex);
- err = start_khugepaged();
+ err = start_stop_khugepaged();
mutex_unlock(&khugepaged_mutex);
if (err)
@@ -634,27 +632,38 @@ static int __init hugepage_init(void)
err = hugepage_init_sysfs(&hugepage_kobj);
if (err)
- return err;
+ goto err_sysfs;
err = khugepaged_slab_init();
if (err)
- goto out;
+ goto err_slab;
- register_shrinker(&huge_zero_page_shrinker);
+ err = register_shrinker(&huge_zero_page_shrinker);
+ if (err)
+ goto err_hzp_shrinker;
/*
* By default disable transparent hugepages on smaller systems,
* where the extra memory used could hurt more than TLB overhead
* is likely to save. The admin can still enable it through /sys.
*/
- if (totalram_pages < (512 << (20 - PAGE_SHIFT)))
+ if (totalram_pages < (512 << (20 - PAGE_SHIFT))) {
transparent_hugepage_flags = 0;
+ return 0;
+ }
- start_khugepaged();
+ err = start_stop_khugepaged();
+ if (err)
+ goto err_khugepaged;
return 0;
-out:
+err_khugepaged:
+ unregister_shrinker(&huge_zero_page_shrinker);
+err_hzp_shrinker:
+ khugepaged_slab_exit();
+err_slab:
hugepage_exit_sysfs(hugepage_kobj);
+err_sysfs:
return err;
}
subsys_initcall(hugepage_init);
@@ -708,7 +717,7 @@ static inline pmd_t mk_huge_pmd(struct page *page, pgprot_t prot)
static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
struct vm_area_struct *vma,
unsigned long haddr, pmd_t *pmd,
- struct page *page)
+ struct page *page, gfp_t gfp)
{
struct mem_cgroup *memcg;
pgtable_t pgtable;
@@ -716,7 +725,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
VM_BUG_ON_PAGE(!PageCompound(page), page);
- if (mem_cgroup_try_charge(page, mm, GFP_TRANSHUGE, &memcg))
+ if (mem_cgroup_try_charge(page, mm, gfp, &memcg))
return VM_FAULT_OOM;
pgtable = pte_alloc_one(mm, haddr);
@@ -822,7 +831,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
count_vm_event(THP_FAULT_FALLBACK);
return VM_FAULT_FALLBACK;
}
- if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page))) {
+ if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page, gfp))) {
put_page(page);
count_vm_event(THP_FAULT_FALLBACK);
return VM_FAULT_FALLBACK;
@@ -1022,7 +1031,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
goto out_free_pages;
VM_BUG_ON_PAGE(!PageHead(page), page);
- pmdp_clear_flush_notify(vma, haddr, pmd);
+ pmdp_huge_clear_flush_notify(vma, haddr, pmd);
/* leave pmd empty until pte is filled */
pgtable = pgtable_trans_huge_withdraw(mm, pmd);
@@ -1080,6 +1089,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long haddr;
unsigned long mmun_start; /* For mmu_notifiers */
unsigned long mmun_end; /* For mmu_notifiers */
+ gfp_t huge_gfp; /* for allocation and charge */
ptl = pmd_lockptr(mm, pmd);
VM_BUG_ON_VMA(!vma->anon_vma, vma);
@@ -1106,10 +1116,8 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
alloc:
if (transparent_hugepage_enabled(vma) &&
!transparent_hugepage_debug_cow()) {
- gfp_t gfp;
-
- gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0);
- new_page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER);
+ huge_gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0);
+ new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PMD_ORDER);
} else
new_page = NULL;
@@ -1130,8 +1138,7 @@ alloc:
goto out;
}
- if (unlikely(mem_cgroup_try_charge(new_page, mm,
- GFP_TRANSHUGE, &memcg))) {
+ if (unlikely(mem_cgroup_try_charge(new_page, mm, huge_gfp, &memcg))) {
put_page(new_page);
if (page) {
split_huge_page(page);
@@ -1167,7 +1174,7 @@ alloc:
pmd_t entry;
entry = mk_huge_pmd(new_page, vma->vm_page_prot);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
- pmdp_clear_flush_notify(vma, haddr, pmd);
+ pmdp_huge_clear_flush_notify(vma, haddr, pmd);
page_add_new_anon_rmap(new_page, vma, haddr);
mem_cgroup_commit_charge(new_page, memcg, false);
lru_cache_add_active_or_unevictable(new_page, vma);
@@ -1231,7 +1238,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
pmd, _pmd, 1))
update_mmu_cache_pmd(vma, addr, pmd);
}
- if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
+ if ((flags & FOLL_POPULATE) && (vma->vm_flags & VM_LOCKED)) {
if (page->mapping && trylock_page(page)) {
lru_add_drain();
if (page->mapping)
@@ -1389,12 +1396,12 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
pmd_t orig_pmd;
/*
* For architectures like ppc64 we look at deposited pgtable
- * when calling pmdp_get_and_clear. So do the
+ * when calling pmdp_huge_get_and_clear. So do the
* pgtable_trans_huge_withdraw after finishing pmdp related
* operations.
*/
- orig_pmd = pmdp_get_and_clear_full(tlb->mm, addr, pmd,
- tlb->fullmm);
+ orig_pmd = pmdp_huge_get_and_clear_full(tlb->mm, addr, pmd,
+ tlb->fullmm);
tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
pgtable = pgtable_trans_huge_withdraw(tlb->mm, pmd);
if (is_huge_zero_pmd(orig_pmd)) {
@@ -1452,7 +1459,7 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
new_ptl = pmd_lockptr(mm, new_pmd);
if (new_ptl != old_ptl)
spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
- pmd = pmdp_get_and_clear(mm, old_addr, old_pmd);
+ pmd = pmdp_huge_get_and_clear(mm, old_addr, old_pmd);
VM_BUG_ON(!pmd_none(*new_pmd));
if (pmd_move_must_withdraw(new_ptl, old_ptl)) {
@@ -1498,7 +1505,7 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
}
if (!prot_numa || !pmd_protnone(*pmd)) {
- entry = pmdp_get_and_clear_notify(mm, addr, pmd);
+ entry = pmdp_huge_get_and_clear_notify(mm, addr, pmd);
entry = pmd_modify(entry, newprot);
if (preserve_write)
entry = pmd_mkwrite(entry);
@@ -1669,12 +1676,7 @@ static void __split_huge_page_refcount(struct page *page,
/* after clearing PageTail the gup refcount can be released */
smp_mb__after_atomic();
- /*
- * retain hwpoison flag of the poisoned tail page:
- * fix for the unsuitable process killed on Guest Machine(KVM)
- * by the memory-failure.
- */
- page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP | __PG_HWPOISON;
+ page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
page_tail->flags |= (page->flags &
((1L << PG_referenced) |
(1L << PG_swapbacked) |
@@ -1976,6 +1978,11 @@ static int __init khugepaged_slab_init(void)
return 0;
}
+static void __init khugepaged_slab_exit(void)
+{
+ kmem_cache_destroy(mm_slot_cache);
+}
+
static inline struct mm_slot *alloc_mm_slot(void)
{
if (!mm_slot_cache) /* initialization failed */
@@ -2109,7 +2116,7 @@ static void release_pte_pages(pte_t *pte, pte_t *_pte)
{
while (--_pte >= pte) {
pte_t pteval = *_pte;
- if (!pte_none(pteval))
+ if (!pte_none(pteval) && !is_zero_pfn(pte_pfn(pteval)))
release_pte_page(pte_page(pteval));
}
}
@@ -2120,13 +2127,13 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
{
struct page *page;
pte_t *_pte;
- int none = 0;
+ int none_or_zero = 0;
bool referenced = false, writable = false;
for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
_pte++, address += PAGE_SIZE) {
pte_t pteval = *_pte;
- if (pte_none(pteval)) {
- if (++none <= khugepaged_max_ptes_none)
+ if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
+ if (++none_or_zero <= khugepaged_max_ptes_none)
continue;
else
goto out;
@@ -2207,9 +2214,21 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
pte_t pteval = *_pte;
struct page *src_page;
- if (pte_none(pteval)) {
+ if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
clear_user_highpage(page, address);
add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1);
+ if (is_zero_pfn(pte_pfn(pteval))) {
+ /*
+ * ptl mostly unnecessary.
+ */
+ spin_lock(ptl);
+ /*
+ * paravirt calls inside pte_clear here are
+ * superfluous.
+ */
+ pte_clear(vma->vm_mm, address, _pte);
+ spin_unlock(ptl);
+ }
} else {
src_page = pte_page(pteval);
copy_user_highpage(page, src_page, address, vma);
@@ -2311,8 +2330,8 @@ static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
return true;
}
-static struct page
-*khugepaged_alloc_page(struct page **hpage, struct mm_struct *mm,
+static struct page *
+khugepaged_alloc_page(struct page **hpage, gfp_t gfp, struct mm_struct *mm,
struct vm_area_struct *vma, unsigned long address,
int node)
{
@@ -2326,8 +2345,7 @@ static struct page
*/
up_read(&mm->mmap_sem);
- *hpage = alloc_pages_exact_node(node, alloc_hugepage_gfpmask(
- khugepaged_defrag(), __GFP_OTHER_NODE), HPAGE_PMD_ORDER);
+ *hpage = alloc_pages_exact_node(node, gfp, HPAGE_PMD_ORDER);
if (unlikely(!*hpage)) {
count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
*hpage = ERR_PTR(-ENOMEM);
@@ -2380,13 +2398,14 @@ static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
return true;
}
-static struct page
-*khugepaged_alloc_page(struct page **hpage, struct mm_struct *mm,
+static struct page *
+khugepaged_alloc_page(struct page **hpage, gfp_t gfp, struct mm_struct *mm,
struct vm_area_struct *vma, unsigned long address,
int node)
{
up_read(&mm->mmap_sem);
VM_BUG_ON(!*hpage);
+
return *hpage;
}
#endif
@@ -2421,16 +2440,21 @@ static void collapse_huge_page(struct mm_struct *mm,
struct mem_cgroup *memcg;
unsigned long mmun_start; /* For mmu_notifiers */
unsigned long mmun_end; /* For mmu_notifiers */
+ gfp_t gfp;
VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+ /* Only allocate from the target node */
+ gfp = alloc_hugepage_gfpmask(khugepaged_defrag(), __GFP_OTHER_NODE) |
+ __GFP_THISNODE;
+
/* release the mmap_sem read lock. */
- new_page = khugepaged_alloc_page(hpage, mm, vma, address, node);
+ new_page = khugepaged_alloc_page(hpage, gfp, mm, vma, address, node);
if (!new_page)
return;
if (unlikely(mem_cgroup_try_charge(new_page, mm,
- GFP_TRANSHUGE, &memcg)))
+ gfp, &memcg)))
return;
/*
@@ -2470,7 +2494,7 @@ static void collapse_huge_page(struct mm_struct *mm,
* huge and small TLB entries for the same virtual address
* to avoid the risk of CPU bugs in that area.
*/
- _pmd = pmdp_clear_flush(vma, address, pmd);
+ _pmd = pmdp_collapse_flush(vma, address, pmd);
spin_unlock(pmd_ptl);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
@@ -2543,7 +2567,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
{
pmd_t *pmd;
pte_t *pte, *_pte;
- int ret = 0, none = 0;
+ int ret = 0, none_or_zero = 0;
struct page *page;
unsigned long _address;
spinlock_t *ptl;
@@ -2561,8 +2585,8 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR;
_pte++, _address += PAGE_SIZE) {
pte_t pteval = *_pte;
- if (pte_none(pteval)) {
- if (++none <= khugepaged_max_ptes_none)
+ if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
+ if (++none_or_zero <= khugepaged_max_ptes_none)
continue;
else
goto out_unmap;
@@ -2770,7 +2794,7 @@ static void khugepaged_do_scan(void)
cond_resched();
- if (unlikely(kthread_should_stop() || freezing(current)))
+ if (unlikely(kthread_should_stop() || try_to_freeze()))
break;
spin_lock(&khugepaged_mm_lock);
@@ -2791,8 +2815,6 @@ static void khugepaged_do_scan(void)
static void khugepaged_wait_work(void)
{
- try_to_freeze();
-
if (khugepaged_has_work()) {
if (!khugepaged_scan_sleep_millisecs)
return;
@@ -2836,7 +2858,7 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
pmd_t _pmd;
int i;
- pmdp_clear_flush_notify(vma, haddr, pmd);
+ pmdp_huge_clear_flush_notify(vma, haddr, pmd);
/* leave pmd empty until pte is filled */
pgtable = pgtable_trans_huge_withdraw(mm, pmd);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index c41b2a0ee273..a8c3087089d8 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -40,6 +40,11 @@ int hugepages_treat_as_movable;
int hugetlb_max_hstate __read_mostly;
unsigned int default_hstate_idx;
struct hstate hstates[HUGE_MAX_HSTATE];
+/*
+ * Minimum page order among possible hugepage sizes, set to a proper value
+ * at boot time.
+ */
+static unsigned int minimum_order __read_mostly = UINT_MAX;
__initdata LIST_HEAD(huge_boot_pages);
@@ -61,6 +66,9 @@ DEFINE_SPINLOCK(hugetlb_lock);
static int num_fault_mutexes;
static struct mutex *htlb_fault_mutex_table ____cacheline_aligned_in_smp;
+/* Forward declaration */
+static int hugetlb_acct_memory(struct hstate *h, long delta);
+
static inline void unlock_or_release_subpool(struct hugepage_subpool *spool)
{
bool free = (spool->count == 0) && (spool->used_hpages == 0);
@@ -68,23 +76,36 @@ static inline void unlock_or_release_subpool(struct hugepage_subpool *spool)
spin_unlock(&spool->lock);
/* If no pages are used, and no other handles to the subpool
- * remain, free the subpool the subpool remain */
- if (free)
+ * remain, give up any reservations mased on minimum size and
+ * free the subpool */
+ if (free) {
+ if (spool->min_hpages != -1)
+ hugetlb_acct_memory(spool->hstate,
+ -spool->min_hpages);
kfree(spool);
+ }
}
-struct hugepage_subpool *hugepage_new_subpool(long nr_blocks)
+struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages,
+ long min_hpages)
{
struct hugepage_subpool *spool;
- spool = kmalloc(sizeof(*spool), GFP_KERNEL);
+ spool = kzalloc(sizeof(*spool), GFP_KERNEL);
if (!spool)
return NULL;
spin_lock_init(&spool->lock);
spool->count = 1;
- spool->max_hpages = nr_blocks;
- spool->used_hpages = 0;
+ spool->max_hpages = max_hpages;
+ spool->hstate = h;
+ spool->min_hpages = min_hpages;
+
+ if (min_hpages != -1 && hugetlb_acct_memory(h, min_hpages)) {
+ kfree(spool);
+ return NULL;
+ }
+ spool->rsv_hpages = min_hpages;
return spool;
}
@@ -97,36 +118,89 @@ void hugepage_put_subpool(struct hugepage_subpool *spool)
unlock_or_release_subpool(spool);
}
-static int hugepage_subpool_get_pages(struct hugepage_subpool *spool,
+/*
+ * Subpool accounting for allocating and reserving pages.
+ * Return -ENOMEM if there are not enough resources to satisfy the
+ * the request. Otherwise, return the number of pages by which the
+ * global pools must be adjusted (upward). The returned value may
+ * only be different than the passed value (delta) in the case where
+ * a subpool minimum size must be manitained.
+ */
+static long hugepage_subpool_get_pages(struct hugepage_subpool *spool,
long delta)
{
- int ret = 0;
+ long ret = delta;
if (!spool)
- return 0;
+ return ret;
spin_lock(&spool->lock);
- if ((spool->used_hpages + delta) <= spool->max_hpages) {
- spool->used_hpages += delta;
- } else {
- ret = -ENOMEM;
+
+ if (spool->max_hpages != -1) { /* maximum size accounting */
+ if ((spool->used_hpages + delta) <= spool->max_hpages)
+ spool->used_hpages += delta;
+ else {
+ ret = -ENOMEM;
+ goto unlock_ret;
+ }
+ }
+
+ if (spool->min_hpages != -1) { /* minimum size accounting */
+ if (delta > spool->rsv_hpages) {
+ /*
+ * Asking for more reserves than those already taken on
+ * behalf of subpool. Return difference.
+ */
+ ret = delta - spool->rsv_hpages;
+ spool->rsv_hpages = 0;
+ } else {
+ ret = 0; /* reserves already accounted for */
+ spool->rsv_hpages -= delta;
+ }
}
- spin_unlock(&spool->lock);
+unlock_ret:
+ spin_unlock(&spool->lock);
return ret;
}
-static void hugepage_subpool_put_pages(struct hugepage_subpool *spool,
+/*
+ * Subpool accounting for freeing and unreserving pages.
+ * Return the number of global page reservations that must be dropped.
+ * The return value may only be different than the passed value (delta)
+ * in the case where a subpool minimum size must be maintained.
+ */
+static long hugepage_subpool_put_pages(struct hugepage_subpool *spool,
long delta)
{
+ long ret = delta;
+
if (!spool)
- return;
+ return delta;
spin_lock(&spool->lock);
- spool->used_hpages -= delta;
- /* If hugetlbfs_put_super couldn't free spool due to
- * an outstanding quota reference, free it now. */
+
+ if (spool->max_hpages != -1) /* maximum size accounting */
+ spool->used_hpages -= delta;
+
+ if (spool->min_hpages != -1) { /* minimum size accounting */
+ if (spool->rsv_hpages + delta <= spool->min_hpages)
+ ret = 0;
+ else
+ ret = spool->rsv_hpages + delta - spool->min_hpages;
+
+ spool->rsv_hpages += delta;
+ if (spool->rsv_hpages > spool->min_hpages)
+ spool->rsv_hpages = spool->min_hpages;
+ }
+
+ /*
+ * If hugetlbfs_put_super couldn't free spool due to an outstanding
+ * quota reference, free it now.
+ */
unlock_or_release_subpool(spool);
+
+ return ret;
}
static inline struct hugepage_subpool *subpool_inode(struct inode *inode)
@@ -143,8 +217,20 @@ static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma)
* Region tracking -- allows tracking of reservations and instantiated pages
* across the pages in a mapping.
*
- * The region data structures are embedded into a resv_map and
- * protected by a resv_map's lock
+ * The region data structures are embedded into a resv_map and protected
+ * by a resv_map's lock. The set of regions within the resv_map represent
+ * reservations for huge pages, or huge pages that have already been
+ * instantiated within the map. The from and to elements are huge page
+ * indicies into the associated mapping. from indicates the starting index
+ * of the region. to represents the first index past the end of the region.
+ *
+ * For example, a file region structure with from == 0 and to == 4 represents
+ * four huge pages in a mapping. It is important to note that the to element
+ * represents the first element past the end of the region. This is used in
+ * arithmetic as 4(to) - 0(from) = 4 huge pages in the region.
+ *
+ * Interval notation of the form [from, to) will be used to indicate that
+ * the endpoint from is inclusive and to is exclusive.
*/
struct file_region {
struct list_head link;
@@ -152,10 +238,22 @@ struct file_region {
long to;
};
+/*
+ * Add the huge page range represented by [f, t) to the reserve
+ * map. Existing regions will be expanded to accommodate the
+ * specified range. We know only existing regions need to be
+ * expanded, because region_add is only called after region_chg
+ * with the same range. If a new file_region structure must
+ * be allocated, it is done in region_chg.
+ *
+ * Return the number of new huge pages added to the map. This
+ * number is greater than or equal to zero.
+ */
static long region_add(struct resv_map *resv, long f, long t)
{
struct list_head *head = &resv->regions;
struct file_region *rg, *nrg, *trg;
+ long add = 0;
spin_lock(&resv->lock);
/* Locate the region we are either in or before. */
@@ -181,16 +279,45 @@ static long region_add(struct resv_map *resv, long f, long t)
if (rg->to > t)
t = rg->to;
if (rg != nrg) {
+ /* Decrement return value by the deleted range.
+ * Another range will span this area so that by
+ * end of routine add will be >= zero
+ */
+ add -= (rg->to - rg->from);
list_del(&rg->link);
kfree(rg);
}
}
+
+ add += (nrg->from - f); /* Added to beginning of region */
nrg->from = f;
+ add += t - nrg->to; /* Added to end of region */
nrg->to = t;
+
spin_unlock(&resv->lock);
- return 0;
+ VM_BUG_ON(add < 0);
+ return add;
}
+/*
+ * Examine the existing reserve map and determine how many
+ * huge pages in the specified range [f, t) are NOT currently
+ * represented. This routine is called before a subsequent
+ * call to region_add that will actually modify the reserve
+ * map to add the specified range [f, t). region_chg does
+ * not change the number of huge pages represented by the
+ * map. However, if the existing regions in the map can not
+ * be expanded to represent the new range, a new file_region
+ * structure is added to the map as a placeholder. This is
+ * so that the subsequent region_add call will have all the
+ * regions it needs and will not fail.
+ *
+ * Returns the number of huge pages that need to be added
+ * to the existing reservation map for the range [f, t).
+ * This number is greater or equal to zero. -ENOMEM is
+ * returned if a new file_region structure is needed and can
+ * not be allocated.
+ */
static long region_chg(struct resv_map *resv, long f, long t)
{
struct list_head *head = &resv->regions;
@@ -257,6 +384,11 @@ out_nrg:
return chg;
}
+/*
+ * Truncate the reserve map at index 'end'. Modify/truncate any
+ * region which contains end. Delete any regions past end.
+ * Return the number of huge pages removed from the map.
+ */
static long region_truncate(struct resv_map *resv, long end)
{
struct list_head *head = &resv->regions;
@@ -292,6 +424,10 @@ out:
return chg;
}
+/*
+ * Count and return the number of huge pages in the reserve map
+ * that intersect with the range [f, t).
+ */
static long region_count(struct resv_map *resv, long f, long t)
{
struct list_head *head = &resv->regions;
@@ -839,7 +975,6 @@ static void update_and_free_page(struct hstate *h, struct page *page)
destroy_compound_gigantic_page(page, huge_page_order(h));
free_gigantic_page(page, huge_page_order(h));
} else {
- arch_release_hugepage(page);
__free_pages(page, huge_page_order(h));
}
}
@@ -855,6 +990,31 @@ struct hstate *size_to_hstate(unsigned long size)
return NULL;
}
+/*
+ * Test to determine whether the hugepage is "active/in-use" (i.e. being linked
+ * to hstate->hugepage_activelist.)
+ *
+ * This function can be called for tail pages, but never returns true for them.
+ */
+bool page_huge_active(struct page *page)
+{
+ VM_BUG_ON_PAGE(!PageHuge(page), page);
+ return PageHead(page) && PagePrivate(&page[1]);
+}
+
+/* never called for tail page */
+static void set_page_huge_active(struct page *page)
+{
+ VM_BUG_ON_PAGE(!PageHeadHuge(page), page);
+ SetPagePrivate(&page[1]);
+}
+
+static void clear_page_huge_active(struct page *page)
+{
+ VM_BUG_ON_PAGE(!PageHeadHuge(page), page);
+ ClearPagePrivate(&page[1]);
+}
+
void free_huge_page(struct page *page)
{
/*
@@ -874,7 +1034,16 @@ void free_huge_page(struct page *page)
restore_reserve = PagePrivate(page);
ClearPagePrivate(page);
+ /*
+ * A return code of zero implies that the subpool will be under its
+ * minimum size if the reservation is not restored after page is free.
+ * Therefore, force restore_reserve operation.
+ */
+ if (hugepage_subpool_put_pages(spool, 1) == 0)
+ restore_reserve = true;
+
spin_lock(&hugetlb_lock);
+ clear_page_huge_active(page);
hugetlb_cgroup_uncharge_page(hstate_index(h),
pages_per_huge_page(h), page);
if (restore_reserve)
@@ -891,7 +1060,6 @@ void free_huge_page(struct page *page)
enqueue_huge_page(h, page);
}
spin_unlock(&hugetlb_lock);
- hugepage_subpool_put_pages(spool, 1);
}
static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
@@ -991,10 +1159,6 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
__GFP_REPEAT|__GFP_NOWARN,
huge_page_order(h));
if (page) {
- if (arch_prepare_hugepage(page)) {
- __free_pages(page, huge_page_order(h));
- return NULL;
- }
prep_new_huge_page(h, page, nid);
}
@@ -1086,19 +1250,13 @@ static void dissolve_free_huge_page(struct page *page)
*/
void dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
{
- unsigned int order = 8 * sizeof(void *);
unsigned long pfn;
- struct hstate *h;
if (!hugepages_supported())
return;
- /* Set scan step to minimum hugepage size */
- for_each_hstate(h)
- if (order > huge_page_order(h))
- order = huge_page_order(h);
- VM_BUG_ON(!IS_ALIGNED(start_pfn, 1 << order));
- for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << order)
+ VM_BUG_ON(!IS_ALIGNED(start_pfn, 1 << minimum_order));
+ for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << minimum_order)
dissolve_free_huge_page(pfn_to_page(pfn));
}
@@ -1152,11 +1310,6 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE|
__GFP_REPEAT|__GFP_NOWARN, huge_page_order(h));
- if (page && arch_prepare_hugepage(page)) {
- __free_pages(page, huge_page_order(h));
- page = NULL;
- }
-
spin_lock(&hugetlb_lock);
if (page) {
INIT_LIST_HEAD(&page->lru);
@@ -1321,46 +1474,56 @@ static void return_unused_surplus_pages(struct hstate *h,
}
/*
- * Determine if the huge page at addr within the vma has an associated
- * reservation. Where it does not we will need to logically increase
- * reservation and actually increase subpool usage before an allocation
- * can occur. Where any new reservation would be required the
- * reservation change is prepared, but not committed. Once the page
- * has been allocated from the subpool and instantiated the change should
- * be committed via vma_commit_reservation. No action is required on
- * failure.
+ * vma_needs_reservation and vma_commit_reservation are used by the huge
+ * page allocation routines to manage reservations.
+ *
+ * vma_needs_reservation is called to determine if the huge page at addr
+ * within the vma has an associated reservation. If a reservation is
+ * needed, the value 1 is returned. The caller is then responsible for
+ * managing the global reservation and subpool usage counts. After
+ * the huge page has been allocated, vma_commit_reservation is called
+ * to add the page to the reservation map.
+ *
+ * In the normal case, vma_commit_reservation returns the same value
+ * as the preceding vma_needs_reservation call. The only time this
+ * is not the case is if a reserve map was changed between calls. It
+ * is the responsibility of the caller to notice the difference and
+ * take appropriate action.
*/
-static long vma_needs_reservation(struct hstate *h,
- struct vm_area_struct *vma, unsigned long addr)
+static long __vma_reservation_common(struct hstate *h,
+ struct vm_area_struct *vma, unsigned long addr,
+ bool commit)
{
struct resv_map *resv;
pgoff_t idx;
- long chg;
+ long ret;
resv = vma_resv_map(vma);
if (!resv)
return 1;
idx = vma_hugecache_offset(h, vma, addr);
- chg = region_chg(resv, idx, idx + 1);
+ if (commit)
+ ret = region_add(resv, idx, idx + 1);
+ else
+ ret = region_chg(resv, idx, idx + 1);
if (vma->vm_flags & VM_MAYSHARE)
- return chg;
+ return ret;
else
- return chg < 0 ? chg : 0;
+ return ret < 0 ? ret : 0;
}
-static void vma_commit_reservation(struct hstate *h,
+
+static long vma_needs_reservation(struct hstate *h,
struct vm_area_struct *vma, unsigned long addr)
{
- struct resv_map *resv;
- pgoff_t idx;
-
- resv = vma_resv_map(vma);
- if (!resv)
- return;
+ return __vma_reservation_common(h, vma, addr, false);
+}
- idx = vma_hugecache_offset(h, vma, addr);
- region_add(resv, idx, idx + 1);
+static long vma_commit_reservation(struct hstate *h,
+ struct vm_area_struct *vma, unsigned long addr)
+{
+ return __vma_reservation_common(h, vma, addr, true);
}
static struct page *alloc_huge_page(struct vm_area_struct *vma,
@@ -1369,7 +1532,7 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
struct hugepage_subpool *spool = subpool_vma(vma);
struct hstate *h = hstate_vma(vma);
struct page *page;
- long chg;
+ long chg, commit;
int ret, idx;
struct hugetlb_cgroup *h_cg;
@@ -1386,7 +1549,7 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
if (chg < 0)
return ERR_PTR(-ENOMEM);
if (chg || avoid_reserve)
- if (hugepage_subpool_get_pages(spool, 1))
+ if (hugepage_subpool_get_pages(spool, 1) < 0)
return ERR_PTR(-ENOSPC);
ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg);
@@ -1410,7 +1573,22 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
set_page_private(page, (unsigned long)spool);
- vma_commit_reservation(h, vma, addr);
+ commit = vma_commit_reservation(h, vma, addr);
+ if (unlikely(chg > commit)) {
+ /*
+ * The page was added to the reservation map between
+ * vma_needs_reservation and vma_commit_reservation.
+ * This indicates a race with hugetlb_reserve_pages.
+ * Adjust for the subpool count incremented above AND
+ * in hugetlb_reserve_pages for the same page. Also,
+ * the reservation count added in hugetlb_reserve_pages
+ * no longer applies.
+ */
+ long rsv_adjust;
+
+ rsv_adjust = hugepage_subpool_put_pages(spool, 1);
+ hugetlb_acct_memory(h, -rsv_adjust);
+ }
return page;
out_uncharge_cgroup:
@@ -1525,10 +1703,14 @@ static void __init hugetlb_init_hstates(void)
struct hstate *h;
for_each_hstate(h) {
+ if (minimum_order > huge_page_order(h))
+ minimum_order = huge_page_order(h);
+
/* oversize hugepages were init'ed in early boot */
if (!hstate_is_gigantic(h))
hugetlb_hstate_alloc_pages(h);
}
+ VM_BUG_ON(minimum_order == UINT_MAX);
}
static char * __init memfmt(char *buf, unsigned long n)
@@ -2454,6 +2636,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma)
struct resv_map *resv = vma_resv_map(vma);
struct hugepage_subpool *spool = subpool_vma(vma);
unsigned long reserve, start, end;
+ long gbl_reserve;
if (!resv || !is_vma_resv_set(vma, HPAGE_RESV_OWNER))
return;
@@ -2466,8 +2649,12 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma)
kref_put(&resv->refs, resv_map_release);
if (reserve) {
- hugetlb_acct_memory(h, -reserve);
- hugepage_subpool_put_pages(spool, reserve);
+ /*
+ * Decrement reserve counts. The global reserve count may be
+ * adjusted if the subpool has a minimum size.
+ */
+ gbl_reserve = hugepage_subpool_put_pages(spool, reserve);
+ hugetlb_acct_memory(h, -gbl_reserve);
}
}
@@ -2891,6 +3078,7 @@ retry_avoidcopy:
copy_user_huge_page(new_page, old_page, address, vma,
pages_per_huge_page(h));
__SetPageUptodate(new_page);
+ set_page_huge_active(new_page);
mmun_start = address & huge_page_mask(h);
mmun_end = mmun_start + huge_page_size(h);
@@ -3003,6 +3191,7 @@ retry:
}
clear_huge_page(page, address, pages_per_huge_page(h));
__SetPageUptodate(page);
+ set_page_huge_active(page);
if (vma->vm_flags & VM_MAYSHARE) {
int err;
@@ -3278,6 +3467,15 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
struct page *page;
/*
+ * If we have a pending SIGKILL, don't keep faulting pages and
+ * potentially allocating memory.
+ */
+ if (unlikely(fatal_signal_pending(current))) {
+ remainder = 0;
+ break;
+ }
+
+ /*
* Some archs (sparc64, sh*) have multiple pte_ts to
* each hugepage. We have to make sure we get the
* first, for the page indexing below to work.
@@ -3438,6 +3636,7 @@ int hugetlb_reserve_pages(struct inode *inode,
struct hstate *h = hstate_inode(inode);
struct hugepage_subpool *spool = subpool_inode(inode);
struct resv_map *resv_map;
+ long gbl_reserve;
/*
* Only apply hugepage reservation if asked. At fault time, an
@@ -3474,8 +3673,13 @@ int hugetlb_reserve_pages(struct inode *inode,
goto out_err;
}
- /* There must be enough pages in the subpool for the mapping */
- if (hugepage_subpool_get_pages(spool, chg)) {
+ /*
+ * There must be enough pages in the subpool for the mapping. If
+ * the subpool has a minimum size, there may be some global
+ * reservations already in place (gbl_reserve).
+ */
+ gbl_reserve = hugepage_subpool_get_pages(spool, chg);
+ if (gbl_reserve < 0) {
ret = -ENOSPC;
goto out_err;
}
@@ -3484,9 +3688,10 @@ int hugetlb_reserve_pages(struct inode *inode,
* Check enough hugepages are available for the reservation.
* Hand the pages back to the subpool if there are not
*/
- ret = hugetlb_acct_memory(h, chg);
+ ret = hugetlb_acct_memory(h, gbl_reserve);
if (ret < 0) {
- hugepage_subpool_put_pages(spool, chg);
+ /* put back original number of pages, chg */
+ (void)hugepage_subpool_put_pages(spool, chg);
goto out_err;
}
@@ -3501,8 +3706,24 @@ int hugetlb_reserve_pages(struct inode *inode,
* consumed reservations are stored in the map. Hence, nothing
* else has to be done for private mappings here
*/
- if (!vma || vma->vm_flags & VM_MAYSHARE)
- region_add(resv_map, from, to);
+ if (!vma || vma->vm_flags & VM_MAYSHARE) {
+ long add = region_add(resv_map, from, to);
+
+ if (unlikely(chg > add)) {
+ /*
+ * pages in this range were added to the reserve
+ * map between region_chg and region_add. This
+ * indicates a race with alloc_huge_page. Adjust
+ * the subpool and reserve counts modified above
+ * based on the difference.
+ */
+ long rsv_adjust;
+
+ rsv_adjust = hugepage_subpool_put_pages(spool,
+ chg - add);
+ hugetlb_acct_memory(h, -rsv_adjust);
+ }
+ }
return 0;
out_err:
if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER))
@@ -3516,6 +3737,7 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
struct resv_map *resv_map = inode_resv_map(inode);
long chg = 0;
struct hugepage_subpool *spool = subpool_inode(inode);
+ long gbl_reserve;
if (resv_map)
chg = region_truncate(resv_map, offset);
@@ -3523,8 +3745,12 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
inode->i_blocks -= (blocks_per_huge_page(h) * freed);
spin_unlock(&inode->i_lock);
- hugepage_subpool_put_pages(spool, (chg - freed));
- hugetlb_acct_memory(h, -(chg - freed));
+ /*
+ * If the subpool has a minimum size, the number of global
+ * reservations to be released may be adjusted.
+ */
+ gbl_reserve = hugepage_subpool_put_pages(spool, (chg - freed));
+ hugetlb_acct_memory(h, -gbl_reserve);
}
#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
@@ -3659,6 +3885,11 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
{
return NULL;
}
+
+int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
+{
+ return 0;
+}
#define want_pmd_share() (0)
#endif /* CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
@@ -3735,8 +3966,7 @@ retry:
if (!pmd_huge(*pmd))
goto out;
if (pmd_present(*pmd)) {
- page = pte_page(*(pte_t *)pmd) +
- ((address & ~PMD_MASK) >> PAGE_SHIFT);
+ page = pmd_page(*pmd) + ((address & ~PMD_MASK) >> PAGE_SHIFT);
if (flags & FOLL_GET)
get_page(page);
} else {
@@ -3767,20 +3997,6 @@ follow_huge_pud(struct mm_struct *mm, unsigned long address,
#ifdef CONFIG_MEMORY_FAILURE
-/* Should be called in hugetlb_lock */
-static int is_hugepage_on_freelist(struct page *hpage)
-{
- struct page *page;
- struct page *tmp;
- struct hstate *h = page_hstate(hpage);
- int nid = page_to_nid(hpage);
-
- list_for_each_entry_safe(page, tmp, &h->hugepage_freelists[nid], lru)
- if (page == hpage)
- return 1;
- return 0;
-}
-
/*
* This function is called from memory failure code.
* Assume the caller holds page lock of the head page.
@@ -3792,7 +4008,11 @@ int dequeue_hwpoisoned_huge_page(struct page *hpage)
int ret = -EBUSY;
spin_lock(&hugetlb_lock);
- if (is_hugepage_on_freelist(hpage)) {
+ /*
+ * Just checking !page_huge_active is not enough, because that could be
+ * an isolated/hwpoisoned hugepage (which have >0 refcount).
+ */
+ if (!page_huge_active(hpage) && !page_count(hpage)) {
/*
* Hwpoisoned hugepage isn't linked to activelist or freelist,
* but dangling hpage->lru can trigger list-debug warnings
@@ -3812,42 +4032,27 @@ int dequeue_hwpoisoned_huge_page(struct page *hpage)
bool isolate_huge_page(struct page *page, struct list_head *list)
{
+ bool ret = true;
+
VM_BUG_ON_PAGE(!PageHead(page), page);
- if (!get_page_unless_zero(page))
- return false;
spin_lock(&hugetlb_lock);
+ if (!page_huge_active(page) || !get_page_unless_zero(page)) {
+ ret = false;
+ goto unlock;
+ }
+ clear_page_huge_active(page);
list_move_tail(&page->lru, list);
+unlock:
spin_unlock(&hugetlb_lock);
- return true;
+ return ret;
}
void putback_active_hugepage(struct page *page)
{
VM_BUG_ON_PAGE(!PageHead(page), page);
spin_lock(&hugetlb_lock);
+ set_page_huge_active(page);
list_move_tail(&page->lru, &(page_hstate(page))->hugepage_activelist);
spin_unlock(&hugetlb_lock);
put_page(page);
}
-
-bool is_hugepage_active(struct page *page)
-{
- VM_BUG_ON_PAGE(!PageHuge(page), page);
- /*
- * This function can be called for a tail page because the caller,
- * scan_movable_pages, scans through a given pfn-range which typically
- * covers one memory block. In systems using gigantic hugepage (1GB
- * for x86_64,) a hugepage is larger than a memory block, and we don't
- * support migrating such large hugepages for now, so return false
- * when called for tail pages.
- */
- if (PageTail(page))
- return false;
- /*
- * Refcount of a hwpoisoned hugepages is 1, but they are not active,
- * so we should return false for them.
- */
- if (unlikely(PageHWPoison(page)))
- return false;
- return page_count(page) > 0;
-}
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c
index 329caf56df22..bf73ac17dad4 100644
--- a/mm/hwpoison-inject.c
+++ b/mm/hwpoison-inject.c
@@ -28,19 +28,19 @@ static int hwpoison_inject(void *data, u64 val)
/*
* This implies unable to support free buddy pages.
*/
- if (!get_page_unless_zero(hpage))
+ if (!get_hwpoison_page(p))
return 0;
if (!hwpoison_filter_enable)
goto inject;
- if (!PageLRU(p) && !PageHuge(p))
- shake_page(p, 0);
+ if (!PageLRU(hpage) && !PageHuge(p))
+ shake_page(hpage, 0);
/*
* This implies unable to support non-LRU pages.
*/
- if (!PageLRU(p) && !PageHuge(p))
- return 0;
+ if (!PageLRU(hpage) && !PageHuge(p))
+ goto put_out;
/*
* do a racy check with elevated page count, to make sure PG_hwpoison
@@ -52,11 +52,14 @@ static int hwpoison_inject(void *data, u64 val)
err = hwpoison_filter(hpage);
unlock_page(hpage);
if (err)
- return 0;
+ goto put_out;
inject:
pr_info("Injecting memory failure at pfn %#lx\n", pfn);
return memory_failure(pfn, 18, MF_COUNT_INCREASED);
+put_out:
+ put_page(p);
+ return 0;
}
static int hwpoison_unpoison(void *data, u64 val)
diff --git a/mm/internal.h b/mm/internal.h
index a96da5b0029d..36b23f1e2ca6 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -155,7 +155,8 @@ __find_buddy_index(unsigned long page_idx, unsigned int order)
}
extern int __isolate_free_page(struct page *page, unsigned int order);
-extern void __free_pages_bootmem(struct page *page, unsigned int order);
+extern void __free_pages_bootmem(struct page *page, unsigned long pfn,
+ unsigned int order);
extern void prep_compound_page(struct page *page, unsigned long order);
#ifdef CONFIG_MEMORY_FAILURE
extern bool is_free_buddy_page(struct page *page);
@@ -200,6 +201,8 @@ isolate_freepages_range(struct compact_control *cc,
unsigned long
isolate_migratepages_range(struct compact_control *cc,
unsigned long low_pfn, unsigned long end_pfn);
+int find_suitable_fallback(struct free_area *area, unsigned int order,
+ int migratetype, bool only_stealable, bool *can_steal);
#endif
@@ -222,13 +225,13 @@ static inline unsigned long page_order(struct page *page)
* PageBuddy() should be checked first by the caller to minimize race window,
* and invalid values must be handled gracefully.
*
- * ACCESS_ONCE is used so that if the caller assigns the result into a local
+ * READ_ONCE is used so that if the caller assigns the result into a local
* variable and e.g. tests it for valid range before using, the compiler cannot
* decide to remove the variable and inline the page_private(page) multiple
* times, potentially observing different values in the tests and the actual
* use of the result.
*/
-#define page_order_unsafe(page) ACCESS_ONCE(page_private(page))
+#define page_order_unsafe(page) READ_ONCE(page_private(page))
static inline bool is_cow_mapping(vm_flags_t flags)
{
@@ -240,7 +243,7 @@ void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
struct vm_area_struct *prev, struct rb_node *rb_parent);
#ifdef CONFIG_MMU
-extern long __mlock_vma_pages_range(struct vm_area_struct *vma,
+extern long populate_vma_page_range(struct vm_area_struct *vma,
unsigned long start, unsigned long end, int *nonblocking);
extern void munlock_vma_pages_range(struct vm_area_struct *vma,
unsigned long start, unsigned long end);
@@ -359,10 +362,7 @@ do { \
} while (0)
extern void mminit_verify_pageflags_layout(void);
-extern void mminit_verify_page_links(struct page *page,
- enum zone_type zone, unsigned long nid, unsigned long pfn);
extern void mminit_verify_zonelist(void);
-
#else
static inline void mminit_dprintk(enum mminit_level level,
@@ -374,11 +374,6 @@ static inline void mminit_verify_pageflags_layout(void)
{
}
-static inline void mminit_verify_page_links(struct page *page,
- enum zone_type zone, unsigned long nid, unsigned long pfn)
-{
-}
-
static inline void mminit_verify_zonelist(void)
{
}
diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c
index 936d81661c47..6c513a63ea84 100644
--- a/mm/kasan/kasan.c
+++ b/mm/kasan/kasan.c
@@ -389,6 +389,19 @@ void kasan_krealloc(const void *object, size_t size)
kasan_kmalloc(page->slab_cache, object, size);
}
+void kasan_kfree(void *ptr)
+{
+ struct page *page;
+
+ page = virt_to_head_page(ptr);
+
+ if (unlikely(!PageSlab(page)))
+ kasan_poison_shadow(ptr, PAGE_SIZE << compound_order(page),
+ KASAN_FREE_PAGE);
+ else
+ kasan_slab_free(page->slab_cache, ptr);
+}
+
void kasan_kfree_large(const void *ptr)
{
struct page *page = virt_to_page(ptr);
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
index 4986b0acab21..c242adf6bc85 100644
--- a/mm/kasan/kasan.h
+++ b/mm/kasan/kasan.h
@@ -7,7 +7,6 @@
#define KASAN_SHADOW_MASK (KASAN_SHADOW_SCALE_SIZE - 1)
#define KASAN_FREE_PAGE 0xFF /* page was freed */
-#define KASAN_FREE_PAGE 0xFF /* page was freed */
#define KASAN_PAGE_REDZONE 0xFE /* redzone for kmalloc_large allocations */
#define KASAN_KMALLOC_REDZONE 0xFC /* redzone inside slub object */
#define KASAN_KMALLOC_FREE 0xFB /* object was freed (kmem_cache_free/kfree) */
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 5405aff5a590..cf79f110157c 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -53,6 +53,13 @@
* modifications to the memory scanning parameters including the scan_thread
* pointer
*
+ * Locks and mutexes are acquired/nested in the following order:
+ *
+ * scan_mutex [-> object->lock] -> kmemleak_lock -> other_object->lock (SINGLE_DEPTH_NESTING)
+ *
+ * No kmemleak_lock and object->lock nesting is allowed outside scan_mutex
+ * regions.
+ *
* The kmemleak_object structures have a use_count incremented or decremented
* using the get_object()/put_object() functions. When the use_count becomes
* 0, this count can no longer be incremented and put_object() schedules the
@@ -115,7 +122,8 @@
#define BYTES_PER_POINTER sizeof(void *)
/* GFP bitmask for kmemleak internal allocations */
-#define gfp_kmemleak_mask(gfp) (((gfp) & (GFP_KERNEL | GFP_ATOMIC)) | \
+#define gfp_kmemleak_mask(gfp) (((gfp) & (GFP_KERNEL | GFP_ATOMIC | \
+ __GFP_NOACCOUNT)) | \
__GFP_NORETRY | __GFP_NOMEMALLOC | \
__GFP_NOWARN)
@@ -194,6 +202,8 @@ static struct kmem_cache *scan_area_cache;
/* set if tracing memory operations is enabled */
static int kmemleak_enabled;
+/* same as above but only for the kmemleak_free() callback */
+static int kmemleak_free_enabled;
/* set in the late_initcall if there were no errors */
static int kmemleak_initialized;
/* enables or disables early logging of the memory operations */
@@ -482,8 +492,7 @@ static struct kmemleak_object *find_and_get_object(unsigned long ptr, int alias)
rcu_read_lock();
read_lock_irqsave(&kmemleak_lock, flags);
- if (ptr >= min_addr && ptr < max_addr)
- object = lookup_object(ptr, alias);
+ object = lookup_object(ptr, alias);
read_unlock_irqrestore(&kmemleak_lock, flags);
/* check whether the object is still available */
@@ -495,6 +504,27 @@ static struct kmemleak_object *find_and_get_object(unsigned long ptr, int alias)
}
/*
+ * Look up an object in the object search tree and remove it from both
+ * object_tree_root and object_list. The returned object's use_count should be
+ * at least 1, as initially set by create_object().
+ */
+static struct kmemleak_object *find_and_remove_object(unsigned long ptr, int alias)
+{
+ unsigned long flags;
+ struct kmemleak_object *object;
+
+ write_lock_irqsave(&kmemleak_lock, flags);
+ object = lookup_object(ptr, alias);
+ if (object) {
+ rb_erase(&object->rb_node, &object_tree_root);
+ list_del_rcu(&object->object_list);
+ }
+ write_unlock_irqrestore(&kmemleak_lock, flags);
+
+ return object;
+}
+
+/*
* Save stack trace to the given array of MAX_TRACE size.
*/
static int __save_stack_trace(unsigned long *trace)
@@ -579,11 +609,13 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
kmemleak_stop("Cannot insert 0x%lx into the object "
"search tree (overlaps existing)\n",
ptr);
+ /*
+ * No need for parent->lock here since "parent" cannot
+ * be freed while the kmemleak_lock is held.
+ */
+ dump_object_info(parent);
kmem_cache_free(object_cache, object);
- object = parent;
- spin_lock(&object->lock);
- dump_object_info(object);
- spin_unlock(&object->lock);
+ object = NULL;
goto out;
}
}
@@ -597,20 +629,14 @@ out:
}
/*
- * Remove the metadata (struct kmemleak_object) for a memory block from the
- * object_list and object_tree_root and decrement its use_count.
+ * Mark the object as not allocated and schedule RCU freeing via put_object().
*/
static void __delete_object(struct kmemleak_object *object)
{
unsigned long flags;
- write_lock_irqsave(&kmemleak_lock, flags);
- rb_erase(&object->rb_node, &object_tree_root);
- list_del_rcu(&object->object_list);
- write_unlock_irqrestore(&kmemleak_lock, flags);
-
WARN_ON(!(object->flags & OBJECT_ALLOCATED));
- WARN_ON(atomic_read(&object->use_count) < 2);
+ WARN_ON(atomic_read(&object->use_count) < 1);
/*
* Locking here also ensures that the corresponding memory block
@@ -630,7 +656,7 @@ static void delete_object_full(unsigned long ptr)
{
struct kmemleak_object *object;
- object = find_and_get_object(ptr, 0);
+ object = find_and_remove_object(ptr, 0);
if (!object) {
#ifdef DEBUG
kmemleak_warn("Freeing unknown object at 0x%08lx\n",
@@ -639,7 +665,6 @@ static void delete_object_full(unsigned long ptr)
return;
}
__delete_object(object);
- put_object(object);
}
/*
@@ -652,7 +677,7 @@ static void delete_object_part(unsigned long ptr, size_t size)
struct kmemleak_object *object;
unsigned long start, end;
- object = find_and_get_object(ptr, 1);
+ object = find_and_remove_object(ptr, 1);
if (!object) {
#ifdef DEBUG
kmemleak_warn("Partially freeing unknown object at 0x%08lx "
@@ -660,7 +685,6 @@ static void delete_object_part(unsigned long ptr, size_t size)
#endif
return;
}
- __delete_object(object);
/*
* Create one or two objects that may result from the memory block
@@ -678,7 +702,7 @@ static void delete_object_part(unsigned long ptr, size_t size)
create_object(ptr + size, end - ptr - size, object->min_count,
GFP_KERNEL);
- put_object(object);
+ __delete_object(object);
}
static void __paint_it(struct kmemleak_object *object, int color)
@@ -906,12 +930,13 @@ EXPORT_SYMBOL_GPL(kmemleak_alloc);
* kmemleak_alloc_percpu - register a newly allocated __percpu object
* @ptr: __percpu pointer to beginning of the object
* @size: size of the object
+ * @gfp: flags used for kmemleak internal memory allocations
*
* This function is called from the kernel percpu allocator when a new object
- * (memory block) is allocated (alloc_percpu). It assumes GFP_KERNEL
- * allocation.
+ * (memory block) is allocated (alloc_percpu).
*/
-void __ref kmemleak_alloc_percpu(const void __percpu *ptr, size_t size)
+void __ref kmemleak_alloc_percpu(const void __percpu *ptr, size_t size,
+ gfp_t gfp)
{
unsigned int cpu;
@@ -924,7 +949,7 @@ void __ref kmemleak_alloc_percpu(const void __percpu *ptr, size_t size)
if (kmemleak_enabled && ptr && !IS_ERR(ptr))
for_each_possible_cpu(cpu)
create_object((unsigned long)per_cpu_ptr(ptr, cpu),
- size, 0, GFP_KERNEL);
+ size, 0, gfp);
else if (kmemleak_early_log)
log_early(KMEMLEAK_ALLOC_PERCPU, ptr, size, 0);
}
@@ -941,7 +966,7 @@ void __ref kmemleak_free(const void *ptr)
{
pr_debug("%s(0x%p)\n", __func__, ptr);
- if (kmemleak_enabled && ptr && !IS_ERR(ptr))
+ if (kmemleak_free_enabled && ptr && !IS_ERR(ptr))
delete_object_full((unsigned long)ptr);
else if (kmemleak_early_log)
log_early(KMEMLEAK_FREE, ptr, 0, 0);
@@ -981,7 +1006,7 @@ void __ref kmemleak_free_percpu(const void __percpu *ptr)
pr_debug("%s(0x%p)\n", __func__, ptr);
- if (kmemleak_enabled && ptr && !IS_ERR(ptr))
+ if (kmemleak_free_enabled && ptr && !IS_ERR(ptr))
for_each_possible_cpu(cpu)
delete_object_full((unsigned long)per_cpu_ptr(ptr,
cpu));
@@ -1147,19 +1172,18 @@ static int scan_should_stop(void)
* found to the gray list.
*/
static void scan_block(void *_start, void *_end,
- struct kmemleak_object *scanned, int allow_resched)
+ struct kmemleak_object *scanned)
{
unsigned long *ptr;
unsigned long *start = PTR_ALIGN(_start, BYTES_PER_POINTER);
unsigned long *end = _end - (BYTES_PER_POINTER - 1);
+ unsigned long flags;
+ read_lock_irqsave(&kmemleak_lock, flags);
for (ptr = start; ptr < end; ptr++) {
struct kmemleak_object *object;
- unsigned long flags;
unsigned long pointer;
- if (allow_resched)
- cond_resched();
if (scan_should_stop())
break;
@@ -1172,26 +1196,31 @@ static void scan_block(void *_start, void *_end,
pointer = *ptr;
kasan_enable_current();
- object = find_and_get_object(pointer, 1);
+ if (pointer < min_addr || pointer >= max_addr)
+ continue;
+
+ /*
+ * No need for get_object() here since we hold kmemleak_lock.
+ * object->use_count cannot be dropped to 0 while the object
+ * is still present in object_tree_root and object_list
+ * (with updates protected by kmemleak_lock).
+ */
+ object = lookup_object(pointer, 1);
if (!object)
continue;
- if (object == scanned) {
+ if (object == scanned)
/* self referenced, ignore */
- put_object(object);
continue;
- }
/*
* Avoid the lockdep recursive warning on object->lock being
* previously acquired in scan_object(). These locks are
* enclosed by scan_mutex.
*/
- spin_lock_irqsave_nested(&object->lock, flags,
- SINGLE_DEPTH_NESTING);
+ spin_lock_nested(&object->lock, SINGLE_DEPTH_NESTING);
if (!color_white(object)) {
/* non-orphan, ignored or new */
- spin_unlock_irqrestore(&object->lock, flags);
- put_object(object);
+ spin_unlock(&object->lock);
continue;
}
@@ -1203,13 +1232,27 @@ static void scan_block(void *_start, void *_end,
*/
object->count++;
if (color_gray(object)) {
+ /* put_object() called when removing from gray_list */
+ WARN_ON(!get_object(object));
list_add_tail(&object->gray_list, &gray_list);
- spin_unlock_irqrestore(&object->lock, flags);
- continue;
}
+ spin_unlock(&object->lock);
+ }
+ read_unlock_irqrestore(&kmemleak_lock, flags);
+}
- spin_unlock_irqrestore(&object->lock, flags);
- put_object(object);
+/*
+ * Scan a large memory block in MAX_SCAN_SIZE chunks to reduce the latency.
+ */
+static void scan_large_block(void *start, void *end)
+{
+ void *next;
+
+ while (start < end) {
+ next = min(start + MAX_SCAN_SIZE, end);
+ scan_block(start, next, NULL);
+ start = next;
+ cond_resched();
}
}
@@ -1235,22 +1278,25 @@ static void scan_object(struct kmemleak_object *object)
if (hlist_empty(&object->area_list)) {
void *start = (void *)object->pointer;
void *end = (void *)(object->pointer + object->size);
+ void *next;
- while (start < end && (object->flags & OBJECT_ALLOCATED) &&
- !(object->flags & OBJECT_NO_SCAN)) {
- scan_block(start, min(start + MAX_SCAN_SIZE, end),
- object, 0);
- start += MAX_SCAN_SIZE;
+ do {
+ next = min(start + MAX_SCAN_SIZE, end);
+ scan_block(start, next, object);
+
+ start = next;
+ if (start >= end)
+ break;
spin_unlock_irqrestore(&object->lock, flags);
cond_resched();
spin_lock_irqsave(&object->lock, flags);
- }
+ } while (object->flags & OBJECT_ALLOCATED);
} else
hlist_for_each_entry(area, &object->area_list, node)
scan_block((void *)area->start,
(void *)(area->start + area->size),
- object, 0);
+ object);
out:
spin_unlock_irqrestore(&object->lock, flags);
}
@@ -1327,14 +1373,14 @@ static void kmemleak_scan(void)
rcu_read_unlock();
/* data/bss scanning */
- scan_block(_sdata, _edata, NULL, 1);
- scan_block(__bss_start, __bss_stop, NULL, 1);
+ scan_large_block(_sdata, _edata);
+ scan_large_block(__bss_start, __bss_stop);
#ifdef CONFIG_SMP
/* per-cpu sections scanning */
for_each_possible_cpu(i)
- scan_block(__per_cpu_start + per_cpu_offset(i),
- __per_cpu_end + per_cpu_offset(i), NULL, 1);
+ scan_large_block(__per_cpu_start + per_cpu_offset(i),
+ __per_cpu_end + per_cpu_offset(i));
#endif
/*
@@ -1355,7 +1401,7 @@ static void kmemleak_scan(void)
/* only scan if page is in use */
if (page_count(page) == 0)
continue;
- scan_block(page, page + 1, NULL, 1);
+ scan_block(page, page + 1, NULL);
}
}
put_online_mems();
@@ -1369,7 +1415,7 @@ static void kmemleak_scan(void)
read_lock(&tasklist_lock);
do_each_thread(g, p) {
scan_block(task_stack_page(p), task_stack_page(p) +
- THREAD_SIZE, NULL, 0);
+ THREAD_SIZE, NULL);
} while_each_thread(g, p);
read_unlock(&tasklist_lock);
}
@@ -1746,15 +1792,20 @@ static void __kmemleak_do_cleanup(void)
*/
static void kmemleak_do_cleanup(struct work_struct *work)
{
- mutex_lock(&scan_mutex);
stop_scan_thread();
+ /*
+ * Once the scan thread has stopped, it is safe to no longer track
+ * object freeing. Ordering of the scan thread stopping and the memory
+ * accesses below is guaranteed by the kthread_stop() function.
+ */
+ kmemleak_free_enabled = 0;
+
if (!kmemleak_found_leaks)
__kmemleak_do_cleanup();
else
pr_info("Kmemleak disabled without freeing internal data. "
"Reclaim the memory with \"echo clear > /sys/kernel/debug/kmemleak\"\n");
- mutex_unlock(&scan_mutex);
}
static DECLARE_WORK(cleanup_work, kmemleak_do_cleanup);
@@ -1775,6 +1826,8 @@ static void kmemleak_disable(void)
/* check whether it is too early for a kernel thread */
if (kmemleak_initialized)
schedule_work(&cleanup_work);
+ else
+ kmemleak_free_enabled = 0;
pr_info("Kernel memory leak detector disabled\n");
}
@@ -1839,8 +1892,10 @@ void __init kmemleak_init(void)
if (kmemleak_error) {
local_irq_restore(flags);
return;
- } else
+ } else {
kmemleak_enabled = 1;
+ kmemleak_free_enabled = 1;
+ }
local_irq_restore(flags);
/*
diff --git a/mm/ksm.c b/mm/ksm.c
index 4162dce2eb44..7ee101eaacdf 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -542,7 +542,7 @@ static struct page *get_ksm_page(struct stable_node *stable_node, bool lock_it)
expected_mapping = (void *)stable_node +
(PAGE_MAPPING_ANON | PAGE_MAPPING_KSM);
again:
- kpfn = ACCESS_ONCE(stable_node->kpfn);
+ kpfn = READ_ONCE(stable_node->kpfn);
page = pfn_to_page(kpfn);
/*
@@ -551,7 +551,7 @@ again:
* but on Alpha we need to be more careful.
*/
smp_read_barrier_depends();
- if (ACCESS_ONCE(page->mapping) != expected_mapping)
+ if (READ_ONCE(page->mapping) != expected_mapping)
goto stale;
/*
@@ -577,14 +577,14 @@ again:
cpu_relax();
}
- if (ACCESS_ONCE(page->mapping) != expected_mapping) {
+ if (READ_ONCE(page->mapping) != expected_mapping) {
put_page(page);
goto stale;
}
if (lock_it) {
lock_page(page);
- if (ACCESS_ONCE(page->mapping) != expected_mapping) {
+ if (READ_ONCE(page->mapping) != expected_mapping) {
unlock_page(page);
put_page(page);
goto stale;
@@ -600,7 +600,7 @@ stale:
* before checking whether node->kpfn has been changed.
*/
smp_rmb();
- if (ACCESS_ONCE(stable_node->kpfn) != kpfn)
+ if (READ_ONCE(stable_node->kpfn) != kpfn)
goto again;
remove_node_from_stable_tree(stable_node);
return NULL;
diff --git a/mm/madvise.c b/mm/madvise.c
index d551475517bf..64bb8a22110c 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -17,6 +17,7 @@
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/blkdev.h>
+#include <linux/backing-dev.h>
#include <linux/swap.h>
#include <linux/swapops.h>
diff --git a/mm/memblock.c b/mm/memblock.c
index 252b77bdf65e..87108e77e476 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -54,10 +54,16 @@ int memblock_debug __initdata_memblock;
#ifdef CONFIG_MOVABLE_NODE
bool movable_node_enabled __initdata_memblock = false;
#endif
+static bool system_has_some_mirror __initdata_memblock = false;
static int memblock_can_resize __initdata_memblock;
static int memblock_memory_in_slab __initdata_memblock = 0;
static int memblock_reserved_in_slab __initdata_memblock = 0;
+ulong __init_memblock choose_memblock_flags(void)
+{
+ return system_has_some_mirror ? MEMBLOCK_MIRROR : MEMBLOCK_NONE;
+}
+
/* inline so we don't get a warning when pr_debug is compiled out */
static __init_memblock const char *
memblock_type_name(struct memblock_type *type)
@@ -107,6 +113,7 @@ static long __init_memblock memblock_overlaps_region(struct memblock_type *type,
* @size: size of free area to find
* @align: alignment of free area to find
* @nid: nid of the free area to find, %NUMA_NO_NODE for any node
+ * @flags: pick from blocks based on memory attributes
*
* Utility called from memblock_find_in_range_node(), find free area bottom-up.
*
@@ -115,12 +122,13 @@ static long __init_memblock memblock_overlaps_region(struct memblock_type *type,
*/
static phys_addr_t __init_memblock
__memblock_find_range_bottom_up(phys_addr_t start, phys_addr_t end,
- phys_addr_t size, phys_addr_t align, int nid)
+ phys_addr_t size, phys_addr_t align, int nid,
+ ulong flags)
{
phys_addr_t this_start, this_end, cand;
u64 i;
- for_each_free_mem_range(i, nid, &this_start, &this_end, NULL) {
+ for_each_free_mem_range(i, nid, flags, &this_start, &this_end, NULL) {
this_start = clamp(this_start, start, end);
this_end = clamp(this_end, start, end);
@@ -139,6 +147,7 @@ __memblock_find_range_bottom_up(phys_addr_t start, phys_addr_t end,
* @size: size of free area to find
* @align: alignment of free area to find
* @nid: nid of the free area to find, %NUMA_NO_NODE for any node
+ * @flags: pick from blocks based on memory attributes
*
* Utility called from memblock_find_in_range_node(), find free area top-down.
*
@@ -147,12 +156,14 @@ __memblock_find_range_bottom_up(phys_addr_t start, phys_addr_t end,
*/
static phys_addr_t __init_memblock
__memblock_find_range_top_down(phys_addr_t start, phys_addr_t end,
- phys_addr_t size, phys_addr_t align, int nid)
+ phys_addr_t size, phys_addr_t align, int nid,
+ ulong flags)
{
phys_addr_t this_start, this_end, cand;
u64 i;
- for_each_free_mem_range_reverse(i, nid, &this_start, &this_end, NULL) {
+ for_each_free_mem_range_reverse(i, nid, flags, &this_start, &this_end,
+ NULL) {
this_start = clamp(this_start, start, end);
this_end = clamp(this_end, start, end);
@@ -174,6 +185,7 @@ __memblock_find_range_top_down(phys_addr_t start, phys_addr_t end,
* @start: start of candidate range
* @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE}
* @nid: nid of the free area to find, %NUMA_NO_NODE for any node
+ * @flags: pick from blocks based on memory attributes
*
* Find @size free area aligned to @align in the specified range and node.
*
@@ -190,7 +202,7 @@ __memblock_find_range_top_down(phys_addr_t start, phys_addr_t end,
*/
phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size,
phys_addr_t align, phys_addr_t start,
- phys_addr_t end, int nid)
+ phys_addr_t end, int nid, ulong flags)
{
phys_addr_t kernel_end, ret;
@@ -215,7 +227,7 @@ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size,
/* ok, try bottom-up allocation first */
ret = __memblock_find_range_bottom_up(bottom_up_start, end,
- size, align, nid);
+ size, align, nid, flags);
if (ret)
return ret;
@@ -233,7 +245,8 @@ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size,
"memory hotunplug may be affected\n");
}
- return __memblock_find_range_top_down(start, end, size, align, nid);
+ return __memblock_find_range_top_down(start, end, size, align, nid,
+ flags);
}
/**
@@ -252,8 +265,21 @@ phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start,
phys_addr_t end, phys_addr_t size,
phys_addr_t align)
{
- return memblock_find_in_range_node(size, align, start, end,
- NUMA_NO_NODE);
+ phys_addr_t ret;
+ ulong flags = choose_memblock_flags();
+
+again:
+ ret = memblock_find_in_range_node(size, align, start, end,
+ NUMA_NO_NODE, flags);
+
+ if (!ret && (flags & MEMBLOCK_MIRROR)) {
+ pr_warn("Could not allocate %pap bytes of mirrored memory\n",
+ &size);
+ flags &= ~MEMBLOCK_MIRROR;
+ goto again;
+ }
+
+ return ret;
}
static void __init_memblock memblock_remove_region(struct memblock_type *type, unsigned long r)
@@ -580,10 +606,24 @@ int __init_memblock memblock_add_node(phys_addr_t base, phys_addr_t size,
return memblock_add_range(&memblock.memory, base, size, nid, 0);
}
+static int __init_memblock memblock_add_region(phys_addr_t base,
+ phys_addr_t size,
+ int nid,
+ unsigned long flags)
+{
+ struct memblock_type *_rgn = &memblock.memory;
+
+ memblock_dbg("memblock_add: [%#016llx-%#016llx] flags %#02lx %pF\n",
+ (unsigned long long)base,
+ (unsigned long long)base + size - 1,
+ flags, (void *)_RET_IP_);
+
+ return memblock_add_range(_rgn, base, size, nid, flags);
+}
+
int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size)
{
- return memblock_add_range(&memblock.memory, base, size,
- MAX_NUMNODES, 0);
+ return memblock_add_region(base, size, MAX_NUMNODES, 0);
}
/**
@@ -699,14 +739,14 @@ static int __init_memblock memblock_reserve_region(phys_addr_t base,
int nid,
unsigned long flags)
{
- struct memblock_type *_rgn = &memblock.reserved;
+ struct memblock_type *type = &memblock.reserved;
memblock_dbg("memblock_reserve: [%#016llx-%#016llx] flags %#02lx %pF\n",
(unsigned long long)base,
(unsigned long long)base + size - 1,
flags, (void *)_RET_IP_);
- return memblock_add_range(_rgn, base, size, nid, flags);
+ return memblock_add_range(type, base, size, nid, flags);
}
int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size)
@@ -765,9 +805,57 @@ int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size)
}
/**
+ * memblock_mark_mirror - Mark mirrored memory with flag MEMBLOCK_MIRROR.
+ * @base: the base phys addr of the region
+ * @size: the size of the region
+ *
+ * Return 0 on succees, -errno on failure.
+ */
+int __init_memblock memblock_mark_mirror(phys_addr_t base, phys_addr_t size)
+{
+ system_has_some_mirror = true;
+
+ return memblock_setclr_flag(base, size, 1, MEMBLOCK_MIRROR);
+}
+
+
+/**
+ * __next_reserved_mem_region - next function for for_each_reserved_region()
+ * @idx: pointer to u64 loop variable
+ * @out_start: ptr to phys_addr_t for start address of the region, can be %NULL
+ * @out_end: ptr to phys_addr_t for end address of the region, can be %NULL
+ *
+ * Iterate over all reserved memory regions.
+ */
+void __init_memblock __next_reserved_mem_region(u64 *idx,
+ phys_addr_t *out_start,
+ phys_addr_t *out_end)
+{
+ struct memblock_type *rsv = &memblock.reserved;
+
+ if (*idx >= 0 && *idx < rsv->cnt) {
+ struct memblock_region *r = &rsv->regions[*idx];
+ phys_addr_t base = r->base;
+ phys_addr_t size = r->size;
+
+ if (out_start)
+ *out_start = base;
+ if (out_end)
+ *out_end = base + size - 1;
+
+ *idx += 1;
+ return;
+ }
+
+ /* signal end of iteration */
+ *idx = ULLONG_MAX;
+}
+
+/**
* __next__mem_range - next function for for_each_free_mem_range() etc.
* @idx: pointer to u64 loop variable
* @nid: node selector, %NUMA_NO_NODE for all nodes
+ * @flags: pick from blocks based on memory attributes
* @type_a: pointer to memblock_type from where the range is taken
* @type_b: pointer to memblock_type which excludes memory from being taken
* @out_start: ptr to phys_addr_t for start address of the range, can be %NULL
@@ -789,7 +877,7 @@ int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size)
* As both region arrays are sorted, the function advances the two indices
* in lockstep and returns each intersection.
*/
-void __init_memblock __next_mem_range(u64 *idx, int nid,
+void __init_memblock __next_mem_range(u64 *idx, int nid, ulong flags,
struct memblock_type *type_a,
struct memblock_type *type_b,
phys_addr_t *out_start,
@@ -817,6 +905,10 @@ void __init_memblock __next_mem_range(u64 *idx, int nid,
if (movable_node_is_enabled() && memblock_is_hotpluggable(m))
continue;
+ /* if we want mirror memory skip non-mirror memory regions */
+ if ((flags & MEMBLOCK_MIRROR) && !memblock_is_mirror(m))
+ continue;
+
if (!type_b) {
if (out_start)
*out_start = m_start;
@@ -881,6 +973,7 @@ void __init_memblock __next_mem_range(u64 *idx, int nid,
*
* @idx: pointer to u64 loop variable
* @nid: nid: node selector, %NUMA_NO_NODE for all nodes
+ * @flags: pick from blocks based on memory attributes
* @type_a: pointer to memblock_type from where the range is taken
* @type_b: pointer to memblock_type which excludes memory from being taken
* @out_start: ptr to phys_addr_t for start address of the range, can be %NULL
@@ -889,7 +982,7 @@ void __init_memblock __next_mem_range(u64 *idx, int nid,
*
* Reverse of __next_mem_range().
*/
-void __init_memblock __next_mem_range_rev(u64 *idx, int nid,
+void __init_memblock __next_mem_range_rev(u64 *idx, int nid, ulong flags,
struct memblock_type *type_a,
struct memblock_type *type_b,
phys_addr_t *out_start,
@@ -921,6 +1014,10 @@ void __init_memblock __next_mem_range_rev(u64 *idx, int nid,
if (movable_node_is_enabled() && memblock_is_hotpluggable(m))
continue;
+ /* if we want mirror memory skip non-mirror memory regions */
+ if ((flags & MEMBLOCK_MIRROR) && !memblock_is_mirror(m))
+ continue;
+
if (!type_b) {
if (out_start)
*out_start = m_start;
@@ -1036,14 +1133,15 @@ int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size,
static phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size,
phys_addr_t align, phys_addr_t start,
- phys_addr_t end, int nid)
+ phys_addr_t end, int nid, ulong flags)
{
phys_addr_t found;
if (!align)
align = SMP_CACHE_BYTES;
- found = memblock_find_in_range_node(size, align, start, end, nid);
+ found = memblock_find_in_range_node(size, align, start, end, nid,
+ flags);
if (found && !memblock_reserve(found, size)) {
/*
* The min_count is set to 0 so that memblock allocations are
@@ -1056,26 +1154,40 @@ static phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size,
}
phys_addr_t __init memblock_alloc_range(phys_addr_t size, phys_addr_t align,
- phys_addr_t start, phys_addr_t end)
+ phys_addr_t start, phys_addr_t end,
+ ulong flags)
{
- return memblock_alloc_range_nid(size, align, start, end, NUMA_NO_NODE);
+ return memblock_alloc_range_nid(size, align, start, end, NUMA_NO_NODE,
+ flags);
}
static phys_addr_t __init memblock_alloc_base_nid(phys_addr_t size,
phys_addr_t align, phys_addr_t max_addr,
- int nid)
+ int nid, ulong flags)
{
- return memblock_alloc_range_nid(size, align, 0, max_addr, nid);
+ return memblock_alloc_range_nid(size, align, 0, max_addr, nid, flags);
}
phys_addr_t __init memblock_alloc_nid(phys_addr_t size, phys_addr_t align, int nid)
{
- return memblock_alloc_base_nid(size, align, MEMBLOCK_ALLOC_ACCESSIBLE, nid);
+ ulong flags = choose_memblock_flags();
+ phys_addr_t ret;
+
+again:
+ ret = memblock_alloc_base_nid(size, align, MEMBLOCK_ALLOC_ACCESSIBLE,
+ nid, flags);
+
+ if (!ret && (flags & MEMBLOCK_MIRROR)) {
+ flags &= ~MEMBLOCK_MIRROR;
+ goto again;
+ }
+ return ret;
}
phys_addr_t __init __memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr)
{
- return memblock_alloc_base_nid(size, align, max_addr, NUMA_NO_NODE);
+ return memblock_alloc_base_nid(size, align, max_addr, NUMA_NO_NODE,
+ MEMBLOCK_NONE);
}
phys_addr_t __init memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr)
@@ -1139,6 +1251,7 @@ static void * __init memblock_virt_alloc_internal(
{
phys_addr_t alloc;
void *ptr;
+ ulong flags = choose_memblock_flags();
if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n"))
nid = NUMA_NO_NODE;
@@ -1159,13 +1272,14 @@ static void * __init memblock_virt_alloc_internal(
again:
alloc = memblock_find_in_range_node(size, align, min_addr, max_addr,
- nid);
+ nid, flags);
if (alloc)
goto done;
if (nid != NUMA_NO_NODE) {
alloc = memblock_find_in_range_node(size, align, min_addr,
- max_addr, NUMA_NO_NODE);
+ max_addr, NUMA_NO_NODE,
+ flags);
if (alloc)
goto done;
}
@@ -1173,10 +1287,16 @@ again:
if (min_addr) {
min_addr = 0;
goto again;
- } else {
- goto error;
}
+ if (flags & MEMBLOCK_MIRROR) {
+ flags &= ~MEMBLOCK_MIRROR;
+ pr_warn("Could not allocate %pap bytes of mirrored memory\n",
+ &size);
+ goto again;
+ }
+
+ return NULL;
done:
memblock_reserve(alloc, size);
ptr = phys_to_virt(alloc);
@@ -1191,9 +1311,6 @@ done:
kmemleak_alloc(ptr, size, 0, 0);
return ptr;
-
-error:
- return NULL;
}
/**
@@ -1302,7 +1419,7 @@ void __init __memblock_free_late(phys_addr_t base, phys_addr_t size)
end = PFN_DOWN(base + size);
for (; cursor < end; cursor++) {
- __free_pages_bootmem(pfn_to_page(cursor), 0);
+ __free_pages_bootmem(pfn_to_page(cursor), cursor, 0);
totalram_pages++;
}
}
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index b34ef4a32a3b..acb93c554f6e 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -14,6 +14,12 @@
* Copyright (C) 2012 Parallels Inc. and Google Inc.
* Authors: Glauber Costa and Suleiman Souhlal
*
+ * Native page reclaim
+ * Charge lifetime sanitation
+ * Lockless page tracking & accounting
+ * Unified hierarchy configuration model
+ * Copyright (C) 2015 Red Hat, Inc., Johannes Weiner
+ *
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
@@ -71,6 +77,7 @@ EXPORT_SYMBOL(memory_cgrp_subsys);
#define MEM_CGROUP_RECLAIM_RETRIES 5
static struct mem_cgroup *root_mem_cgroup __read_mostly;
+struct cgroup_subsys_state *mem_cgroup_root_css __read_mostly;
/* Whether the swap controller is active */
#ifdef CONFIG_MEMCG_SWAP
@@ -84,6 +91,7 @@ static const char * const mem_cgroup_stat_names[] = {
"rss",
"rss_huge",
"mapped_file",
+ "dirty",
"writeback",
"swap",
};
@@ -253,11 +261,6 @@ static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
* page cache and RSS per cgroup. We would eventually like to provide
* statistics based on the statistics developed by Rik Van Riel for clock-pro,
* to help the administrator determine what knobs to tune.
- *
- * TODO: Add a water mark for the memory controller. Reclaim will begin when
- * we hit the water mark. May be even add a low water mark, such that
- * no reclaim occurs from a cgroup at it's low water mark, this is
- * a feature that will be implemented much later in the future.
*/
struct mem_cgroup {
struct cgroup_subsys_state css;
@@ -284,9 +287,9 @@ struct mem_cgroup {
*/
bool use_hierarchy;
+ /* protected by memcg_oom_lock */
bool oom_lock;
- atomic_t under_oom;
- atomic_t oom_wakeups;
+ int under_oom;
int swappiness;
/* OOM-Killer disable */
@@ -321,11 +324,6 @@ struct mem_cgroup {
* percpu counter.
*/
struct mem_cgroup_stat_cpu __percpu *stat;
- /*
- * used when a cpu is offlined or other synchronizations
- * See mem_cgroup_read_stat().
- */
- struct mem_cgroup_stat_cpu nocpu_base;
spinlock_t pcp_counter_lock;
#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
@@ -345,6 +343,11 @@ struct mem_cgroup {
atomic_t numainfo_updating;
#endif
+#ifdef CONFIG_CGROUP_WRITEBACK
+ struct list_head cgwb_list;
+ struct wb_domain cgwb_domain;
+#endif
+
/* List of events which userspace want to receive */
struct list_head event_list;
spinlock_t event_list_lock;
@@ -454,6 +457,12 @@ static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
return memcg->css.id;
}
+/*
+ * A helper function to get mem_cgroup from ID. must be called under
+ * rcu_read_lock(). The caller is responsible for calling
+ * css_tryget_online() if the mem_cgroup is used for charging. (dropping
+ * refcnt from swap can be called against removed memcg.)
+ */
static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
{
struct cgroup_subsys_state *css;
@@ -589,6 +598,39 @@ struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg)
return &memcg->css;
}
+/**
+ * mem_cgroup_css_from_page - css of the memcg associated with a page
+ * @page: page of interest
+ *
+ * If memcg is bound to the default hierarchy, css of the memcg associated
+ * with @page is returned. The returned css remains associated with @page
+ * until it is released.
+ *
+ * If memcg is bound to a traditional hierarchy, the css of root_mem_cgroup
+ * is returned.
+ *
+ * XXX: The above description of behavior on the default hierarchy isn't
+ * strictly true yet as replace_page_cache_page() can modify the
+ * association before @page is released even on the default hierarchy;
+ * however, the current and planned usages don't mix the the two functions
+ * and replace_page_cache_page() will soon be updated to make the invariant
+ * actually true.
+ */
+struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page)
+{
+ struct mem_cgroup *memcg;
+
+ rcu_read_lock();
+
+ memcg = page->mem_cgroup;
+
+ if (!memcg || !cgroup_on_dfl(memcg->css.cgroup))
+ memcg = root_mem_cgroup;
+
+ rcu_read_unlock();
+ return &memcg->css;
+}
+
static struct mem_cgroup_per_zone *
mem_cgroup_page_zoneinfo(struct mem_cgroup *memcg, struct page *page)
{
@@ -667,7 +709,7 @@ static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz,
static unsigned long soft_limit_excess(struct mem_cgroup *memcg)
{
unsigned long nr_pages = page_counter_read(&memcg->memory);
- unsigned long soft_limit = ACCESS_ONCE(memcg->soft_limit);
+ unsigned long soft_limit = READ_ONCE(memcg->soft_limit);
unsigned long excess = 0;
if (nr_pages > soft_limit)
@@ -788,15 +830,8 @@ static long mem_cgroup_read_stat(struct mem_cgroup *memcg,
long val = 0;
int cpu;
- get_online_cpus();
- for_each_online_cpu(cpu)
+ for_each_possible_cpu(cpu)
val += per_cpu(memcg->stat->count[idx], cpu);
-#ifdef CONFIG_HOTPLUG_CPU
- spin_lock(&memcg->pcp_counter_lock);
- val += memcg->nocpu_base.count[idx];
- spin_unlock(&memcg->pcp_counter_lock);
-#endif
- put_online_cpus();
return val;
}
@@ -806,15 +841,8 @@ static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
unsigned long val = 0;
int cpu;
- get_online_cpus();
- for_each_online_cpu(cpu)
+ for_each_possible_cpu(cpu)
val += per_cpu(memcg->stat->events[idx], cpu);
-#ifdef CONFIG_HOTPLUG_CPU
- spin_lock(&memcg->pcp_counter_lock);
- val += memcg->nocpu_base.events[idx];
- spin_unlock(&memcg->pcp_counter_lock);
-#endif
- put_online_cpus();
return val;
}
@@ -1035,7 +1063,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
goto out_unlock;
do {
- pos = ACCESS_ONCE(iter->position);
+ pos = READ_ONCE(iter->position);
/*
* A racing update may change the position and
* put the last reference, hence css_tryget(),
@@ -1352,13 +1380,13 @@ static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
unsigned long limit;
count = page_counter_read(&memcg->memory);
- limit = ACCESS_ONCE(memcg->memory.limit);
+ limit = READ_ONCE(memcg->memory.limit);
if (count < limit)
margin = limit - count;
if (do_swap_account) {
count = page_counter_read(&memcg->memsw);
- limit = ACCESS_ONCE(memcg->memsw.limit);
+ limit = READ_ONCE(memcg->memsw.limit);
if (count <= limit)
margin = min(margin, limit - count);
}
@@ -1436,15 +1464,17 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
struct mem_cgroup *iter;
unsigned int i;
- if (!p)
- return;
-
mutex_lock(&oom_info_lock);
rcu_read_lock();
- pr_info("Task in ");
- pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id));
- pr_cont(" killed as a result of limit of ");
+ if (p) {
+ pr_info("Task in ");
+ pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id));
+ pr_cont(" killed as a result of limit of ");
+ } else {
+ pr_info("Memory limit reached of cgroup ");
+ }
+
pr_cont_cgroup_path(memcg->css.cgroup);
pr_cont("\n");
@@ -1521,17 +1551,19 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
unsigned int points = 0;
struct task_struct *chosen = NULL;
+ mutex_lock(&oom_lock);
+
/*
* If current has a pending SIGKILL or is exiting, then automatically
* select it. The goal is to allow it to allocate so that it may
* quickly exit and free its memory.
*/
if (fatal_signal_pending(current) || task_will_free_mem(current)) {
- mark_tsk_oom_victim(current);
- return;
+ mark_oom_victim(current);
+ goto unlock;
}
- check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL);
+ check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL, memcg);
totalpages = mem_cgroup_get_limit(memcg) ? : 1;
for_each_mem_cgroup_tree(iter, memcg) {
struct css_task_iter it;
@@ -1555,7 +1587,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
mem_cgroup_iter_break(memcg, iter);
if (chosen)
put_task_struct(chosen);
- return;
+ goto unlock;
case OOM_SCAN_OK:
break;
};
@@ -1576,11 +1608,13 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
css_task_iter_end(&it);
}
- if (!chosen)
- return;
- points = chosen_points * 1000 / totalpages;
- oom_kill_process(chosen, gfp_mask, order, points, totalpages, memcg,
- NULL, "Memory cgroup out of memory");
+ if (chosen) {
+ points = chosen_points * 1000 / totalpages;
+ oom_kill_process(chosen, gfp_mask, order, points, totalpages,
+ memcg, NULL, "Memory cgroup out of memory");
+ }
+unlock:
+ mutex_unlock(&oom_lock);
}
#if MAX_NUMNODES > 1
@@ -1797,8 +1831,10 @@ static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
{
struct mem_cgroup *iter;
+ spin_lock(&memcg_oom_lock);
for_each_mem_cgroup_tree(iter, memcg)
- atomic_inc(&iter->under_oom);
+ iter->under_oom++;
+ spin_unlock(&memcg_oom_lock);
}
static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
@@ -1807,11 +1843,13 @@ static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
/*
* When a new child is created while the hierarchy is under oom,
- * mem_cgroup_oom_lock() may not be called. We have to use
- * atomic_add_unless() here.
+ * mem_cgroup_oom_lock() may not be called. Watch for underflow.
*/
+ spin_lock(&memcg_oom_lock);
for_each_mem_cgroup_tree(iter, memcg)
- atomic_add_unless(&iter->under_oom, -1, 0);
+ if (iter->under_oom > 0)
+ iter->under_oom--;
+ spin_unlock(&memcg_oom_lock);
}
static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
@@ -1837,17 +1875,18 @@ static int memcg_oom_wake_function(wait_queue_t *wait,
return autoremove_wake_function(wait, mode, sync, arg);
}
-static void memcg_wakeup_oom(struct mem_cgroup *memcg)
-{
- atomic_inc(&memcg->oom_wakeups);
- /* for filtering, pass "memcg" as argument. */
- __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
-}
-
static void memcg_oom_recover(struct mem_cgroup *memcg)
{
- if (memcg && atomic_read(&memcg->under_oom))
- memcg_wakeup_oom(memcg);
+ /*
+ * For the following lockless ->under_oom test, the only required
+ * guarantee is that it must see the state asserted by an OOM when
+ * this function is called as a result of userland actions
+ * triggered by the notification of the OOM. This is trivially
+ * achieved by invoking mem_cgroup_mark_under_oom() before
+ * triggering notification.
+ */
+ if (memcg && memcg->under_oom)
+ __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
}
static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
@@ -2002,6 +2041,7 @@ again:
return memcg;
}
+EXPORT_SYMBOL(mem_cgroup_begin_page_stat);
/**
* mem_cgroup_end_page_stat - finish a page state statistics transaction
@@ -2020,6 +2060,7 @@ void mem_cgroup_end_page_stat(struct mem_cgroup *memcg)
rcu_read_unlock();
}
+EXPORT_SYMBOL(mem_cgroup_end_page_stat);
/**
* mem_cgroup_update_page_stat - update page state statistics
@@ -2160,37 +2201,12 @@ static void drain_all_stock(struct mem_cgroup *root_memcg)
mutex_unlock(&percpu_charge_mutex);
}
-/*
- * This function drains percpu counter value from DEAD cpu and
- * move it to local cpu. Note that this function can be preempted.
- */
-static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu)
-{
- int i;
-
- spin_lock(&memcg->pcp_counter_lock);
- for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
- long x = per_cpu(memcg->stat->count[i], cpu);
-
- per_cpu(memcg->stat->count[i], cpu) = 0;
- memcg->nocpu_base.count[i] += x;
- }
- for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
- unsigned long x = per_cpu(memcg->stat->events[i], cpu);
-
- per_cpu(memcg->stat->events[i], cpu) = 0;
- memcg->nocpu_base.events[i] += x;
- }
- spin_unlock(&memcg->pcp_counter_lock);
-}
-
static int memcg_cpu_hotplug_callback(struct notifier_block *nb,
unsigned long action,
void *hcpu)
{
int cpu = (unsigned long)hcpu;
struct memcg_stock_pcp *stock;
- struct mem_cgroup *iter;
if (action == CPU_ONLINE)
return NOTIFY_OK;
@@ -2198,9 +2214,6 @@ static int memcg_cpu_hotplug_callback(struct notifier_block *nb,
if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
return NOTIFY_OK;
- for_each_mem_cgroup(iter)
- mem_cgroup_drain_pcp_counter(iter, cpu);
-
stock = &per_cpu(memcg_stock, cpu);
drain_stock(stock);
return NOTIFY_OK;
@@ -2314,6 +2327,8 @@ done_restock:
css_get_many(&memcg->css, batch);
if (batch > nr_pages)
refill_stock(memcg, batch - nr_pages);
+ if (!(gfp_mask & __GFP_WAIT))
+ goto done;
/*
* If the hierarchy is above the normal consumption range,
* make the charging task trim their excess contribution.
@@ -2341,20 +2356,6 @@ static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
}
/*
- * A helper function to get mem_cgroup from ID. must be called under
- * rcu_read_lock(). The caller is responsible for calling
- * css_tryget_online() if the mem_cgroup is used for charging. (dropping
- * refcnt from swap can be called against removed memcg.)
- */
-static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
-{
- /* ID 0 is unused ID */
- if (!id)
- return NULL;
- return mem_cgroup_from_id(id);
-}
-
-/*
* try_get_mem_cgroup_from_page - look up page's memcg association
* @page: the page
*
@@ -2380,7 +2381,7 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
ent.val = page_private(page);
id = lookup_swap_cgroup_id(ent);
rcu_read_lock();
- memcg = mem_cgroup_lookup(id);
+ memcg = mem_cgroup_from_id(id);
if (memcg && !css_tryget_online(&memcg->css))
memcg = NULL;
rcu_read_unlock();
@@ -2642,7 +2643,7 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep)
return cachep;
memcg = get_mem_cgroup_from_mm(current->mm);
- kmemcg_id = ACCESS_ONCE(memcg->kmemcg_id);
+ kmemcg_id = READ_ONCE(memcg->kmemcg_id);
if (kmemcg_id < 0)
goto out;
@@ -2779,92 +2780,6 @@ void mem_cgroup_split_huge_fixup(struct page *head)
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
-/**
- * mem_cgroup_move_account - move account of the page
- * @page: the page
- * @nr_pages: number of regular pages (>1 for huge pages)
- * @from: mem_cgroup which the page is moved from.
- * @to: mem_cgroup which the page is moved to. @from != @to.
- *
- * The caller must confirm following.
- * - page is not on LRU (isolate_page() is useful.)
- * - compound_lock is held when nr_pages > 1
- *
- * This function doesn't do "charge" to new cgroup and doesn't do "uncharge"
- * from old cgroup.
- */
-static int mem_cgroup_move_account(struct page *page,
- unsigned int nr_pages,
- struct mem_cgroup *from,
- struct mem_cgroup *to)
-{
- unsigned long flags;
- int ret;
-
- VM_BUG_ON(from == to);
- VM_BUG_ON_PAGE(PageLRU(page), page);
- /*
- * The page is isolated from LRU. So, collapse function
- * will not handle this page. But page splitting can happen.
- * Do this check under compound_page_lock(). The caller should
- * hold it.
- */
- ret = -EBUSY;
- if (nr_pages > 1 && !PageTransHuge(page))
- goto out;
-
- /*
- * Prevent mem_cgroup_migrate() from looking at page->mem_cgroup
- * of its source page while we change it: page migration takes
- * both pages off the LRU, but page cache replacement doesn't.
- */
- if (!trylock_page(page))
- goto out;
-
- ret = -EINVAL;
- if (page->mem_cgroup != from)
- goto out_unlock;
-
- spin_lock_irqsave(&from->move_lock, flags);
-
- if (!PageAnon(page) && page_mapped(page)) {
- __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
- nr_pages);
- __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
- nr_pages);
- }
-
- if (PageWriteback(page)) {
- __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_WRITEBACK],
- nr_pages);
- __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_WRITEBACK],
- nr_pages);
- }
-
- /*
- * It is safe to change page->mem_cgroup here because the page
- * is referenced, charged, and isolated - we can't race with
- * uncharging, charging, migration, or LRU putback.
- */
-
- /* caller should have done css_get */
- page->mem_cgroup = to;
- spin_unlock_irqrestore(&from->move_lock, flags);
-
- ret = 0;
-
- local_irq_disable();
- mem_cgroup_charge_statistics(to, page, nr_pages);
- memcg_check_events(to, page);
- mem_cgroup_charge_statistics(from, page, -nr_pages);
- memcg_check_events(from, page);
- local_irq_enable();
-out_unlock:
- unlock_page(page);
-out:
- return ret;
-}
-
#ifdef CONFIG_MEMCG_SWAP
static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
bool charge)
@@ -3953,7 +3868,7 @@ static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg,
list_add(&event->list, &memcg->oom_notify);
/* already in OOM ? */
- if (atomic_read(&memcg->under_oom))
+ if (memcg->under_oom)
eventfd_signal(eventfd, 1);
spin_unlock(&memcg_oom_lock);
@@ -3982,7 +3897,7 @@ static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v)
struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf));
seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable);
- seq_printf(sf, "under_oom %d\n", (bool)atomic_read(&memcg->under_oom));
+ seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom);
return 0;
}
@@ -4084,6 +3999,98 @@ static void memcg_destroy_kmem(struct mem_cgroup *memcg)
}
#endif
+#ifdef CONFIG_CGROUP_WRITEBACK
+
+struct list_head *mem_cgroup_cgwb_list(struct mem_cgroup *memcg)
+{
+ return &memcg->cgwb_list;
+}
+
+static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
+{
+ return wb_domain_init(&memcg->cgwb_domain, gfp);
+}
+
+static void memcg_wb_domain_exit(struct mem_cgroup *memcg)
+{
+ wb_domain_exit(&memcg->cgwb_domain);
+}
+
+static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)
+{
+ wb_domain_size_changed(&memcg->cgwb_domain);
+}
+
+struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
+
+ if (!memcg->css.parent)
+ return NULL;
+
+ return &memcg->cgwb_domain;
+}
+
+/**
+ * mem_cgroup_wb_stats - retrieve writeback related stats from its memcg
+ * @wb: bdi_writeback in question
+ * @pavail: out parameter for number of available pages
+ * @pdirty: out parameter for number of dirty pages
+ * @pwriteback: out parameter for number of pages under writeback
+ *
+ * Determine the numbers of available, dirty, and writeback pages in @wb's
+ * memcg. Dirty and writeback are self-explanatory. Available is a bit
+ * more involved.
+ *
+ * A memcg's headroom is "min(max, high) - used". The available memory is
+ * calculated as the lowest headroom of itself and the ancestors plus the
+ * number of pages already being used for file pages. Note that this
+ * doesn't consider the actual amount of available memory in the system.
+ * The caller should further cap *@pavail accordingly.
+ */
+void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pavail,
+ unsigned long *pdirty, unsigned long *pwriteback)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
+ struct mem_cgroup *parent;
+ unsigned long head_room = PAGE_COUNTER_MAX;
+ unsigned long file_pages;
+
+ *pdirty = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_DIRTY);
+
+ /* this should eventually include NR_UNSTABLE_NFS */
+ *pwriteback = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_WRITEBACK);
+
+ file_pages = mem_cgroup_nr_lru_pages(memcg, (1 << LRU_INACTIVE_FILE) |
+ (1 << LRU_ACTIVE_FILE));
+ while ((parent = parent_mem_cgroup(memcg))) {
+ unsigned long ceiling = min(memcg->memory.limit, memcg->high);
+ unsigned long used = page_counter_read(&memcg->memory);
+
+ head_room = min(head_room, ceiling - min(ceiling, used));
+ memcg = parent;
+ }
+
+ *pavail = file_pages + head_room;
+}
+
+#else /* CONFIG_CGROUP_WRITEBACK */
+
+static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
+{
+ return 0;
+}
+
+static void memcg_wb_domain_exit(struct mem_cgroup *memcg)
+{
+}
+
+static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)
+{
+}
+
+#endif /* CONFIG_CGROUP_WRITEBACK */
+
/*
* DO NOT USE IN NEW FILES.
*
@@ -4468,9 +4475,15 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
if (!memcg->stat)
goto out_free;
+
+ if (memcg_wb_domain_init(memcg, GFP_KERNEL))
+ goto out_free_stat;
+
spin_lock_init(&memcg->pcp_counter_lock);
return memcg;
+out_free_stat:
+ free_percpu(memcg->stat);
out_free:
kfree(memcg);
return NULL;
@@ -4497,6 +4510,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
free_mem_cgroup_per_zone_info(memcg, node);
free_percpu(memcg->stat);
+ memcg_wb_domain_exit(memcg);
kfree(memcg);
}
@@ -4529,6 +4543,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
/* root ? */
if (parent_css == NULL) {
root_mem_cgroup = memcg;
+ mem_cgroup_root_css = &memcg->css;
page_counter_init(&memcg->memory, NULL);
memcg->high = PAGE_COUNTER_MAX;
memcg->soft_limit = PAGE_COUNTER_MAX;
@@ -4547,7 +4562,9 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
#ifdef CONFIG_MEMCG_KMEM
memcg->kmemcg_id = -1;
#endif
-
+#ifdef CONFIG_CGROUP_WRITEBACK
+ INIT_LIST_HEAD(&memcg->cgwb_list);
+#endif
return &memcg->css;
free_out:
@@ -4635,6 +4652,8 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
vmpressure_cleanup(&memcg->vmpressure);
memcg_deactivate_kmem(memcg);
+
+ wb_memcg_offline(memcg);
}
static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
@@ -4668,6 +4687,7 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
memcg->low = 0;
memcg->high = PAGE_COUNTER_MAX;
memcg->soft_limit = PAGE_COUNTER_MAX;
+ memcg_wb_domain_size_changed(memcg);
}
#ifdef CONFIG_MMU
@@ -4816,6 +4836,111 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
return page;
}
+/**
+ * mem_cgroup_move_account - move account of the page
+ * @page: the page
+ * @nr_pages: number of regular pages (>1 for huge pages)
+ * @from: mem_cgroup which the page is moved from.
+ * @to: mem_cgroup which the page is moved to. @from != @to.
+ *
+ * The caller must confirm following.
+ * - page is not on LRU (isolate_page() is useful.)
+ * - compound_lock is held when nr_pages > 1
+ *
+ * This function doesn't do "charge" to new cgroup and doesn't do "uncharge"
+ * from old cgroup.
+ */
+static int mem_cgroup_move_account(struct page *page,
+ unsigned int nr_pages,
+ struct mem_cgroup *from,
+ struct mem_cgroup *to)
+{
+ unsigned long flags;
+ int ret;
+ bool anon;
+
+ VM_BUG_ON(from == to);
+ VM_BUG_ON_PAGE(PageLRU(page), page);
+ /*
+ * The page is isolated from LRU. So, collapse function
+ * will not handle this page. But page splitting can happen.
+ * Do this check under compound_page_lock(). The caller should
+ * hold it.
+ */
+ ret = -EBUSY;
+ if (nr_pages > 1 && !PageTransHuge(page))
+ goto out;
+
+ /*
+ * Prevent mem_cgroup_migrate() from looking at page->mem_cgroup
+ * of its source page while we change it: page migration takes
+ * both pages off the LRU, but page cache replacement doesn't.
+ */
+ if (!trylock_page(page))
+ goto out;
+
+ ret = -EINVAL;
+ if (page->mem_cgroup != from)
+ goto out_unlock;
+
+ anon = PageAnon(page);
+
+ spin_lock_irqsave(&from->move_lock, flags);
+
+ if (!anon && page_mapped(page)) {
+ __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
+ nr_pages);
+ __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
+ nr_pages);
+ }
+
+ /*
+ * move_lock grabbed above and caller set from->moving_account, so
+ * mem_cgroup_update_page_stat() will serialize updates to PageDirty.
+ * So mapping should be stable for dirty pages.
+ */
+ if (!anon && PageDirty(page)) {
+ struct address_space *mapping = page_mapping(page);
+
+ if (mapping_cap_account_dirty(mapping)) {
+ __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_DIRTY],
+ nr_pages);
+ __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_DIRTY],
+ nr_pages);
+ }
+ }
+
+ if (PageWriteback(page)) {
+ __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_WRITEBACK],
+ nr_pages);
+ __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_WRITEBACK],
+ nr_pages);
+ }
+
+ /*
+ * It is safe to change page->mem_cgroup here because the page
+ * is referenced, charged, and isolated - we can't race with
+ * uncharging, charging, migration, or LRU putback.
+ */
+
+ /* caller should have done css_get */
+ page->mem_cgroup = to;
+ spin_unlock_irqrestore(&from->move_lock, flags);
+
+ ret = 0;
+
+ local_irq_disable();
+ mem_cgroup_charge_statistics(to, page, nr_pages);
+ memcg_check_events(to, page);
+ mem_cgroup_charge_statistics(from, page, -nr_pages);
+ memcg_check_events(from, page);
+ local_irq_enable();
+out_unlock:
+ unlock_page(page);
+out:
+ return ret;
+}
+
static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
unsigned long addr, pte_t ptent, union mc_target *target)
{
@@ -5012,7 +5137,7 @@ static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
* tunable will only affect upcoming migrations, not the current one.
* So we need to save it, and keep it going.
*/
- move_flags = ACCESS_ONCE(memcg->move_charge_at_immigrate);
+ move_flags = READ_ONCE(memcg->move_charge_at_immigrate);
if (move_flags) {
struct mm_struct *mm;
struct mem_cgroup *from = mem_cgroup_from_task(p);
@@ -5246,7 +5371,7 @@ static u64 memory_current_read(struct cgroup_subsys_state *css,
static int memory_low_show(struct seq_file *m, void *v)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
- unsigned long low = ACCESS_ONCE(memcg->low);
+ unsigned long low = READ_ONCE(memcg->low);
if (low == PAGE_COUNTER_MAX)
seq_puts(m, "max\n");
@@ -5276,7 +5401,7 @@ static ssize_t memory_low_write(struct kernfs_open_file *of,
static int memory_high_show(struct seq_file *m, void *v)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
- unsigned long high = ACCESS_ONCE(memcg->high);
+ unsigned long high = READ_ONCE(memcg->high);
if (high == PAGE_COUNTER_MAX)
seq_puts(m, "max\n");
@@ -5300,13 +5425,14 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
memcg->high = high;
+ memcg_wb_domain_size_changed(memcg);
return nbytes;
}
static int memory_max_show(struct seq_file *m, void *v)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
- unsigned long max = ACCESS_ONCE(memcg->memory.limit);
+ unsigned long max = READ_ONCE(memcg->memory.limit);
if (max == PAGE_COUNTER_MAX)
seq_puts(m, "max\n");
@@ -5332,6 +5458,7 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
if (err)
return err;
+ memcg_wb_domain_size_changed(memcg);
return nbytes;
}
@@ -5838,9 +5965,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
if (!mem_cgroup_is_root(memcg))
page_counter_uncharge(&memcg->memory, 1);
- /* XXX: caller holds IRQ-safe mapping->tree_lock */
- VM_BUG_ON(!irqs_disabled());
-
+ /* Caller disabled preemption with mapping->tree_lock */
mem_cgroup_charge_statistics(memcg, page, -1);
memcg_check_events(memcg, page);
}
@@ -5861,7 +5986,7 @@ void mem_cgroup_uncharge_swap(swp_entry_t entry)
id = swap_cgroup_record(entry, 0);
rcu_read_lock();
- memcg = mem_cgroup_lookup(id);
+ memcg = mem_cgroup_from_id(id);
if (memcg) {
if (!mem_cgroup_is_root(memcg))
page_counter_uncharge(&memcg->memsw, 1);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index d487f8dc6d39..ea5a93659488 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -20,6 +20,14 @@
* this code has to be extremely careful. Generally it tries to use
* normal locking rules, as in get the standard locks, even if that means
* the error handling takes potentially a long time.
+ *
+ * It can be very tempting to add handling for obscure cases here.
+ * In general any code for handling new cases should only be added iff:
+ * - You know how to test it.
+ * - You have a test that can be added to mce-test
+ * https://git.kernel.org/cgit/utils/cpu/mce/mce-test.git/
+ * - The case actually shows up as a frequent (top 10) page state in
+ * tools/vm/page-types when running a real workload.
*
* There are several operations here with exponential complexity because
* of unsuitable VM data structures. For example the operation to map back
@@ -28,13 +36,6 @@
* are rare we hope to get away with this. This avoids impacting the core
* VM.
*/
-
-/*
- * Notebook:
- * - hugetlb needs more code
- * - kcore/oldmem/vmcore/mem/kmem check for hwpoison pages
- * - pass bad pages to kdump next kernel
- */
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/page-flags.h>
@@ -56,6 +57,7 @@
#include <linux/mm_inline.h>
#include <linux/kfifo.h>
#include "internal.h"
+#include "ras/ras_event.h"
int sysctl_memory_failure_early_kill __read_mostly = 0;
@@ -503,22 +505,34 @@ static void collect_procs(struct page *page, struct list_head *tokill,
kfree(tk);
}
-/*
- * Error handlers for various types of pages.
- */
-
-enum outcome {
- IGNORED, /* Error: cannot be handled */
- FAILED, /* Error: handling failed */
- DELAYED, /* Will be handled later */
- RECOVERED, /* Successfully recovered */
+static const char *action_name[] = {
+ [MF_IGNORED] = "Ignored",
+ [MF_FAILED] = "Failed",
+ [MF_DELAYED] = "Delayed",
+ [MF_RECOVERED] = "Recovered",
};
-static const char *action_name[] = {
- [IGNORED] = "Ignored",
- [FAILED] = "Failed",
- [DELAYED] = "Delayed",
- [RECOVERED] = "Recovered",
+static const char * const action_page_types[] = {
+ [MF_MSG_KERNEL] = "reserved kernel page",
+ [MF_MSG_KERNEL_HIGH_ORDER] = "high-order kernel page",
+ [MF_MSG_SLAB] = "kernel slab page",
+ [MF_MSG_DIFFERENT_COMPOUND] = "different compound page after locking",
+ [MF_MSG_POISONED_HUGE] = "huge page already hardware poisoned",
+ [MF_MSG_HUGE] = "huge page",
+ [MF_MSG_FREE_HUGE] = "free huge page",
+ [MF_MSG_UNMAP_FAILED] = "unmapping failed page",
+ [MF_MSG_DIRTY_SWAPCACHE] = "dirty swapcache page",
+ [MF_MSG_CLEAN_SWAPCACHE] = "clean swapcache page",
+ [MF_MSG_DIRTY_MLOCKED_LRU] = "dirty mlocked LRU page",
+ [MF_MSG_CLEAN_MLOCKED_LRU] = "clean mlocked LRU page",
+ [MF_MSG_DIRTY_UNEVICTABLE_LRU] = "dirty unevictable LRU page",
+ [MF_MSG_CLEAN_UNEVICTABLE_LRU] = "clean unevictable LRU page",
+ [MF_MSG_DIRTY_LRU] = "dirty LRU page",
+ [MF_MSG_CLEAN_LRU] = "clean LRU page",
+ [MF_MSG_TRUNCATED_LRU] = "already truncated LRU page",
+ [MF_MSG_BUDDY] = "free buddy page",
+ [MF_MSG_BUDDY_2ND] = "free buddy page (2nd try)",
+ [MF_MSG_UNKNOWN] = "unknown page",
};
/*
@@ -552,7 +566,7 @@ static int delete_from_lru_cache(struct page *p)
*/
static int me_kernel(struct page *p, unsigned long pfn)
{
- return IGNORED;
+ return MF_IGNORED;
}
/*
@@ -561,7 +575,7 @@ static int me_kernel(struct page *p, unsigned long pfn)
static int me_unknown(struct page *p, unsigned long pfn)
{
printk(KERN_ERR "MCE %#lx: Unknown page state\n", pfn);
- return FAILED;
+ return MF_FAILED;
}
/*
@@ -570,7 +584,7 @@ static int me_unknown(struct page *p, unsigned long pfn)
static int me_pagecache_clean(struct page *p, unsigned long pfn)
{
int err;
- int ret = FAILED;
+ int ret = MF_FAILED;
struct address_space *mapping;
delete_from_lru_cache(p);
@@ -580,7 +594,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
* should be the one m_f() holds.
*/
if (PageAnon(p))
- return RECOVERED;
+ return MF_RECOVERED;
/*
* Now truncate the page in the page cache. This is really
@@ -594,7 +608,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
/*
* Page has been teared down in the meanwhile
*/
- return FAILED;
+ return MF_FAILED;
}
/*
@@ -611,7 +625,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
!try_to_release_page(p, GFP_NOIO)) {
pr_info("MCE %#lx: failed to release buffers\n", pfn);
} else {
- ret = RECOVERED;
+ ret = MF_RECOVERED;
}
} else {
/*
@@ -619,7 +633,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
* This fails on dirty or anything with private pages
*/
if (invalidate_inode_page(p))
- ret = RECOVERED;
+ ret = MF_RECOVERED;
else
printk(KERN_INFO "MCE %#lx: Failed to invalidate\n",
pfn);
@@ -705,9 +719,9 @@ static int me_swapcache_dirty(struct page *p, unsigned long pfn)
ClearPageUptodate(p);
if (!delete_from_lru_cache(p))
- return DELAYED;
+ return MF_DELAYED;
else
- return FAILED;
+ return MF_FAILED;
}
static int me_swapcache_clean(struct page *p, unsigned long pfn)
@@ -715,9 +729,9 @@ static int me_swapcache_clean(struct page *p, unsigned long pfn)
delete_from_swap_cache(p);
if (!delete_from_lru_cache(p))
- return RECOVERED;
+ return MF_RECOVERED;
else
- return FAILED;
+ return MF_FAILED;
}
/*
@@ -730,6 +744,10 @@ static int me_huge_page(struct page *p, unsigned long pfn)
{
int res = 0;
struct page *hpage = compound_head(p);
+
+ if (!PageHuge(hpage))
+ return MF_DELAYED;
+
/*
* We can safely recover from error on free or reserved (i.e.
* not in-use) hugepage by dequeuing it from freelist.
@@ -743,9 +761,9 @@ static int me_huge_page(struct page *p, unsigned long pfn)
if (!(page_mapping(hpage) || PageAnon(hpage))) {
res = dequeue_hwpoisoned_huge_page(hpage);
if (!res)
- return RECOVERED;
+ return MF_RECOVERED;
}
- return DELAYED;
+ return MF_DELAYED;
}
/*
@@ -777,10 +795,10 @@ static int me_huge_page(struct page *p, unsigned long pfn)
static struct page_state {
unsigned long mask;
unsigned long res;
- char *msg;
+ enum mf_action_page_type type;
int (*action)(struct page *p, unsigned long pfn);
} error_states[] = {
- { reserved, reserved, "reserved kernel", me_kernel },
+ { reserved, reserved, MF_MSG_KERNEL, me_kernel },
/*
* free pages are specially detected outside this table:
* PG_buddy pages only make a small fraction of all free pages.
@@ -791,31 +809,31 @@ static struct page_state {
* currently unused objects without touching them. But just
* treat it as standard kernel for now.
*/
- { slab, slab, "kernel slab", me_kernel },
+ { slab, slab, MF_MSG_SLAB, me_kernel },
#ifdef CONFIG_PAGEFLAGS_EXTENDED
- { head, head, "huge", me_huge_page },
- { tail, tail, "huge", me_huge_page },
+ { head, head, MF_MSG_HUGE, me_huge_page },
+ { tail, tail, MF_MSG_HUGE, me_huge_page },
#else
- { compound, compound, "huge", me_huge_page },
+ { compound, compound, MF_MSG_HUGE, me_huge_page },
#endif
- { sc|dirty, sc|dirty, "dirty swapcache", me_swapcache_dirty },
- { sc|dirty, sc, "clean swapcache", me_swapcache_clean },
+ { sc|dirty, sc|dirty, MF_MSG_DIRTY_SWAPCACHE, me_swapcache_dirty },
+ { sc|dirty, sc, MF_MSG_CLEAN_SWAPCACHE, me_swapcache_clean },
- { mlock|dirty, mlock|dirty, "dirty mlocked LRU", me_pagecache_dirty },
- { mlock|dirty, mlock, "clean mlocked LRU", me_pagecache_clean },
+ { mlock|dirty, mlock|dirty, MF_MSG_DIRTY_MLOCKED_LRU, me_pagecache_dirty },
+ { mlock|dirty, mlock, MF_MSG_CLEAN_MLOCKED_LRU, me_pagecache_clean },
- { unevict|dirty, unevict|dirty, "dirty unevictable LRU", me_pagecache_dirty },
- { unevict|dirty, unevict, "clean unevictable LRU", me_pagecache_clean },
+ { unevict|dirty, unevict|dirty, MF_MSG_DIRTY_UNEVICTABLE_LRU, me_pagecache_dirty },
+ { unevict|dirty, unevict, MF_MSG_CLEAN_UNEVICTABLE_LRU, me_pagecache_clean },
- { lru|dirty, lru|dirty, "dirty LRU", me_pagecache_dirty },
- { lru|dirty, lru, "clean LRU", me_pagecache_clean },
+ { lru|dirty, lru|dirty, MF_MSG_DIRTY_LRU, me_pagecache_dirty },
+ { lru|dirty, lru, MF_MSG_CLEAN_LRU, me_pagecache_clean },
/*
* Catchall entry: must be at end.
*/
- { 0, 0, "unknown page state", me_unknown },
+ { 0, 0, MF_MSG_UNKNOWN, me_unknown },
};
#undef dirty
@@ -835,10 +853,13 @@ static struct page_state {
* "Dirty/Clean" indication is not 100% accurate due to the possibility of
* setting PG_dirty outside page lock. See also comment above set_page_dirty().
*/
-static void action_result(unsigned long pfn, char *msg, int result)
+static void action_result(unsigned long pfn, enum mf_action_page_type type,
+ enum mf_result result)
{
- pr_err("MCE %#lx: %s page recovery: %s\n",
- pfn, msg, action_name[result]);
+ trace_memory_failure_event(pfn, type, result);
+
+ pr_err("MCE %#lx: recovery action for %s: %s\n",
+ pfn, action_page_types[type], action_name[result]);
}
static int page_action(struct page_state *ps, struct page *p,
@@ -850,23 +871,68 @@ static int page_action(struct page_state *ps, struct page *p,
result = ps->action(p, pfn);
count = page_count(p) - 1;
- if (ps->action == me_swapcache_dirty && result == DELAYED)
+ if (ps->action == me_swapcache_dirty && result == MF_DELAYED)
count--;
if (count != 0) {
printk(KERN_ERR
- "MCE %#lx: %s page still referenced by %d users\n",
- pfn, ps->msg, count);
- result = FAILED;
+ "MCE %#lx: %s still referenced by %d users\n",
+ pfn, action_page_types[ps->type], count);
+ result = MF_FAILED;
}
- action_result(pfn, ps->msg, result);
+ action_result(pfn, ps->type, result);
/* Could do more checks here if page looks ok */
/*
* Could adjust zone counters here to correct for the missing page.
*/
- return (result == RECOVERED || result == DELAYED) ? 0 : -EBUSY;
+ return (result == MF_RECOVERED || result == MF_DELAYED) ? 0 : -EBUSY;
+}
+
+/**
+ * get_hwpoison_page() - Get refcount for memory error handling:
+ * @page: raw error page (hit by memory error)
+ *
+ * Return: return 0 if failed to grab the refcount, otherwise true (some
+ * non-zero value.)
+ */
+int get_hwpoison_page(struct page *page)
+{
+ struct page *head = compound_head(page);
+
+ if (PageHuge(head))
+ return get_page_unless_zero(head);
+
+ /*
+ * Thp tail page has special refcounting rule (refcount of tail pages
+ * is stored in ->_mapcount,) so we can't call get_page_unless_zero()
+ * directly for tail pages.
+ */
+ if (PageTransHuge(head)) {
+ /*
+ * Non anonymous thp exists only in allocation/free time. We
+ * can't handle such a case correctly, so let's give it up.
+ * This should be better than triggering BUG_ON when kernel
+ * tries to touch the "partially handled" page.
+ */
+ if (!PageAnon(head)) {
+ pr_err("MCE: %#lx: non anonymous thp\n",
+ page_to_pfn(page));
+ return 0;
+ }
+
+ if (get_page_unless_zero(head)) {
+ if (PageTail(page))
+ get_page(page);
+ return 1;
+ } else {
+ return 0;
+ }
+ }
+
+ return get_page_unless_zero(page);
}
+EXPORT_SYMBOL_GPL(get_hwpoison_page);
/*
* Do all that is necessary to remove user space mappings. Unmap
@@ -881,7 +947,6 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
int ret;
int kill = 1, forcekill;
struct page *hpage = *hpagep;
- struct page *ppage;
/*
* Here we are interested only in user-mapped pages, so skip any
@@ -931,59 +996,6 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
}
/*
- * ppage: poisoned page
- * if p is regular page(4k page)
- * ppage == real poisoned page;
- * else p is hugetlb or THP, ppage == head page.
- */
- ppage = hpage;
-
- if (PageTransHuge(hpage)) {
- /*
- * Verify that this isn't a hugetlbfs head page, the check for
- * PageAnon is just for avoid tripping a split_huge_page
- * internal debug check, as split_huge_page refuses to deal with
- * anything that isn't an anon page. PageAnon can't go away fro
- * under us because we hold a refcount on the hpage, without a
- * refcount on the hpage. split_huge_page can't be safely called
- * in the first place, having a refcount on the tail isn't
- * enough * to be safe.
- */
- if (!PageHuge(hpage) && PageAnon(hpage)) {
- if (unlikely(split_huge_page(hpage))) {
- /*
- * FIXME: if splitting THP is failed, it is
- * better to stop the following operation rather
- * than causing panic by unmapping. System might
- * survive if the page is freed later.
- */
- printk(KERN_INFO
- "MCE %#lx: failed to split THP\n", pfn);
-
- BUG_ON(!PageHWPoison(p));
- return SWAP_FAIL;
- }
- /*
- * We pinned the head page for hwpoison handling,
- * now we split the thp and we are interested in
- * the hwpoisoned raw page, so move the refcount
- * to it. Similarly, page lock is shifted.
- */
- if (hpage != p) {
- if (!(flags & MF_COUNT_INCREASED)) {
- put_page(hpage);
- get_page(p);
- }
- lock_page(p);
- unlock_page(hpage);
- *hpagep = p;
- }
- /* THP is split, so ppage should be the real poisoned page. */
- ppage = p;
- }
- }
-
- /*
* First collect all the processes that have the page
* mapped in dirty form. This has to be done before try_to_unmap,
* because ttu takes the rmap data structures down.
@@ -992,12 +1004,12 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
* there's nothing that can be done.
*/
if (kill)
- collect_procs(ppage, &tokill, flags & MF_ACTION_REQUIRED);
+ collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED);
- ret = try_to_unmap(ppage, ttu);
+ ret = try_to_unmap(hpage, ttu);
if (ret != SWAP_SUCCESS)
printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n",
- pfn, page_mapcount(ppage));
+ pfn, page_mapcount(hpage));
/*
* Now that the dirty bit has been propagated to the
@@ -1009,7 +1021,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
* use a more force-full uncatchable kill to prevent
* any accesses to the poisoned memory.
*/
- forcekill = PageDirty(ppage) || (flags & MF_MUST_KILL);
+ forcekill = PageDirty(hpage) || (flags & MF_MUST_KILL);
kill_procs(&tokill, forcekill, trapno,
ret != SWAP_SUCCESS, p, pfn, flags);
@@ -1055,6 +1067,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
struct page_state *ps;
struct page *p;
struct page *hpage;
+ struct page *orig_head;
int res;
unsigned int nr_pages;
unsigned long page_flags;
@@ -1070,7 +1083,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
}
p = pfn_to_page(pfn);
- hpage = compound_head(p);
+ orig_head = hpage = compound_head(p);
if (TestSetPageHWPoison(p)) {
printk(KERN_ERR "MCE %#lx: already hardware poisoned\n", pfn);
return 0;
@@ -1103,10 +1116,9 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
* In fact it's dangerous to directly bump up page count from 0,
* that may make page_freeze_refs()/page_unfreeze_refs() mismatch.
*/
- if (!(flags & MF_COUNT_INCREASED) &&
- !get_page_unless_zero(hpage)) {
+ if (!(flags & MF_COUNT_INCREASED) && !get_hwpoison_page(p)) {
if (is_free_buddy_page(p)) {
- action_result(pfn, "free buddy", DELAYED);
+ action_result(pfn, MF_MSG_BUDDY, MF_DELAYED);
return 0;
} else if (PageHuge(hpage)) {
/*
@@ -1123,16 +1135,30 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
}
set_page_hwpoison_huge_page(hpage);
res = dequeue_hwpoisoned_huge_page(hpage);
- action_result(pfn, "free huge",
- res ? IGNORED : DELAYED);
+ action_result(pfn, MF_MSG_FREE_HUGE,
+ res ? MF_IGNORED : MF_DELAYED);
unlock_page(hpage);
return res;
} else {
- action_result(pfn, "high order kernel", IGNORED);
+ action_result(pfn, MF_MSG_KERNEL_HIGH_ORDER, MF_IGNORED);
return -EBUSY;
}
}
+ if (!PageHuge(p) && PageTransHuge(hpage)) {
+ if (unlikely(split_huge_page(hpage))) {
+ pr_err("MCE: %#lx: thp split failed\n", pfn);
+ if (TestClearPageHWPoison(p))
+ atomic_long_sub(nr_pages, &num_poisoned_pages);
+ put_page(p);
+ if (p != hpage)
+ put_page(hpage);
+ return -EBUSY;
+ }
+ VM_BUG_ON_PAGE(!page_count(p), p);
+ hpage = compound_head(p);
+ }
+
/*
* We ignore non-LRU pages for good reasons.
* - PG_locked is only well defined for LRU pages and a few others
@@ -1141,7 +1167,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
* The check (unnecessarily) ignores LRU pages being isolated and
* walked by the page reclaim code, however that's not a big loss.
*/
- if (!PageHuge(p) && !PageTransTail(p)) {
+ if (!PageHuge(p)) {
if (!PageLRU(p))
shake_page(p, 0);
if (!PageLRU(p)) {
@@ -1150,9 +1176,10 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
*/
if (is_free_buddy_page(p)) {
if (flags & MF_COUNT_INCREASED)
- action_result(pfn, "free buddy", DELAYED);
+ action_result(pfn, MF_MSG_BUDDY, MF_DELAYED);
else
- action_result(pfn, "free buddy, 2nd try", DELAYED);
+ action_result(pfn, MF_MSG_BUDDY_2ND,
+ MF_DELAYED);
return 0;
}
}
@@ -1164,8 +1191,8 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
* The page could have changed compound pages during the locking.
* If this happens just bail out.
*/
- if (compound_head(p) != hpage) {
- action_result(pfn, "different compound page after locking", IGNORED);
+ if (PageCompound(p) && compound_head(p) != orig_head) {
+ action_result(pfn, MF_MSG_DIFFERENT_COMPOUND, MF_IGNORED);
res = -EBUSY;
goto out;
}
@@ -1185,9 +1212,9 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
if (!PageHWPoison(p)) {
printk(KERN_ERR "MCE %#lx: just unpoisoned\n", pfn);
atomic_long_sub(nr_pages, &num_poisoned_pages);
+ unlock_page(hpage);
put_page(hpage);
- res = 0;
- goto out;
+ return 0;
}
if (hwpoison_filter(p)) {
if (TestClearPageHWPoison(p))
@@ -1205,8 +1232,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
* on the head page to show that the hugepage is hwpoisoned
*/
if (PageHuge(p) && PageTail(p) && TestSetPageHWPoison(hpage)) {
- action_result(pfn, "hugepage already hardware poisoned",
- IGNORED);
+ action_result(pfn, MF_MSG_POISONED_HUGE, MF_IGNORED);
unlock_page(hpage);
put_page(hpage);
return 0;
@@ -1235,7 +1261,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
*/
if (hwpoison_user_mappings(p, pfn, trapno, flags, &hpage)
!= SWAP_SUCCESS) {
- action_result(pfn, "unmapping failed", IGNORED);
+ action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED);
res = -EBUSY;
goto out;
}
@@ -1244,7 +1270,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
* Torn down by someone else?
*/
if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) {
- action_result(pfn, "already truncated LRU", IGNORED);
+ action_result(pfn, MF_MSG_TRUNCATED_LRU, MF_IGNORED);
res = -EBUSY;
goto out;
}
@@ -1404,12 +1430,12 @@ int unpoison_memory(unsigned long pfn)
*/
if (!PageHuge(page) && PageTransHuge(page)) {
pr_info("MCE: Memory failure is now running on %#lx\n", pfn);
- return 0;
+ return 0;
}
nr_pages = 1 << compound_order(page);
- if (!get_page_unless_zero(page)) {
+ if (!get_hwpoison_page(p)) {
/*
* Since HWPoisoned hugepage should have non-zero refcount,
* race between memory failure and unpoison seems to happen.
@@ -1477,7 +1503,7 @@ static int __get_any_page(struct page *p, unsigned long pfn, int flags)
* When the target page is a free hugepage, just remove it
* from free hugepage list.
*/
- if (!get_page_unless_zero(compound_head(p))) {
+ if (!get_hwpoison_page(p)) {
if (PageHuge(p)) {
pr_info("%s: %#lx free huge page\n", __func__, pfn);
ret = 0;
@@ -1540,8 +1566,18 @@ static int soft_offline_huge_page(struct page *page, int flags)
}
unlock_page(hpage);
- /* Keep page count to indicate a given hugepage is isolated. */
- list_move(&hpage->lru, &pagelist);
+ ret = isolate_huge_page(hpage, &pagelist);
+ if (ret) {
+ /*
+ * get_any_page() and isolate_huge_page() takes a refcount each,
+ * so need to drop one here.
+ */
+ put_page(hpage);
+ } else {
+ pr_info("soft offline: %#lx hugepage failed to isolate\n", pfn);
+ return -EBUSY;
+ }
+
ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
MIGRATE_SYNC, MR_MEMORY_FAILURE);
if (ret) {
@@ -1623,6 +1659,8 @@ static int __soft_offline_page(struct page *page, int flags)
inc_zone_page_state(page, NR_ISOLATED_ANON +
page_is_file_cache(page));
list_add(&page->lru, &pagelist);
+ if (!TestSetPageHWPoison(page))
+ atomic_long_inc(&num_poisoned_pages);
ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
MIGRATE_SYNC, MR_MEMORY_FAILURE);
if (ret) {
@@ -1637,22 +1675,8 @@ static int __soft_offline_page(struct page *page, int flags)
pfn, ret, page->flags);
if (ret > 0)
ret = -EIO;
- } else {
- /*
- * After page migration succeeds, the source page can
- * be trapped in pagevec and actual freeing is delayed.
- * Freeing code works differently based on PG_hwpoison,
- * so there's a race. We need to make sure that the
- * source page should be freed back to buddy before
- * setting PG_hwpoison.
- */
- if (!is_free_buddy_page(page))
- drain_all_pages(page_zone(page));
- SetPageHWPoison(page);
- if (!is_free_buddy_page(page))
- pr_info("soft offline: %#lx: page leaked\n",
- pfn);
- atomic_long_inc(&num_poisoned_pages);
+ if (TestClearPageHWPoison(page))
+ atomic_long_dec(&num_poisoned_pages);
}
} else {
pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n",
@@ -1703,14 +1727,6 @@ int soft_offline_page(struct page *page, int flags)
get_online_mems();
- /*
- * Isolate the page, so that it doesn't get reallocated if it
- * was free. This flag should be kept set until the source page
- * is freed and PG_hwpoison on it is set.
- */
- if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE)
- set_migratetype_isolate(page, true);
-
ret = get_any_page(page, pfn, flags);
put_online_mems();
if (ret > 0) { /* for in-use pages */
@@ -1721,14 +1737,13 @@ int soft_offline_page(struct page *page, int flags)
} else if (ret == 0) { /* for free pages */
if (PageHuge(page)) {
set_page_hwpoison_huge_page(hpage);
- dequeue_hwpoisoned_huge_page(hpage);
- atomic_long_add(1 << compound_order(hpage),
+ if (!dequeue_hwpoisoned_huge_page(hpage))
+ atomic_long_add(1 << compound_order(hpage),
&num_poisoned_pages);
} else {
- SetPageHWPoison(page);
- atomic_long_inc(&num_poisoned_pages);
+ if (!TestSetPageHWPoison(page))
+ atomic_long_inc(&num_poisoned_pages);
}
}
- unset_migratetype_isolate(page, MIGRATE_MOVABLE);
return ret;
}
diff --git a/mm/memory.c b/mm/memory.c
index 97839f5c8c30..388dcf9aa283 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -690,12 +690,11 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
/*
* Choose text because data symbols depend on CONFIG_KALLSYMS_ALL=y
*/
- if (vma->vm_ops)
- printk(KERN_ALERT "vma->vm_ops->fault: %pSR\n",
- vma->vm_ops->fault);
- if (vma->vm_file)
- printk(KERN_ALERT "vma->vm_file->f_op->mmap: %pSR\n",
- vma->vm_file->f_op->mmap);
+ pr_alert("file:%pD fault:%pf mmap:%pf readpage:%pf\n",
+ vma->vm_file,
+ vma->vm_ops ? vma->vm_ops->fault : NULL,
+ vma->vm_file ? vma->vm_file->f_op->mmap : NULL,
+ mapping ? mapping->a_ops->readpage : NULL);
dump_stack();
add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
}
@@ -1983,167 +1982,91 @@ static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page,
}
/*
- * This routine handles present pages, when users try to write
- * to a shared page. It is done by copying the page to a new address
- * and decrementing the shared-page counter for the old page.
- *
- * Note that this routine assumes that the protection checks have been
- * done by the caller (the low-level page fault routine in most cases).
- * Thus we can safely just mark it writable once we've done any necessary
- * COW.
+ * Handle write page faults for pages that can be reused in the current vma
*
- * We also mark the page dirty at this point even though the page will
- * change only once the write actually happens. This avoids a few races,
- * and potentially makes it more efficient.
- *
- * We enter with non-exclusive mmap_sem (to exclude vma changes,
- * but allow concurrent faults), with pte both mapped and locked.
- * We return with mmap_sem still held, but pte unmapped and unlocked.
+ * This can happen either due to the mapping being with the VM_SHARED flag,
+ * or due to us being the last reference standing to the page. In either
+ * case, all we need to do here is to mark the page as writable and update
+ * any related book-keeping.
*/
-static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long address, pte_t *page_table, pmd_t *pmd,
- spinlock_t *ptl, pte_t orig_pte)
+static inline int wp_page_reuse(struct mm_struct *mm,
+ struct vm_area_struct *vma, unsigned long address,
+ pte_t *page_table, spinlock_t *ptl, pte_t orig_pte,
+ struct page *page, int page_mkwrite,
+ int dirty_shared)
__releases(ptl)
{
- struct page *old_page, *new_page = NULL;
pte_t entry;
- int ret = 0;
- int page_mkwrite = 0;
- bool dirty_shared = false;
- unsigned long mmun_start = 0; /* For mmu_notifiers */
- unsigned long mmun_end = 0; /* For mmu_notifiers */
- struct mem_cgroup *memcg;
-
- old_page = vm_normal_page(vma, address, orig_pte);
- if (!old_page) {
- /*
- * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a
- * VM_PFNMAP VMA.
- *
- * We should not cow pages in a shared writeable mapping.
- * Just mark the pages writable as we can't do any dirty
- * accounting on raw pfn maps.
- */
- if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
- (VM_WRITE|VM_SHARED))
- goto reuse;
- goto gotten;
- }
-
/*
- * Take out anonymous pages first, anonymous shared vmas are
- * not dirty accountable.
+ * Clear the pages cpupid information as the existing
+ * information potentially belongs to a now completely
+ * unrelated process.
*/
- if (PageAnon(old_page) && !PageKsm(old_page)) {
- if (!trylock_page(old_page)) {
- page_cache_get(old_page);
- pte_unmap_unlock(page_table, ptl);
- lock_page(old_page);
- page_table = pte_offset_map_lock(mm, pmd, address,
- &ptl);
- if (!pte_same(*page_table, orig_pte)) {
- unlock_page(old_page);
- goto unlock;
- }
- page_cache_release(old_page);
- }
- if (reuse_swap_page(old_page)) {
- /*
- * The page is all ours. Move it to our anon_vma so
- * the rmap code will not search our parent or siblings.
- * Protected against the rmap code by the page lock.
- */
- page_move_anon_rmap(old_page, vma, address);
- unlock_page(old_page);
- goto reuse;
- }
- unlock_page(old_page);
- } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
- (VM_WRITE|VM_SHARED))) {
- page_cache_get(old_page);
- /*
- * Only catch write-faults on shared writable pages,
- * read-only shared pages can get COWed by
- * get_user_pages(.write=1, .force=1).
- */
- if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
- int tmp;
-
- pte_unmap_unlock(page_table, ptl);
- tmp = do_page_mkwrite(vma, old_page, address);
- if (unlikely(!tmp || (tmp &
- (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
- page_cache_release(old_page);
- return tmp;
- }
- /*
- * Since we dropped the lock we need to revalidate
- * the PTE as someone else may have changed it. If
- * they did, we just return, as we can count on the
- * MMU to tell us if they didn't also make it writable.
- */
- page_table = pte_offset_map_lock(mm, pmd, address,
- &ptl);
- if (!pte_same(*page_table, orig_pte)) {
- unlock_page(old_page);
- goto unlock;
- }
- page_mkwrite = 1;
- }
-
- dirty_shared = true;
-
-reuse:
- /*
- * Clear the pages cpupid information as the existing
- * information potentially belongs to a now completely
- * unrelated process.
- */
- if (old_page)
- page_cpupid_xchg_last(old_page, (1 << LAST_CPUPID_SHIFT) - 1);
-
- flush_cache_page(vma, address, pte_pfn(orig_pte));
- entry = pte_mkyoung(orig_pte);
- entry = maybe_mkwrite(pte_mkdirty(entry), vma);
- if (ptep_set_access_flags(vma, address, page_table, entry,1))
- update_mmu_cache(vma, address, page_table);
- pte_unmap_unlock(page_table, ptl);
- ret |= VM_FAULT_WRITE;
+ if (page)
+ page_cpupid_xchg_last(page, (1 << LAST_CPUPID_SHIFT) - 1);
- if (dirty_shared) {
- struct address_space *mapping;
- int dirtied;
+ flush_cache_page(vma, address, pte_pfn(orig_pte));
+ entry = pte_mkyoung(orig_pte);
+ entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+ if (ptep_set_access_flags(vma, address, page_table, entry, 1))
+ update_mmu_cache(vma, address, page_table);
+ pte_unmap_unlock(page_table, ptl);
- if (!page_mkwrite)
- lock_page(old_page);
+ if (dirty_shared) {
+ struct address_space *mapping;
+ int dirtied;
- dirtied = set_page_dirty(old_page);
- VM_BUG_ON_PAGE(PageAnon(old_page), old_page);
- mapping = old_page->mapping;
- unlock_page(old_page);
- page_cache_release(old_page);
+ if (!page_mkwrite)
+ lock_page(page);
- if ((dirtied || page_mkwrite) && mapping) {
- /*
- * Some device drivers do not set page.mapping
- * but still dirty their pages
- */
- balance_dirty_pages_ratelimited(mapping);
- }
+ dirtied = set_page_dirty(page);
+ VM_BUG_ON_PAGE(PageAnon(page), page);
+ mapping = page->mapping;
+ unlock_page(page);
+ page_cache_release(page);
- if (!page_mkwrite)
- file_update_time(vma->vm_file);
+ if ((dirtied || page_mkwrite) && mapping) {
+ /*
+ * Some device drivers do not set page.mapping
+ * but still dirty their pages
+ */
+ balance_dirty_pages_ratelimited(mapping);
}
- return ret;
+ if (!page_mkwrite)
+ file_update_time(vma->vm_file);
}
- /*
- * Ok, we need to copy. Oh, well..
- */
- page_cache_get(old_page);
-gotten:
- pte_unmap_unlock(page_table, ptl);
+ return VM_FAULT_WRITE;
+}
+
+/*
+ * Handle the case of a page which we actually need to copy to a new page.
+ *
+ * Called with mmap_sem locked and the old page referenced, but
+ * without the ptl held.
+ *
+ * High level logic flow:
+ *
+ * - Allocate a page, copy the content of the old page to the new one.
+ * - Handle book keeping and accounting - cgroups, mmu-notifiers, etc.
+ * - Take the PTL. If the pte changed, bail out and release the allocated page
+ * - If the pte is still the way we remember it, update the page table and all
+ * relevant references. This includes dropping the reference the page-table
+ * held to the old page, as well as updating the rmap.
+ * - In any case, unlock the PTL and drop the reference we took to the old page.
+ */
+static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, pte_t *page_table, pmd_t *pmd,
+ pte_t orig_pte, struct page *old_page)
+{
+ struct page *new_page = NULL;
+ spinlock_t *ptl = NULL;
+ pte_t entry;
+ int page_copied = 0;
+ const unsigned long mmun_start = address & PAGE_MASK; /* For mmu_notifiers */
+ const unsigned long mmun_end = mmun_start + PAGE_SIZE; /* For mmu_notifiers */
+ struct mem_cgroup *memcg;
if (unlikely(anon_vma_prepare(vma)))
goto oom;
@@ -2158,13 +2081,12 @@ gotten:
goto oom;
cow_user_page(new_page, old_page, address, vma);
}
- __SetPageUptodate(new_page);
if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg))
goto oom_free_new;
- mmun_start = address & PAGE_MASK;
- mmun_end = mmun_start + PAGE_SIZE;
+ __SetPageUptodate(new_page);
+
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
/*
@@ -2177,8 +2099,9 @@ gotten:
dec_mm_counter_fast(mm, MM_FILEPAGES);
inc_mm_counter_fast(mm, MM_ANONPAGES);
}
- } else
+ } else {
inc_mm_counter_fast(mm, MM_ANONPAGES);
+ }
flush_cache_page(vma, address, pte_pfn(orig_pte));
entry = mk_pte(new_page, vma->vm_page_prot);
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
@@ -2227,29 +2150,29 @@ gotten:
/* Free the old page.. */
new_page = old_page;
- ret |= VM_FAULT_WRITE;
- } else
+ page_copied = 1;
+ } else {
mem_cgroup_cancel_charge(new_page, memcg);
+ }
if (new_page)
page_cache_release(new_page);
-unlock:
+
pte_unmap_unlock(page_table, ptl);
- if (mmun_end > mmun_start)
- mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
if (old_page) {
/*
* Don't let another task, with possibly unlocked vma,
* keep the mlocked page.
*/
- if ((ret & VM_FAULT_WRITE) && (vma->vm_flags & VM_LOCKED)) {
+ if (page_copied && (vma->vm_flags & VM_LOCKED)) {
lock_page(old_page); /* LRU manipulation */
munlock_vma_page(old_page);
unlock_page(old_page);
}
page_cache_release(old_page);
}
- return ret;
+ return page_copied ? VM_FAULT_WRITE : 0;
oom_free_new:
page_cache_release(new_page);
oom:
@@ -2258,6 +2181,179 @@ oom:
return VM_FAULT_OOM;
}
+/*
+ * Handle write page faults for VM_MIXEDMAP or VM_PFNMAP for a VM_SHARED
+ * mapping
+ */
+static int wp_pfn_shared(struct mm_struct *mm,
+ struct vm_area_struct *vma, unsigned long address,
+ pte_t *page_table, spinlock_t *ptl, pte_t orig_pte,
+ pmd_t *pmd)
+{
+ if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) {
+ struct vm_fault vmf = {
+ .page = NULL,
+ .pgoff = linear_page_index(vma, address),
+ .virtual_address = (void __user *)(address & PAGE_MASK),
+ .flags = FAULT_FLAG_WRITE | FAULT_FLAG_MKWRITE,
+ };
+ int ret;
+
+ pte_unmap_unlock(page_table, ptl);
+ ret = vma->vm_ops->pfn_mkwrite(vma, &vmf);
+ if (ret & VM_FAULT_ERROR)
+ return ret;
+ page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
+ /*
+ * We might have raced with another page fault while we
+ * released the pte_offset_map_lock.
+ */
+ if (!pte_same(*page_table, orig_pte)) {
+ pte_unmap_unlock(page_table, ptl);
+ return 0;
+ }
+ }
+ return wp_page_reuse(mm, vma, address, page_table, ptl, orig_pte,
+ NULL, 0, 0);
+}
+
+static int wp_page_shared(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, pte_t *page_table,
+ pmd_t *pmd, spinlock_t *ptl, pte_t orig_pte,
+ struct page *old_page)
+ __releases(ptl)
+{
+ int page_mkwrite = 0;
+
+ page_cache_get(old_page);
+
+ /*
+ * Only catch write-faults on shared writable pages,
+ * read-only shared pages can get COWed by
+ * get_user_pages(.write=1, .force=1).
+ */
+ if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
+ int tmp;
+
+ pte_unmap_unlock(page_table, ptl);
+ tmp = do_page_mkwrite(vma, old_page, address);
+ if (unlikely(!tmp || (tmp &
+ (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
+ page_cache_release(old_page);
+ return tmp;
+ }
+ /*
+ * Since we dropped the lock we need to revalidate
+ * the PTE as someone else may have changed it. If
+ * they did, we just return, as we can count on the
+ * MMU to tell us if they didn't also make it writable.
+ */
+ page_table = pte_offset_map_lock(mm, pmd, address,
+ &ptl);
+ if (!pte_same(*page_table, orig_pte)) {
+ unlock_page(old_page);
+ pte_unmap_unlock(page_table, ptl);
+ page_cache_release(old_page);
+ return 0;
+ }
+ page_mkwrite = 1;
+ }
+
+ return wp_page_reuse(mm, vma, address, page_table, ptl,
+ orig_pte, old_page, page_mkwrite, 1);
+}
+
+/*
+ * This routine handles present pages, when users try to write
+ * to a shared page. It is done by copying the page to a new address
+ * and decrementing the shared-page counter for the old page.
+ *
+ * Note that this routine assumes that the protection checks have been
+ * done by the caller (the low-level page fault routine in most cases).
+ * Thus we can safely just mark it writable once we've done any necessary
+ * COW.
+ *
+ * We also mark the page dirty at this point even though the page will
+ * change only once the write actually happens. This avoids a few races,
+ * and potentially makes it more efficient.
+ *
+ * We enter with non-exclusive mmap_sem (to exclude vma changes,
+ * but allow concurrent faults), with pte both mapped and locked.
+ * We return with mmap_sem still held, but pte unmapped and unlocked.
+ */
+static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long address, pte_t *page_table, pmd_t *pmd,
+ spinlock_t *ptl, pte_t orig_pte)
+ __releases(ptl)
+{
+ struct page *old_page;
+
+ old_page = vm_normal_page(vma, address, orig_pte);
+ if (!old_page) {
+ /*
+ * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a
+ * VM_PFNMAP VMA.
+ *
+ * We should not cow pages in a shared writeable mapping.
+ * Just mark the pages writable and/or call ops->pfn_mkwrite.
+ */
+ if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
+ (VM_WRITE|VM_SHARED))
+ return wp_pfn_shared(mm, vma, address, page_table, ptl,
+ orig_pte, pmd);
+
+ pte_unmap_unlock(page_table, ptl);
+ return wp_page_copy(mm, vma, address, page_table, pmd,
+ orig_pte, old_page);
+ }
+
+ /*
+ * Take out anonymous pages first, anonymous shared vmas are
+ * not dirty accountable.
+ */
+ if (PageAnon(old_page) && !PageKsm(old_page)) {
+ if (!trylock_page(old_page)) {
+ page_cache_get(old_page);
+ pte_unmap_unlock(page_table, ptl);
+ lock_page(old_page);
+ page_table = pte_offset_map_lock(mm, pmd, address,
+ &ptl);
+ if (!pte_same(*page_table, orig_pte)) {
+ unlock_page(old_page);
+ pte_unmap_unlock(page_table, ptl);
+ page_cache_release(old_page);
+ return 0;
+ }
+ page_cache_release(old_page);
+ }
+ if (reuse_swap_page(old_page)) {
+ /*
+ * The page is all ours. Move it to our anon_vma so
+ * the rmap code will not search our parent or siblings.
+ * Protected against the rmap code by the page lock.
+ */
+ page_move_anon_rmap(old_page, vma, address);
+ unlock_page(old_page);
+ return wp_page_reuse(mm, vma, address, page_table, ptl,
+ orig_pte, old_page, 0, 0);
+ }
+ unlock_page(old_page);
+ } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
+ (VM_WRITE|VM_SHARED))) {
+ return wp_page_shared(mm, vma, address, page_table, pmd,
+ ptl, orig_pte, old_page);
+ }
+
+ /*
+ * Ok, we need to copy. Oh, well..
+ */
+ page_cache_get(old_page);
+
+ pte_unmap_unlock(page_table, ptl);
+ return wp_page_copy(mm, vma, address, page_table, pmd,
+ orig_pte, old_page);
+}
+
static void unmap_mapping_range_vma(struct vm_area_struct *vma,
unsigned long start_addr, unsigned long end_addr,
struct zap_details *details)
@@ -2574,6 +2670,10 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
pte_unmap(page_table);
+ /* File mapping without ->vm_ops ? */
+ if (vma->vm_flags & VM_SHARED)
+ return VM_FAULT_SIGBUS;
+
/* Check if we need to add a guard page to the stack */
if (check_stack_guard_page(vma, address) < 0)
return VM_FAULT_SIGSEGV;
@@ -2594,6 +2694,10 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
page = alloc_zeroed_user_highpage_movable(vma, address);
if (!page)
goto oom;
+
+ if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg))
+ goto oom_free_page;
+
/*
* The memory barrier inside __SetPageUptodate makes sure that
* preceeding stores to the page contents become visible before
@@ -2601,9 +2705,6 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
*/
__SetPageUptodate(page);
- if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg))
- goto oom_free_page;
-
entry = mk_pte(page, vma->vm_page_prot);
if (vma->vm_flags & VM_WRITE)
entry = pte_mkwrite(pte_mkdirty(entry));
@@ -2784,7 +2885,7 @@ static void do_fault_around(struct vm_area_struct *vma, unsigned long address,
struct vm_fault vmf;
int off;
- nr_pages = ACCESS_ONCE(fault_around_bytes) >> PAGE_SHIFT;
+ nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT;
mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK;
start_addr = max(address & mask, vma->vm_start);
@@ -3002,6 +3103,9 @@ static int do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
- vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
pte_unmap(page_table);
+ /* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */
+ if (!vma->vm_ops->fault)
+ return VM_FAULT_SIGBUS;
if (!(flags & FAULT_FLAG_WRITE))
return do_read_fault(mm, vma, address, pmd, pgoff, flags,
orig_pte);
@@ -3147,13 +3251,12 @@ static int handle_pte_fault(struct mm_struct *mm,
barrier();
if (!pte_present(entry)) {
if (pte_none(entry)) {
- if (vma->vm_ops) {
- if (likely(vma->vm_ops->fault))
- return do_fault(mm, vma, address, pte,
- pmd, flags, entry);
- }
- return do_anonymous_page(mm, vma, address,
- pte, pmd, flags);
+ if (vma->vm_ops)
+ return do_fault(mm, vma, address, pte, pmd,
+ flags, entry);
+
+ return do_anonymous_page(mm, vma, address, pte, pmd,
+ flags);
}
return do_swap_page(mm, vma, address,
pte, pmd, flags, entry);
@@ -3629,7 +3732,7 @@ void print_vma_addr(char *prefix, unsigned long ip)
if (buf) {
char *p;
- p = d_path(&f->f_path, buf, PAGE_SIZE);
+ p = file_path(f, buf, PAGE_SIZE);
if (IS_ERR(p))
p = "?";
printk("%s%s[%lx+%lx]", prefix, kbasename(p),
@@ -3642,7 +3745,7 @@ void print_vma_addr(char *prefix, unsigned long ip)
}
#if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_DEBUG_ATOMIC_SLEEP)
-void might_fault(void)
+void __might_fault(const char *file, int line)
{
/*
* Some code (nfs/sunrpc) uses socket ops on kernel memory while
@@ -3652,21 +3755,15 @@ void might_fault(void)
*/
if (segment_eq(get_fs(), KERNEL_DS))
return;
-
- /*
- * it would be nicer only to annotate paths which are not under
- * pagefault_disable, however that requires a larger audit and
- * providing helpers like get_user_atomic.
- */
- if (in_atomic())
+ if (pagefault_disabled())
return;
-
- __might_sleep(__FILE__, __LINE__, 0);
-
+ __might_sleep(file, line, 0);
+#if defined(CONFIG_DEBUG_ATOMIC_SLEEP)
if (current->mm)
might_lock_read(&current->mm->mmap_sem);
+#endif
}
-EXPORT_SYMBOL(might_fault);
+EXPORT_SYMBOL(__might_fault);
#endif
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 65842d688b7c..003dbe4b060d 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -104,7 +104,7 @@ void put_online_mems(void)
}
-static void mem_hotplug_begin(void)
+void mem_hotplug_begin(void)
{
mem_hotplug.active_writer = current;
@@ -119,7 +119,7 @@ static void mem_hotplug_begin(void)
}
}
-static void mem_hotplug_done(void)
+void mem_hotplug_done(void)
{
mem_hotplug.active_writer = NULL;
mutex_unlock(&mem_hotplug.lock);
@@ -446,7 +446,7 @@ static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn)
int nr_pages = PAGES_PER_SECTION;
int nid = pgdat->node_id;
int zone_type;
- unsigned long flags;
+ unsigned long flags, pfn;
int ret;
zone_type = zone - pgdat->node_zones;
@@ -461,6 +461,14 @@ static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn)
pgdat_resize_unlock(zone->zone_pgdat, &flags);
memmap_init_zone(nr_pages, nid, zone_type,
phys_start_pfn, MEMMAP_HOTPLUG);
+
+ /* online_page_range is called later and expects pages reserved */
+ for (pfn = phys_start_pfn; pfn < phys_start_pfn + nr_pages; pfn++) {
+ if (!pfn_valid(pfn))
+ continue;
+
+ SetPageReserved(pfn_to_page(pfn));
+ }
return 0;
}
@@ -502,7 +510,7 @@ int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn,
end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1);
for (i = start_sec; i <= end_sec; i++) {
- err = __add_section(nid, zone, i << PFN_SECTION_SHIFT);
+ err = __add_section(nid, zone, section_nr_to_pfn(i));
/*
* EEXIST is finally dealt with by ioresource collision
@@ -513,6 +521,7 @@ int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn,
break;
err = 0;
}
+ vmemmap_populate_print_last();
return err;
}
@@ -959,6 +968,7 @@ static void node_states_set_node(int node, struct memory_notify *arg)
}
+/* Must be protected by mem_hotplug_begin() */
int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type)
{
unsigned long flags;
@@ -969,7 +979,6 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
int ret;
struct memory_notify arg;
- mem_hotplug_begin();
/*
* This doesn't need a lock to do pfn_to_page().
* The section can't be removed here because of the
@@ -977,21 +986,20 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
*/
zone = page_zone(pfn_to_page(pfn));
- ret = -EINVAL;
if ((zone_idx(zone) > ZONE_NORMAL ||
online_type == MMOP_ONLINE_MOVABLE) &&
!can_online_high_movable(zone))
- goto out;
+ return -EINVAL;
if (online_type == MMOP_ONLINE_KERNEL &&
zone_idx(zone) == ZONE_MOVABLE) {
if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages))
- goto out;
+ return -EINVAL;
}
if (online_type == MMOP_ONLINE_MOVABLE &&
zone_idx(zone) == ZONE_MOVABLE - 1) {
if (move_pfn_range_right(zone, zone + 1, pfn, pfn + nr_pages))
- goto out;
+ return -EINVAL;
}
/* Previous code may changed the zone of the pfn range */
@@ -1007,7 +1015,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
ret = notifier_to_errno(ret);
if (ret) {
memory_notify(MEM_CANCEL_ONLINE, &arg);
- goto out;
+ return ret;
}
/*
* If this zone is not populated, then it is not in zonelist.
@@ -1031,7 +1039,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
(((unsigned long long) pfn + nr_pages)
<< PAGE_SHIFT) - 1);
memory_notify(MEM_CANCEL_ONLINE, &arg);
- goto out;
+ return ret;
}
zone->present_pages += onlined_pages;
@@ -1061,9 +1069,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
if (onlined_pages)
memory_notify(MEM_ONLINE, &arg);
-out:
- mem_hotplug_done();
- return ret;
+ return 0;
}
#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */
@@ -1376,7 +1382,7 @@ static unsigned long scan_movable_pages(unsigned long start, unsigned long end)
if (PageLRU(page))
return pfn;
if (PageHuge(page)) {
- if (is_hugepage_active(page))
+ if (page_huge_active(page))
return pfn;
else
pfn = round_up(pfn + 1,
@@ -1688,21 +1694,18 @@ static int __ref __offline_pages(unsigned long start_pfn,
if (!test_pages_in_a_zone(start_pfn, end_pfn))
return -EINVAL;
- mem_hotplug_begin();
-
zone = page_zone(pfn_to_page(start_pfn));
node = zone_to_nid(zone);
nr_pages = end_pfn - start_pfn;
- ret = -EINVAL;
if (zone_idx(zone) <= ZONE_NORMAL && !can_offline_normal(zone, nr_pages))
- goto out;
+ return -EINVAL;
/* set above range as isolated */
ret = start_isolate_page_range(start_pfn, end_pfn,
MIGRATE_MOVABLE, true);
if (ret)
- goto out;
+ return ret;
arg.start_pfn = start_pfn;
arg.nr_pages = nr_pages;
@@ -1795,7 +1798,6 @@ repeat:
writeback_set_ratelimit();
memory_notify(MEM_OFFLINE, &arg);
- mem_hotplug_done();
return 0;
failed_removal:
@@ -1805,12 +1807,10 @@ failed_removal:
memory_notify(MEM_CANCEL_OFFLINE, &arg);
/* pushback to free area */
undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
-
-out:
- mem_hotplug_done();
return ret;
}
+/* Must be protected by mem_hotplug_begin() */
int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
{
return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ);
@@ -1978,8 +1978,10 @@ void try_offline_node(int nid)
* wait_table may be allocated from boot memory,
* here only free if it's allocated by vmalloc.
*/
- if (is_vmalloc_addr(zone->wait_table))
+ if (is_vmalloc_addr(zone->wait_table)) {
vfree(zone->wait_table);
+ zone->wait_table = NULL;
+ }
}
}
EXPORT_SYMBOL(try_offline_node);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 4721046a134a..99d4c1d0b858 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -945,7 +945,8 @@ static struct page *new_node_page(struct page *page, unsigned long node, int **x
return alloc_huge_page_node(page_hstate(compound_head(page)),
node);
else
- return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0);
+ return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE |
+ __GFP_THISNODE, 0);
}
/*
@@ -1971,34 +1972,41 @@ retry_cpuset:
pol = get_vma_policy(vma, addr);
cpuset_mems_cookie = read_mems_allowed_begin();
- if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage &&
- pol->mode != MPOL_INTERLEAVE)) {
+ if (pol->mode == MPOL_INTERLEAVE) {
+ unsigned nid;
+
+ nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
+ mpol_cond_put(pol);
+ page = alloc_page_interleave(gfp, order, nid);
+ goto out;
+ }
+
+ if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) {
+ int hpage_node = node;
+
/*
* For hugepage allocation and non-interleave policy which
- * allows the current node, we only try to allocate from the
- * current node and don't fall back to other nodes, as the
- * cost of remote accesses would likely offset THP benefits.
+ * allows the current node (or other explicitly preferred
+ * node) we only try to allocate from the current/preferred
+ * node and don't fall back to other nodes, as the cost of
+ * remote accesses would likely offset THP benefits.
*
* If the policy is interleave, or does not allow the current
* node in its nodemask, we allocate the standard way.
*/
+ if (pol->mode == MPOL_PREFERRED &&
+ !(pol->flags & MPOL_F_LOCAL))
+ hpage_node = pol->v.preferred_node;
+
nmask = policy_nodemask(gfp, pol);
- if (!nmask || node_isset(node, *nmask)) {
+ if (!nmask || node_isset(hpage_node, *nmask)) {
mpol_cond_put(pol);
- page = alloc_pages_exact_node(node, gfp, order);
+ page = alloc_pages_exact_node(hpage_node,
+ gfp | __GFP_THISNODE, order);
goto out;
}
}
- if (pol->mode == MPOL_INTERLEAVE) {
- unsigned nid;
-
- nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
- mpol_cond_put(pol);
- page = alloc_page_interleave(gfp, order, nid);
- goto out;
- }
-
nmask = policy_nodemask(gfp, pol);
zl = policy_zonelist(gfp, pol, node);
mpol_cond_put(pol);
@@ -2516,7 +2524,7 @@ static void __init check_numabalancing_enable(void)
if (numabalancing_override)
set_numabalancing_state(numabalancing_override == 1);
- if (nr_node_ids > 1 && !numabalancing_override) {
+ if (num_online_nodes() > 1 && !numabalancing_override) {
pr_info("%s automatic NUMA balancing. "
"Configure with numa_balancing= or the "
"kernel.numa_balancing sysctl",
diff --git a/mm/mempool.c b/mm/mempool.c
index e209c98c7203..2cc08de8b1db 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -6,26 +6,138 @@
* extreme VM load.
*
* started by Ingo Molnar, Copyright (C) 2001
+ * debugging by David Rientjes, Copyright (C) 2015
*/
#include <linux/mm.h>
#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/kasan.h>
#include <linux/kmemleak.h>
#include <linux/export.h>
#include <linux/mempool.h>
#include <linux/blkdev.h>
#include <linux/writeback.h>
+#include "slab.h"
+
+#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_SLUB_DEBUG_ON)
+static void poison_error(mempool_t *pool, void *element, size_t size,
+ size_t byte)
+{
+ const int nr = pool->curr_nr;
+ const int start = max_t(int, byte - (BITS_PER_LONG / 8), 0);
+ const int end = min_t(int, byte + (BITS_PER_LONG / 8), size);
+ int i;
+
+ pr_err("BUG: mempool element poison mismatch\n");
+ pr_err("Mempool %p size %zu\n", pool, size);
+ pr_err(" nr=%d @ %p: %s0x", nr, element, start > 0 ? "... " : "");
+ for (i = start; i < end; i++)
+ pr_cont("%x ", *(u8 *)(element + i));
+ pr_cont("%s\n", end < size ? "..." : "");
+ dump_stack();
+}
+
+static void __check_element(mempool_t *pool, void *element, size_t size)
+{
+ u8 *obj = element;
+ size_t i;
+
+ for (i = 0; i < size; i++) {
+ u8 exp = (i < size - 1) ? POISON_FREE : POISON_END;
+
+ if (obj[i] != exp) {
+ poison_error(pool, element, size, i);
+ return;
+ }
+ }
+ memset(obj, POISON_INUSE, size);
+}
+
+static void check_element(mempool_t *pool, void *element)
+{
+ /* Mempools backed by slab allocator */
+ if (pool->free == mempool_free_slab || pool->free == mempool_kfree)
+ __check_element(pool, element, ksize(element));
+
+ /* Mempools backed by page allocator */
+ if (pool->free == mempool_free_pages) {
+ int order = (int)(long)pool->pool_data;
+ void *addr = kmap_atomic((struct page *)element);
+
+ __check_element(pool, addr, 1UL << (PAGE_SHIFT + order));
+ kunmap_atomic(addr);
+ }
+}
+
+static void __poison_element(void *element, size_t size)
+{
+ u8 *obj = element;
+
+ memset(obj, POISON_FREE, size - 1);
+ obj[size - 1] = POISON_END;
+}
+
+static void poison_element(mempool_t *pool, void *element)
+{
+ /* Mempools backed by slab allocator */
+ if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc)
+ __poison_element(element, ksize(element));
+
+ /* Mempools backed by page allocator */
+ if (pool->alloc == mempool_alloc_pages) {
+ int order = (int)(long)pool->pool_data;
+ void *addr = kmap_atomic((struct page *)element);
+
+ __poison_element(addr, 1UL << (PAGE_SHIFT + order));
+ kunmap_atomic(addr);
+ }
+}
+#else /* CONFIG_DEBUG_SLAB || CONFIG_SLUB_DEBUG_ON */
+static inline void check_element(mempool_t *pool, void *element)
+{
+}
+static inline void poison_element(mempool_t *pool, void *element)
+{
+}
+#endif /* CONFIG_DEBUG_SLAB || CONFIG_SLUB_DEBUG_ON */
+
+static void kasan_poison_element(mempool_t *pool, void *element)
+{
+ if (pool->alloc == mempool_alloc_slab)
+ kasan_slab_free(pool->pool_data, element);
+ if (pool->alloc == mempool_kmalloc)
+ kasan_kfree(element);
+ if (pool->alloc == mempool_alloc_pages)
+ kasan_free_pages(element, (unsigned long)pool->pool_data);
+}
+
+static void kasan_unpoison_element(mempool_t *pool, void *element)
+{
+ if (pool->alloc == mempool_alloc_slab)
+ kasan_slab_alloc(pool->pool_data, element);
+ if (pool->alloc == mempool_kmalloc)
+ kasan_krealloc(element, (size_t)pool->pool_data);
+ if (pool->alloc == mempool_alloc_pages)
+ kasan_alloc_pages(element, (unsigned long)pool->pool_data);
+}
static void add_element(mempool_t *pool, void *element)
{
BUG_ON(pool->curr_nr >= pool->min_nr);
+ poison_element(pool, element);
+ kasan_poison_element(pool, element);
pool->elements[pool->curr_nr++] = element;
}
static void *remove_element(mempool_t *pool)
{
- BUG_ON(pool->curr_nr <= 0);
- return pool->elements[--pool->curr_nr];
+ void *element = pool->elements[--pool->curr_nr];
+
+ BUG_ON(pool->curr_nr < 0);
+ check_element(pool, element);
+ kasan_unpoison_element(pool, element);
+ return element;
}
/**
@@ -113,23 +225,24 @@ EXPORT_SYMBOL(mempool_create_node);
* mempool_create().
* @new_min_nr: the new minimum number of elements guaranteed to be
* allocated for this pool.
- * @gfp_mask: the usual allocation bitmask.
*
* This function shrinks/grows the pool. In the case of growing,
* it cannot be guaranteed that the pool will be grown to the new
* size immediately, but new mempool_free() calls will refill it.
+ * This function may sleep.
*
* Note, the caller must guarantee that no mempool_destroy is called
* while this function is running. mempool_alloc() & mempool_free()
* might be called (eg. from IRQ contexts) while this function executes.
*/
-int mempool_resize(mempool_t *pool, int new_min_nr, gfp_t gfp_mask)
+int mempool_resize(mempool_t *pool, int new_min_nr)
{
void *element;
void **new_elements;
unsigned long flags;
BUG_ON(new_min_nr <= 0);
+ might_sleep();
spin_lock_irqsave(&pool->lock, flags);
if (new_min_nr <= pool->min_nr) {
@@ -145,7 +258,8 @@ int mempool_resize(mempool_t *pool, int new_min_nr, gfp_t gfp_mask)
spin_unlock_irqrestore(&pool->lock, flags);
/* Grow the pool */
- new_elements = kmalloc(new_min_nr * sizeof(*new_elements), gfp_mask);
+ new_elements = kmalloc_array(new_min_nr, sizeof(*new_elements),
+ GFP_KERNEL);
if (!new_elements)
return -ENOMEM;
@@ -164,7 +278,7 @@ int mempool_resize(mempool_t *pool, int new_min_nr, gfp_t gfp_mask)
while (pool->curr_nr < pool->min_nr) {
spin_unlock_irqrestore(&pool->lock, flags);
- element = pool->alloc(gfp_mask, pool->pool_data);
+ element = pool->alloc(GFP_KERNEL, pool->pool_data);
if (!element)
goto out;
spin_lock_irqsave(&pool->lock, flags);
@@ -332,6 +446,7 @@ EXPORT_SYMBOL(mempool_free);
void *mempool_alloc_slab(gfp_t gfp_mask, void *pool_data)
{
struct kmem_cache *mem = pool_data;
+ VM_BUG_ON(mem->ctor);
return kmem_cache_alloc(mem, gfp_mask);
}
EXPORT_SYMBOL(mempool_alloc_slab);
diff --git a/mm/memtest.c b/mm/memtest.c
new file mode 100644
index 000000000000..0a1cc133f6d7
--- /dev/null
+++ b/mm/memtest.c
@@ -0,0 +1,119 @@
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/smp.h>
+#include <linux/init.h>
+#include <linux/pfn.h>
+#include <linux/memblock.h>
+
+static u64 patterns[] __initdata = {
+ /* The first entry has to be 0 to leave memtest with zeroed memory */
+ 0,
+ 0xffffffffffffffffULL,
+ 0x5555555555555555ULL,
+ 0xaaaaaaaaaaaaaaaaULL,
+ 0x1111111111111111ULL,
+ 0x2222222222222222ULL,
+ 0x4444444444444444ULL,
+ 0x8888888888888888ULL,
+ 0x3333333333333333ULL,
+ 0x6666666666666666ULL,
+ 0x9999999999999999ULL,
+ 0xccccccccccccccccULL,
+ 0x7777777777777777ULL,
+ 0xbbbbbbbbbbbbbbbbULL,
+ 0xddddddddddddddddULL,
+ 0xeeeeeeeeeeeeeeeeULL,
+ 0x7a6c7258554e494cULL, /* yeah ;-) */
+};
+
+static void __init reserve_bad_mem(u64 pattern, phys_addr_t start_bad, phys_addr_t end_bad)
+{
+ printk(KERN_INFO " %016llx bad mem addr %010llx - %010llx reserved\n",
+ (unsigned long long) pattern,
+ (unsigned long long) start_bad,
+ (unsigned long long) end_bad);
+ memblock_reserve(start_bad, end_bad - start_bad);
+}
+
+static void __init memtest(u64 pattern, phys_addr_t start_phys, phys_addr_t size)
+{
+ u64 *p, *start, *end;
+ phys_addr_t start_bad, last_bad;
+ phys_addr_t start_phys_aligned;
+ const size_t incr = sizeof(pattern);
+
+ start_phys_aligned = ALIGN(start_phys, incr);
+ start = __va(start_phys_aligned);
+ end = start + (size - (start_phys_aligned - start_phys)) / incr;
+ start_bad = 0;
+ last_bad = 0;
+
+ for (p = start; p < end; p++)
+ *p = pattern;
+
+ for (p = start; p < end; p++, start_phys_aligned += incr) {
+ if (*p == pattern)
+ continue;
+ if (start_phys_aligned == last_bad + incr) {
+ last_bad += incr;
+ continue;
+ }
+ if (start_bad)
+ reserve_bad_mem(pattern, start_bad, last_bad + incr);
+ start_bad = last_bad = start_phys_aligned;
+ }
+ if (start_bad)
+ reserve_bad_mem(pattern, start_bad, last_bad + incr);
+}
+
+static void __init do_one_pass(u64 pattern, phys_addr_t start, phys_addr_t end)
+{
+ u64 i;
+ phys_addr_t this_start, this_end;
+
+ for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE, &this_start,
+ &this_end, NULL) {
+ this_start = clamp(this_start, start, end);
+ this_end = clamp(this_end, start, end);
+ if (this_start < this_end) {
+ printk(KERN_INFO " %010llx - %010llx pattern %016llx\n",
+ (unsigned long long)this_start,
+ (unsigned long long)this_end,
+ (unsigned long long)cpu_to_be64(pattern));
+ memtest(pattern, this_start, this_end - this_start);
+ }
+ }
+}
+
+/* default is disabled */
+static int memtest_pattern __initdata;
+
+static int __init parse_memtest(char *arg)
+{
+ if (arg)
+ memtest_pattern = simple_strtoul(arg, NULL, 0);
+ else
+ memtest_pattern = ARRAY_SIZE(patterns);
+
+ return 0;
+}
+
+early_param("memtest", parse_memtest);
+
+void __init early_memtest(phys_addr_t start, phys_addr_t end)
+{
+ unsigned int i;
+ unsigned int idx = 0;
+
+ if (!memtest_pattern)
+ return;
+
+ printk(KERN_INFO "early_memtest: # of tests: %d\n", memtest_pattern);
+ for (i = memtest_pattern-1; i < UINT_MAX; --i) {
+ idx = i % ARRAY_SIZE(patterns);
+ do_one_pass(patterns[idx], start, end);
+ }
+}
diff --git a/mm/migrate.c b/mm/migrate.c
index 85e042686031..eb4267107d1f 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -537,7 +537,8 @@ void migrate_page_copy(struct page *newpage, struct page *page)
* Please do not reorder this without considering how mm/ksm.c's
* get_ksm_page() depends upon ksm_migrate_page() and PageSwapCache().
*/
- ClearPageSwapCache(page);
+ if (PageSwapCache(page))
+ ClearPageSwapCache(page);
ClearPagePrivate(page);
set_page_private(page, 0);
@@ -879,7 +880,8 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
/* Establish migration ptes or remove ptes */
if (page_mapped(page)) {
try_to_unmap(page,
- TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
+ TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS|
+ TTU_IGNORE_HWPOISON);
page_was_mapped = 1;
}
@@ -901,12 +903,24 @@ out:
}
/*
+ * gcc 4.7 and 4.8 on arm get an ICEs when inlining unmap_and_move(). Work
+ * around it.
+ */
+#if (GCC_VERSION >= 40700 && GCC_VERSION < 40900) && defined(CONFIG_ARM)
+#define ICE_noinline noinline
+#else
+#define ICE_noinline
+#endif
+
+/*
* Obtain the lock on page, remove all ptes and migrate the page
* to the newly allocated page in newpage.
*/
-static int unmap_and_move(new_page_t get_new_page, free_page_t put_new_page,
- unsigned long private, struct page *page, int force,
- enum migrate_mode mode)
+static ICE_noinline int unmap_and_move(new_page_t get_new_page,
+ free_page_t put_new_page,
+ unsigned long private, struct page *page,
+ int force, enum migrate_mode mode,
+ enum migrate_reason reason)
{
int rc = 0;
int *result = NULL;
@@ -937,7 +951,11 @@ out:
list_del(&page->lru);
dec_zone_page_state(page, NR_ISOLATED_ANON +
page_is_file_cache(page));
- putback_lru_page(page);
+ /* Soft-offlined page shouldn't go through lru cache list */
+ if (reason == MR_MEMORY_FAILURE)
+ put_page(page);
+ else
+ putback_lru_page(page);
}
/*
@@ -1110,7 +1128,8 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
pass > 2, mode);
else
rc = unmap_and_move(get_new_page, put_new_page,
- private, page, pass > 2, mode);
+ private, page, pass > 2, mode,
+ reason);
switch(rc) {
case -ENOMEM:
@@ -1554,30 +1573,10 @@ static struct page *alloc_misplaced_dst_page(struct page *page,
* page migration rate limiting control.
* Do not migrate more than @pages_to_migrate in a @migrate_interval_millisecs
* window of time. Default here says do not migrate more than 1280M per second.
- * If a node is rate-limited then PTE NUMA updates are also rate-limited. However
- * as it is faults that reset the window, pte updates will happen unconditionally
- * if there has not been a fault since @pteupdate_interval_millisecs after the
- * throttle window closed.
*/
static unsigned int migrate_interval_millisecs __read_mostly = 100;
-static unsigned int pteupdate_interval_millisecs __read_mostly = 1000;
static unsigned int ratelimit_pages __read_mostly = 128 << (20 - PAGE_SHIFT);
-/* Returns true if NUMA migration is currently rate limited */
-bool migrate_ratelimited(int node)
-{
- pg_data_t *pgdat = NODE_DATA(node);
-
- if (time_after(jiffies, pgdat->numabalancing_migrate_next_window +
- msecs_to_jiffies(pteupdate_interval_millisecs)))
- return false;
-
- if (pgdat->numabalancing_migrate_nr_pages < ratelimit_pages)
- return false;
-
- return true;
-}
-
/* Returns true if the node is migrate rate-limited after the update */
static bool numamigrate_update_ratelimit(pg_data_t *pgdat,
unsigned long nr_pages)
@@ -1804,7 +1803,7 @@ fail_putback:
*/
flush_cache_range(vma, mmun_start, mmun_end);
page_add_anon_rmap(new_page, vma, mmun_start);
- pmdp_clear_flush_notify(vma, mmun_start, pmd);
+ pmdp_huge_clear_flush_notify(vma, mmun_start, pmd);
set_pmd_at(mm, mmun_start, pmd, entry);
flush_tlb_range(vma, mmun_start, mmun_end);
update_mmu_cache_pmd(vma, address, &entry);
diff --git a/mm/mlock.c b/mm/mlock.c
index 8a54cd214925..6fd2cf15e868 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -205,62 +205,6 @@ out:
return nr_pages - 1;
}
-/**
- * __mlock_vma_pages_range() - mlock a range of pages in the vma.
- * @vma: target vma
- * @start: start address
- * @end: end address
- * @nonblocking:
- *
- * This takes care of making the pages present too.
- *
- * return 0 on success, negative error code on error.
- *
- * vma->vm_mm->mmap_sem must be held.
- *
- * If @nonblocking is NULL, it may be held for read or write and will
- * be unperturbed.
- *
- * If @nonblocking is non-NULL, it must held for read only and may be
- * released. If it's released, *@nonblocking will be set to 0.
- */
-long __mlock_vma_pages_range(struct vm_area_struct *vma,
- unsigned long start, unsigned long end, int *nonblocking)
-{
- struct mm_struct *mm = vma->vm_mm;
- unsigned long nr_pages = (end - start) / PAGE_SIZE;
- int gup_flags;
-
- VM_BUG_ON(start & ~PAGE_MASK);
- VM_BUG_ON(end & ~PAGE_MASK);
- VM_BUG_ON_VMA(start < vma->vm_start, vma);
- VM_BUG_ON_VMA(end > vma->vm_end, vma);
- VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_sem), mm);
-
- gup_flags = FOLL_TOUCH | FOLL_MLOCK;
- /*
- * We want to touch writable mappings with a write fault in order
- * to break COW, except for shared mappings because these don't COW
- * and we would not want to dirty them for nothing.
- */
- if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE)
- gup_flags |= FOLL_WRITE;
-
- /*
- * We want mlock to succeed for regions that have any permissions
- * other than PROT_NONE.
- */
- if (vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC))
- gup_flags |= FOLL_FORCE;
-
- /*
- * We made sure addr is within a VMA, so the following will
- * not result in a stack expansion that recurses back here.
- */
- return __get_user_pages(current, mm, start, nr_pages, gup_flags,
- NULL, NULL, nonblocking);
-}
-
/*
* convert get_user_pages() return value to posix mlock() error
*/
@@ -596,7 +540,7 @@ success:
/*
* vm_flags is protected by the mmap_sem held in write mode.
* It's okay if try_to_unmap_one unmaps a page just after we
- * set VM_LOCKED, __mlock_vma_pages_range will bring it back.
+ * set VM_LOCKED, populate_vma_page_range will bring it back.
*/
if (lock)
@@ -660,69 +604,6 @@ static int do_mlock(unsigned long start, size_t len, int on)
return error;
}
-/*
- * __mm_populate - populate and/or mlock pages within a range of address space.
- *
- * This is used to implement mlock() and the MAP_POPULATE / MAP_LOCKED mmap
- * flags. VMAs must be already marked with the desired vm_flags, and
- * mmap_sem must not be held.
- */
-int __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
-{
- struct mm_struct *mm = current->mm;
- unsigned long end, nstart, nend;
- struct vm_area_struct *vma = NULL;
- int locked = 0;
- long ret = 0;
-
- VM_BUG_ON(start & ~PAGE_MASK);
- VM_BUG_ON(len != PAGE_ALIGN(len));
- end = start + len;
-
- for (nstart = start; nstart < end; nstart = nend) {
- /*
- * We want to fault in pages for [nstart; end) address range.
- * Find first corresponding VMA.
- */
- if (!locked) {
- locked = 1;
- down_read(&mm->mmap_sem);
- vma = find_vma(mm, nstart);
- } else if (nstart >= vma->vm_end)
- vma = vma->vm_next;
- if (!vma || vma->vm_start >= end)
- break;
- /*
- * Set [nstart; nend) to intersection of desired address
- * range with the first VMA. Also, skip undesirable VMA types.
- */
- nend = min(end, vma->vm_end);
- if (vma->vm_flags & (VM_IO | VM_PFNMAP))
- continue;
- if (nstart < vma->vm_start)
- nstart = vma->vm_start;
- /*
- * Now fault in a range of pages. __mlock_vma_pages_range()
- * double checks the vma flags, so that it won't mlock pages
- * if the vma was already munlocked.
- */
- ret = __mlock_vma_pages_range(vma, nstart, nend, &locked);
- if (ret < 0) {
- if (ignore_errors) {
- ret = 0;
- continue; /* continue at next VMA */
- }
- ret = __mlock_posix_error_return(ret);
- break;
- }
- nend = nstart + ret * PAGE_SIZE;
- ret = 0;
- }
- if (locked)
- up_read(&mm->mmap_sem);
- return ret; /* 0 or negative error code */
-}
-
SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
{
unsigned long locked;
@@ -750,9 +631,13 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
error = do_mlock(start, len, 1);
up_write(&current->mm->mmap_sem);
- if (!error)
- error = __mm_populate(start, len, 0);
- return error;
+ if (error)
+ return error;
+
+ error = __mm_populate(start, len, 0);
+ if (error)
+ return __mlock_posix_error_return(error);
+ return 0;
}
SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len)
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 5f420f7fafa1..fdadf918de76 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -11,6 +11,7 @@
#include <linux/export.h>
#include <linux/memory.h>
#include <linux/notifier.h>
+#include <linux/sched.h>
#include "internal.h"
#ifdef CONFIG_DEBUG_MEMORY_INIT
@@ -130,14 +131,6 @@ void __init mminit_verify_pageflags_layout(void)
BUG_ON(or_mask != add_mask);
}
-void __meminit mminit_verify_page_links(struct page *page, enum zone_type zone,
- unsigned long nid, unsigned long pfn)
-{
- BUG_ON(page_to_nid(page) != nid);
- BUG_ON(page_zonenum(page) != zone);
- BUG_ON(page_to_pfn(page) != pfn);
-}
-
static __init int set_mminit_loglevel(char *str)
{
get_option(&str, &mminit_loglevel);
diff --git a/mm/mmap.c b/mm/mmap.c
index 9ec50a368634..aa632ade2be7 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1133,7 +1133,7 @@ static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct *
* by another page fault trying to merge _that_. But that's ok: if it
* is being set up, that automatically means that it will be a singleton
* acceptable for merging, so we can do all of this optimistically. But
- * we do that ACCESS_ONCE() to make sure that we never re-load the pointer.
+ * we do that READ_ONCE() to make sure that we never re-load the pointer.
*
* IOW: that the "list_is_singular()" test on the anon_vma_chain only
* matters for the 'stable anon_vma' case (ie the thing we want to avoid
@@ -1147,7 +1147,7 @@ static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct *
static struct anon_vma *reusable_anon_vma(struct vm_area_struct *old, struct vm_area_struct *a, struct vm_area_struct *b)
{
if (anon_vma_compatible(a, b)) {
- struct anon_vma *anon_vma = ACCESS_ONCE(old->anon_vma);
+ struct anon_vma *anon_vma = READ_ONCE(old->anon_vma);
if (anon_vma && list_is_singular(&old->anon_vma_chain))
return anon_vma;
@@ -1258,6 +1258,9 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
*populate = 0;
+ if (!len)
+ return -EINVAL;
+
/*
* Does the application expect PROT_READ to imply PROT_EXEC?
*
@@ -1268,9 +1271,6 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
if (!(file && (file->f_path.mnt->mnt_flags & MNT_NOEXEC)))
prot |= PROT_EXEC;
- if (!len)
- return -EINVAL;
-
if (!(flags & MAP_FIXED))
addr = round_hint_to_min(addr);
@@ -1551,11 +1551,10 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
/* Clear old maps */
error = -ENOMEM;
-munmap_back:
- if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) {
+ while (find_vma_links(mm, addr, addr + len, &prev, &rb_link,
+ &rb_parent)) {
if (do_munmap(mm, addr, len))
return -ENOMEM;
- goto munmap_back;
}
/*
@@ -1571,7 +1570,8 @@ munmap_back:
/*
* Can we just expand an old mapping?
*/
- vma = vma_merge(mm, prev, addr, addr + len, vm_flags, NULL, file, pgoff, NULL);
+ vma = vma_merge(mm, prev, addr, addr + len, vm_flags, NULL, file, pgoff,
+ NULL);
if (vma)
goto out;
@@ -2100,7 +2100,7 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns
actual_size = size;
if (size && (vma->vm_flags & (VM_GROWSUP | VM_GROWSDOWN)))
actual_size -= PAGE_SIZE;
- if (actual_size > ACCESS_ONCE(rlim[RLIMIT_STACK].rlim_cur))
+ if (actual_size > READ_ONCE(rlim[RLIMIT_STACK].rlim_cur))
return -ENOMEM;
/* mlock limit tests */
@@ -2108,7 +2108,7 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns
unsigned long locked;
unsigned long limit;
locked = mm->locked_vm + grow;
- limit = ACCESS_ONCE(rlim[RLIMIT_MEMLOCK].rlim_cur);
+ limit = READ_ONCE(rlim[RLIMIT_MEMLOCK].rlim_cur);
limit >>= PAGE_SHIFT;
if (locked > limit && !capable(CAP_IPC_LOCK))
return -ENOMEM;
@@ -2316,7 +2316,7 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr)
if (!prev || expand_stack(prev, addr))
return NULL;
if (prev->vm_flags & VM_LOCKED)
- __mlock_vma_pages_range(prev, addr, prev->vm_end, NULL);
+ populate_vma_page_range(prev, addr, prev->vm_end, NULL);
return prev;
}
#else
@@ -2351,7 +2351,7 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr)
if (expand_stack(vma, addr))
return NULL;
if (vma->vm_flags & VM_LOCKED)
- __mlock_vma_pages_range(vma, addr, start, NULL);
+ populate_vma_page_range(vma, addr, start, NULL);
return vma;
}
#endif
@@ -2739,11 +2739,10 @@ static unsigned long do_brk(unsigned long addr, unsigned long len)
/*
* Clear old maps. this also does some error checking for us
*/
- munmap_back:
- if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) {
+ while (find_vma_links(mm, addr, addr + len, &prev, &rb_link,
+ &rb_parent)) {
if (do_munmap(mm, addr, len))
return -ENOMEM;
- goto munmap_back;
}
/* Check against address space limits *after* clearing old maps... */
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 88584838e704..e7d6f1171ecb 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -29,6 +29,8 @@
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
+#include "internal.h"
+
/*
* For a prot_numa update we only hold mmap_sem for read so there is a
* potential race with faulting where a pmd was temporarily none. This
@@ -322,6 +324,15 @@ success:
change_protection(vma, start, end, vma->vm_page_prot,
dirty_accountable, 0);
+ /*
+ * Private VM_LOCKED VMA becoming writable: trigger COW to avoid major
+ * fault on access.
+ */
+ if ((oldflags & (VM_WRITE | VM_SHARED | VM_LOCKED)) == VM_LOCKED &&
+ (newflags & VM_WRITE)) {
+ populate_vma_page_range(vma, start, end, NULL);
+ }
+
vm_stat_account(mm, oldflags, vma->vm_file, -nrpages);
vm_stat_account(mm, newflags, vma->vm_file, nrpages);
perf_event_mmap(vma);
diff --git a/mm/mremap.c b/mm/mremap.c
index 57dadc025c64..a7c93eceb1c8 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -22,6 +22,7 @@
#include <linux/mmu_notifier.h>
#include <linux/sched/sysctl.h>
#include <linux/uaccess.h>
+#include <linux/mm-arch-hooks.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
@@ -286,8 +287,18 @@ static unsigned long move_vma(struct vm_area_struct *vma,
old_len = new_len;
old_addr = new_addr;
new_addr = -ENOMEM;
- } else if (vma->vm_file && vma->vm_file->f_op->mremap)
- vma->vm_file->f_op->mremap(vma->vm_file, new_vma);
+ } else {
+ if (vma->vm_file && vma->vm_file->f_op->mremap) {
+ err = vma->vm_file->f_op->mremap(vma->vm_file, new_vma);
+ if (err < 0) {
+ move_page_tables(new_vma, new_addr, vma,
+ old_addr, moved_len, true);
+ return err;
+ }
+ }
+ arch_remap(mm, old_addr, old_addr + old_len,
+ new_addr, new_addr + new_len);
+ }
/* Conceal VM_ACCOUNT so old reservation is not undone */
if (vm_flags & VM_ACCOUNT) {
@@ -339,25 +350,25 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
struct vm_area_struct *vma = find_vma(mm, addr);
if (!vma || vma->vm_start > addr)
- goto Efault;
+ return ERR_PTR(-EFAULT);
if (is_vm_hugetlb_page(vma))
- goto Einval;
+ return ERR_PTR(-EINVAL);
/* We can't remap across vm area boundaries */
if (old_len > vma->vm_end - addr)
- goto Efault;
+ return ERR_PTR(-EFAULT);
/* Need to be careful about a growing mapping */
if (new_len > old_len) {
unsigned long pgoff;
if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP))
- goto Efault;
+ return ERR_PTR(-EFAULT);
pgoff = (addr - vma->vm_start) >> PAGE_SHIFT;
pgoff += vma->vm_pgoff;
if (pgoff + (new_len >> PAGE_SHIFT) < pgoff)
- goto Einval;
+ return ERR_PTR(-EINVAL);
}
if (vma->vm_flags & VM_LOCKED) {
@@ -366,29 +377,20 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
lock_limit = rlimit(RLIMIT_MEMLOCK);
locked += new_len - old_len;
if (locked > lock_limit && !capable(CAP_IPC_LOCK))
- goto Eagain;
+ return ERR_PTR(-EAGAIN);
}
if (!may_expand_vm(mm, (new_len - old_len) >> PAGE_SHIFT))
- goto Enomem;
+ return ERR_PTR(-ENOMEM);
if (vma->vm_flags & VM_ACCOUNT) {
unsigned long charged = (new_len - old_len) >> PAGE_SHIFT;
if (security_vm_enough_memory_mm(mm, charged))
- goto Efault;
+ return ERR_PTR(-ENOMEM);
*p = charged;
}
return vma;
-
-Efault: /* very odd choice for most of the cases, but... */
- return ERR_PTR(-EFAULT);
-Einval:
- return ERR_PTR(-EINVAL);
-Enomem:
- return ERR_PTR(-ENOMEM);
-Eagain:
- return ERR_PTR(-EAGAIN);
}
static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index 90b50468333e..e57cf24babd6 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -37,11 +37,20 @@ static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
{
void *ptr;
u64 addr;
+ ulong flags = choose_memblock_flags();
if (limit > memblock.current_limit)
limit = memblock.current_limit;
- addr = memblock_find_in_range_node(size, align, goal, limit, nid);
+again:
+ addr = memblock_find_in_range_node(size, align, goal, limit, nid,
+ flags);
+ if (!addr && (flags & MEMBLOCK_MIRROR)) {
+ flags &= ~MEMBLOCK_MIRROR;
+ pr_warn("Could not allocate %pap bytes of mirrored memory\n",
+ &size);
+ goto again;
+ }
if (!addr)
return NULL;
@@ -77,7 +86,7 @@ void __init free_bootmem_late(unsigned long addr, unsigned long size)
end = PFN_DOWN(addr + size);
for (; cursor < end; cursor++) {
- __free_pages_bootmem(pfn_to_page(cursor), 0);
+ __free_pages_bootmem(pfn_to_page(cursor), cursor, 0);
totalram_pages++;
}
}
@@ -92,7 +101,7 @@ static void __init __free_pages_memory(unsigned long start, unsigned long end)
while (start + (1UL << order) > end)
order--;
- __free_pages_bootmem(pfn_to_page(start), order);
+ __free_pages_bootmem(pfn_to_page(start), start, order);
start += (1UL << order);
}
@@ -121,7 +130,11 @@ static unsigned long __init free_low_memory_core_early(void)
memblock_clear_hotplug(0, -1);
- for_each_free_mem_range(i, NUMA_NO_NODE, &start, &end, NULL)
+ for_each_reserved_mem_region(i, &start, &end)
+ reserve_bootmem_region(start, end);
+
+ for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end,
+ NULL)
count += __free_memory_core(start, end);
#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK
diff --git a/mm/nommu.c b/mm/nommu.c
index 3fba2dc97c44..58ea3643b9e9 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -42,22 +42,6 @@
#include <asm/mmu_context.h>
#include "internal.h"
-#if 0
-#define kenter(FMT, ...) \
- printk(KERN_DEBUG "==> %s("FMT")\n", __func__, ##__VA_ARGS__)
-#define kleave(FMT, ...) \
- printk(KERN_DEBUG "<== %s()"FMT"\n", __func__, ##__VA_ARGS__)
-#define kdebug(FMT, ...) \
- printk(KERN_DEBUG "xxx" FMT"yyy\n", ##__VA_ARGS__)
-#else
-#define kenter(FMT, ...) \
- no_printk(KERN_DEBUG "==> %s("FMT")\n", __func__, ##__VA_ARGS__)
-#define kleave(FMT, ...) \
- no_printk(KERN_DEBUG "<== %s()"FMT"\n", __func__, ##__VA_ARGS__)
-#define kdebug(FMT, ...) \
- no_printk(KERN_DEBUG FMT"\n", ##__VA_ARGS__)
-#endif
-
void *high_memory;
EXPORT_SYMBOL(high_memory);
struct page *mem_map;
@@ -665,11 +649,7 @@ static void free_page_series(unsigned long from, unsigned long to)
for (; from < to; from += PAGE_SIZE) {
struct page *page = virt_to_page(from);
- kdebug("- free %lx", from);
atomic_long_dec(&mmap_pages_allocated);
- if (page_count(page) != 1)
- kdebug("free page %p: refcount not one: %d",
- page, page_count(page));
put_page(page);
}
}
@@ -683,8 +663,6 @@ static void free_page_series(unsigned long from, unsigned long to)
static void __put_nommu_region(struct vm_region *region)
__releases(nommu_region_sem)
{
- kenter("%p{%d}", region, region->vm_usage);
-
BUG_ON(!nommu_region_tree.rb_node);
if (--region->vm_usage == 0) {
@@ -697,10 +675,8 @@ static void __put_nommu_region(struct vm_region *region)
/* IO memory and memory shared directly out of the pagecache
* from ramfs/tmpfs mustn't be released here */
- if (region->vm_flags & VM_MAPPED_COPY) {
- kdebug("free series");
+ if (region->vm_flags & VM_MAPPED_COPY)
free_page_series(region->vm_start, region->vm_top);
- }
kmem_cache_free(vm_region_jar, region);
} else {
up_write(&nommu_region_sem);
@@ -744,8 +720,6 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
struct address_space *mapping;
struct rb_node **p, *parent, *rb_prev;
- kenter(",%p", vma);
-
BUG_ON(!vma->vm_region);
mm->map_count++;
@@ -813,8 +787,6 @@ static void delete_vma_from_mm(struct vm_area_struct *vma)
struct mm_struct *mm = vma->vm_mm;
struct task_struct *curr = current;
- kenter("%p", vma);
-
protect_vma(vma, 0);
mm->map_count--;
@@ -854,7 +826,6 @@ static void delete_vma_from_mm(struct vm_area_struct *vma)
*/
static void delete_vma(struct mm_struct *mm, struct vm_area_struct *vma)
{
- kenter("%p", vma);
if (vma->vm_ops && vma->vm_ops->close)
vma->vm_ops->close(vma);
if (vma->vm_file)
@@ -957,12 +928,8 @@ static int validate_mmap_request(struct file *file,
int ret;
/* do the simple checks first */
- if (flags & MAP_FIXED) {
- printk(KERN_DEBUG
- "%d: Can't do fixed-address/overlay mmap of RAM\n",
- current->pid);
+ if (flags & MAP_FIXED)
return -EINVAL;
- }
if ((flags & MAP_TYPE) != MAP_PRIVATE &&
(flags & MAP_TYPE) != MAP_SHARED)
@@ -1016,7 +983,7 @@ static int validate_mmap_request(struct file *file,
* device */
if (!file->f_op->get_unmapped_area)
capabilities &= ~NOMMU_MAP_DIRECT;
- if (!file->f_op->read)
+ if (!(file->f_mode & FMODE_CAN_READ))
capabilities &= ~NOMMU_MAP_COPY;
/* The file shall have been opened with read permission. */
@@ -1060,8 +1027,7 @@ static int validate_mmap_request(struct file *file,
) {
capabilities &= ~NOMMU_MAP_DIRECT;
if (flags & MAP_SHARED) {
- printk(KERN_WARNING
- "MAP_SHARED not completely supported on !MMU\n");
+ pr_warn("MAP_SHARED not completely supported on !MMU\n");
return -EINVAL;
}
}
@@ -1205,16 +1171,12 @@ static int do_mmap_private(struct vm_area_struct *vma,
* we're allocating is smaller than a page
*/
order = get_order(len);
- kdebug("alloc order %d for %lx", order, len);
-
total = 1 << order;
point = len >> PAGE_SHIFT;
/* we don't want to allocate a power-of-2 sized page set */
- if (sysctl_nr_trim_pages && total - point >= sysctl_nr_trim_pages) {
+ if (sysctl_nr_trim_pages && total - point >= sysctl_nr_trim_pages)
total = point;
- kdebug("try to alloc exact %lu pages", total);
- }
base = alloc_pages_exact(total << PAGE_SHIFT, GFP_KERNEL);
if (!base)
@@ -1240,7 +1202,7 @@ static int do_mmap_private(struct vm_area_struct *vma,
old_fs = get_fs();
set_fs(KERNEL_DS);
- ret = vma->vm_file->f_op->read(vma->vm_file, base, len, &fpos);
+ ret = __vfs_read(vma->vm_file, base, len, &fpos);
set_fs(old_fs);
if (ret < 0)
@@ -1285,18 +1247,14 @@ unsigned long do_mmap_pgoff(struct file *file,
unsigned long capabilities, vm_flags, result;
int ret;
- kenter(",%lx,%lx,%lx,%lx,%lx", addr, len, prot, flags, pgoff);
-
*populate = 0;
/* decide whether we should attempt the mapping, and if so what sort of
* mapping */
ret = validate_mmap_request(file, addr, len, prot, flags, pgoff,
&capabilities);
- if (ret < 0) {
- kleave(" = %d [val]", ret);
+ if (ret < 0)
return ret;
- }
/* we ignore the address hint */
addr = 0;
@@ -1383,11 +1341,9 @@ unsigned long do_mmap_pgoff(struct file *file,
vma->vm_start = start;
vma->vm_end = start + len;
- if (pregion->vm_flags & VM_MAPPED_COPY) {
- kdebug("share copy");
+ if (pregion->vm_flags & VM_MAPPED_COPY)
vma->vm_flags |= VM_MAPPED_COPY;
- } else {
- kdebug("share mmap");
+ else {
ret = do_mmap_shared_file(vma);
if (ret < 0) {
vma->vm_region = NULL;
@@ -1467,7 +1423,6 @@ share:
up_write(&nommu_region_sem);
- kleave(" = %lx", result);
return result;
error_just_free:
@@ -1479,27 +1434,24 @@ error:
if (vma->vm_file)
fput(vma->vm_file);
kmem_cache_free(vm_area_cachep, vma);
- kleave(" = %d", ret);
return ret;
sharing_violation:
up_write(&nommu_region_sem);
- printk(KERN_WARNING "Attempt to share mismatched mappings\n");
+ pr_warn("Attempt to share mismatched mappings\n");
ret = -EINVAL;
goto error;
error_getting_vma:
kmem_cache_free(vm_region_jar, region);
- printk(KERN_WARNING "Allocation of vma for %lu byte allocation"
- " from process %d failed\n",
- len, current->pid);
+ pr_warn("Allocation of vma for %lu byte allocation from process %d failed\n",
+ len, current->pid);
show_free_areas(0);
return -ENOMEM;
error_getting_region:
- printk(KERN_WARNING "Allocation of vm region for %lu byte allocation"
- " from process %d failed\n",
- len, current->pid);
+ pr_warn("Allocation of vm region for %lu byte allocation from process %d failed\n",
+ len, current->pid);
show_free_areas(0);
return -ENOMEM;
}
@@ -1563,8 +1515,6 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
struct vm_region *region;
unsigned long npages;
- kenter("");
-
/* we're only permitted to split anonymous regions (these should have
* only a single usage on the region) */
if (vma->vm_file)
@@ -1628,8 +1578,6 @@ static int shrink_vma(struct mm_struct *mm,
{
struct vm_region *region;
- kenter("");
-
/* adjust the VMA's pointers, which may reposition it in the MM's tree
* and list */
delete_vma_from_mm(vma);
@@ -1669,8 +1617,6 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
unsigned long end;
int ret;
- kenter(",%lx,%zx", start, len);
-
len = PAGE_ALIGN(len);
if (len == 0)
return -EINVAL;
@@ -1682,11 +1628,9 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
if (!vma) {
static int limit;
if (limit < 5) {
- printk(KERN_WARNING
- "munmap of memory not mmapped by process %d"
- " (%s): 0x%lx-0x%lx\n",
- current->pid, current->comm,
- start, start + len - 1);
+ pr_warn("munmap of memory not mmapped by process %d (%s): 0x%lx-0x%lx\n",
+ current->pid, current->comm,
+ start, start + len - 1);
limit++;
}
return -EINVAL;
@@ -1695,38 +1639,27 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
/* we're allowed to split an anonymous VMA but not a file-backed one */
if (vma->vm_file) {
do {
- if (start > vma->vm_start) {
- kleave(" = -EINVAL [miss]");
+ if (start > vma->vm_start)
return -EINVAL;
- }
if (end == vma->vm_end)
goto erase_whole_vma;
vma = vma->vm_next;
} while (vma);
- kleave(" = -EINVAL [split file]");
return -EINVAL;
} else {
/* the chunk must be a subset of the VMA found */
if (start == vma->vm_start && end == vma->vm_end)
goto erase_whole_vma;
- if (start < vma->vm_start || end > vma->vm_end) {
- kleave(" = -EINVAL [superset]");
+ if (start < vma->vm_start || end > vma->vm_end)
return -EINVAL;
- }
- if (start & ~PAGE_MASK) {
- kleave(" = -EINVAL [unaligned start]");
+ if (start & ~PAGE_MASK)
return -EINVAL;
- }
- if (end != vma->vm_end && end & ~PAGE_MASK) {
- kleave(" = -EINVAL [unaligned split]");
+ if (end != vma->vm_end && end & ~PAGE_MASK)
return -EINVAL;
- }
if (start != vma->vm_start && end != vma->vm_end) {
ret = split_vma(mm, vma, start, 1);
- if (ret < 0) {
- kleave(" = %d [split]", ret);
+ if (ret < 0)
return ret;
- }
}
return shrink_vma(mm, vma, start, end);
}
@@ -1734,7 +1667,6 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
erase_whole_vma:
delete_vma_from_mm(vma);
delete_vma(mm, vma);
- kleave(" = 0");
return 0;
}
EXPORT_SYMBOL(do_munmap);
@@ -1766,8 +1698,6 @@ void exit_mmap(struct mm_struct *mm)
if (!mm)
return;
- kenter("");
-
mm->total_vm = 0;
while ((vma = mm->mmap)) {
@@ -1776,8 +1706,6 @@ void exit_mmap(struct mm_struct *mm)
delete_vma(mm, vma);
cond_resched();
}
-
- kleave("");
}
unsigned long vm_brk(unsigned long addr, unsigned long len)
@@ -2157,7 +2085,7 @@ static int __meminit init_user_reserve(void)
sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17);
return 0;
}
-module_init(init_user_reserve)
+subsys_initcall(init_user_reserve);
/*
* Initialise sysctl_admin_reserve_kbytes.
@@ -2178,4 +2106,4 @@ static int __meminit init_admin_reserve(void)
sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13);
return 0;
}
-module_init(init_admin_reserve)
+subsys_initcall(init_admin_reserve);
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 642f38cb175a..dff991e0681e 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -42,7 +42,8 @@
int sysctl_panic_on_oom;
int sysctl_oom_kill_allocating_task;
int sysctl_oom_dump_tasks = 1;
-static DEFINE_SPINLOCK(zone_scan_lock);
+
+DEFINE_MUTEX(oom_lock);
#ifdef CONFIG_NUMA
/**
@@ -405,16 +406,15 @@ static atomic_t oom_victims = ATOMIC_INIT(0);
static DECLARE_WAIT_QUEUE_HEAD(oom_victims_wait);
bool oom_killer_disabled __read_mostly;
-static DECLARE_RWSEM(oom_sem);
/**
- * mark_tsk_oom_victim - marks the given taks as OOM victim.
+ * mark_oom_victim - mark the given task as OOM victim
* @tsk: task to mark
*
- * Has to be called with oom_sem taken for read and never after
+ * Has to be called with oom_lock held and never after
* oom has been disabled already.
*/
-void mark_tsk_oom_victim(struct task_struct *tsk)
+void mark_oom_victim(struct task_struct *tsk)
{
WARN_ON(oom_killer_disabled);
/* OOM killer might race with memcg OOM */
@@ -431,23 +431,14 @@ void mark_tsk_oom_victim(struct task_struct *tsk)
}
/**
- * unmark_oom_victim - unmarks the current task as OOM victim.
- *
- * Wakes up all waiters in oom_killer_disable()
+ * exit_oom_victim - note the exit of an OOM victim
*/
-void unmark_oom_victim(void)
+void exit_oom_victim(void)
{
- if (!test_and_clear_thread_flag(TIF_MEMDIE))
- return;
+ clear_thread_flag(TIF_MEMDIE);
- down_read(&oom_sem);
- /*
- * There is no need to signal the lasst oom_victim if there
- * is nobody who cares.
- */
- if (!atomic_dec_return(&oom_victims) && oom_killer_disabled)
+ if (!atomic_dec_return(&oom_victims))
wake_up_all(&oom_victims_wait);
- up_read(&oom_sem);
}
/**
@@ -469,14 +460,14 @@ bool oom_killer_disable(void)
* Make sure to not race with an ongoing OOM killer
* and that the current is not the victim.
*/
- down_write(&oom_sem);
+ mutex_lock(&oom_lock);
if (test_thread_flag(TIF_MEMDIE)) {
- up_write(&oom_sem);
+ mutex_unlock(&oom_lock);
return false;
}
oom_killer_disabled = true;
- up_write(&oom_sem);
+ mutex_unlock(&oom_lock);
wait_event(oom_victims_wait, !atomic_read(&oom_victims));
@@ -488,9 +479,7 @@ bool oom_killer_disable(void)
*/
void oom_killer_enable(void)
{
- down_write(&oom_sem);
oom_killer_disabled = false;
- up_write(&oom_sem);
}
#define K(x) ((x) << (PAGE_SHIFT-10))
@@ -517,7 +506,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
*/
task_lock(p);
if (p->mm && task_will_free_mem(p)) {
- mark_tsk_oom_victim(p);
+ mark_oom_victim(p);
task_unlock(p);
put_task_struct(p);
return;
@@ -528,7 +517,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
dump_header(p, gfp_mask, order, memcg, nodemask);
task_lock(p);
- pr_err("%s: Kill process %d (%s) score %d or sacrifice child\n",
+ pr_err("%s: Kill process %d (%s) score %u or sacrifice child\n",
message, task_pid_nr(p), p->comm, points);
task_unlock(p);
@@ -572,7 +561,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
/* mm cannot safely be dereferenced after task_unlock(victim) */
mm = victim->mm;
- mark_tsk_oom_victim(victim);
+ mark_oom_victim(victim);
pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n",
task_pid_nr(victim), victim->comm, K(victim->mm->total_vm),
K(get_mm_counter(victim->mm, MM_ANONPAGES)),
@@ -612,7 +601,8 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
* Determines whether the kernel must panic because of the panic_on_oom sysctl.
*/
void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
- int order, const nodemask_t *nodemask)
+ int order, const nodemask_t *nodemask,
+ struct mem_cgroup *memcg)
{
if (likely(!sysctl_panic_on_oom))
return;
@@ -625,7 +615,7 @@ void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
if (constraint != CONSTRAINT_NONE)
return;
}
- dump_header(NULL, gfp_mask, order, NULL, nodemask);
+ dump_header(NULL, gfp_mask, order, memcg, nodemask);
panic("Out of memory: %s panic_on_oom is enabled\n",
sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide");
}
@@ -644,52 +634,6 @@ int unregister_oom_notifier(struct notifier_block *nb)
}
EXPORT_SYMBOL_GPL(unregister_oom_notifier);
-/*
- * Try to acquire the OOM killer lock for the zones in zonelist. Returns zero
- * if a parallel OOM killing is already taking place that includes a zone in
- * the zonelist. Otherwise, locks all zones in the zonelist and returns 1.
- */
-bool oom_zonelist_trylock(struct zonelist *zonelist, gfp_t gfp_mask)
-{
- struct zoneref *z;
- struct zone *zone;
- bool ret = true;
-
- spin_lock(&zone_scan_lock);
- for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask))
- if (test_bit(ZONE_OOM_LOCKED, &zone->flags)) {
- ret = false;
- goto out;
- }
-
- /*
- * Lock each zone in the zonelist under zone_scan_lock so a parallel
- * call to oom_zonelist_trylock() doesn't succeed when it shouldn't.
- */
- for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask))
- set_bit(ZONE_OOM_LOCKED, &zone->flags);
-
-out:
- spin_unlock(&zone_scan_lock);
- return ret;
-}
-
-/*
- * Clears the ZONE_OOM_LOCKED flag for all zones in the zonelist so that failed
- * allocation attempts with zonelists containing them may now recall the OOM
- * killer, if necessary.
- */
-void oom_zonelist_unlock(struct zonelist *zonelist, gfp_t gfp_mask)
-{
- struct zoneref *z;
- struct zone *zone;
-
- spin_lock(&zone_scan_lock);
- for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask))
- clear_bit(ZONE_OOM_LOCKED, &zone->flags);
- spin_unlock(&zone_scan_lock);
-}
-
/**
* __out_of_memory - kill the "best" process when we run out of memory
* @zonelist: zonelist pointer
@@ -703,8 +647,8 @@ void oom_zonelist_unlock(struct zonelist *zonelist, gfp_t gfp_mask)
* OR try to be smart about which process to kill. Note that we
* don't have to be perfect here, we just have to be good.
*/
-static void __out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
- int order, nodemask_t *nodemask, bool force_kill)
+bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
+ int order, nodemask_t *nodemask, bool force_kill)
{
const nodemask_t *mpol_mask;
struct task_struct *p;
@@ -714,10 +658,13 @@ static void __out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
enum oom_constraint constraint = CONSTRAINT_NONE;
int killed = 0;
+ if (oom_killer_disabled)
+ return false;
+
blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
if (freed > 0)
/* Got some memory back in the last second. */
- return;
+ goto out;
/*
* If current has a pending SIGKILL or is exiting, then automatically
@@ -729,8 +676,8 @@ static void __out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
*/
if (current->mm &&
(fatal_signal_pending(current) || task_will_free_mem(current))) {
- mark_tsk_oom_victim(current);
- return;
+ mark_oom_victim(current);
+ goto out;
}
/*
@@ -740,7 +687,7 @@ static void __out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
constraint = constrained_alloc(zonelist, gfp_mask, nodemask,
&totalpages);
mpol_mask = (constraint == CONSTRAINT_MEMORY_POLICY) ? nodemask : NULL;
- check_panic_on_oom(constraint, gfp_mask, order, mpol_mask);
+ check_panic_on_oom(constraint, gfp_mask, order, mpol_mask, NULL);
if (sysctl_oom_kill_allocating_task && current->mm &&
!oom_unkillable_task(current, NULL, nodemask) &&
@@ -770,32 +717,8 @@ out:
*/
if (killed)
schedule_timeout_killable(1);
-}
-
-/**
- * out_of_memory - tries to invoke OOM killer.
- * @zonelist: zonelist pointer
- * @gfp_mask: memory allocation flags
- * @order: amount of memory being requested as a power of 2
- * @nodemask: nodemask passed to page allocator
- * @force_kill: true if a task must be killed, even if others are exiting
- *
- * invokes __out_of_memory if the OOM is not disabled by oom_killer_disable()
- * when it returns false. Otherwise returns true.
- */
-bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
- int order, nodemask_t *nodemask, bool force_kill)
-{
- bool ret = false;
-
- down_read(&oom_sem);
- if (!oom_killer_disabled) {
- __out_of_memory(zonelist, gfp_mask, order, nodemask, force_kill);
- ret = true;
- }
- up_read(&oom_sem);
- return ret;
+ return true;
}
/*
@@ -805,27 +728,21 @@ bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
*/
void pagefault_out_of_memory(void)
{
- struct zonelist *zonelist;
-
- down_read(&oom_sem);
if (mem_cgroup_oom_synchronize(true))
- goto unlock;
+ return;
- zonelist = node_zonelist(first_memory_node, GFP_KERNEL);
- if (oom_zonelist_trylock(zonelist, GFP_KERNEL)) {
- if (!oom_killer_disabled)
- __out_of_memory(NULL, 0, 0, NULL, false);
- else
- /*
- * There shouldn't be any user tasks runable while the
- * OOM killer is disabled so the current task has to
- * be a racing OOM victim for which oom_killer_disable()
- * is waiting for.
- */
- WARN_ON(test_thread_flag(TIF_MEMDIE));
+ if (!mutex_trylock(&oom_lock))
+ return;
- oom_zonelist_unlock(zonelist, GFP_KERNEL);
+ if (!out_of_memory(NULL, 0, 0, NULL, false)) {
+ /*
+ * There shouldn't be any user tasks runnable while the
+ * OOM killer is disabled, so the current task has to
+ * be a racing OOM victim for which oom_killer_disable()
+ * is waiting for.
+ */
+ WARN_ON(test_thread_flag(TIF_MEMDIE));
}
-unlock:
- up_read(&oom_sem);
+
+ mutex_unlock(&oom_lock);
}
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 644bcb665773..5cccc127ef81 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -122,31 +122,31 @@ EXPORT_SYMBOL(laptop_mode);
/* End of sysctl-exported parameters */
-unsigned long global_dirty_limit;
+struct wb_domain global_wb_domain;
-/*
- * Scale the writeback cache size proportional to the relative writeout speeds.
- *
- * We do this by keeping a floating proportion between BDIs, based on page
- * writeback completions [end_page_writeback()]. Those devices that write out
- * pages fastest will get the larger share, while the slower will get a smaller
- * share.
- *
- * We use page writeout completions because we are interested in getting rid of
- * dirty pages. Having them written out is the primary goal.
- *
- * We introduce a concept of time, a period over which we measure these events,
- * because demand can/will vary over time. The length of this period itself is
- * measured in page writeback completions.
- *
- */
-static struct fprop_global writeout_completions;
+/* consolidated parameters for balance_dirty_pages() and its subroutines */
+struct dirty_throttle_control {
+#ifdef CONFIG_CGROUP_WRITEBACK
+ struct wb_domain *dom;
+ struct dirty_throttle_control *gdtc; /* only set in memcg dtc's */
+#endif
+ struct bdi_writeback *wb;
+ struct fprop_local_percpu *wb_completions;
-static void writeout_period(unsigned long t);
-/* Timer for aging of writeout_completions */
-static struct timer_list writeout_period_timer =
- TIMER_DEFERRED_INITIALIZER(writeout_period, 0, 0);
-static unsigned long writeout_period_time = 0;
+ unsigned long avail; /* dirtyable */
+ unsigned long dirty; /* file_dirty + write + nfs */
+ unsigned long thresh; /* dirty threshold */
+ unsigned long bg_thresh; /* dirty background threshold */
+
+ unsigned long wb_dirty; /* per-wb counterparts */
+ unsigned long wb_thresh;
+ unsigned long wb_bg_thresh;
+
+ unsigned long pos_ratio;
+};
+
+#define DTC_INIT_COMMON(__wb) .wb = (__wb), \
+ .wb_completions = &(__wb)->completions
/*
* Length of period for aging writeout fractions of bdis. This is an
@@ -155,6 +155,97 @@ static unsigned long writeout_period_time = 0;
*/
#define VM_COMPLETIONS_PERIOD_LEN (3*HZ)
+#ifdef CONFIG_CGROUP_WRITEBACK
+
+#define GDTC_INIT(__wb) .dom = &global_wb_domain, \
+ DTC_INIT_COMMON(__wb)
+#define GDTC_INIT_NO_WB .dom = &global_wb_domain
+#define MDTC_INIT(__wb, __gdtc) .dom = mem_cgroup_wb_domain(__wb), \
+ .gdtc = __gdtc, \
+ DTC_INIT_COMMON(__wb)
+
+static bool mdtc_valid(struct dirty_throttle_control *dtc)
+{
+ return dtc->dom;
+}
+
+static struct wb_domain *dtc_dom(struct dirty_throttle_control *dtc)
+{
+ return dtc->dom;
+}
+
+static struct dirty_throttle_control *mdtc_gdtc(struct dirty_throttle_control *mdtc)
+{
+ return mdtc->gdtc;
+}
+
+static struct fprop_local_percpu *wb_memcg_completions(struct bdi_writeback *wb)
+{
+ return &wb->memcg_completions;
+}
+
+static void wb_min_max_ratio(struct bdi_writeback *wb,
+ unsigned long *minp, unsigned long *maxp)
+{
+ unsigned long this_bw = wb->avg_write_bandwidth;
+ unsigned long tot_bw = atomic_long_read(&wb->bdi->tot_write_bandwidth);
+ unsigned long long min = wb->bdi->min_ratio;
+ unsigned long long max = wb->bdi->max_ratio;
+
+ /*
+ * @wb may already be clean by the time control reaches here and
+ * the total may not include its bw.
+ */
+ if (this_bw < tot_bw) {
+ if (min) {
+ min *= this_bw;
+ do_div(min, tot_bw);
+ }
+ if (max < 100) {
+ max *= this_bw;
+ do_div(max, tot_bw);
+ }
+ }
+
+ *minp = min;
+ *maxp = max;
+}
+
+#else /* CONFIG_CGROUP_WRITEBACK */
+
+#define GDTC_INIT(__wb) DTC_INIT_COMMON(__wb)
+#define GDTC_INIT_NO_WB
+#define MDTC_INIT(__wb, __gdtc)
+
+static bool mdtc_valid(struct dirty_throttle_control *dtc)
+{
+ return false;
+}
+
+static struct wb_domain *dtc_dom(struct dirty_throttle_control *dtc)
+{
+ return &global_wb_domain;
+}
+
+static struct dirty_throttle_control *mdtc_gdtc(struct dirty_throttle_control *mdtc)
+{
+ return NULL;
+}
+
+static struct fprop_local_percpu *wb_memcg_completions(struct bdi_writeback *wb)
+{
+ return NULL;
+}
+
+static void wb_min_max_ratio(struct bdi_writeback *wb,
+ unsigned long *minp, unsigned long *maxp)
+{
+ *minp = wb->bdi->min_ratio;
+ *maxp = wb->bdi->max_ratio;
+}
+
+#endif /* CONFIG_CGROUP_WRITEBACK */
+
/*
* In a memory zone, there is a certain amount of pages we consider
* available for the page cache, which is essentially the number of
@@ -250,42 +341,88 @@ static unsigned long global_dirtyable_memory(void)
return x + 1; /* Ensure that we never return 0 */
}
-/*
- * global_dirty_limits - background-writeback and dirty-throttling thresholds
+/**
+ * domain_dirty_limits - calculate thresh and bg_thresh for a wb_domain
+ * @dtc: dirty_throttle_control of interest
*
- * Calculate the dirty thresholds based on sysctl parameters
- * - vm.dirty_background_ratio or vm.dirty_background_bytes
- * - vm.dirty_ratio or vm.dirty_bytes
- * The dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and
+ * Calculate @dtc->thresh and ->bg_thresh considering
+ * vm_dirty_{bytes|ratio} and dirty_background_{bytes|ratio}. The caller
+ * must ensure that @dtc->avail is set before calling this function. The
+ * dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and
* real-time tasks.
*/
-void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
+static void domain_dirty_limits(struct dirty_throttle_control *dtc)
{
- const unsigned long available_memory = global_dirtyable_memory();
- unsigned long background;
- unsigned long dirty;
+ const unsigned long available_memory = dtc->avail;
+ struct dirty_throttle_control *gdtc = mdtc_gdtc(dtc);
+ unsigned long bytes = vm_dirty_bytes;
+ unsigned long bg_bytes = dirty_background_bytes;
+ unsigned long ratio = vm_dirty_ratio;
+ unsigned long bg_ratio = dirty_background_ratio;
+ unsigned long thresh;
+ unsigned long bg_thresh;
struct task_struct *tsk;
- if (vm_dirty_bytes)
- dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE);
+ /* gdtc is !NULL iff @dtc is for memcg domain */
+ if (gdtc) {
+ unsigned long global_avail = gdtc->avail;
+
+ /*
+ * The byte settings can't be applied directly to memcg
+ * domains. Convert them to ratios by scaling against
+ * globally available memory.
+ */
+ if (bytes)
+ ratio = min(DIV_ROUND_UP(bytes, PAGE_SIZE) * 100 /
+ global_avail, 100UL);
+ if (bg_bytes)
+ bg_ratio = min(DIV_ROUND_UP(bg_bytes, PAGE_SIZE) * 100 /
+ global_avail, 100UL);
+ bytes = bg_bytes = 0;
+ }
+
+ if (bytes)
+ thresh = DIV_ROUND_UP(bytes, PAGE_SIZE);
else
- dirty = (vm_dirty_ratio * available_memory) / 100;
+ thresh = (ratio * available_memory) / 100;
- if (dirty_background_bytes)
- background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE);
+ if (bg_bytes)
+ bg_thresh = DIV_ROUND_UP(bg_bytes, PAGE_SIZE);
else
- background = (dirty_background_ratio * available_memory) / 100;
+ bg_thresh = (bg_ratio * available_memory) / 100;
- if (background >= dirty)
- background = dirty / 2;
+ if (bg_thresh >= thresh)
+ bg_thresh = thresh / 2;
tsk = current;
if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) {
- background += background / 4;
- dirty += dirty / 4;
+ bg_thresh += bg_thresh / 4;
+ thresh += thresh / 4;
}
- *pbackground = background;
- *pdirty = dirty;
- trace_global_dirty_state(background, dirty);
+ dtc->thresh = thresh;
+ dtc->bg_thresh = bg_thresh;
+
+ /* we should eventually report the domain in the TP */
+ if (!gdtc)
+ trace_global_dirty_state(bg_thresh, thresh);
+}
+
+/**
+ * global_dirty_limits - background-writeback and dirty-throttling thresholds
+ * @pbackground: out parameter for bg_thresh
+ * @pdirty: out parameter for thresh
+ *
+ * Calculate bg_thresh and thresh for global_wb_domain. See
+ * domain_dirty_limits() for details.
+ */
+void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
+{
+ struct dirty_throttle_control gdtc = { GDTC_INIT_NO_WB };
+
+ gdtc.avail = global_dirtyable_memory();
+ domain_dirty_limits(&gdtc);
+
+ *pbackground = gdtc.bg_thresh;
+ *pdirty = gdtc.thresh;
}
/**
@@ -392,47 +529,52 @@ static unsigned long wp_next_time(unsigned long cur_time)
return cur_time;
}
-/*
- * Increment the BDI's writeout completion count and the global writeout
- * completion count. Called from test_clear_page_writeback().
- */
-static inline void __bdi_writeout_inc(struct backing_dev_info *bdi)
+static void wb_domain_writeout_inc(struct wb_domain *dom,
+ struct fprop_local_percpu *completions,
+ unsigned int max_prop_frac)
{
- __inc_bdi_stat(bdi, BDI_WRITTEN);
- __fprop_inc_percpu_max(&writeout_completions, &bdi->completions,
- bdi->max_prop_frac);
+ __fprop_inc_percpu_max(&dom->completions, completions,
+ max_prop_frac);
/* First event after period switching was turned off? */
- if (!unlikely(writeout_period_time)) {
+ if (!unlikely(dom->period_time)) {
/*
* We can race with other __bdi_writeout_inc calls here but
* it does not cause any harm since the resulting time when
* timer will fire and what is in writeout_period_time will be
* roughly the same.
*/
- writeout_period_time = wp_next_time(jiffies);
- mod_timer(&writeout_period_timer, writeout_period_time);
+ dom->period_time = wp_next_time(jiffies);
+ mod_timer(&dom->period_timer, dom->period_time);
}
}
-void bdi_writeout_inc(struct backing_dev_info *bdi)
+/*
+ * Increment @wb's writeout completion count and the global writeout
+ * completion count. Called from test_clear_page_writeback().
+ */
+static inline void __wb_writeout_inc(struct bdi_writeback *wb)
+{
+ struct wb_domain *cgdom;
+
+ __inc_wb_stat(wb, WB_WRITTEN);
+ wb_domain_writeout_inc(&global_wb_domain, &wb->completions,
+ wb->bdi->max_prop_frac);
+
+ cgdom = mem_cgroup_wb_domain(wb);
+ if (cgdom)
+ wb_domain_writeout_inc(cgdom, wb_memcg_completions(wb),
+ wb->bdi->max_prop_frac);
+}
+
+void wb_writeout_inc(struct bdi_writeback *wb)
{
unsigned long flags;
local_irq_save(flags);
- __bdi_writeout_inc(bdi);
+ __wb_writeout_inc(wb);
local_irq_restore(flags);
}
-EXPORT_SYMBOL_GPL(bdi_writeout_inc);
-
-/*
- * Obtain an accurate fraction of the BDI's portion.
- */
-static void bdi_writeout_fraction(struct backing_dev_info *bdi,
- long *numerator, long *denominator)
-{
- fprop_fraction_percpu(&writeout_completions, &bdi->completions,
- numerator, denominator);
-}
+EXPORT_SYMBOL_GPL(wb_writeout_inc);
/*
* On idle system, we can be called long after we scheduled because we use
@@ -440,22 +582,46 @@ static void bdi_writeout_fraction(struct backing_dev_info *bdi,
*/
static void writeout_period(unsigned long t)
{
- int miss_periods = (jiffies - writeout_period_time) /
+ struct wb_domain *dom = (void *)t;
+ int miss_periods = (jiffies - dom->period_time) /
VM_COMPLETIONS_PERIOD_LEN;
- if (fprop_new_period(&writeout_completions, miss_periods + 1)) {
- writeout_period_time = wp_next_time(writeout_period_time +
+ if (fprop_new_period(&dom->completions, miss_periods + 1)) {
+ dom->period_time = wp_next_time(dom->period_time +
miss_periods * VM_COMPLETIONS_PERIOD_LEN);
- mod_timer(&writeout_period_timer, writeout_period_time);
+ mod_timer(&dom->period_timer, dom->period_time);
} else {
/*
* Aging has zeroed all fractions. Stop wasting CPU on period
* updates.
*/
- writeout_period_time = 0;
+ dom->period_time = 0;
}
}
+int wb_domain_init(struct wb_domain *dom, gfp_t gfp)
+{
+ memset(dom, 0, sizeof(*dom));
+
+ spin_lock_init(&dom->lock);
+
+ init_timer_deferrable(&dom->period_timer);
+ dom->period_timer.function = writeout_period;
+ dom->period_timer.data = (unsigned long)dom;
+
+ dom->dirty_limit_tstamp = jiffies;
+
+ return fprop_global_init(&dom->completions, gfp);
+}
+
+#ifdef CONFIG_CGROUP_WRITEBACK
+void wb_domain_exit(struct wb_domain *dom)
+{
+ del_timer_sync(&dom->period_timer);
+ fprop_global_destroy(&dom->completions);
+}
+#endif
+
/*
* bdi_min_ratio keeps the sum of the minimum dirty shares of all
* registered backing devices, which, for obvious reasons, can not
@@ -510,17 +676,26 @@ static unsigned long dirty_freerun_ceiling(unsigned long thresh,
return (thresh + bg_thresh) / 2;
}
-static unsigned long hard_dirty_limit(unsigned long thresh)
+static unsigned long hard_dirty_limit(struct wb_domain *dom,
+ unsigned long thresh)
{
- return max(thresh, global_dirty_limit);
+ return max(thresh, dom->dirty_limit);
+}
+
+/* memory available to a memcg domain is capped by system-wide clean memory */
+static void mdtc_cap_avail(struct dirty_throttle_control *mdtc)
+{
+ struct dirty_throttle_control *gdtc = mdtc_gdtc(mdtc);
+ unsigned long clean = gdtc->avail - min(gdtc->avail, gdtc->dirty);
+
+ mdtc->avail = min(mdtc->avail, clean);
}
/**
- * bdi_dirty_limit - @bdi's share of dirty throttling threshold
- * @bdi: the backing_dev_info to query
- * @dirty: global dirty limit in pages
+ * __wb_calc_thresh - @wb's share of dirty throttling threshold
+ * @dtc: dirty_throttle_context of interest
*
- * Returns @bdi's dirty limit in pages. The term "dirty" in the context of
+ * Returns @wb's dirty limit in pages. The term "dirty" in the context of
* dirty balancing includes all PG_dirty, PG_writeback and NFS unstable pages.
*
* Note that balance_dirty_pages() will only seriously take it as a hard limit
@@ -528,34 +703,47 @@ static unsigned long hard_dirty_limit(unsigned long thresh)
* control. For example, when the device is completely stalled due to some error
* conditions, or when there are 1000 dd tasks writing to a slow 10MB/s USB key.
* In the other normal situations, it acts more gently by throttling the tasks
- * more (rather than completely block them) when the bdi dirty pages go high.
+ * more (rather than completely block them) when the wb dirty pages go high.
*
* It allocates high/low dirty limits to fast/slow devices, in order to prevent
* - starving fast devices
* - piling up dirty pages (that will take long time to sync) on slow devices
*
- * The bdi's share of dirty limit will be adapting to its throughput and
+ * The wb's share of dirty limit will be adapting to its throughput and
* bounded by the bdi->min_ratio and/or bdi->max_ratio parameters, if set.
*/
-unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty)
+static unsigned long __wb_calc_thresh(struct dirty_throttle_control *dtc)
{
- u64 bdi_dirty;
+ struct wb_domain *dom = dtc_dom(dtc);
+ unsigned long thresh = dtc->thresh;
+ u64 wb_thresh;
long numerator, denominator;
+ unsigned long wb_min_ratio, wb_max_ratio;
/*
- * Calculate this BDI's share of the dirty ratio.
+ * Calculate this BDI's share of the thresh ratio.
*/
- bdi_writeout_fraction(bdi, &numerator, &denominator);
+ fprop_fraction_percpu(&dom->completions, dtc->wb_completions,
+ &numerator, &denominator);
+
+ wb_thresh = (thresh * (100 - bdi_min_ratio)) / 100;
+ wb_thresh *= numerator;
+ do_div(wb_thresh, denominator);
- bdi_dirty = (dirty * (100 - bdi_min_ratio)) / 100;
- bdi_dirty *= numerator;
- do_div(bdi_dirty, denominator);
+ wb_min_max_ratio(dtc->wb, &wb_min_ratio, &wb_max_ratio);
- bdi_dirty += (dirty * bdi->min_ratio) / 100;
- if (bdi_dirty > (dirty * bdi->max_ratio) / 100)
- bdi_dirty = dirty * bdi->max_ratio / 100;
+ wb_thresh += (thresh * wb_min_ratio) / 100;
+ if (wb_thresh > (thresh * wb_max_ratio) / 100)
+ wb_thresh = thresh * wb_max_ratio / 100;
- return bdi_dirty;
+ return wb_thresh;
+}
+
+unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh)
+{
+ struct dirty_throttle_control gdtc = { GDTC_INIT(wb),
+ .thresh = thresh };
+ return __wb_calc_thresh(&gdtc);
}
/*
@@ -580,7 +768,7 @@ static long long pos_ratio_polynom(unsigned long setpoint,
long x;
x = div64_s64(((s64)setpoint - (s64)dirty) << RATELIMIT_CALC_SHIFT,
- limit - setpoint + 1);
+ (limit - setpoint) | 1);
pos_ratio = x;
pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT;
pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT;
@@ -594,7 +782,7 @@ static long long pos_ratio_polynom(unsigned long setpoint,
*
* (o) global/bdi setpoints
*
- * We want the dirty pages be balanced around the global/bdi setpoints.
+ * We want the dirty pages be balanced around the global/wb setpoints.
* When the number of dirty pages is higher/lower than the setpoint, the
* dirty position control ratio (and hence task dirty ratelimit) will be
* decreased/increased to bring the dirty pages back to the setpoint.
@@ -604,8 +792,8 @@ static long long pos_ratio_polynom(unsigned long setpoint,
* if (dirty < setpoint) scale up pos_ratio
* if (dirty > setpoint) scale down pos_ratio
*
- * if (bdi_dirty < bdi_setpoint) scale up pos_ratio
- * if (bdi_dirty > bdi_setpoint) scale down pos_ratio
+ * if (wb_dirty < wb_setpoint) scale up pos_ratio
+ * if (wb_dirty > wb_setpoint) scale down pos_ratio
*
* task_ratelimit = dirty_ratelimit * pos_ratio >> RATELIMIT_CALC_SHIFT
*
@@ -630,7 +818,7 @@ static long long pos_ratio_polynom(unsigned long setpoint,
* 0 +------------.------------------.----------------------*------------->
* freerun^ setpoint^ limit^ dirty pages
*
- * (o) bdi control line
+ * (o) wb control line
*
* ^ pos_ratio
* |
@@ -656,33 +844,32 @@ static long long pos_ratio_polynom(unsigned long setpoint,
* | . .
* | . .
* 0 +----------------------.-------------------------------.------------->
- * bdi_setpoint^ x_intercept^
+ * wb_setpoint^ x_intercept^
*
- * The bdi control line won't drop below pos_ratio=1/4, so that bdi_dirty can
+ * The wb control line won't drop below pos_ratio=1/4, so that wb_dirty can
* be smoothly throttled down to normal if it starts high in situations like
* - start writing to a slow SD card and a fast disk at the same time. The SD
- * card's bdi_dirty may rush to many times higher than bdi_setpoint.
- * - the bdi dirty thresh drops quickly due to change of JBOD workload
+ * card's wb_dirty may rush to many times higher than wb_setpoint.
+ * - the wb dirty thresh drops quickly due to change of JBOD workload
*/
-static unsigned long bdi_position_ratio(struct backing_dev_info *bdi,
- unsigned long thresh,
- unsigned long bg_thresh,
- unsigned long dirty,
- unsigned long bdi_thresh,
- unsigned long bdi_dirty)
-{
- unsigned long write_bw = bdi->avg_write_bandwidth;
- unsigned long freerun = dirty_freerun_ceiling(thresh, bg_thresh);
- unsigned long limit = hard_dirty_limit(thresh);
+static void wb_position_ratio(struct dirty_throttle_control *dtc)
+{
+ struct bdi_writeback *wb = dtc->wb;
+ unsigned long write_bw = wb->avg_write_bandwidth;
+ unsigned long freerun = dirty_freerun_ceiling(dtc->thresh, dtc->bg_thresh);
+ unsigned long limit = hard_dirty_limit(dtc_dom(dtc), dtc->thresh);
+ unsigned long wb_thresh = dtc->wb_thresh;
unsigned long x_intercept;
unsigned long setpoint; /* dirty pages' target balance point */
- unsigned long bdi_setpoint;
+ unsigned long wb_setpoint;
unsigned long span;
long long pos_ratio; /* for scaling up/down the rate limit */
long x;
- if (unlikely(dirty >= limit))
- return 0;
+ dtc->pos_ratio = 0;
+
+ if (unlikely(dtc->dirty >= limit))
+ return;
/*
* global setpoint
@@ -690,165 +877,167 @@ static unsigned long bdi_position_ratio(struct backing_dev_info *bdi,
* See comment for pos_ratio_polynom().
*/
setpoint = (freerun + limit) / 2;
- pos_ratio = pos_ratio_polynom(setpoint, dirty, limit);
+ pos_ratio = pos_ratio_polynom(setpoint, dtc->dirty, limit);
/*
* The strictlimit feature is a tool preventing mistrusted filesystems
* from growing a large number of dirty pages before throttling. For
- * such filesystems balance_dirty_pages always checks bdi counters
- * against bdi limits. Even if global "nr_dirty" is under "freerun".
+ * such filesystems balance_dirty_pages always checks wb counters
+ * against wb limits. Even if global "nr_dirty" is under "freerun".
* This is especially important for fuse which sets bdi->max_ratio to
* 1% by default. Without strictlimit feature, fuse writeback may
* consume arbitrary amount of RAM because it is accounted in
* NR_WRITEBACK_TEMP which is not involved in calculating "nr_dirty".
*
- * Here, in bdi_position_ratio(), we calculate pos_ratio based on
- * two values: bdi_dirty and bdi_thresh. Let's consider an example:
+ * Here, in wb_position_ratio(), we calculate pos_ratio based on
+ * two values: wb_dirty and wb_thresh. Let's consider an example:
* total amount of RAM is 16GB, bdi->max_ratio is equal to 1%, global
* limits are set by default to 10% and 20% (background and throttle).
- * Then bdi_thresh is 1% of 20% of 16GB. This amounts to ~8K pages.
- * bdi_dirty_limit(bdi, bg_thresh) is about ~4K pages. bdi_setpoint is
- * about ~6K pages (as the average of background and throttle bdi
+ * Then wb_thresh is 1% of 20% of 16GB. This amounts to ~8K pages.
+ * wb_calc_thresh(wb, bg_thresh) is about ~4K pages. wb_setpoint is
+ * about ~6K pages (as the average of background and throttle wb
* limits). The 3rd order polynomial will provide positive feedback if
- * bdi_dirty is under bdi_setpoint and vice versa.
+ * wb_dirty is under wb_setpoint and vice versa.
*
* Note, that we cannot use global counters in these calculations
- * because we want to throttle process writing to a strictlimit BDI
+ * because we want to throttle process writing to a strictlimit wb
* much earlier than global "freerun" is reached (~23MB vs. ~2.3GB
* in the example above).
*/
- if (unlikely(bdi->capabilities & BDI_CAP_STRICTLIMIT)) {
- long long bdi_pos_ratio;
- unsigned long bdi_bg_thresh;
+ if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) {
+ long long wb_pos_ratio;
- if (bdi_dirty < 8)
- return min_t(long long, pos_ratio * 2,
- 2 << RATELIMIT_CALC_SHIFT);
+ if (dtc->wb_dirty < 8) {
+ dtc->pos_ratio = min_t(long long, pos_ratio * 2,
+ 2 << RATELIMIT_CALC_SHIFT);
+ return;
+ }
- if (bdi_dirty >= bdi_thresh)
- return 0;
+ if (dtc->wb_dirty >= wb_thresh)
+ return;
- bdi_bg_thresh = div_u64((u64)bdi_thresh * bg_thresh, thresh);
- bdi_setpoint = dirty_freerun_ceiling(bdi_thresh,
- bdi_bg_thresh);
+ wb_setpoint = dirty_freerun_ceiling(wb_thresh,
+ dtc->wb_bg_thresh);
- if (bdi_setpoint == 0 || bdi_setpoint == bdi_thresh)
- return 0;
+ if (wb_setpoint == 0 || wb_setpoint == wb_thresh)
+ return;
- bdi_pos_ratio = pos_ratio_polynom(bdi_setpoint, bdi_dirty,
- bdi_thresh);
+ wb_pos_ratio = pos_ratio_polynom(wb_setpoint, dtc->wb_dirty,
+ wb_thresh);
/*
- * Typically, for strictlimit case, bdi_setpoint << setpoint
- * and pos_ratio >> bdi_pos_ratio. In the other words global
+ * Typically, for strictlimit case, wb_setpoint << setpoint
+ * and pos_ratio >> wb_pos_ratio. In the other words global
* state ("dirty") is not limiting factor and we have to
- * make decision based on bdi counters. But there is an
+ * make decision based on wb counters. But there is an
* important case when global pos_ratio should get precedence:
* global limits are exceeded (e.g. due to activities on other
- * BDIs) while given strictlimit BDI is below limit.
+ * wb's) while given strictlimit wb is below limit.
*
- * "pos_ratio * bdi_pos_ratio" would work for the case above,
+ * "pos_ratio * wb_pos_ratio" would work for the case above,
* but it would look too non-natural for the case of all
- * activity in the system coming from a single strictlimit BDI
+ * activity in the system coming from a single strictlimit wb
* with bdi->max_ratio == 100%.
*
* Note that min() below somewhat changes the dynamics of the
* control system. Normally, pos_ratio value can be well over 3
- * (when globally we are at freerun and bdi is well below bdi
+ * (when globally we are at freerun and wb is well below wb
* setpoint). Now the maximum pos_ratio in the same situation
* is 2. We might want to tweak this if we observe the control
* system is too slow to adapt.
*/
- return min(pos_ratio, bdi_pos_ratio);
+ dtc->pos_ratio = min(pos_ratio, wb_pos_ratio);
+ return;
}
/*
* We have computed basic pos_ratio above based on global situation. If
- * the bdi is over/under its share of dirty pages, we want to scale
+ * the wb is over/under its share of dirty pages, we want to scale
* pos_ratio further down/up. That is done by the following mechanism.
*/
/*
- * bdi setpoint
+ * wb setpoint
*
- * f(bdi_dirty) := 1.0 + k * (bdi_dirty - bdi_setpoint)
+ * f(wb_dirty) := 1.0 + k * (wb_dirty - wb_setpoint)
*
- * x_intercept - bdi_dirty
+ * x_intercept - wb_dirty
* := --------------------------
- * x_intercept - bdi_setpoint
+ * x_intercept - wb_setpoint
*
- * The main bdi control line is a linear function that subjects to
+ * The main wb control line is a linear function that subjects to
*
- * (1) f(bdi_setpoint) = 1.0
- * (2) k = - 1 / (8 * write_bw) (in single bdi case)
- * or equally: x_intercept = bdi_setpoint + 8 * write_bw
+ * (1) f(wb_setpoint) = 1.0
+ * (2) k = - 1 / (8 * write_bw) (in single wb case)
+ * or equally: x_intercept = wb_setpoint + 8 * write_bw
*
- * For single bdi case, the dirty pages are observed to fluctuate
+ * For single wb case, the dirty pages are observed to fluctuate
* regularly within range
- * [bdi_setpoint - write_bw/2, bdi_setpoint + write_bw/2]
+ * [wb_setpoint - write_bw/2, wb_setpoint + write_bw/2]
* for various filesystems, where (2) can yield in a reasonable 12.5%
* fluctuation range for pos_ratio.
*
- * For JBOD case, bdi_thresh (not bdi_dirty!) could fluctuate up to its
+ * For JBOD case, wb_thresh (not wb_dirty!) could fluctuate up to its
* own size, so move the slope over accordingly and choose a slope that
- * yields 100% pos_ratio fluctuation on suddenly doubled bdi_thresh.
+ * yields 100% pos_ratio fluctuation on suddenly doubled wb_thresh.
*/
- if (unlikely(bdi_thresh > thresh))
- bdi_thresh = thresh;
+ if (unlikely(wb_thresh > dtc->thresh))
+ wb_thresh = dtc->thresh;
/*
- * It's very possible that bdi_thresh is close to 0 not because the
+ * It's very possible that wb_thresh is close to 0 not because the
* device is slow, but that it has remained inactive for long time.
* Honour such devices a reasonable good (hopefully IO efficient)
* threshold, so that the occasional writes won't be blocked and active
* writes can rampup the threshold quickly.
*/
- bdi_thresh = max(bdi_thresh, (limit - dirty) / 8);
+ wb_thresh = max(wb_thresh, (limit - dtc->dirty) / 8);
/*
- * scale global setpoint to bdi's:
- * bdi_setpoint = setpoint * bdi_thresh / thresh
+ * scale global setpoint to wb's:
+ * wb_setpoint = setpoint * wb_thresh / thresh
*/
- x = div_u64((u64)bdi_thresh << 16, thresh + 1);
- bdi_setpoint = setpoint * (u64)x >> 16;
+ x = div_u64((u64)wb_thresh << 16, dtc->thresh | 1);
+ wb_setpoint = setpoint * (u64)x >> 16;
/*
- * Use span=(8*write_bw) in single bdi case as indicated by
- * (thresh - bdi_thresh ~= 0) and transit to bdi_thresh in JBOD case.
+ * Use span=(8*write_bw) in single wb case as indicated by
+ * (thresh - wb_thresh ~= 0) and transit to wb_thresh in JBOD case.
*
- * bdi_thresh thresh - bdi_thresh
- * span = ---------- * (8 * write_bw) + ------------------- * bdi_thresh
- * thresh thresh
+ * wb_thresh thresh - wb_thresh
+ * span = --------- * (8 * write_bw) + ------------------ * wb_thresh
+ * thresh thresh
*/
- span = (thresh - bdi_thresh + 8 * write_bw) * (u64)x >> 16;
- x_intercept = bdi_setpoint + span;
+ span = (dtc->thresh - wb_thresh + 8 * write_bw) * (u64)x >> 16;
+ x_intercept = wb_setpoint + span;
- if (bdi_dirty < x_intercept - span / 4) {
- pos_ratio = div64_u64(pos_ratio * (x_intercept - bdi_dirty),
- x_intercept - bdi_setpoint + 1);
+ if (dtc->wb_dirty < x_intercept - span / 4) {
+ pos_ratio = div64_u64(pos_ratio * (x_intercept - dtc->wb_dirty),
+ (x_intercept - wb_setpoint) | 1);
} else
pos_ratio /= 4;
/*
- * bdi reserve area, safeguard against dirty pool underrun and disk idle
+ * wb reserve area, safeguard against dirty pool underrun and disk idle
* It may push the desired control point of global dirty pages higher
* than setpoint.
*/
- x_intercept = bdi_thresh / 2;
- if (bdi_dirty < x_intercept) {
- if (bdi_dirty > x_intercept / 8)
- pos_ratio = div_u64(pos_ratio * x_intercept, bdi_dirty);
+ x_intercept = wb_thresh / 2;
+ if (dtc->wb_dirty < x_intercept) {
+ if (dtc->wb_dirty > x_intercept / 8)
+ pos_ratio = div_u64(pos_ratio * x_intercept,
+ dtc->wb_dirty);
else
pos_ratio *= 8;
}
- return pos_ratio;
+ dtc->pos_ratio = pos_ratio;
}
-static void bdi_update_write_bandwidth(struct backing_dev_info *bdi,
- unsigned long elapsed,
- unsigned long written)
+static void wb_update_write_bandwidth(struct bdi_writeback *wb,
+ unsigned long elapsed,
+ unsigned long written)
{
const unsigned long period = roundup_pow_of_two(3 * HZ);
- unsigned long avg = bdi->avg_write_bandwidth;
- unsigned long old = bdi->write_bandwidth;
+ unsigned long avg = wb->avg_write_bandwidth;
+ unsigned long old = wb->write_bandwidth;
u64 bw;
/*
@@ -861,14 +1050,14 @@ static void bdi_update_write_bandwidth(struct backing_dev_info *bdi,
* @written may have decreased due to account_page_redirty().
* Avoid underflowing @bw calculation.
*/
- bw = written - min(written, bdi->written_stamp);
+ bw = written - min(written, wb->written_stamp);
bw *= HZ;
if (unlikely(elapsed > period)) {
do_div(bw, elapsed);
avg = bw;
goto out;
}
- bw += (u64)bdi->write_bandwidth * (period - elapsed);
+ bw += (u64)wb->write_bandwidth * (period - elapsed);
bw >>= ilog2(period);
/*
@@ -881,21 +1070,22 @@ static void bdi_update_write_bandwidth(struct backing_dev_info *bdi,
avg += (old - avg) >> 3;
out:
- bdi->write_bandwidth = bw;
- bdi->avg_write_bandwidth = avg;
+ /* keep avg > 0 to guarantee that tot > 0 if there are dirty wbs */
+ avg = max(avg, 1LU);
+ if (wb_has_dirty_io(wb)) {
+ long delta = avg - wb->avg_write_bandwidth;
+ WARN_ON_ONCE(atomic_long_add_return(delta,
+ &wb->bdi->tot_write_bandwidth) <= 0);
+ }
+ wb->write_bandwidth = bw;
+ wb->avg_write_bandwidth = avg;
}
-/*
- * The global dirtyable memory and dirty threshold could be suddenly knocked
- * down by a large amount (eg. on the startup of KVM in a swapless system).
- * This may throw the system into deep dirty exceeded state and throttle
- * heavy/light dirtiers alike. To retain good responsiveness, maintain
- * global_dirty_limit for tracking slowly down to the knocked down dirty
- * threshold.
- */
-static void update_dirty_limit(unsigned long thresh, unsigned long dirty)
+static void update_dirty_limit(struct dirty_throttle_control *dtc)
{
- unsigned long limit = global_dirty_limit;
+ struct wb_domain *dom = dtc_dom(dtc);
+ unsigned long thresh = dtc->thresh;
+ unsigned long limit = dom->dirty_limit;
/*
* Follow up in one step.
@@ -908,63 +1098,57 @@ static void update_dirty_limit(unsigned long thresh, unsigned long dirty)
/*
* Follow down slowly. Use the higher one as the target, because thresh
* may drop below dirty. This is exactly the reason to introduce
- * global_dirty_limit which is guaranteed to lie above the dirty pages.
+ * dom->dirty_limit which is guaranteed to lie above the dirty pages.
*/
- thresh = max(thresh, dirty);
+ thresh = max(thresh, dtc->dirty);
if (limit > thresh) {
limit -= (limit - thresh) >> 5;
goto update;
}
return;
update:
- global_dirty_limit = limit;
+ dom->dirty_limit = limit;
}
-static void global_update_bandwidth(unsigned long thresh,
- unsigned long dirty,
+static void domain_update_bandwidth(struct dirty_throttle_control *dtc,
unsigned long now)
{
- static DEFINE_SPINLOCK(dirty_lock);
- static unsigned long update_time = INITIAL_JIFFIES;
+ struct wb_domain *dom = dtc_dom(dtc);
/*
* check locklessly first to optimize away locking for the most time
*/
- if (time_before(now, update_time + BANDWIDTH_INTERVAL))
+ if (time_before(now, dom->dirty_limit_tstamp + BANDWIDTH_INTERVAL))
return;
- spin_lock(&dirty_lock);
- if (time_after_eq(now, update_time + BANDWIDTH_INTERVAL)) {
- update_dirty_limit(thresh, dirty);
- update_time = now;
+ spin_lock(&dom->lock);
+ if (time_after_eq(now, dom->dirty_limit_tstamp + BANDWIDTH_INTERVAL)) {
+ update_dirty_limit(dtc);
+ dom->dirty_limit_tstamp = now;
}
- spin_unlock(&dirty_lock);
+ spin_unlock(&dom->lock);
}
/*
- * Maintain bdi->dirty_ratelimit, the base dirty throttle rate.
+ * Maintain wb->dirty_ratelimit, the base dirty throttle rate.
*
- * Normal bdi tasks will be curbed at or below it in long term.
+ * Normal wb tasks will be curbed at or below it in long term.
* Obviously it should be around (write_bw / N) when there are N dd tasks.
*/
-static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi,
- unsigned long thresh,
- unsigned long bg_thresh,
- unsigned long dirty,
- unsigned long bdi_thresh,
- unsigned long bdi_dirty,
- unsigned long dirtied,
- unsigned long elapsed)
-{
- unsigned long freerun = dirty_freerun_ceiling(thresh, bg_thresh);
- unsigned long limit = hard_dirty_limit(thresh);
+static void wb_update_dirty_ratelimit(struct dirty_throttle_control *dtc,
+ unsigned long dirtied,
+ unsigned long elapsed)
+{
+ struct bdi_writeback *wb = dtc->wb;
+ unsigned long dirty = dtc->dirty;
+ unsigned long freerun = dirty_freerun_ceiling(dtc->thresh, dtc->bg_thresh);
+ unsigned long limit = hard_dirty_limit(dtc_dom(dtc), dtc->thresh);
unsigned long setpoint = (freerun + limit) / 2;
- unsigned long write_bw = bdi->avg_write_bandwidth;
- unsigned long dirty_ratelimit = bdi->dirty_ratelimit;
+ unsigned long write_bw = wb->avg_write_bandwidth;
+ unsigned long dirty_ratelimit = wb->dirty_ratelimit;
unsigned long dirty_rate;
unsigned long task_ratelimit;
unsigned long balanced_dirty_ratelimit;
- unsigned long pos_ratio;
unsigned long step;
unsigned long x;
@@ -972,20 +1156,18 @@ static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi,
* The dirty rate will match the writeout rate in long term, except
* when dirty pages are truncated by userspace or re-dirtied by FS.
*/
- dirty_rate = (dirtied - bdi->dirtied_stamp) * HZ / elapsed;
+ dirty_rate = (dirtied - wb->dirtied_stamp) * HZ / elapsed;
- pos_ratio = bdi_position_ratio(bdi, thresh, bg_thresh, dirty,
- bdi_thresh, bdi_dirty);
/*
* task_ratelimit reflects each dd's dirty rate for the past 200ms.
*/
task_ratelimit = (u64)dirty_ratelimit *
- pos_ratio >> RATELIMIT_CALC_SHIFT;
+ dtc->pos_ratio >> RATELIMIT_CALC_SHIFT;
task_ratelimit++; /* it helps rampup dirty_ratelimit from tiny values */
/*
* A linear estimation of the "balanced" throttle rate. The theory is,
- * if there are N dd tasks, each throttled at task_ratelimit, the bdi's
+ * if there are N dd tasks, each throttled at task_ratelimit, the wb's
* dirty_rate will be measured to be (N * task_ratelimit). So the below
* formula will yield the balanced rate limit (write_bw / N).
*
@@ -1024,7 +1206,7 @@ static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi,
/*
* We could safely do this and return immediately:
*
- * bdi->dirty_ratelimit = balanced_dirty_ratelimit;
+ * wb->dirty_ratelimit = balanced_dirty_ratelimit;
*
* However to get a more stable dirty_ratelimit, the below elaborated
* code makes use of task_ratelimit to filter out singular points and
@@ -1058,32 +1240,31 @@ static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi,
step = 0;
/*
- * For strictlimit case, calculations above were based on bdi counters
- * and limits (starting from pos_ratio = bdi_position_ratio() and up to
+ * For strictlimit case, calculations above were based on wb counters
+ * and limits (starting from pos_ratio = wb_position_ratio() and up to
* balanced_dirty_ratelimit = task_ratelimit * write_bw / dirty_rate).
- * Hence, to calculate "step" properly, we have to use bdi_dirty as
- * "dirty" and bdi_setpoint as "setpoint".
+ * Hence, to calculate "step" properly, we have to use wb_dirty as
+ * "dirty" and wb_setpoint as "setpoint".
*
- * We rampup dirty_ratelimit forcibly if bdi_dirty is low because
- * it's possible that bdi_thresh is close to zero due to inactivity
- * of backing device (see the implementation of bdi_dirty_limit()).
+ * We rampup dirty_ratelimit forcibly if wb_dirty is low because
+ * it's possible that wb_thresh is close to zero due to inactivity
+ * of backing device.
*/
- if (unlikely(bdi->capabilities & BDI_CAP_STRICTLIMIT)) {
- dirty = bdi_dirty;
- if (bdi_dirty < 8)
- setpoint = bdi_dirty + 1;
+ if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) {
+ dirty = dtc->wb_dirty;
+ if (dtc->wb_dirty < 8)
+ setpoint = dtc->wb_dirty + 1;
else
- setpoint = (bdi_thresh +
- bdi_dirty_limit(bdi, bg_thresh)) / 2;
+ setpoint = (dtc->wb_thresh + dtc->wb_bg_thresh) / 2;
}
if (dirty < setpoint) {
- x = min3(bdi->balanced_dirty_ratelimit,
+ x = min3(wb->balanced_dirty_ratelimit,
balanced_dirty_ratelimit, task_ratelimit);
if (dirty_ratelimit < x)
step = x - dirty_ratelimit;
} else {
- x = max3(bdi->balanced_dirty_ratelimit,
+ x = max3(wb->balanced_dirty_ratelimit,
balanced_dirty_ratelimit, task_ratelimit);
if (dirty_ratelimit > x)
step = dirty_ratelimit - x;
@@ -1105,69 +1286,67 @@ static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi,
else
dirty_ratelimit -= step;
- bdi->dirty_ratelimit = max(dirty_ratelimit, 1UL);
- bdi->balanced_dirty_ratelimit = balanced_dirty_ratelimit;
+ wb->dirty_ratelimit = max(dirty_ratelimit, 1UL);
+ wb->balanced_dirty_ratelimit = balanced_dirty_ratelimit;
- trace_bdi_dirty_ratelimit(bdi, dirty_rate, task_ratelimit);
+ trace_bdi_dirty_ratelimit(wb->bdi, dirty_rate, task_ratelimit);
}
-void __bdi_update_bandwidth(struct backing_dev_info *bdi,
- unsigned long thresh,
- unsigned long bg_thresh,
- unsigned long dirty,
- unsigned long bdi_thresh,
- unsigned long bdi_dirty,
- unsigned long start_time)
+static void __wb_update_bandwidth(struct dirty_throttle_control *gdtc,
+ struct dirty_throttle_control *mdtc,
+ unsigned long start_time,
+ bool update_ratelimit)
{
+ struct bdi_writeback *wb = gdtc->wb;
unsigned long now = jiffies;
- unsigned long elapsed = now - bdi->bw_time_stamp;
+ unsigned long elapsed = now - wb->bw_time_stamp;
unsigned long dirtied;
unsigned long written;
+ lockdep_assert_held(&wb->list_lock);
+
/*
* rate-limit, only update once every 200ms.
*/
if (elapsed < BANDWIDTH_INTERVAL)
return;
- dirtied = percpu_counter_read(&bdi->bdi_stat[BDI_DIRTIED]);
- written = percpu_counter_read(&bdi->bdi_stat[BDI_WRITTEN]);
+ dirtied = percpu_counter_read(&wb->stat[WB_DIRTIED]);
+ written = percpu_counter_read(&wb->stat[WB_WRITTEN]);
/*
* Skip quiet periods when disk bandwidth is under-utilized.
* (at least 1s idle time between two flusher runs)
*/
- if (elapsed > HZ && time_before(bdi->bw_time_stamp, start_time))
+ if (elapsed > HZ && time_before(wb->bw_time_stamp, start_time))
goto snapshot;
- if (thresh) {
- global_update_bandwidth(thresh, dirty, now);
- bdi_update_dirty_ratelimit(bdi, thresh, bg_thresh, dirty,
- bdi_thresh, bdi_dirty,
- dirtied, elapsed);
+ if (update_ratelimit) {
+ domain_update_bandwidth(gdtc, now);
+ wb_update_dirty_ratelimit(gdtc, dirtied, elapsed);
+
+ /*
+ * @mdtc is always NULL if !CGROUP_WRITEBACK but the
+ * compiler has no way to figure that out. Help it.
+ */
+ if (IS_ENABLED(CONFIG_CGROUP_WRITEBACK) && mdtc) {
+ domain_update_bandwidth(mdtc, now);
+ wb_update_dirty_ratelimit(mdtc, dirtied, elapsed);
+ }
}
- bdi_update_write_bandwidth(bdi, elapsed, written);
+ wb_update_write_bandwidth(wb, elapsed, written);
snapshot:
- bdi->dirtied_stamp = dirtied;
- bdi->written_stamp = written;
- bdi->bw_time_stamp = now;
+ wb->dirtied_stamp = dirtied;
+ wb->written_stamp = written;
+ wb->bw_time_stamp = now;
}
-static void bdi_update_bandwidth(struct backing_dev_info *bdi,
- unsigned long thresh,
- unsigned long bg_thresh,
- unsigned long dirty,
- unsigned long bdi_thresh,
- unsigned long bdi_dirty,
- unsigned long start_time)
+void wb_update_bandwidth(struct bdi_writeback *wb, unsigned long start_time)
{
- if (time_is_after_eq_jiffies(bdi->bw_time_stamp + BANDWIDTH_INTERVAL))
- return;
- spin_lock(&bdi->wb.list_lock);
- __bdi_update_bandwidth(bdi, thresh, bg_thresh, dirty,
- bdi_thresh, bdi_dirty, start_time);
- spin_unlock(&bdi->wb.list_lock);
+ struct dirty_throttle_control gdtc = { GDTC_INIT(wb) };
+
+ __wb_update_bandwidth(&gdtc, NULL, start_time, false);
}
/*
@@ -1187,10 +1366,10 @@ static unsigned long dirty_poll_interval(unsigned long dirty,
return 1;
}
-static unsigned long bdi_max_pause(struct backing_dev_info *bdi,
- unsigned long bdi_dirty)
+static unsigned long wb_max_pause(struct bdi_writeback *wb,
+ unsigned long wb_dirty)
{
- unsigned long bw = bdi->avg_write_bandwidth;
+ unsigned long bw = wb->avg_write_bandwidth;
unsigned long t;
/*
@@ -1200,20 +1379,20 @@ static unsigned long bdi_max_pause(struct backing_dev_info *bdi,
*
* 8 serves as the safety ratio.
*/
- t = bdi_dirty / (1 + bw / roundup_pow_of_two(1 + HZ / 8));
+ t = wb_dirty / (1 + bw / roundup_pow_of_two(1 + HZ / 8));
t++;
return min_t(unsigned long, t, MAX_PAUSE);
}
-static long bdi_min_pause(struct backing_dev_info *bdi,
- long max_pause,
- unsigned long task_ratelimit,
- unsigned long dirty_ratelimit,
- int *nr_dirtied_pause)
+static long wb_min_pause(struct bdi_writeback *wb,
+ long max_pause,
+ unsigned long task_ratelimit,
+ unsigned long dirty_ratelimit,
+ int *nr_dirtied_pause)
{
- long hi = ilog2(bdi->avg_write_bandwidth);
- long lo = ilog2(bdi->dirty_ratelimit);
+ long hi = ilog2(wb->avg_write_bandwidth);
+ long lo = ilog2(wb->dirty_ratelimit);
long t; /* target pause */
long pause; /* estimated next pause */
int pages; /* target nr_dirtied_pause */
@@ -1281,34 +1460,27 @@ static long bdi_min_pause(struct backing_dev_info *bdi,
return pages >= DIRTY_POLL_THRESH ? 1 + t / 2 : t;
}
-static inline void bdi_dirty_limits(struct backing_dev_info *bdi,
- unsigned long dirty_thresh,
- unsigned long background_thresh,
- unsigned long *bdi_dirty,
- unsigned long *bdi_thresh,
- unsigned long *bdi_bg_thresh)
+static inline void wb_dirty_limits(struct dirty_throttle_control *dtc)
{
- unsigned long bdi_reclaimable;
+ struct bdi_writeback *wb = dtc->wb;
+ unsigned long wb_reclaimable;
/*
- * bdi_thresh is not treated as some limiting factor as
+ * wb_thresh is not treated as some limiting factor as
* dirty_thresh, due to reasons
- * - in JBOD setup, bdi_thresh can fluctuate a lot
+ * - in JBOD setup, wb_thresh can fluctuate a lot
* - in a system with HDD and USB key, the USB key may somehow
- * go into state (bdi_dirty >> bdi_thresh) either because
- * bdi_dirty starts high, or because bdi_thresh drops low.
+ * go into state (wb_dirty >> wb_thresh) either because
+ * wb_dirty starts high, or because wb_thresh drops low.
* In this case we don't want to hard throttle the USB key
- * dirtiers for 100 seconds until bdi_dirty drops under
- * bdi_thresh. Instead the auxiliary bdi control line in
- * bdi_position_ratio() will let the dirtier task progress
- * at some rate <= (write_bw / 2) for bringing down bdi_dirty.
+ * dirtiers for 100 seconds until wb_dirty drops under
+ * wb_thresh. Instead the auxiliary wb control line in
+ * wb_position_ratio() will let the dirtier task progress
+ * at some rate <= (write_bw / 2) for bringing down wb_dirty.
*/
- *bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
-
- if (bdi_bg_thresh)
- *bdi_bg_thresh = dirty_thresh ? div_u64((u64)*bdi_thresh *
- background_thresh,
- dirty_thresh) : 0;
+ dtc->wb_thresh = __wb_calc_thresh(dtc);
+ dtc->wb_bg_thresh = dtc->thresh ?
+ div_u64((u64)dtc->wb_thresh * dtc->bg_thresh, dtc->thresh) : 0;
/*
* In order to avoid the stacked BDI deadlock we need
@@ -1320,14 +1492,12 @@ static inline void bdi_dirty_limits(struct backing_dev_info *bdi,
* actually dirty; with m+n sitting in the percpu
* deltas.
*/
- if (*bdi_thresh < 2 * bdi_stat_error(bdi)) {
- bdi_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE);
- *bdi_dirty = bdi_reclaimable +
- bdi_stat_sum(bdi, BDI_WRITEBACK);
+ if (dtc->wb_thresh < 2 * wb_stat_error(wb)) {
+ wb_reclaimable = wb_stat_sum(wb, WB_RECLAIMABLE);
+ dtc->wb_dirty = wb_reclaimable + wb_stat_sum(wb, WB_WRITEBACK);
} else {
- bdi_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
- *bdi_dirty = bdi_reclaimable +
- bdi_stat(bdi, BDI_WRITEBACK);
+ wb_reclaimable = wb_stat(wb, WB_RECLAIMABLE);
+ dtc->wb_dirty = wb_reclaimable + wb_stat(wb, WB_WRITEBACK);
}
}
@@ -1339,12 +1509,16 @@ static inline void bdi_dirty_limits(struct backing_dev_info *bdi,
* perform some writeout.
*/
static void balance_dirty_pages(struct address_space *mapping,
+ struct bdi_writeback *wb,
unsigned long pages_dirtied)
{
+ struct dirty_throttle_control gdtc_stor = { GDTC_INIT(wb) };
+ struct dirty_throttle_control mdtc_stor = { MDTC_INIT(wb, &gdtc_stor) };
+ struct dirty_throttle_control * const gdtc = &gdtc_stor;
+ struct dirty_throttle_control * const mdtc = mdtc_valid(&mdtc_stor) ?
+ &mdtc_stor : NULL;
+ struct dirty_throttle_control *sdtc;
unsigned long nr_reclaimable; /* = file_dirty + unstable_nfs */
- unsigned long nr_dirty; /* = file_dirty + writeback + unstable_nfs */
- unsigned long background_thresh;
- unsigned long dirty_thresh;
long period;
long pause;
long max_pause;
@@ -1353,18 +1527,14 @@ static void balance_dirty_pages(struct address_space *mapping,
bool dirty_exceeded = false;
unsigned long task_ratelimit;
unsigned long dirty_ratelimit;
- unsigned long pos_ratio;
- struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
+ struct backing_dev_info *bdi = wb->bdi;
bool strictlimit = bdi->capabilities & BDI_CAP_STRICTLIMIT;
unsigned long start_time = jiffies;
for (;;) {
unsigned long now = jiffies;
- unsigned long uninitialized_var(bdi_thresh);
- unsigned long thresh;
- unsigned long uninitialized_var(bdi_dirty);
- unsigned long dirty;
- unsigned long bg_thresh;
+ unsigned long dirty, thresh, bg_thresh;
+ unsigned long m_dirty, m_thresh, m_bg_thresh;
/*
* Unstable writes are a feature of certain networked
@@ -1374,65 +1544,127 @@ static void balance_dirty_pages(struct address_space *mapping,
*/
nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
global_page_state(NR_UNSTABLE_NFS);
- nr_dirty = nr_reclaimable + global_page_state(NR_WRITEBACK);
+ gdtc->avail = global_dirtyable_memory();
+ gdtc->dirty = nr_reclaimable + global_page_state(NR_WRITEBACK);
- global_dirty_limits(&background_thresh, &dirty_thresh);
+ domain_dirty_limits(gdtc);
if (unlikely(strictlimit)) {
- bdi_dirty_limits(bdi, dirty_thresh, background_thresh,
- &bdi_dirty, &bdi_thresh, &bg_thresh);
+ wb_dirty_limits(gdtc);
- dirty = bdi_dirty;
- thresh = bdi_thresh;
+ dirty = gdtc->wb_dirty;
+ thresh = gdtc->wb_thresh;
+ bg_thresh = gdtc->wb_bg_thresh;
} else {
- dirty = nr_dirty;
- thresh = dirty_thresh;
- bg_thresh = background_thresh;
+ dirty = gdtc->dirty;
+ thresh = gdtc->thresh;
+ bg_thresh = gdtc->bg_thresh;
+ }
+
+ if (mdtc) {
+ unsigned long writeback;
+
+ /*
+ * If @wb belongs to !root memcg, repeat the same
+ * basic calculations for the memcg domain.
+ */
+ mem_cgroup_wb_stats(wb, &mdtc->avail, &mdtc->dirty,
+ &writeback);
+ mdtc_cap_avail(mdtc);
+ mdtc->dirty += writeback;
+
+ domain_dirty_limits(mdtc);
+
+ if (unlikely(strictlimit)) {
+ wb_dirty_limits(mdtc);
+ m_dirty = mdtc->wb_dirty;
+ m_thresh = mdtc->wb_thresh;
+ m_bg_thresh = mdtc->wb_bg_thresh;
+ } else {
+ m_dirty = mdtc->dirty;
+ m_thresh = mdtc->thresh;
+ m_bg_thresh = mdtc->bg_thresh;
+ }
}
/*
* Throttle it only when the background writeback cannot
* catch-up. This avoids (excessively) small writeouts
- * when the bdi limits are ramping up in case of !strictlimit.
+ * when the wb limits are ramping up in case of !strictlimit.
*
- * In strictlimit case make decision based on the bdi counters
- * and limits. Small writeouts when the bdi limits are ramping
+ * In strictlimit case make decision based on the wb counters
+ * and limits. Small writeouts when the wb limits are ramping
* up are the price we consciously pay for strictlimit-ing.
+ *
+ * If memcg domain is in effect, @dirty should be under
+ * both global and memcg freerun ceilings.
*/
- if (dirty <= dirty_freerun_ceiling(thresh, bg_thresh)) {
+ if (dirty <= dirty_freerun_ceiling(thresh, bg_thresh) &&
+ (!mdtc ||
+ m_dirty <= dirty_freerun_ceiling(m_thresh, m_bg_thresh))) {
+ unsigned long intv = dirty_poll_interval(dirty, thresh);
+ unsigned long m_intv = ULONG_MAX;
+
current->dirty_paused_when = now;
current->nr_dirtied = 0;
- current->nr_dirtied_pause =
- dirty_poll_interval(dirty, thresh);
+ if (mdtc)
+ m_intv = dirty_poll_interval(m_dirty, m_thresh);
+ current->nr_dirtied_pause = min(intv, m_intv);
break;
}
- if (unlikely(!writeback_in_progress(bdi)))
- bdi_start_background_writeback(bdi);
+ if (unlikely(!writeback_in_progress(wb)))
+ wb_start_background_writeback(wb);
+ /*
+ * Calculate global domain's pos_ratio and select the
+ * global dtc by default.
+ */
if (!strictlimit)
- bdi_dirty_limits(bdi, dirty_thresh, background_thresh,
- &bdi_dirty, &bdi_thresh, NULL);
-
- dirty_exceeded = (bdi_dirty > bdi_thresh) &&
- ((nr_dirty > dirty_thresh) || strictlimit);
- if (dirty_exceeded && !bdi->dirty_exceeded)
- bdi->dirty_exceeded = 1;
-
- bdi_update_bandwidth(bdi, dirty_thresh, background_thresh,
- nr_dirty, bdi_thresh, bdi_dirty,
- start_time);
-
- dirty_ratelimit = bdi->dirty_ratelimit;
- pos_ratio = bdi_position_ratio(bdi, dirty_thresh,
- background_thresh, nr_dirty,
- bdi_thresh, bdi_dirty);
- task_ratelimit = ((u64)dirty_ratelimit * pos_ratio) >>
+ wb_dirty_limits(gdtc);
+
+ dirty_exceeded = (gdtc->wb_dirty > gdtc->wb_thresh) &&
+ ((gdtc->dirty > gdtc->thresh) || strictlimit);
+
+ wb_position_ratio(gdtc);
+ sdtc = gdtc;
+
+ if (mdtc) {
+ /*
+ * If memcg domain is in effect, calculate its
+ * pos_ratio. @wb should satisfy constraints from
+ * both global and memcg domains. Choose the one
+ * w/ lower pos_ratio.
+ */
+ if (!strictlimit)
+ wb_dirty_limits(mdtc);
+
+ dirty_exceeded |= (mdtc->wb_dirty > mdtc->wb_thresh) &&
+ ((mdtc->dirty > mdtc->thresh) || strictlimit);
+
+ wb_position_ratio(mdtc);
+ if (mdtc->pos_ratio < gdtc->pos_ratio)
+ sdtc = mdtc;
+ }
+
+ if (dirty_exceeded && !wb->dirty_exceeded)
+ wb->dirty_exceeded = 1;
+
+ if (time_is_before_jiffies(wb->bw_time_stamp +
+ BANDWIDTH_INTERVAL)) {
+ spin_lock(&wb->list_lock);
+ __wb_update_bandwidth(gdtc, mdtc, start_time, true);
+ spin_unlock(&wb->list_lock);
+ }
+
+ /* throttle according to the chosen dtc */
+ dirty_ratelimit = wb->dirty_ratelimit;
+ task_ratelimit = ((u64)dirty_ratelimit * sdtc->pos_ratio) >>
RATELIMIT_CALC_SHIFT;
- max_pause = bdi_max_pause(bdi, bdi_dirty);
- min_pause = bdi_min_pause(bdi, max_pause,
- task_ratelimit, dirty_ratelimit,
- &nr_dirtied_pause);
+ max_pause = wb_max_pause(wb, sdtc->wb_dirty);
+ min_pause = wb_min_pause(wb, max_pause,
+ task_ratelimit, dirty_ratelimit,
+ &nr_dirtied_pause);
if (unlikely(task_ratelimit == 0)) {
period = max_pause;
@@ -1452,11 +1684,11 @@ static void balance_dirty_pages(struct address_space *mapping,
*/
if (pause < min_pause) {
trace_balance_dirty_pages(bdi,
- dirty_thresh,
- background_thresh,
- nr_dirty,
- bdi_thresh,
- bdi_dirty,
+ sdtc->thresh,
+ sdtc->bg_thresh,
+ sdtc->dirty,
+ sdtc->wb_thresh,
+ sdtc->wb_dirty,
dirty_ratelimit,
task_ratelimit,
pages_dirtied,
@@ -1481,11 +1713,11 @@ static void balance_dirty_pages(struct address_space *mapping,
pause:
trace_balance_dirty_pages(bdi,
- dirty_thresh,
- background_thresh,
- nr_dirty,
- bdi_thresh,
- bdi_dirty,
+ sdtc->thresh,
+ sdtc->bg_thresh,
+ sdtc->dirty,
+ sdtc->wb_thresh,
+ sdtc->wb_dirty,
dirty_ratelimit,
task_ratelimit,
pages_dirtied,
@@ -1500,33 +1732,33 @@ pause:
current->nr_dirtied_pause = nr_dirtied_pause;
/*
- * This is typically equal to (nr_dirty < dirty_thresh) and can
- * also keep "1000+ dd on a slow USB stick" under control.
+ * This is typically equal to (dirty < thresh) and can also
+ * keep "1000+ dd on a slow USB stick" under control.
*/
if (task_ratelimit)
break;
/*
* In the case of an unresponding NFS server and the NFS dirty
- * pages exceeds dirty_thresh, give the other good bdi's a pipe
+ * pages exceeds dirty_thresh, give the other good wb's a pipe
* to go through, so that tasks on them still remain responsive.
*
* In theory 1 page is enough to keep the comsumer-producer
* pipe going: the flusher cleans 1 page => the task dirties 1
- * more page. However bdi_dirty has accounting errors. So use
- * the larger and more IO friendly bdi_stat_error.
+ * more page. However wb_dirty has accounting errors. So use
+ * the larger and more IO friendly wb_stat_error.
*/
- if (bdi_dirty <= bdi_stat_error(bdi))
+ if (sdtc->wb_dirty <= wb_stat_error(wb))
break;
if (fatal_signal_pending(current))
break;
}
- if (!dirty_exceeded && bdi->dirty_exceeded)
- bdi->dirty_exceeded = 0;
+ if (!dirty_exceeded && wb->dirty_exceeded)
+ wb->dirty_exceeded = 0;
- if (writeback_in_progress(bdi))
+ if (writeback_in_progress(wb))
return;
/*
@@ -1540,8 +1772,8 @@ pause:
if (laptop_mode)
return;
- if (nr_reclaimable > background_thresh)
- bdi_start_background_writeback(bdi);
+ if (nr_reclaimable > gdtc->bg_thresh)
+ wb_start_background_writeback(wb);
}
static DEFINE_PER_CPU(int, bdp_ratelimits);
@@ -1577,15 +1809,22 @@ DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0;
*/
void balance_dirty_pages_ratelimited(struct address_space *mapping)
{
- struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
+ struct inode *inode = mapping->host;
+ struct backing_dev_info *bdi = inode_to_bdi(inode);
+ struct bdi_writeback *wb = NULL;
int ratelimit;
int *p;
if (!bdi_cap_account_dirty(bdi))
return;
+ if (inode_cgwb_enabled(inode))
+ wb = wb_get_create_current(bdi, GFP_KERNEL);
+ if (!wb)
+ wb = &bdi->wb;
+
ratelimit = current->nr_dirtied_pause;
- if (bdi->dirty_exceeded)
+ if (wb->dirty_exceeded)
ratelimit = min(ratelimit, 32 >> (PAGE_SHIFT - 10));
preempt_disable();
@@ -1617,10 +1856,59 @@ void balance_dirty_pages_ratelimited(struct address_space *mapping)
preempt_enable();
if (unlikely(current->nr_dirtied >= ratelimit))
- balance_dirty_pages(mapping, current->nr_dirtied);
+ balance_dirty_pages(mapping, wb, current->nr_dirtied);
+
+ wb_put(wb);
}
EXPORT_SYMBOL(balance_dirty_pages_ratelimited);
+/**
+ * wb_over_bg_thresh - does @wb need to be written back?
+ * @wb: bdi_writeback of interest
+ *
+ * Determines whether background writeback should keep writing @wb or it's
+ * clean enough. Returns %true if writeback should continue.
+ */
+bool wb_over_bg_thresh(struct bdi_writeback *wb)
+{
+ struct dirty_throttle_control gdtc_stor = { GDTC_INIT(wb) };
+ struct dirty_throttle_control mdtc_stor = { MDTC_INIT(wb, &gdtc_stor) };
+ struct dirty_throttle_control * const gdtc = &gdtc_stor;
+ struct dirty_throttle_control * const mdtc = mdtc_valid(&mdtc_stor) ?
+ &mdtc_stor : NULL;
+
+ /*
+ * Similar to balance_dirty_pages() but ignores pages being written
+ * as we're trying to decide whether to put more under writeback.
+ */
+ gdtc->avail = global_dirtyable_memory();
+ gdtc->dirty = global_page_state(NR_FILE_DIRTY) +
+ global_page_state(NR_UNSTABLE_NFS);
+ domain_dirty_limits(gdtc);
+
+ if (gdtc->dirty > gdtc->bg_thresh)
+ return true;
+
+ if (wb_stat(wb, WB_RECLAIMABLE) > __wb_calc_thresh(gdtc))
+ return true;
+
+ if (mdtc) {
+ unsigned long writeback;
+
+ mem_cgroup_wb_stats(wb, &mdtc->avail, &mdtc->dirty, &writeback);
+ mdtc_cap_avail(mdtc);
+ domain_dirty_limits(mdtc); /* ditto, ignore writeback */
+
+ if (mdtc->dirty > mdtc->bg_thresh)
+ return true;
+
+ if (wb_stat(wb, WB_RECLAIMABLE) > __wb_calc_thresh(mdtc))
+ return true;
+ }
+
+ return false;
+}
+
void throttle_vm_writeout(gfp_t gfp_mask)
{
unsigned long background_thresh;
@@ -1628,7 +1916,7 @@ void throttle_vm_writeout(gfp_t gfp_mask)
for ( ; ; ) {
global_dirty_limits(&background_thresh, &dirty_thresh);
- dirty_thresh = hard_dirty_limit(dirty_thresh);
+ dirty_thresh = hard_dirty_limit(&global_wb_domain, dirty_thresh);
/*
* Boost the allowable dirty threshold a bit for page
@@ -1667,14 +1955,20 @@ void laptop_mode_timer_fn(unsigned long data)
struct request_queue *q = (struct request_queue *)data;
int nr_pages = global_page_state(NR_FILE_DIRTY) +
global_page_state(NR_UNSTABLE_NFS);
+ struct bdi_writeback *wb;
+ struct wb_iter iter;
/*
* We want to write everything out, not just down to the dirty
* threshold
*/
- if (bdi_has_dirty_io(&q->backing_dev_info))
- bdi_start_writeback(&q->backing_dev_info, nr_pages,
- WB_REASON_LAPTOP_TIMER);
+ if (!bdi_has_dirty_io(&q->backing_dev_info))
+ return;
+
+ bdi_for_each_wb(wb, &q->backing_dev_info, &iter, 0)
+ if (wb_has_dirty_io(wb))
+ wb_start_writeback(wb, nr_pages, true,
+ WB_REASON_LAPTOP_TIMER);
}
/*
@@ -1718,10 +2012,12 @@ void laptop_sync_completion(void)
void writeback_set_ratelimit(void)
{
+ struct wb_domain *dom = &global_wb_domain;
unsigned long background_thresh;
unsigned long dirty_thresh;
+
global_dirty_limits(&background_thresh, &dirty_thresh);
- global_dirty_limit = dirty_thresh;
+ dom->dirty_limit = dirty_thresh;
ratelimit_pages = dirty_thresh / (num_online_cpus() * 32);
if (ratelimit_pages < 16)
ratelimit_pages = 16;
@@ -1767,10 +2063,10 @@ static struct notifier_block ratelimit_nb = {
*/
void __init page_writeback_init(void)
{
+ BUG_ON(wb_domain_init(&global_wb_domain, GFP_KERNEL));
+
writeback_set_ratelimit();
register_cpu_notifier(&ratelimit_nb);
-
- fprop_global_init(&writeout_completions, GFP_KERNEL);
}
/**
@@ -2090,19 +2386,29 @@ int __set_page_dirty_no_writeback(struct page *page)
/*
* Helper function for set_page_dirty family.
+ *
+ * Caller must hold mem_cgroup_begin_page_stat().
+ *
* NOTE: This relies on being atomic wrt interrupts.
*/
-void account_page_dirtied(struct page *page, struct address_space *mapping)
+void account_page_dirtied(struct page *page, struct address_space *mapping,
+ struct mem_cgroup *memcg)
{
+ struct inode *inode = mapping->host;
+
trace_writeback_dirty_page(page, mapping);
if (mapping_cap_account_dirty(mapping)) {
- struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
+ struct bdi_writeback *wb;
+ inode_attach_wb(inode, page);
+ wb = inode_to_wb(inode);
+
+ mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_DIRTY);
__inc_zone_page_state(page, NR_FILE_DIRTY);
__inc_zone_page_state(page, NR_DIRTIED);
- __inc_bdi_stat(bdi, BDI_RECLAIMABLE);
- __inc_bdi_stat(bdi, BDI_DIRTIED);
+ __inc_wb_stat(wb, WB_RECLAIMABLE);
+ __inc_wb_stat(wb, WB_DIRTIED);
task_io_account_write(PAGE_CACHE_SIZE);
current->nr_dirtied++;
this_cpu_inc(bdp_ratelimits);
@@ -2111,6 +2417,22 @@ void account_page_dirtied(struct page *page, struct address_space *mapping)
EXPORT_SYMBOL(account_page_dirtied);
/*
+ * Helper function for deaccounting dirty page without writeback.
+ *
+ * Caller must hold mem_cgroup_begin_page_stat().
+ */
+void account_page_cleaned(struct page *page, struct address_space *mapping,
+ struct mem_cgroup *memcg, struct bdi_writeback *wb)
+{
+ if (mapping_cap_account_dirty(mapping)) {
+ mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_DIRTY);
+ dec_zone_page_state(page, NR_FILE_DIRTY);
+ dec_wb_stat(wb, WB_RECLAIMABLE);
+ task_io_account_cancelled_write(PAGE_CACHE_SIZE);
+ }
+}
+
+/*
* For address_spaces which do not use buffers. Just tag the page as dirty in
* its radix tree.
*
@@ -2124,26 +2446,34 @@ EXPORT_SYMBOL(account_page_dirtied);
*/
int __set_page_dirty_nobuffers(struct page *page)
{
+ struct mem_cgroup *memcg;
+
+ memcg = mem_cgroup_begin_page_stat(page);
if (!TestSetPageDirty(page)) {
struct address_space *mapping = page_mapping(page);
unsigned long flags;
- if (!mapping)
+ if (!mapping) {
+ mem_cgroup_end_page_stat(memcg);
return 1;
+ }
spin_lock_irqsave(&mapping->tree_lock, flags);
BUG_ON(page_mapping(page) != mapping);
WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page));
- account_page_dirtied(page, mapping);
+ account_page_dirtied(page, mapping, memcg);
radix_tree_tag_set(&mapping->page_tree, page_index(page),
PAGECACHE_TAG_DIRTY);
spin_unlock_irqrestore(&mapping->tree_lock, flags);
+ mem_cgroup_end_page_stat(memcg);
+
if (mapping->host) {
/* !PageAnon && !swapper_space */
__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
}
return 1;
}
+ mem_cgroup_end_page_stat(memcg);
return 0;
}
EXPORT_SYMBOL(__set_page_dirty_nobuffers);
@@ -2158,10 +2488,17 @@ EXPORT_SYMBOL(__set_page_dirty_nobuffers);
void account_page_redirty(struct page *page)
{
struct address_space *mapping = page->mapping;
+
if (mapping && mapping_cap_account_dirty(mapping)) {
+ struct inode *inode = mapping->host;
+ struct bdi_writeback *wb;
+ bool locked;
+
+ wb = unlocked_inode_to_wb_begin(inode, &locked);
current->nr_dirtied--;
dec_zone_page_state(page, NR_DIRTIED);
- dec_bdi_stat(inode_to_bdi(mapping->host), BDI_DIRTIED);
+ dec_wb_stat(wb, WB_DIRTIED);
+ unlocked_inode_to_wb_end(inode, locked);
}
}
EXPORT_SYMBOL(account_page_redirty);
@@ -2209,7 +2546,8 @@ int set_page_dirty(struct page *page)
* it will confuse readahead and make it restart the size rampup
* process. But it's a trivial problem.
*/
- ClearPageReclaim(page);
+ if (PageReclaim(page))
+ ClearPageReclaim(page);
#ifdef CONFIG_BLOCK
if (!spd)
spd = __set_page_dirty_buffers;
@@ -2246,6 +2584,43 @@ int set_page_dirty_lock(struct page *page)
EXPORT_SYMBOL(set_page_dirty_lock);
/*
+ * This cancels just the dirty bit on the kernel page itself, it does NOT
+ * actually remove dirty bits on any mmap's that may be around. It also
+ * leaves the page tagged dirty, so any sync activity will still find it on
+ * the dirty lists, and in particular, clear_page_dirty_for_io() will still
+ * look at the dirty bits in the VM.
+ *
+ * Doing this should *normally* only ever be done when a page is truncated,
+ * and is not actually mapped anywhere at all. However, fs/buffer.c does
+ * this when it notices that somebody has cleaned out all the buffers on a
+ * page without actually doing it through the VM. Can you say "ext3 is
+ * horribly ugly"? Thought you could.
+ */
+void cancel_dirty_page(struct page *page)
+{
+ struct address_space *mapping = page_mapping(page);
+
+ if (mapping_cap_account_dirty(mapping)) {
+ struct inode *inode = mapping->host;
+ struct bdi_writeback *wb;
+ struct mem_cgroup *memcg;
+ bool locked;
+
+ memcg = mem_cgroup_begin_page_stat(page);
+ wb = unlocked_inode_to_wb_begin(inode, &locked);
+
+ if (TestClearPageDirty(page))
+ account_page_cleaned(page, mapping, memcg, wb);
+
+ unlocked_inode_to_wb_end(inode, locked);
+ mem_cgroup_end_page_stat(memcg);
+ } else {
+ ClearPageDirty(page);
+ }
+}
+EXPORT_SYMBOL(cancel_dirty_page);
+
+/*
* Clear a page's dirty flag, while caring for dirty memory accounting.
* Returns true if the page was previously dirty.
*
@@ -2262,10 +2637,16 @@ EXPORT_SYMBOL(set_page_dirty_lock);
int clear_page_dirty_for_io(struct page *page)
{
struct address_space *mapping = page_mapping(page);
+ int ret = 0;
BUG_ON(!PageLocked(page));
if (mapping && mapping_cap_account_dirty(mapping)) {
+ struct inode *inode = mapping->host;
+ struct bdi_writeback *wb;
+ struct mem_cgroup *memcg;
+ bool locked;
+
/*
* Yes, Virginia, this is indeed insane.
*
@@ -2301,13 +2682,17 @@ int clear_page_dirty_for_io(struct page *page)
* always locked coming in here, so we get the desired
* exclusion.
*/
+ memcg = mem_cgroup_begin_page_stat(page);
+ wb = unlocked_inode_to_wb_begin(inode, &locked);
if (TestClearPageDirty(page)) {
+ mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_DIRTY);
dec_zone_page_state(page, NR_FILE_DIRTY);
- dec_bdi_stat(inode_to_bdi(mapping->host),
- BDI_RECLAIMABLE);
- return 1;
+ dec_wb_stat(wb, WB_RECLAIMABLE);
+ ret = 1;
}
- return 0;
+ unlocked_inode_to_wb_end(inode, locked);
+ mem_cgroup_end_page_stat(memcg);
+ return ret;
}
return TestClearPageDirty(page);
}
@@ -2321,7 +2706,8 @@ int test_clear_page_writeback(struct page *page)
memcg = mem_cgroup_begin_page_stat(page);
if (mapping) {
- struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
+ struct inode *inode = mapping->host;
+ struct backing_dev_info *bdi = inode_to_bdi(inode);
unsigned long flags;
spin_lock_irqsave(&mapping->tree_lock, flags);
@@ -2331,8 +2717,10 @@ int test_clear_page_writeback(struct page *page)
page_index(page),
PAGECACHE_TAG_WRITEBACK);
if (bdi_cap_account_writeback(bdi)) {
- __dec_bdi_stat(bdi, BDI_WRITEBACK);
- __bdi_writeout_inc(bdi);
+ struct bdi_writeback *wb = inode_to_wb(inode);
+
+ __dec_wb_stat(wb, WB_WRITEBACK);
+ __wb_writeout_inc(wb);
}
}
spin_unlock_irqrestore(&mapping->tree_lock, flags);
@@ -2356,7 +2744,8 @@ int __test_set_page_writeback(struct page *page, bool keep_write)
memcg = mem_cgroup_begin_page_stat(page);
if (mapping) {
- struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
+ struct inode *inode = mapping->host;
+ struct backing_dev_info *bdi = inode_to_bdi(inode);
unsigned long flags;
spin_lock_irqsave(&mapping->tree_lock, flags);
@@ -2366,7 +2755,7 @@ int __test_set_page_writeback(struct page *page, bool keep_write)
page_index(page),
PAGECACHE_TAG_WRITEBACK);
if (bdi_cap_account_writeback(bdi))
- __inc_bdi_stat(bdi, BDI_WRITEBACK);
+ __inc_wb_stat(inode_to_wb(inode), WB_WRITEBACK);
}
if (!PageDirty(page))
radix_tree_tag_clear(&mapping->page_tree,
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 40e29429e7b0..beda41710802 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -61,6 +61,7 @@
#include <linux/hugetlb.h>
#include <linux/sched/rt.h>
#include <linux/page_owner.h>
+#include <linux/kthread.h>
#include <asm/sections.h>
#include <asm/tlbflush.h>
@@ -235,6 +236,75 @@ EXPORT_SYMBOL(nr_online_nodes);
int page_group_by_mobility_disabled __read_mostly;
+#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
+static inline void reset_deferred_meminit(pg_data_t *pgdat)
+{
+ pgdat->first_deferred_pfn = ULONG_MAX;
+}
+
+/* Returns true if the struct page for the pfn is uninitialised */
+static inline bool __meminit early_page_uninitialised(unsigned long pfn)
+{
+ if (pfn >= NODE_DATA(early_pfn_to_nid(pfn))->first_deferred_pfn)
+ return true;
+
+ return false;
+}
+
+static inline bool early_page_nid_uninitialised(unsigned long pfn, int nid)
+{
+ if (pfn >= NODE_DATA(nid)->first_deferred_pfn)
+ return true;
+
+ return false;
+}
+
+/*
+ * Returns false when the remaining initialisation should be deferred until
+ * later in the boot cycle when it can be parallelised.
+ */
+static inline bool update_defer_init(pg_data_t *pgdat,
+ unsigned long pfn, unsigned long zone_end,
+ unsigned long *nr_initialised)
+{
+ /* Always populate low zones for address-contrained allocations */
+ if (zone_end < pgdat_end_pfn(pgdat))
+ return true;
+
+ /* Initialise at least 2G of the highest zone */
+ (*nr_initialised)++;
+ if (*nr_initialised > (2UL << (30 - PAGE_SHIFT)) &&
+ (pfn & (PAGES_PER_SECTION - 1)) == 0) {
+ pgdat->first_deferred_pfn = pfn;
+ return false;
+ }
+
+ return true;
+}
+#else
+static inline void reset_deferred_meminit(pg_data_t *pgdat)
+{
+}
+
+static inline bool early_page_uninitialised(unsigned long pfn)
+{
+ return false;
+}
+
+static inline bool early_page_nid_uninitialised(unsigned long pfn, int nid)
+{
+ return false;
+}
+
+static inline bool update_defer_init(pg_data_t *pgdat,
+ unsigned long pfn, unsigned long zone_end,
+ unsigned long *nr_initialised)
+{
+ return true;
+}
+#endif
+
+
void set_pageblock_migratetype(struct page *page, int migratetype)
{
if (unlikely(page_group_by_mobility_disabled &&
@@ -380,20 +450,6 @@ void prep_compound_page(struct page *page, unsigned long order)
}
}
-static inline void prep_zero_page(struct page *page, unsigned int order,
- gfp_t gfp_flags)
-{
- int i;
-
- /*
- * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO
- * and __GFP_HIGHMEM from hard or soft interrupt context.
- */
- VM_BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt());
- for (i = 0; i < (1 << order); i++)
- clear_highpage(page + i);
-}
-
#ifdef CONFIG_DEBUG_PAGEALLOC
unsigned int _debug_guardpage_minorder;
bool _debug_pagealloc_enabled __read_mostly;
@@ -778,6 +834,75 @@ static int free_tail_pages_check(struct page *head_page, struct page *page)
return 0;
}
+static void __meminit __init_single_page(struct page *page, unsigned long pfn,
+ unsigned long zone, int nid)
+{
+ set_page_links(page, zone, nid, pfn);
+ init_page_count(page);
+ page_mapcount_reset(page);
+ page_cpupid_reset_last(page);
+
+ INIT_LIST_HEAD(&page->lru);
+#ifdef WANT_PAGE_VIRTUAL
+ /* The shift won't overflow because ZONE_NORMAL is below 4G. */
+ if (!is_highmem_idx(zone))
+ set_page_address(page, __va(pfn << PAGE_SHIFT));
+#endif
+}
+
+static void __meminit __init_single_pfn(unsigned long pfn, unsigned long zone,
+ int nid)
+{
+ return __init_single_page(pfn_to_page(pfn), pfn, zone, nid);
+}
+
+#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
+static void init_reserved_page(unsigned long pfn)
+{
+ pg_data_t *pgdat;
+ int nid, zid;
+
+ if (!early_page_uninitialised(pfn))
+ return;
+
+ nid = early_pfn_to_nid(pfn);
+ pgdat = NODE_DATA(nid);
+
+ for (zid = 0; zid < MAX_NR_ZONES; zid++) {
+ struct zone *zone = &pgdat->node_zones[zid];
+
+ if (pfn >= zone->zone_start_pfn && pfn < zone_end_pfn(zone))
+ break;
+ }
+ __init_single_pfn(pfn, zid, nid);
+}
+#else
+static inline void init_reserved_page(unsigned long pfn)
+{
+}
+#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
+
+/*
+ * Initialised pages do not have PageReserved set. This function is
+ * called for each range allocated by the bootmem allocator and
+ * marks the pages PageReserved. The remaining valid pages are later
+ * sent to the buddy page allocator.
+ */
+void __meminit reserve_bootmem_region(unsigned long start, unsigned long end)
+{
+ unsigned long start_pfn = PFN_DOWN(start);
+ unsigned long end_pfn = PFN_UP(end);
+
+ for (; start_pfn < end_pfn; start_pfn++) {
+ if (pfn_valid(start_pfn)) {
+ struct page *page = pfn_to_page(start_pfn);
+
+ init_reserved_page(start_pfn);
+ SetPageReserved(page);
+ }
+ }
+}
+
static bool free_pages_prepare(struct page *page, unsigned int order)
{
bool compound = PageCompound(page);
@@ -832,7 +957,8 @@ static void __free_pages_ok(struct page *page, unsigned int order)
local_irq_restore(flags);
}
-void __init __free_pages_bootmem(struct page *page, unsigned int order)
+static void __init __free_pages_boot_core(struct page *page,
+ unsigned long pfn, unsigned int order)
{
unsigned int nr_pages = 1 << order;
struct page *p = page;
@@ -852,6 +978,235 @@ void __init __free_pages_bootmem(struct page *page, unsigned int order)
__free_pages(page, order);
}
+#if defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID) || \
+ defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP)
+
+static struct mminit_pfnnid_cache early_pfnnid_cache __meminitdata;
+
+int __meminit early_pfn_to_nid(unsigned long pfn)
+{
+ static DEFINE_SPINLOCK(early_pfn_lock);
+ int nid;
+
+ spin_lock(&early_pfn_lock);
+ nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache);
+ if (nid < 0)
+ nid = 0;
+ spin_unlock(&early_pfn_lock);
+
+ return nid;
+}
+#endif
+
+#ifdef CONFIG_NODES_SPAN_OTHER_NODES
+static inline bool __meminit meminit_pfn_in_nid(unsigned long pfn, int node,
+ struct mminit_pfnnid_cache *state)
+{
+ int nid;
+
+ nid = __early_pfn_to_nid(pfn, state);
+ if (nid >= 0 && nid != node)
+ return false;
+ return true;
+}
+
+/* Only safe to use early in boot when initialisation is single-threaded */
+static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
+{
+ return meminit_pfn_in_nid(pfn, node, &early_pfnnid_cache);
+}
+
+#else
+
+static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
+{
+ return true;
+}
+static inline bool __meminit meminit_pfn_in_nid(unsigned long pfn, int node,
+ struct mminit_pfnnid_cache *state)
+{
+ return true;
+}
+#endif
+
+
+void __init __free_pages_bootmem(struct page *page, unsigned long pfn,
+ unsigned int order)
+{
+ if (early_page_uninitialised(pfn))
+ return;
+ return __free_pages_boot_core(page, pfn, order);
+}
+
+#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
+static void __init deferred_free_range(struct page *page,
+ unsigned long pfn, int nr_pages)
+{
+ int i;
+
+ if (!page)
+ return;
+
+ /* Free a large naturally-aligned chunk if possible */
+ if (nr_pages == MAX_ORDER_NR_PAGES &&
+ (pfn & (MAX_ORDER_NR_PAGES-1)) == 0) {
+ set_pageblock_migratetype(page, MIGRATE_MOVABLE);
+ __free_pages_boot_core(page, pfn, MAX_ORDER-1);
+ return;
+ }
+
+ for (i = 0; i < nr_pages; i++, page++, pfn++)
+ __free_pages_boot_core(page, pfn, 0);
+}
+
+/* Completion tracking for deferred_init_memmap() threads */
+static atomic_t pgdat_init_n_undone __initdata;
+static __initdata DECLARE_COMPLETION(pgdat_init_all_done_comp);
+
+static inline void __init pgdat_init_report_one_done(void)
+{
+ if (atomic_dec_and_test(&pgdat_init_n_undone))
+ complete(&pgdat_init_all_done_comp);
+}
+
+/* Initialise remaining memory on a node */
+static int __init deferred_init_memmap(void *data)
+{
+ pg_data_t *pgdat = data;
+ int nid = pgdat->node_id;
+ struct mminit_pfnnid_cache nid_init_state = { };
+ unsigned long start = jiffies;
+ unsigned long nr_pages = 0;
+ unsigned long walk_start, walk_end;
+ int i, zid;
+ struct zone *zone;
+ unsigned long first_init_pfn = pgdat->first_deferred_pfn;
+ const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
+
+ if (first_init_pfn == ULONG_MAX) {
+ pgdat_init_report_one_done();
+ return 0;
+ }
+
+ /* Bind memory initialisation thread to a local node if possible */
+ if (!cpumask_empty(cpumask))
+ set_cpus_allowed_ptr(current, cpumask);
+
+ /* Sanity check boundaries */
+ BUG_ON(pgdat->first_deferred_pfn < pgdat->node_start_pfn);
+ BUG_ON(pgdat->first_deferred_pfn > pgdat_end_pfn(pgdat));
+ pgdat->first_deferred_pfn = ULONG_MAX;
+
+ /* Only the highest zone is deferred so find it */
+ for (zid = 0; zid < MAX_NR_ZONES; zid++) {
+ zone = pgdat->node_zones + zid;
+ if (first_init_pfn < zone_end_pfn(zone))
+ break;
+ }
+
+ for_each_mem_pfn_range(i, nid, &walk_start, &walk_end, NULL) {
+ unsigned long pfn, end_pfn;
+ struct page *page = NULL;
+ struct page *free_base_page = NULL;
+ unsigned long free_base_pfn = 0;
+ int nr_to_free = 0;
+
+ end_pfn = min(walk_end, zone_end_pfn(zone));
+ pfn = first_init_pfn;
+ if (pfn < walk_start)
+ pfn = walk_start;
+ if (pfn < zone->zone_start_pfn)
+ pfn = zone->zone_start_pfn;
+
+ for (; pfn < end_pfn; pfn++) {
+ if (!pfn_valid_within(pfn))
+ goto free_range;
+
+ /*
+ * Ensure pfn_valid is checked every
+ * MAX_ORDER_NR_PAGES for memory holes
+ */
+ if ((pfn & (MAX_ORDER_NR_PAGES - 1)) == 0) {
+ if (!pfn_valid(pfn)) {
+ page = NULL;
+ goto free_range;
+ }
+ }
+
+ if (!meminit_pfn_in_nid(pfn, nid, &nid_init_state)) {
+ page = NULL;
+ goto free_range;
+ }
+
+ /* Minimise pfn page lookups and scheduler checks */
+ if (page && (pfn & (MAX_ORDER_NR_PAGES - 1)) != 0) {
+ page++;
+ } else {
+ nr_pages += nr_to_free;
+ deferred_free_range(free_base_page,
+ free_base_pfn, nr_to_free);
+ free_base_page = NULL;
+ free_base_pfn = nr_to_free = 0;
+
+ page = pfn_to_page(pfn);
+ cond_resched();
+ }
+
+ if (page->flags) {
+ VM_BUG_ON(page_zone(page) != zone);
+ goto free_range;
+ }
+
+ __init_single_page(page, pfn, zid, nid);
+ if (!free_base_page) {
+ free_base_page = page;
+ free_base_pfn = pfn;
+ nr_to_free = 0;
+ }
+ nr_to_free++;
+
+ /* Where possible, batch up pages for a single free */
+ continue;
+free_range:
+ /* Free the current block of pages to allocator */
+ nr_pages += nr_to_free;
+ deferred_free_range(free_base_page, free_base_pfn,
+ nr_to_free);
+ free_base_page = NULL;
+ free_base_pfn = nr_to_free = 0;
+ }
+
+ first_init_pfn = max(end_pfn, first_init_pfn);
+ }
+
+ /* Sanity check that the next zone really is unpopulated */
+ WARN_ON(++zid < MAX_NR_ZONES && populated_zone(++zone));
+
+ pr_info("node %d initialised, %lu pages in %ums\n", nid, nr_pages,
+ jiffies_to_msecs(jiffies - start));
+
+ pgdat_init_report_one_done();
+ return 0;
+}
+
+void __init page_alloc_init_late(void)
+{
+ int nid;
+
+ /* There will be num_node_state(N_MEMORY) threads */
+ atomic_set(&pgdat_init_n_undone, num_node_state(N_MEMORY));
+ for_each_node_state(nid, N_MEMORY) {
+ kthread_run(deferred_init_memmap, NODE_DATA(nid), "pgdatinit%d", nid);
+ }
+
+ /* Block until all are initialised */
+ wait_for_completion(&pgdat_init_all_done_comp);
+
+ /* Reinit limits that are based on free pages after the kernel is up */
+ files_maxfiles_init();
+}
+#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
+
#ifdef CONFIG_CMA
/* Free whole pageblock and set its migration type to MIGRATE_CMA. */
void __init init_cma_reserved_pageblock(struct page *page)
@@ -941,6 +1296,10 @@ static inline int check_new_page(struct page *page)
bad_reason = "non-NULL mapping";
if (unlikely(atomic_read(&page->_count) != 0))
bad_reason = "nonzero _count";
+ if (unlikely(page->flags & __PG_HWPOISON)) {
+ bad_reason = "HWPoisoned (hardware-corrupted)";
+ bad_flags = __PG_HWPOISON;
+ }
if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_PREP)) {
bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag set";
bad_flags = PAGE_FLAGS_CHECK_AT_PREP;
@@ -975,7 +1334,8 @@ static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
kasan_alloc_pages(page, order);
if (gfp_flags & __GFP_ZERO)
- prep_zero_page(page, order, gfp_flags);
+ for (i = 0; i < (1 << order); i++)
+ clear_highpage(page + i);
if (order && (gfp_flags & __GFP_COMP))
prep_compound_page(page, order);
@@ -1032,11 +1392,9 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
static int fallbacks[MIGRATE_TYPES][4] = {
[MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
[MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
+ [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
#ifdef CONFIG_CMA
- [MIGRATE_MOVABLE] = { MIGRATE_CMA, MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
[MIGRATE_CMA] = { MIGRATE_RESERVE }, /* Never used */
-#else
- [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
#endif
[MIGRATE_RESERVE] = { MIGRATE_RESERVE }, /* Never used */
#ifdef CONFIG_MEMORY_ISOLATION
@@ -1044,6 +1402,17 @@ static int fallbacks[MIGRATE_TYPES][4] = {
#endif
};
+#ifdef CONFIG_CMA
+static struct page *__rmqueue_cma_fallback(struct zone *zone,
+ unsigned int order)
+{
+ return __rmqueue_smallest(zone, order, MIGRATE_CMA);
+}
+#else
+static inline struct page *__rmqueue_cma_fallback(struct zone *zone,
+ unsigned int order) { return NULL; }
+#endif
+
/*
* Move the free pages in a range to the free lists of the requested type.
* Note that start_page and end_pages are not aligned on a pageblock
@@ -1136,14 +1505,40 @@ static void change_pageblock_range(struct page *pageblock_page,
* as fragmentation caused by those allocations polluting movable pageblocks
* is worse than movable allocations stealing from unmovable and reclaimable
* pageblocks.
- *
- * If we claim more than half of the pageblock, change pageblock's migratetype
- * as well.
*/
-static void try_to_steal_freepages(struct zone *zone, struct page *page,
- int start_type, int fallback_type)
+static bool can_steal_fallback(unsigned int order, int start_mt)
+{
+ /*
+ * Leaving this order check is intended, although there is
+ * relaxed order check in next check. The reason is that
+ * we can actually steal whole pageblock if this condition met,
+ * but, below check doesn't guarantee it and that is just heuristic
+ * so could be changed anytime.
+ */
+ if (order >= pageblock_order)
+ return true;
+
+ if (order >= pageblock_order / 2 ||
+ start_mt == MIGRATE_RECLAIMABLE ||
+ start_mt == MIGRATE_UNMOVABLE ||
+ page_group_by_mobility_disabled)
+ return true;
+
+ return false;
+}
+
+/*
+ * This function implements actual steal behaviour. If order is large enough,
+ * we can steal whole pageblock. If not, we first move freepages in this
+ * pageblock and check whether half of pages are moved or not. If half of
+ * pages are moved, we can change migratetype of pageblock and permanently
+ * use it's pages as requested migratetype in the future.
+ */
+static void steal_suitable_fallback(struct zone *zone, struct page *page,
+ int start_type)
{
int current_order = page_order(page);
+ int pages;
/* Take ownership for orders >= pageblock_order */
if (current_order >= pageblock_order) {
@@ -1151,19 +1546,49 @@ static void try_to_steal_freepages(struct zone *zone, struct page *page,
return;
}
- if (current_order >= pageblock_order / 2 ||
- start_type == MIGRATE_RECLAIMABLE ||
- start_type == MIGRATE_UNMOVABLE ||
- page_group_by_mobility_disabled) {
- int pages;
+ pages = move_freepages_block(zone, page, start_type);
+
+ /* Claim the whole block if over half of it is free */
+ if (pages >= (1 << (pageblock_order-1)) ||
+ page_group_by_mobility_disabled)
+ set_pageblock_migratetype(page, start_type);
+}
+
+/*
+ * Check whether there is a suitable fallback freepage with requested order.
+ * If only_stealable is true, this function returns fallback_mt only if
+ * we can steal other freepages all together. This would help to reduce
+ * fragmentation due to mixed migratetype pages in one pageblock.
+ */
+int find_suitable_fallback(struct free_area *area, unsigned int order,
+ int migratetype, bool only_stealable, bool *can_steal)
+{
+ int i;
+ int fallback_mt;
+
+ if (area->nr_free == 0)
+ return -1;
+
+ *can_steal = false;
+ for (i = 0;; i++) {
+ fallback_mt = fallbacks[migratetype][i];
+ if (fallback_mt == MIGRATE_RESERVE)
+ break;
+
+ if (list_empty(&area->free_list[fallback_mt]))
+ continue;
+
+ if (can_steal_fallback(order, migratetype))
+ *can_steal = true;
- pages = move_freepages_block(zone, page, start_type);
+ if (!only_stealable)
+ return fallback_mt;
- /* Claim the whole block if over half of it is free */
- if (pages >= (1 << (pageblock_order-1)) ||
- page_group_by_mobility_disabled)
- set_pageblock_migratetype(page, start_type);
+ if (*can_steal)
+ return fallback_mt;
}
+
+ return -1;
}
/* Remove an element from the buddy allocator from the fallback list */
@@ -1173,64 +1598,45 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
struct free_area *area;
unsigned int current_order;
struct page *page;
+ int fallback_mt;
+ bool can_steal;
/* Find the largest possible block of pages in the other list */
for (current_order = MAX_ORDER-1;
current_order >= order && current_order <= MAX_ORDER-1;
--current_order) {
- int i;
- for (i = 0;; i++) {
- int migratetype = fallbacks[start_migratetype][i];
- int buddy_type = start_migratetype;
-
- /* MIGRATE_RESERVE handled later if necessary */
- if (migratetype == MIGRATE_RESERVE)
- break;
-
- area = &(zone->free_area[current_order]);
- if (list_empty(&area->free_list[migratetype]))
- continue;
-
- page = list_entry(area->free_list[migratetype].next,
- struct page, lru);
- area->nr_free--;
-
- if (!is_migrate_cma(migratetype)) {
- try_to_steal_freepages(zone, page,
- start_migratetype,
- migratetype);
- } else {
- /*
- * When borrowing from MIGRATE_CMA, we need to
- * release the excess buddy pages to CMA
- * itself, and we do not try to steal extra
- * free pages.
- */
- buddy_type = migratetype;
- }
+ area = &(zone->free_area[current_order]);
+ fallback_mt = find_suitable_fallback(area, current_order,
+ start_migratetype, false, &can_steal);
+ if (fallback_mt == -1)
+ continue;
- /* Remove the page from the freelists */
- list_del(&page->lru);
- rmv_page_order(page);
+ page = list_entry(area->free_list[fallback_mt].next,
+ struct page, lru);
+ if (can_steal)
+ steal_suitable_fallback(zone, page, start_migratetype);
- expand(zone, page, order, current_order, area,
- buddy_type);
+ /* Remove the page from the freelists */
+ area->nr_free--;
+ list_del(&page->lru);
+ rmv_page_order(page);
- /*
- * The freepage_migratetype may differ from pageblock's
- * migratetype depending on the decisions in
- * try_to_steal_freepages(). This is OK as long as it
- * does not differ for MIGRATE_CMA pageblocks. For CMA
- * we need to make sure unallocated pages flushed from
- * pcp lists are returned to the correct freelist.
- */
- set_freepage_migratetype(page, buddy_type);
+ expand(zone, page, order, current_order, area,
+ start_migratetype);
+ /*
+ * The freepage_migratetype may differ from pageblock's
+ * migratetype depending on the decisions in
+ * try_to_steal_freepages(). This is OK as long as it
+ * does not differ for MIGRATE_CMA pageblocks. For CMA
+ * we need to make sure unallocated pages flushed from
+ * pcp lists are returned to the correct freelist.
+ */
+ set_freepage_migratetype(page, start_migratetype);
- trace_mm_page_alloc_extfrag(page, order, current_order,
- start_migratetype, migratetype);
+ trace_mm_page_alloc_extfrag(page, order, current_order,
+ start_migratetype, fallback_mt);
- return page;
- }
+ return page;
}
return NULL;
@@ -1249,7 +1655,11 @@ retry_reserve:
page = __rmqueue_smallest(zone, order, migratetype);
if (unlikely(!page) && migratetype != MIGRATE_RESERVE) {
- page = __rmqueue_fallback(zone, order, migratetype);
+ if (migratetype == MIGRATE_MOVABLE)
+ page = __rmqueue_cma_fallback(zone, order);
+
+ if (!page)
+ page = __rmqueue_fallback(zone, order, migratetype);
/*
* Use MIGRATE_RESERVE rather than fail an allocation. goto
@@ -1321,7 +1731,7 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
int to_drain, batch;
local_irq_save(flags);
- batch = ACCESS_ONCE(pcp->batch);
+ batch = READ_ONCE(pcp->batch);
to_drain = min(pcp->count, batch);
if (to_drain > 0) {
free_pcppages_bulk(zone, to_drain, pcp);
@@ -1520,7 +1930,7 @@ void free_hot_cold_page(struct page *page, bool cold)
list_add_tail(&page->lru, &pcp->lists[migratetype]);
pcp->count++;
if (pcp->count >= pcp->high) {
- unsigned long batch = ACCESS_ONCE(pcp->batch);
+ unsigned long batch = READ_ONCE(pcp->batch);
free_pcppages_bulk(zone, batch, pcp);
pcp->count -= batch;
}
@@ -1553,6 +1963,7 @@ void free_hot_cold_page_list(struct list_head *list, bool cold)
void split_page(struct page *page, unsigned int order)
{
int i;
+ gfp_t gfp_mask;
VM_BUG_ON_PAGE(PageCompound(page), page);
VM_BUG_ON_PAGE(!page_count(page), page);
@@ -1566,10 +1977,11 @@ void split_page(struct page *page, unsigned int order)
split_page(virt_to_page(page[0].shadow), order);
#endif
- set_page_owner(page, 0, 0);
+ gfp_mask = get_page_owner_gfp(page);
+ set_page_owner(page, 0, gfp_mask);
for (i = 1; i < (1 << order); i++) {
set_page_refcounted(page + i);
- set_page_owner(page + i, 0, 0);
+ set_page_owner(page + i, 0, gfp_mask);
}
}
EXPORT_SYMBOL_GPL(split_page);
@@ -1599,6 +2011,8 @@ int __isolate_free_page(struct page *page, unsigned int order)
zone->free_area[order].nr_free--;
rmv_page_order(page);
+ set_page_owner(page, order, __GFP_MOVABLE);
+
/* Set the pageblock if the isolated page is at least a pageblock */
if (order >= pageblock_order - 1) {
struct page *endpage = page + (1 << order) - 1;
@@ -1610,7 +2024,7 @@ int __isolate_free_page(struct page *page, unsigned int order)
}
}
- set_page_owner(page, order, 0);
+
return 1UL << order;
}
@@ -2272,48 +2686,6 @@ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...)
show_mem(filter);
}
-static inline int
-should_alloc_retry(gfp_t gfp_mask, unsigned int order,
- unsigned long did_some_progress,
- unsigned long pages_reclaimed)
-{
- /* Do not loop if specifically requested */
- if (gfp_mask & __GFP_NORETRY)
- return 0;
-
- /* Always retry if specifically requested */
- if (gfp_mask & __GFP_NOFAIL)
- return 1;
-
- /*
- * Suspend converts GFP_KERNEL to __GFP_WAIT which can prevent reclaim
- * making forward progress without invoking OOM. Suspend also disables
- * storage devices so kswapd will not help. Bail if we are suspending.
- */
- if (!did_some_progress && pm_suspended_storage())
- return 0;
-
- /*
- * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER
- * means __GFP_NOFAIL, but that may not be true in other
- * implementations.
- */
- if (order <= PAGE_ALLOC_COSTLY_ORDER)
- return 1;
-
- /*
- * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is
- * specified, then we retry until we no longer reclaim any pages
- * (above), or we've reclaimed an order of pages at least as
- * large as the allocation's order. In both cases, if the
- * allocation still fails, we stop retrying.
- */
- if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order))
- return 1;
-
- return 0;
-}
-
static inline struct page *
__alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
const struct alloc_context *ac, unsigned long *did_some_progress)
@@ -2323,10 +2695,10 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
*did_some_progress = 0;
/*
- * Acquire the per-zone oom lock for each zone. If that
- * fails, somebody else is making progress for us.
+ * Acquire the oom lock. If that fails, somebody else is
+ * making progress for us.
*/
- if (!oom_zonelist_trylock(ac->zonelist, gfp_mask)) {
+ if (!mutex_trylock(&oom_lock)) {
*did_some_progress = 1;
schedule_timeout_uninterruptible(1);
return NULL;
@@ -2352,23 +2724,19 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
/* The OOM killer does not needlessly kill tasks for lowmem */
if (ac->high_zoneidx < ZONE_NORMAL)
goto out;
- /* The OOM killer does not compensate for light reclaim */
+ /* The OOM killer does not compensate for IO-less reclaim */
if (!(gfp_mask & __GFP_FS)) {
/*
* XXX: Page reclaim didn't yield anything,
* and the OOM killer can't be invoked, but
- * keep looping as per should_alloc_retry().
+ * keep looping as per tradition.
*/
*did_some_progress = 1;
goto out;
}
- /*
- * GFP_THISNODE contains __GFP_NORETRY and we never hit this.
- * Sanity check for bare calls of __GFP_THISNODE, not real OOM.
- * The caller should handle page allocation failure by itself if
- * it specifies __GFP_THISNODE.
- * Note: Hugepage uses it but will hit PAGE_ALLOC_COSTLY_ORDER.
- */
+ if (pm_suspended_storage())
+ goto out;
+ /* The OOM killer may not free memory on a specific node */
if (gfp_mask & __GFP_THISNODE)
goto out;
}
@@ -2377,7 +2745,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
|| WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL))
*did_some_progress = 1;
out:
- oom_zonelist_unlock(ac->zonelist, gfp_mask);
+ mutex_unlock(&oom_lock);
return page;
}
@@ -2623,15 +2991,11 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
}
/*
- * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and
- * __GFP_NOWARN set) should not cause reclaim since the subsystem
- * (f.e. slab) using GFP_THISNODE may choose to trigger reclaim
- * using a larger set of nodes after it has established that the
- * allowed per node queues are empty and that nodes are
- * over allocated.
+ * If this allocation cannot block and it is for a specific node, then
+ * fail early. There's no need to wakeup kswapd or retry for a
+ * speculative node-specific allocation.
*/
- if (IS_ENABLED(CONFIG_NUMA) &&
- (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
+ if (IS_ENABLED(CONFIG_NUMA) && (gfp_mask & __GFP_THISNODE) && !wait)
goto nopage;
retry:
@@ -2754,40 +3118,40 @@ retry:
if (page)
goto got_pg;
- /* Check if we should retry the allocation */
+ /* Do not loop if specifically requested */
+ if (gfp_mask & __GFP_NORETRY)
+ goto noretry;
+
+ /* Keep reclaiming pages as long as there is reasonable progress */
pages_reclaimed += did_some_progress;
- if (should_alloc_retry(gfp_mask, order, did_some_progress,
- pages_reclaimed)) {
- /*
- * If we fail to make progress by freeing individual
- * pages, but the allocation wants us to keep going,
- * start OOM killing tasks.
- */
- if (!did_some_progress) {
- page = __alloc_pages_may_oom(gfp_mask, order, ac,
- &did_some_progress);
- if (page)
- goto got_pg;
- if (!did_some_progress)
- goto nopage;
- }
+ if ((did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER) ||
+ ((gfp_mask & __GFP_REPEAT) && pages_reclaimed < (1 << order))) {
/* Wait for some write requests to complete then retry */
wait_iff_congested(ac->preferred_zone, BLK_RW_ASYNC, HZ/50);
goto retry;
- } else {
- /*
- * High-order allocations do not necessarily loop after
- * direct reclaim and reclaim/compaction depends on compaction
- * being called after reclaim so call directly if necessary
- */
- page = __alloc_pages_direct_compact(gfp_mask, order,
- alloc_flags, ac, migration_mode,
- &contended_compaction,
- &deferred_compaction);
- if (page)
- goto got_pg;
}
+ /* Reclaim has failed us, start killing things */
+ page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress);
+ if (page)
+ goto got_pg;
+
+ /* Retry as long as the OOM killer is making progress */
+ if (did_some_progress)
+ goto retry;
+
+noretry:
+ /*
+ * High-order allocations do not necessarily loop after
+ * direct reclaim and reclaim/compaction depends on compaction
+ * being called after reclaim so call directly if necessary
+ */
+ page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags,
+ ac, migration_mode,
+ &contended_compaction,
+ &deferred_compaction);
+ if (page)
+ goto got_pg;
nopage:
warn_alloc_failed(gfp_mask, order, NULL);
got_pg:
@@ -2824,7 +3188,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
/*
* Check the zones suitable for the gfp_mask contain at least one
* valid zone. It's possible to have an empty zonelist as a result
- * of GFP_THISNODE and a memoryless node
+ * of __GFP_THISNODE and a memoryless node
*/
if (unlikely(!zonelist->_zonerefs->zone))
return NULL;
@@ -2927,6 +3291,104 @@ void free_pages(unsigned long addr, unsigned int order)
EXPORT_SYMBOL(free_pages);
/*
+ * Page Fragment:
+ * An arbitrary-length arbitrary-offset area of memory which resides
+ * within a 0 or higher order page. Multiple fragments within that page
+ * are individually refcounted, in the page's reference counter.
+ *
+ * The page_frag functions below provide a simple allocation framework for
+ * page fragments. This is used by the network stack and network device
+ * drivers to provide a backing region of memory for use as either an
+ * sk_buff->head, or to be used in the "frags" portion of skb_shared_info.
+ */
+static struct page *__page_frag_refill(struct page_frag_cache *nc,
+ gfp_t gfp_mask)
+{
+ struct page *page = NULL;
+ gfp_t gfp = gfp_mask;
+
+#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
+ gfp_mask |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY |
+ __GFP_NOMEMALLOC;
+ page = alloc_pages_node(NUMA_NO_NODE, gfp_mask,
+ PAGE_FRAG_CACHE_MAX_ORDER);
+ nc->size = page ? PAGE_FRAG_CACHE_MAX_SIZE : PAGE_SIZE;
+#endif
+ if (unlikely(!page))
+ page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
+
+ nc->va = page ? page_address(page) : NULL;
+
+ return page;
+}
+
+void *__alloc_page_frag(struct page_frag_cache *nc,
+ unsigned int fragsz, gfp_t gfp_mask)
+{
+ unsigned int size = PAGE_SIZE;
+ struct page *page;
+ int offset;
+
+ if (unlikely(!nc->va)) {
+refill:
+ page = __page_frag_refill(nc, gfp_mask);
+ if (!page)
+ return NULL;
+
+#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
+ /* if size can vary use size else just use PAGE_SIZE */
+ size = nc->size;
+#endif
+ /* Even if we own the page, we do not use atomic_set().
+ * This would break get_page_unless_zero() users.
+ */
+ atomic_add(size - 1, &page->_count);
+
+ /* reset page count bias and offset to start of new frag */
+ nc->pfmemalloc = page->pfmemalloc;
+ nc->pagecnt_bias = size;
+ nc->offset = size;
+ }
+
+ offset = nc->offset - fragsz;
+ if (unlikely(offset < 0)) {
+ page = virt_to_page(nc->va);
+
+ if (!atomic_sub_and_test(nc->pagecnt_bias, &page->_count))
+ goto refill;
+
+#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
+ /* if size can vary use size else just use PAGE_SIZE */
+ size = nc->size;
+#endif
+ /* OK, page count is 0, we can safely set it */
+ atomic_set(&page->_count, size);
+
+ /* reset page count bias and offset to start of new frag */
+ nc->pagecnt_bias = size;
+ offset = size - fragsz;
+ }
+
+ nc->pagecnt_bias--;
+ nc->offset = offset;
+
+ return nc->va + offset;
+}
+EXPORT_SYMBOL(__alloc_page_frag);
+
+/*
+ * Frees a page fragment allocated out of either a compound or order 0 page.
+ */
+void __free_page_frag(void *addr)
+{
+ struct page *page = virt_to_head_page(addr);
+
+ if (unlikely(put_page_testzero(page)))
+ __free_pages_ok(page, compound_order(page));
+}
+EXPORT_SYMBOL(__free_page_frag);
+
+/*
* alloc_kmem_pages charges newly allocated pages to the kmem resource counter
* of the current memory cgroup.
*
@@ -3201,38 +3663,31 @@ static void show_migration_types(unsigned char type)
* Show free area list (used inside shift_scroll-lock stuff)
* We also calculate the percentage fragmentation. We do this by counting the
* memory on each free list with the exception of the first item on the list.
- * Suppresses nodes that are not allowed by current's cpuset if
- * SHOW_MEM_FILTER_NODES is passed.
+ *
+ * Bits in @filter:
+ * SHOW_MEM_FILTER_NODES: suppress nodes that are not allowed by current's
+ * cpuset.
*/
void show_free_areas(unsigned int filter)
{
+ unsigned long free_pcp = 0;
int cpu;
struct zone *zone;
for_each_populated_zone(zone) {
if (skip_free_areas_node(filter, zone_to_nid(zone)))
continue;
- show_node(zone);
- printk("%s per-cpu:\n", zone->name);
-
- for_each_online_cpu(cpu) {
- struct per_cpu_pageset *pageset;
-
- pageset = per_cpu_ptr(zone->pageset, cpu);
- printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n",
- cpu, pageset->pcp.high,
- pageset->pcp.batch, pageset->pcp.count);
- }
+ for_each_online_cpu(cpu)
+ free_pcp += per_cpu_ptr(zone->pageset, cpu)->pcp.count;
}
printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n"
" active_file:%lu inactive_file:%lu isolated_file:%lu\n"
- " unevictable:%lu"
- " dirty:%lu writeback:%lu unstable:%lu\n"
- " free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n"
+ " unevictable:%lu dirty:%lu writeback:%lu unstable:%lu\n"
+ " slab_reclaimable:%lu slab_unreclaimable:%lu\n"
" mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n"
- " free_cma:%lu\n",
+ " free:%lu free_pcp:%lu free_cma:%lu\n",
global_page_state(NR_ACTIVE_ANON),
global_page_state(NR_INACTIVE_ANON),
global_page_state(NR_ISOLATED_ANON),
@@ -3243,13 +3698,14 @@ void show_free_areas(unsigned int filter)
global_page_state(NR_FILE_DIRTY),
global_page_state(NR_WRITEBACK),
global_page_state(NR_UNSTABLE_NFS),
- global_page_state(NR_FREE_PAGES),
global_page_state(NR_SLAB_RECLAIMABLE),
global_page_state(NR_SLAB_UNRECLAIMABLE),
global_page_state(NR_FILE_MAPPED),
global_page_state(NR_SHMEM),
global_page_state(NR_PAGETABLE),
global_page_state(NR_BOUNCE),
+ global_page_state(NR_FREE_PAGES),
+ free_pcp,
global_page_state(NR_FREE_CMA_PAGES));
for_each_populated_zone(zone) {
@@ -3257,6 +3713,11 @@ void show_free_areas(unsigned int filter)
if (skip_free_areas_node(filter, zone_to_nid(zone)))
continue;
+
+ free_pcp = 0;
+ for_each_online_cpu(cpu)
+ free_pcp += per_cpu_ptr(zone->pageset, cpu)->pcp.count;
+
show_node(zone);
printk("%s"
" free:%lukB"
@@ -3283,6 +3744,8 @@ void show_free_areas(unsigned int filter)
" pagetables:%lukB"
" unstable:%lukB"
" bounce:%lukB"
+ " free_pcp:%lukB"
+ " local_pcp:%ukB"
" free_cma:%lukB"
" writeback_tmp:%lukB"
" pages_scanned:%lu"
@@ -3314,6 +3777,8 @@ void show_free_areas(unsigned int filter)
K(zone_page_state(zone, NR_PAGETABLE)),
K(zone_page_state(zone, NR_UNSTABLE_NFS)),
K(zone_page_state(zone, NR_BOUNCE)),
+ K(free_pcp),
+ K(this_cpu_read(zone->pageset->pcp.count)),
K(zone_page_state(zone, NR_FREE_CMA_PAGES)),
K(zone_page_state(zone, NR_WRITEBACK_TEMP)),
K(zone_page_state(zone, NR_PAGES_SCANNED)),
@@ -4062,6 +4527,9 @@ static void setup_zone_migrate_reserve(struct zone *zone)
zone->nr_migrate_reserve_block = reserve;
for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
+ if (!early_page_nid_uninitialised(pfn, zone_to_nid(zone)))
+ return;
+
if (!pfn_valid(pfn))
continue;
page = pfn_to_page(pfn);
@@ -4124,15 +4592,16 @@ static void setup_zone_migrate_reserve(struct zone *zone)
void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
unsigned long start_pfn, enum memmap_context context)
{
- struct page *page;
+ pg_data_t *pgdat = NODE_DATA(nid);
unsigned long end_pfn = start_pfn + size;
unsigned long pfn;
struct zone *z;
+ unsigned long nr_initialised = 0;
if (highest_memmap_pfn < end_pfn - 1)
highest_memmap_pfn = end_pfn - 1;
- z = &NODE_DATA(nid)->node_zones[zone];
+ z = &pgdat->node_zones[zone];
for (pfn = start_pfn; pfn < end_pfn; pfn++) {
/*
* There can be holes in boot-time mem_map[]s
@@ -4144,14 +4613,11 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
continue;
if (!early_pfn_in_nid(pfn, nid))
continue;
+ if (!update_defer_init(pgdat, pfn, end_pfn,
+ &nr_initialised))
+ break;
}
- page = pfn_to_page(pfn);
- set_page_links(page, zone, nid, pfn);
- mminit_verify_page_links(page, zone, nid, pfn);
- init_page_count(page);
- page_mapcount_reset(page);
- page_cpupid_reset_last(page);
- SetPageReserved(page);
+
/*
* Mark the block movable so that blocks are reserved for
* movable at startup. This will force kernel allocations
@@ -4166,17 +4632,14 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
* check here not to call set_pageblock_migratetype() against
* pfn out of zone.
*/
- if ((z->zone_start_pfn <= pfn)
- && (pfn < zone_end_pfn(z))
- && !(pfn & (pageblock_nr_pages - 1)))
- set_pageblock_migratetype(page, MIGRATE_MOVABLE);
+ if (!(pfn & (pageblock_nr_pages - 1))) {
+ struct page *page = pfn_to_page(pfn);
- INIT_LIST_HEAD(&page->lru);
-#ifdef WANT_PAGE_VIRTUAL
- /* The shift won't overflow because ZONE_NORMAL is below 4G. */
- if (!is_highmem_idx(zone))
- set_page_address(page, __va(pfn << PAGE_SHIFT));
-#endif
+ __init_single_page(page, pfn, zone, nid);
+ set_pageblock_migratetype(page, MIGRATE_MOVABLE);
+ } else {
+ __init_single_pfn(pfn, zone, nid);
+ }
}
}
@@ -4434,57 +4897,30 @@ int __meminit init_currently_empty_zone(struct zone *zone,
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
+
/*
* Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
*/
-int __meminit __early_pfn_to_nid(unsigned long pfn)
+int __meminit __early_pfn_to_nid(unsigned long pfn,
+ struct mminit_pfnnid_cache *state)
{
unsigned long start_pfn, end_pfn;
int nid;
- /*
- * NOTE: The following SMP-unsafe globals are only used early in boot
- * when the kernel is running single-threaded.
- */
- static unsigned long __meminitdata last_start_pfn, last_end_pfn;
- static int __meminitdata last_nid;
- if (last_start_pfn <= pfn && pfn < last_end_pfn)
- return last_nid;
+ if (state->last_start <= pfn && pfn < state->last_end)
+ return state->last_nid;
nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn);
if (nid != -1) {
- last_start_pfn = start_pfn;
- last_end_pfn = end_pfn;
- last_nid = nid;
+ state->last_start = start_pfn;
+ state->last_end = end_pfn;
+ state->last_nid = nid;
}
return nid;
}
#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
-int __meminit early_pfn_to_nid(unsigned long pfn)
-{
- int nid;
-
- nid = __early_pfn_to_nid(pfn);
- if (nid >= 0)
- return nid;
- /* just returns 0 */
- return 0;
-}
-
-#ifdef CONFIG_NODES_SPAN_OTHER_NODES
-bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
-{
- int nid;
-
- nid = __early_pfn_to_nid(pfn);
- if (nid >= 0 && nid != node)
- return false;
- return true;
-}
-#endif
-
/**
* free_bootmem_with_active_regions - Call memblock_free_early_nid for each active range
* @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed.
@@ -4726,22 +5162,28 @@ static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,
unsigned long *zones_size,
unsigned long *zholes_size)
{
- unsigned long realtotalpages, totalpages = 0;
+ unsigned long realtotalpages = 0, totalpages = 0;
enum zone_type i;
- for (i = 0; i < MAX_NR_ZONES; i++)
- totalpages += zone_spanned_pages_in_node(pgdat->node_id, i,
- node_start_pfn,
- node_end_pfn,
- zones_size);
- pgdat->node_spanned_pages = totalpages;
-
- realtotalpages = totalpages;
- for (i = 0; i < MAX_NR_ZONES; i++)
- realtotalpages -=
- zone_absent_pages_in_node(pgdat->node_id, i,
+ for (i = 0; i < MAX_NR_ZONES; i++) {
+ struct zone *zone = pgdat->node_zones + i;
+ unsigned long size, real_size;
+
+ size = zone_spanned_pages_in_node(pgdat->node_id, i,
+ node_start_pfn,
+ node_end_pfn,
+ zones_size);
+ real_size = size - zone_absent_pages_in_node(pgdat->node_id, i,
node_start_pfn, node_end_pfn,
zholes_size);
+ zone->spanned_pages = size;
+ zone->present_pages = real_size;
+
+ totalpages += size;
+ realtotalpages += real_size;
+ }
+
+ pgdat->node_spanned_pages = totalpages;
pgdat->node_present_pages = realtotalpages;
printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id,
realtotalpages);
@@ -4851,8 +5293,7 @@ static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages,
* NOTE: pgdat should get zeroed by caller.
*/
static void __paginginit free_area_init_core(struct pglist_data *pgdat,
- unsigned long node_start_pfn, unsigned long node_end_pfn,
- unsigned long *zones_size, unsigned long *zholes_size)
+ unsigned long node_start_pfn, unsigned long node_end_pfn)
{
enum zone_type j;
int nid = pgdat->node_id;
@@ -4873,12 +5314,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
struct zone *zone = pgdat->node_zones + j;
unsigned long size, realsize, freesize, memmap_pages;
- size = zone_spanned_pages_in_node(nid, j, node_start_pfn,
- node_end_pfn, zones_size);
- realsize = freesize = size - zone_absent_pages_in_node(nid, j,
- node_start_pfn,
- node_end_pfn,
- zholes_size);
+ size = zone->spanned_pages;
+ realsize = freesize = zone->present_pages;
/*
* Adjust freesize so that it accounts for how much memory
@@ -4913,8 +5350,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
nr_kernel_pages -= memmap_pages;
nr_all_pages += freesize;
- zone->spanned_pages = size;
- zone->present_pages = realsize;
/*
* Set an approximate value for lowmem here, it will be adjusted
* when the bootmem allocator frees pages into the buddy system.
@@ -5003,6 +5438,7 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
/* pg_data_t should be reset to zero when it's allocated */
WARN_ON(pgdat->nr_zones || pgdat->classzone_idx);
+ reset_deferred_meminit(pgdat);
pgdat->node_id = nid;
pgdat->node_start_pfn = node_start_pfn;
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
@@ -5020,8 +5456,7 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
(unsigned long)pgdat->node_mem_map);
#endif
- free_area_init_core(pgdat, start_pfn, end_pfn,
- zones_size, zholes_size);
+ free_area_init_core(pgdat, start_pfn, end_pfn);
}
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
@@ -5717,7 +6152,7 @@ static void __setup_per_zone_wmarks(void)
* value here.
*
* The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN)
- * deltas controls asynch page reclaim, and so should
+ * deltas control asynch page reclaim, and so should
* not be capped for highmem.
*/
unsigned long min_pages;
@@ -5970,9 +6405,9 @@ out:
return ret;
}
+#ifdef CONFIG_NUMA
int hashdist = HASHDIST_DEFAULT;
-#ifdef CONFIG_NUMA
static int __init set_hashdist(char *str)
{
if (!str)
@@ -6164,7 +6599,7 @@ void set_pfnblock_flags_mask(struct page *page, unsigned long flags,
mask <<= (BITS_PER_LONG - bitidx - 1);
flags <<= (BITS_PER_LONG - bitidx - 1);
- word = ACCESS_ONCE(bitmap[word_bitidx]);
+ word = READ_ONCE(bitmap[word_bitidx]);
for (;;) {
old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags);
if (word == old_word)
diff --git a/mm/page_io.c b/mm/page_io.c
index e6045804c8d8..520baa4b04d7 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -20,8 +20,8 @@
#include <linux/buffer_head.h>
#include <linux/writeback.h>
#include <linux/frontswap.h>
-#include <linux/aio.h>
#include <linux/blkdev.h>
+#include <linux/uio.h>
#include <asm/pgtable.h>
static struct bio *get_swap_bio(gfp_t gfp_flags,
@@ -69,7 +69,7 @@ void end_swap_bio_write(struct bio *bio, int err)
bio_put(bio);
}
-void end_swap_bio_read(struct bio *bio, int err)
+static void end_swap_bio_read(struct bio *bio, int err)
{
const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
struct page *page = bio->bi_io_vec[0].bv_page;
@@ -274,13 +274,10 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc,
iov_iter_bvec(&from, ITER_BVEC | WRITE, &bv, 1, PAGE_SIZE);
init_sync_kiocb(&kiocb, swap_file);
kiocb.ki_pos = page_file_offset(page);
- kiocb.ki_nbytes = PAGE_SIZE;
set_page_writeback(page);
unlock_page(page);
- ret = mapping->a_ops->direct_IO(ITER_BVEC | WRITE,
- &kiocb, &from,
- kiocb.ki_pos);
+ ret = mapping->a_ops->direct_IO(&kiocb, &from, kiocb.ki_pos);
if (ret == PAGE_SIZE) {
count_vm_event(PSWPOUT);
ret = 0;
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 755a42c76eb4..303c908790ef 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -101,7 +101,8 @@ void unset_migratetype_isolate(struct page *page, unsigned migratetype)
buddy_idx = __find_buddy_index(page_idx, order);
buddy = page + (buddy_idx - page_idx);
- if (!is_migrate_isolate_page(buddy)) {
+ if (pfn_valid_within(page_to_pfn(buddy)) &&
+ !is_migrate_isolate_page(buddy)) {
__isolate_free_page(page, order);
kernel_map_pages(page, (1 << order), 1);
set_page_refcounted(page);
diff --git a/mm/page_owner.c b/mm/page_owner.c
index 0993f5f36b01..983c3a10fa07 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -76,6 +76,13 @@ void __set_page_owner(struct page *page, unsigned int order, gfp_t gfp_mask)
__set_bit(PAGE_EXT_OWNER, &page_ext->flags);
}
+gfp_t __get_page_owner_gfp(struct page *page)
+{
+ struct page_ext *page_ext = lookup_page_ext(page);
+
+ return page_ext->gfp_mask;
+}
+
static ssize_t
print_page_owner(char __user *buf, size_t count, unsigned long pfn,
struct page *page, struct page_ext *page_ext)
@@ -310,4 +317,4 @@ static int __init pageowner_init(void)
return 0;
}
-module_init(pageowner_init)
+late_initcall(pageowner_init)
diff --git a/mm/percpu.c b/mm/percpu.c
index 73c97a5f4495..2dd74487a0af 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1030,7 +1030,7 @@ area_found:
memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size);
ptr = __addr_to_pcpu_ptr(chunk->base_addr + off);
- kmemleak_alloc_percpu(ptr, size);
+ kmemleak_alloc_percpu(ptr, size, gfp);
return ptr;
fail_unlock:
@@ -1310,7 +1310,7 @@ bool is_kernel_percpu_address(unsigned long addr)
* and, from the second one, the backing allocator (currently either vm or
* km) provides translation.
*
- * The addr can be tranlated simply without checking if it falls into the
+ * The addr can be translated simply without checking if it falls into the
* first chunk. But the current code reflects better how percpu allocator
* actually works, and the verification can discover both bugs in percpu
* allocator itself and per_cpu_ptr_to_phys() callers. So we keep current
@@ -1762,7 +1762,7 @@ early_param("percpu_alloc", percpu_alloc_setup);
* and other parameters considering needed percpu size, allocation
* atom size and distances between CPUs.
*
- * Groups are always mutliples of atom size and CPUs which are of
+ * Groups are always multiples of atom size and CPUs which are of
* LOCAL_DISTANCE both ways are grouped together and share space for
* units in the same group. The returned configuration is guaranteed
* to have CPUs on different nodes on different groups and >=75% usage
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index c25f94b33811..6b674e00153c 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -119,14 +119,15 @@ pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address,
}
#endif
-#ifndef __HAVE_ARCH_PMDP_CLEAR_FLUSH
+#ifndef __HAVE_ARCH_PMDP_HUGE_CLEAR_FLUSH
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-pmd_t pmdp_clear_flush(struct vm_area_struct *vma, unsigned long address,
- pmd_t *pmdp)
+pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address,
+ pmd_t *pmdp)
{
pmd_t pmd;
VM_BUG_ON(address & ~HPAGE_PMD_MASK);
- pmd = pmdp_get_and_clear(vma->vm_mm, address, pmdp);
+ VM_BUG_ON(!pmd_trans_huge(*pmdp));
+ pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp);
flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
return pmd;
}
@@ -198,3 +199,23 @@ void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#endif
+
+#ifndef pmdp_collapse_flush
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
+ pmd_t *pmdp)
+{
+ /*
+ * pmd and hugepage pte format are same. So we could
+ * use the same function.
+ */
+ pmd_t pmd;
+
+ VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+ VM_BUG_ON(pmd_trans_huge(*pmdp));
+ pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp);
+ flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+ return pmd;
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+#endif
diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c
index b1597690530c..e88d071648c2 100644
--- a/mm/process_vm_access.c
+++ b/mm/process_vm_access.c
@@ -257,22 +257,18 @@ static ssize_t process_vm_rw(pid_t pid,
struct iovec *iov_r = iovstack_r;
struct iov_iter iter;
ssize_t rc;
+ int dir = vm_write ? WRITE : READ;
if (flags != 0)
return -EINVAL;
/* Check iovecs */
- if (vm_write)
- rc = rw_copy_check_uvector(WRITE, lvec, liovcnt, UIO_FASTIOV,
- iovstack_l, &iov_l);
- else
- rc = rw_copy_check_uvector(READ, lvec, liovcnt, UIO_FASTIOV,
- iovstack_l, &iov_l);
- if (rc <= 0)
+ rc = import_iovec(dir, lvec, liovcnt, UIO_FASTIOV, &iov_l, &iter);
+ if (rc < 0)
+ return rc;
+ if (!iov_iter_count(&iter))
goto free_iovecs;
- iov_iter_init(&iter, vm_write ? WRITE : READ, iov_l, liovcnt, rc);
-
rc = rw_copy_check_uvector(CHECK_IOVEC_ONLY, rvec, riovcnt, UIO_FASTIOV,
iovstack_r, &iov_r);
if (rc <= 0)
@@ -283,8 +279,7 @@ static ssize_t process_vm_rw(pid_t pid,
free_iovecs:
if (iov_r != iovstack_r)
kfree(iov_r);
- if (iov_l != iovstack_l)
- kfree(iov_l);
+ kfree(iov_l);
return rc;
}
@@ -320,21 +315,16 @@ compat_process_vm_rw(compat_pid_t pid,
struct iovec *iov_r = iovstack_r;
struct iov_iter iter;
ssize_t rc = -EFAULT;
+ int dir = vm_write ? WRITE : READ;
if (flags != 0)
return -EINVAL;
- if (vm_write)
- rc = compat_rw_copy_check_uvector(WRITE, lvec, liovcnt,
- UIO_FASTIOV, iovstack_l,
- &iov_l);
- else
- rc = compat_rw_copy_check_uvector(READ, lvec, liovcnt,
- UIO_FASTIOV, iovstack_l,
- &iov_l);
- if (rc <= 0)
+ rc = compat_import_iovec(dir, lvec, liovcnt, UIO_FASTIOV, &iov_l, &iter);
+ if (rc < 0)
+ return rc;
+ if (!iov_iter_count(&iter))
goto free_iovecs;
- iov_iter_init(&iter, vm_write ? WRITE : READ, iov_l, liovcnt, rc);
rc = compat_rw_copy_check_uvector(CHECK_IOVEC_ONLY, rvec, riovcnt,
UIO_FASTIOV, iovstack_r,
&iov_r);
@@ -346,8 +336,7 @@ compat_process_vm_rw(compat_pid_t pid,
free_iovecs:
if (iov_r != iovstack_r)
kfree(iov_r);
- if (iov_l != iovstack_l)
- kfree(iov_l);
+ kfree(iov_l);
return rc;
}
diff --git a/mm/readahead.c b/mm/readahead.c
index 935675844b2e..60cd846a9a44 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -541,7 +541,7 @@ page_cache_async_readahead(struct address_space *mapping,
/*
* Defer asynchronous read-ahead on IO congestion.
*/
- if (bdi_read_congested(inode_to_bdi(mapping->host)))
+ if (inode_read_congested(mapping->host))
return;
/* do read-ahead */
diff --git a/mm/rmap.c b/mm/rmap.c
index c161a14b6a8f..171b68768df1 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -30,6 +30,8 @@
* swap_lock (in swap_duplicate, swap_info_get)
* mmlist_lock (in mmput, drain_mmlist and others)
* mapping->private_lock (in __set_page_dirty_buffers)
+ * mem_cgroup_{begin,end}_page_stat (memcg->move_lock)
+ * mapping->tree_lock (widely used)
* inode->i_lock (in set_page_dirty's __mark_inode_dirty)
* bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty)
* sb_lock (within inode_lock in fs/fs-writeback.c)
@@ -456,7 +458,7 @@ struct anon_vma *page_get_anon_vma(struct page *page)
unsigned long anon_mapping;
rcu_read_lock();
- anon_mapping = (unsigned long) ACCESS_ONCE(page->mapping);
+ anon_mapping = (unsigned long)READ_ONCE(page->mapping);
if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
goto out;
if (!page_mapped(page))
@@ -500,14 +502,14 @@ struct anon_vma *page_lock_anon_vma_read(struct page *page)
unsigned long anon_mapping;
rcu_read_lock();
- anon_mapping = (unsigned long) ACCESS_ONCE(page->mapping);
+ anon_mapping = (unsigned long)READ_ONCE(page->mapping);
if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
goto out;
if (!page_mapped(page))
goto out;
anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
- root_anon_vma = ACCESS_ONCE(anon_vma->root);
+ root_anon_vma = READ_ONCE(anon_vma->root);
if (down_read_trylock(&root_anon_vma->rwsem)) {
/*
* If the page is still mapped, then this anon_vma is still
@@ -625,7 +627,7 @@ pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address)
pmd = pmd_offset(pud, address);
/*
- * Some THP functions use the sequence pmdp_clear_flush(), set_pmd_at()
+ * Some THP functions use the sequence pmdp_huge_clear_flush(), set_pmd_at()
* without holding anon_vma lock for write. So when looking for a
* genuine pmde (in which to find pte), test present and !THP together.
*/
@@ -950,7 +952,12 @@ void page_move_anon_rmap(struct page *page,
VM_BUG_ON_PAGE(page->index != linear_page_index(vma, address), page);
anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
- page->mapping = (struct address_space *) anon_vma;
+ /*
+ * Ensure that anon_vma and the PAGE_MAPPING_ANON bit are written
+ * simultaneously, so a concurrent reader (eg page_referenced()'s
+ * PageAnon()) will not see one without the other.
+ */
+ WRITE_ONCE(page->mapping, (struct address_space *) anon_vma);
}
/**
diff --git a/mm/shmem.c b/mm/shmem.c
index cf2d0ca010bc..dbe0c1e8349c 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -31,7 +31,7 @@
#include <linux/mm.h>
#include <linux/export.h>
#include <linux/swap.h>
-#include <linux/aio.h>
+#include <linux/uio.h>
static struct vfsmount *shm_mnt;
@@ -544,7 +544,7 @@ EXPORT_SYMBOL_GPL(shmem_truncate_range);
static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
struct shmem_inode_info *info = SHMEM_I(inode);
int error;
@@ -569,7 +569,7 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
i_size_write(inode, newsize);
inode->i_ctime = inode->i_mtime = CURRENT_TIME;
}
- if (newsize < oldsize) {
+ if (newsize <= oldsize) {
loff_t holebegin = round_up(newsize, PAGE_SIZE);
unmap_mapping_range(inode->i_mapping, holebegin, 0, 1);
shmem_truncate_range(inode, newsize, (loff_t)-1);
@@ -2274,7 +2274,7 @@ static int shmem_create(struct inode *dir, struct dentry *dentry, umode_t mode,
*/
static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
{
- struct inode *inode = old_dentry->d_inode;
+ struct inode *inode = d_inode(old_dentry);
int ret;
/*
@@ -2298,7 +2298,7 @@ out:
static int shmem_unlink(struct inode *dir, struct dentry *dentry)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode))
shmem_free_inode(inode->i_sb);
@@ -2315,7 +2315,7 @@ static int shmem_rmdir(struct inode *dir, struct dentry *dentry)
if (!simple_empty(dentry))
return -ENOTEMPTY;
- drop_nlink(dentry->d_inode);
+ drop_nlink(d_inode(dentry));
drop_nlink(dir);
return shmem_unlink(dir, dentry);
}
@@ -2336,8 +2336,8 @@ static int shmem_exchange(struct inode *old_dir, struct dentry *old_dentry, stru
}
old_dir->i_ctime = old_dir->i_mtime =
new_dir->i_ctime = new_dir->i_mtime =
- old_dentry->d_inode->i_ctime =
- new_dentry->d_inode->i_ctime = CURRENT_TIME;
+ d_inode(old_dentry)->i_ctime =
+ d_inode(new_dentry)->i_ctime = CURRENT_TIME;
return 0;
}
@@ -2376,7 +2376,7 @@ static int shmem_whiteout(struct inode *old_dir, struct dentry *old_dentry)
*/
static int shmem_rename2(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags)
{
- struct inode *inode = old_dentry->d_inode;
+ struct inode *inode = d_inode(old_dentry);
int they_are_dirs = S_ISDIR(inode->i_mode);
if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
@@ -2396,10 +2396,10 @@ static int shmem_rename2(struct inode *old_dir, struct dentry *old_dentry, struc
return error;
}
- if (new_dentry->d_inode) {
+ if (d_really_is_positive(new_dentry)) {
(void) shmem_unlink(new_dir, new_dentry);
if (they_are_dirs) {
- drop_nlink(new_dentry->d_inode);
+ drop_nlink(d_inode(new_dentry));
drop_nlink(old_dir);
}
} else if (they_are_dirs) {
@@ -2451,6 +2451,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
return -ENOMEM;
}
inode->i_op = &shmem_short_symlink_operations;
+ inode->i_link = info->symlink;
} else {
error = shmem_getpage(inode, 0, &page, SGP_WRITE, NULL);
if (error) {
@@ -2474,30 +2475,23 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
return 0;
}
-static void *shmem_follow_short_symlink(struct dentry *dentry, struct nameidata *nd)
-{
- nd_set_link(nd, SHMEM_I(dentry->d_inode)->symlink);
- return NULL;
-}
-
-static void *shmem_follow_link(struct dentry *dentry, struct nameidata *nd)
+static const char *shmem_follow_link(struct dentry *dentry, void **cookie)
{
struct page *page = NULL;
- int error = shmem_getpage(dentry->d_inode, 0, &page, SGP_READ, NULL);
- nd_set_link(nd, error ? ERR_PTR(error) : kmap(page));
- if (page)
- unlock_page(page);
- return page;
+ int error = shmem_getpage(d_inode(dentry), 0, &page, SGP_READ, NULL);
+ if (error)
+ return ERR_PTR(error);
+ unlock_page(page);
+ *cookie = page;
+ return kmap(page);
}
-static void shmem_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie)
+static void shmem_put_link(struct inode *unused, void *cookie)
{
- if (!IS_ERR(nd_get_link(nd))) {
- struct page *page = cookie;
- kunmap(page);
- mark_page_accessed(page);
- page_cache_release(page);
- }
+ struct page *page = cookie;
+ kunmap(page);
+ mark_page_accessed(page);
+ page_cache_release(page);
}
#ifdef CONFIG_TMPFS_XATTR
@@ -2574,7 +2568,7 @@ static int shmem_xattr_validate(const char *name)
static ssize_t shmem_getxattr(struct dentry *dentry, const char *name,
void *buffer, size_t size)
{
- struct shmem_inode_info *info = SHMEM_I(dentry->d_inode);
+ struct shmem_inode_info *info = SHMEM_I(d_inode(dentry));
int err;
/*
@@ -2595,7 +2589,7 @@ static ssize_t shmem_getxattr(struct dentry *dentry, const char *name,
static int shmem_setxattr(struct dentry *dentry, const char *name,
const void *value, size_t size, int flags)
{
- struct shmem_inode_info *info = SHMEM_I(dentry->d_inode);
+ struct shmem_inode_info *info = SHMEM_I(d_inode(dentry));
int err;
/*
@@ -2615,7 +2609,7 @@ static int shmem_setxattr(struct dentry *dentry, const char *name,
static int shmem_removexattr(struct dentry *dentry, const char *name)
{
- struct shmem_inode_info *info = SHMEM_I(dentry->d_inode);
+ struct shmem_inode_info *info = SHMEM_I(d_inode(dentry));
int err;
/*
@@ -2635,14 +2629,14 @@ static int shmem_removexattr(struct dentry *dentry, const char *name)
static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size)
{
- struct shmem_inode_info *info = SHMEM_I(dentry->d_inode);
+ struct shmem_inode_info *info = SHMEM_I(d_inode(dentry));
return simple_xattr_list(&info->xattrs, buffer, size);
}
#endif /* CONFIG_TMPFS_XATTR */
static const struct inode_operations shmem_short_symlink_operations = {
.readlink = generic_readlink,
- .follow_link = shmem_follow_short_symlink,
+ .follow_link = simple_follow_link,
#ifdef CONFIG_TMPFS_XATTR
.setxattr = shmem_setxattr,
.getxattr = shmem_getxattr,
@@ -3118,8 +3112,6 @@ static const struct file_operations shmem_file_operations = {
.mmap = shmem_mmap,
#ifdef CONFIG_TMPFS
.llseek = shmem_file_llseek,
- .read = new_sync_read,
- .write = new_sync_write,
.read_iter = shmem_file_read_iter,
.write_iter = generic_file_write_iter,
.fsync = noop_fsync,
@@ -3371,8 +3363,8 @@ put_path:
* shmem_kernel_file_setup - get an unlinked file living in tmpfs which must be
* kernel internal. There will be NO LSM permission checks against the
* underlying inode. So users of this interface must do LSM checks at a
- * higher layer. The one user is the big_key implementation. LSM checks
- * are provided at the key level rather than the inode level.
+ * higher layer. The users are the big_key and shm implementations. LSM
+ * checks are provided at the key or shm level rather than the inode.
* @name: name for dentry (to be seen in /proc/<pid>/maps
* @size: size to be set for the file
* @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
@@ -3403,7 +3395,13 @@ int shmem_zero_setup(struct vm_area_struct *vma)
struct file *file;
loff_t size = vma->vm_end - vma->vm_start;
- file = shmem_file_setup("dev/zero", size, vma->vm_flags);
+ /*
+ * Cloning a new file under mmap_sem leads to a lock ordering conflict
+ * between XFS directory reading and selinux: since this file is only
+ * accessible to the user through its mapping, use S_PRIVATE flag to
+ * bypass file security, in the same way as shmem_kernel_file_setup().
+ */
+ file = __shmem_file_setup("dev/zero", size, vma->vm_flags, S_PRIVATE);
if (IS_ERR(file))
return PTR_ERR(file);
diff --git a/mm/slab.c b/mm/slab.c
index c4b89eaf4c96..200e22412a16 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -857,6 +857,11 @@ static inline void *____cache_alloc_node(struct kmem_cache *cachep,
return NULL;
}
+static inline gfp_t gfp_exact_node(gfp_t flags)
+{
+ return flags;
+}
+
#else /* CONFIG_NUMA */
static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int);
@@ -1023,6 +1028,15 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
return __cache_free_alien(cachep, objp, node, page_node);
}
+
+/*
+ * Construct gfp mask to allocate from a specific node but do not invoke reclaim
+ * or warn about failures.
+ */
+static inline gfp_t gfp_exact_node(gfp_t flags)
+{
+ return (flags | __GFP_THISNODE | __GFP_NOWARN) & ~__GFP_WAIT;
+}
#endif
/*
@@ -1440,6 +1454,7 @@ void __init kmem_cache_init(void)
kmalloc_caches[INDEX_NODE] = create_kmalloc_cache("kmalloc-node",
kmalloc_size(INDEX_NODE), ARCH_KMALLOC_FLAGS);
slab_state = PARTIAL_NODE;
+ setup_kmalloc_cache_index_table();
slab_early_init = 0;
@@ -2825,7 +2840,7 @@ alloc_done:
if (unlikely(!ac->avail)) {
int x;
force_grow:
- x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL);
+ x = cache_grow(cachep, gfp_exact_node(flags), node, NULL);
/* cache_grow can reenable interrupts, then ac could change. */
ac = cpu_cache_get(cachep);
@@ -3019,7 +3034,7 @@ retry:
get_node(cache, nid) &&
get_node(cache, nid)->free_objects) {
obj = ____cache_alloc_node(cache,
- flags | GFP_THISNODE, nid);
+ gfp_exact_node(flags), nid);
if (obj)
break;
}
@@ -3047,7 +3062,7 @@ retry:
nid = page_to_nid(page);
if (cache_grow(cache, flags, nid, page)) {
obj = ____cache_alloc_node(cache,
- flags | GFP_THISNODE, nid);
+ gfp_exact_node(flags), nid);
if (!obj)
/*
* Another processor may allocate the
@@ -3118,7 +3133,7 @@ retry:
must_grow:
spin_unlock(&n->list_lock);
- x = cache_grow(cachep, flags | GFP_THISNODE, nodeid, NULL);
+ x = cache_grow(cachep, gfp_exact_node(flags), nodeid, NULL);
if (x)
goto retry;
diff --git a/mm/slab.h b/mm/slab.h
index 4c3ac12dd644..8da63e4e470f 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -71,6 +71,7 @@ unsigned long calculate_alignment(unsigned long flags,
#ifndef CONFIG_SLOB
/* Kmalloc array related functions */
+void setup_kmalloc_cache_index_table(void);
void create_kmalloc_caches(unsigned long);
/* Find the kmalloc slab corresponding for a certain size */
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 999bb3424d44..86831105a09f 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -37,8 +37,7 @@ struct kmem_cache *kmem_cache;
SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE | \
SLAB_FAILSLAB)
-#define SLAB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \
- SLAB_CACHE_DMA | SLAB_NOTRACK)
+#define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | SLAB_NOTRACK)
/*
* Merge control. If this is set then no merging of slab caches will occur.
@@ -784,25 +783,45 @@ struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags)
}
/*
- * Create the kmalloc array. Some of the regular kmalloc arrays
- * may already have been created because they were needed to
- * enable allocations for slab creation.
+ * kmalloc_info[] is to make slub_debug=,kmalloc-xx option work at boot time.
+ * kmalloc_index() supports up to 2^26=64MB, so the final entry of the table is
+ * kmalloc-67108864.
*/
-void __init create_kmalloc_caches(unsigned long flags)
+static struct {
+ const char *name;
+ unsigned long size;
+} const kmalloc_info[] __initconst = {
+ {NULL, 0}, {"kmalloc-96", 96},
+ {"kmalloc-192", 192}, {"kmalloc-8", 8},
+ {"kmalloc-16", 16}, {"kmalloc-32", 32},
+ {"kmalloc-64", 64}, {"kmalloc-128", 128},
+ {"kmalloc-256", 256}, {"kmalloc-512", 512},
+ {"kmalloc-1024", 1024}, {"kmalloc-2048", 2048},
+ {"kmalloc-4096", 4096}, {"kmalloc-8192", 8192},
+ {"kmalloc-16384", 16384}, {"kmalloc-32768", 32768},
+ {"kmalloc-65536", 65536}, {"kmalloc-131072", 131072},
+ {"kmalloc-262144", 262144}, {"kmalloc-524288", 524288},
+ {"kmalloc-1048576", 1048576}, {"kmalloc-2097152", 2097152},
+ {"kmalloc-4194304", 4194304}, {"kmalloc-8388608", 8388608},
+ {"kmalloc-16777216", 16777216}, {"kmalloc-33554432", 33554432},
+ {"kmalloc-67108864", 67108864}
+};
+
+/*
+ * Patch up the size_index table if we have strange large alignment
+ * requirements for the kmalloc array. This is only the case for
+ * MIPS it seems. The standard arches will not generate any code here.
+ *
+ * Largest permitted alignment is 256 bytes due to the way we
+ * handle the index determination for the smaller caches.
+ *
+ * Make sure that nothing crazy happens if someone starts tinkering
+ * around with ARCH_KMALLOC_MINALIGN
+ */
+void __init setup_kmalloc_cache_index_table(void)
{
int i;
- /*
- * Patch up the size_index table if we have strange large alignment
- * requirements for the kmalloc array. This is only the case for
- * MIPS it seems. The standard arches will not generate any code here.
- *
- * Largest permitted alignment is 256 bytes due to the way we
- * handle the index determination for the smaller caches.
- *
- * Make sure that nothing crazy happens if someone starts tinkering
- * around with ARCH_KMALLOC_MINALIGN
- */
BUILD_BUG_ON(KMALLOC_MIN_SIZE > 256 ||
(KMALLOC_MIN_SIZE & (KMALLOC_MIN_SIZE - 1)));
@@ -833,11 +852,26 @@ void __init create_kmalloc_caches(unsigned long flags)
for (i = 128 + 8; i <= 192; i += 8)
size_index[size_index_elem(i)] = 8;
}
+}
+
+static void __init new_kmalloc_cache(int idx, unsigned long flags)
+{
+ kmalloc_caches[idx] = create_kmalloc_cache(kmalloc_info[idx].name,
+ kmalloc_info[idx].size, flags);
+}
+
+/*
+ * Create the kmalloc array. Some of the regular kmalloc arrays
+ * may already have been created because they were needed to
+ * enable allocations for slab creation.
+ */
+void __init create_kmalloc_caches(unsigned long flags)
+{
+ int i;
+
for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) {
- if (!kmalloc_caches[i]) {
- kmalloc_caches[i] = create_kmalloc_cache(NULL,
- 1 << i, flags);
- }
+ if (!kmalloc_caches[i])
+ new_kmalloc_cache(i, flags);
/*
* Caches that are not of the two-to-the-power-of size.
@@ -845,27 +879,14 @@ void __init create_kmalloc_caches(unsigned long flags)
* earlier power of two caches
*/
if (KMALLOC_MIN_SIZE <= 32 && !kmalloc_caches[1] && i == 6)
- kmalloc_caches[1] = create_kmalloc_cache(NULL, 96, flags);
-
+ new_kmalloc_cache(1, flags);
if (KMALLOC_MIN_SIZE <= 64 && !kmalloc_caches[2] && i == 7)
- kmalloc_caches[2] = create_kmalloc_cache(NULL, 192, flags);
+ new_kmalloc_cache(2, flags);
}
/* Kmalloc array is now usable */
slab_state = UP;
- for (i = 0; i <= KMALLOC_SHIFT_HIGH; i++) {
- struct kmem_cache *s = kmalloc_caches[i];
- char *n;
-
- if (s) {
- n = kasprintf(GFP_NOWAIT, "kmalloc-%d", kmalloc_size(i));
-
- BUG_ON(!n);
- s->name = n;
- }
- }
-
#ifdef CONFIG_ZONE_DMA
for (i = 0; i <= KMALLOC_SHIFT_HIGH; i++) {
struct kmem_cache *s = kmalloc_caches[i];
diff --git a/mm/slob.c b/mm/slob.c
index 94a7fede6d48..4765f65019c7 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -532,7 +532,7 @@ int __kmem_cache_create(struct kmem_cache *c, unsigned long flags)
return 0;
}
-void *slob_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
+static void *slob_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
{
void *b;
@@ -558,7 +558,6 @@ void *slob_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
kmemleak_alloc_recursive(b, c->size, 1, c->flags, flags);
return b;
}
-EXPORT_SYMBOL(slob_alloc_node);
void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
{
diff --git a/mm/slub.c b/mm/slub.c
index 82c473780c91..816df0016555 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -374,7 +374,7 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page
if (cmpxchg_double(&page->freelist, &page->counters,
freelist_old, counters_old,
freelist_new, counters_new))
- return 1;
+ return true;
} else
#endif
{
@@ -384,7 +384,7 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page
page->freelist = freelist_new;
set_page_slub_counters(page, counters_new);
slab_unlock(page);
- return 1;
+ return true;
}
slab_unlock(page);
}
@@ -396,7 +396,7 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page
pr_info("%s %s: cmpxchg double redo ", n, s->name);
#endif
- return 0;
+ return false;
}
static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
@@ -410,7 +410,7 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
if (cmpxchg_double(&page->freelist, &page->counters,
freelist_old, counters_old,
freelist_new, counters_new))
- return 1;
+ return true;
} else
#endif
{
@@ -424,7 +424,7 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
set_page_slub_counters(page, counters_new);
slab_unlock(page);
local_irq_restore(flags);
- return 1;
+ return true;
}
slab_unlock(page);
local_irq_restore(flags);
@@ -437,7 +437,7 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
pr_info("%s %s: cmpxchg double redo ", n, s->name);
#endif
- return 0;
+ return false;
}
#ifdef CONFIG_SLUB_DEBUG
@@ -1137,15 +1137,6 @@ static int __init setup_slub_debug(char *str)
*/
goto check_slabs;
- if (tolower(*str) == 'o') {
- /*
- * Avoid enabling debugging on caches if its minimum order
- * would increase as a result.
- */
- disable_higher_order_debug = 1;
- goto out;
- }
-
slub_debug = 0;
if (*str == '-')
/*
@@ -1176,6 +1167,13 @@ static int __init setup_slub_debug(char *str)
case 'a':
slub_debug |= SLAB_FAILSLAB;
break;
+ case 'o':
+ /*
+ * Avoid enabling debugging on caches if its minimum
+ * order would increase as a result.
+ */
+ disable_higher_order_debug = 1;
+ break;
default:
pr_err("slub_debug option '%c' unknown. skipped\n",
*str);
@@ -3702,6 +3700,7 @@ void __init kmem_cache_init(void)
kmem_cache_node = bootstrap(&boot_kmem_cache_node);
/* Now we can use the kmem_cache to allocate kmalloc slabs */
+ setup_kmalloc_cache_index_table();
create_kmalloc_caches(0);
#ifdef CONFIG_SMP
@@ -4279,7 +4278,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
int node;
struct page *page;
- page = ACCESS_ONCE(c->page);
+ page = READ_ONCE(c->page);
if (!page)
continue;
@@ -4294,7 +4293,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
total += x;
nodes[node] += x;
- page = ACCESS_ONCE(c->partial);
+ page = READ_ONCE(c->partial);
if (page) {
node = page_to_nid(page);
if (flags & SO_TOTAL)
diff --git a/mm/swap.c b/mm/swap.c
index cd3a5e64cea9..a3a0a2f1f7c3 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -31,6 +31,7 @@
#include <linux/memcontrol.h>
#include <linux/gfp.h>
#include <linux/uio.h>
+#include <linux/hugetlb.h>
#include "internal.h"
@@ -42,7 +43,7 @@ int page_cluster;
static DEFINE_PER_CPU(struct pagevec, lru_add_pvec);
static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs);
-static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs);
+static DEFINE_PER_CPU(struct pagevec, lru_deactivate_file_pvecs);
/*
* This path almost never happens for VM activity - pages are normally
@@ -75,7 +76,14 @@ static void __put_compound_page(struct page *page)
{
compound_page_dtor *dtor;
- __page_cache_release(page);
+ /*
+ * __page_cache_release() is supposed to be called for thp, not for
+ * hugetlb. This is because hugetlb page does never have PageLRU set
+ * (it's never listed to any LRU lists) and no memcg routines should
+ * be called for hugetlb (it has a separate hugetlb_cgroup.)
+ */
+ if (!PageHuge(page))
+ __page_cache_release(page);
dtor = get_compound_page_dtor(page);
(*dtor)(page);
}
@@ -123,7 +131,6 @@ void put_unrefcounted_compound_page(struct page *page_head, struct page *page)
* here, see the comment above this function.
*/
VM_BUG_ON_PAGE(!PageHead(page_head), page_head);
- VM_BUG_ON_PAGE(page_mapcount(page) != 0, page);
if (put_page_testzero(page_head)) {
/*
* If this is the tail of a slab THP page,
@@ -743,7 +750,7 @@ void lru_cache_add_active_or_unevictable(struct page *page,
* be write it out by flusher threads as this is much more effective
* than the single-page writeout from reclaim.
*/
-static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec,
+static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec,
void *arg)
{
int lru, file;
@@ -811,36 +818,36 @@ void lru_add_drain_cpu(int cpu)
local_irq_restore(flags);
}
- pvec = &per_cpu(lru_deactivate_pvecs, cpu);
+ pvec = &per_cpu(lru_deactivate_file_pvecs, cpu);
if (pagevec_count(pvec))
- pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
+ pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL);
activate_page_drain(cpu);
}
/**
- * deactivate_page - forcefully deactivate a page
+ * deactivate_file_page - forcefully deactivate a file page
* @page: page to deactivate
*
* This function hints the VM that @page is a good reclaim candidate,
* for example if its invalidation fails due to the page being dirty
* or under writeback.
*/
-void deactivate_page(struct page *page)
+void deactivate_file_page(struct page *page)
{
/*
- * In a workload with many unevictable page such as mprotect, unevictable
- * page deactivation for accelerating reclaim is pointless.
+ * In a workload with many unevictable page such as mprotect,
+ * unevictable page deactivation for accelerating reclaim is pointless.
*/
if (PageUnevictable(page))
return;
if (likely(get_page_unless_zero(page))) {
- struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs);
+ struct pagevec *pvec = &get_cpu_var(lru_deactivate_file_pvecs);
if (!pagevec_add(pvec, page))
- pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
- put_cpu_var(lru_deactivate_pvecs);
+ pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL);
+ put_cpu_var(lru_deactivate_file_pvecs);
}
}
@@ -872,7 +879,7 @@ void lru_add_drain_all(void)
if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) ||
pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) ||
- pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) ||
+ pagevec_count(&per_cpu(lru_deactivate_file_pvecs, cpu)) ||
need_activate_page_drain(cpu)) {
INIT_WORK(work, lru_add_drain_per_cpu);
schedule_work_on(cpu, work);
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 405923f77334..8bc8e66138da 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -390,7 +390,7 @@ static unsigned long swapin_nr_pages(unsigned long offset)
unsigned int pages, max_pages, last_ra;
static atomic_t last_readahead_pages;
- max_pages = 1 << ACCESS_ONCE(page_cluster);
+ max_pages = 1 << READ_ONCE(page_cluster);
if (max_pages <= 1)
return 1;
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 63f55ccb9b26..41e4581af7c5 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1312,7 +1312,7 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si,
else
continue;
}
- count = ACCESS_ONCE(si->swap_map[i]);
+ count = READ_ONCE(si->swap_map[i]);
if (count && swap_count(count) != SWAP_MAP_BAD)
break;
}
@@ -2032,7 +2032,7 @@ static int swap_show(struct seq_file *swap, void *v)
}
file = si->swap_file;
- len = seq_path(swap, &file->f_path, " \t\n\\");
+ len = seq_file_path(swap, file, " \t\n\\");
seq_printf(swap, "%*s%s\t%u\t%u\t%d\n",
len < 40 ? 40 - len : 1, " ",
S_ISBLK(file_inode(file)->i_mode) ?
diff --git a/mm/truncate.c b/mm/truncate.c
index ddec5a5966d7..76e35ad97102 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -93,35 +93,6 @@ void do_invalidatepage(struct page *page, unsigned int offset,
}
/*
- * This cancels just the dirty bit on the kernel page itself, it
- * does NOT actually remove dirty bits on any mmap's that may be
- * around. It also leaves the page tagged dirty, so any sync
- * activity will still find it on the dirty lists, and in particular,
- * clear_page_dirty_for_io() will still look at the dirty bits in
- * the VM.
- *
- * Doing this should *normally* only ever be done when a page
- * is truncated, and is not actually mapped anywhere at all. However,
- * fs/buffer.c does this when it notices that somebody has cleaned
- * out all the buffers on a page without actually doing it through
- * the VM. Can you say "ext3 is horribly ugly"? Tought you could.
- */
-void cancel_dirty_page(struct page *page, unsigned int account_size)
-{
- if (TestClearPageDirty(page)) {
- struct address_space *mapping = page->mapping;
- if (mapping && mapping_cap_account_dirty(mapping)) {
- dec_zone_page_state(page, NR_FILE_DIRTY);
- dec_bdi_stat(inode_to_bdi(mapping->host),
- BDI_RECLAIMABLE);
- if (account_size)
- task_io_account_cancelled_write(account_size);
- }
- }
-}
-EXPORT_SYMBOL(cancel_dirty_page);
-
-/*
* If truncate cannot remove the fs-private metadata from the page, the page
* becomes orphaned. It will be left on the LRU and may even be mapped into
* user pagetables if we're racing with filemap_fault().
@@ -140,8 +111,12 @@ truncate_complete_page(struct address_space *mapping, struct page *page)
if (page_has_private(page))
do_invalidatepage(page, 0, PAGE_CACHE_SIZE);
- cancel_dirty_page(page, PAGE_CACHE_SIZE);
-
+ /*
+ * Some filesystems seem to re-dirty the page even after
+ * the VM has canceled the dirty bit (eg ext3 journaling).
+ * Hence dirty accounting check is placed after invalidation.
+ */
+ cancel_dirty_page(page);
ClearPageMappedToDisk(page);
delete_from_page_cache(page);
return 0;
@@ -513,7 +488,7 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
* of interest and try to speed up its reclaim.
*/
if (!ret)
- deactivate_page(page);
+ deactivate_file_page(page);
count += ret;
}
pagevec_remove_exceptionals(&pvec);
@@ -535,19 +510,24 @@ EXPORT_SYMBOL(invalidate_mapping_pages);
static int
invalidate_complete_page2(struct address_space *mapping, struct page *page)
{
+ struct mem_cgroup *memcg;
+ unsigned long flags;
+
if (page->mapping != mapping)
return 0;
if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL))
return 0;
- spin_lock_irq(&mapping->tree_lock);
+ memcg = mem_cgroup_begin_page_stat(page);
+ spin_lock_irqsave(&mapping->tree_lock, flags);
if (PageDirty(page))
goto failed;
BUG_ON(page_has_private(page));
- __delete_from_page_cache(page, NULL);
- spin_unlock_irq(&mapping->tree_lock);
+ __delete_from_page_cache(page, NULL, memcg);
+ spin_unlock_irqrestore(&mapping->tree_lock, flags);
+ mem_cgroup_end_page_stat(memcg);
if (mapping->a_ops->freepage)
mapping->a_ops->freepage(page);
@@ -555,7 +535,8 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page)
page_cache_release(page); /* pagecache ref */
return 1;
failed:
- spin_unlock_irq(&mapping->tree_lock);
+ spin_unlock_irqrestore(&mapping->tree_lock, flags);
+ mem_cgroup_end_page_stat(memcg);
return 0;
}
diff --git a/mm/util.c b/mm/util.c
index 3981ae9d1b15..68ff8a5361e7 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -325,9 +325,37 @@ void kvfree(const void *addr)
}
EXPORT_SYMBOL(kvfree);
+static inline void *__page_rmapping(struct page *page)
+{
+ unsigned long mapping;
+
+ mapping = (unsigned long)page->mapping;
+ mapping &= ~PAGE_MAPPING_FLAGS;
+
+ return (void *)mapping;
+}
+
+/* Neutral page->mapping pointer to address_space or anon_vma or other */
+void *page_rmapping(struct page *page)
+{
+ page = compound_head(page);
+ return __page_rmapping(page);
+}
+
+struct anon_vma *page_anon_vma(struct page *page)
+{
+ unsigned long mapping;
+
+ page = compound_head(page);
+ mapping = (unsigned long)page->mapping;
+ if ((mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
+ return NULL;
+ return __page_rmapping(page);
+}
+
struct address_space *page_mapping(struct page *page)
{
- struct address_space *mapping = page->mapping;
+ unsigned long mapping;
/* This happens if someone calls flush_dcache_page on slab page */
if (unlikely(PageSlab(page)))
@@ -337,10 +365,13 @@ struct address_space *page_mapping(struct page *page)
swp_entry_t entry;
entry.val = page_private(page);
- mapping = swap_address_space(entry);
- } else if ((unsigned long)mapping & PAGE_MAPPING_ANON)
- mapping = NULL;
- return mapping;
+ return swap_address_space(entry);
+ }
+
+ mapping = (unsigned long)page->mapping;
+ if (mapping & PAGE_MAPPING_FLAGS)
+ return NULL;
+ return page->mapping;
}
int overcommit_ratio_handler(struct ctl_table *table, int write,
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 49abccf29a29..2faaa2976447 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -29,6 +29,7 @@
#include <linux/atomic.h>
#include <linux/compiler.h>
#include <linux/llist.h>
+#include <linux/bitops.h>
#include <asm/uaccess.h>
#include <asm/tlbflush.h>
@@ -74,6 +75,8 @@ static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end)
pmd = pmd_offset(pud, addr);
do {
next = pmd_addr_end(addr, end);
+ if (pmd_clear_huge(pmd))
+ continue;
if (pmd_none_or_clear_bad(pmd))
continue;
vunmap_pte_range(pmd, addr, next);
@@ -88,6 +91,8 @@ static void vunmap_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end)
pud = pud_offset(pgd, addr);
do {
next = pud_addr_end(addr, end);
+ if (pud_clear_huge(pud))
+ continue;
if (pud_none_or_clear_bad(pud))
continue;
vunmap_pmd_range(pud, addr, next);
@@ -760,7 +765,7 @@ struct vmap_block {
spinlock_t lock;
struct vmap_area *va;
unsigned long free, dirty;
- DECLARE_BITMAP(dirty_map, VMAP_BBMAP_BITS);
+ unsigned long dirty_min, dirty_max; /*< dirty range */
struct list_head free_list;
struct rcu_head rcu_head;
struct list_head purge;
@@ -791,13 +796,31 @@ static unsigned long addr_to_vb_idx(unsigned long addr)
return addr;
}
-static struct vmap_block *new_vmap_block(gfp_t gfp_mask)
+static void *vmap_block_vaddr(unsigned long va_start, unsigned long pages_off)
+{
+ unsigned long addr;
+
+ addr = va_start + (pages_off << PAGE_SHIFT);
+ BUG_ON(addr_to_vb_idx(addr) != addr_to_vb_idx(va_start));
+ return (void *)addr;
+}
+
+/**
+ * new_vmap_block - allocates new vmap_block and occupies 2^order pages in this
+ * block. Of course pages number can't exceed VMAP_BBMAP_BITS
+ * @order: how many 2^order pages should be occupied in newly allocated block
+ * @gfp_mask: flags for the page level allocator
+ *
+ * Returns: virtual address in a newly allocated block or ERR_PTR(-errno)
+ */
+static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
{
struct vmap_block_queue *vbq;
struct vmap_block *vb;
struct vmap_area *va;
unsigned long vb_idx;
int node, err;
+ void *vaddr;
node = numa_node_id();
@@ -821,11 +844,15 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask)
return ERR_PTR(err);
}
+ vaddr = vmap_block_vaddr(va->va_start, 0);
spin_lock_init(&vb->lock);
vb->va = va;
- vb->free = VMAP_BBMAP_BITS;
+ /* At least something should be left free */
+ BUG_ON(VMAP_BBMAP_BITS <= (1UL << order));
+ vb->free = VMAP_BBMAP_BITS - (1UL << order);
vb->dirty = 0;
- bitmap_zero(vb->dirty_map, VMAP_BBMAP_BITS);
+ vb->dirty_min = VMAP_BBMAP_BITS;
+ vb->dirty_max = 0;
INIT_LIST_HEAD(&vb->free_list);
vb_idx = addr_to_vb_idx(va->va_start);
@@ -837,11 +864,11 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask)
vbq = &get_cpu_var(vmap_block_queue);
spin_lock(&vbq->lock);
- list_add_rcu(&vb->free_list, &vbq->free);
+ list_add_tail_rcu(&vb->free_list, &vbq->free);
spin_unlock(&vbq->lock);
put_cpu_var(vmap_block_queue);
- return vb;
+ return vaddr;
}
static void free_vmap_block(struct vmap_block *vb)
@@ -876,7 +903,8 @@ static void purge_fragmented_blocks(int cpu)
if (vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS) {
vb->free = 0; /* prevent further allocs after releasing lock */
vb->dirty = VMAP_BBMAP_BITS; /* prevent purging it again */
- bitmap_fill(vb->dirty_map, VMAP_BBMAP_BITS);
+ vb->dirty_min = 0;
+ vb->dirty_max = VMAP_BBMAP_BITS;
spin_lock(&vbq->lock);
list_del_rcu(&vb->free_list);
spin_unlock(&vbq->lock);
@@ -905,7 +933,7 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
{
struct vmap_block_queue *vbq;
struct vmap_block *vb;
- unsigned long addr = 0;
+ void *vaddr = NULL;
unsigned int order;
BUG_ON(size & ~PAGE_MASK);
@@ -920,43 +948,38 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
}
order = get_order(size);
-again:
rcu_read_lock();
vbq = &get_cpu_var(vmap_block_queue);
list_for_each_entry_rcu(vb, &vbq->free, free_list) {
- int i;
+ unsigned long pages_off;
spin_lock(&vb->lock);
- if (vb->free < 1UL << order)
- goto next;
+ if (vb->free < (1UL << order)) {
+ spin_unlock(&vb->lock);
+ continue;
+ }
- i = VMAP_BBMAP_BITS - vb->free;
- addr = vb->va->va_start + (i << PAGE_SHIFT);
- BUG_ON(addr_to_vb_idx(addr) !=
- addr_to_vb_idx(vb->va->va_start));
+ pages_off = VMAP_BBMAP_BITS - vb->free;
+ vaddr = vmap_block_vaddr(vb->va->va_start, pages_off);
vb->free -= 1UL << order;
if (vb->free == 0) {
spin_lock(&vbq->lock);
list_del_rcu(&vb->free_list);
spin_unlock(&vbq->lock);
}
+
spin_unlock(&vb->lock);
break;
-next:
- spin_unlock(&vb->lock);
}
put_cpu_var(vmap_block_queue);
rcu_read_unlock();
- if (!addr) {
- vb = new_vmap_block(gfp_mask);
- if (IS_ERR(vb))
- return vb;
- goto again;
- }
+ /* Allocate new block if nothing was found */
+ if (!vaddr)
+ vaddr = new_vmap_block(order, gfp_mask);
- return (void *)addr;
+ return vaddr;
}
static void vb_free(const void *addr, unsigned long size)
@@ -974,6 +997,7 @@ static void vb_free(const void *addr, unsigned long size)
order = get_order(size);
offset = (unsigned long)addr & (VMAP_BLOCK_SIZE - 1);
+ offset >>= PAGE_SHIFT;
vb_idx = addr_to_vb_idx((unsigned long)addr);
rcu_read_lock();
@@ -984,7 +1008,10 @@ static void vb_free(const void *addr, unsigned long size)
vunmap_page_range((unsigned long)addr, (unsigned long)addr + size);
spin_lock(&vb->lock);
- BUG_ON(bitmap_allocate_region(vb->dirty_map, offset >> PAGE_SHIFT, order));
+
+ /* Expand dirty range */
+ vb->dirty_min = min(vb->dirty_min, offset);
+ vb->dirty_max = max(vb->dirty_max, offset + (1UL << order));
vb->dirty += 1UL << order;
if (vb->dirty == VMAP_BBMAP_BITS) {
@@ -1023,25 +1050,18 @@ void vm_unmap_aliases(void)
rcu_read_lock();
list_for_each_entry_rcu(vb, &vbq->free, free_list) {
- int i, j;
-
spin_lock(&vb->lock);
- i = find_first_bit(vb->dirty_map, VMAP_BBMAP_BITS);
- if (i < VMAP_BBMAP_BITS) {
+ if (vb->dirty) {
+ unsigned long va_start = vb->va->va_start;
unsigned long s, e;
- j = find_last_bit(vb->dirty_map,
- VMAP_BBMAP_BITS);
- j = j + 1; /* need exclusive index */
+ s = va_start + (vb->dirty_min << PAGE_SHIFT);
+ e = va_start + (vb->dirty_max << PAGE_SHIFT);
- s = vb->va->va_start + (i << PAGE_SHIFT);
- e = vb->va->va_start + (j << PAGE_SHIFT);
- flush = 1;
+ start = min(s, start);
+ end = max(e, end);
- if (s < start)
- start = s;
- if (e > end)
- end = e;
+ flush = 1;
}
spin_unlock(&vb->lock);
}
@@ -1314,7 +1334,8 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
BUG_ON(in_interrupt());
if (flags & VM_IOREMAP)
- align = 1ul << clamp(fls(size), PAGE_SHIFT, IOREMAP_MAX_ORDER);
+ align = 1ul << clamp_t(int, fls_long(size),
+ PAGE_SHIFT, IOREMAP_MAX_ORDER);
size = PAGE_ALIGN(size);
if (unlikely(!size))
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 5e8eadd71bac..8286938c70de 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -154,11 +154,42 @@ static bool global_reclaim(struct scan_control *sc)
{
return !sc->target_mem_cgroup;
}
+
+/**
+ * sane_reclaim - is the usual dirty throttling mechanism operational?
+ * @sc: scan_control in question
+ *
+ * The normal page dirty throttling mechanism in balance_dirty_pages() is
+ * completely broken with the legacy memcg and direct stalling in
+ * shrink_page_list() is used for throttling instead, which lacks all the
+ * niceties such as fairness, adaptive pausing, bandwidth proportional
+ * allocation and configurability.
+ *
+ * This function tests whether the vmscan currently in progress can assume
+ * that the normal dirty throttling mechanism is operational.
+ */
+static bool sane_reclaim(struct scan_control *sc)
+{
+ struct mem_cgroup *memcg = sc->target_mem_cgroup;
+
+ if (!memcg)
+ return true;
+#ifdef CONFIG_CGROUP_WRITEBACK
+ if (cgroup_on_dfl(mem_cgroup_css(memcg)->cgroup))
+ return true;
+#endif
+ return false;
+}
#else
static bool global_reclaim(struct scan_control *sc)
{
return true;
}
+
+static bool sane_reclaim(struct scan_control *sc)
+{
+ return true;
+}
#endif
static unsigned long zone_reclaimable_pages(struct zone *zone)
@@ -452,14 +483,13 @@ static inline int is_page_cache_freeable(struct page *page)
return page_count(page) - page_has_private(page) == 2;
}
-static int may_write_to_queue(struct backing_dev_info *bdi,
- struct scan_control *sc)
+static int may_write_to_inode(struct inode *inode, struct scan_control *sc)
{
if (current->flags & PF_SWAPWRITE)
return 1;
- if (!bdi_write_congested(bdi))
+ if (!inode_write_congested(inode))
return 1;
- if (bdi == current->backing_dev_info)
+ if (inode_to_bdi(inode) == current->backing_dev_info)
return 1;
return 0;
}
@@ -538,7 +568,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
}
if (mapping->a_ops->writepage == NULL)
return PAGE_ACTIVATE;
- if (!may_write_to_queue(inode_to_bdi(mapping->host), sc))
+ if (!may_write_to_inode(mapping->host, sc))
return PAGE_KEEP;
if (clear_page_dirty_for_io(page)) {
@@ -579,10 +609,14 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
static int __remove_mapping(struct address_space *mapping, struct page *page,
bool reclaimed)
{
+ unsigned long flags;
+ struct mem_cgroup *memcg;
+
BUG_ON(!PageLocked(page));
BUG_ON(mapping != page_mapping(page));
- spin_lock_irq(&mapping->tree_lock);
+ memcg = mem_cgroup_begin_page_stat(page);
+ spin_lock_irqsave(&mapping->tree_lock, flags);
/*
* The non racy check for a busy page.
*
@@ -620,7 +654,8 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
swp_entry_t swap = { .val = page_private(page) };
mem_cgroup_swapout(page, swap);
__delete_from_swap_cache(page);
- spin_unlock_irq(&mapping->tree_lock);
+ spin_unlock_irqrestore(&mapping->tree_lock, flags);
+ mem_cgroup_end_page_stat(memcg);
swapcache_free(swap);
} else {
void (*freepage)(struct page *);
@@ -640,8 +675,9 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
if (reclaimed && page_is_file_cache(page) &&
!mapping_exiting(mapping))
shadow = workingset_eviction(mapping, page);
- __delete_from_page_cache(page, shadow);
- spin_unlock_irq(&mapping->tree_lock);
+ __delete_from_page_cache(page, shadow, memcg);
+ spin_unlock_irqrestore(&mapping->tree_lock, flags);
+ mem_cgroup_end_page_stat(memcg);
if (freepage != NULL)
freepage(page);
@@ -650,7 +686,8 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
return 1;
cannot_free:
- spin_unlock_irq(&mapping->tree_lock);
+ spin_unlock_irqrestore(&mapping->tree_lock, flags);
+ mem_cgroup_end_page_stat(memcg);
return 0;
}
@@ -917,7 +954,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
*/
mapping = page_mapping(page);
if (((dirty || writeback) && mapping &&
- bdi_write_congested(inode_to_bdi(mapping->host))) ||
+ inode_write_congested(mapping->host)) ||
(writeback && PageReclaim(page)))
nr_congested++;
@@ -935,24 +972,20 @@ static unsigned long shrink_page_list(struct list_head *page_list,
* note that the LRU is being scanned too quickly and the
* caller can stall after page list has been processed.
*
- * 2) Global reclaim encounters a page, memcg encounters a
- * page that is not marked for immediate reclaim or
- * the caller does not have __GFP_IO. In this case mark
- * the page for immediate reclaim and continue scanning.
+ * 2) Global or new memcg reclaim encounters a page that is
+ * not marked for immediate reclaim, or the caller does not
+ * have __GFP_FS (or __GFP_IO if it's simply going to swap,
+ * not to fs). In this case mark the page for immediate
+ * reclaim and continue scanning.
*
- * __GFP_IO is checked because a loop driver thread might
+ * Require may_enter_fs because we would wait on fs, which
+ * may not have submitted IO yet. And the loop driver might
* enter reclaim, and deadlock if it waits on a page for
* which it is needed to do the write (loop masks off
* __GFP_IO|__GFP_FS for this reason); but more thought
* would probably show more reasons.
*
- * Don't require __GFP_FS, since we're not going into the
- * FS, just waiting on its writeback completion. Worryingly,
- * ext4 gfs2 and xfs allocate pages with
- * grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so testing
- * may_enter_fs here is liable to OOM on them.
- *
- * 3) memcg encounters a page that is not already marked
+ * 3) Legacy memcg encounters a page that is not already marked
* PageReclaim. memcg does not have any dirty pages
* throttling so we could easily OOM just because too many
* pages are in writeback and there is nothing else to
@@ -967,8 +1000,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
goto keep_locked;
/* Case 2 above */
- } else if (global_reclaim(sc) ||
- !PageReclaim(page) || !(sc->gfp_mask & __GFP_IO)) {
+ } else if (sane_reclaim(sc) ||
+ !PageReclaim(page) || !may_enter_fs) {
/*
* This is slightly racy - end_page_writeback()
* might have just cleared PageReclaim, then
@@ -1416,7 +1449,7 @@ static int too_many_isolated(struct zone *zone, int file,
if (current_is_kswapd())
return 0;
- if (!global_reclaim(sc))
+ if (!sane_reclaim(sc))
return 0;
if (file) {
@@ -1608,10 +1641,10 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
set_bit(ZONE_WRITEBACK, &zone->flags);
/*
- * memcg will stall in page writeback so only consider forcibly
- * stalling for global reclaim
+ * Legacy memcg will stall in page writeback so avoid forcibly
+ * stalling here.
*/
- if (global_reclaim(sc)) {
+ if (sane_reclaim(sc)) {
/*
* Tag a zone as congested if all the dirty pages scanned were
* backed by a congested BDI and wait_iff_congested will stall.
@@ -2646,7 +2679,8 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
for (i = 0; i <= ZONE_NORMAL; i++) {
zone = &pgdat->node_zones[i];
- if (!populated_zone(zone))
+ if (!populated_zone(zone) ||
+ zone_reclaimable_pages(zone) == 0)
continue;
pfmemalloc_reserve += min_wmark_pages(zone);
@@ -3596,7 +3630,7 @@ int zone_reclaim_mode __read_mostly;
#define RECLAIM_OFF 0
#define RECLAIM_ZONE (1<<0) /* Run shrink_inactive_list on the zone */
#define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */
-#define RECLAIM_SWAP (1<<2) /* Swap pages out during reclaim */
+#define RECLAIM_UNMAP (1<<2) /* Unmap pages during reclaim */
/*
* Priority for ZONE_RECLAIM. This determines the fraction of pages
@@ -3638,12 +3672,12 @@ static long zone_pagecache_reclaimable(struct zone *zone)
long delta = 0;
/*
- * If RECLAIM_SWAP is set, then all file pages are considered
+ * If RECLAIM_UNMAP is set, then all file pages are considered
* potentially reclaimable. Otherwise, we have to worry about
* pages like swapcache and zone_unmapped_file_pages() provides
* a better estimate
*/
- if (zone_reclaim_mode & RECLAIM_SWAP)
+ if (zone_reclaim_mode & RECLAIM_UNMAP)
nr_pagecache_reclaimable = zone_page_state(zone, NR_FILE_PAGES);
else
nr_pagecache_reclaimable = zone_unmapped_file_pages(zone);
@@ -3674,15 +3708,15 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
.order = order,
.priority = ZONE_RECLAIM_PRIORITY,
.may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
- .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP),
+ .may_unmap = !!(zone_reclaim_mode & RECLAIM_UNMAP),
.may_swap = 1,
};
cond_resched();
/*
- * We need to be able to allocate from the reserves for RECLAIM_SWAP
+ * We need to be able to allocate from the reserves for RECLAIM_UNMAP
* and we also need to be able to write out pages for RECLAIM_WRITE
- * and RECLAIM_SWAP.
+ * and RECLAIM_UNMAP.
*/
p->flags |= PF_MEMALLOC | PF_SWAPWRITE;
lockdep_set_current_reclaim_state(gfp_mask);
diff --git a/mm/zbud.c b/mm/zbud.c
index 2ee4e4520493..f3bf6f7627d8 100644
--- a/mm/zbud.c
+++ b/mm/zbud.c
@@ -97,6 +97,10 @@ struct zbud_pool {
struct list_head lru;
u64 pages_nr;
struct zbud_ops *ops;
+#ifdef CONFIG_ZPOOL
+ struct zpool *zpool;
+ struct zpool_ops *zpool_ops;
+#endif
};
/*
@@ -123,7 +127,10 @@ struct zbud_header {
static int zbud_zpool_evict(struct zbud_pool *pool, unsigned long handle)
{
- return zpool_evict(pool, handle);
+ if (pool->zpool && pool->zpool_ops && pool->zpool_ops->evict)
+ return pool->zpool_ops->evict(pool->zpool, handle);
+ else
+ return -ENOENT;
}
static struct zbud_ops zbud_zpool_ops = {
@@ -131,9 +138,17 @@ static struct zbud_ops zbud_zpool_ops = {
};
static void *zbud_zpool_create(char *name, gfp_t gfp,
- struct zpool_ops *zpool_ops)
+ struct zpool_ops *zpool_ops,
+ struct zpool *zpool)
{
- return zbud_create_pool(gfp, zpool_ops ? &zbud_zpool_ops : NULL);
+ struct zbud_pool *pool;
+
+ pool = zbud_create_pool(gfp, zpool_ops ? &zbud_zpool_ops : NULL);
+ if (pool) {
+ pool->zpool = zpool;
+ pool->zpool_ops = zpool_ops;
+ }
+ return pool;
}
static void zbud_zpool_destroy(void *pool)
@@ -292,7 +307,7 @@ struct zbud_pool *zbud_create_pool(gfp_t gfp, struct zbud_ops *ops)
struct zbud_pool *pool;
int i;
- pool = kmalloc(sizeof(struct zbud_pool), gfp);
+ pool = kzalloc(sizeof(struct zbud_pool), gfp);
if (!pool)
return NULL;
spin_lock_init(&pool->lock);
diff --git a/mm/zpool.c b/mm/zpool.c
index bacdab6e47de..722a4f60e90b 100644
--- a/mm/zpool.c
+++ b/mm/zpool.c
@@ -73,33 +73,6 @@ int zpool_unregister_driver(struct zpool_driver *driver)
}
EXPORT_SYMBOL(zpool_unregister_driver);
-/**
- * zpool_evict() - evict callback from a zpool implementation.
- * @pool: pool to evict from.
- * @handle: handle to evict.
- *
- * This can be used by zpool implementations to call the
- * user's evict zpool_ops struct evict callback.
- */
-int zpool_evict(void *pool, unsigned long handle)
-{
- struct zpool *zpool;
-
- spin_lock(&pools_lock);
- list_for_each_entry(zpool, &pools_head, list) {
- if (zpool->pool == pool) {
- spin_unlock(&pools_lock);
- if (!zpool->ops || !zpool->ops->evict)
- return -EINVAL;
- return zpool->ops->evict(zpool, handle);
- }
- }
- spin_unlock(&pools_lock);
-
- return -ENOENT;
-}
-EXPORT_SYMBOL(zpool_evict);
-
static struct zpool_driver *zpool_get_driver(char *type)
{
struct zpool_driver *driver;
@@ -147,7 +120,7 @@ struct zpool *zpool_create_pool(char *type, char *name, gfp_t gfp,
struct zpool_driver *driver;
struct zpool *zpool;
- pr_info("creating pool type %s\n", type);
+ pr_debug("creating pool type %s\n", type);
driver = zpool_get_driver(type);
@@ -170,7 +143,7 @@ struct zpool *zpool_create_pool(char *type, char *name, gfp_t gfp,
zpool->type = driver->type;
zpool->driver = driver;
- zpool->pool = driver->create(name, gfp, ops);
+ zpool->pool = driver->create(name, gfp, ops, zpool);
zpool->ops = ops;
if (!zpool->pool) {
@@ -180,7 +153,7 @@ struct zpool *zpool_create_pool(char *type, char *name, gfp_t gfp,
return NULL;
}
- pr_info("created %s pool\n", type);
+ pr_debug("created pool type %s\n", type);
spin_lock(&pools_lock);
list_add(&zpool->list, &pools_head);
@@ -202,7 +175,7 @@ struct zpool *zpool_create_pool(char *type, char *name, gfp_t gfp,
*/
void zpool_destroy_pool(struct zpool *zpool)
{
- pr_info("destroying pool type %s\n", zpool->type);
+ pr_debug("destroying pool type %s\n", zpool->type);
spin_lock(&pools_lock);
list_del(&zpool->list);
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 0dec1fa5f656..0a7f81aa2249 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -12,35 +12,6 @@
*/
/*
- * This allocator is designed for use with zram. Thus, the allocator is
- * supposed to work well under low memory conditions. In particular, it
- * never attempts higher order page allocation which is very likely to
- * fail under memory pressure. On the other hand, if we just use single
- * (0-order) pages, it would suffer from very high fragmentation --
- * any object of size PAGE_SIZE/2 or larger would occupy an entire page.
- * This was one of the major issues with its predecessor (xvmalloc).
- *
- * To overcome these issues, zsmalloc allocates a bunch of 0-order pages
- * and links them together using various 'struct page' fields. These linked
- * pages act as a single higher-order page i.e. an object can span 0-order
- * page boundaries. The code refers to these linked pages as a single entity
- * called zspage.
- *
- * For simplicity, zsmalloc can only allocate objects of size up to PAGE_SIZE
- * since this satisfies the requirements of all its current users (in the
- * worst case, page is incompressible and is thus stored "as-is" i.e. in
- * uncompressed form). For allocation requests larger than this size, failure
- * is returned (see zs_malloc).
- *
- * Additionally, zs_malloc() does not return a dereferenceable pointer.
- * Instead, it returns an opaque handle (unsigned long) which encodes actual
- * location of the allocated object. The reason for this indirection is that
- * zsmalloc does not keep zspages permanently mapped since that would cause
- * issues on 32-bit systems where the VA region for kernel space mappings
- * is very small. So, before using the allocating memory, the object has to
- * be mapped using zs_map_object() to get a usable pointer and subsequently
- * unmapped using zs_unmap_object().
- *
* Following is how we use various fields and flags of underlying
* struct page(s) to form a zspage.
*
@@ -57,6 +28,8 @@
*
* page->private (union with page->first_page): refers to the
* component page after the first page
+ * If the page is first_page for huge object, it stores handle.
+ * Look at size_class->huge.
* page->freelist: points to the first free object in zspage.
* Free objects are linked together using in-place
* metadata.
@@ -72,12 +45,9 @@
*
*/
-#ifdef CONFIG_ZSMALLOC_DEBUG
-#define DEBUG
-#endif
-
#include <linux/module.h>
#include <linux/kernel.h>
+#include <linux/sched.h>
#include <linux/bitops.h>
#include <linux/errno.h>
#include <linux/highmem.h>
@@ -110,6 +80,8 @@
#define ZS_MAX_ZSPAGE_ORDER 2
#define ZS_MAX_PAGES_PER_ZSPAGE (_AC(1, UL) << ZS_MAX_ZSPAGE_ORDER)
+#define ZS_HANDLE_SIZE (sizeof(unsigned long))
+
/*
* Object location (<PFN>, <obj_idx>) is encoded as
* as single (unsigned long) handle value.
@@ -133,13 +105,33 @@
#endif
#endif
#define _PFN_BITS (MAX_PHYSMEM_BITS - PAGE_SHIFT)
-#define OBJ_INDEX_BITS (BITS_PER_LONG - _PFN_BITS)
+
+/*
+ * Memory for allocating for handle keeps object position by
+ * encoding <page, obj_idx> and the encoded value has a room
+ * in least bit(ie, look at obj_to_location).
+ * We use the bit to synchronize between object access by
+ * user and migration.
+ */
+#define HANDLE_PIN_BIT 0
+
+/*
+ * Head in allocated object should have OBJ_ALLOCATED_TAG
+ * to identify the object was allocated or not.
+ * It's okay to add the status bit in the least bit because
+ * header keeps handle which is 4byte-aligned address so we
+ * have room for two bit at least.
+ */
+#define OBJ_ALLOCATED_TAG 1
+#define OBJ_TAG_BITS 1
+#define OBJ_INDEX_BITS (BITS_PER_LONG - _PFN_BITS - OBJ_TAG_BITS)
#define OBJ_INDEX_MASK ((_AC(1, UL) << OBJ_INDEX_BITS) - 1)
#define MAX(a, b) ((a) >= (b) ? (a) : (b))
/* ZS_MIN_ALLOC_SIZE must be multiple of ZS_ALIGN */
#define ZS_MIN_ALLOC_SIZE \
MAX(32, (ZS_MAX_PAGES_PER_ZSPAGE << PAGE_SHIFT >> OBJ_INDEX_BITS))
+/* each chunk includes extra space to keep handle */
#define ZS_MAX_ALLOC_SIZE PAGE_SIZE
/*
@@ -172,6 +164,8 @@ enum fullness_group {
enum zs_stat_type {
OBJ_ALLOCATED,
OBJ_USED,
+ CLASS_ALMOST_FULL,
+ CLASS_ALMOST_EMPTY,
NR_ZS_STAT_TYPE,
};
@@ -216,6 +210,8 @@ struct size_class {
/* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */
int pages_per_zspage;
+ /* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */
+ bool huge;
#ifdef CONFIG_ZSMALLOC_STAT
struct zs_size_stat stats;
@@ -233,14 +229,24 @@ struct size_class {
* This must be power of 2 and less than or equal to ZS_ALIGN
*/
struct link_free {
- /* Handle of next free chunk (encodes <PFN, obj_idx>) */
- void *next;
+ union {
+ /*
+ * Position of next free chunk (encodes <PFN, obj_idx>)
+ * It's valid for non-allocated object
+ */
+ void *next;
+ /*
+ * Handle of allocated object.
+ */
+ unsigned long handle;
+ };
};
struct zs_pool {
char *name;
struct size_class **size_class;
+ struct kmem_cache *handle_cachep;
gfp_t flags; /* allocation flags used when growing pool */
atomic_long_t pages_allocated;
@@ -267,13 +273,44 @@ struct mapping_area {
#endif
char *vm_addr; /* address of kmap_atomic()'ed pages */
enum zs_mapmode vm_mm; /* mapping mode */
+ bool huge;
};
+static int create_handle_cache(struct zs_pool *pool)
+{
+ pool->handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_SIZE,
+ 0, 0, NULL);
+ return pool->handle_cachep ? 0 : 1;
+}
+
+static void destroy_handle_cache(struct zs_pool *pool)
+{
+ if (pool->handle_cachep)
+ kmem_cache_destroy(pool->handle_cachep);
+}
+
+static unsigned long alloc_handle(struct zs_pool *pool)
+{
+ return (unsigned long)kmem_cache_alloc(pool->handle_cachep,
+ pool->flags & ~__GFP_HIGHMEM);
+}
+
+static void free_handle(struct zs_pool *pool, unsigned long handle)
+{
+ kmem_cache_free(pool->handle_cachep, (void *)handle);
+}
+
+static void record_obj(unsigned long handle, unsigned long obj)
+{
+ *(unsigned long *)handle = obj;
+}
+
/* zpool driver */
#ifdef CONFIG_ZPOOL
-static void *zs_zpool_create(char *name, gfp_t gfp, struct zpool_ops *zpool_ops)
+static void *zs_zpool_create(char *name, gfp_t gfp, struct zpool_ops *zpool_ops,
+ struct zpool *zpool)
{
return zs_create_pool(name, gfp);
}
@@ -346,6 +383,11 @@ static struct zpool_driver zs_zpool_driver = {
MODULE_ALIAS("zpool-zsmalloc");
#endif /* CONFIG_ZPOOL */
+static unsigned int get_maxobj_per_zspage(int size, int pages_per_zspage)
+{
+ return pages_per_zspage * PAGE_SIZE / size;
+}
+
/* per-cpu VM mapping areas for zspage accesses that cross page boundaries */
static DEFINE_PER_CPU(struct mapping_area, zs_map_area);
@@ -396,9 +438,182 @@ static int get_size_class_index(int size)
idx = DIV_ROUND_UP(size - ZS_MIN_ALLOC_SIZE,
ZS_SIZE_CLASS_DELTA);
- return idx;
+ return min(zs_size_classes - 1, idx);
+}
+
+#ifdef CONFIG_ZSMALLOC_STAT
+
+static inline void zs_stat_inc(struct size_class *class,
+ enum zs_stat_type type, unsigned long cnt)
+{
+ class->stats.objs[type] += cnt;
+}
+
+static inline void zs_stat_dec(struct size_class *class,
+ enum zs_stat_type type, unsigned long cnt)
+{
+ class->stats.objs[type] -= cnt;
}
+static inline unsigned long zs_stat_get(struct size_class *class,
+ enum zs_stat_type type)
+{
+ return class->stats.objs[type];
+}
+
+static int __init zs_stat_init(void)
+{
+ if (!debugfs_initialized())
+ return -ENODEV;
+
+ zs_stat_root = debugfs_create_dir("zsmalloc", NULL);
+ if (!zs_stat_root)
+ return -ENOMEM;
+
+ return 0;
+}
+
+static void __exit zs_stat_exit(void)
+{
+ debugfs_remove_recursive(zs_stat_root);
+}
+
+static int zs_stats_size_show(struct seq_file *s, void *v)
+{
+ int i;
+ struct zs_pool *pool = s->private;
+ struct size_class *class;
+ int objs_per_zspage;
+ unsigned long class_almost_full, class_almost_empty;
+ unsigned long obj_allocated, obj_used, pages_used;
+ unsigned long total_class_almost_full = 0, total_class_almost_empty = 0;
+ unsigned long total_objs = 0, total_used_objs = 0, total_pages = 0;
+
+ seq_printf(s, " %5s %5s %11s %12s %13s %10s %10s %16s\n",
+ "class", "size", "almost_full", "almost_empty",
+ "obj_allocated", "obj_used", "pages_used",
+ "pages_per_zspage");
+
+ for (i = 0; i < zs_size_classes; i++) {
+ class = pool->size_class[i];
+
+ if (class->index != i)
+ continue;
+
+ spin_lock(&class->lock);
+ class_almost_full = zs_stat_get(class, CLASS_ALMOST_FULL);
+ class_almost_empty = zs_stat_get(class, CLASS_ALMOST_EMPTY);
+ obj_allocated = zs_stat_get(class, OBJ_ALLOCATED);
+ obj_used = zs_stat_get(class, OBJ_USED);
+ spin_unlock(&class->lock);
+
+ objs_per_zspage = get_maxobj_per_zspage(class->size,
+ class->pages_per_zspage);
+ pages_used = obj_allocated / objs_per_zspage *
+ class->pages_per_zspage;
+
+ seq_printf(s, " %5u %5u %11lu %12lu %13lu %10lu %10lu %16d\n",
+ i, class->size, class_almost_full, class_almost_empty,
+ obj_allocated, obj_used, pages_used,
+ class->pages_per_zspage);
+
+ total_class_almost_full += class_almost_full;
+ total_class_almost_empty += class_almost_empty;
+ total_objs += obj_allocated;
+ total_used_objs += obj_used;
+ total_pages += pages_used;
+ }
+
+ seq_puts(s, "\n");
+ seq_printf(s, " %5s %5s %11lu %12lu %13lu %10lu %10lu\n",
+ "Total", "", total_class_almost_full,
+ total_class_almost_empty, total_objs,
+ total_used_objs, total_pages);
+
+ return 0;
+}
+
+static int zs_stats_size_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, zs_stats_size_show, inode->i_private);
+}
+
+static const struct file_operations zs_stat_size_ops = {
+ .open = zs_stats_size_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int zs_pool_stat_create(char *name, struct zs_pool *pool)
+{
+ struct dentry *entry;
+
+ if (!zs_stat_root)
+ return -ENODEV;
+
+ entry = debugfs_create_dir(name, zs_stat_root);
+ if (!entry) {
+ pr_warn("debugfs dir <%s> creation failed\n", name);
+ return -ENOMEM;
+ }
+ pool->stat_dentry = entry;
+
+ entry = debugfs_create_file("classes", S_IFREG | S_IRUGO,
+ pool->stat_dentry, pool, &zs_stat_size_ops);
+ if (!entry) {
+ pr_warn("%s: debugfs file entry <%s> creation failed\n",
+ name, "classes");
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static void zs_pool_stat_destroy(struct zs_pool *pool)
+{
+ debugfs_remove_recursive(pool->stat_dentry);
+}
+
+#else /* CONFIG_ZSMALLOC_STAT */
+
+static inline void zs_stat_inc(struct size_class *class,
+ enum zs_stat_type type, unsigned long cnt)
+{
+}
+
+static inline void zs_stat_dec(struct size_class *class,
+ enum zs_stat_type type, unsigned long cnt)
+{
+}
+
+static inline unsigned long zs_stat_get(struct size_class *class,
+ enum zs_stat_type type)
+{
+ return 0;
+}
+
+static int __init zs_stat_init(void)
+{
+ return 0;
+}
+
+static void __exit zs_stat_exit(void)
+{
+}
+
+static inline int zs_pool_stat_create(char *name, struct zs_pool *pool)
+{
+ return 0;
+}
+
+static inline void zs_pool_stat_destroy(struct zs_pool *pool)
+{
+}
+
+#endif
+
+
/*
* For each size class, zspages are divided into different groups
* depending on how "full" they are. This was done so that we could
@@ -419,7 +634,7 @@ static enum fullness_group get_fullness_group(struct page *page)
fg = ZS_EMPTY;
else if (inuse == max_objects)
fg = ZS_FULL;
- else if (inuse <= max_objects / fullness_threshold_frac)
+ else if (inuse <= 3 * max_objects / fullness_threshold_frac)
fg = ZS_ALMOST_EMPTY;
else
fg = ZS_ALMOST_FULL;
@@ -448,6 +663,8 @@ static void insert_zspage(struct page *page, struct size_class *class,
list_add_tail(&page->lru, &(*head)->lru);
*head = page;
+ zs_stat_inc(class, fullness == ZS_ALMOST_EMPTY ?
+ CLASS_ALMOST_EMPTY : CLASS_ALMOST_FULL, 1);
}
/*
@@ -473,6 +690,8 @@ static void remove_zspage(struct page *page, struct size_class *class,
struct page, lru);
list_del_init(&page->lru);
+ zs_stat_dec(class, fullness == ZS_ALMOST_EMPTY ?
+ CLASS_ALMOST_EMPTY : CLASS_ALMOST_FULL, 1);
}
/*
@@ -484,11 +703,10 @@ static void remove_zspage(struct page *page, struct size_class *class,
* page from the freelist of the old fullness group to that of the new
* fullness group.
*/
-static enum fullness_group fix_fullness_group(struct zs_pool *pool,
+static enum fullness_group fix_fullness_group(struct size_class *class,
struct page *page)
{
int class_idx;
- struct size_class *class;
enum fullness_group currfg, newfg;
BUG_ON(!is_first_page(page));
@@ -498,7 +716,6 @@ static enum fullness_group fix_fullness_group(struct zs_pool *pool,
if (newfg == currfg)
goto out;
- class = pool->size_class[class_idx];
remove_zspage(page, class, currfg);
insert_zspage(page, class, newfg);
set_zspage_mapping(page, class_idx, newfg);
@@ -512,7 +729,8 @@ out:
* to form a zspage for each size class. This is important
* to reduce wastage due to unusable space left at end of
* each zspage which is given as:
- * wastage = Zp - Zp % size_class
+ * wastage = Zp % class_size
+ * usage = Zp - wastage
* where Zp = zspage size = k * PAGE_SIZE where k = 1, 2, ...
*
* For example, for size class of 3/8 * PAGE_SIZE, we should
@@ -571,35 +789,50 @@ static struct page *get_next_page(struct page *page)
/*
* Encode <page, obj_idx> as a single handle value.
- * On hardware platforms with physical memory starting at 0x0 the pfn
- * could be 0 so we ensure that the handle will never be 0 by adjusting the
- * encoded obj_idx value before encoding.
+ * We use the least bit of handle for tagging.
*/
-static void *obj_location_to_handle(struct page *page, unsigned long obj_idx)
+static void *location_to_obj(struct page *page, unsigned long obj_idx)
{
- unsigned long handle;
+ unsigned long obj;
if (!page) {
BUG_ON(obj_idx);
return NULL;
}
- handle = page_to_pfn(page) << OBJ_INDEX_BITS;
- handle |= ((obj_idx + 1) & OBJ_INDEX_MASK);
+ obj = page_to_pfn(page) << OBJ_INDEX_BITS;
+ obj |= ((obj_idx) & OBJ_INDEX_MASK);
+ obj <<= OBJ_TAG_BITS;
- return (void *)handle;
+ return (void *)obj;
}
/*
* Decode <page, obj_idx> pair from the given object handle. We adjust the
* decoded obj_idx back to its original value since it was adjusted in
- * obj_location_to_handle().
+ * location_to_obj().
*/
-static void obj_handle_to_location(unsigned long handle, struct page **page,
+static void obj_to_location(unsigned long obj, struct page **page,
unsigned long *obj_idx)
{
- *page = pfn_to_page(handle >> OBJ_INDEX_BITS);
- *obj_idx = (handle & OBJ_INDEX_MASK) - 1;
+ obj >>= OBJ_TAG_BITS;
+ *page = pfn_to_page(obj >> OBJ_INDEX_BITS);
+ *obj_idx = (obj & OBJ_INDEX_MASK);
+}
+
+static unsigned long handle_to_obj(unsigned long handle)
+{
+ return *(unsigned long *)handle;
+}
+
+static unsigned long obj_to_head(struct size_class *class, struct page *page,
+ void *obj)
+{
+ if (class->huge) {
+ VM_BUG_ON(!is_first_page(page));
+ return *(unsigned long *)page_private(page);
+ } else
+ return *(unsigned long *)obj;
}
static unsigned long obj_idx_to_offset(struct page *page,
@@ -613,6 +846,25 @@ static unsigned long obj_idx_to_offset(struct page *page,
return off + obj_idx * class_size;
}
+static inline int trypin_tag(unsigned long handle)
+{
+ unsigned long *ptr = (unsigned long *)handle;
+
+ return !test_and_set_bit_lock(HANDLE_PIN_BIT, ptr);
+}
+
+static void pin_tag(unsigned long handle)
+{
+ while (!trypin_tag(handle));
+}
+
+static void unpin_tag(unsigned long handle)
+{
+ unsigned long *ptr = (unsigned long *)handle;
+
+ clear_bit_unlock(HANDLE_PIN_BIT, ptr);
+}
+
static void reset_page(struct page *page)
{
clear_bit(PG_private, &page->flags);
@@ -674,7 +926,7 @@ static void init_zspage(struct page *first_page, struct size_class *class)
link = (struct link_free *)vaddr + off / sizeof(*link);
while ((off += class->size) < PAGE_SIZE) {
- link->next = obj_location_to_handle(page, i++);
+ link->next = location_to_obj(page, i++);
link += class->size / sizeof(*link);
}
@@ -684,7 +936,7 @@ static void init_zspage(struct page *first_page, struct size_class *class)
* page (if present)
*/
next_page = get_next_page(page);
- link->next = obj_location_to_handle(next_page, 0);
+ link->next = location_to_obj(next_page, 0);
kunmap_atomic(vaddr);
page = next_page;
off %= PAGE_SIZE;
@@ -738,7 +990,7 @@ static struct page *alloc_zspage(struct size_class *class, gfp_t flags)
init_zspage(first_page, class);
- first_page->freelist = obj_location_to_handle(first_page, 0);
+ first_page->freelist = location_to_obj(first_page, 0);
/* Maximum number of objects we can store in this zspage */
first_page->objects = class->pages_per_zspage * PAGE_SIZE / class->size;
@@ -860,12 +1112,19 @@ static void __zs_unmap_object(struct mapping_area *area,
{
int sizes[2];
void *addr;
- char *buf = area->vm_buf;
+ char *buf;
/* no write fastpath */
if (area->vm_mm == ZS_MM_RO)
goto out;
+ buf = area->vm_buf;
+ if (!area->huge) {
+ buf = buf + ZS_HANDLE_SIZE;
+ size -= ZS_HANDLE_SIZE;
+ off += ZS_HANDLE_SIZE;
+ }
+
sizes[0] = PAGE_SIZE - off;
sizes[1] = size - sizes[0];
@@ -952,11 +1211,6 @@ static void init_zs_size_classes(void)
zs_size_classes = nr;
}
-static unsigned int get_maxobj_per_zspage(int size, int pages_per_zspage)
-{
- return pages_per_zspage * PAGE_SIZE / size;
-}
-
static bool can_merge(struct size_class *prev, int size, int pages_per_zspage)
{
if (prev->pages_per_zspage != pages_per_zspage)
@@ -969,166 +1223,13 @@ static bool can_merge(struct size_class *prev, int size, int pages_per_zspage)
return true;
}
-#ifdef CONFIG_ZSMALLOC_STAT
-
-static inline void zs_stat_inc(struct size_class *class,
- enum zs_stat_type type, unsigned long cnt)
-{
- class->stats.objs[type] += cnt;
-}
-
-static inline void zs_stat_dec(struct size_class *class,
- enum zs_stat_type type, unsigned long cnt)
-{
- class->stats.objs[type] -= cnt;
-}
-
-static inline unsigned long zs_stat_get(struct size_class *class,
- enum zs_stat_type type)
-{
- return class->stats.objs[type];
-}
-
-static int __init zs_stat_init(void)
-{
- if (!debugfs_initialized())
- return -ENODEV;
-
- zs_stat_root = debugfs_create_dir("zsmalloc", NULL);
- if (!zs_stat_root)
- return -ENOMEM;
-
- return 0;
-}
-
-static void __exit zs_stat_exit(void)
-{
- debugfs_remove_recursive(zs_stat_root);
-}
-
-static int zs_stats_size_show(struct seq_file *s, void *v)
+static bool zspage_full(struct page *page)
{
- int i;
- struct zs_pool *pool = s->private;
- struct size_class *class;
- int objs_per_zspage;
- unsigned long obj_allocated, obj_used, pages_used;
- unsigned long total_objs = 0, total_used_objs = 0, total_pages = 0;
-
- seq_printf(s, " %5s %5s %13s %10s %10s\n", "class", "size",
- "obj_allocated", "obj_used", "pages_used");
-
- for (i = 0; i < zs_size_classes; i++) {
- class = pool->size_class[i];
-
- if (class->index != i)
- continue;
-
- spin_lock(&class->lock);
- obj_allocated = zs_stat_get(class, OBJ_ALLOCATED);
- obj_used = zs_stat_get(class, OBJ_USED);
- spin_unlock(&class->lock);
-
- objs_per_zspage = get_maxobj_per_zspage(class->size,
- class->pages_per_zspage);
- pages_used = obj_allocated / objs_per_zspage *
- class->pages_per_zspage;
-
- seq_printf(s, " %5u %5u %10lu %10lu %10lu\n", i,
- class->size, obj_allocated, obj_used, pages_used);
-
- total_objs += obj_allocated;
- total_used_objs += obj_used;
- total_pages += pages_used;
- }
-
- seq_puts(s, "\n");
- seq_printf(s, " %5s %5s %10lu %10lu %10lu\n", "Total", "",
- total_objs, total_used_objs, total_pages);
-
- return 0;
-}
-
-static int zs_stats_size_open(struct inode *inode, struct file *file)
-{
- return single_open(file, zs_stats_size_show, inode->i_private);
-}
-
-static const struct file_operations zs_stat_size_ops = {
- .open = zs_stats_size_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
-
-static int zs_pool_stat_create(char *name, struct zs_pool *pool)
-{
- struct dentry *entry;
-
- if (!zs_stat_root)
- return -ENODEV;
-
- entry = debugfs_create_dir(name, zs_stat_root);
- if (!entry) {
- pr_warn("debugfs dir <%s> creation failed\n", name);
- return -ENOMEM;
- }
- pool->stat_dentry = entry;
-
- entry = debugfs_create_file("obj_in_classes", S_IFREG | S_IRUGO,
- pool->stat_dentry, pool, &zs_stat_size_ops);
- if (!entry) {
- pr_warn("%s: debugfs file entry <%s> creation failed\n",
- name, "obj_in_classes");
- return -ENOMEM;
- }
-
- return 0;
-}
-
-static void zs_pool_stat_destroy(struct zs_pool *pool)
-{
- debugfs_remove_recursive(pool->stat_dentry);
-}
-
-#else /* CONFIG_ZSMALLOC_STAT */
-
-static inline void zs_stat_inc(struct size_class *class,
- enum zs_stat_type type, unsigned long cnt)
-{
-}
-
-static inline void zs_stat_dec(struct size_class *class,
- enum zs_stat_type type, unsigned long cnt)
-{
-}
-
-static inline unsigned long zs_stat_get(struct size_class *class,
- enum zs_stat_type type)
-{
- return 0;
-}
-
-static int __init zs_stat_init(void)
-{
- return 0;
-}
-
-static void __exit zs_stat_exit(void)
-{
-}
-
-static inline int zs_pool_stat_create(char *name, struct zs_pool *pool)
-{
- return 0;
-}
+ BUG_ON(!is_first_page(page));
-static inline void zs_pool_stat_destroy(struct zs_pool *pool)
-{
+ return page->inuse == page->objects;
}
-#endif
-
unsigned long zs_get_total_pages(struct zs_pool *pool)
{
return atomic_long_read(&pool->pages_allocated);
@@ -1153,13 +1254,14 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
enum zs_mapmode mm)
{
struct page *page;
- unsigned long obj_idx, off;
+ unsigned long obj, obj_idx, off;
unsigned int class_idx;
enum fullness_group fg;
struct size_class *class;
struct mapping_area *area;
struct page *pages[2];
+ void *ret;
BUG_ON(!handle);
@@ -1170,7 +1272,11 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
*/
BUG_ON(in_interrupt());
- obj_handle_to_location(handle, &page, &obj_idx);
+ /* From now on, migration cannot move the object */
+ pin_tag(handle);
+
+ obj = handle_to_obj(handle);
+ obj_to_location(obj, &page, &obj_idx);
get_zspage_mapping(get_first_page(page), &class_idx, &fg);
class = pool->size_class[class_idx];
off = obj_idx_to_offset(page, obj_idx, class->size);
@@ -1180,7 +1286,8 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
if (off + class->size <= PAGE_SIZE) {
/* this object is contained entirely within a page */
area->vm_addr = kmap_atomic(page);
- return area->vm_addr + off;
+ ret = area->vm_addr + off;
+ goto out;
}
/* this object spans two pages */
@@ -1188,14 +1295,19 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
pages[1] = get_next_page(page);
BUG_ON(!pages[1]);
- return __zs_map_object(area, pages, off, class->size);
+ ret = __zs_map_object(area, pages, off, class->size);
+out:
+ if (!class->huge)
+ ret += ZS_HANDLE_SIZE;
+
+ return ret;
}
EXPORT_SYMBOL_GPL(zs_map_object);
void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
{
struct page *page;
- unsigned long obj_idx, off;
+ unsigned long obj, obj_idx, off;
unsigned int class_idx;
enum fullness_group fg;
@@ -1204,7 +1316,8 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
BUG_ON(!handle);
- obj_handle_to_location(handle, &page, &obj_idx);
+ obj = handle_to_obj(handle);
+ obj_to_location(obj, &page, &obj_idx);
get_zspage_mapping(get_first_page(page), &class_idx, &fg);
class = pool->size_class[class_idx];
off = obj_idx_to_offset(page, obj_idx, class->size);
@@ -1222,9 +1335,42 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
__zs_unmap_object(area, pages, off, class->size);
}
put_cpu_var(zs_map_area);
+ unpin_tag(handle);
}
EXPORT_SYMBOL_GPL(zs_unmap_object);
+static unsigned long obj_malloc(struct page *first_page,
+ struct size_class *class, unsigned long handle)
+{
+ unsigned long obj;
+ struct link_free *link;
+
+ struct page *m_page;
+ unsigned long m_objidx, m_offset;
+ void *vaddr;
+
+ handle |= OBJ_ALLOCATED_TAG;
+ obj = (unsigned long)first_page->freelist;
+ obj_to_location(obj, &m_page, &m_objidx);
+ m_offset = obj_idx_to_offset(m_page, m_objidx, class->size);
+
+ vaddr = kmap_atomic(m_page);
+ link = (struct link_free *)vaddr + m_offset / sizeof(*link);
+ first_page->freelist = link->next;
+ if (!class->huge)
+ /* record handle in the header of allocated chunk */
+ link->handle = handle;
+ else
+ /* record handle in first_page->private */
+ set_page_private(first_page, handle);
+ kunmap_atomic(vaddr);
+ first_page->inuse++;
+ zs_stat_inc(class, OBJ_USED, 1);
+
+ return obj;
+}
+
+
/**
* zs_malloc - Allocate block of given size from pool.
* @pool: pool to allocate from
@@ -1236,17 +1382,19 @@ EXPORT_SYMBOL_GPL(zs_unmap_object);
*/
unsigned long zs_malloc(struct zs_pool *pool, size_t size)
{
- unsigned long obj;
- struct link_free *link;
+ unsigned long handle, obj;
struct size_class *class;
- void *vaddr;
-
- struct page *first_page, *m_page;
- unsigned long m_objidx, m_offset;
+ struct page *first_page;
if (unlikely(!size || size > ZS_MAX_ALLOC_SIZE))
return 0;
+ handle = alloc_handle(pool);
+ if (!handle)
+ return 0;
+
+ /* extra space in chunk to keep the handle */
+ size += ZS_HANDLE_SIZE;
class = pool->size_class[get_size_class_index(size)];
spin_lock(&class->lock);
@@ -1255,8 +1403,10 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size)
if (!first_page) {
spin_unlock(&class->lock);
first_page = alloc_zspage(class, pool->flags);
- if (unlikely(!first_page))
+ if (unlikely(!first_page)) {
+ free_handle(pool, handle);
return 0;
+ }
set_zspage_mapping(first_page, class->index, ZS_EMPTY);
atomic_long_add(class->pages_per_zspage,
@@ -1267,73 +1417,360 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size)
class->size, class->pages_per_zspage));
}
- obj = (unsigned long)first_page->freelist;
- obj_handle_to_location(obj, &m_page, &m_objidx);
- m_offset = obj_idx_to_offset(m_page, m_objidx, class->size);
-
- vaddr = kmap_atomic(m_page);
- link = (struct link_free *)vaddr + m_offset / sizeof(*link);
- first_page->freelist = link->next;
- memset(link, POISON_INUSE, sizeof(*link));
- kunmap_atomic(vaddr);
-
- first_page->inuse++;
- zs_stat_inc(class, OBJ_USED, 1);
+ obj = obj_malloc(first_page, class, handle);
/* Now move the zspage to another fullness group, if required */
- fix_fullness_group(pool, first_page);
+ fix_fullness_group(class, first_page);
+ record_obj(handle, obj);
spin_unlock(&class->lock);
- return obj;
+ return handle;
}
EXPORT_SYMBOL_GPL(zs_malloc);
-void zs_free(struct zs_pool *pool, unsigned long obj)
+static void obj_free(struct zs_pool *pool, struct size_class *class,
+ unsigned long obj)
{
struct link_free *link;
struct page *first_page, *f_page;
unsigned long f_objidx, f_offset;
void *vaddr;
-
int class_idx;
- struct size_class *class;
enum fullness_group fullness;
- if (unlikely(!obj))
- return;
+ BUG_ON(!obj);
- obj_handle_to_location(obj, &f_page, &f_objidx);
+ obj &= ~OBJ_ALLOCATED_TAG;
+ obj_to_location(obj, &f_page, &f_objidx);
first_page = get_first_page(f_page);
get_zspage_mapping(first_page, &class_idx, &fullness);
- class = pool->size_class[class_idx];
f_offset = obj_idx_to_offset(f_page, f_objidx, class->size);
- spin_lock(&class->lock);
+ vaddr = kmap_atomic(f_page);
/* Insert this object in containing zspage's freelist */
- vaddr = kmap_atomic(f_page);
link = (struct link_free *)(vaddr + f_offset);
link->next = first_page->freelist;
+ if (class->huge)
+ set_page_private(first_page, 0);
kunmap_atomic(vaddr);
first_page->freelist = (void *)obj;
-
first_page->inuse--;
- fullness = fix_fullness_group(pool, first_page);
-
zs_stat_dec(class, OBJ_USED, 1);
- if (fullness == ZS_EMPTY)
+}
+
+void zs_free(struct zs_pool *pool, unsigned long handle)
+{
+ struct page *first_page, *f_page;
+ unsigned long obj, f_objidx;
+ int class_idx;
+ struct size_class *class;
+ enum fullness_group fullness;
+
+ if (unlikely(!handle))
+ return;
+
+ pin_tag(handle);
+ obj = handle_to_obj(handle);
+ obj_to_location(obj, &f_page, &f_objidx);
+ first_page = get_first_page(f_page);
+
+ get_zspage_mapping(first_page, &class_idx, &fullness);
+ class = pool->size_class[class_idx];
+
+ spin_lock(&class->lock);
+ obj_free(pool, class, obj);
+ fullness = fix_fullness_group(class, first_page);
+ if (fullness == ZS_EMPTY) {
zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage(
class->size, class->pages_per_zspage));
-
+ atomic_long_sub(class->pages_per_zspage,
+ &pool->pages_allocated);
+ free_zspage(first_page);
+ }
spin_unlock(&class->lock);
+ unpin_tag(handle);
+
+ free_handle(pool, handle);
+}
+EXPORT_SYMBOL_GPL(zs_free);
+
+static void zs_object_copy(unsigned long src, unsigned long dst,
+ struct size_class *class)
+{
+ struct page *s_page, *d_page;
+ unsigned long s_objidx, d_objidx;
+ unsigned long s_off, d_off;
+ void *s_addr, *d_addr;
+ int s_size, d_size, size;
+ int written = 0;
+
+ s_size = d_size = class->size;
+
+ obj_to_location(src, &s_page, &s_objidx);
+ obj_to_location(dst, &d_page, &d_objidx);
+
+ s_off = obj_idx_to_offset(s_page, s_objidx, class->size);
+ d_off = obj_idx_to_offset(d_page, d_objidx, class->size);
+
+ if (s_off + class->size > PAGE_SIZE)
+ s_size = PAGE_SIZE - s_off;
+
+ if (d_off + class->size > PAGE_SIZE)
+ d_size = PAGE_SIZE - d_off;
+
+ s_addr = kmap_atomic(s_page);
+ d_addr = kmap_atomic(d_page);
+
+ while (1) {
+ size = min(s_size, d_size);
+ memcpy(d_addr + d_off, s_addr + s_off, size);
+ written += size;
+
+ if (written == class->size)
+ break;
+
+ s_off += size;
+ s_size -= size;
+ d_off += size;
+ d_size -= size;
+
+ if (s_off >= PAGE_SIZE) {
+ kunmap_atomic(d_addr);
+ kunmap_atomic(s_addr);
+ s_page = get_next_page(s_page);
+ BUG_ON(!s_page);
+ s_addr = kmap_atomic(s_page);
+ d_addr = kmap_atomic(d_page);
+ s_size = class->size - written;
+ s_off = 0;
+ }
+
+ if (d_off >= PAGE_SIZE) {
+ kunmap_atomic(d_addr);
+ d_page = get_next_page(d_page);
+ BUG_ON(!d_page);
+ d_addr = kmap_atomic(d_page);
+ d_size = class->size - written;
+ d_off = 0;
+ }
+ }
+
+ kunmap_atomic(d_addr);
+ kunmap_atomic(s_addr);
+}
+
+/*
+ * Find alloced object in zspage from index object and
+ * return handle.
+ */
+static unsigned long find_alloced_obj(struct page *page, int index,
+ struct size_class *class)
+{
+ unsigned long head;
+ int offset = 0;
+ unsigned long handle = 0;
+ void *addr = kmap_atomic(page);
+
+ if (!is_first_page(page))
+ offset = page->index;
+ offset += class->size * index;
+
+ while (offset < PAGE_SIZE) {
+ head = obj_to_head(class, page, addr + offset);
+ if (head & OBJ_ALLOCATED_TAG) {
+ handle = head & ~OBJ_ALLOCATED_TAG;
+ if (trypin_tag(handle))
+ break;
+ handle = 0;
+ }
+
+ offset += class->size;
+ index++;
+ }
+
+ kunmap_atomic(addr);
+ return handle;
+}
+
+struct zs_compact_control {
+ /* Source page for migration which could be a subpage of zspage. */
+ struct page *s_page;
+ /* Destination page for migration which should be a first page
+ * of zspage. */
+ struct page *d_page;
+ /* Starting object index within @s_page which used for live object
+ * in the subpage. */
+ int index;
+ /* how many of objects are migrated */
+ int nr_migrated;
+};
+
+static int migrate_zspage(struct zs_pool *pool, struct size_class *class,
+ struct zs_compact_control *cc)
+{
+ unsigned long used_obj, free_obj;
+ unsigned long handle;
+ struct page *s_page = cc->s_page;
+ struct page *d_page = cc->d_page;
+ unsigned long index = cc->index;
+ int nr_migrated = 0;
+ int ret = 0;
+
+ while (1) {
+ handle = find_alloced_obj(s_page, index, class);
+ if (!handle) {
+ s_page = get_next_page(s_page);
+ if (!s_page)
+ break;
+ index = 0;
+ continue;
+ }
+
+ /* Stop if there is no more space */
+ if (zspage_full(d_page)) {
+ unpin_tag(handle);
+ ret = -ENOMEM;
+ break;
+ }
+
+ used_obj = handle_to_obj(handle);
+ free_obj = obj_malloc(d_page, class, handle);
+ zs_object_copy(used_obj, free_obj, class);
+ index++;
+ record_obj(handle, free_obj);
+ unpin_tag(handle);
+ obj_free(pool, class, used_obj);
+ nr_migrated++;
+ }
+
+ /* Remember last position in this iteration */
+ cc->s_page = s_page;
+ cc->index = index;
+ cc->nr_migrated = nr_migrated;
+
+ return ret;
+}
+
+static struct page *alloc_target_page(struct size_class *class)
+{
+ int i;
+ struct page *page;
+
+ for (i = 0; i < _ZS_NR_FULLNESS_GROUPS; i++) {
+ page = class->fullness_list[i];
+ if (page) {
+ remove_zspage(page, class, i);
+ break;
+ }
+ }
+
+ return page;
+}
+
+static void putback_zspage(struct zs_pool *pool, struct size_class *class,
+ struct page *first_page)
+{
+ enum fullness_group fullness;
+
+ BUG_ON(!is_first_page(first_page));
+
+ fullness = get_fullness_group(first_page);
+ insert_zspage(first_page, class, fullness);
+ set_zspage_mapping(first_page, class->index, fullness);
if (fullness == ZS_EMPTY) {
+ zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage(
+ class->size, class->pages_per_zspage));
atomic_long_sub(class->pages_per_zspage,
&pool->pages_allocated);
+
free_zspage(first_page);
}
}
-EXPORT_SYMBOL_GPL(zs_free);
+
+static struct page *isolate_source_page(struct size_class *class)
+{
+ struct page *page;
+
+ page = class->fullness_list[ZS_ALMOST_EMPTY];
+ if (page)
+ remove_zspage(page, class, ZS_ALMOST_EMPTY);
+
+ return page;
+}
+
+static unsigned long __zs_compact(struct zs_pool *pool,
+ struct size_class *class)
+{
+ int nr_to_migrate;
+ struct zs_compact_control cc;
+ struct page *src_page;
+ struct page *dst_page = NULL;
+ unsigned long nr_total_migrated = 0;
+
+ spin_lock(&class->lock);
+ while ((src_page = isolate_source_page(class))) {
+
+ BUG_ON(!is_first_page(src_page));
+
+ /* The goal is to migrate all live objects in source page */
+ nr_to_migrate = src_page->inuse;
+ cc.index = 0;
+ cc.s_page = src_page;
+
+ while ((dst_page = alloc_target_page(class))) {
+ cc.d_page = dst_page;
+ /*
+ * If there is no more space in dst_page, try to
+ * allocate another zspage.
+ */
+ if (!migrate_zspage(pool, class, &cc))
+ break;
+
+ putback_zspage(pool, class, dst_page);
+ nr_total_migrated += cc.nr_migrated;
+ nr_to_migrate -= cc.nr_migrated;
+ }
+
+ /* Stop if we couldn't find slot */
+ if (dst_page == NULL)
+ break;
+
+ putback_zspage(pool, class, dst_page);
+ putback_zspage(pool, class, src_page);
+ spin_unlock(&class->lock);
+ nr_total_migrated += cc.nr_migrated;
+ cond_resched();
+ spin_lock(&class->lock);
+ }
+
+ if (src_page)
+ putback_zspage(pool, class, src_page);
+
+ spin_unlock(&class->lock);
+
+ return nr_total_migrated;
+}
+
+unsigned long zs_compact(struct zs_pool *pool)
+{
+ int i;
+ unsigned long nr_migrated = 0;
+ struct size_class *class;
+
+ for (i = zs_size_classes - 1; i >= 0; i--) {
+ class = pool->size_class[i];
+ if (!class)
+ continue;
+ if (class->index != i)
+ continue;
+ nr_migrated += __zs_compact(pool, class);
+ }
+
+ return nr_migrated;
+}
+EXPORT_SYMBOL_GPL(zs_compact);
/**
* zs_create_pool - Creates an allocation pool to work from.
@@ -1355,20 +1792,20 @@ struct zs_pool *zs_create_pool(char *name, gfp_t flags)
if (!pool)
return NULL;
- pool->name = kstrdup(name, GFP_KERNEL);
- if (!pool->name) {
- kfree(pool);
- return NULL;
- }
-
pool->size_class = kcalloc(zs_size_classes, sizeof(struct size_class *),
GFP_KERNEL);
if (!pool->size_class) {
- kfree(pool->name);
kfree(pool);
return NULL;
}
+ pool->name = kstrdup(name, GFP_KERNEL);
+ if (!pool->name)
+ goto err;
+
+ if (create_handle_cache(pool))
+ goto err;
+
/*
* Iterate reversly, because, size of size_class that we want to use
* for merging should be larger or equal to current size.
@@ -1406,6 +1843,9 @@ struct zs_pool *zs_create_pool(char *name, gfp_t flags)
class->size = size;
class->index = i;
class->pages_per_zspage = pages_per_zspage;
+ if (pages_per_zspage == 1 &&
+ get_maxobj_per_zspage(size, pages_per_zspage) == 1)
+ class->huge = true;
spin_lock_init(&class->lock);
pool->size_class[i] = class;
@@ -1450,6 +1890,7 @@ void zs_destroy_pool(struct zs_pool *pool)
kfree(class);
}
+ destroy_handle_cache(pool);
kfree(pool->size_class);
kfree(pool->name);
kfree(pool);
diff --git a/mm/zswap.c b/mm/zswap.c
index 4249e82ff934..2d5727baed59 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -75,9 +75,10 @@ static u64 zswap_duplicate_entry;
/*********************************
* tunables
**********************************/
-/* Enable/disable zswap (disabled by default, fixed at boot for now) */
-static bool zswap_enabled __read_mostly;
-module_param_named(enabled, zswap_enabled, bool, 0444);
+
+/* Enable/disable zswap (disabled by default) */
+static bool zswap_enabled;
+module_param_named(enabled, zswap_enabled, bool, 0644);
/* Compressor to be used by zswap (fixed at boot for now) */
#define ZSWAP_COMPRESSOR_DEFAULT "lzo"
@@ -648,7 +649,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
u8 *src, *dst;
struct zswap_header *zhdr;
- if (!tree) {
+ if (!zswap_enabled || !tree) {
ret = -ENODEV;
goto reject;
}
@@ -901,9 +902,6 @@ static int __init init_zswap(void)
{
gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN;
- if (!zswap_enabled)
- return 0;
-
pr_info("loading zswap\n");
zswap_pool = zpool_create_pool(zswap_zpool_type, "zswap", gfp,