summaryrefslogtreecommitdiffstats
path: root/mm/memcontrol.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r--mm/memcontrol.c527
1 files changed, 326 insertions, 201 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index b34ef4a32a3b..acb93c554f6e 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -14,6 +14,12 @@
* Copyright (C) 2012 Parallels Inc. and Google Inc.
* Authors: Glauber Costa and Suleiman Souhlal
*
+ * Native page reclaim
+ * Charge lifetime sanitation
+ * Lockless page tracking & accounting
+ * Unified hierarchy configuration model
+ * Copyright (C) 2015 Red Hat, Inc., Johannes Weiner
+ *
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
@@ -71,6 +77,7 @@ EXPORT_SYMBOL(memory_cgrp_subsys);
#define MEM_CGROUP_RECLAIM_RETRIES 5
static struct mem_cgroup *root_mem_cgroup __read_mostly;
+struct cgroup_subsys_state *mem_cgroup_root_css __read_mostly;
/* Whether the swap controller is active */
#ifdef CONFIG_MEMCG_SWAP
@@ -84,6 +91,7 @@ static const char * const mem_cgroup_stat_names[] = {
"rss",
"rss_huge",
"mapped_file",
+ "dirty",
"writeback",
"swap",
};
@@ -253,11 +261,6 @@ static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
* page cache and RSS per cgroup. We would eventually like to provide
* statistics based on the statistics developed by Rik Van Riel for clock-pro,
* to help the administrator determine what knobs to tune.
- *
- * TODO: Add a water mark for the memory controller. Reclaim will begin when
- * we hit the water mark. May be even add a low water mark, such that
- * no reclaim occurs from a cgroup at it's low water mark, this is
- * a feature that will be implemented much later in the future.
*/
struct mem_cgroup {
struct cgroup_subsys_state css;
@@ -284,9 +287,9 @@ struct mem_cgroup {
*/
bool use_hierarchy;
+ /* protected by memcg_oom_lock */
bool oom_lock;
- atomic_t under_oom;
- atomic_t oom_wakeups;
+ int under_oom;
int swappiness;
/* OOM-Killer disable */
@@ -321,11 +324,6 @@ struct mem_cgroup {
* percpu counter.
*/
struct mem_cgroup_stat_cpu __percpu *stat;
- /*
- * used when a cpu is offlined or other synchronizations
- * See mem_cgroup_read_stat().
- */
- struct mem_cgroup_stat_cpu nocpu_base;
spinlock_t pcp_counter_lock;
#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
@@ -345,6 +343,11 @@ struct mem_cgroup {
atomic_t numainfo_updating;
#endif
+#ifdef CONFIG_CGROUP_WRITEBACK
+ struct list_head cgwb_list;
+ struct wb_domain cgwb_domain;
+#endif
+
/* List of events which userspace want to receive */
struct list_head event_list;
spinlock_t event_list_lock;
@@ -454,6 +457,12 @@ static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
return memcg->css.id;
}
+/*
+ * A helper function to get mem_cgroup from ID. must be called under
+ * rcu_read_lock(). The caller is responsible for calling
+ * css_tryget_online() if the mem_cgroup is used for charging. (dropping
+ * refcnt from swap can be called against removed memcg.)
+ */
static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
{
struct cgroup_subsys_state *css;
@@ -589,6 +598,39 @@ struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg)
return &memcg->css;
}
+/**
+ * mem_cgroup_css_from_page - css of the memcg associated with a page
+ * @page: page of interest
+ *
+ * If memcg is bound to the default hierarchy, css of the memcg associated
+ * with @page is returned. The returned css remains associated with @page
+ * until it is released.
+ *
+ * If memcg is bound to a traditional hierarchy, the css of root_mem_cgroup
+ * is returned.
+ *
+ * XXX: The above description of behavior on the default hierarchy isn't
+ * strictly true yet as replace_page_cache_page() can modify the
+ * association before @page is released even on the default hierarchy;
+ * however, the current and planned usages don't mix the the two functions
+ * and replace_page_cache_page() will soon be updated to make the invariant
+ * actually true.
+ */
+struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page)
+{
+ struct mem_cgroup *memcg;
+
+ rcu_read_lock();
+
+ memcg = page->mem_cgroup;
+
+ if (!memcg || !cgroup_on_dfl(memcg->css.cgroup))
+ memcg = root_mem_cgroup;
+
+ rcu_read_unlock();
+ return &memcg->css;
+}
+
static struct mem_cgroup_per_zone *
mem_cgroup_page_zoneinfo(struct mem_cgroup *memcg, struct page *page)
{
@@ -667,7 +709,7 @@ static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz,
static unsigned long soft_limit_excess(struct mem_cgroup *memcg)
{
unsigned long nr_pages = page_counter_read(&memcg->memory);
- unsigned long soft_limit = ACCESS_ONCE(memcg->soft_limit);
+ unsigned long soft_limit = READ_ONCE(memcg->soft_limit);
unsigned long excess = 0;
if (nr_pages > soft_limit)
@@ -788,15 +830,8 @@ static long mem_cgroup_read_stat(struct mem_cgroup *memcg,
long val = 0;
int cpu;
- get_online_cpus();
- for_each_online_cpu(cpu)
+ for_each_possible_cpu(cpu)
val += per_cpu(memcg->stat->count[idx], cpu);
-#ifdef CONFIG_HOTPLUG_CPU
- spin_lock(&memcg->pcp_counter_lock);
- val += memcg->nocpu_base.count[idx];
- spin_unlock(&memcg->pcp_counter_lock);
-#endif
- put_online_cpus();
return val;
}
@@ -806,15 +841,8 @@ static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
unsigned long val = 0;
int cpu;
- get_online_cpus();
- for_each_online_cpu(cpu)
+ for_each_possible_cpu(cpu)
val += per_cpu(memcg->stat->events[idx], cpu);
-#ifdef CONFIG_HOTPLUG_CPU
- spin_lock(&memcg->pcp_counter_lock);
- val += memcg->nocpu_base.events[idx];
- spin_unlock(&memcg->pcp_counter_lock);
-#endif
- put_online_cpus();
return val;
}
@@ -1035,7 +1063,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
goto out_unlock;
do {
- pos = ACCESS_ONCE(iter->position);
+ pos = READ_ONCE(iter->position);
/*
* A racing update may change the position and
* put the last reference, hence css_tryget(),
@@ -1352,13 +1380,13 @@ static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
unsigned long limit;
count = page_counter_read(&memcg->memory);
- limit = ACCESS_ONCE(memcg->memory.limit);
+ limit = READ_ONCE(memcg->memory.limit);
if (count < limit)
margin = limit - count;
if (do_swap_account) {
count = page_counter_read(&memcg->memsw);
- limit = ACCESS_ONCE(memcg->memsw.limit);
+ limit = READ_ONCE(memcg->memsw.limit);
if (count <= limit)
margin = min(margin, limit - count);
}
@@ -1436,15 +1464,17 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
struct mem_cgroup *iter;
unsigned int i;
- if (!p)
- return;
-
mutex_lock(&oom_info_lock);
rcu_read_lock();
- pr_info("Task in ");
- pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id));
- pr_cont(" killed as a result of limit of ");
+ if (p) {
+ pr_info("Task in ");
+ pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id));
+ pr_cont(" killed as a result of limit of ");
+ } else {
+ pr_info("Memory limit reached of cgroup ");
+ }
+
pr_cont_cgroup_path(memcg->css.cgroup);
pr_cont("\n");
@@ -1521,17 +1551,19 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
unsigned int points = 0;
struct task_struct *chosen = NULL;
+ mutex_lock(&oom_lock);
+
/*
* If current has a pending SIGKILL or is exiting, then automatically
* select it. The goal is to allow it to allocate so that it may
* quickly exit and free its memory.
*/
if (fatal_signal_pending(current) || task_will_free_mem(current)) {
- mark_tsk_oom_victim(current);
- return;
+ mark_oom_victim(current);
+ goto unlock;
}
- check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL);
+ check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL, memcg);
totalpages = mem_cgroup_get_limit(memcg) ? : 1;
for_each_mem_cgroup_tree(iter, memcg) {
struct css_task_iter it;
@@ -1555,7 +1587,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
mem_cgroup_iter_break(memcg, iter);
if (chosen)
put_task_struct(chosen);
- return;
+ goto unlock;
case OOM_SCAN_OK:
break;
};
@@ -1576,11 +1608,13 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
css_task_iter_end(&it);
}
- if (!chosen)
- return;
- points = chosen_points * 1000 / totalpages;
- oom_kill_process(chosen, gfp_mask, order, points, totalpages, memcg,
- NULL, "Memory cgroup out of memory");
+ if (chosen) {
+ points = chosen_points * 1000 / totalpages;
+ oom_kill_process(chosen, gfp_mask, order, points, totalpages,
+ memcg, NULL, "Memory cgroup out of memory");
+ }
+unlock:
+ mutex_unlock(&oom_lock);
}
#if MAX_NUMNODES > 1
@@ -1797,8 +1831,10 @@ static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
{
struct mem_cgroup *iter;
+ spin_lock(&memcg_oom_lock);
for_each_mem_cgroup_tree(iter, memcg)
- atomic_inc(&iter->under_oom);
+ iter->under_oom++;
+ spin_unlock(&memcg_oom_lock);
}
static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
@@ -1807,11 +1843,13 @@ static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
/*
* When a new child is created while the hierarchy is under oom,
- * mem_cgroup_oom_lock() may not be called. We have to use
- * atomic_add_unless() here.
+ * mem_cgroup_oom_lock() may not be called. Watch for underflow.
*/
+ spin_lock(&memcg_oom_lock);
for_each_mem_cgroup_tree(iter, memcg)
- atomic_add_unless(&iter->under_oom, -1, 0);
+ if (iter->under_oom > 0)
+ iter->under_oom--;
+ spin_unlock(&memcg_oom_lock);
}
static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
@@ -1837,17 +1875,18 @@ static int memcg_oom_wake_function(wait_queue_t *wait,
return autoremove_wake_function(wait, mode, sync, arg);
}
-static void memcg_wakeup_oom(struct mem_cgroup *memcg)
-{
- atomic_inc(&memcg->oom_wakeups);
- /* for filtering, pass "memcg" as argument. */
- __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
-}
-
static void memcg_oom_recover(struct mem_cgroup *memcg)
{
- if (memcg && atomic_read(&memcg->under_oom))
- memcg_wakeup_oom(memcg);
+ /*
+ * For the following lockless ->under_oom test, the only required
+ * guarantee is that it must see the state asserted by an OOM when
+ * this function is called as a result of userland actions
+ * triggered by the notification of the OOM. This is trivially
+ * achieved by invoking mem_cgroup_mark_under_oom() before
+ * triggering notification.
+ */
+ if (memcg && memcg->under_oom)
+ __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
}
static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
@@ -2002,6 +2041,7 @@ again:
return memcg;
}
+EXPORT_SYMBOL(mem_cgroup_begin_page_stat);
/**
* mem_cgroup_end_page_stat - finish a page state statistics transaction
@@ -2020,6 +2060,7 @@ void mem_cgroup_end_page_stat(struct mem_cgroup *memcg)
rcu_read_unlock();
}
+EXPORT_SYMBOL(mem_cgroup_end_page_stat);
/**
* mem_cgroup_update_page_stat - update page state statistics
@@ -2160,37 +2201,12 @@ static void drain_all_stock(struct mem_cgroup *root_memcg)
mutex_unlock(&percpu_charge_mutex);
}
-/*
- * This function drains percpu counter value from DEAD cpu and
- * move it to local cpu. Note that this function can be preempted.
- */
-static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu)
-{
- int i;
-
- spin_lock(&memcg->pcp_counter_lock);
- for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
- long x = per_cpu(memcg->stat->count[i], cpu);
-
- per_cpu(memcg->stat->count[i], cpu) = 0;
- memcg->nocpu_base.count[i] += x;
- }
- for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
- unsigned long x = per_cpu(memcg->stat->events[i], cpu);
-
- per_cpu(memcg->stat->events[i], cpu) = 0;
- memcg->nocpu_base.events[i] += x;
- }
- spin_unlock(&memcg->pcp_counter_lock);
-}
-
static int memcg_cpu_hotplug_callback(struct notifier_block *nb,
unsigned long action,
void *hcpu)
{
int cpu = (unsigned long)hcpu;
struct memcg_stock_pcp *stock;
- struct mem_cgroup *iter;
if (action == CPU_ONLINE)
return NOTIFY_OK;
@@ -2198,9 +2214,6 @@ static int memcg_cpu_hotplug_callback(struct notifier_block *nb,
if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
return NOTIFY_OK;
- for_each_mem_cgroup(iter)
- mem_cgroup_drain_pcp_counter(iter, cpu);
-
stock = &per_cpu(memcg_stock, cpu);
drain_stock(stock);
return NOTIFY_OK;
@@ -2314,6 +2327,8 @@ done_restock:
css_get_many(&memcg->css, batch);
if (batch > nr_pages)
refill_stock(memcg, batch - nr_pages);
+ if (!(gfp_mask & __GFP_WAIT))
+ goto done;
/*
* If the hierarchy is above the normal consumption range,
* make the charging task trim their excess contribution.
@@ -2341,20 +2356,6 @@ static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
}
/*
- * A helper function to get mem_cgroup from ID. must be called under
- * rcu_read_lock(). The caller is responsible for calling
- * css_tryget_online() if the mem_cgroup is used for charging. (dropping
- * refcnt from swap can be called against removed memcg.)
- */
-static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
-{
- /* ID 0 is unused ID */
- if (!id)
- return NULL;
- return mem_cgroup_from_id(id);
-}
-
-/*
* try_get_mem_cgroup_from_page - look up page's memcg association
* @page: the page
*
@@ -2380,7 +2381,7 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
ent.val = page_private(page);
id = lookup_swap_cgroup_id(ent);
rcu_read_lock();
- memcg = mem_cgroup_lookup(id);
+ memcg = mem_cgroup_from_id(id);
if (memcg && !css_tryget_online(&memcg->css))
memcg = NULL;
rcu_read_unlock();
@@ -2642,7 +2643,7 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep)
return cachep;
memcg = get_mem_cgroup_from_mm(current->mm);
- kmemcg_id = ACCESS_ONCE(memcg->kmemcg_id);
+ kmemcg_id = READ_ONCE(memcg->kmemcg_id);
if (kmemcg_id < 0)
goto out;
@@ -2779,92 +2780,6 @@ void mem_cgroup_split_huge_fixup(struct page *head)
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
-/**
- * mem_cgroup_move_account - move account of the page
- * @page: the page
- * @nr_pages: number of regular pages (>1 for huge pages)
- * @from: mem_cgroup which the page is moved from.
- * @to: mem_cgroup which the page is moved to. @from != @to.
- *
- * The caller must confirm following.
- * - page is not on LRU (isolate_page() is useful.)
- * - compound_lock is held when nr_pages > 1
- *
- * This function doesn't do "charge" to new cgroup and doesn't do "uncharge"
- * from old cgroup.
- */
-static int mem_cgroup_move_account(struct page *page,
- unsigned int nr_pages,
- struct mem_cgroup *from,
- struct mem_cgroup *to)
-{
- unsigned long flags;
- int ret;
-
- VM_BUG_ON(from == to);
- VM_BUG_ON_PAGE(PageLRU(page), page);
- /*
- * The page is isolated from LRU. So, collapse function
- * will not handle this page. But page splitting can happen.
- * Do this check under compound_page_lock(). The caller should
- * hold it.
- */
- ret = -EBUSY;
- if (nr_pages > 1 && !PageTransHuge(page))
- goto out;
-
- /*
- * Prevent mem_cgroup_migrate() from looking at page->mem_cgroup
- * of its source page while we change it: page migration takes
- * both pages off the LRU, but page cache replacement doesn't.
- */
- if (!trylock_page(page))
- goto out;
-
- ret = -EINVAL;
- if (page->mem_cgroup != from)
- goto out_unlock;
-
- spin_lock_irqsave(&from->move_lock, flags);
-
- if (!PageAnon(page) && page_mapped(page)) {
- __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
- nr_pages);
- __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
- nr_pages);
- }
-
- if (PageWriteback(page)) {
- __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_WRITEBACK],
- nr_pages);
- __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_WRITEBACK],
- nr_pages);
- }
-
- /*
- * It is safe to change page->mem_cgroup here because the page
- * is referenced, charged, and isolated - we can't race with
- * uncharging, charging, migration, or LRU putback.
- */
-
- /* caller should have done css_get */
- page->mem_cgroup = to;
- spin_unlock_irqrestore(&from->move_lock, flags);
-
- ret = 0;
-
- local_irq_disable();
- mem_cgroup_charge_statistics(to, page, nr_pages);
- memcg_check_events(to, page);
- mem_cgroup_charge_statistics(from, page, -nr_pages);
- memcg_check_events(from, page);
- local_irq_enable();
-out_unlock:
- unlock_page(page);
-out:
- return ret;
-}
-
#ifdef CONFIG_MEMCG_SWAP
static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
bool charge)
@@ -3953,7 +3868,7 @@ static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg,
list_add(&event->list, &memcg->oom_notify);
/* already in OOM ? */
- if (atomic_read(&memcg->under_oom))
+ if (memcg->under_oom)
eventfd_signal(eventfd, 1);
spin_unlock(&memcg_oom_lock);
@@ -3982,7 +3897,7 @@ static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v)
struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf));
seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable);
- seq_printf(sf, "under_oom %d\n", (bool)atomic_read(&memcg->under_oom));
+ seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom);
return 0;
}
@@ -4084,6 +3999,98 @@ static void memcg_destroy_kmem(struct mem_cgroup *memcg)
}
#endif
+#ifdef CONFIG_CGROUP_WRITEBACK
+
+struct list_head *mem_cgroup_cgwb_list(struct mem_cgroup *memcg)
+{
+ return &memcg->cgwb_list;
+}
+
+static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
+{
+ return wb_domain_init(&memcg->cgwb_domain, gfp);
+}
+
+static void memcg_wb_domain_exit(struct mem_cgroup *memcg)
+{
+ wb_domain_exit(&memcg->cgwb_domain);
+}
+
+static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)
+{
+ wb_domain_size_changed(&memcg->cgwb_domain);
+}
+
+struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
+
+ if (!memcg->css.parent)
+ return NULL;
+
+ return &memcg->cgwb_domain;
+}
+
+/**
+ * mem_cgroup_wb_stats - retrieve writeback related stats from its memcg
+ * @wb: bdi_writeback in question
+ * @pavail: out parameter for number of available pages
+ * @pdirty: out parameter for number of dirty pages
+ * @pwriteback: out parameter for number of pages under writeback
+ *
+ * Determine the numbers of available, dirty, and writeback pages in @wb's
+ * memcg. Dirty and writeback are self-explanatory. Available is a bit
+ * more involved.
+ *
+ * A memcg's headroom is "min(max, high) - used". The available memory is
+ * calculated as the lowest headroom of itself and the ancestors plus the
+ * number of pages already being used for file pages. Note that this
+ * doesn't consider the actual amount of available memory in the system.
+ * The caller should further cap *@pavail accordingly.
+ */
+void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pavail,
+ unsigned long *pdirty, unsigned long *pwriteback)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
+ struct mem_cgroup *parent;
+ unsigned long head_room = PAGE_COUNTER_MAX;
+ unsigned long file_pages;
+
+ *pdirty = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_DIRTY);
+
+ /* this should eventually include NR_UNSTABLE_NFS */
+ *pwriteback = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_WRITEBACK);
+
+ file_pages = mem_cgroup_nr_lru_pages(memcg, (1 << LRU_INACTIVE_FILE) |
+ (1 << LRU_ACTIVE_FILE));
+ while ((parent = parent_mem_cgroup(memcg))) {
+ unsigned long ceiling = min(memcg->memory.limit, memcg->high);
+ unsigned long used = page_counter_read(&memcg->memory);
+
+ head_room = min(head_room, ceiling - min(ceiling, used));
+ memcg = parent;
+ }
+
+ *pavail = file_pages + head_room;
+}
+
+#else /* CONFIG_CGROUP_WRITEBACK */
+
+static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
+{
+ return 0;
+}
+
+static void memcg_wb_domain_exit(struct mem_cgroup *memcg)
+{
+}
+
+static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)
+{
+}
+
+#endif /* CONFIG_CGROUP_WRITEBACK */
+
/*
* DO NOT USE IN NEW FILES.
*
@@ -4468,9 +4475,15 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
if (!memcg->stat)
goto out_free;
+
+ if (memcg_wb_domain_init(memcg, GFP_KERNEL))
+ goto out_free_stat;
+
spin_lock_init(&memcg->pcp_counter_lock);
return memcg;
+out_free_stat:
+ free_percpu(memcg->stat);
out_free:
kfree(memcg);
return NULL;
@@ -4497,6 +4510,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
free_mem_cgroup_per_zone_info(memcg, node);
free_percpu(memcg->stat);
+ memcg_wb_domain_exit(memcg);
kfree(memcg);
}
@@ -4529,6 +4543,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
/* root ? */
if (parent_css == NULL) {
root_mem_cgroup = memcg;
+ mem_cgroup_root_css = &memcg->css;
page_counter_init(&memcg->memory, NULL);
memcg->high = PAGE_COUNTER_MAX;
memcg->soft_limit = PAGE_COUNTER_MAX;
@@ -4547,7 +4562,9 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
#ifdef CONFIG_MEMCG_KMEM
memcg->kmemcg_id = -1;
#endif
-
+#ifdef CONFIG_CGROUP_WRITEBACK
+ INIT_LIST_HEAD(&memcg->cgwb_list);
+#endif
return &memcg->css;
free_out:
@@ -4635,6 +4652,8 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
vmpressure_cleanup(&memcg->vmpressure);
memcg_deactivate_kmem(memcg);
+
+ wb_memcg_offline(memcg);
}
static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
@@ -4668,6 +4687,7 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
memcg->low = 0;
memcg->high = PAGE_COUNTER_MAX;
memcg->soft_limit = PAGE_COUNTER_MAX;
+ memcg_wb_domain_size_changed(memcg);
}
#ifdef CONFIG_MMU
@@ -4816,6 +4836,111 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
return page;
}
+/**
+ * mem_cgroup_move_account - move account of the page
+ * @page: the page
+ * @nr_pages: number of regular pages (>1 for huge pages)
+ * @from: mem_cgroup which the page is moved from.
+ * @to: mem_cgroup which the page is moved to. @from != @to.
+ *
+ * The caller must confirm following.
+ * - page is not on LRU (isolate_page() is useful.)
+ * - compound_lock is held when nr_pages > 1
+ *
+ * This function doesn't do "charge" to new cgroup and doesn't do "uncharge"
+ * from old cgroup.
+ */
+static int mem_cgroup_move_account(struct page *page,
+ unsigned int nr_pages,
+ struct mem_cgroup *from,
+ struct mem_cgroup *to)
+{
+ unsigned long flags;
+ int ret;
+ bool anon;
+
+ VM_BUG_ON(from == to);
+ VM_BUG_ON_PAGE(PageLRU(page), page);
+ /*
+ * The page is isolated from LRU. So, collapse function
+ * will not handle this page. But page splitting can happen.
+ * Do this check under compound_page_lock(). The caller should
+ * hold it.
+ */
+ ret = -EBUSY;
+ if (nr_pages > 1 && !PageTransHuge(page))
+ goto out;
+
+ /*
+ * Prevent mem_cgroup_migrate() from looking at page->mem_cgroup
+ * of its source page while we change it: page migration takes
+ * both pages off the LRU, but page cache replacement doesn't.
+ */
+ if (!trylock_page(page))
+ goto out;
+
+ ret = -EINVAL;
+ if (page->mem_cgroup != from)
+ goto out_unlock;
+
+ anon = PageAnon(page);
+
+ spin_lock_irqsave(&from->move_lock, flags);
+
+ if (!anon && page_mapped(page)) {
+ __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
+ nr_pages);
+ __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
+ nr_pages);
+ }
+
+ /*
+ * move_lock grabbed above and caller set from->moving_account, so
+ * mem_cgroup_update_page_stat() will serialize updates to PageDirty.
+ * So mapping should be stable for dirty pages.
+ */
+ if (!anon && PageDirty(page)) {
+ struct address_space *mapping = page_mapping(page);
+
+ if (mapping_cap_account_dirty(mapping)) {
+ __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_DIRTY],
+ nr_pages);
+ __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_DIRTY],
+ nr_pages);
+ }
+ }
+
+ if (PageWriteback(page)) {
+ __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_WRITEBACK],
+ nr_pages);
+ __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_WRITEBACK],
+ nr_pages);
+ }
+
+ /*
+ * It is safe to change page->mem_cgroup here because the page
+ * is referenced, charged, and isolated - we can't race with
+ * uncharging, charging, migration, or LRU putback.
+ */
+
+ /* caller should have done css_get */
+ page->mem_cgroup = to;
+ spin_unlock_irqrestore(&from->move_lock, flags);
+
+ ret = 0;
+
+ local_irq_disable();
+ mem_cgroup_charge_statistics(to, page, nr_pages);
+ memcg_check_events(to, page);
+ mem_cgroup_charge_statistics(from, page, -nr_pages);
+ memcg_check_events(from, page);
+ local_irq_enable();
+out_unlock:
+ unlock_page(page);
+out:
+ return ret;
+}
+
static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
unsigned long addr, pte_t ptent, union mc_target *target)
{
@@ -5012,7 +5137,7 @@ static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
* tunable will only affect upcoming migrations, not the current one.
* So we need to save it, and keep it going.
*/
- move_flags = ACCESS_ONCE(memcg->move_charge_at_immigrate);
+ move_flags = READ_ONCE(memcg->move_charge_at_immigrate);
if (move_flags) {
struct mm_struct *mm;
struct mem_cgroup *from = mem_cgroup_from_task(p);
@@ -5246,7 +5371,7 @@ static u64 memory_current_read(struct cgroup_subsys_state *css,
static int memory_low_show(struct seq_file *m, void *v)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
- unsigned long low = ACCESS_ONCE(memcg->low);
+ unsigned long low = READ_ONCE(memcg->low);
if (low == PAGE_COUNTER_MAX)
seq_puts(m, "max\n");
@@ -5276,7 +5401,7 @@ static ssize_t memory_low_write(struct kernfs_open_file *of,
static int memory_high_show(struct seq_file *m, void *v)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
- unsigned long high = ACCESS_ONCE(memcg->high);
+ unsigned long high = READ_ONCE(memcg->high);
if (high == PAGE_COUNTER_MAX)
seq_puts(m, "max\n");
@@ -5300,13 +5425,14 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
memcg->high = high;
+ memcg_wb_domain_size_changed(memcg);
return nbytes;
}
static int memory_max_show(struct seq_file *m, void *v)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
- unsigned long max = ACCESS_ONCE(memcg->memory.limit);
+ unsigned long max = READ_ONCE(memcg->memory.limit);
if (max == PAGE_COUNTER_MAX)
seq_puts(m, "max\n");
@@ -5332,6 +5458,7 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
if (err)
return err;
+ memcg_wb_domain_size_changed(memcg);
return nbytes;
}
@@ -5838,9 +5965,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
if (!mem_cgroup_is_root(memcg))
page_counter_uncharge(&memcg->memory, 1);
- /* XXX: caller holds IRQ-safe mapping->tree_lock */
- VM_BUG_ON(!irqs_disabled());
-
+ /* Caller disabled preemption with mapping->tree_lock */
mem_cgroup_charge_statistics(memcg, page, -1);
memcg_check_events(memcg, page);
}
@@ -5861,7 +5986,7 @@ void mem_cgroup_uncharge_swap(swp_entry_t entry)
id = swap_cgroup_record(entry, 0);
rcu_read_lock();
- memcg = mem_cgroup_lookup(id);
+ memcg = mem_cgroup_from_id(id);
if (memcg) {
if (!mem_cgroup_is_root(memcg))
page_counter_uncharge(&memcg->memsw, 1);