From ffbdccf5e1facd18b54429a749667fb185c10f20 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 3 Jul 2013 15:01:23 -0700 Subject: mm, memcg: don't take task_lock in task_in_mem_cgroup For processes that have detached their mm's, task_in_mem_cgroup() unnecessarily takes task_lock() when rcu_read_lock() is all that is necessary to call mem_cgroup_from_task(). While we're here, switch task_in_mem_cgroup() to return bool. Signed-off-by: David Rientjes Cc: KAMEZAWA Hiroyuki Cc: Johannes Weiner Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 194721839cf5..4748966b1511 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1448,11 +1448,12 @@ static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg, return ret; } -int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg) +bool task_in_mem_cgroup(struct task_struct *task, + const struct mem_cgroup *memcg) { - int ret; struct mem_cgroup *curr = NULL; struct task_struct *p; + bool ret; p = find_lock_task_mm(task); if (p) { @@ -1464,14 +1465,14 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg) * killer still needs to detect if they have already been oom * killed to prevent needlessly killing additional tasks. */ - task_lock(task); + rcu_read_lock(); curr = mem_cgroup_from_task(task); if (curr) css_get(&curr->css); - task_unlock(task); + rcu_read_unlock(); } if (!curr) - return 0; + return false; /* * We should check use_hierarchy of "memcg" not "curr". Because checking * use_hierarchy of "curr" here make this function true if hierarchy is -- cgit v1.2.3 From 519ebea3bf6df45439e79c54bda1d9e29fe13a64 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 3 Jul 2013 15:04:51 -0700 Subject: mm: memcontrol: factor out reclaim iterator loading and updating mem_cgroup_iter() is too hard to follow. Factor out the lockless reclaim iterator loading and updating so it's easier to follow the big picture. Also document the iterator invalidation mechanism a bit more extensively. Signed-off-by: Johannes Weiner Reported-by: Tejun Heo Reviewed-by: Tejun Heo Acked-by: Michal Hocko Cc: KAMEZAWA Hiroyuki Cc: Glauber Costa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 86 ++++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 57 insertions(+), 29 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 4748966b1511..2e851f453814 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1148,6 +1148,58 @@ skip_node: return NULL; } +static void mem_cgroup_iter_invalidate(struct mem_cgroup *root) +{ + /* + * When a group in the hierarchy below root is destroyed, the + * hierarchy iterator can no longer be trusted since it might + * have pointed to the destroyed group. Invalidate it. + */ + atomic_inc(&root->dead_count); +} + +static struct mem_cgroup * +mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter, + struct mem_cgroup *root, + int *sequence) +{ + struct mem_cgroup *position = NULL; + /* + * A cgroup destruction happens in two stages: offlining and + * release. They are separated by a RCU grace period. + * + * If the iterator is valid, we may still race with an + * offlining. The RCU lock ensures the object won't be + * released, tryget will fail if we lost the race. + */ + *sequence = atomic_read(&root->dead_count); + if (iter->last_dead_count == *sequence) { + smp_rmb(); + position = iter->last_visited; + if (position && !css_tryget(&position->css)) + position = NULL; + } + return position; +} + +static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter, + struct mem_cgroup *last_visited, + struct mem_cgroup *new_position, + int sequence) +{ + if (last_visited) + css_put(&last_visited->css); + /* + * We store the sequence count from the time @last_visited was + * loaded successfully instead of rereading it here so that we + * don't lose destruction events in between. We could have + * raced with the destruction of @new_position after all. + */ + iter->last_visited = new_position; + smp_wmb(); + iter->last_dead_count = sequence; +} + /** * mem_cgroup_iter - iterate over memory cgroup hierarchy * @root: hierarchy root @@ -1171,7 +1223,6 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, { struct mem_cgroup *memcg = NULL; struct mem_cgroup *last_visited = NULL; - unsigned long uninitialized_var(dead_count); if (mem_cgroup_disabled()) return NULL; @@ -1191,6 +1242,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, rcu_read_lock(); while (!memcg) { struct mem_cgroup_reclaim_iter *uninitialized_var(iter); + int uninitialized_var(seq); if (reclaim) { int nid = zone_to_nid(reclaim->zone); @@ -1204,37 +1256,13 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, goto out_unlock; } - /* - * If the dead_count mismatches, a destruction - * has happened or is happening concurrently. - * If the dead_count matches, a destruction - * might still happen concurrently, but since - * we checked under RCU, that destruction - * won't free the object until we release the - * RCU reader lock. Thus, the dead_count - * check verifies the pointer is still valid, - * css_tryget() verifies the cgroup pointed to - * is alive. - */ - dead_count = atomic_read(&root->dead_count); - if (dead_count == iter->last_dead_count) { - smp_rmb(); - last_visited = iter->last_visited; - if (last_visited && - !css_tryget(&last_visited->css)) - last_visited = NULL; - } + last_visited = mem_cgroup_iter_load(iter, root, &seq); } memcg = __mem_cgroup_iter_next(root, last_visited); if (reclaim) { - if (last_visited) - css_put(&last_visited->css); - - iter->last_visited = memcg; - smp_wmb(); - iter->last_dead_count = dead_count; + mem_cgroup_iter_update(iter, last_visited, memcg, seq); if (!memcg) iter->generation++; @@ -6318,14 +6346,14 @@ static void mem_cgroup_invalidate_reclaim_iterators(struct mem_cgroup *memcg) struct mem_cgroup *parent = memcg; while ((parent = parent_mem_cgroup(parent))) - atomic_inc(&parent->dead_count); + mem_cgroup_iter_invalidate(parent); /* * if the root memcg is not hierarchical we have to check it * explicitely. */ if (!root_mem_cgroup->use_hierarchy) - atomic_inc(&root_mem_cgroup->dead_count); + mem_cgroup_iter_invalidate(root_mem_cgroup); } static void mem_cgroup_css_offline(struct cgroup *cont) -- cgit v1.2.3 From 54f72fe022d9b2c4de40043a118881121190a117 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 8 Jul 2013 15:59:49 -0700 Subject: memcg: clean up memcg->nodeinfo Remove struct mem_cgroup_lru_info and fold its single member, the variably sized nodeinfo[0], directly into struct mem_cgroup. This should make it more obvious why it has to be the last member there. Also move the comment that's above that special last member below it, so it is more visible to somebody that considers appending to the struct mem_cgroup. Signed-off-by: Johannes Weiner Cc: David Rientjes Acked-by: Michal Hocko Cc: Glauber Costa Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 20 +++++--------------- 1 file changed, 5 insertions(+), 15 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2e851f453814..2b7cd24d4cda 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -187,10 +187,6 @@ struct mem_cgroup_per_node { struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES]; }; -struct mem_cgroup_lru_info { - struct mem_cgroup_per_node *nodeinfo[0]; -}; - /* * Cgroups above their limits are maintained in a RB-Tree, independent of * their hierarchy representation @@ -366,14 +362,8 @@ struct mem_cgroup { atomic_t numainfo_updating; #endif - /* - * Per cgroup active and inactive list, similar to the - * per zone LRU lists. - * - * WARNING: This has to be the last element of the struct. Don't - * add new fields after this point. - */ - struct mem_cgroup_lru_info info; + struct mem_cgroup_per_node *nodeinfo[0]; + /* WARNING: nodeinfo must be the last member here */ }; static size_t memcg_size(void) @@ -683,7 +673,7 @@ static struct mem_cgroup_per_zone * mem_cgroup_zoneinfo(struct mem_cgroup *memcg, int nid, int zid) { VM_BUG_ON((unsigned)nid >= nr_node_ids); - return &memcg->info.nodeinfo[nid]->zoneinfo[zid]; + return &memcg->nodeinfo[nid]->zoneinfo[zid]; } struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg) @@ -6087,13 +6077,13 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) mz->on_tree = false; mz->memcg = memcg; } - memcg->info.nodeinfo[node] = pn; + memcg->nodeinfo[node] = pn; return 0; } static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) { - kfree(memcg->info.nodeinfo[node]); + kfree(memcg->nodeinfo[node]); } static struct mem_cgroup *mem_cgroup_alloc(void) -- cgit v1.2.3 From 6d42c232bd1e77288b2660153299b7d12a5c8e15 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Mon, 8 Jul 2013 16:00:00 -0700 Subject: memcg: also test for skip accounting at the page allocation level The memory we used to hold the memcg arrays is currently accounted to the current memcg. But that creates a problem, because that memory can only be freed after the last user is gone. Our only way to know which is the last user, is to hook up to freeing time, but the fact that we still have some in flight kmallocs will prevent freeing to happen. I believe therefore to be just easier to account this memory as global overhead. This patch (of 2): Disabling accounting is only relevant for some specific memcg internal allocations. Therefore we would initially not have such check at memcg_kmem_newpage_charge, since direct calls to the page allocator that are marked with GFP_KMEMCG only happen outside memcg core. We are mostly concerned with cache allocations and by having this test at memcg_kmem_get_cache we are already able to relay the allocation to the root cache and bypass the memcg caches altogether. There is one exception, though: the SLUB allocator does not create large order caches, but rather service large kmallocs directly from the page allocator. Therefore, the following sequence, when backed by the SLUB allocator: memcg_stop_kmem_account(); kmalloc() memcg_resume_kmem_account(); would effectively ignore the fact that we should skip accounting, since it will drive us directly to this function without passing through the cache selector memcg_kmem_get_cache. Such large allocations are extremely rare but can happen, for instance, for the cache arrays. This was never a problem in practice, because we weren't skipping accounting for the cache arrays. All the allocations we were skipping were fairly small. However, the fact that we were not skipping those allocations are a problem and can prevent the memcgs from going away. As we fix that, we need to make sure that the fix will also work with the SLUB allocator. Signed-off-by: Glauber Costa Reported-by: Michal Hocko Cc: Johannes Weiner Cc: Kamezawa Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2b7cd24d4cda..06a595fd6400 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3637,6 +3637,34 @@ __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order) int ret; *_memcg = NULL; + + /* + * Disabling accounting is only relevant for some specific memcg + * internal allocations. Therefore we would initially not have such + * check here, since direct calls to the page allocator that are marked + * with GFP_KMEMCG only happen outside memcg core. We are mostly + * concerned with cache allocations, and by having this test at + * memcg_kmem_get_cache, we are already able to relay the allocation to + * the root cache and bypass the memcg cache altogether. + * + * There is one exception, though: the SLUB allocator does not create + * large order caches, but rather service large kmallocs directly from + * the page allocator. Therefore, the following sequence when backed by + * the SLUB allocator: + * + * memcg_stop_kmem_account(); + * kmalloc() + * memcg_resume_kmem_account(); + * + * would effectively ignore the fact that we should skip accounting, + * since it will drive us directly to this function without passing + * through the cache selector memcg_kmem_get_cache. Such large + * allocations are extremely rare but can happen, for instance, for the + * cache arrays. We bring this test here. + */ + if (!current->mm || current->memcg_kmem_skip_account) + return true; + memcg = try_get_mem_cgroup_from_mm(current->mm); /* -- cgit v1.2.3 From 425c598d583883c33c75780225ba8e0794b43bd9 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Mon, 8 Jul 2013 16:00:01 -0700 Subject: memcg: do not account memory used for cache creation The memory we used to hold the memcg arrays is currently accounted to the current memcg. But that creates a problem, because that memory can only be freed after the last user is gone. Our only way to know which is the last user, is to hook up to freeing time, but the fact that we still have some in flight kmallocs will prevent freeing to happen. I believe therefore to be just easier to account this memory as global overhead. Signed-off-by: Glauber Costa Cc: Johannes Weiner Cc: Michal Hocko Cc: Kamezawa Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 06a595fd6400..64f726599ff0 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5232,7 +5232,9 @@ static int memcg_propagate_kmem(struct mem_cgroup *memcg) static_key_slow_inc(&memcg_kmem_enabled_key); mutex_lock(&set_limit_mutex); + memcg_stop_kmem_account(); ret = memcg_update_cache_sizes(memcg); + memcg_resume_kmem_account(); mutex_unlock(&set_limit_mutex); out: return ret; -- cgit v1.2.3 From fa460c2d37870e0a6f94c70e8b76d05ca11b6db0 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Mon, 8 Jul 2013 16:00:27 -0700 Subject: Revert "memcg: avoid dangling reference count in creation failure" This reverts commit e4715f01be697a. mem_cgroup_put is hierarchy aware so mem_cgroup_put(memcg) already drops an additional reference from all parents so the additional mem_cgrroup_put(parent) potentially causes use-after-free. Signed-off-by: Michal Hocko Signed-off-by: Li Zefan Acked-by: KAMEZAWA Hiroyuki Cc: Hugh Dickins Cc: Tejun Heo Cc: Glauber Costa Cc: Johannes Weiner Cc: [3.9+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 64f726599ff0..6b73d8657d64 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -6352,8 +6352,6 @@ mem_cgroup_css_online(struct cgroup *cont) * call __mem_cgroup_free, so return directly */ mem_cgroup_put(memcg); - if (parent->use_hierarchy) - mem_cgroup_put(parent); } return error; } -- cgit v1.2.3 From f37a96914d1aea10fed8d9af10251f0b9caea31b Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Mon, 8 Jul 2013 16:00:29 -0700 Subject: memcg, kmem: fix reference count handling on the error path mem_cgroup_css_online calls mem_cgroup_put if memcg_init_kmem fails. This is not correct because only memcg_propagate_kmem takes an additional reference while mem_cgroup_sockets_init is allowed to fail as well (although no current implementation fails) but it doesn't take any reference. This all suggests that it should be memcg_propagate_kmem that should clean up after itself so this patch moves mem_cgroup_put over there. Unfortunately this is not that easy (as pointed out by Li Zefan) because memcg_kmem_mark_dead marks the group dead (KMEM_ACCOUNTED_DEAD) if it is marked active (KMEM_ACCOUNTED_ACTIVE) which is the case even if memcg_propagate_kmem fails so the additional reference is dropped in that case in kmem_cgroup_destroy which means that the reference would be dropped two times. The easiest way then would be to simply remove mem_cgrroup_put from mem_cgroup_css_online and rely on kmem_cgroup_destroy doing the right thing. Signed-off-by: Michal Hocko Signed-off-by: Li Zefan Acked-by: KAMEZAWA Hiroyuki Cc: Hugh Dickins Cc: Tejun Heo Cc: Glauber Costa Cc: Johannes Weiner Cc: [3.8] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6b73d8657d64..bdeb82ca6c20 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -6345,14 +6345,6 @@ mem_cgroup_css_online(struct cgroup *cont) error = memcg_init_kmem(memcg, &mem_cgroup_subsys); mutex_unlock(&memcg_create_mutex); - if (error) { - /* - * We call put now because our (and parent's) refcnts - * are already in place. mem_cgroup_put() will internally - * call __mem_cgroup_free, so return directly - */ - mem_cgroup_put(memcg); - } return error; } -- cgit v1.2.3 From 5347e5ae13710420eebbbd0b22c045685704da80 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 8 Jul 2013 16:00:30 -0700 Subject: memcg: use css_get() in sock_update_memcg() Use css_get/css_put instead of mem_cgroup_get/put. Note, if at the same time someone is moving @current to a different cgroup and removing the old cgroup, css_tryget() may return false, and sock->sk_cgrp won't be initialized, which is fine. Signed-off-by: Li Zefan Acked-by: KAMEZAWA Hiroyuki Acked-by: Michal Hocko Cc: Hugh Dickins Cc: Tejun Heo Cc: Glauber Costa Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index bdeb82ca6c20..4c31a21a55be 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -551,15 +551,15 @@ void sock_update_memcg(struct sock *sk) */ if (sk->sk_cgrp) { BUG_ON(mem_cgroup_is_root(sk->sk_cgrp->memcg)); - mem_cgroup_get(sk->sk_cgrp->memcg); + css_get(&sk->sk_cgrp->memcg->css); return; } rcu_read_lock(); memcg = mem_cgroup_from_task(current); cg_proto = sk->sk_prot->proto_cgroup(memcg); - if (!mem_cgroup_is_root(memcg) && memcg_proto_active(cg_proto)) { - mem_cgroup_get(memcg); + if (!mem_cgroup_is_root(memcg) && + memcg_proto_active(cg_proto) && css_tryget(&memcg->css)) { sk->sk_cgrp = cg_proto; } rcu_read_unlock(); @@ -573,7 +573,7 @@ void sock_release_memcg(struct sock *sk) struct mem_cgroup *memcg; WARN_ON(!sk->sk_cgrp->memcg); memcg = sk->sk_cgrp->memcg; - mem_cgroup_put(memcg); + css_put(&sk->sk_cgrp->memcg->css); } } -- cgit v1.2.3 From 20f05310ba62d5816fb339d08effe78683137197 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 8 Jul 2013 16:00:31 -0700 Subject: memcg: don't use mem_cgroup_get() when creating a kmemcg cache Use css_get()/css_put() instead of mem_cgroup_get()/mem_cgroup_put(). There are two things being done in the current code: First, we acquired a css_ref to make sure that the underlying cgroup would not go away. That is a short lived reference, and it is put as soon as the cache is created. At this point, we acquire a long-lived per-cache memcg reference count to guarantee that the memcg will still be alive. so it is: enqueue: css_get create : memcg_get, css_put destroy: memcg_put So we only need to get rid of the memcg_get, change the memcg_put to css_put, and get rid of the now extra css_put. (This changelog is mostly written by Glauber) Signed-off-by: Li Zefan Acked-by: Michal Hocko Acked-by: KAMEZAWA Hiroyuki Cc: Hugh Dickins Cc: Tejun Heo Cc: Glauber Costa Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 4c31a21a55be..80175ded718d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3242,7 +3242,7 @@ void memcg_release_cache(struct kmem_cache *s) list_del(&s->memcg_params->list); mutex_unlock(&memcg->slab_caches_mutex); - mem_cgroup_put(memcg); + css_put(&memcg->css); out: kfree(s->memcg_params); } @@ -3402,16 +3402,18 @@ static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, mutex_lock(&memcg_cache_mutex); new_cachep = cachep->memcg_params->memcg_caches[idx]; - if (new_cachep) + if (new_cachep) { + css_put(&memcg->css); goto out; + } new_cachep = kmem_cache_dup(memcg, cachep); if (new_cachep == NULL) { new_cachep = cachep; + css_put(&memcg->css); goto out; } - mem_cgroup_get(memcg); atomic_set(&new_cachep->memcg_params->nr_pages , 0); cachep->memcg_params->memcg_caches[idx] = new_cachep; @@ -3499,8 +3501,6 @@ static void memcg_create_cache_work_func(struct work_struct *w) cw = container_of(w, struct create_work, work); memcg_create_kmem_cache(cw->memcg, cw->cachep); - /* Drop the reference gotten when we enqueued. */ - css_put(&cw->memcg->css); kfree(cw); } -- cgit v1.2.3 From 10d5ebf40ff09db03b97cb177f24b9c7c8b4bb52 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 8 Jul 2013 16:00:33 -0700 Subject: memcg: use css_get/put when charging/uncharging kmem Use css_get/put instead of mem_cgroup_get/put. We can't do a simple replacement, because here mem_cgroup_put() is called during mem_cgroup_css_free(), while mem_cgroup_css_free() won't be called until css refcnt goes down to 0. Instead we increment css refcnt in mem_cgroup_css_offline(), and then check if there's still kmem charges. If not, css refcnt will be decremented immediately, otherwise the refcnt will be released after the last kmem allocation is uncahred. [akpm@linux-foundation.org: tweak comment] Signed-off-by: Li Zefan Acked-by: Michal Hocko Acked-by: KAMEZAWA Hiroyuki Reviewed-by: Tejun Heo Cc: Michal Hocko Cc: Glauber Costa Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 80 ++++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 54 insertions(+), 26 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 80175ded718d..bdc9582585af 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -406,6 +406,11 @@ static void memcg_kmem_clear_activated(struct mem_cgroup *memcg) static void memcg_kmem_mark_dead(struct mem_cgroup *memcg) { + /* + * Our caller must use css_get() first, because memcg_uncharge_kmem() + * will call css_put() if it sees the memcg is dead. + */ + smp_wmb(); if (test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags)) set_bit(KMEM_ACCOUNTED_DEAD, &memcg->kmem_account_flags); } @@ -3050,8 +3055,16 @@ static void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size) if (res_counter_uncharge(&memcg->kmem, size)) return; + /* + * Releases a reference taken in kmem_cgroup_css_offline in case + * this last uncharge is racing with the offlining code or it is + * outliving the memcg existence. + * + * The memory barrier imposed by test&clear is paired with the + * explicit one in memcg_kmem_mark_dead(). + */ if (memcg_kmem_test_and_clear_dead(memcg)) - mem_cgroup_put(memcg); + css_put(&memcg->css); } void memcg_cache_list_add(struct mem_cgroup *memcg, struct kmem_cache *cachep) @@ -5183,14 +5196,6 @@ static int memcg_update_kmem_limit(struct cgroup *cont, u64 val) * starts accounting before all call sites are patched */ memcg_kmem_set_active(memcg); - - /* - * kmem charges can outlive the cgroup. In the case of slab - * pages, for instance, a page contain objects from various - * processes, so it is unfeasible to migrate them away. We - * need to reference count the memcg because of that. - */ - mem_cgroup_get(memcg); } else ret = res_counter_set_limit(&memcg->kmem, val); out: @@ -5223,12 +5228,10 @@ static int memcg_propagate_kmem(struct mem_cgroup *memcg) goto out; /* - * destroy(), called if we fail, will issue static_key_slow_inc() and - * mem_cgroup_put() if kmem is enabled. We have to either call them - * unconditionally, or clear the KMEM_ACTIVE flag. I personally find - * this more consistent, since it always leads to the same destroy path + * __mem_cgroup_free() will issue static_key_slow_dec() because this + * memcg is active already. If the later initialization fails then the + * cgroup core triggers the cleanup so we do not have to do it here. */ - mem_cgroup_get(memcg); static_key_slow_inc(&memcg_kmem_enabled_key); mutex_lock(&set_limit_mutex); @@ -5913,23 +5916,43 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) return mem_cgroup_sockets_init(memcg, ss); } -static void kmem_cgroup_destroy(struct mem_cgroup *memcg) +static void memcg_destroy_kmem(struct mem_cgroup *memcg) { mem_cgroup_sockets_destroy(memcg); +} + +static void kmem_cgroup_css_offline(struct mem_cgroup *memcg) +{ + if (!memcg_kmem_is_active(memcg)) + return; + + /* + * kmem charges can outlive the cgroup. In the case of slab + * pages, for instance, a page contain objects from various + * processes. As we prevent from taking a reference for every + * such allocation we have to be careful when doing uncharge + * (see memcg_uncharge_kmem) and here during offlining. + * + * The idea is that that only the _last_ uncharge which sees + * the dead memcg will drop the last reference. An additional + * reference is taken here before the group is marked dead + * which is then paired with css_put during uncharge resp. here. + * + * Although this might sound strange as this path is called from + * css_offline() when the referencemight have dropped down to 0 + * and shouldn't be incremented anymore (css_tryget would fail) + * we do not have other options because of the kmem allocations + * lifetime. + */ + css_get(&memcg->css); memcg_kmem_mark_dead(memcg); if (res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0) return; - /* - * Charges already down to 0, undo mem_cgroup_get() done in the charge - * path here, being careful not to race with memcg_uncharge_kmem: it is - * possible that the charges went down to 0 between mark_dead and the - * res_counter read, so in that case, we don't need the put - */ if (memcg_kmem_test_and_clear_dead(memcg)) - mem_cgroup_put(memcg); + css_put(&memcg->css); } #else static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) @@ -5937,7 +5960,11 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) return 0; } -static void kmem_cgroup_destroy(struct mem_cgroup *memcg) +static void memcg_destroy_kmem(struct mem_cgroup *memcg) +{ +} + +static void kmem_cgroup_css_offline(struct mem_cgroup *memcg) { } #endif @@ -6370,6 +6397,8 @@ static void mem_cgroup_css_offline(struct cgroup *cont) { struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); + kmem_cgroup_css_offline(memcg); + mem_cgroup_invalidate_reclaim_iterators(memcg); mem_cgroup_reparent_charges(memcg); mem_cgroup_destroy_all_caches(memcg); @@ -6379,9 +6408,8 @@ static void mem_cgroup_css_free(struct cgroup *cont) { struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); - kmem_cgroup_destroy(memcg); - - mem_cgroup_put(memcg); + memcg_destroy_kmem(memcg); + __mem_cgroup_free(memcg); } #ifdef CONFIG_MMU -- cgit v1.2.3 From 4050377b509b326c14b275fedb2f69b46f37a7a9 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 8 Jul 2013 16:00:34 -0700 Subject: memcg: use css_get/put for swap memcg Use css_get/put instead of mem_cgroup_get/put. A simple replacement will do. The historical reason that memcg has its own refcnt instead of always using css_get/put, is that cgroup couldn't be removed if there're still css refs, so css refs can't be used as long-lived reference. The situation has changed so that rmdir a cgroup will succeed regardless css refs, but won't be freed until css refs goes down to 0. Signed-off-by: Li Zefan Acked-by: Michal Hocko Acked-by: KAMEZAWA Hiroyuki Cc: Hugh Dickins Cc: Tejun Heo Cc: Glauber Costa Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index bdc9582585af..76c0c99b002f 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4231,12 +4231,12 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype, unlock_page_cgroup(pc); /* * even after unlock, we have memcg->res.usage here and this memcg - * will never be freed. + * will never be freed, so it's safe to call css_get(). */ memcg_check_events(memcg, page); if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) { mem_cgroup_swap_statistics(memcg, true); - mem_cgroup_get(memcg); + css_get(&memcg->css); } /* * Migration does not charge the res_counter for the @@ -4348,7 +4348,7 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout) /* * record memcg information, if swapout && memcg != NULL, - * mem_cgroup_get() was called in uncharge(). + * css_get() was called in uncharge(). */ if (do_swap_account && swapout && memcg) swap_cgroup_record(ent, css_id(&memcg->css)); @@ -4379,7 +4379,7 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent) if (!mem_cgroup_is_root(memcg)) res_counter_uncharge(&memcg->memsw, PAGE_SIZE); mem_cgroup_swap_statistics(memcg, false); - mem_cgroup_put(memcg); + css_put(&memcg->css); } rcu_read_unlock(); } @@ -4413,11 +4413,14 @@ static int mem_cgroup_move_swap_account(swp_entry_t entry, * This function is only called from task migration context now. * It postpones res_counter and refcount handling till the end * of task migration(mem_cgroup_clear_mc()) for performance - * improvement. But we cannot postpone mem_cgroup_get(to) - * because if the process that has been moved to @to does - * swap-in, the refcount of @to might be decreased to 0. + * improvement. But we cannot postpone css_get(to) because if + * the process that has been moved to @to does swap-in, the + * refcount of @to might be decreased to 0. + * + * We are in attach() phase, so the cgroup is guaranteed to be + * alive, so we can just call css_get(). */ - mem_cgroup_get(to); + css_get(&to->css); return 0; } return -EINVAL; @@ -6718,6 +6721,7 @@ static void __mem_cgroup_clear_mc(void) { struct mem_cgroup *from = mc.from; struct mem_cgroup *to = mc.to; + int i; /* we must uncharge all the leftover precharges from mc.to */ if (mc.precharge) { @@ -6738,7 +6742,9 @@ static void __mem_cgroup_clear_mc(void) if (!mem_cgroup_is_root(mc.from)) res_counter_uncharge(&mc.from->memsw, PAGE_SIZE * mc.moved_swap); - __mem_cgroup_put(mc.from, mc.moved_swap); + + for (i = 0; i < mc.moved_swap; i++) + css_put(&mc.from->css); if (!mem_cgroup_is_root(mc.to)) { /* @@ -6748,7 +6754,7 @@ static void __mem_cgroup_clear_mc(void) res_counter_uncharge(&mc.to->res, PAGE_SIZE * mc.moved_swap); } - /* we've already done mem_cgroup_get(mc.to) */ + /* we've already done css_get(mc.to) */ mc.moved_swap = 0; } memcg_oom_recover(from); -- cgit v1.2.3 From 8d76a9797882fc517d87e2b5db2a4f04edaeccec Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 8 Jul 2013 16:00:36 -0700 Subject: memcg: don't need to get a reference to the parent The cgroup core guarantees it's always safe to access the parent. Signed-off-by: Li Zefan Acked-by: Michal Hocko Acked-by: KAMEZAWA Hiroyuki Cc: Hugh Dickins Cc: Tejun Heo Cc: Glauber Costa Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 19 +++---------------- 1 file changed, 3 insertions(+), 16 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 76c0c99b002f..c508258d61a1 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -503,7 +503,6 @@ enum res_type { */ static DEFINE_MUTEX(memcg_create_mutex); -static void mem_cgroup_get(struct mem_cgroup *memcg); static void mem_cgroup_put(struct mem_cgroup *memcg); static inline @@ -6239,19 +6238,10 @@ static void free_rcu(struct rcu_head *rcu_head) schedule_work(&memcg->work_freeing); } -static void mem_cgroup_get(struct mem_cgroup *memcg) -{ - atomic_inc(&memcg->refcnt); -} - static void __mem_cgroup_put(struct mem_cgroup *memcg, int count) { - if (atomic_sub_and_test(count, &memcg->refcnt)) { - struct mem_cgroup *parent = parent_mem_cgroup(memcg); + if (atomic_sub_and_test(count, &memcg->refcnt)) call_rcu(&memcg->rcu_freeing, free_rcu); - if (parent) - mem_cgroup_put(parent); - } } static void mem_cgroup_put(struct mem_cgroup *memcg) @@ -6354,12 +6344,9 @@ mem_cgroup_css_online(struct cgroup *cont) res_counter_init(&memcg->kmem, &parent->kmem); /* - * We increment refcnt of the parent to ensure that we can - * safely access it on res_counter_charge/uncharge. - * This refcnt will be decremented when freeing this - * mem_cgroup(see mem_cgroup_put). + * No need to take a reference to the parent because cgroup + * core guarantees its existence. */ - mem_cgroup_get(parent); } else { res_counter_init(&memcg->res, NULL); res_counter_init(&memcg->memsw, NULL); -- cgit v1.2.3 From e0743e6bc5b7587dd0bfa902d67d3f81ef3f6618 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 8 Jul 2013 16:00:37 -0700 Subject: memcg: kill memcg refcnt Now memcg has the same life cycle as its corresponding cgroup. Kill the useless refcnt. Signed-off-by: Li Zefan Acked-by: Michal Hocko Acked-by: KAMEZAWA Hiroyuki Cc: Hugh Dickins Cc: Tejun Heo Cc: Glauber Costa Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 18 +----------------- 1 file changed, 1 insertion(+), 17 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c508258d61a1..fa521a2f4bf6 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -299,8 +299,6 @@ struct mem_cgroup { bool oom_lock; atomic_t under_oom; - atomic_t refcnt; - int swappiness; /* OOM-Killer disable */ int oom_kill_disable; @@ -503,8 +501,6 @@ enum res_type { */ static DEFINE_MUTEX(memcg_create_mutex); -static void mem_cgroup_put(struct mem_cgroup *memcg); - static inline struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s) { @@ -6238,17 +6234,6 @@ static void free_rcu(struct rcu_head *rcu_head) schedule_work(&memcg->work_freeing); } -static void __mem_cgroup_put(struct mem_cgroup *memcg, int count) -{ - if (atomic_sub_and_test(count, &memcg->refcnt)) - call_rcu(&memcg->rcu_freeing, free_rcu); -} - -static void mem_cgroup_put(struct mem_cgroup *memcg) -{ - __mem_cgroup_put(memcg, 1); -} - /* * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled. */ @@ -6308,7 +6293,6 @@ mem_cgroup_css_alloc(struct cgroup *cont) memcg->last_scanned_node = MAX_NUMNODES; INIT_LIST_HEAD(&memcg->oom_notify); - atomic_set(&memcg->refcnt, 1); memcg->move_charge_at_immigrate = 0; mutex_init(&memcg->thresholds_lock); spin_lock_init(&memcg->move_lock); @@ -6399,7 +6383,7 @@ static void mem_cgroup_css_free(struct cgroup *cont) struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); memcg_destroy_kmem(memcg); - __mem_cgroup_free(memcg); + call_rcu(&memcg->rcu_freeing, free_rcu); } #ifdef CONFIG_MMU -- cgit v1.2.3 From 465939a1fa283cf2a5194362c5accf4429c99c42 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 8 Jul 2013 16:00:38 -0700 Subject: memcg: don't need to free memcg via RCU or workqueue Now memcg has the same life cycle with its corresponding cgroup, and a cgroup is freed via RCU and then mem_cgroup_css_free() will be called in a work function, so we can simply call __mem_cgroup_free() in mem_cgroup_css_free(). This actually reverts commit 59927fb984d ("memcg: free mem_cgroup by RCU to fix oops"). Signed-off-by: Li Zefan Cc: Hugh Dickins Acked-by: Michal Hocko Acked-by: KAMEZAWA Hiroyuki Cc: Tejun Heo Cc: Glauber Costa Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 51 +++++---------------------------------------------- 1 file changed, 5 insertions(+), 46 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index fa521a2f4bf6..d12ca6f3c293 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -263,28 +263,10 @@ struct mem_cgroup { /* vmpressure notifications */ struct vmpressure vmpressure; - union { - /* - * the counter to account for mem+swap usage. - */ - struct res_counter memsw; - - /* - * rcu_freeing is used only when freeing struct mem_cgroup, - * so put it into a union to avoid wasting more memory. - * It must be disjoint from the css field. It could be - * in a union with the res field, but res plays a much - * larger part in mem_cgroup life than memsw, and might - * be of interest, even at time of free, when debugging. - * So share rcu_head with the less interesting memsw. - */ - struct rcu_head rcu_freeing; - /* - * We also need some space for a worker in deferred freeing. - * By the time we call it, rcu_freeing is no longer in use. - */ - struct work_struct work_freeing; - }; + /* + * the counter to account for mem+swap usage. + */ + struct res_counter memsw; /* * the counter to account for kernel memory usage. @@ -6211,29 +6193,6 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) vfree(memcg); } - -/* - * Helpers for freeing a kmalloc()ed/vzalloc()ed mem_cgroup by RCU, - * but in process context. The work_freeing structure is overlaid - * on the rcu_freeing structure, which itself is overlaid on memsw. - */ -static void free_work(struct work_struct *work) -{ - struct mem_cgroup *memcg; - - memcg = container_of(work, struct mem_cgroup, work_freeing); - __mem_cgroup_free(memcg); -} - -static void free_rcu(struct rcu_head *rcu_head) -{ - struct mem_cgroup *memcg; - - memcg = container_of(rcu_head, struct mem_cgroup, rcu_freeing); - INIT_WORK(&memcg->work_freeing, free_work); - schedule_work(&memcg->work_freeing); -} - /* * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled. */ @@ -6383,7 +6342,7 @@ static void mem_cgroup_css_free(struct cgroup *cont) struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); memcg_destroy_kmem(memcg); - call_rcu(&memcg->rcu_freeing, free_rcu); + __mem_cgroup_free(memcg); } #ifdef CONFIG_MMU -- cgit v1.2.3