From 3e32cb2e0a12b6915056ff04601cf1bb9b44f967 Mon Sep 17 00:00:00 2001
From: Johannes Weiner
Date: Wed, 10 Dec 2014 15:42:31 -0800
Subject: mm: memcontrol: lockless page counters

Memory is internally accounted in bytes, using spinlock-protected 64-bit
counters, even though the smallest accounting delta is a page.  The counter
interface is also convoluted and does too many things.

Introduce a new lockless word-sized page counter API, then change all memory
accounting over to it.  The translation from and to bytes then only happens
when interfacing with userspace.

The removed locking overhead is noticeable when scaling beyond the per-cpu
charge caches - on a 4-socket machine with 144 threads, the following test
shows the performance differences of 288 memcgs concurrently running a page
fault benchmark:

vanilla:

    18631648.500498      task-clock (msec)        #  140.643 CPUs utilized          ( +-  0.33% )
          1,380,638      context-switches         #    0.074 K/sec                  ( +-  0.75% )
             24,390      cpu-migrations           #    0.001 K/sec                  ( +-  8.44% )
      1,843,305,768      page-faults              #    0.099 M/sec                  ( +-  0.00% )
 50,134,994,088,218      cycles                   #    2.691 GHz                    ( +-  0.33% )
                         stalled-cycles-frontend
                         stalled-cycles-backend
  8,049,712,224,651      instructions             #    0.16  insns per cycle        ( +-  0.04% )
  1,586,970,584,979      branches                 #   85.176 M/sec                  ( +-  0.05% )
      1,724,989,949      branch-misses            #    0.11% of all branches        ( +-  0.48% )

      132.474343877 seconds time elapsed                                            ( +-  0.21% )

lockless:

    12195979.037525      task-clock (msec)        #  133.480 CPUs utilized          ( +-  0.18% )
            832,850      context-switches         #    0.068 K/sec                  ( +-  0.54% )
             15,624      cpu-migrations           #    0.001 K/sec                  ( +- 10.17% )
      1,843,304,774      page-faults              #    0.151 M/sec                  ( +-  0.00% )
 32,811,216,801,141      cycles                   #    2.690 GHz                    ( +-  0.18% )
                         stalled-cycles-frontend
                         stalled-cycles-backend
  9,999,265,091,727      instructions             #    0.30  insns per cycle        ( +-  0.10% )
  2,076,759,325,203      branches                 #  170.282 M/sec                  ( +-  0.12% )
      1,656,917,214      branch-misses            #    0.08% of all branches        ( +-  0.55% )

       91.369330729 seconds time elapsed                                            ( +-  0.45% )

On top of improved scalability, this also gets rid of the icky long long
types in the very heart of memcg, which is great for 32-bit and also makes
the code a lot more readable.

Notable differences between the old and new API:

- res_counter_charge() and res_counter_charge_nofail() become
  page_counter_try_charge() and page_counter_charge() respectively, to
  match the more common kernel naming scheme of try_do()/do()

- res_counter_uncharge_until() is only ever used to cancel a local counter
  and never to uncharge bigger segments of a hierarchy, so it's replaced
  by the simpler page_counter_cancel()

- res_counter_set_limit() is replaced by page_counter_limit(), which
  expects its callers to serialize against themselves

- res_counter_memparse_write_strategy() is replaced by
  page_counter_memparse(), which rounds down to the nearest page size -
  rather than up.  This is more reasonable for explicitly requested hard
  upper limits.

- to keep charging light-weight, page_counter_try_charge() charges
  speculatively, only to roll back if the result exceeds the limit.
  Because of this, a failing bigger charge can temporarily lock out
  smaller charges that would otherwise succeed.  The error is bounded to
  the difference between the smallest and the biggest possible charge
  size, so for memcg, this means that a failing THP charge can send base
  page charges into reclaim up to 2MB (4MB) before the limit would have
  been reached.  This should be acceptable.
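As a reader's aid (not part of the commit), the calling convention can be
sketched against the API introduced below.  example_charge(),
example_uncharge() and the reclaim hook try_to_make_room() are hypothetical
names used only for illustration:

    #include <linux/page_counter.h>

    /* hypothetical reclaim hook, standing in for the memcg reclaim path */
    static bool try_to_make_room(struct page_counter *fail,
                                 unsigned long nr_pages);

    /*
     * page_counter_try_charge() charges speculatively and rolls the charge
     * back if a limit in the hierarchy is exceeded; on failure, *fail
     * points at the counter whose limit was hit.  page_counter_charge()
     * charges unconditionally and is meant for must-succeed paths.
     */
    static int example_charge(struct page_counter *counter,
                              unsigned long nr_pages, bool nofail)
    {
            struct page_counter *fail;

            if (nofail) {
                    page_counter_charge(counter, nr_pages);
                    return 0;
            }

            while (page_counter_try_charge(counter, nr_pages, &fail)) {
                    if (!try_to_make_room(fail, nr_pages))
                            return -ENOMEM;
            }
            return 0;
    }

    static void example_uncharge(struct page_counter *counter,
                                 unsigned long nr_pages)
    {
            page_counter_uncharge(counter, nr_pages);
    }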
[akpm@linux-foundation.org: add includes for WARN_ON_ONCE and memparse] [akpm@linux-foundation.org: add includes for WARN_ON_ONCE, memparse, strncmp, and PAGE_SIZE] Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Acked-by: Vladimir Davydov Cc: Tejun Heo Cc: David Rientjes Cc: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 5 ++--- include/linux/page_counter.h | 51 ++++++++++++++++++++++++++++++++++++++++++++ include/net/sock.h | 26 ++++++++-------------- 3 files changed, 62 insertions(+), 20 deletions(-) create mode 100644 include/linux/page_counter.h (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 6b75640ef5ab..ea007615e8f9 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -447,9 +447,8 @@ memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg, int order) /* * __GFP_NOFAIL allocations will move on even if charging is not * possible. Therefore we don't even try, and have this allocation - * unaccounted. We could in theory charge it with - * res_counter_charge_nofail, but we hope those allocations are rare, - * and won't be worth the trouble. + * unaccounted. We could in theory charge it forcibly, but we hope + * those allocations are rare, and won't be worth the trouble. */ if (gfp & __GFP_NOFAIL) return true; diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h new file mode 100644 index 000000000000..7cce3be99ff3 --- /dev/null +++ b/include/linux/page_counter.h @@ -0,0 +1,51 @@ +#ifndef _LINUX_PAGE_COUNTER_H +#define _LINUX_PAGE_COUNTER_H + +#include +#include +#include + +struct page_counter { + atomic_long_t count; + unsigned long limit; + struct page_counter *parent; + + /* legacy */ + unsigned long watermark; + unsigned long failcnt; +}; + +#if BITS_PER_LONG == 32 +#define PAGE_COUNTER_MAX LONG_MAX +#else +#define PAGE_COUNTER_MAX (LONG_MAX / PAGE_SIZE) +#endif + +static inline void page_counter_init(struct page_counter *counter, + struct page_counter *parent) +{ + atomic_long_set(&counter->count, 0); + counter->limit = PAGE_COUNTER_MAX; + counter->parent = parent; +} + +static inline unsigned long page_counter_read(struct page_counter *counter) +{ + return atomic_long_read(&counter->count); +} + +int page_counter_cancel(struct page_counter *counter, unsigned long nr_pages); +void page_counter_charge(struct page_counter *counter, unsigned long nr_pages); +int page_counter_try_charge(struct page_counter *counter, + unsigned long nr_pages, + struct page_counter **fail); +int page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages); +int page_counter_limit(struct page_counter *counter, unsigned long limit); +int page_counter_memparse(const char *buf, unsigned long *nr_pages); + +static inline void page_counter_reset_watermark(struct page_counter *counter) +{ + counter->watermark = page_counter_read(counter); +} + +#endif /* _LINUX_PAGE_COUNTER_H */ diff --git a/include/net/sock.h b/include/net/sock.h index e6f235ebf6c9..7ff44e062a38 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -54,8 +54,8 @@ #include #include #include +#include #include -#include #include #include #include @@ -1062,7 +1062,7 @@ enum cg_proto_flags { }; struct cg_proto { - struct res_counter memory_allocated; /* Current allocated memory. */ + struct page_counter memory_allocated; /* Current allocated memory. */ struct percpu_counter sockets_allocated; /* Current number of sockets. 
*/ int memory_pressure; long sysctl_mem[3]; @@ -1214,34 +1214,26 @@ static inline void memcg_memory_allocated_add(struct cg_proto *prot, unsigned long amt, int *parent_status) { - struct res_counter *fail; - int ret; + page_counter_charge(&prot->memory_allocated, amt); - ret = res_counter_charge_nofail(&prot->memory_allocated, - amt << PAGE_SHIFT, &fail); - if (ret < 0) + if (page_counter_read(&prot->memory_allocated) > + prot->memory_allocated.limit) *parent_status = OVER_LIMIT; } static inline void memcg_memory_allocated_sub(struct cg_proto *prot, unsigned long amt) { - res_counter_uncharge(&prot->memory_allocated, amt << PAGE_SHIFT); -} - -static inline u64 memcg_memory_allocated_read(struct cg_proto *prot) -{ - u64 ret; - ret = res_counter_read_u64(&prot->memory_allocated, RES_USAGE); - return ret >> PAGE_SHIFT; + page_counter_uncharge(&prot->memory_allocated, amt); } static inline long sk_memory_allocated(const struct sock *sk) { struct proto *prot = sk->sk_prot; + if (mem_cgroup_sockets_enabled && sk->sk_cgrp) - return memcg_memory_allocated_read(sk->sk_cgrp); + return page_counter_read(&sk->sk_cgrp->memory_allocated); return atomic_long_read(prot->memory_allocated); } @@ -1255,7 +1247,7 @@ sk_memory_allocated_add(struct sock *sk, int amt, int *parent_status) memcg_memory_allocated_add(sk->sk_cgrp, amt, parent_status); /* update the root cgroup regardless */ atomic_long_add_return(amt, prot->memory_allocated); - return memcg_memory_allocated_read(sk->sk_cgrp); + return page_counter_read(&sk->sk_cgrp->memory_allocated); } return atomic_long_add_return(amt, prot->memory_allocated); -- cgit v1.2.3 From 71f87bee38edddb21d97895fa938744cf3f477bb Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 10 Dec 2014 15:42:34 -0800 Subject: mm: hugetlb_cgroup: convert to lockless page counters Abandon the spinlock-protected byte counters in favor of the unlocked page counters in the hugetlb controller as well. 
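The mechanical part of the conversion is the switch from byte to page
units, condensed here from the hunks below; userspace-visible byte values
are produced by multiplying by PAGE_SIZE at the cgroup file interface:

    /* before: res_counter accounts bytes */
    csize = PAGE_SIZE << compound_order(page);
    res_counter_uncharge(&h_cg->hugepage[idx], csize);

    /* after: page_counter accounts pages */
    nr_pages = 1 << compound_order(page);
    page_counter_uncharge(&h_cg->hugepage[idx], nr_pages);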
Signed-off-by: Johannes Weiner Reviewed-by: Vladimir Davydov Acked-by: Michal Hocko Cc: Tejun Heo Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/cgroups/hugetlb.txt | 2 +- include/linux/hugetlb_cgroup.h | 1 - init/Kconfig | 3 +- mm/hugetlb_cgroup.c | 103 +++++++++++++++++++++----------------- 4 files changed, 61 insertions(+), 48 deletions(-) (limited to 'include') diff --git a/Documentation/cgroups/hugetlb.txt b/Documentation/cgroups/hugetlb.txt index a9faaca1f029..106245c3aecc 100644 --- a/Documentation/cgroups/hugetlb.txt +++ b/Documentation/cgroups/hugetlb.txt @@ -29,7 +29,7 @@ Brief summary of control files hugetlb..limit_in_bytes # set/show limit of "hugepagesize" hugetlb usage hugetlb..max_usage_in_bytes # show max "hugepagesize" hugetlb usage recorded - hugetlb..usage_in_bytes # show current res_counter usage for "hugepagesize" hugetlb + hugetlb..usage_in_bytes # show current usage for "hugepagesize" hugetlb hugetlb..failcnt # show the number of allocation failure due to HugeTLB limit For a system supporting two hugepage size (16M and 16G) the control diff --git a/include/linux/hugetlb_cgroup.h b/include/linux/hugetlb_cgroup.h index 0129f89cf98d..bcc853eccc85 100644 --- a/include/linux/hugetlb_cgroup.h +++ b/include/linux/hugetlb_cgroup.h @@ -16,7 +16,6 @@ #define _LINUX_HUGETLB_CGROUP_H #include -#include struct hugetlb_cgroup; /* diff --git a/init/Kconfig b/init/Kconfig index fd9e88791ba4..a60d1442d1df 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1051,7 +1051,8 @@ config MEMCG_KMEM config CGROUP_HUGETLB bool "HugeTLB Resource Controller for Control Groups" - depends on RESOURCE_COUNTERS && HUGETLB_PAGE + depends on HUGETLB_PAGE + select PAGE_COUNTER default n help Provides a cgroup Resource Controller for HugeTLB pages. diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index a67c26e0f360..037e1c00a5b7 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -14,6 +14,7 @@ */ #include +#include #include #include #include @@ -23,7 +24,7 @@ struct hugetlb_cgroup { /* * the counter to account for hugepages from hugetlb. 
*/ - struct res_counter hugepage[HUGE_MAX_HSTATE]; + struct page_counter hugepage[HUGE_MAX_HSTATE]; }; #define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val)) @@ -60,7 +61,7 @@ static inline bool hugetlb_cgroup_have_usage(struct hugetlb_cgroup *h_cg) int idx; for (idx = 0; idx < hugetlb_max_hstate; idx++) { - if ((res_counter_read_u64(&h_cg->hugepage[idx], RES_USAGE)) > 0) + if (page_counter_read(&h_cg->hugepage[idx])) return true; } return false; @@ -79,12 +80,12 @@ hugetlb_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) if (parent_h_cgroup) { for (idx = 0; idx < HUGE_MAX_HSTATE; idx++) - res_counter_init(&h_cgroup->hugepage[idx], - &parent_h_cgroup->hugepage[idx]); + page_counter_init(&h_cgroup->hugepage[idx], + &parent_h_cgroup->hugepage[idx]); } else { root_h_cgroup = h_cgroup; for (idx = 0; idx < HUGE_MAX_HSTATE; idx++) - res_counter_init(&h_cgroup->hugepage[idx], NULL); + page_counter_init(&h_cgroup->hugepage[idx], NULL); } return &h_cgroup->css; } @@ -108,9 +109,8 @@ static void hugetlb_cgroup_css_free(struct cgroup_subsys_state *css) static void hugetlb_cgroup_move_parent(int idx, struct hugetlb_cgroup *h_cg, struct page *page) { - int csize; - struct res_counter *counter; - struct res_counter *fail_res; + unsigned int nr_pages; + struct page_counter *counter; struct hugetlb_cgroup *page_hcg; struct hugetlb_cgroup *parent = parent_hugetlb_cgroup(h_cg); @@ -123,15 +123,15 @@ static void hugetlb_cgroup_move_parent(int idx, struct hugetlb_cgroup *h_cg, if (!page_hcg || page_hcg != h_cg) goto out; - csize = PAGE_SIZE << compound_order(page); + nr_pages = 1 << compound_order(page); if (!parent) { parent = root_h_cgroup; /* root has no limit */ - res_counter_charge_nofail(&parent->hugepage[idx], - csize, &fail_res); + page_counter_charge(&parent->hugepage[idx], nr_pages); } counter = &h_cg->hugepage[idx]; - res_counter_uncharge_until(counter, counter->parent, csize); + /* Take the pages off the local counter */ + page_counter_cancel(counter, nr_pages); set_hugetlb_cgroup(page, parent); out: @@ -166,9 +166,8 @@ int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages, struct hugetlb_cgroup **ptr) { int ret = 0; - struct res_counter *fail_res; + struct page_counter *counter; struct hugetlb_cgroup *h_cg = NULL; - unsigned long csize = nr_pages * PAGE_SIZE; if (hugetlb_cgroup_disabled()) goto done; @@ -187,7 +186,7 @@ again: } rcu_read_unlock(); - ret = res_counter_charge(&h_cg->hugepage[idx], csize, &fail_res); + ret = page_counter_try_charge(&h_cg->hugepage[idx], nr_pages, &counter); css_put(&h_cg->css); done: *ptr = h_cg; @@ -213,7 +212,6 @@ void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages, struct page *page) { struct hugetlb_cgroup *h_cg; - unsigned long csize = nr_pages * PAGE_SIZE; if (hugetlb_cgroup_disabled()) return; @@ -222,61 +220,76 @@ void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages, if (unlikely(!h_cg)) return; set_hugetlb_cgroup(page, NULL); - res_counter_uncharge(&h_cg->hugepage[idx], csize); + page_counter_uncharge(&h_cg->hugepage[idx], nr_pages); return; } void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg) { - unsigned long csize = nr_pages * PAGE_SIZE; - if (hugetlb_cgroup_disabled() || !h_cg) return; if (huge_page_order(&hstates[idx]) < HUGETLB_CGROUP_MIN_ORDER) return; - res_counter_uncharge(&h_cg->hugepage[idx], csize); + page_counter_uncharge(&h_cg->hugepage[idx], nr_pages); return; } +enum { + RES_USAGE, + RES_LIMIT, + RES_MAX_USAGE, + RES_FAILCNT, +}; + static u64 
hugetlb_cgroup_read_u64(struct cgroup_subsys_state *css, struct cftype *cft) { - int idx, name; + struct page_counter *counter; struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css); - idx = MEMFILE_IDX(cft->private); - name = MEMFILE_ATTR(cft->private); + counter = &h_cg->hugepage[MEMFILE_IDX(cft->private)]; - return res_counter_read_u64(&h_cg->hugepage[idx], name); + switch (MEMFILE_ATTR(cft->private)) { + case RES_USAGE: + return (u64)page_counter_read(counter) * PAGE_SIZE; + case RES_LIMIT: + return (u64)counter->limit * PAGE_SIZE; + case RES_MAX_USAGE: + return (u64)counter->watermark * PAGE_SIZE; + case RES_FAILCNT: + return counter->failcnt; + default: + BUG(); + } } +static DEFINE_MUTEX(hugetlb_limit_mutex); + static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { - int idx, name, ret; - unsigned long long val; + int ret, idx; + unsigned long nr_pages; struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of)); + if (hugetlb_cgroup_is_root(h_cg)) /* Can't set limit on root */ + return -EINVAL; + buf = strstrip(buf); + ret = page_counter_memparse(buf, &nr_pages); + if (ret) + return ret; + idx = MEMFILE_IDX(of_cft(of)->private); - name = MEMFILE_ATTR(of_cft(of)->private); - switch (name) { + switch (MEMFILE_ATTR(of_cft(of)->private)) { case RES_LIMIT: - if (hugetlb_cgroup_is_root(h_cg)) { - /* Can't set limit on root */ - ret = -EINVAL; - break; - } - /* This function does all necessary parse...reuse it */ - ret = res_counter_memparse_write_strategy(buf, &val); - if (ret) - break; - val = ALIGN(val, 1ULL << huge_page_shift(&hstates[idx])); - ret = res_counter_set_limit(&h_cg->hugepage[idx], val); + mutex_lock(&hugetlb_limit_mutex); + ret = page_counter_limit(&h_cg->hugepage[idx], nr_pages); + mutex_unlock(&hugetlb_limit_mutex); break; default: ret = -EINVAL; @@ -288,18 +301,18 @@ static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of, static ssize_t hugetlb_cgroup_reset(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { - int idx, name, ret = 0; + int ret = 0; + struct page_counter *counter; struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of)); - idx = MEMFILE_IDX(of_cft(of)->private); - name = MEMFILE_ATTR(of_cft(of)->private); + counter = &h_cg->hugepage[MEMFILE_IDX(of_cft(of)->private)]; - switch (name) { + switch (MEMFILE_ATTR(of_cft(of)->private)) { case RES_MAX_USAGE: - res_counter_reset_max(&h_cg->hugepage[idx]); + page_counter_reset_watermark(counter); break; case RES_FAILCNT: - res_counter_reset_failcnt(&h_cg->hugepage[idx]); + counter->failcnt = 0; break; default: ret = -EINVAL; -- cgit v1.2.3 From 5b1efc027c0b51ca3e76f4e00c83358f8349f543 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 10 Dec 2014 15:42:37 -0800 Subject: kernel: res_counter: remove the unused API All memory accounting and limiting has been switched over to the lockless page counters. Bye, res_counter! 
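For reference (editorial summary drawn from the preceding patches, not part
of the commit), the old-to-new correspondence is roughly the following,
with units changing from bytes to pages:

    /*
     * res_counter (bytes)                      page_counter (pages)
     * --------------------------------------------------------------------
     * res_counter_init(rc, parent)             page_counter_init(pc, parent)
     * res_counter_charge(rc, b, &fail)         page_counter_try_charge(pc, n, &fail)
     * res_counter_charge_nofail(rc, b, &fail)  page_counter_charge(pc, n)
     * res_counter_uncharge(rc, b)              page_counter_uncharge(pc, n)
     * res_counter_uncharge_until(rc,
     *                    rc->parent, b)        page_counter_cancel(pc, n)
     * res_counter_set_limit(rc, b)             page_counter_limit(pc, n)
     *                                          (callers serialize themselves)
     * res_counter_memparse_write_strategy()    page_counter_memparse()
     * res_counter_reset_max(rc)                page_counter_reset_watermark(pc)
     * res_counter_reset_failcnt(rc)            pc->failcnt = 0
     * res_counter_read_u64(rc, RES_USAGE)      page_counter_read(pc) * PAGE_SIZE
     */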
[akpm@linux-foundation.org: update Documentation/cgroups/memory.txt] [mhocko@suse.cz: ditch the last remainings of res_counter] Signed-off-by: Johannes Weiner Acked-by: Vladimir Davydov Acked-by: Michal Hocko Cc: Tejun Heo Cc: David Rientjes Cc: Paul Bolle Signed-off-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/cgroups/memory.txt | 17 ++- Documentation/cgroups/resource_counter.txt | 197 ------------------------- include/linux/res_counter.h | 223 ----------------------------- init/Kconfig | 6 - kernel/Makefile | 1 - kernel/res_counter.c | 211 --------------------------- 6 files changed, 8 insertions(+), 647 deletions(-) delete mode 100644 Documentation/cgroups/resource_counter.txt delete mode 100644 include/linux/res_counter.h delete mode 100644 kernel/res_counter.c (limited to 'include') diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt index f624727ab404..67613ff0270c 100644 --- a/Documentation/cgroups/memory.txt +++ b/Documentation/cgroups/memory.txt @@ -116,16 +116,16 @@ The memory controller is the first controller developed. 2.1. Design -The core of the design is a counter called the res_counter. The res_counter -tracks the current memory usage and limit of the group of processes associated -with the controller. Each cgroup has a memory controller specific data -structure (mem_cgroup) associated with it. +The core of the design is a counter called the page_counter. The +page_counter tracks the current memory usage and limit of the group of +processes associated with the controller. Each cgroup has a memory controller +specific data structure (mem_cgroup) associated with it. 2.2. Accounting +--------------------+ - | mem_cgroup | - | (res_counter) | + | mem_cgroup | + | (page_counter) | +--------------------+ / ^ \ / | \ @@ -352,9 +352,8 @@ set: 0. Configuration a. Enable CONFIG_CGROUPS -b. Enable CONFIG_RESOURCE_COUNTERS -c. Enable CONFIG_MEMCG -d. Enable CONFIG_MEMCG_SWAP (to use swap extension) +b. Enable CONFIG_MEMCG +c. Enable CONFIG_MEMCG_SWAP (to use swap extension) d. Enable CONFIG_MEMCG_KMEM (to use kmem extension) 1. Prepare the cgroups (see cgroups.txt, Why are cgroups needed?) diff --git a/Documentation/cgroups/resource_counter.txt b/Documentation/cgroups/resource_counter.txt deleted file mode 100644 index 762ca54eb929..000000000000 --- a/Documentation/cgroups/resource_counter.txt +++ /dev/null @@ -1,197 +0,0 @@ - - The Resource Counter - -The resource counter, declared at include/linux/res_counter.h, -is supposed to facilitate the resource management by controllers -by providing common stuff for accounting. - -This "stuff" includes the res_counter structure and routines -to work with it. - - - -1. Crucial parts of the res_counter structure - - a. unsigned long long usage - - The usage value shows the amount of a resource that is consumed - by a group at a given time. The units of measurement should be - determined by the controller that uses this counter. E.g. it can - be bytes, items or any other unit the controller operates on. - - b. unsigned long long max_usage - - The maximal value of the usage over time. - - This value is useful when gathering statistical information about - the particular group, as it shows the actual resource requirements - for a particular group, not just some usage snapshot. - - c. unsigned long long limit - - The maximal allowed amount of resource to consume by the group. 
In - case the group requests for more resources, so that the usage value - would exceed the limit, the resource allocation is rejected (see - the next section). - - d. unsigned long long failcnt - - The failcnt stands for "failures counter". This is the number of - resource allocation attempts that failed. - - c. spinlock_t lock - - Protects changes of the above values. - - - -2. Basic accounting routines - - a. void res_counter_init(struct res_counter *rc, - struct res_counter *rc_parent) - - Initializes the resource counter. As usual, should be the first - routine called for a new counter. - - The struct res_counter *parent can be used to define a hierarchical - child -> parent relationship directly in the res_counter structure, - NULL can be used to define no relationship. - - c. int res_counter_charge(struct res_counter *rc, unsigned long val, - struct res_counter **limit_fail_at) - - When a resource is about to be allocated it has to be accounted - with the appropriate resource counter (controller should determine - which one to use on its own). This operation is called "charging". - - This is not very important which operation - resource allocation - or charging - is performed first, but - * if the allocation is performed first, this may create a - temporary resource over-usage by the time resource counter is - charged; - * if the charging is performed first, then it should be uncharged - on error path (if the one is called). - - If the charging fails and a hierarchical dependency exists, the - limit_fail_at parameter is set to the particular res_counter element - where the charging failed. - - d. u64 res_counter_uncharge(struct res_counter *rc, unsigned long val) - - When a resource is released (freed) it should be de-accounted - from the resource counter it was accounted to. This is called - "uncharging". The return value of this function indicate the amount - of charges still present in the counter. - - The _locked routines imply that the res_counter->lock is taken. - - e. u64 res_counter_uncharge_until - (struct res_counter *rc, struct res_counter *top, - unsigned long val) - - Almost same as res_counter_uncharge() but propagation of uncharge - stops when rc == top. This is useful when kill a res_counter in - child cgroup. - - 2.1 Other accounting routines - - There are more routines that may help you with common needs, like - checking whether the limit is reached or resetting the max_usage - value. They are all declared in include/linux/res_counter.h. - - - -3. Analyzing the resource counter registrations - - a. If the failcnt value constantly grows, this means that the counter's - limit is too tight. Either the group is misbehaving and consumes too - many resources, or the configuration is not suitable for the group - and the limit should be increased. - - b. The max_usage value can be used to quickly tune the group. One may - set the limits to maximal values and either load the container with - a common pattern or leave one for a while. After this the max_usage - value shows the amount of memory the container would require during - its common activity. - - Setting the limit a bit above this value gives a pretty good - configuration that works in most of the cases. - - c. If the max_usage is much less than the limit, but the failcnt value - is growing, then the group tries to allocate a big chunk of resource - at once. - - d. If the max_usage is much less than the limit, but the failcnt value - is 0, then this group is given too high limit, that it does not - require. 
It is better to lower the limit a bit leaving more resource - for other groups. - - - -4. Communication with the control groups subsystem (cgroups) - -All the resource controllers that are using cgroups and resource counters -should provide files (in the cgroup filesystem) to work with the resource -counter fields. They are recommended to adhere to the following rules: - - a. File names - - Field name File name - --------------------------------------------------- - usage usage_in_ - max_usage max_usage_in_ - limit limit_in_ - failcnt failcnt - lock no file :) - - b. Reading from file should show the corresponding field value in the - appropriate format. - - c. Writing to file - - Field Expected behavior - ---------------------------------- - usage prohibited - max_usage reset to usage - limit set the limit - failcnt reset to zero - - - -5. Usage example - - a. Declare a task group (take a look at cgroups subsystem for this) and - fold a res_counter into it - - struct my_group { - struct res_counter res; - - - } - - b. Put hooks in resource allocation/release paths - - int alloc_something(...) - { - if (res_counter_charge(res_counter_ptr, amount) < 0) - return -ENOMEM; - - - } - - void release_something(...) - { - res_counter_uncharge(res_counter_ptr, amount); - - - } - - In order to keep the usage value self-consistent, both the - "res_counter_ptr" and the "amount" in release_something() should be - the same as they were in the alloc_something() when the releasing - resource was allocated. - - c. Provide the way to read res_counter values and set them (the cgroups - still can help with it). - - c. Compile and run :) diff --git a/include/linux/res_counter.h b/include/linux/res_counter.h deleted file mode 100644 index 56b7bc32db4f..000000000000 --- a/include/linux/res_counter.h +++ /dev/null @@ -1,223 +0,0 @@ -#ifndef __RES_COUNTER_H__ -#define __RES_COUNTER_H__ - -/* - * Resource Counters - * Contain common data types and routines for resource accounting - * - * Copyright 2007 OpenVZ SWsoft Inc - * - * Author: Pavel Emelianov - * - * See Documentation/cgroups/resource_counter.txt for more - * info about what this counter is. - */ - -#include -#include - -/* - * The core object. the cgroup that wishes to account for some - * resource may include this counter into its structures and use - * the helpers described beyond - */ - -struct res_counter { - /* - * the current resource consumption level - */ - unsigned long long usage; - /* - * the maximal value of the usage from the counter creation - */ - unsigned long long max_usage; - /* - * the limit that usage cannot exceed - */ - unsigned long long limit; - /* - * the limit that usage can be exceed - */ - unsigned long long soft_limit; - /* - * the number of unsuccessful attempts to consume the resource - */ - unsigned long long failcnt; - /* - * the lock to protect all of the above. - * the routines below consider this to be IRQ-safe - */ - spinlock_t lock; - /* - * Parent counter, used for hierarchial resource accounting - */ - struct res_counter *parent; -}; - -#define RES_COUNTER_MAX ULLONG_MAX - -/** - * Helpers to interact with userspace - * res_counter_read_u64() - returns the value of the specified member. - * res_counter_read/_write - put/get the specified fields from the - * res_counter struct to/from the user - * - * @counter: the counter in question - * @member: the field to work with (see RES_xxx below) - * @buf: the buffer to opeate on,... - * @nbytes: its size... - * @pos: and the offset. 
- */ - -u64 res_counter_read_u64(struct res_counter *counter, int member); - -ssize_t res_counter_read(struct res_counter *counter, int member, - const char __user *buf, size_t nbytes, loff_t *pos, - int (*read_strategy)(unsigned long long val, char *s)); - -int res_counter_memparse_write_strategy(const char *buf, - unsigned long long *res); - -/* - * the field descriptors. one for each member of res_counter - */ - -enum { - RES_USAGE, - RES_MAX_USAGE, - RES_LIMIT, - RES_FAILCNT, - RES_SOFT_LIMIT, -}; - -/* - * helpers for accounting - */ - -void res_counter_init(struct res_counter *counter, struct res_counter *parent); - -/* - * charge - try to consume more resource. - * - * @counter: the counter - * @val: the amount of the resource. each controller defines its own - * units, e.g. numbers, bytes, Kbytes, etc - * - * returns 0 on success and <0 if the counter->usage will exceed the - * counter->limit - * - * charge_nofail works the same, except that it charges the resource - * counter unconditionally, and returns < 0 if the after the current - * charge we are over limit. - */ - -int __must_check res_counter_charge(struct res_counter *counter, - unsigned long val, struct res_counter **limit_fail_at); -int res_counter_charge_nofail(struct res_counter *counter, - unsigned long val, struct res_counter **limit_fail_at); - -/* - * uncharge - tell that some portion of the resource is released - * - * @counter: the counter - * @val: the amount of the resource - * - * these calls check for usage underflow and show a warning on the console - * - * returns the total charges still present in @counter. - */ - -u64 res_counter_uncharge(struct res_counter *counter, unsigned long val); - -u64 res_counter_uncharge_until(struct res_counter *counter, - struct res_counter *top, - unsigned long val); -/** - * res_counter_margin - calculate chargeable space of a counter - * @cnt: the counter - * - * Returns the difference between the hard limit and the current usage - * of resource counter @cnt. - */ -static inline unsigned long long res_counter_margin(struct res_counter *cnt) -{ - unsigned long long margin; - unsigned long flags; - - spin_lock_irqsave(&cnt->lock, flags); - if (cnt->limit > cnt->usage) - margin = cnt->limit - cnt->usage; - else - margin = 0; - spin_unlock_irqrestore(&cnt->lock, flags); - return margin; -} - -/** - * Get the difference between the usage and the soft limit - * @cnt: The counter - * - * Returns 0 if usage is less than or equal to soft limit - * The difference between usage and soft limit, otherwise. 
- */ -static inline unsigned long long -res_counter_soft_limit_excess(struct res_counter *cnt) -{ - unsigned long long excess; - unsigned long flags; - - spin_lock_irqsave(&cnt->lock, flags); - if (cnt->usage <= cnt->soft_limit) - excess = 0; - else - excess = cnt->usage - cnt->soft_limit; - spin_unlock_irqrestore(&cnt->lock, flags); - return excess; -} - -static inline void res_counter_reset_max(struct res_counter *cnt) -{ - unsigned long flags; - - spin_lock_irqsave(&cnt->lock, flags); - cnt->max_usage = cnt->usage; - spin_unlock_irqrestore(&cnt->lock, flags); -} - -static inline void res_counter_reset_failcnt(struct res_counter *cnt) -{ - unsigned long flags; - - spin_lock_irqsave(&cnt->lock, flags); - cnt->failcnt = 0; - spin_unlock_irqrestore(&cnt->lock, flags); -} - -static inline int res_counter_set_limit(struct res_counter *cnt, - unsigned long long limit) -{ - unsigned long flags; - int ret = -EBUSY; - - spin_lock_irqsave(&cnt->lock, flags); - if (cnt->usage <= limit) { - cnt->limit = limit; - ret = 0; - } - spin_unlock_irqrestore(&cnt->lock, flags); - return ret; -} - -static inline int -res_counter_set_soft_limit(struct res_counter *cnt, - unsigned long long soft_limit) -{ - unsigned long flags; - - spin_lock_irqsave(&cnt->lock, flags); - cnt->soft_limit = soft_limit; - spin_unlock_irqrestore(&cnt->lock, flags); - return 0; -} - -#endif diff --git a/init/Kconfig b/init/Kconfig index a60d1442d1df..1761c72bc1a0 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -972,12 +972,6 @@ config CGROUP_CPUACCT Provides a simple Resource Controller for monitoring the total CPU consumed by the tasks in a cgroup. -config RESOURCE_COUNTERS - bool "Resource counters" - help - This option enables controller independent resource accounting - infrastructure that works with cgroups. 
- config PAGE_COUNTER bool diff --git a/kernel/Makefile b/kernel/Makefile index 17ea6d4a9a24..a59481a3fa6c 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -57,7 +57,6 @@ obj-$(CONFIG_UTS_NS) += utsname.o obj-$(CONFIG_USER_NS) += user_namespace.o obj-$(CONFIG_PID_NS) += pid_namespace.o obj-$(CONFIG_IKCONFIG) += configs.o -obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o obj-$(CONFIG_SMP) += stop_machine.o obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o obj-$(CONFIG_AUDIT) += audit.o auditfilter.o diff --git a/kernel/res_counter.c b/kernel/res_counter.c deleted file mode 100644 index e791130f85a7..000000000000 --- a/kernel/res_counter.c +++ /dev/null @@ -1,211 +0,0 @@ -/* - * resource cgroups - * - * Copyright 2007 OpenVZ SWsoft Inc - * - * Author: Pavel Emelianov - * - */ - -#include -#include -#include -#include -#include -#include - -void res_counter_init(struct res_counter *counter, struct res_counter *parent) -{ - spin_lock_init(&counter->lock); - counter->limit = RES_COUNTER_MAX; - counter->soft_limit = RES_COUNTER_MAX; - counter->parent = parent; -} - -static u64 res_counter_uncharge_locked(struct res_counter *counter, - unsigned long val) -{ - if (WARN_ON(counter->usage < val)) - val = counter->usage; - - counter->usage -= val; - return counter->usage; -} - -static int res_counter_charge_locked(struct res_counter *counter, - unsigned long val, bool force) -{ - int ret = 0; - - if (counter->usage + val > counter->limit) { - counter->failcnt++; - ret = -ENOMEM; - if (!force) - return ret; - } - - counter->usage += val; - if (counter->usage > counter->max_usage) - counter->max_usage = counter->usage; - return ret; -} - -static int __res_counter_charge(struct res_counter *counter, unsigned long val, - struct res_counter **limit_fail_at, bool force) -{ - int ret, r; - unsigned long flags; - struct res_counter *c, *u; - - r = ret = 0; - *limit_fail_at = NULL; - local_irq_save(flags); - for (c = counter; c != NULL; c = c->parent) { - spin_lock(&c->lock); - r = res_counter_charge_locked(c, val, force); - spin_unlock(&c->lock); - if (r < 0 && !ret) { - ret = r; - *limit_fail_at = c; - if (!force) - break; - } - } - - if (ret < 0 && !force) { - for (u = counter; u != c; u = u->parent) { - spin_lock(&u->lock); - res_counter_uncharge_locked(u, val); - spin_unlock(&u->lock); - } - } - local_irq_restore(flags); - - return ret; -} - -int res_counter_charge(struct res_counter *counter, unsigned long val, - struct res_counter **limit_fail_at) -{ - return __res_counter_charge(counter, val, limit_fail_at, false); -} - -int res_counter_charge_nofail(struct res_counter *counter, unsigned long val, - struct res_counter **limit_fail_at) -{ - return __res_counter_charge(counter, val, limit_fail_at, true); -} - -u64 res_counter_uncharge_until(struct res_counter *counter, - struct res_counter *top, - unsigned long val) -{ - unsigned long flags; - struct res_counter *c; - u64 ret = 0; - - local_irq_save(flags); - for (c = counter; c != top; c = c->parent) { - u64 r; - spin_lock(&c->lock); - r = res_counter_uncharge_locked(c, val); - if (c == counter) - ret = r; - spin_unlock(&c->lock); - } - local_irq_restore(flags); - return ret; -} - -u64 res_counter_uncharge(struct res_counter *counter, unsigned long val) -{ - return res_counter_uncharge_until(counter, NULL, val); -} - -static inline unsigned long long * -res_counter_member(struct res_counter *counter, int member) -{ - switch (member) { - case RES_USAGE: - return &counter->usage; - case RES_MAX_USAGE: - return &counter->max_usage; - case 
RES_LIMIT: - return &counter->limit; - case RES_FAILCNT: - return &counter->failcnt; - case RES_SOFT_LIMIT: - return &counter->soft_limit; - }; - - BUG(); - return NULL; -} - -ssize_t res_counter_read(struct res_counter *counter, int member, - const char __user *userbuf, size_t nbytes, loff_t *pos, - int (*read_strategy)(unsigned long long val, char *st_buf)) -{ - unsigned long long *val; - char buf[64], *s; - - s = buf; - val = res_counter_member(counter, member); - if (read_strategy) - s += read_strategy(*val, s); - else - s += sprintf(s, "%llu\n", *val); - return simple_read_from_buffer((void __user *)userbuf, nbytes, - pos, buf, s - buf); -} - -#if BITS_PER_LONG == 32 -u64 res_counter_read_u64(struct res_counter *counter, int member) -{ - unsigned long flags; - u64 ret; - - spin_lock_irqsave(&counter->lock, flags); - ret = *res_counter_member(counter, member); - spin_unlock_irqrestore(&counter->lock, flags); - - return ret; -} -#else -u64 res_counter_read_u64(struct res_counter *counter, int member) -{ - return *res_counter_member(counter, member); -} -#endif - -int res_counter_memparse_write_strategy(const char *buf, - unsigned long long *resp) -{ - char *end; - unsigned long long res; - - /* return RES_COUNTER_MAX(unlimited) if "-1" is specified */ - if (*buf == '-') { - int rc = kstrtoull(buf + 1, 10, &res); - - if (rc) - return rc; - if (res != 1) - return -EINVAL; - *resp = RES_COUNTER_MAX; - return 0; - } - - res = memparse(buf, &end); - if (*end != '\0') - return -EINVAL; - - if (PAGE_ALIGN(res) >= res) - res = PAGE_ALIGN(res); - else - res = RES_COUNTER_MAX; - - *resp = res; - - return 0; -} -- cgit v1.2.3 From e8ea14cc6eadfe2ea63e9989e16e62625a2619f8 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 10 Dec 2014 15:42:42 -0800 Subject: mm: memcontrol: take a css reference for each charged page Charges currently pin the css indirectly by playing tricks during css_offline(): user pages stall the offlining process until all of them have been reparented, whereas kmemcg acquires a keep-alive reference if outstanding kernel pages are detected at that point. In preparation for removing all this complexity, make the pinning explicit and acquire a css references for every charged page. Signed-off-by: Johannes Weiner Reviewed-by: Vladimir Davydov Acked-by: Michal Hocko Cc: David Rientjes Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cgroup.h | 26 +++++++++++++++++++++++ include/linux/percpu-refcount.h | 47 +++++++++++++++++++++++++++++++++-------- mm/memcontrol.c | 21 ++++++++++++++---- 3 files changed, 81 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 1d5196889048..9f96b25965c2 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -112,6 +112,19 @@ static inline void css_get(struct cgroup_subsys_state *css) percpu_ref_get(&css->refcnt); } +/** + * css_get_many - obtain references on the specified css + * @css: target css + * @n: number of references to get + * + * The caller must already have a reference. 
+ */ +static inline void css_get_many(struct cgroup_subsys_state *css, unsigned int n) +{ + if (!(css->flags & CSS_NO_REF)) + percpu_ref_get_many(&css->refcnt, n); +} + /** * css_tryget - try to obtain a reference on the specified css * @css: target css @@ -159,6 +172,19 @@ static inline void css_put(struct cgroup_subsys_state *css) percpu_ref_put(&css->refcnt); } +/** + * css_put_many - put css references + * @css: target css + * @n: number of references to put + * + * Put references obtained via css_get() and css_tryget_online(). + */ +static inline void css_put_many(struct cgroup_subsys_state *css, unsigned int n) +{ + if (!(css->flags & CSS_NO_REF)) + percpu_ref_put_many(&css->refcnt, n); +} + /* bits in struct cgroup flags field */ enum { /* Control Group requires release notifications to userspace */ diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h index 51ce60c35f4c..530b249f7ea4 100644 --- a/include/linux/percpu-refcount.h +++ b/include/linux/percpu-refcount.h @@ -147,27 +147,41 @@ static inline bool __ref_is_percpu(struct percpu_ref *ref, } /** - * percpu_ref_get - increment a percpu refcount + * percpu_ref_get_many - increment a percpu refcount * @ref: percpu_ref to get + * @nr: number of references to get * - * Analagous to atomic_long_inc(). + * Analogous to atomic_long_add(). * * This function is safe to call as long as @ref is between init and exit. */ -static inline void percpu_ref_get(struct percpu_ref *ref) +static inline void percpu_ref_get_many(struct percpu_ref *ref, unsigned long nr) { unsigned long __percpu *percpu_count; rcu_read_lock_sched(); if (__ref_is_percpu(ref, &percpu_count)) - this_cpu_inc(*percpu_count); + this_cpu_add(*percpu_count, nr); else - atomic_long_inc(&ref->count); + atomic_long_add(nr, &ref->count); rcu_read_unlock_sched(); } +/** + * percpu_ref_get - increment a percpu refcount + * @ref: percpu_ref to get + * + * Analagous to atomic_long_inc(). + * + * This function is safe to call as long as @ref is between init and exit. + */ +static inline void percpu_ref_get(struct percpu_ref *ref) +{ + percpu_ref_get_many(ref, 1); +} + /** * percpu_ref_tryget - try to increment a percpu refcount * @ref: percpu_ref to try-get @@ -231,28 +245,43 @@ static inline bool percpu_ref_tryget_live(struct percpu_ref *ref) } /** - * percpu_ref_put - decrement a percpu refcount + * percpu_ref_put_many - decrement a percpu refcount * @ref: percpu_ref to put + * @nr: number of references to put * * Decrement the refcount, and if 0, call the release function (which was passed * to percpu_ref_init()) * * This function is safe to call as long as @ref is between init and exit. */ -static inline void percpu_ref_put(struct percpu_ref *ref) +static inline void percpu_ref_put_many(struct percpu_ref *ref, unsigned long nr) { unsigned long __percpu *percpu_count; rcu_read_lock_sched(); if (__ref_is_percpu(ref, &percpu_count)) - this_cpu_dec(*percpu_count); - else if (unlikely(atomic_long_dec_and_test(&ref->count))) + this_cpu_sub(*percpu_count, nr); + else if (unlikely(atomic_long_sub_and_test(nr, &ref->count))) ref->release(ref); rcu_read_unlock_sched(); } +/** + * percpu_ref_put - decrement a percpu refcount + * @ref: percpu_ref to put + * + * Decrement the refcount, and if 0, call the release function (which was passed + * to percpu_ref_init()) + * + * This function is safe to call as long as @ref is between init and exit. 
+ */ +static inline void percpu_ref_put(struct percpu_ref *ref) +{ + percpu_ref_put_many(ref, 1); +} + /** * percpu_ref_is_zero - test whether a percpu refcount reached zero * @ref: percpu_ref to test diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c3cd3bb77dd9..f69da2ac6323 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2273,6 +2273,7 @@ static void drain_stock(struct memcg_stock_pcp *stock) page_counter_uncharge(&old->memory, stock->nr_pages); if (do_swap_account) page_counter_uncharge(&old->memsw, stock->nr_pages); + css_put_many(&old->css, stock->nr_pages); stock->nr_pages = 0; } stock->cached = NULL; @@ -2530,6 +2531,7 @@ bypass: return -EINTR; done_restock: + css_get_many(&memcg->css, batch); if (batch > nr_pages) refill_stock(memcg, batch - nr_pages); done: @@ -2544,6 +2546,8 @@ static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) page_counter_uncharge(&memcg->memory, nr_pages); if (do_swap_account) page_counter_uncharge(&memcg->memsw, nr_pages); + + css_put_many(&memcg->css, nr_pages); } /* @@ -2739,6 +2743,7 @@ static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, page_counter_charge(&memcg->memory, nr_pages); if (do_swap_account) page_counter_charge(&memcg->memsw, nr_pages); + css_get_many(&memcg->css, nr_pages); ret = 0; } else if (ret) page_counter_uncharge(&memcg->kmem, nr_pages); @@ -2754,8 +2759,10 @@ static void memcg_uncharge_kmem(struct mem_cgroup *memcg, page_counter_uncharge(&memcg->memsw, nr_pages); /* Not down to 0 */ - if (page_counter_uncharge(&memcg->kmem, nr_pages)) + if (page_counter_uncharge(&memcg->kmem, nr_pages)) { + css_put_many(&memcg->css, nr_pages); return; + } /* * Releases a reference taken in kmem_cgroup_css_offline in case @@ -2767,6 +2774,8 @@ static void memcg_uncharge_kmem(struct mem_cgroup *memcg, */ if (memcg_kmem_test_and_clear_dead(memcg)) css_put(&memcg->css); + + css_put_many(&memcg->css, nr_pages); } /* @@ -3394,10 +3403,13 @@ static int mem_cgroup_move_parent(struct page *page, ret = mem_cgroup_move_account(page, nr_pages, pc, child, parent); if (!ret) { + if (!mem_cgroup_is_root(parent)) + css_get_many(&parent->css, nr_pages); /* Take charge off the local counters */ page_counter_cancel(&child->memory, nr_pages); if (do_swap_account) page_counter_cancel(&child->memsw, nr_pages); + css_put_many(&child->css, nr_pages); } if (nr_pages > 1) @@ -5767,7 +5779,6 @@ static void __mem_cgroup_clear_mc(void) { struct mem_cgroup *from = mc.from; struct mem_cgroup *to = mc.to; - int i; /* we must uncharge all the leftover precharges from mc.to */ if (mc.precharge) { @@ -5795,8 +5806,7 @@ static void __mem_cgroup_clear_mc(void) if (!mem_cgroup_is_root(mc.to)) page_counter_uncharge(&mc.to->memory, mc.moved_swap); - for (i = 0; i < mc.moved_swap; i++) - css_put(&mc.from->css); + css_put_many(&mc.from->css, mc.moved_swap); /* we've already done css_get(mc.to) */ mc.moved_swap = 0; @@ -6343,6 +6353,9 @@ static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout, __this_cpu_add(memcg->stat->nr_page_events, nr_anon + nr_file); memcg_check_events(memcg, dummy_page); local_irq_restore(flags); + + if (!mem_cgroup_is_root(memcg)) + css_put_many(&memcg->css, max(nr_mem, nr_memsw)); } static void uncharge_list(struct list_head *page_list) -- cgit v1.2.3 From 64f2199389414341ed3a570663f23616c131ba25 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 10 Dec 2014 15:42:45 -0800 Subject: mm: memcontrol: remove obsolete kmemcg pinning tricks As charges now pin the css explicitely, there is no more 
need for kmemcg to acquire a proxy reference for outstanding pages during offlining, or maintain state to identify such "dead" groups. This was the last user of the uncharge functions' return values, so remove them as well. Signed-off-by: Johannes Weiner Reviewed-by: Vladimir Davydov Acked-by: Michal Hocko Cc: David Rientjes Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page_counter.h | 4 +-- mm/memcontrol.c | 74 +------------------------------------------- mm/page_counter.c | 23 +++----------- 3 files changed, 7 insertions(+), 94 deletions(-) (limited to 'include') diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h index 7cce3be99ff3..955421575d16 100644 --- a/include/linux/page_counter.h +++ b/include/linux/page_counter.h @@ -34,12 +34,12 @@ static inline unsigned long page_counter_read(struct page_counter *counter) return atomic_long_read(&counter->count); } -int page_counter_cancel(struct page_counter *counter, unsigned long nr_pages); +void page_counter_cancel(struct page_counter *counter, unsigned long nr_pages); void page_counter_charge(struct page_counter *counter, unsigned long nr_pages); int page_counter_try_charge(struct page_counter *counter, unsigned long nr_pages, struct page_counter **fail); -int page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages); +void page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages); int page_counter_limit(struct page_counter *counter, unsigned long limit); int page_counter_memparse(const char *buf, unsigned long *nr_pages); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index f69da2ac6323..0e6484ea268d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -369,7 +369,6 @@ struct mem_cgroup { /* internal only representation about the status of kmem accounting. */ enum { KMEM_ACCOUNTED_ACTIVE, /* accounted by this cgroup itself */ - KMEM_ACCOUNTED_DEAD, /* dead memcg with pending kmem charges */ }; #ifdef CONFIG_MEMCG_KMEM @@ -383,22 +382,6 @@ static bool memcg_kmem_is_active(struct mem_cgroup *memcg) return test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags); } -static void memcg_kmem_mark_dead(struct mem_cgroup *memcg) -{ - /* - * Our caller must use css_get() first, because memcg_uncharge_kmem() - * will call css_put() if it sees the memcg is dead. - */ - smp_wmb(); - if (test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags)) - set_bit(KMEM_ACCOUNTED_DEAD, &memcg->kmem_account_flags); -} - -static bool memcg_kmem_test_and_clear_dead(struct mem_cgroup *memcg) -{ - return test_and_clear_bit(KMEM_ACCOUNTED_DEAD, - &memcg->kmem_account_flags); -} #endif /* Stuffs for move charges at task migration. */ @@ -2758,22 +2741,7 @@ static void memcg_uncharge_kmem(struct mem_cgroup *memcg, if (do_swap_account) page_counter_uncharge(&memcg->memsw, nr_pages); - /* Not down to 0 */ - if (page_counter_uncharge(&memcg->kmem, nr_pages)) { - css_put_many(&memcg->css, nr_pages); - return; - } - - /* - * Releases a reference taken in kmem_cgroup_css_offline in case - * this last uncharge is racing with the offlining code or it is - * outliving the memcg existence. - * - * The memory barrier imposed by test&clear is paired with the - * explicit one in memcg_kmem_mark_dead(). 
- */ - if (memcg_kmem_test_and_clear_dead(memcg)) - css_put(&memcg->css); + page_counter_uncharge(&memcg->kmem, nr_pages); css_put_many(&memcg->css, nr_pages); } @@ -4757,40 +4725,6 @@ static void memcg_destroy_kmem(struct mem_cgroup *memcg) { mem_cgroup_sockets_destroy(memcg); } - -static void kmem_cgroup_css_offline(struct mem_cgroup *memcg) -{ - if (!memcg_kmem_is_active(memcg)) - return; - - /* - * kmem charges can outlive the cgroup. In the case of slab - * pages, for instance, a page contain objects from various - * processes. As we prevent from taking a reference for every - * such allocation we have to be careful when doing uncharge - * (see memcg_uncharge_kmem) and here during offlining. - * - * The idea is that that only the _last_ uncharge which sees - * the dead memcg will drop the last reference. An additional - * reference is taken here before the group is marked dead - * which is then paired with css_put during uncharge resp. here. - * - * Although this might sound strange as this path is called from - * css_offline() when the referencemight have dropped down to 0 and - * shouldn't be incremented anymore (css_tryget_online() would - * fail) we do not have other options because of the kmem - * allocations lifetime. - */ - css_get(&memcg->css); - - memcg_kmem_mark_dead(memcg); - - if (page_counter_read(&memcg->kmem)) - return; - - if (memcg_kmem_test_and_clear_dead(memcg)) - css_put(&memcg->css); -} #else static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) { @@ -4800,10 +4734,6 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) static void memcg_destroy_kmem(struct mem_cgroup *memcg) { } - -static void kmem_cgroup_css_offline(struct mem_cgroup *memcg) -{ -} #endif /* @@ -5407,8 +5337,6 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) } spin_unlock(&memcg->event_list_lock); - kmem_cgroup_css_offline(memcg); - /* * This requires that offlining is serialized. Right now that is * guaranteed because css_killed_work_fn() holds the cgroup_mutex. diff --git a/mm/page_counter.c b/mm/page_counter.c index f0cbc0825426..a009574fbba9 100644 --- a/mm/page_counter.c +++ b/mm/page_counter.c @@ -16,19 +16,14 @@ * page_counter_cancel - take pages out of the local counter * @counter: counter * @nr_pages: number of pages to cancel - * - * Returns whether there are remaining pages in the counter. */ -int page_counter_cancel(struct page_counter *counter, unsigned long nr_pages) +void page_counter_cancel(struct page_counter *counter, unsigned long nr_pages) { long new; new = atomic_long_sub_return(nr_pages, &counter->count); - /* More uncharges than charges? */ WARN_ON_ONCE(new < 0); - - return new > 0; } /** @@ -117,23 +112,13 @@ failed: * page_counter_uncharge - hierarchically uncharge pages * @counter: counter * @nr_pages: number of pages to uncharge - * - * Returns whether there are remaining charges in @counter. 
*/ -int page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages) +void page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages) { struct page_counter *c; - int ret = 1; - for (c = counter; c; c = c->parent) { - int remainder; - - remainder = page_counter_cancel(c, nr_pages); - if (c == counter && !remainder) - ret = 0; - } - - return ret; + for (c = counter; c; c = c->parent) + page_counter_cancel(c, nr_pages); } /** -- cgit v1.2.3 From 93481ff0e5a0c7636359a7ee52248856da5e7859 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 10 Dec 2014 15:43:01 -0800 Subject: mm: introduce single zone pcplists drain The functions for draining per-cpu pages back to buddy allocators currently always operate on all zones. There are however several cases where the drain is only needed in the context of a single zone, and spilling other pcplists is a waste of time both due to the extra spilling and later refilling. This patch introduces new zone pointer parameter to drain_all_pages() and changes the dummy parameter of drain_local_pages() to be also a zone pointer. When NULL is passed, the functions operate on all zones as usual. Passing a specific zone pointer reduces the work to the single zone. All callers are updated to pass the NULL pointer in this patch. Conversion to single zone (where appropriate) is done in further patches. Signed-off-by: Vlastimil Babka Cc: Naoya Horiguchi Cc: Mel Gorman Cc: Rik van Riel Cc: Yasuaki Ishimatsu Cc: Zhang Yanfei Cc: Xishi Qiu Cc: Vladimir Davydov Cc: Joonsoo Kim Cc: Michal Nazarewicz Cc: Marek Szyprowski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 4 +-- mm/memory-failure.c | 4 +-- mm/memory_hotplug.c | 4 +-- mm/page_alloc.c | 81 ++++++++++++++++++++++++++++++++++++----------------- mm/page_isolation.c | 2 +- 5 files changed, 63 insertions(+), 32 deletions(-) (limited to 'include') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 41b30fd4d041..07d2699cdb51 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -381,8 +381,8 @@ extern void free_kmem_pages(unsigned long addr, unsigned int order); void page_alloc_init(void); void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp); -void drain_all_pages(void); -void drain_local_pages(void *dummy); +void drain_all_pages(struct zone *zone); +void drain_local_pages(struct zone *zone); /* * gfp_allowed_mask is set to GFP_BOOT_MASK during early boot to restrict what diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 8639f6b28746..851b4d7eef3a 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -233,7 +233,7 @@ void shake_page(struct page *p, int access) lru_add_drain_all(); if (PageLRU(p)) return; - drain_all_pages(); + drain_all_pages(NULL); if (PageLRU(p) || is_free_buddy_page(p)) return; } @@ -1661,7 +1661,7 @@ static int __soft_offline_page(struct page *page, int flags) if (!is_free_buddy_page(page)) lru_add_drain_all(); if (!is_free_buddy_page(page)) - drain_all_pages(); + drain_all_pages(NULL); SetPageHWPoison(page); if (!is_free_buddy_page(page)) pr_info("soft offline: %#lx: page leaked\n", diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 1bf4807cb21e..aa0c6e5a3065 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1725,7 +1725,7 @@ repeat: if (drain) { lru_add_drain_all(); cond_resched(); - drain_all_pages(); + drain_all_pages(NULL); } pfn = scan_movable_pages(start_pfn, end_pfn); @@ -1747,7 +1747,7 @@ repeat: lru_add_drain_all(); yield(); /* drain pcp pages, 
this is synchronous. */ - drain_all_pages(); + drain_all_pages(NULL); /* * dissolve free hugepages in the memory block before doing offlining * actually in order to make hugetlbfs's object counting consistent. diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 701fe9018fdc..13d5796de8f3 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1267,55 +1267,75 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) #endif /* - * Drain pages of the indicated processor. + * Drain pcplists of the indicated processor and zone. * * The processor must either be the current processor and the * thread pinned to the current processor or a processor that * is not online. */ -static void drain_pages(unsigned int cpu) +static void drain_pages_zone(unsigned int cpu, struct zone *zone) { unsigned long flags; - struct zone *zone; + struct per_cpu_pageset *pset; + struct per_cpu_pages *pcp; - for_each_populated_zone(zone) { - struct per_cpu_pageset *pset; - struct per_cpu_pages *pcp; + local_irq_save(flags); + pset = per_cpu_ptr(zone->pageset, cpu); - local_irq_save(flags); - pset = per_cpu_ptr(zone->pageset, cpu); + pcp = &pset->pcp; + if (pcp->count) { + free_pcppages_bulk(zone, pcp->count, pcp); + pcp->count = 0; + } + local_irq_restore(flags); +} - pcp = &pset->pcp; - if (pcp->count) { - free_pcppages_bulk(zone, pcp->count, pcp); - pcp->count = 0; - } - local_irq_restore(flags); +/* + * Drain pcplists of all zones on the indicated processor. + * + * The processor must either be the current processor and the + * thread pinned to the current processor or a processor that + * is not online. + */ +static void drain_pages(unsigned int cpu) +{ + struct zone *zone; + + for_each_populated_zone(zone) { + drain_pages_zone(cpu, zone); } } /* * Spill all of this CPU's per-cpu pages back into the buddy allocator. + * + * The CPU has to be pinned. When zone parameter is non-NULL, spill just + * the single zone's pages. */ -void drain_local_pages(void *arg) +void drain_local_pages(struct zone *zone) { - drain_pages(smp_processor_id()); + int cpu = smp_processor_id(); + + if (zone) + drain_pages_zone(cpu, zone); + else + drain_pages(cpu); } /* * Spill all the per-cpu pages from all CPUs back into the buddy allocator. * + * When zone parameter is non-NULL, spill just the single zone's pages. + * * Note that this code is protected against sending an IPI to an offline * CPU but does not guarantee sending an IPI to newly hotplugged CPUs: * on_each_cpu_mask() blocks hotplug and won't talk to offlined CPUs but * nothing keeps CPUs from showing up after we populated the cpumask and * before the call to on_each_cpu_mask(). 
*/ -void drain_all_pages(void) +void drain_all_pages(struct zone *zone) { int cpu; - struct per_cpu_pageset *pcp; - struct zone *zone; /* * Allocate in the BSS so we wont require allocation in @@ -1330,20 +1350,31 @@ void drain_all_pages(void) * disables preemption as part of its processing */ for_each_online_cpu(cpu) { + struct per_cpu_pageset *pcp; + struct zone *z; bool has_pcps = false; - for_each_populated_zone(zone) { + + if (zone) { pcp = per_cpu_ptr(zone->pageset, cpu); - if (pcp->pcp.count) { + if (pcp->pcp.count) has_pcps = true; - break; + } else { + for_each_populated_zone(z) { + pcp = per_cpu_ptr(z->pageset, cpu); + if (pcp->pcp.count) { + has_pcps = true; + break; + } } } + if (has_pcps) cpumask_set_cpu(cpu, &cpus_with_pcps); else cpumask_clear_cpu(cpu, &cpus_with_pcps); } - on_each_cpu_mask(&cpus_with_pcps, drain_local_pages, NULL, 1); + on_each_cpu_mask(&cpus_with_pcps, (smp_call_func_t) drain_local_pages, + zone, 1); } #ifdef CONFIG_HIBERNATION @@ -2433,7 +2464,7 @@ retry: * pages are pinned on the per-cpu lists. Drain them and try again */ if (!page && !drained) { - drain_all_pages(); + drain_all_pages(NULL); drained = true; goto retry; } @@ -6385,7 +6416,7 @@ int alloc_contig_range(unsigned long start, unsigned long end, */ lru_add_drain_all(); - drain_all_pages(); + drain_all_pages(NULL); order = 0; outer_start = start; diff --git a/mm/page_isolation.c b/mm/page_isolation.c index c8778f7e208e..f2452e5116b2 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -68,7 +68,7 @@ out: spin_unlock_irqrestore(&zone->lock, flags); if (!ret) - drain_all_pages(); + drain_all_pages(NULL); return ret; } -- cgit v1.2.3 From ebff398017c69a3810bcbc5200ba224d5ccaa207 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 10 Dec 2014 15:43:22 -0800 Subject: mm, compaction: pass classzone_idx and alloc_flags to watermark checking Compaction relies on zone watermark checks for decisions such as if it's worth to start compacting in compaction_suitable() or whether compaction should stop in compact_finished(). The watermark checks take classzone_idx and alloc_flags parameters, which are related to the memory allocation request. But from the context of compaction they are currently passed as 0, including the direct compaction which is invoked to satisfy the allocation request, and could therefore know the proper values. The lack of proper values can lead to mismatch between decisions taken during compaction and decisions related to the allocation request. Lack of proper classzone_idx value means that lowmem_reserve is not taken into account. This has manifested (during recent changes to deferred compaction) when DMA zone was used as fallback for preferred Normal zone. compaction_suitable() without proper classzone_idx would think that the watermarks are already satisfied, but watermark check in get_page_from_freelist() would fail. Because of this problem, deferring compaction has extra complexity that can be removed in the following patch. The issue (not confirmed in practice) with missing alloc_flags is opposite in nature. For allocations that include ALLOC_HIGH, ALLOC_HIGHER or ALLOC_CMA in alloc_flags (the last includes all MOVABLE allocations on CMA-enabled systems) the watermark checking in compaction with 0 passed will be stricter than in get_page_from_freelist(). In these cases compaction might be running for a longer time than is really needed. Another issue compaction_suitable() is that the check for "does the zone need compaction at all?" 
comes only after the check "does the zone have enough free pages to succeed compaction". The latter considers extra pages for migration and can therefore in some situations fail and return COMPACT_SKIPPED, although the high-order allocation would succeed and we should return COMPACT_PARTIAL. This patch fixes these problems by adding alloc_flags and classzone_idx to struct compact_control and related functions involved in direct compaction and watermark checking. Where possible, all other callers of compaction_suitable() pass proper values where those are known. This is currently limited to classzone_idx, which is sometimes known in kswapd context. However, the direct reclaim callers should_continue_reclaim() and compaction_ready() do not currently know the proper values, so the coordination between reclaim and compaction may still not be as accurate as it could be. This can be fixed later, if it's shown to be an issue. Additionally, the checks in compaction_suitable() are reordered to address the second issue described above. The effect of this patch should be slightly better high-order allocation success rates and/or less compaction overhead, depending on the type of allocations and presence of CMA. It allows simplifying deferred compaction code in a followup patch. When testing with stress-highalloc, there was some slight improvement (which might be just due to variance) in success rates of non-THP-like allocations. Signed-off-by: Vlastimil Babka Cc: Minchan Kim Cc: Mel Gorman Cc: Joonsoo Kim Cc: Michal Nazarewicz Cc: Naoya Horiguchi Cc: Christoph Lameter Acked-by: Rik van Riel Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compaction.h | 8 ++++++-- mm/compaction.c | 48 ++++++++++++++++++++++++++-------------------- mm/internal.h | 2 ++ mm/page_alloc.c | 1 + mm/vmscan.c | 12 ++++++------ 5 files changed, 42 insertions(+), 29 deletions(-) (limited to 'include') diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 60bdf8dc02a3..d896765a15b0 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -33,10 +33,12 @@ extern int fragmentation_index(struct zone *zone, unsigned int order); extern unsigned long try_to_compact_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *mask, enum migrate_mode mode, int *contended, + int alloc_flags, int classzone_idx, struct zone **candidate_zone); extern void compact_pgdat(pg_data_t *pgdat, int order); extern void reset_isolation_suitable(pg_data_t *pgdat); -extern unsigned long compaction_suitable(struct zone *zone, int order); +extern unsigned long compaction_suitable(struct zone *zone, int order, + int alloc_flags, int classzone_idx); /* Do not skip compaction more than 64 times */ #define COMPACT_MAX_DEFER_SHIFT 6 @@ -103,6 +105,7 @@ static inline bool compaction_restarting(struct zone *zone, int order) static inline unsigned long try_to_compact_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *nodemask, enum migrate_mode mode, int *contended, + int alloc_flags, int classzone_idx, struct zone **candidate_zone) { return COMPACT_CONTINUE; } @@ -116,7 +119,8 @@ static inline void reset_isolation_suitable(pg_data_t *pgdat) { } -static inline unsigned long compaction_suitable(struct zone *zone, int order) +static inline unsigned long compaction_suitable(struct zone *zone, int order, + int alloc_flags, int classzone_idx) { return COMPACT_SKIPPED; } diff --git a/mm/compaction.c b/mm/compaction.c index f9792ba3537c..1fc6736815e0
100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1086,9 +1086,9 @@ static int compact_finished(struct zone *zone, struct compact_control *cc, /* Compaction run is not finished if the watermark is not met */ watermark = low_wmark_pages(zone); - watermark += (1 << cc->order); - if (!zone_watermark_ok(zone, cc->order, watermark, 0, 0)) + if (!zone_watermark_ok(zone, cc->order, watermark, cc->classzone_idx, + cc->alloc_flags)) return COMPACT_CONTINUE; /* Direct compactor: Is a suitable page free? */ @@ -1114,7 +1114,8 @@ static int compact_finished(struct zone *zone, struct compact_control *cc, * COMPACT_PARTIAL - If the allocation would succeed without compaction * COMPACT_CONTINUE - If compaction should run now */ -unsigned long compaction_suitable(struct zone *zone, int order) +unsigned long compaction_suitable(struct zone *zone, int order, + int alloc_flags, int classzone_idx) { int fragindex; unsigned long watermark; @@ -1126,21 +1127,30 @@ unsigned long compaction_suitable(struct zone *zone, int order) if (order == -1) return COMPACT_CONTINUE; + watermark = low_wmark_pages(zone); + /* + * If watermarks for high-order allocation are already met, there + * should be no need for compaction at all. + */ + if (zone_watermark_ok(zone, order, watermark, classzone_idx, + alloc_flags)) + return COMPACT_PARTIAL; + /* * Watermarks for order-0 must be met for compaction. Note the 2UL. * This is because during migration, copies of pages need to be * allocated and for a short time, the footprint is higher */ - watermark = low_wmark_pages(zone) + (2UL << order); - if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) + watermark += (2UL << order); + if (!zone_watermark_ok(zone, 0, watermark, classzone_idx, alloc_flags)) return COMPACT_SKIPPED; /* * fragmentation index determines if allocation failures are due to * low memory or external fragmentation * - * index of -1000 implies allocations might succeed depending on - * watermarks + * index of -1000 would imply allocations might succeed depending on + * watermarks, but we already failed the high-order watermark check * index towards 0 implies failure is due to lack of memory * index towards 1000 implies failure is due to fragmentation * @@ -1150,10 +1160,6 @@ unsigned long compaction_suitable(struct zone *zone, int order) if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold) return COMPACT_SKIPPED; - if (fragindex == -1000 && zone_watermark_ok(zone, order, watermark, - 0, 0)) - return COMPACT_PARTIAL; - return COMPACT_CONTINUE; } @@ -1165,7 +1171,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) const int migratetype = gfpflags_to_migratetype(cc->gfp_mask); const bool sync = cc->mode != MIGRATE_ASYNC; - ret = compaction_suitable(zone, cc->order); + ret = compaction_suitable(zone, cc->order, cc->alloc_flags, + cc->classzone_idx); switch (ret) { case COMPACT_PARTIAL: case COMPACT_SKIPPED: @@ -1254,7 +1261,8 @@ out: } static unsigned long compact_zone_order(struct zone *zone, int order, - gfp_t gfp_mask, enum migrate_mode mode, int *contended) + gfp_t gfp_mask, enum migrate_mode mode, int *contended, + int alloc_flags, int classzone_idx) { unsigned long ret; struct compact_control cc = { @@ -1264,6 +1272,8 @@ static unsigned long compact_zone_order(struct zone *zone, int order, .gfp_mask = gfp_mask, .zone = zone, .mode = mode, + .alloc_flags = alloc_flags, + .classzone_idx = classzone_idx, }; INIT_LIST_HEAD(&cc.freepages); INIT_LIST_HEAD(&cc.migratepages); @@ -1295,6 +1305,7 @@ int sysctl_extfrag_threshold = 500; 
unsigned long try_to_compact_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *nodemask, enum migrate_mode mode, int *contended, + int alloc_flags, int classzone_idx, struct zone **candidate_zone) { enum zone_type high_zoneidx = gfp_zone(gfp_mask); @@ -1303,7 +1314,6 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, struct zoneref *z; struct zone *zone; int rc = COMPACT_DEFERRED; - int alloc_flags = 0; int all_zones_contended = COMPACT_CONTENDED_LOCK; /* init for &= op */ *contended = COMPACT_CONTENDED_NONE; @@ -1312,10 +1322,6 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, if (!order || !may_enter_fs || !may_perform_io) return COMPACT_SKIPPED; -#ifdef CONFIG_CMA - if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) - alloc_flags |= ALLOC_CMA; -#endif /* Compact each zone in the list */ for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, nodemask) { @@ -1326,7 +1332,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, continue; status = compact_zone_order(zone, order, gfp_mask, mode, - &zone_contended); + &zone_contended, alloc_flags, classzone_idx); rc = max(status, rc); /* * It takes at least one zone that wasn't lock contended @@ -1335,8 +1341,8 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, all_zones_contended &= zone_contended; /* If a normal allocation would succeed, stop compacting */ - if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, - alloc_flags)) { + if (zone_watermark_ok(zone, order, low_wmark_pages(zone), + classzone_idx, alloc_flags)) { *candidate_zone = zone; /* * We think the allocation will succeed in this zone, diff --git a/mm/internal.h b/mm/internal.h index a4f90ba7068e..b643938fcf12 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -168,6 +168,8 @@ struct compact_control { int order; /* order a direct compactor needs */ const gfp_t gfp_mask; /* gfp mask of a direct compactor */ + const int alloc_flags; /* alloc flags of a direct compactor */ + const int classzone_idx; /* zone index of a direct compactor */ struct zone *zone; int contended; /* Signal need_sched() or lock * contention detected during diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b7c18f094697..e32121fa2ba9 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2341,6 +2341,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, compact_result = try_to_compact_pages(zonelist, order, gfp_mask, nodemask, mode, contended_compaction, + alloc_flags, classzone_idx, &last_compact_zone); current->flags &= ~PF_MEMALLOC; diff --git a/mm/vmscan.c b/mm/vmscan.c index 53157e157061..4636d9e822c1 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2249,7 +2249,7 @@ static inline bool should_continue_reclaim(struct zone *zone, return true; /* If compaction would go ahead or the allocation would succeed, stop */ - switch (compaction_suitable(zone, sc->order)) { + switch (compaction_suitable(zone, sc->order, 0, 0)) { case COMPACT_PARTIAL: case COMPACT_CONTINUE: return false; @@ -2346,7 +2346,7 @@ static inline bool compaction_ready(struct zone *zone, int order) * If compaction is not ready to start and allocation is not likely * to succeed without it, then keep reclaiming. 
*/ - if (compaction_suitable(zone, order) == COMPACT_SKIPPED) + if (compaction_suitable(zone, order, 0, 0) == COMPACT_SKIPPED) return false; return watermark_ok; @@ -2824,8 +2824,8 @@ static bool zone_balanced(struct zone *zone, int order, balance_gap, classzone_idx, 0)) return false; - if (IS_ENABLED(CONFIG_COMPACTION) && order && - compaction_suitable(zone, order) == COMPACT_SKIPPED) + if (IS_ENABLED(CONFIG_COMPACTION) && order && compaction_suitable(zone, + order, 0, classzone_idx) == COMPACT_SKIPPED) return false; return true; @@ -2952,8 +2952,8 @@ static bool kswapd_shrink_zone(struct zone *zone, * from memory. Do not reclaim more than needed for compaction. */ if (IS_ENABLED(CONFIG_COMPACTION) && sc->order && - compaction_suitable(zone, sc->order) != - COMPACT_SKIPPED) + compaction_suitable(zone, sc->order, 0, classzone_idx) + != COMPACT_SKIPPED) testorder = 0; /* -- cgit v1.2.3 From 97d47a65be1e513edd02325ae828c9997878b578 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 10 Dec 2014 15:43:25 -0800 Subject: mm, compaction: simplify deferred compaction Since commit 53853e2d2bfb ("mm, compaction: defer each zone individually instead of preferred zone"), compaction is deferred for each zone where sync direct compaction fails, and reset where it succeeds. However, it was observed that for DMA zone compaction often appeared to succeed while subsequent allocation attempt would not, due to different outcome of watermark check. In order to properly defer compaction in this zone, the candidate zone has to be passed back to __alloc_pages_direct_compact() and compaction deferred in the zone after the allocation attempt fails. The large source of mismatch between watermark check in compaction and allocation was the lack of alloc_flags and classzone_idx values in compaction, which has been fixed in the previous patch. So with this problem fixed, we can simplify the code by removing the candidate_zone parameter and deferring in __alloc_pages_direct_compact(). After this patch, the compaction activity during stress-highalloc benchmark is still somewhat increased, but it's negligible compared to the increase that occurred without the better watermark checking. This suggests that it is still possible to apparently succeed in compaction but fail to allocate, possibly due to parallel allocation activity. 
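For context on what "deferring" means here: failed compaction attempts are tracked per zone with an exponentially growing backoff window, capped by COMPACT_MAX_DEFER_SHIFT (visible in the compaction.h hunk above). The snippet below is a minimal userspace model of that bookkeeping, for illustration only; the function names mirror the kernel helpers, but struct zone_model and the function bodies are simplified assumptions rather than the real mm/compaction.c code.

#include <stdbool.h>
#include <stdio.h>

#define COMPACT_MAX_DEFER_SHIFT 6

struct zone_model {
        unsigned int compact_considered;
        unsigned int compact_defer_shift;
        int compact_order_failed;
};

/* Called when compaction ran but the following allocation still failed. */
static void defer_compaction(struct zone_model *zone, int order)
{
        zone->compact_considered = 0;
        zone->compact_defer_shift++;
        if (order < zone->compact_order_failed)
                zone->compact_order_failed = order;
        if (zone->compact_defer_shift > COMPACT_MAX_DEFER_SHIFT)
                zone->compact_defer_shift = COMPACT_MAX_DEFER_SHIFT;
}

/* Skip compaction until 1 << compact_defer_shift attempts have gone by. */
static bool compaction_deferred(struct zone_model *zone, int order)
{
        unsigned int defer_limit = 1U << zone->compact_defer_shift;

        if (order < zone->compact_order_failed)
                return false;
        if (++zone->compact_considered >= defer_limit)
                return false;   /* backoff window elapsed, try again */
        return true;
}

int main(void)
{
        struct zone_model z = { .compact_order_failed = 9 };
        int attempt;

        defer_compaction(&z, 3);        /* first failure at order 3 */
        defer_compaction(&z, 3);        /* second failure doubles the window */

        for (attempt = 1; attempt <= 5; attempt++)
                printf("attempt %d: deferred=%d\n",
                       attempt, compaction_deferred(&z, 3));
        return 0;
}

With two failures recorded, the model skips the next three attempts and allows compaction again on the fourth, which is the backoff behaviour that the simplified "defer after the allocation attempt fails" path relies on.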
[akpm@linux-foundation.org: fix build] Suggested-by: Joonsoo Kim Signed-off-by: Vlastimil Babka Cc: Minchan Kim Cc: Mel Gorman Cc: Michal Nazarewicz Cc: Naoya Horiguchi Cc: Christoph Lameter Cc: Rik van Riel Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compaction.h | 6 ++---- mm/compaction.c | 5 +---- mm/page_alloc.c | 12 +----------- 3 files changed, 4 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/include/linux/compaction.h b/include/linux/compaction.h index d896765a15b0..3238ffa33f68 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -33,8 +33,7 @@ extern int fragmentation_index(struct zone *zone, unsigned int order); extern unsigned long try_to_compact_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *mask, enum migrate_mode mode, int *contended, - int alloc_flags, int classzone_idx, - struct zone **candidate_zone); + int alloc_flags, int classzone_idx); extern void compact_pgdat(pg_data_t *pgdat, int order); extern void reset_isolation_suitable(pg_data_t *pgdat); extern unsigned long compaction_suitable(struct zone *zone, int order, @@ -105,8 +104,7 @@ static inline bool compaction_restarting(struct zone *zone, int order) static inline unsigned long try_to_compact_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *nodemask, enum migrate_mode mode, int *contended, - int alloc_flags, int classzone_idx, - struct zone **candidate_zone) + int alloc_flags, int classzone_idx) { return COMPACT_CONTINUE; } diff --git a/mm/compaction.c b/mm/compaction.c index 1fc6736815e0..75f4c1206d00 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1298,15 +1298,13 @@ int sysctl_extfrag_threshold = 500; * @mode: The migration mode for async, sync light, or sync migration * @contended: Return value that determines if compaction was aborted due to * need_resched() or lock contention - * @candidate_zone: Return the zone where we think allocation should succeed * * This is the main entry point for direct page compaction. */ unsigned long try_to_compact_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *nodemask, enum migrate_mode mode, int *contended, - int alloc_flags, int classzone_idx, - struct zone **candidate_zone) + int alloc_flags, int classzone_idx) { enum zone_type high_zoneidx = gfp_zone(gfp_mask); int may_enter_fs = gfp_mask & __GFP_FS; @@ -1343,7 +1341,6 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, /* If a normal allocation would succeed, stop compacting */ if (zone_watermark_ok(zone, order, low_wmark_pages(zone), classzone_idx, alloc_flags)) { - *candidate_zone = zone; /* * We think the allocation will succeed in this zone, * but it is not certain, hence the false. 
The caller diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e32121fa2ba9..edb0ce1e7cf3 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2330,7 +2330,6 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, int classzone_idx, int migratetype, enum migrate_mode mode, int *contended_compaction, bool *deferred_compaction) { - struct zone *last_compact_zone = NULL; unsigned long compact_result; struct page *page; @@ -2341,8 +2340,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, compact_result = try_to_compact_pages(zonelist, order, gfp_mask, nodemask, mode, contended_compaction, - alloc_flags, classzone_idx, - &last_compact_zone); + alloc_flags, classzone_idx); current->flags &= ~PF_MEMALLOC; switch (compact_result) { @@ -2379,14 +2377,6 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, return page; } - /* - * last_compact_zone is where try_to_compact_pages thought allocation - * should succeed, so it did not defer compaction. But here we know - * that it didn't succeed, so we do the defer. - */ - if (last_compact_zone && mode != MIGRATE_ASYNC) - defer_compaction(last_compact_zone, order); - /* * It's bad if compaction run occurs and fails. The most likely reason * is that pages exist, but not enough to satisfy watermarks. -- cgit v1.2.3 From 18eca2e636f921e6350dc31b5b450bb4102d664f Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 10 Dec 2014 15:43:57 -0800 Subject: mm: memcontrol: remove unnecessary PCG_MEMSW memory+swap charge flag Now that mem_cgroup_swapout() fully uncharges the page, every page that is still in use when reaching mem_cgroup_uncharge() is known to carry both the memory and the memory+swap charge. Simplify the uncharge path and remove the PCG_MEMSW page flag accordingly. Signed-off-by: Johannes Weiner Cc: Hugh Dickins Acked-by: Michal Hocko Reviewed-by: Vladimir Davydov Acked-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page_cgroup.h | 1 - mm/memcontrol.c | 34 ++++++++++++---------------------- 2 files changed, 12 insertions(+), 23 deletions(-) (limited to 'include') diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h index 5c831f1eca79..da62ee2be28b 100644 --- a/include/linux/page_cgroup.h +++ b/include/linux/page_cgroup.h @@ -5,7 +5,6 @@ enum { /* flags for mem_cgroup */ PCG_USED = 0x01, /* This page is charged to a memcg */ PCG_MEM = 0x02, /* This page holds a memory charge */ - PCG_MEMSW = 0x04, /* This page holds a memory+swap charge */ }; struct pglist_data; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 266a440c89f9..baf3b535b180 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2614,7 +2614,7 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg, * have the page locked */ pc->mem_cgroup = memcg; - pc->flags = PCG_USED | PCG_MEM | (do_swap_account ? 
PCG_MEMSW : 0); + pc->flags = PCG_USED | PCG_MEM; if (lrucare) unlock_page_lru(page, isolated); @@ -5793,7 +5793,6 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) if (!PageCgroupUsed(pc)) return; - VM_BUG_ON_PAGE(!(pc->flags & PCG_MEMSW), page); memcg = pc->mem_cgroup; oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg)); @@ -5989,17 +5988,16 @@ void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg) } static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout, - unsigned long nr_mem, unsigned long nr_memsw, unsigned long nr_anon, unsigned long nr_file, unsigned long nr_huge, struct page *dummy_page) { + unsigned long nr_pages = nr_anon + nr_file; unsigned long flags; if (!mem_cgroup_is_root(memcg)) { - if (nr_mem) - page_counter_uncharge(&memcg->memory, nr_mem); - if (nr_memsw) - page_counter_uncharge(&memcg->memsw, nr_memsw); + page_counter_uncharge(&memcg->memory, nr_pages); + if (do_swap_account) + page_counter_uncharge(&memcg->memsw, nr_pages); memcg_oom_recover(memcg); } @@ -6008,23 +6006,21 @@ static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout, __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_CACHE], nr_file); __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], nr_huge); __this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT], pgpgout); - __this_cpu_add(memcg->stat->nr_page_events, nr_anon + nr_file); + __this_cpu_add(memcg->stat->nr_page_events, nr_pages); memcg_check_events(memcg, dummy_page); local_irq_restore(flags); if (!mem_cgroup_is_root(memcg)) - css_put_many(&memcg->css, max(nr_mem, nr_memsw)); + css_put_many(&memcg->css, nr_pages); } static void uncharge_list(struct list_head *page_list) { struct mem_cgroup *memcg = NULL; - unsigned long nr_memsw = 0; unsigned long nr_anon = 0; unsigned long nr_file = 0; unsigned long nr_huge = 0; unsigned long pgpgout = 0; - unsigned long nr_mem = 0; struct list_head *next; struct page *page; @@ -6051,10 +6047,9 @@ static void uncharge_list(struct list_head *page_list) if (memcg != pc->mem_cgroup) { if (memcg) { - uncharge_batch(memcg, pgpgout, nr_mem, nr_memsw, - nr_anon, nr_file, nr_huge, page); - pgpgout = nr_mem = nr_memsw = 0; - nr_anon = nr_file = nr_huge = 0; + uncharge_batch(memcg, pgpgout, nr_anon, nr_file, + nr_huge, page); + pgpgout = nr_anon = nr_file = nr_huge = 0; } memcg = pc->mem_cgroup; } @@ -6070,18 +6065,14 @@ static void uncharge_list(struct list_head *page_list) else nr_file += nr_pages; - if (pc->flags & PCG_MEM) - nr_mem += nr_pages; - if (pc->flags & PCG_MEMSW) - nr_memsw += nr_pages; pc->flags = 0; pgpgout++; } while (next != page_list); if (memcg) - uncharge_batch(memcg, pgpgout, nr_mem, nr_memsw, - nr_anon, nr_file, nr_huge, page); + uncharge_batch(memcg, pgpgout, nr_anon, nr_file, + nr_huge, page); } /** @@ -6166,7 +6157,6 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage, return; VM_BUG_ON_PAGE(!(pc->flags & PCG_MEM), oldpage); - VM_BUG_ON_PAGE(do_swap_account && !(pc->flags & PCG_MEMSW), oldpage); if (lrucare) lock_page_lru(oldpage, &isolated); -- cgit v1.2.3 From f4aaa8b43d90294ca7546317997c452600e9a8a7 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 10 Dec 2014 15:44:00 -0800 Subject: mm: memcontrol: remove unnecessary PCG_MEM memory charge flag PCG_MEM is a remnant from an earlier version of 0a31bc97c80c ("mm: memcontrol: rewrite uncharge API"), used to tell whether migration cleared a charge while leaving pc->mem_cgroup valid and PCG_USED set. 
But in the final version, mem_cgroup_migrate() directly uncharges the source page, rendering this distinction unnecessary. Remove it. Signed-off-by: Johannes Weiner Cc: Hugh Dickins Acked-by: Michal Hocko Reviewed-by: Vladimir Davydov Acked-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page_cgroup.h | 1 - mm/memcontrol.c | 4 +--- 2 files changed, 1 insertion(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h index da62ee2be28b..97536e685843 100644 --- a/include/linux/page_cgroup.h +++ b/include/linux/page_cgroup.h @@ -4,7 +4,6 @@ enum { /* flags for mem_cgroup */ PCG_USED = 0x01, /* This page is charged to a memcg */ - PCG_MEM = 0x02, /* This page holds a memory charge */ }; struct pglist_data; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index baf3b535b180..3dfb56a93117 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2614,7 +2614,7 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg, * have the page locked */ pc->mem_cgroup = memcg; - pc->flags = PCG_USED | PCG_MEM; + pc->flags = PCG_USED; if (lrucare) unlock_page_lru(page, isolated); @@ -6156,8 +6156,6 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage, if (!PageCgroupUsed(pc)) return; - VM_BUG_ON_PAGE(!(pc->flags & PCG_MEM), oldpage); - if (lrucare) lock_page_lru(oldpage, &isolated); -- cgit v1.2.3 From 2983331575bfb248abfb02efb5140b4a299e3f45 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 10 Dec 2014 15:44:02 -0800 Subject: mm: memcontrol: remove unnecessary PCG_USED pc->mem_cgroup valid flag pc->mem_cgroup had to be left intact after uncharge for the final LRU removal, and !PCG_USED indicated whether the page was uncharged. But since commit 0a31bc97c80c ("mm: memcontrol: rewrite uncharge API") pages are uncharged after the final LRU removal. Uncharge can simply clear the pointer and the PCG_USED/PageCgroupUsed sites can test that instead. Because this is the last page_cgroup flag, this patch reduces the memcg per-page overhead to a single pointer. [akpm@linux-foundation.org: remove unneeded initialization of `memcg', per Michal] Signed-off-by: Johannes Weiner Cc: Hugh Dickins Acked-by: Michal Hocko Reviewed-by: Vladimir Davydov Acked-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page_cgroup.h | 10 ----- mm/memcontrol.c | 107 +++++++++++++++++--------------------------- 2 files changed, 41 insertions(+), 76 deletions(-) (limited to 'include') diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h index 97536e685843..1289be6b436c 100644 --- a/include/linux/page_cgroup.h +++ b/include/linux/page_cgroup.h @@ -1,11 +1,6 @@ #ifndef __LINUX_PAGE_CGROUP_H #define __LINUX_PAGE_CGROUP_H -enum { - /* flags for mem_cgroup */ - PCG_USED = 0x01, /* This page is charged to a memcg */ -}; - struct pglist_data; #ifdef CONFIG_MEMCG @@ -19,7 +14,6 @@ struct mem_cgroup; * then the page cgroup for pfn always exists. 
*/ struct page_cgroup { - unsigned long flags; struct mem_cgroup *mem_cgroup; }; @@ -39,10 +33,6 @@ static inline void page_cgroup_init(void) struct page_cgroup *lookup_page_cgroup(struct page *page); -static inline int PageCgroupUsed(struct page_cgroup *pc) -{ - return !!(pc->flags & PCG_USED); -} #else /* !CONFIG_MEMCG */ struct page_cgroup; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 3dfb56a93117..09fece0eb9f1 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1284,14 +1284,12 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone) pc = lookup_page_cgroup(page); memcg = pc->mem_cgroup; - /* * Swapcache readahead pages are added to the LRU - and - * possibly migrated - before they are charged. Ensure - * pc->mem_cgroup is sane. + * possibly migrated - before they are charged. */ - if (!PageLRU(page) && !PageCgroupUsed(pc) && memcg != root_mem_cgroup) - pc->mem_cgroup = memcg = root_mem_cgroup; + if (!memcg) + memcg = root_mem_cgroup; mz = mem_cgroup_page_zoneinfo(memcg, page); lruvec = &mz->lruvec; @@ -2151,7 +2149,7 @@ struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page, pc = lookup_page_cgroup(page); again: memcg = pc->mem_cgroup; - if (unlikely(!memcg || !PageCgroupUsed(pc))) + if (unlikely(!memcg)) return NULL; *locked = false; @@ -2159,7 +2157,7 @@ again: return memcg; move_lock_mem_cgroup(memcg, flags); - if (memcg != pc->mem_cgroup || !PageCgroupUsed(pc)) { + if (memcg != pc->mem_cgroup) { move_unlock_mem_cgroup(memcg, flags); goto again; } @@ -2525,7 +2523,7 @@ static struct mem_cgroup *mem_cgroup_lookup(unsigned short id) */ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) { - struct mem_cgroup *memcg = NULL; + struct mem_cgroup *memcg; struct page_cgroup *pc; unsigned short id; swp_entry_t ent; @@ -2533,9 +2531,10 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) VM_BUG_ON_PAGE(!PageLocked(page), page); pc = lookup_page_cgroup(page); - if (PageCgroupUsed(pc)) { - memcg = pc->mem_cgroup; - if (memcg && !css_tryget_online(&memcg->css)) + memcg = pc->mem_cgroup; + + if (memcg) { + if (!css_tryget_online(&memcg->css)) memcg = NULL; } else if (PageSwapCache(page)) { ent.val = page_private(page); @@ -2586,7 +2585,7 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg, struct page_cgroup *pc = lookup_page_cgroup(page); int isolated; - VM_BUG_ON_PAGE(PageCgroupUsed(pc), page); + VM_BUG_ON_PAGE(pc->mem_cgroup, page); /* * we don't need page_cgroup_lock about tail pages, becase they are not * accessed by any other context at this point. @@ -2601,7 +2600,7 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg, /* * Nobody should be changing or seriously looking at - * pc->mem_cgroup and pc->flags at this point: + * pc->mem_cgroup at this point: * * - the page is uncharged * @@ -2614,7 +2613,6 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg, * have the page locked */ pc->mem_cgroup = memcg; - pc->flags = PCG_USED; if (lrucare) unlock_page_lru(page, isolated); @@ -3126,37 +3124,22 @@ void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, memcg_uncharge_kmem(memcg, 1 << order); return; } - /* - * The page is freshly allocated and not visible to any - * outside callers yet. Set up pc non-atomically. 
- */ pc = lookup_page_cgroup(page); pc->mem_cgroup = memcg; - pc->flags = PCG_USED; } void __memcg_kmem_uncharge_pages(struct page *page, int order) { - struct mem_cgroup *memcg = NULL; - struct page_cgroup *pc; - - - pc = lookup_page_cgroup(page); - if (!PageCgroupUsed(pc)) - return; - - memcg = pc->mem_cgroup; - pc->flags = 0; + struct page_cgroup *pc = lookup_page_cgroup(page); + struct mem_cgroup *memcg = pc->mem_cgroup; - /* - * We trust that only if there is a memcg associated with the page, it - * is a valid allocation - */ if (!memcg) return; VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page); + memcg_uncharge_kmem(memcg, 1 << order); + pc->mem_cgroup = NULL; } #else static inline void memcg_unregister_all_caches(struct mem_cgroup *memcg) @@ -3174,23 +3157,16 @@ static inline void memcg_unregister_all_caches(struct mem_cgroup *memcg) */ void mem_cgroup_split_huge_fixup(struct page *head) { - struct page_cgroup *head_pc; - struct page_cgroup *pc; - struct mem_cgroup *memcg; + struct page_cgroup *pc = lookup_page_cgroup(head); int i; if (mem_cgroup_disabled()) return; - head_pc = lookup_page_cgroup(head); + for (i = 1; i < HPAGE_PMD_NR; i++) + pc[i].mem_cgroup = pc[0].mem_cgroup; - memcg = head_pc->mem_cgroup; - for (i = 1; i < HPAGE_PMD_NR; i++) { - pc = head_pc + i; - pc->mem_cgroup = memcg; - pc->flags = head_pc->flags; - } - __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], + __this_cpu_sub(pc[0].mem_cgroup->stat->count[MEM_CGROUP_STAT_RSS_HUGE], HPAGE_PMD_NR); } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ @@ -3240,7 +3216,7 @@ static int mem_cgroup_move_account(struct page *page, goto out; ret = -EINVAL; - if (!PageCgroupUsed(pc) || pc->mem_cgroup != from) + if (pc->mem_cgroup != from) goto out_unlock; move_lock_mem_cgroup(from, &flags); @@ -3350,7 +3326,7 @@ static struct page_cgroup *lookup_page_cgroup_used(struct page *page) * the first time, i.e. during boot or memory hotplug; * or when mem_cgroup_disabled(). */ - if (likely(pc) && PageCgroupUsed(pc)) + if (likely(pc) && pc->mem_cgroup) return pc; return NULL; } @@ -3368,10 +3344,8 @@ void mem_cgroup_print_bad_page(struct page *page) struct page_cgroup *pc; pc = lookup_page_cgroup_used(page); - if (pc) { - pr_alert("pc:%p pc->flags:%lx pc->mem_cgroup:%p\n", - pc, pc->flags, pc->mem_cgroup); - } + if (pc) + pr_alert("pc:%p pc->mem_cgroup:%p\n", pc, pc->mem_cgroup); } #endif @@ -5308,7 +5282,7 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, * mem_cgroup_move_account() checks the pc is valid or * not under LRU exclusion. 
*/ - if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) { + if (pc->mem_cgroup == mc.from) { ret = MC_TARGET_PAGE; if (target) target->page = page; @@ -5344,7 +5318,7 @@ static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, if (!move_anon()) return ret; pc = lookup_page_cgroup(page); - if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) { + if (pc->mem_cgroup == mc.from) { ret = MC_TARGET_PAGE; if (target) { get_page(page); @@ -5788,18 +5762,17 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) return; pc = lookup_page_cgroup(page); + memcg = pc->mem_cgroup; /* Readahead page, never charged */ - if (!PageCgroupUsed(pc)) + if (!memcg) return; - memcg = pc->mem_cgroup; - oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg)); VM_BUG_ON_PAGE(oldid, page); mem_cgroup_swap_statistics(memcg, true); - pc->flags = 0; + pc->mem_cgroup = NULL; if (!mem_cgroup_is_root(memcg)) page_counter_uncharge(&memcg->memory, 1); @@ -5874,7 +5847,7 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, * the page lock, which serializes swap cache removal, which * in turn serializes uncharging. */ - if (PageCgroupUsed(pc)) + if (pc->mem_cgroup) goto out; } @@ -6036,13 +6009,13 @@ static void uncharge_list(struct list_head *page_list) VM_BUG_ON_PAGE(page_count(page), page); pc = lookup_page_cgroup(page); - if (!PageCgroupUsed(pc)) + if (!pc->mem_cgroup) continue; /* * Nobody should be changing or seriously looking at - * pc->mem_cgroup and pc->flags at this point, we have - * fully exclusive access to the page. + * pc->mem_cgroup at this point, we have fully + * exclusive access to the page. */ if (memcg != pc->mem_cgroup) { @@ -6065,7 +6038,7 @@ static void uncharge_list(struct list_head *page_list) else nr_file += nr_pages; - pc->flags = 0; + pc->mem_cgroup = NULL; pgpgout++; } while (next != page_list); @@ -6091,7 +6064,7 @@ void mem_cgroup_uncharge(struct page *page) /* Don't touch page->lru of any random page, pre-check: */ pc = lookup_page_cgroup(page); - if (!PageCgroupUsed(pc)) + if (!pc->mem_cgroup) return; INIT_LIST_HEAD(&page->lru); @@ -6127,6 +6100,7 @@ void mem_cgroup_uncharge_list(struct list_head *page_list) void mem_cgroup_migrate(struct page *oldpage, struct page *newpage, bool lrucare) { + struct mem_cgroup *memcg; struct page_cgroup *pc; int isolated; @@ -6143,7 +6117,7 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage, /* Page cache replacement: new page already charged? */ pc = lookup_page_cgroup(newpage); - if (PageCgroupUsed(pc)) + if (pc->mem_cgroup) return; /* @@ -6153,18 +6127,19 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage, * reclaim just put back on the LRU but has not released yet. */ pc = lookup_page_cgroup(oldpage); - if (!PageCgroupUsed(pc)) + memcg = pc->mem_cgroup; + if (!memcg) return; if (lrucare) lock_page_lru(oldpage, &isolated); - pc->flags = 0; + pc->mem_cgroup = NULL; if (lrucare) unlock_page_lru(oldpage, isolated); - commit_charge(newpage, pc->mem_cgroup, lrucare); + commit_charge(newpage, memcg, lrucare); } /* -- cgit v1.2.3 From 97ad2be1daf8e6f2d297aa349101b340e1327917 Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Wed, 10 Dec 2014 15:44:13 -0800 Subject: mm, hugetlb: correct bit shift in hstate_sizelog() hstate_sizelog() would shift left an int rather than long, triggering undefined behaviour and passing an incorrect value when the requested page size was more than 4GB, thus breaking >4GB pages. 
Signed-off-by: Sasha Levin Cc: Andrea Arcangeli Cc: Mel Gorman Cc: Andrey Ryabinin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 6e6d338641fe..cdd149ca5cc0 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -311,7 +311,8 @@ static inline struct hstate *hstate_sizelog(int page_size_log) { if (!page_size_log) return &default_hstate; - return size_to_hstate(1 << page_size_log); + + return size_to_hstate(1UL << page_size_log); } static inline struct hstate *hstate_vma(struct vm_area_struct *vma) -- cgit v1.2.3 From b047501cd9f11d5e1d54ea0f90e2b10754021a0e Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Wed, 10 Dec 2014 15:44:19 -0800 Subject: memcg: use generic slab iterators for showing slabinfo Let's use generic slab_start/next/stop for showing memcg caches info. In contrast to the current implementation, this will work even if all memcg caches' info doesn't fit into a seq buffer (a page), plus it simply looks neater. Actually, the main reason I do this isn't mere cleanup. I'm going to zap the memcg_slab_caches list, because I find it useless provided we have the slab_caches list, and this patch is a step in this direction. It should be noted that before this patch an attempt to read memory.kmem.slabinfo of a cgroup that doesn't have kmem limit set resulted in -EIO, while after this patch it will silently show nothing except the header, but I don't think it will frustrate anyone. Signed-off-by: Vladimir Davydov Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slab.h | 4 ---- mm/memcontrol.c | 25 ++++--------------------- mm/slab.h | 1 + mm/slab_common.c | 25 +++++++++++++++++++------ 4 files changed, 24 insertions(+), 31 deletions(-) (limited to 'include') diff --git a/include/linux/slab.h b/include/linux/slab.h index c265bec6a57d..8a2457d42fc8 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -513,10 +513,6 @@ struct memcg_cache_params { int memcg_update_all_caches(int num_memcgs); -struct seq_file; -int cache_show(struct kmem_cache *s, struct seq_file *m); -void print_slabinfo_header(struct seq_file *m); - /** * kmalloc_array - allocate memory for an array. * @n: number of elements. 
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 32e3b191857d..9d30129b0d4a 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2547,26 +2547,6 @@ static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p) return cache_from_memcg_idx(cachep, memcg_cache_id(p->memcg)); } -#ifdef CONFIG_SLABINFO -static int mem_cgroup_slabinfo_read(struct seq_file *m, void *v) -{ - struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); - struct memcg_cache_params *params; - - if (!memcg_kmem_is_active(memcg)) - return -EIO; - - print_slabinfo_header(m); - - mutex_lock(&memcg_slab_mutex); - list_for_each_entry(params, &memcg->memcg_slab_caches, list) - cache_show(memcg_params_to_cache(params), m); - mutex_unlock(&memcg_slab_mutex); - - return 0; -} -#endif - static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, unsigned long nr_pages) { @@ -4708,7 +4688,10 @@ static struct cftype mem_cgroup_files[] = { #ifdef CONFIG_SLABINFO { .name = "kmem.slabinfo", - .seq_show = mem_cgroup_slabinfo_read, + .seq_start = slab_start, + .seq_next = slab_next, + .seq_stop = slab_stop, + .seq_show = memcg_slab_show, }, #endif #endif diff --git a/mm/slab.h b/mm/slab.h index 078acbcf64e8..1cf4005482dd 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -360,5 +360,6 @@ static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) void *slab_start(struct seq_file *m, loff_t *pos); void *slab_next(struct seq_file *m, void *p, loff_t *pos); void slab_stop(struct seq_file *m, void *p); +int memcg_slab_show(struct seq_file *m, void *p); #endif /* MM_SLAB_H */ diff --git a/mm/slab_common.c b/mm/slab_common.c index 2a3f5ff410cf..e03dd6f2a272 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -811,7 +811,7 @@ EXPORT_SYMBOL(kmalloc_order_trace); #define SLABINFO_RIGHTS S_IRUSR #endif -void print_slabinfo_header(struct seq_file *m) +static void print_slabinfo_header(struct seq_file *m) { /* * Output format version, so at least we can change it @@ -876,7 +876,7 @@ memcg_accumulate_slabinfo(struct kmem_cache *s, struct slabinfo *info) } } -int cache_show(struct kmem_cache *s, struct seq_file *m) +static void cache_show(struct kmem_cache *s, struct seq_file *m) { struct slabinfo sinfo; @@ -895,7 +895,6 @@ int cache_show(struct kmem_cache *s, struct seq_file *m) sinfo.active_slabs, sinfo.num_slabs, sinfo.shared_avail); slabinfo_show_stats(m, s); seq_putc(m, '\n'); - return 0; } static int slab_show(struct seq_file *m, void *p) @@ -904,10 +903,24 @@ static int slab_show(struct seq_file *m, void *p) if (p == slab_caches.next) print_slabinfo_header(m); - if (!is_root_cache(s)) - return 0; - return cache_show(s, m); + if (is_root_cache(s)) + cache_show(s, m); + return 0; +} + +#ifdef CONFIG_MEMCG_KMEM +int memcg_slab_show(struct seq_file *m, void *p) +{ + struct kmem_cache *s = list_entry(p, struct kmem_cache, list); + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + + if (p == slab_caches.next) + print_slabinfo_header(m); + if (!is_root_cache(s) && s->memcg_params->memcg == memcg) + cache_show(s, m); + return 0; } +#endif /* * slabinfo_op - iterator that generates /proc/slabinfo -- cgit v1.2.3 From 413918bb61b4fa027baa3e79546c47f15e4b9ea8 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 10 Dec 2014 15:44:30 -0800 Subject: mm: memcontrol: pull the NULL check from __mem_cgroup_same_or_subtree() The NULL in mm_match_cgroup() comes from a possibly exiting mm->owner. 
It makes a lot more sense to check where it's looked up, rather than check for it in __mem_cgroup_same_or_subtree() where it's unexpected. No other callsite passes NULL to __mem_cgroup_same_or_subtree(). Signed-off-by: Johannes Weiner Reviewed-by: Vladimir Davydov Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 5 +++-- mm/memcontrol.c | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index ea007615e8f9..e32ab948f589 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -83,11 +83,12 @@ static inline bool mm_match_cgroup(const struct mm_struct *mm, const struct mem_cgroup *memcg) { struct mem_cgroup *task_memcg; - bool match; + bool match = false; rcu_read_lock(); task_memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); - match = __mem_cgroup_same_or_subtree(memcg, task_memcg); + if (task_memcg) + match = __mem_cgroup_same_or_subtree(memcg, task_memcg); rcu_read_unlock(); return match; } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 367cc57df362..e5dcebd71dfb 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1337,7 +1337,7 @@ bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg, { if (root_memcg == memcg) return true; - if (!root_memcg->use_hierarchy || !memcg) + if (!root_memcg->use_hierarchy) return false; return cgroup_is_descendant(memcg->css.cgroup, root_memcg->css.cgroup); } -- cgit v1.2.3 From 2314b42db67be30b747122d65c6cd2c85da34538 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 10 Dec 2014 15:44:33 -0800 Subject: mm: memcontrol: drop bogus RCU locking from mem_cgroup_same_or_subtree() None of the mem_cgroup_same_or_subtree() callers actually require it to take the RCU lock, either because they hold it themselves or they have css references. Remove it. To make the API change clear, rename the leftover helper to mem_cgroup_is_descendant() to match cgroup_is_descendant(). 
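As a rough sketch of the semantics only (not of the actual css-based implementation), the descendant test boils down to an ancestor walk; struct memcg_model and its fields below are made up purely for illustration.

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct memcg_model {
        const char *name;
        struct memcg_model *parent;
        bool use_hierarchy;
};

static bool memcg_is_descendant(struct memcg_model *memcg,
                                struct memcg_model *root)
{
        if (root == memcg)
                return true;
        if (!root->use_hierarchy)
                return false;
        /* walk from memcg towards the top, looking for root as an ancestor */
        for (memcg = memcg->parent; memcg; memcg = memcg->parent)
                if (memcg == root)
                        return true;
        return false;
}

int main(void)
{
        struct memcg_model root  = { "root", NULL, true };
        struct memcg_model child = { "child", &root, true };

        printf("child under root: %d\n", memcg_is_descendant(&child, &root));
        printf("root under child: %d\n", memcg_is_descendant(&root, &child));
        return 0;
}

In the kernel the walk is delegated to cgroup_is_descendant() on the css objects, but the argument order shown in the mm_match_cgroup() hunk below is the same: the question being asked is whether task_memcg is a descendant of memcg.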
Signed-off-by: Johannes Weiner Reviewed-by: Vladimir Davydov Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 13 +++++----- mm/memcontrol.c | 59 +++++++++++++--------------------------------- mm/oom_kill.c | 4 ++-- 3 files changed, 24 insertions(+), 52 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index e32ab948f589..d4575a1d6e99 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -68,10 +68,9 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage, struct lruvec *mem_cgroup_zone_lruvec(struct zone *, struct mem_cgroup *); struct lruvec *mem_cgroup_page_lruvec(struct page *, struct zone *); -bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg, - struct mem_cgroup *memcg); -bool task_in_mem_cgroup(struct task_struct *task, - const struct mem_cgroup *memcg); +bool mem_cgroup_is_descendant(struct mem_cgroup *memcg, + struct mem_cgroup *root); +bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg); extern struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page); extern struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p); @@ -79,8 +78,8 @@ extern struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p); extern struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg); extern struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css); -static inline -bool mm_match_cgroup(const struct mm_struct *mm, const struct mem_cgroup *memcg) +static inline bool mm_match_cgroup(struct mm_struct *mm, + struct mem_cgroup *memcg) { struct mem_cgroup *task_memcg; bool match = false; @@ -88,7 +87,7 @@ bool mm_match_cgroup(const struct mm_struct *mm, const struct mem_cgroup *memcg) rcu_read_lock(); task_memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); if (task_memcg) - match = __mem_cgroup_same_or_subtree(memcg, task_memcg); + match = mem_cgroup_is_descendant(task_memcg, memcg); rcu_read_unlock(); return match; } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e5dcebd71dfb..b841bf430179 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1328,41 +1328,24 @@ void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, VM_BUG_ON((long)(*lru_size) < 0); } -/* - * Checks whether given mem is same or in the root_mem_cgroup's - * hierarchy subtree - */ -bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg, - struct mem_cgroup *memcg) +bool mem_cgroup_is_descendant(struct mem_cgroup *memcg, struct mem_cgroup *root) { - if (root_memcg == memcg) + if (root == memcg) return true; - if (!root_memcg->use_hierarchy) + if (!root->use_hierarchy) return false; - return cgroup_is_descendant(memcg->css.cgroup, root_memcg->css.cgroup); -} - -static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg, - struct mem_cgroup *memcg) -{ - bool ret; - - rcu_read_lock(); - ret = __mem_cgroup_same_or_subtree(root_memcg, memcg); - rcu_read_unlock(); - return ret; + return cgroup_is_descendant(memcg->css.cgroup, root->css.cgroup); } -bool task_in_mem_cgroup(struct task_struct *task, - const struct mem_cgroup *memcg) +bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg) { - struct mem_cgroup *curr; + struct mem_cgroup *task_memcg; struct task_struct *p; bool ret; p = find_lock_task_mm(task); if (p) { - curr = get_mem_cgroup_from_mm(p->mm); + task_memcg = get_mem_cgroup_from_mm(p->mm); task_unlock(p); } else { /* @@ 
-1371,18 +1354,12 @@ bool task_in_mem_cgroup(struct task_struct *task, * killed to prevent needlessly killing additional tasks. */ rcu_read_lock(); - curr = mem_cgroup_from_task(task); - css_get(&curr->css); + task_memcg = mem_cgroup_from_task(task); + css_get(&task_memcg->css); rcu_read_unlock(); } - /* - * We should check use_hierarchy of "memcg" not "curr". Because checking - * use_hierarchy of "curr" here make this function true if hierarchy is - * enabled in "curr" and "curr" is a child of "memcg" in *cgroup* - * hierarchy(even if use_hierarchy is disabled in "memcg"). - */ - ret = mem_cgroup_same_or_subtree(memcg, curr); - css_put(&curr->css); + ret = mem_cgroup_is_descendant(task_memcg, memcg); + css_put(&task_memcg->css); return ret; } @@ -1467,8 +1444,8 @@ static bool mem_cgroup_under_move(struct mem_cgroup *memcg) if (!from) goto unlock; - ret = mem_cgroup_same_or_subtree(memcg, from) - || mem_cgroup_same_or_subtree(memcg, to); + ret = mem_cgroup_is_descendant(from, memcg) || + mem_cgroup_is_descendant(to, memcg); unlock: spin_unlock(&mc.lock); return ret; @@ -1900,12 +1877,8 @@ static int memcg_oom_wake_function(wait_queue_t *wait, oom_wait_info = container_of(wait, struct oom_wait_info, wait); oom_wait_memcg = oom_wait_info->memcg; - /* - * Both of oom_wait_info->memcg and wake_memcg are stable under us. - * Then we can use css_is_ancestor without taking care of RCU. - */ - if (!mem_cgroup_same_or_subtree(oom_wait_memcg, wake_memcg) - && !mem_cgroup_same_or_subtree(wake_memcg, oom_wait_memcg)) + if (!mem_cgroup_is_descendant(wake_memcg, oom_wait_memcg) && + !mem_cgroup_is_descendant(oom_wait_memcg, wake_memcg)) return 0; return autoremove_wake_function(wait, mode, sync, arg); } @@ -2225,7 +2198,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg) memcg = stock->cached; if (!memcg || !stock->nr_pages) continue; - if (!mem_cgroup_same_or_subtree(root_memcg, memcg)) + if (!mem_cgroup_is_descendant(memcg, root_memcg)) continue; if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) { if (cpu == curcpu) diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 5340f6b91312..3b014d326151 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -119,7 +119,7 @@ found: /* return true if the task is not adequate as candidate victim task. */ static bool oom_unkillable_task(struct task_struct *p, - const struct mem_cgroup *memcg, const nodemask_t *nodemask) + struct mem_cgroup *memcg, const nodemask_t *nodemask) { if (is_global_init(p)) return true; @@ -353,7 +353,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, * State information includes task's pid, uid, tgid, vm size, rss, nr_ptes, * swapents, oom_score_adj value, and name. */ -static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t *nodemask) +static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask) { struct task_struct *p; struct task_struct *task; -- cgit v1.2.3 From e4bd6a0248b2a026e07c19995c41a4cb5a49d797 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 10 Dec 2014 15:44:39 -0800 Subject: mm, memcg: fix potential undefined behaviour in page stat accounting Since commit d7365e783edb ("mm: memcontrol: fix missed end-writeback page accounting") mem_cgroup_end_page_stat consumes locked and flags variables directly rather than via pointers which might trigger C undefined behavior as those variables are initialized only in the slow path of mem_cgroup_begin_page_stat. 
Although mem_cgroup_end_page_stat handles parameters correctly and touches them only when they hold a sensible value it is caller which loads a potentially uninitialized value which then might allow compiler to do crazy things. I haven't seen any warning from gcc and it seems that the current version (4.9) doesn't exploit this type undefined behavior but Sasha has reported the following: UBSan: Undefined behaviour in mm/rmap.c:1084:2 load of value 255 is not a valid value for type '_Bool' CPU: 4 PID: 8304 Comm: rngd Not tainted 3.18.0-rc2-next-20141029-sasha-00039-g77ed13d-dirty #1427 Call Trace: dump_stack (lib/dump_stack.c:52) ubsan_epilogue (lib/ubsan.c:159) __ubsan_handle_load_invalid_value (lib/ubsan.c:482) page_remove_rmap (mm/rmap.c:1084 mm/rmap.c:1096) unmap_page_range (./arch/x86/include/asm/atomic.h:27 include/linux/mm.h:463 mm/memory.c:1146 mm/memory.c:1258 mm/memory.c:1279 mm/memory.c:1303) unmap_single_vma (mm/memory.c:1348) unmap_vmas (mm/memory.c:1377 (discriminator 3)) exit_mmap (mm/mmap.c:2837) mmput (kernel/fork.c:659) do_exit (./arch/x86/include/asm/thread_info.h:168 kernel/exit.c:462 kernel/exit.c:747) do_group_exit (include/linux/sched.h:775 kernel/exit.c:873) SyS_exit_group (kernel/exit.c:901) tracesys_phase2 (arch/x86/kernel/entry_64.S:529) Fix this by using pointer parameters for both locked and flags and be more robust for future compiler changes even though the current code is implemented correctly. Signed-off-by: Michal Hocko Reported-by: Sasha Levin Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 6 +++--- mm/memcontrol.c | 8 ++++---- mm/page-writeback.c | 4 ++-- mm/rmap.c | 4 ++-- 4 files changed, 11 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index d4575a1d6e99..de018766be45 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -141,8 +141,8 @@ static inline bool mem_cgroup_disabled(void) struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page, bool *locked, unsigned long *flags); -void mem_cgroup_end_page_stat(struct mem_cgroup *memcg, bool locked, - unsigned long flags); +void mem_cgroup_end_page_stat(struct mem_cgroup *memcg, bool *locked, + unsigned long *flags); void mem_cgroup_update_page_stat(struct mem_cgroup *memcg, enum mem_cgroup_stat_index idx, int val); @@ -297,7 +297,7 @@ static inline struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page, } static inline void mem_cgroup_end_page_stat(struct mem_cgroup *memcg, - bool locked, unsigned long flags) + bool *locked, unsigned long *flags) { } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index b841bf430179..031ca345677b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2053,11 +2053,11 @@ again: * @locked: value received from mem_cgroup_begin_page_stat() * @flags: value received from mem_cgroup_begin_page_stat() */ -void mem_cgroup_end_page_stat(struct mem_cgroup *memcg, bool locked, - unsigned long flags) +void mem_cgroup_end_page_stat(struct mem_cgroup *memcg, bool *locked, + unsigned long *flags) { - if (memcg && locked) - spin_unlock_irqrestore(&memcg->move_lock, flags); + if (memcg && *locked) + spin_unlock_irqrestore(&memcg->move_lock, *flags); rcu_read_unlock(); } diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 19ceae87522d..d5d81f5384d1 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2357,7 +2357,7 @@ int test_clear_page_writeback(struct page *page) dec_zone_page_state(page, 
NR_WRITEBACK); inc_zone_page_state(page, NR_WRITTEN); } - mem_cgroup_end_page_stat(memcg, locked, memcg_flags); + mem_cgroup_end_page_stat(memcg, &locked, &memcg_flags); return ret; } @@ -2399,7 +2399,7 @@ int __test_set_page_writeback(struct page *page, bool keep_write) mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_WRITEBACK); inc_zone_page_state(page, NR_WRITEBACK); } - mem_cgroup_end_page_stat(memcg, locked, memcg_flags); + mem_cgroup_end_page_stat(memcg, &locked, &memcg_flags); return ret; } diff --git a/mm/rmap.c b/mm/rmap.c index 3e4c7213210c..45eba36fd673 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1053,7 +1053,7 @@ void page_add_file_rmap(struct page *page) __inc_zone_page_state(page, NR_FILE_MAPPED); mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED); } - mem_cgroup_end_page_stat(memcg, locked, flags); + mem_cgroup_end_page_stat(memcg, &locked, &flags); } static void page_remove_file_rmap(struct page *page) @@ -1083,7 +1083,7 @@ static void page_remove_file_rmap(struct page *page) if (unlikely(PageMlocked(page))) clear_page_mlock(page); out: - mem_cgroup_end_page_stat(memcg, locked, flags); + mem_cgroup_end_page_stat(memcg, &locked, &flags); } /** -- cgit v1.2.3 From 1306a85aed3ec3db98945aafb7dfbe5648a1203c Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 10 Dec 2014 15:44:52 -0800 Subject: mm: embed the memcg pointer directly into struct page Memory cgroups used to have 5 per-page pointers. To allow users to disable that amount of overhead during runtime, those pointers were allocated in a separate array, with a translation layer between them and struct page. There is now only one page pointer remaining: the memcg pointer, that indicates which cgroup the page is associated with when charged. The complexity of runtime allocation and the runtime translation overhead is no longer justified to save that *potential* 0.19% of memory. With CONFIG_SLUB, page->mem_cgroup actually sits in the doubleword padding after the page->private member and doesn't even increase struct page, and then this patch actually saves space. Remaining users that care can still compile their kernels without CONFIG_MEMCG. text data bss dec hex filename 8828345 1725264 983040 11536649 b00909 vmlinux.old 8827425 1725264 966656 11519345 afc571 vmlinux.new [mhocko@suse.cz: update Documentation/cgroups/memory.txt] Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Acked-by: Vladimir Davydov Acked-by: David S. Miller Acked-by: KAMEZAWA Hiroyuki Cc: "Kirill A. Shutemov" Cc: Michal Hocko Cc: Vladimir Davydov Cc: Tejun Heo Cc: Joonsoo Kim Acked-by: Konstantin Khlebnikov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/cgroups/memory.txt | 5 + include/linux/memcontrol.h | 6 +- include/linux/mm_types.h | 5 + include/linux/mmzone.h | 12 -- include/linux/page_cgroup.h | 53 ------- init/main.c | 7 - mm/memcontrol.c | 124 +++++---------- mm/page_alloc.c | 2 - mm/page_cgroup.c | 319 --------------------------------------- 9 files changed, 46 insertions(+), 487 deletions(-) (limited to 'include') diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt index 67613ff0270c..46b2b5080317 100644 --- a/Documentation/cgroups/memory.txt +++ b/Documentation/cgroups/memory.txt @@ -1,5 +1,10 @@ Memory Resource Controller +NOTE: This document is hopelessly outdated and it asks for a complete + rewrite. It still contains a useful information so we are keeping it + here but make sure to check the current code if you need a deeper + understanding. 
+ NOTE: The Memory Resource Controller has generically been referred to as the memory controller in this document. Do not confuse memory controller used here with the memory controller that is used in hardware. diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index de018766be45..c4d080875164 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -25,7 +25,6 @@ #include struct mem_cgroup; -struct page_cgroup; struct page; struct mm_struct; struct kmem_cache; @@ -466,8 +465,6 @@ memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg, int order) * memcg_kmem_uncharge_pages: uncharge pages from memcg * @page: pointer to struct page being freed * @order: allocation order. - * - * there is no need to specify memcg here, since it is embedded in page_cgroup */ static inline void memcg_kmem_uncharge_pages(struct page *page, int order) @@ -484,8 +481,7 @@ memcg_kmem_uncharge_pages(struct page *page, int order) * * Needs to be called after memcg_kmem_newpage_charge, regardless of success or * failure of the allocation. if @page is NULL, this function will revert the - * charges. Otherwise, it will commit the memcg given by @memcg to the - * corresponding page_cgroup. + * charges. Otherwise, it will commit @page to @memcg. */ static inline void memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, int order) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 004e9d17b47e..bf9f57529dcf 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -22,6 +22,7 @@ #define AT_VECTOR_SIZE (2*(AT_VECTOR_SIZE_ARCH + AT_VECTOR_SIZE_BASE + 1)) struct address_space; +struct mem_cgroup; #define USE_SPLIT_PTE_PTLOCKS (NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS) #define USE_SPLIT_PMD_PTLOCKS (USE_SPLIT_PTE_PTLOCKS && \ @@ -167,6 +168,10 @@ struct page { struct page *first_page; /* Compound tail pages */ }; +#ifdef CONFIG_MEMCG + struct mem_cgroup *mem_cgroup; +#endif + /* * On machines where all RAM is mapped into kernel address space, * we can simply calculate the virtual address. On machines with diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index ffe66e381c04..3879d7664dfc 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -722,9 +722,6 @@ typedef struct pglist_data { int nr_zones; #ifdef CONFIG_FLAT_NODE_MEM_MAP /* means !SPARSEMEM */ struct page *node_mem_map; -#ifdef CONFIG_MEMCG - struct page_cgroup *node_page_cgroup; -#endif #endif #ifndef CONFIG_NO_BOOTMEM struct bootmem_data *bdata; @@ -1078,7 +1075,6 @@ static inline unsigned long early_pfn_to_nid(unsigned long pfn) #define SECTION_ALIGN_DOWN(pfn) ((pfn) & PAGE_SECTION_MASK) struct page; -struct page_cgroup; struct mem_section { /* * This is, logically, a pointer to an array of struct @@ -1096,14 +1092,6 @@ struct mem_section { /* See declaration of similar field in struct zone */ unsigned long *pageblock_flags; -#ifdef CONFIG_MEMCG - /* - * If !SPARSEMEM, pgdat doesn't have page_cgroup pointer. We use - * section. (see memcontrol.h/page_cgroup.h about this.) - */ - struct page_cgroup *page_cgroup; - unsigned long pad; -#endif /* * WARNING: mem_section must be a power-of-2 in size for the * calculation and use of SECTION_ROOT_MASK to make sense. 
diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h index 1289be6b436c..65be35785c86 100644 --- a/include/linux/page_cgroup.h +++ b/include/linux/page_cgroup.h @@ -1,59 +1,6 @@ #ifndef __LINUX_PAGE_CGROUP_H #define __LINUX_PAGE_CGROUP_H -struct pglist_data; - -#ifdef CONFIG_MEMCG -struct mem_cgroup; - -/* - * Page Cgroup can be considered as an extended mem_map. - * A page_cgroup page is associated with every page descriptor. The - * page_cgroup helps us identify information about the cgroup - * All page cgroups are allocated at boot or memory hotplug event, - * then the page cgroup for pfn always exists. - */ -struct page_cgroup { - struct mem_cgroup *mem_cgroup; -}; - -extern void pgdat_page_cgroup_init(struct pglist_data *pgdat); - -#ifdef CONFIG_SPARSEMEM -static inline void page_cgroup_init_flatmem(void) -{ -} -extern void page_cgroup_init(void); -#else -extern void page_cgroup_init_flatmem(void); -static inline void page_cgroup_init(void) -{ -} -#endif - -struct page_cgroup *lookup_page_cgroup(struct page *page); - -#else /* !CONFIG_MEMCG */ -struct page_cgroup; - -static inline void pgdat_page_cgroup_init(struct pglist_data *pgdat) -{ -} - -static inline struct page_cgroup *lookup_page_cgroup(struct page *page) -{ - return NULL; -} - -static inline void page_cgroup_init(void) -{ -} - -static inline void page_cgroup_init_flatmem(void) -{ -} -#endif /* CONFIG_MEMCG */ - #include #ifdef CONFIG_MEMCG_SWAP diff --git a/init/main.c b/init/main.c index 321d0ceb26d3..d2e4ead4891f 100644 --- a/init/main.c +++ b/init/main.c @@ -51,7 +51,6 @@ #include #include #include -#include #include #include #include @@ -485,11 +484,6 @@ void __init __weak thread_info_cache_init(void) */ static void __init mm_init(void) { - /* - * page_cgroup requires contiguous pages, - * bigger than MAX_ORDER unless SPARSEMEM. - */ - page_cgroup_init_flatmem(); mem_init(); kmem_cache_init(); percpu_init_late(); @@ -627,7 +621,6 @@ asmlinkage __visible void __init start_kernel(void) initrd_start = 0; } #endif - page_cgroup_init(); debug_objects_mem_init(); kmemleak_init(); setup_per_cpu_pageset(); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 78cb3b05a9fa..b864067791dc 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1274,7 +1274,6 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone) { struct mem_cgroup_per_zone *mz; struct mem_cgroup *memcg; - struct page_cgroup *pc; struct lruvec *lruvec; if (mem_cgroup_disabled()) { @@ -1282,8 +1281,7 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone) goto out; } - pc = lookup_page_cgroup(page); - memcg = pc->mem_cgroup; + memcg = page->mem_cgroup; /* * Swapcache readahead pages are added to the LRU - and * possibly migrated - before they are charged. 
@@ -2020,16 +2018,13 @@ struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page, unsigned long *flags) { struct mem_cgroup *memcg; - struct page_cgroup *pc; rcu_read_lock(); if (mem_cgroup_disabled()) return NULL; - - pc = lookup_page_cgroup(page); again: - memcg = pc->mem_cgroup; + memcg = page->mem_cgroup; if (unlikely(!memcg)) return NULL; @@ -2038,7 +2033,7 @@ again: return memcg; spin_lock_irqsave(&memcg->move_lock, *flags); - if (memcg != pc->mem_cgroup) { + if (memcg != page->mem_cgroup) { spin_unlock_irqrestore(&memcg->move_lock, *flags); goto again; } @@ -2405,15 +2400,12 @@ static struct mem_cgroup *mem_cgroup_lookup(unsigned short id) struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) { struct mem_cgroup *memcg; - struct page_cgroup *pc; unsigned short id; swp_entry_t ent; VM_BUG_ON_PAGE(!PageLocked(page), page); - pc = lookup_page_cgroup(page); - memcg = pc->mem_cgroup; - + memcg = page->mem_cgroup; if (memcg) { if (!css_tryget_online(&memcg->css)) memcg = NULL; @@ -2463,10 +2455,9 @@ static void unlock_page_lru(struct page *page, int isolated) static void commit_charge(struct page *page, struct mem_cgroup *memcg, bool lrucare) { - struct page_cgroup *pc = lookup_page_cgroup(page); int isolated; - VM_BUG_ON_PAGE(pc->mem_cgroup, page); + VM_BUG_ON_PAGE(page->mem_cgroup, page); /* * In some cases, SwapCache and FUSE(splice_buf->radixtree), the page @@ -2477,7 +2468,7 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg, /* * Nobody should be changing or seriously looking at - * pc->mem_cgroup at this point: + * page->mem_cgroup at this point: * * - the page is uncharged * @@ -2489,7 +2480,7 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg, * - a page cache insertion, a swapin fault, or a migration * have the page locked */ - pc->mem_cgroup = memcg; + page->mem_cgroup = memcg; if (lrucare) unlock_page_lru(page, isolated); @@ -2972,8 +2963,6 @@ __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order) void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, int order) { - struct page_cgroup *pc; - VM_BUG_ON(mem_cgroup_is_root(memcg)); /* The page allocation failed. 
Revert */ @@ -2981,14 +2970,12 @@ void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, memcg_uncharge_kmem(memcg, 1 << order); return; } - pc = lookup_page_cgroup(page); - pc->mem_cgroup = memcg; + page->mem_cgroup = memcg; } void __memcg_kmem_uncharge_pages(struct page *page, int order) { - struct page_cgroup *pc = lookup_page_cgroup(page); - struct mem_cgroup *memcg = pc->mem_cgroup; + struct mem_cgroup *memcg = page->mem_cgroup; if (!memcg) return; @@ -2996,7 +2983,7 @@ void __memcg_kmem_uncharge_pages(struct page *page, int order) VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page); memcg_uncharge_kmem(memcg, 1 << order); - pc->mem_cgroup = NULL; + page->mem_cgroup = NULL; } #else static inline void memcg_unregister_all_caches(struct mem_cgroup *memcg) @@ -3014,16 +3001,15 @@ static inline void memcg_unregister_all_caches(struct mem_cgroup *memcg) */ void mem_cgroup_split_huge_fixup(struct page *head) { - struct page_cgroup *pc = lookup_page_cgroup(head); int i; if (mem_cgroup_disabled()) return; for (i = 1; i < HPAGE_PMD_NR; i++) - pc[i].mem_cgroup = pc[0].mem_cgroup; + head[i].mem_cgroup = head->mem_cgroup; - __this_cpu_sub(pc[0].mem_cgroup->stat->count[MEM_CGROUP_STAT_RSS_HUGE], + __this_cpu_sub(head->mem_cgroup->stat->count[MEM_CGROUP_STAT_RSS_HUGE], HPAGE_PMD_NR); } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ @@ -3032,7 +3018,6 @@ void mem_cgroup_split_huge_fixup(struct page *head) * mem_cgroup_move_account - move account of the page * @page: the page * @nr_pages: number of regular pages (>1 for huge pages) - * @pc: page_cgroup of the page. * @from: mem_cgroup which the page is moved from. * @to: mem_cgroup which the page is moved to. @from != @to. * @@ -3045,7 +3030,6 @@ void mem_cgroup_split_huge_fixup(struct page *head) */ static int mem_cgroup_move_account(struct page *page, unsigned int nr_pages, - struct page_cgroup *pc, struct mem_cgroup *from, struct mem_cgroup *to) { @@ -3065,7 +3049,7 @@ static int mem_cgroup_move_account(struct page *page, goto out; /* - * Prevent mem_cgroup_migrate() from looking at pc->mem_cgroup + * Prevent mem_cgroup_migrate() from looking at page->mem_cgroup * of its source page while we change it: page migration takes * both pages off the LRU, but page cache replacement doesn't. */ @@ -3073,7 +3057,7 @@ static int mem_cgroup_move_account(struct page *page, goto out; ret = -EINVAL; - if (pc->mem_cgroup != from) + if (page->mem_cgroup != from) goto out_unlock; spin_lock_irqsave(&from->move_lock, flags); @@ -3093,13 +3077,13 @@ static int mem_cgroup_move_account(struct page *page, } /* - * It is safe to change pc->mem_cgroup here because the page + * It is safe to change page->mem_cgroup here because the page * is referenced, charged, and isolated - we can't race with * uncharging, charging, migration, or LRU putback. */ /* caller should have done css_get */ - pc->mem_cgroup = to; + page->mem_cgroup = to; spin_unlock_irqrestore(&from->move_lock, flags); ret = 0; @@ -3174,36 +3158,17 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry, #endif #ifdef CONFIG_DEBUG_VM -static struct page_cgroup *lookup_page_cgroup_used(struct page *page) -{ - struct page_cgroup *pc; - - pc = lookup_page_cgroup(page); - /* - * Can be NULL while feeding pages into the page allocator for - * the first time, i.e. during boot or memory hotplug; - * or when mem_cgroup_disabled(). 
- */ - if (likely(pc) && pc->mem_cgroup) - return pc; - return NULL; -} - bool mem_cgroup_bad_page_check(struct page *page) { if (mem_cgroup_disabled()) return false; - return lookup_page_cgroup_used(page) != NULL; + return page->mem_cgroup != NULL; } void mem_cgroup_print_bad_page(struct page *page) { - struct page_cgroup *pc; - - pc = lookup_page_cgroup_used(page); - if (pc) - pr_alert("pc:%p pc->mem_cgroup:%p\n", pc, pc->mem_cgroup); + pr_alert("page->mem_cgroup:%p\n", page->mem_cgroup); } #endif @@ -5123,7 +5088,6 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, unsigned long addr, pte_t ptent, union mc_target *target) { struct page *page = NULL; - struct page_cgroup *pc; enum mc_target_type ret = MC_TARGET_NONE; swp_entry_t ent = { .val = 0 }; @@ -5137,13 +5101,12 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, if (!page && !ent.val) return ret; if (page) { - pc = lookup_page_cgroup(page); /* * Do only loose check w/o serialization. - * mem_cgroup_move_account() checks the pc is valid or + * mem_cgroup_move_account() checks the page is valid or * not under LRU exclusion. */ - if (pc->mem_cgroup == mc.from) { + if (page->mem_cgroup == mc.from) { ret = MC_TARGET_PAGE; if (target) target->page = page; @@ -5171,15 +5134,13 @@ static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, unsigned long addr, pmd_t pmd, union mc_target *target) { struct page *page = NULL; - struct page_cgroup *pc; enum mc_target_type ret = MC_TARGET_NONE; page = pmd_page(pmd); VM_BUG_ON_PAGE(!page || !PageHead(page), page); if (!move_anon()) return ret; - pc = lookup_page_cgroup(page); - if (pc->mem_cgroup == mc.from) { + if (page->mem_cgroup == mc.from) { ret = MC_TARGET_PAGE; if (target) { get_page(page); @@ -5378,7 +5339,6 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, enum mc_target_type target_type; union mc_target target; struct page *page; - struct page_cgroup *pc; /* * We don't take compound_lock() here but no race with splitting thp @@ -5399,9 +5359,8 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, if (target_type == MC_TARGET_PAGE) { page = target.page; if (!isolate_lru_page(page)) { - pc = lookup_page_cgroup(page); if (!mem_cgroup_move_account(page, HPAGE_PMD_NR, - pc, mc.from, mc.to)) { + mc.from, mc.to)) { mc.precharge -= HPAGE_PMD_NR; mc.moved_charge += HPAGE_PMD_NR; } @@ -5429,9 +5388,7 @@ retry: page = target.page; if (isolate_lru_page(page)) goto put; - pc = lookup_page_cgroup(page); - if (!mem_cgroup_move_account(page, 1, pc, - mc.from, mc.to)) { + if (!mem_cgroup_move_account(page, 1, mc.from, mc.to)) { mc.precharge--; /* we uncharge from mc.from later. 
*/ mc.moved_charge++; @@ -5619,7 +5576,6 @@ static void __init enable_swap_cgroup(void) void mem_cgroup_swapout(struct page *page, swp_entry_t entry) { struct mem_cgroup *memcg; - struct page_cgroup *pc; unsigned short oldid; VM_BUG_ON_PAGE(PageLRU(page), page); @@ -5628,8 +5584,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) if (!do_swap_account) return; - pc = lookup_page_cgroup(page); - memcg = pc->mem_cgroup; + memcg = page->mem_cgroup; /* Readahead page, never charged */ if (!memcg) @@ -5639,7 +5594,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) VM_BUG_ON_PAGE(oldid, page); mem_cgroup_swap_statistics(memcg, true); - pc->mem_cgroup = NULL; + page->mem_cgroup = NULL; if (!mem_cgroup_is_root(memcg)) page_counter_uncharge(&memcg->memory, 1); @@ -5706,7 +5661,6 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, goto out; if (PageSwapCache(page)) { - struct page_cgroup *pc = lookup_page_cgroup(page); /* * Every swap fault against a single page tries to charge the * page, bail as early as possible. shmem_unuse() encounters @@ -5714,7 +5668,7 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, * the page lock, which serializes swap cache removal, which * in turn serializes uncharging. */ - if (pc->mem_cgroup) + if (page->mem_cgroup) goto out; } @@ -5867,7 +5821,6 @@ static void uncharge_list(struct list_head *page_list) next = page_list->next; do { unsigned int nr_pages = 1; - struct page_cgroup *pc; page = list_entry(next, struct page, lru); next = page->lru.next; @@ -5875,23 +5828,22 @@ static void uncharge_list(struct list_head *page_list) VM_BUG_ON_PAGE(PageLRU(page), page); VM_BUG_ON_PAGE(page_count(page), page); - pc = lookup_page_cgroup(page); - if (!pc->mem_cgroup) + if (!page->mem_cgroup) continue; /* * Nobody should be changing or seriously looking at - * pc->mem_cgroup at this point, we have fully + * page->mem_cgroup at this point, we have fully * exclusive access to the page. */ - if (memcg != pc->mem_cgroup) { + if (memcg != page->mem_cgroup) { if (memcg) { uncharge_batch(memcg, pgpgout, nr_anon, nr_file, nr_huge, page); pgpgout = nr_anon = nr_file = nr_huge = 0; } - memcg = pc->mem_cgroup; + memcg = page->mem_cgroup; } if (PageTransHuge(page)) { @@ -5905,7 +5857,7 @@ static void uncharge_list(struct list_head *page_list) else nr_file += nr_pages; - pc->mem_cgroup = NULL; + page->mem_cgroup = NULL; pgpgout++; } while (next != page_list); @@ -5924,14 +5876,11 @@ static void uncharge_list(struct list_head *page_list) */ void mem_cgroup_uncharge(struct page *page) { - struct page_cgroup *pc; - if (mem_cgroup_disabled()) return; /* Don't touch page->lru of any random page, pre-check: */ - pc = lookup_page_cgroup(page); - if (!pc->mem_cgroup) + if (!page->mem_cgroup) return; INIT_LIST_HEAD(&page->lru); @@ -5968,7 +5917,6 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage, bool lrucare) { struct mem_cgroup *memcg; - struct page_cgroup *pc; int isolated; VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage); @@ -5983,8 +5931,7 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage, return; /* Page cache replacement: new page already charged? */ - pc = lookup_page_cgroup(newpage); - if (pc->mem_cgroup) + if (newpage->mem_cgroup) return; /* @@ -5993,15 +5940,14 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage, * uncharged page when the PFN walker finds a page that * reclaim just put back on the LRU but has not released yet. 
*/ - pc = lookup_page_cgroup(oldpage); - memcg = pc->mem_cgroup; + memcg = oldpage->mem_cgroup; if (!memcg) return; if (lrucare) lock_page_lru(oldpage, &isolated); - pc->mem_cgroup = NULL; + oldpage->mem_cgroup = NULL; if (lrucare) unlock_page_lru(oldpage, isolated); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 97b6966816e5..22cfdeffbf69 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -48,7 +48,6 @@ #include #include #include -#include #include #include #include @@ -4853,7 +4852,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, #endif init_waitqueue_head(&pgdat->kswapd_wait); init_waitqueue_head(&pgdat->pfmemalloc_wait); - pgdat_page_cgroup_init(pgdat); for (j = 0; j < MAX_NR_ZONES; j++) { struct zone *zone = pgdat->node_zones + j; diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index 5331c2bd85a2..f0f31c1d4d0c 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c @@ -1,326 +1,7 @@ #include -#include -#include -#include #include -#include -#include -#include #include -#include #include -#include - -static unsigned long total_usage; - -#if !defined(CONFIG_SPARSEMEM) - - -void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat) -{ - pgdat->node_page_cgroup = NULL; -} - -struct page_cgroup *lookup_page_cgroup(struct page *page) -{ - unsigned long pfn = page_to_pfn(page); - unsigned long offset; - struct page_cgroup *base; - - base = NODE_DATA(page_to_nid(page))->node_page_cgroup; -#ifdef CONFIG_DEBUG_VM - /* - * The sanity checks the page allocator does upon freeing a - * page can reach here before the page_cgroup arrays are - * allocated when feeding a range of pages to the allocator - * for the first time during bootup or memory hotplug. - */ - if (unlikely(!base)) - return NULL; -#endif - offset = pfn - NODE_DATA(page_to_nid(page))->node_start_pfn; - return base + offset; -} - -static int __init alloc_node_page_cgroup(int nid) -{ - struct page_cgroup *base; - unsigned long table_size; - unsigned long nr_pages; - - nr_pages = NODE_DATA(nid)->node_spanned_pages; - if (!nr_pages) - return 0; - - table_size = sizeof(struct page_cgroup) * nr_pages; - - base = memblock_virt_alloc_try_nid_nopanic( - table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS), - BOOTMEM_ALLOC_ACCESSIBLE, nid); - if (!base) - return -ENOMEM; - NODE_DATA(nid)->node_page_cgroup = base; - total_usage += table_size; - return 0; -} - -void __init page_cgroup_init_flatmem(void) -{ - - int nid, fail; - - if (mem_cgroup_disabled()) - return; - - for_each_online_node(nid) { - fail = alloc_node_page_cgroup(nid); - if (fail) - goto fail; - } - printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage); - printk(KERN_INFO "please try 'cgroup_disable=memory' option if you" - " don't want memory cgroups\n"); - return; -fail: - printk(KERN_CRIT "allocation of page_cgroup failed.\n"); - printk(KERN_CRIT "please try 'cgroup_disable=memory' boot option\n"); - panic("Out of memory"); -} - -#else /* CONFIG_FLAT_NODE_MEM_MAP */ - -struct page_cgroup *lookup_page_cgroup(struct page *page) -{ - unsigned long pfn = page_to_pfn(page); - struct mem_section *section = __pfn_to_section(pfn); -#ifdef CONFIG_DEBUG_VM - /* - * The sanity checks the page allocator does upon freeing a - * page can reach here before the page_cgroup arrays are - * allocated when feeding a range of pages to the allocator - * for the first time during bootup or memory hotplug. 
- */ - if (!section->page_cgroup) - return NULL; -#endif - return section->page_cgroup + pfn; -} - -static void *__meminit alloc_page_cgroup(size_t size, int nid) -{ - gfp_t flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN; - void *addr = NULL; - - addr = alloc_pages_exact_nid(nid, size, flags); - if (addr) { - kmemleak_alloc(addr, size, 1, flags); - return addr; - } - - if (node_state(nid, N_HIGH_MEMORY)) - addr = vzalloc_node(size, nid); - else - addr = vzalloc(size); - - return addr; -} - -static int __meminit init_section_page_cgroup(unsigned long pfn, int nid) -{ - struct mem_section *section; - struct page_cgroup *base; - unsigned long table_size; - - section = __pfn_to_section(pfn); - - if (section->page_cgroup) - return 0; - - table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION; - base = alloc_page_cgroup(table_size, nid); - - /* - * The value stored in section->page_cgroup is (base - pfn) - * and it does not point to the memory block allocated above, - * causing kmemleak false positives. - */ - kmemleak_not_leak(base); - - if (!base) { - printk(KERN_ERR "page cgroup allocation failure\n"); - return -ENOMEM; - } - - /* - * The passed "pfn" may not be aligned to SECTION. For the calculation - * we need to apply a mask. - */ - pfn &= PAGE_SECTION_MASK; - section->page_cgroup = base - pfn; - total_usage += table_size; - return 0; -} -#ifdef CONFIG_MEMORY_HOTPLUG -static void free_page_cgroup(void *addr) -{ - if (is_vmalloc_addr(addr)) { - vfree(addr); - } else { - struct page *page = virt_to_page(addr); - size_t table_size = - sizeof(struct page_cgroup) * PAGES_PER_SECTION; - - BUG_ON(PageReserved(page)); - kmemleak_free(addr); - free_pages_exact(addr, table_size); - } -} - -static void __free_page_cgroup(unsigned long pfn) -{ - struct mem_section *ms; - struct page_cgroup *base; - - ms = __pfn_to_section(pfn); - if (!ms || !ms->page_cgroup) - return; - base = ms->page_cgroup + pfn; - free_page_cgroup(base); - ms->page_cgroup = NULL; -} - -static int __meminit online_page_cgroup(unsigned long start_pfn, - unsigned long nr_pages, - int nid) -{ - unsigned long start, end, pfn; - int fail = 0; - - start = SECTION_ALIGN_DOWN(start_pfn); - end = SECTION_ALIGN_UP(start_pfn + nr_pages); - - if (nid == -1) { - /* - * In this case, "nid" already exists and contains valid memory. - * "start_pfn" passed to us is a pfn which is an arg for - * online__pages(), and start_pfn should exist. 
- */ - nid = pfn_to_nid(start_pfn); - VM_BUG_ON(!node_state(nid, N_ONLINE)); - } - - for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) { - if (!pfn_present(pfn)) - continue; - fail = init_section_page_cgroup(pfn, nid); - } - if (!fail) - return 0; - - /* rollback */ - for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) - __free_page_cgroup(pfn); - - return -ENOMEM; -} - -static int __meminit offline_page_cgroup(unsigned long start_pfn, - unsigned long nr_pages, int nid) -{ - unsigned long start, end, pfn; - - start = SECTION_ALIGN_DOWN(start_pfn); - end = SECTION_ALIGN_UP(start_pfn + nr_pages); - - for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) - __free_page_cgroup(pfn); - return 0; - -} - -static int __meminit page_cgroup_callback(struct notifier_block *self, - unsigned long action, void *arg) -{ - struct memory_notify *mn = arg; - int ret = 0; - switch (action) { - case MEM_GOING_ONLINE: - ret = online_page_cgroup(mn->start_pfn, - mn->nr_pages, mn->status_change_nid); - break; - case MEM_OFFLINE: - offline_page_cgroup(mn->start_pfn, - mn->nr_pages, mn->status_change_nid); - break; - case MEM_CANCEL_ONLINE: - offline_page_cgroup(mn->start_pfn, - mn->nr_pages, mn->status_change_nid); - break; - case MEM_GOING_OFFLINE: - break; - case MEM_ONLINE: - case MEM_CANCEL_OFFLINE: - break; - } - - return notifier_from_errno(ret); -} - -#endif - -void __init page_cgroup_init(void) -{ - unsigned long pfn; - int nid; - - if (mem_cgroup_disabled()) - return; - - for_each_node_state(nid, N_MEMORY) { - unsigned long start_pfn, end_pfn; - - start_pfn = node_start_pfn(nid); - end_pfn = node_end_pfn(nid); - /* - * start_pfn and end_pfn may not be aligned to SECTION and the - * page->flags of out of node pages are not initialized. So we - * scan [start_pfn, the biggest section's pfn < end_pfn) here. - */ - for (pfn = start_pfn; - pfn < end_pfn; - pfn = ALIGN(pfn + 1, PAGES_PER_SECTION)) { - - if (!pfn_valid(pfn)) - continue; - /* - * Nodes's pfns can be overlapping. - * We know some arch can have a nodes layout such as - * -------------pfn--------------> - * N0 | N1 | N2 | N0 | N1 | N2|.... - */ - if (pfn_to_nid(pfn) != nid) - continue; - if (init_section_page_cgroup(pfn, nid)) - goto oom; - } - } - hotplug_memory_notifier(page_cgroup_callback, 0); - printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage); - printk(KERN_INFO "please try 'cgroup_disable=memory' option if you " - "don't want memory cgroups\n"); - return; -oom: - printk(KERN_CRIT "try 'cgroup_disable=memory' boot option\n"); - panic("Out of memory"); -} - -void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat) -{ - return; -} - -#endif - #ifdef CONFIG_MEMCG_SWAP -- cgit v1.2.3 From 5d1ea48bdde67898e87d6d8f511fd097fa64c749 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 10 Dec 2014 15:44:55 -0800 Subject: mm: page_cgroup: rename file to mm/swap_cgroup.c Now that the external page_cgroup data structure and its lookup is gone, the only code remaining in there is swap slot accounting. Rename it and move the conditional compilation into mm/Makefile. Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Acked-by: Vladimir Davydov Acked-by: David S. Miller Acked-by: KAMEZAWA Hiroyuki Cc: "Kirill A. 
Shutemov" Cc: Tejun Heo Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 2 +- include/linux/page_cgroup.h | 40 --------- include/linux/swap_cgroup.h | 42 +++++++++ mm/Makefile | 3 +- mm/memcontrol.c | 2 +- mm/page_cgroup.c | 211 -------------------------------------------- mm/swap_cgroup.c | 208 +++++++++++++++++++++++++++++++++++++++++++ mm/swap_state.c | 1 - mm/swapfile.c | 2 +- 9 files changed, 255 insertions(+), 256 deletions(-) delete mode 100644 include/linux/page_cgroup.h create mode 100644 include/linux/swap_cgroup.h delete mode 100644 mm/page_cgroup.c create mode 100644 mm/swap_cgroup.c (limited to 'include') diff --git a/MAINTAINERS b/MAINTAINERS index 0d6469a2cf70..0aedd3e1804b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2606,7 +2606,7 @@ L: cgroups@vger.kernel.org L: linux-mm@kvack.org S: Maintained F: mm/memcontrol.c -F: mm/page_cgroup.c +F: mm/swap_cgroup.c CORETEMP HARDWARE MONITORING DRIVER M: Fenghua Yu diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h deleted file mode 100644 index 65be35785c86..000000000000 --- a/include/linux/page_cgroup.h +++ /dev/null @@ -1,40 +0,0 @@ -#ifndef __LINUX_PAGE_CGROUP_H -#define __LINUX_PAGE_CGROUP_H - -#include - -#ifdef CONFIG_MEMCG_SWAP -extern unsigned short swap_cgroup_cmpxchg(swp_entry_t ent, - unsigned short old, unsigned short new); -extern unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id); -extern unsigned short lookup_swap_cgroup_id(swp_entry_t ent); -extern int swap_cgroup_swapon(int type, unsigned long max_pages); -extern void swap_cgroup_swapoff(int type); -#else - -static inline -unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id) -{ - return 0; -} - -static inline -unsigned short lookup_swap_cgroup_id(swp_entry_t ent) -{ - return 0; -} - -static inline int -swap_cgroup_swapon(int type, unsigned long max_pages) -{ - return 0; -} - -static inline void swap_cgroup_swapoff(int type) -{ - return; -} - -#endif /* CONFIG_MEMCG_SWAP */ - -#endif /* __LINUX_PAGE_CGROUP_H */ diff --git a/include/linux/swap_cgroup.h b/include/linux/swap_cgroup.h new file mode 100644 index 000000000000..145306bdc92f --- /dev/null +++ b/include/linux/swap_cgroup.h @@ -0,0 +1,42 @@ +#ifndef __LINUX_SWAP_CGROUP_H +#define __LINUX_SWAP_CGROUP_H + +#include + +#ifdef CONFIG_MEMCG_SWAP + +extern unsigned short swap_cgroup_cmpxchg(swp_entry_t ent, + unsigned short old, unsigned short new); +extern unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id); +extern unsigned short lookup_swap_cgroup_id(swp_entry_t ent); +extern int swap_cgroup_swapon(int type, unsigned long max_pages); +extern void swap_cgroup_swapoff(int type); + +#else + +static inline +unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id) +{ + return 0; +} + +static inline +unsigned short lookup_swap_cgroup_id(swp_entry_t ent) +{ + return 0; +} + +static inline int +swap_cgroup_swapon(int type, unsigned long max_pages) +{ + return 0; +} + +static inline void swap_cgroup_swapoff(int type) +{ + return; +} + +#endif /* CONFIG_MEMCG_SWAP */ + +#endif /* __LINUX_SWAP_CGROUP_H */ diff --git a/mm/Makefile b/mm/Makefile index 6d9f40e922f7..b3c6ce932c64 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -56,7 +56,8 @@ obj-$(CONFIG_MIGRATION) += migrate.o obj-$(CONFIG_QUICKLIST) += quicklist.o obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o obj-$(CONFIG_PAGE_COUNTER) += page_counter.o -obj-$(CONFIG_MEMCG) += memcontrol.o page_cgroup.o vmpressure.o +obj-$(CONFIG_MEMCG) += 
memcontrol.o vmpressure.o +obj-$(CONFIG_MEMCG_SWAP) += swap_cgroup.o obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o diff --git a/mm/memcontrol.c b/mm/memcontrol.c index b864067791dc..ab270e34ba3e 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -51,7 +51,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c deleted file mode 100644 index f0f31c1d4d0c..000000000000 --- a/mm/page_cgroup.c +++ /dev/null @@ -1,211 +0,0 @@ -#include -#include -#include -#include - -#ifdef CONFIG_MEMCG_SWAP - -static DEFINE_MUTEX(swap_cgroup_mutex); -struct swap_cgroup_ctrl { - struct page **map; - unsigned long length; - spinlock_t lock; -}; - -static struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES]; - -struct swap_cgroup { - unsigned short id; -}; -#define SC_PER_PAGE (PAGE_SIZE/sizeof(struct swap_cgroup)) - -/* - * SwapCgroup implements "lookup" and "exchange" operations. - * In typical usage, this swap_cgroup is accessed via memcg's charge/uncharge - * against SwapCache. At swap_free(), this is accessed directly from swap. - * - * This means, - * - we have no race in "exchange" when we're accessed via SwapCache because - * SwapCache(and its swp_entry) is under lock. - * - When called via swap_free(), there is no user of this entry and no race. - * Then, we don't need lock around "exchange". - * - * TODO: we can push these buffers out to HIGHMEM. - */ - -/* - * allocate buffer for swap_cgroup. - */ -static int swap_cgroup_prepare(int type) -{ - struct page *page; - struct swap_cgroup_ctrl *ctrl; - unsigned long idx, max; - - ctrl = &swap_cgroup_ctrl[type]; - - for (idx = 0; idx < ctrl->length; idx++) { - page = alloc_page(GFP_KERNEL | __GFP_ZERO); - if (!page) - goto not_enough_page; - ctrl->map[idx] = page; - } - return 0; -not_enough_page: - max = idx; - for (idx = 0; idx < max; idx++) - __free_page(ctrl->map[idx]); - - return -ENOMEM; -} - -static struct swap_cgroup *lookup_swap_cgroup(swp_entry_t ent, - struct swap_cgroup_ctrl **ctrlp) -{ - pgoff_t offset = swp_offset(ent); - struct swap_cgroup_ctrl *ctrl; - struct page *mappage; - struct swap_cgroup *sc; - - ctrl = &swap_cgroup_ctrl[swp_type(ent)]; - if (ctrlp) - *ctrlp = ctrl; - - mappage = ctrl->map[offset / SC_PER_PAGE]; - sc = page_address(mappage); - return sc + offset % SC_PER_PAGE; -} - -/** - * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry. - * @ent: swap entry to be cmpxchged - * @old: old id - * @new: new id - * - * Returns old id at success, 0 at failure. - * (There is no mem_cgroup using 0 as its id) - */ -unsigned short swap_cgroup_cmpxchg(swp_entry_t ent, - unsigned short old, unsigned short new) -{ - struct swap_cgroup_ctrl *ctrl; - struct swap_cgroup *sc; - unsigned long flags; - unsigned short retval; - - sc = lookup_swap_cgroup(ent, &ctrl); - - spin_lock_irqsave(&ctrl->lock, flags); - retval = sc->id; - if (retval == old) - sc->id = new; - else - retval = 0; - spin_unlock_irqrestore(&ctrl->lock, flags); - return retval; -} - -/** - * swap_cgroup_record - record mem_cgroup for this swp_entry. - * @ent: swap entry to be recorded into - * @id: mem_cgroup to be recorded - * - * Returns old value at success, 0 at failure. - * (Of course, old value can be 0.) 
- */ -unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id) -{ - struct swap_cgroup_ctrl *ctrl; - struct swap_cgroup *sc; - unsigned short old; - unsigned long flags; - - sc = lookup_swap_cgroup(ent, &ctrl); - - spin_lock_irqsave(&ctrl->lock, flags); - old = sc->id; - sc->id = id; - spin_unlock_irqrestore(&ctrl->lock, flags); - - return old; -} - -/** - * lookup_swap_cgroup_id - lookup mem_cgroup id tied to swap entry - * @ent: swap entry to be looked up. - * - * Returns ID of mem_cgroup at success. 0 at failure. (0 is invalid ID) - */ -unsigned short lookup_swap_cgroup_id(swp_entry_t ent) -{ - return lookup_swap_cgroup(ent, NULL)->id; -} - -int swap_cgroup_swapon(int type, unsigned long max_pages) -{ - void *array; - unsigned long array_size; - unsigned long length; - struct swap_cgroup_ctrl *ctrl; - - if (!do_swap_account) - return 0; - - length = DIV_ROUND_UP(max_pages, SC_PER_PAGE); - array_size = length * sizeof(void *); - - array = vzalloc(array_size); - if (!array) - goto nomem; - - ctrl = &swap_cgroup_ctrl[type]; - mutex_lock(&swap_cgroup_mutex); - ctrl->length = length; - ctrl->map = array; - spin_lock_init(&ctrl->lock); - if (swap_cgroup_prepare(type)) { - /* memory shortage */ - ctrl->map = NULL; - ctrl->length = 0; - mutex_unlock(&swap_cgroup_mutex); - vfree(array); - goto nomem; - } - mutex_unlock(&swap_cgroup_mutex); - - return 0; -nomem: - printk(KERN_INFO "couldn't allocate enough memory for swap_cgroup.\n"); - printk(KERN_INFO - "swap_cgroup can be disabled by swapaccount=0 boot option\n"); - return -ENOMEM; -} - -void swap_cgroup_swapoff(int type) -{ - struct page **map; - unsigned long i, length; - struct swap_cgroup_ctrl *ctrl; - - if (!do_swap_account) - return; - - mutex_lock(&swap_cgroup_mutex); - ctrl = &swap_cgroup_ctrl[type]; - map = ctrl->map; - length = ctrl->length; - ctrl->map = NULL; - ctrl->length = 0; - mutex_unlock(&swap_cgroup_mutex); - - if (map) { - for (i = 0; i < length; i++) { - struct page *page = map[i]; - if (page) - __free_page(page); - } - vfree(map); - } -} - -#endif diff --git a/mm/swap_cgroup.c b/mm/swap_cgroup.c new file mode 100644 index 000000000000..b5f7f24b8dd1 --- /dev/null +++ b/mm/swap_cgroup.c @@ -0,0 +1,208 @@ +#include +#include +#include + +#include /* depends on mm.h include */ + +static DEFINE_MUTEX(swap_cgroup_mutex); +struct swap_cgroup_ctrl { + struct page **map; + unsigned long length; + spinlock_t lock; +}; + +static struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES]; + +struct swap_cgroup { + unsigned short id; +}; +#define SC_PER_PAGE (PAGE_SIZE/sizeof(struct swap_cgroup)) + +/* + * SwapCgroup implements "lookup" and "exchange" operations. + * In typical usage, this swap_cgroup is accessed via memcg's charge/uncharge + * against SwapCache. At swap_free(), this is accessed directly from swap. + * + * This means, + * - we have no race in "exchange" when we're accessed via SwapCache because + * SwapCache(and its swp_entry) is under lock. + * - When called via swap_free(), there is no user of this entry and no race. + * Then, we don't need lock around "exchange". + * + * TODO: we can push these buffers out to HIGHMEM. + */ + +/* + * allocate buffer for swap_cgroup. 
+ */ +static int swap_cgroup_prepare(int type) +{ + struct page *page; + struct swap_cgroup_ctrl *ctrl; + unsigned long idx, max; + + ctrl = &swap_cgroup_ctrl[type]; + + for (idx = 0; idx < ctrl->length; idx++) { + page = alloc_page(GFP_KERNEL | __GFP_ZERO); + if (!page) + goto not_enough_page; + ctrl->map[idx] = page; + } + return 0; +not_enough_page: + max = idx; + for (idx = 0; idx < max; idx++) + __free_page(ctrl->map[idx]); + + return -ENOMEM; +} + +static struct swap_cgroup *lookup_swap_cgroup(swp_entry_t ent, + struct swap_cgroup_ctrl **ctrlp) +{ + pgoff_t offset = swp_offset(ent); + struct swap_cgroup_ctrl *ctrl; + struct page *mappage; + struct swap_cgroup *sc; + + ctrl = &swap_cgroup_ctrl[swp_type(ent)]; + if (ctrlp) + *ctrlp = ctrl; + + mappage = ctrl->map[offset / SC_PER_PAGE]; + sc = page_address(mappage); + return sc + offset % SC_PER_PAGE; +} + +/** + * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry. + * @ent: swap entry to be cmpxchged + * @old: old id + * @new: new id + * + * Returns old id at success, 0 at failure. + * (There is no mem_cgroup using 0 as its id) + */ +unsigned short swap_cgroup_cmpxchg(swp_entry_t ent, + unsigned short old, unsigned short new) +{ + struct swap_cgroup_ctrl *ctrl; + struct swap_cgroup *sc; + unsigned long flags; + unsigned short retval; + + sc = lookup_swap_cgroup(ent, &ctrl); + + spin_lock_irqsave(&ctrl->lock, flags); + retval = sc->id; + if (retval == old) + sc->id = new; + else + retval = 0; + spin_unlock_irqrestore(&ctrl->lock, flags); + return retval; +} + +/** + * swap_cgroup_record - record mem_cgroup for this swp_entry. + * @ent: swap entry to be recorded into + * @id: mem_cgroup to be recorded + * + * Returns old value at success, 0 at failure. + * (Of course, old value can be 0.) + */ +unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id) +{ + struct swap_cgroup_ctrl *ctrl; + struct swap_cgroup *sc; + unsigned short old; + unsigned long flags; + + sc = lookup_swap_cgroup(ent, &ctrl); + + spin_lock_irqsave(&ctrl->lock, flags); + old = sc->id; + sc->id = id; + spin_unlock_irqrestore(&ctrl->lock, flags); + + return old; +} + +/** + * lookup_swap_cgroup_id - lookup mem_cgroup id tied to swap entry + * @ent: swap entry to be looked up. + * + * Returns ID of mem_cgroup at success. 0 at failure. 
(0 is invalid ID) + */ +unsigned short lookup_swap_cgroup_id(swp_entry_t ent) +{ + return lookup_swap_cgroup(ent, NULL)->id; +} + +int swap_cgroup_swapon(int type, unsigned long max_pages) +{ + void *array; + unsigned long array_size; + unsigned long length; + struct swap_cgroup_ctrl *ctrl; + + if (!do_swap_account) + return 0; + + length = DIV_ROUND_UP(max_pages, SC_PER_PAGE); + array_size = length * sizeof(void *); + + array = vzalloc(array_size); + if (!array) + goto nomem; + + ctrl = &swap_cgroup_ctrl[type]; + mutex_lock(&swap_cgroup_mutex); + ctrl->length = length; + ctrl->map = array; + spin_lock_init(&ctrl->lock); + if (swap_cgroup_prepare(type)) { + /* memory shortage */ + ctrl->map = NULL; + ctrl->length = 0; + mutex_unlock(&swap_cgroup_mutex); + vfree(array); + goto nomem; + } + mutex_unlock(&swap_cgroup_mutex); + + return 0; +nomem: + printk(KERN_INFO "couldn't allocate enough memory for swap_cgroup.\n"); + printk(KERN_INFO + "swap_cgroup can be disabled by swapaccount=0 boot option\n"); + return -ENOMEM; +} + +void swap_cgroup_swapoff(int type) +{ + struct page **map; + unsigned long i, length; + struct swap_cgroup_ctrl *ctrl; + + if (!do_swap_account) + return; + + mutex_lock(&swap_cgroup_mutex); + ctrl = &swap_cgroup_ctrl[type]; + map = ctrl->map; + length = ctrl->length; + ctrl->map = NULL; + ctrl->length = 0; + mutex_unlock(&swap_cgroup_mutex); + + if (map) { + for (i = 0; i < length; i++) { + struct page *page = map[i]; + if (page) + __free_page(page); + } + vfree(map); + } +} diff --git a/mm/swap_state.c b/mm/swap_state.c index 154444918685..9711342987a0 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -17,7 +17,6 @@ #include #include #include -#include #include diff --git a/mm/swapfile.c b/mm/swapfile.c index 8798b2e0ac59..63f55ccb9b26 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -38,7 +38,7 @@ #include #include #include -#include +#include static bool swap_count_continued(struct swap_info_struct *, pgoff_t, unsigned char); -- cgit v1.2.3 From 9edad6ea0f1416415f6fe31cc9d1dbc3817803ed Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 10 Dec 2014 15:44:58 -0800 Subject: mm: move page->mem_cgroup bad page handling into generic code Now that the external page_cgroup data structure and its lookup is gone, let the generic bad_page() check for page->mem_cgroup sanity. Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Acked-by: Vladimir Davydov Acked-by: David S. Miller Cc: KAMEZAWA Hiroyuki Cc: "Kirill A. 
Shutemov" Cc: Tejun Heo Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 17 ----------------- init/Kconfig | 12 ------------ mm/debug.c | 5 ++++- mm/memcontrol.c | 15 --------------- mm/page_alloc.c | 12 ++++++++---- 5 files changed, 12 insertions(+), 49 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index c4d080875164..6ea9f919e888 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -173,10 +173,6 @@ static inline void mem_cgroup_count_vm_event(struct mm_struct *mm, void mem_cgroup_split_huge_fixup(struct page *head); #endif -#ifdef CONFIG_DEBUG_VM -bool mem_cgroup_bad_page_check(struct page *page); -void mem_cgroup_print_bad_page(struct page *page); -#endif #else /* CONFIG_MEMCG */ struct mem_cgroup; @@ -346,19 +342,6 @@ void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx) } #endif /* CONFIG_MEMCG */ -#if !defined(CONFIG_MEMCG) || !defined(CONFIG_DEBUG_VM) -static inline bool -mem_cgroup_bad_page_check(struct page *page) -{ - return false; -} - -static inline void -mem_cgroup_print_bad_page(struct page *page) -{ -} -#endif - enum { UNDER_LIMIT, SOFT_LIMIT, diff --git a/init/Kconfig b/init/Kconfig index 46768752130d..7e9fbd48e2ab 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -983,18 +983,6 @@ config MEMCG Provides a memory resource controller that manages both anonymous memory and page cache. (See Documentation/cgroups/memory.txt) - Note that setting this option increases fixed memory overhead - associated with each page of memory in the system. By this, - 8(16)bytes/PAGE_SIZE on 32(64)bit system will be occupied by memory - usage tracking struct at boot. Total amount of this is printed out - at boot. - - Only enable when you're ok with these trade offs and really - sure you need the memory resource controller. Even when you enable - this, you can set "cgroup_disable=memory" at your boot option to - disable memory resource controller and you can avoid overheads. 
- (and lose benefits of memory resource controller) - config MEMCG_SWAP bool "Memory Resource Controller Swap Extension" depends on MEMCG && SWAP diff --git a/mm/debug.c b/mm/debug.c index 5ce45c9a29b5..0e58f3211f89 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -95,7 +95,10 @@ void dump_page_badflags(struct page *page, const char *reason, dump_flags(page->flags & badflags, pageflag_names, ARRAY_SIZE(pageflag_names)); } - mem_cgroup_print_bad_page(page); +#ifdef CONFIG_MEMCG + if (page->mem_cgroup) + pr_alert("page->mem_cgroup:%p\n", page->mem_cgroup); +#endif } void dump_page(struct page *page, const char *reason) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index ab270e34ba3e..1869cb64d089 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3157,21 +3157,6 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry, } #endif -#ifdef CONFIG_DEBUG_VM -bool mem_cgroup_bad_page_check(struct page *page) -{ - if (mem_cgroup_disabled()) - return false; - - return page->mem_cgroup != NULL; -} - -void mem_cgroup_print_bad_page(struct page *page) -{ - pr_alert("page->mem_cgroup:%p\n", page->mem_cgroup); -} -#endif - static DEFINE_MUTEX(memcg_limit_mutex); static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 22cfdeffbf69..a7198c065999 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -640,8 +640,10 @@ static inline int free_pages_check(struct page *page) bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set"; bad_flags = PAGE_FLAGS_CHECK_AT_FREE; } - if (unlikely(mem_cgroup_bad_page_check(page))) - bad_reason = "cgroup check failed"; +#ifdef CONFIG_MEMCG + if (unlikely(page->mem_cgroup)) + bad_reason = "page still charged to cgroup"; +#endif if (unlikely(bad_reason)) { bad_page(page, bad_reason, bad_flags); return 1; @@ -900,8 +902,10 @@ static inline int check_new_page(struct page *page) bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag set"; bad_flags = PAGE_FLAGS_CHECK_AT_PREP; } - if (unlikely(mem_cgroup_bad_page_check(page))) - bad_reason = "cgroup check failed"; +#ifdef CONFIG_MEMCG + if (unlikely(page->mem_cgroup)) + bad_reason = "page still charged to cgroup"; +#endif if (unlikely(bad_reason)) { bad_page(page, bad_reason, bad_flags); return 1; -- cgit v1.2.3 From 7c8bd2322c7fd973d089b27de55e29c92c667a06 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 10 Dec 2014 15:45:33 -0800 Subject: exit: ptrace: shift "reap dead" code from exit_ptrace() to forget_original_parent() Now that forget_original_parent() uses ->ptrace_entry for EXIT_DEAD tasks, we can simply pass "dead_children" list to exit_ptrace() and remove another release_task() loop. Plus this way we do not need to drop and reacquire tasklist_lock. Also shift the list_empty(ptraced) check, if we want this optimization it makes sense to eliminate the function call altogether. Signed-off-by: Oleg Nesterov Cc: Aaron Tomlin Cc: Alexey Dobriyan Cc: "Eric W. 
Biederman" Cc: Sterling Alexander Cc: Peter Zijlstra Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ptrace.h | 2 +- kernel/exit.c | 10 ++++------ kernel/ptrace.c | 23 +++-------------------- 3 files changed, 8 insertions(+), 27 deletions(-) (limited to 'include') diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index cc79eff4a1ad..987a73a40ef8 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -52,7 +52,7 @@ extern void ptrace_notify(int exit_code); extern void __ptrace_link(struct task_struct *child, struct task_struct *new_parent); extern void __ptrace_unlink(struct task_struct *child); -extern void exit_ptrace(struct task_struct *tracer); +extern void exit_ptrace(struct task_struct *tracer, struct list_head *dead); #define PTRACE_MODE_READ 0x01 #define PTRACE_MODE_ATTACH 0x02 #define PTRACE_MODE_NOAUDIT 0x04 diff --git a/kernel/exit.c b/kernel/exit.c index 772e9175735c..9c9526d87276 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -553,13 +553,11 @@ static void forget_original_parent(struct task_struct *father) LIST_HEAD(dead_children); write_lock_irq(&tasklist_lock); - /* - * Note that exit_ptrace() and find_new_reaper() might - * drop tasklist_lock and reacquire it. - */ - exit_ptrace(father); - reaper = find_new_reaper(father); + if (unlikely(!list_empty(&father->ptraced))) + exit_ptrace(father, &dead_children); + /* Can drop and reacquire tasklist_lock */ + reaper = find_new_reaper(father); list_for_each_entry(p, &father->children, sibling) { for_each_thread(p, t) { t->real_parent = reaper; diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 54e75226c2c4..1eb9d90c3af9 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -485,36 +485,19 @@ static int ptrace_detach(struct task_struct *child, unsigned int data) /* * Detach all tasks we were using ptrace on. Called with tasklist held - * for writing, and returns with it held too. But note it can release - * and reacquire the lock. + * for writing. */ -void exit_ptrace(struct task_struct *tracer) - __releases(&tasklist_lock) - __acquires(&tasklist_lock) +void exit_ptrace(struct task_struct *tracer, struct list_head *dead) { struct task_struct *p, *n; - LIST_HEAD(ptrace_dead); - - if (likely(list_empty(&tracer->ptraced))) - return; list_for_each_entry_safe(p, n, &tracer->ptraced, ptrace_entry) { if (unlikely(p->ptrace & PT_EXITKILL)) send_sig_info(SIGKILL, SEND_SIG_FORCED, p); if (__ptrace_detach(tracer, p)) - list_add(&p->ptrace_entry, &ptrace_dead); - } - - write_unlock_irq(&tasklist_lock); - BUG_ON(!list_empty(&tracer->ptraced)); - - list_for_each_entry_safe(p, n, &ptrace_dead, ptrace_entry) { - list_del_init(&p->ptrace_entry); - release_task(p); + list_add(&p->ptrace_entry, dead); } - - write_lock_irq(&tasklist_lock); } int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len) -- cgit v1.2.3 From f938612dd97d481b8b5bf960c992ae577f081c17 Mon Sep 17 00:00:00 2001 From: Yann Droneaud Date: Wed, 10 Dec 2014 15:45:47 -0800 Subject: include/linux/file.h: remove get_unused_fd() macro The get_unused_fd() macro is used to allocate a file descriptor with default flags. Those default flags (0) don't enable close-on-exec. This can be seen as an unsafe default: in most cases close-on-exec should be enabled so that file descriptors are not leaked across exec(). It would be better to have a "safer" default set of flags, e.g. one with O_CLOEXEC set so that close-on-exec is enabled.
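As a rough, purely illustrative sketch of the calling pattern this points towards (example_install_fd() and its struct file argument are made up for this note; get_unused_fd_flags() and fd_install() are the existing interfaces declared in <linux/file.h>):

#include <linux/fcntl.h>
#include <linux/file.h>
#include <linux/fs.h>

/* Hypothetical helper that hands a ready-made struct file to userspace. */
static int example_install_fd(struct file *filp)
{
	int fd;

	/* Reserve a descriptor number with close-on-exec already set. */
	fd = get_unused_fd_flags(O_CLOEXEC);
	if (fd < 0)
		return fd;

	/* Publish the file in the caller's descriptor table. */
	fd_install(fd, filp);
	return fd;
}

Spelling the flags out at each call site makes leaking a descriptor across exec() an explicit choice rather than a silent default.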
Instead, this patch removes get_unused_fd() so that out-of-tree modules won't be affected by a runtime behavior change which might introduce other kinds of bugs: it's better to catch the change at build time, making it easier to fix. Removing the macro will also promote use of get_unused_fd_flags() (or anon_inode_getfd()) with flags provided by userspace. Or, if flags cannot be given by userspace, with flags set to O_CLOEXEC by default. Signed-off-by: Yann Droneaud Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/file.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/file.h b/include/linux/file.h index 4d69123377a2..f87d30882a24 100644 --- a/include/linux/file.h +++ b/include/linux/file.h @@ -66,7 +66,6 @@ extern void set_close_on_exec(unsigned int fd, int flag); extern bool get_close_on_exec(unsigned int fd); extern void put_filp(struct file *); extern int get_unused_fd_flags(unsigned flags); -#define get_unused_fd() get_unused_fd_flags(0) extern void put_unused_fd(unsigned int fd); extern void fd_install(unsigned int fd, struct file *file); -- cgit v1.2.3 From 9e3961a0979817c612b10b2da4f3045ec9faa779 Mon Sep 17 00:00:00 2001 From: Prarit Bhargava Date: Wed, 10 Dec 2014 15:45:50 -0800 Subject: kernel: add panic_on_warn There have been several times where I have had to rebuild a kernel to cause a panic when hitting a WARN() in the code in order to get a crash dump from a system. Sometimes this is easy to do, other times (such as in the case of a remote admin) it is not trivial to send new images to the user. A much easier method would be a switch to change the WARN() over to a panic. This makes debugging easier in that I can now test the actual image the WARN() was seen on and I do not have to engage in remote debugging. This patch adds a panic_on_warn kernel parameter and a /proc/sys/kernel/panic_on_warn sysctl; both cause panic() to be called in the warn_slowpath_common() path. The function will still print out the location of the warning. An example of the panic_on_warn output: The first line below is from the WARN_ON() to output the WARN_ON()'s location. After that the panic() output is displayed. WARNING: CPU: 30 PID: 11698 at /home/prarit/dummy_module/dummy-module.c:25 init_dummy+0x1f/0x30 [dummy_module]() Kernel panic - not syncing: panic_on_warn set ... CPU: 30 PID: 11698 Comm: insmod Tainted: G W OE 3.17.0+ #57 Hardware name: Intel Corporation S2600CP/S2600CP, BIOS RMLSDP.86I.00.29.D696.1311111329 11/11/2013 0000000000000000 000000008e3f87df ffff88080f093c38 ffffffff81665190 0000000000000000 ffffffff818aea3d ffff88080f093cb8 ffffffff8165e2ec ffffffff00000008 ffff88080f093cc8 ffff88080f093c68 000000008e3f87df Call Trace: [] dump_stack+0x46/0x58 [] panic+0xd0/0x204 [] ? init_dummy+0x1f/0x30 [dummy_module] [] warn_slowpath_common+0xd0/0xd0 [] ? dummy_greetings+0x40/0x40 [dummy_module] [] warn_slowpath_null+0x1a/0x20 [] init_dummy+0x1f/0x30 [dummy_module] [] do_one_initcall+0xd4/0x210 [] ? __vunmap+0xc2/0x110 [] load_module+0x16a9/0x1b30 [] ? store_uevent+0x70/0x70 [] ? copy_module_from_fd.isra.44+0x129/0x180 [] SyS_finit_module+0xa6/0xd0 [] system_call_fastpath+0x12/0x17 Successfully tested by me. hpa said: There is another very valid use for this: many operators would rather a machine shut down than be potentially compromised either functionally or security-wise. Signed-off-by: Prarit Bhargava Cc: Jonathan Corbet Cc: Rusty Russell Cc: "H.
Peter Anvin" Cc: Andi Kleen Cc: Masami Hiramatsu Acked-by: Yasuaki Ishimatsu Cc: Fabian Frederick Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/kdump/kdump.txt | 7 +++++++ Documentation/kernel-parameters.txt | 3 +++ Documentation/sysctl/kernel.txt | 40 ++++++++++++++++++++++++------------- include/linux/kernel.h | 1 + include/uapi/linux/sysctl.h | 1 + kernel/panic.c | 13 ++++++++++++ kernel/sysctl.c | 9 +++++++++ kernel/sysctl_binary.c | 1 + 8 files changed, 61 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/Documentation/kdump/kdump.txt b/Documentation/kdump/kdump.txt index 6c0b9f27e465..bc4bd5a44b88 100644 --- a/Documentation/kdump/kdump.txt +++ b/Documentation/kdump/kdump.txt @@ -471,6 +471,13 @@ format. Crash is available on Dave Anderson's site at the following URL: http://people.redhat.com/~anderson/ +Trigger Kdump on WARN() +======================= + +The kernel parameter, panic_on_warn, calls panic() in all WARN() paths. This +will cause a kdump to occur at the panic() call. In cases where a user wants +to specify this during runtime, /proc/sys/kernel/panic_on_warn can be set to 1 +to achieve the same behaviour. Contact ======= diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 838f3776c924..d6eb3636fe5a 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -2509,6 +2509,9 @@ bytes respectively. Such letter suffixes can also be entirely omitted. timeout < 0: reboot immediately Format: + panic_on_warn panic() instead of WARN(). Useful to cause kdump + on a WARN(). + crash_kexec_post_notifiers Run kdump after running panic-notifiers and dumping kmsg. This only for the users who doubt kdump always diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index 57baff5bdb80..b5d0c8501a18 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -54,8 +54,9 @@ show up in /proc/sys/kernel: - overflowuid - panic - panic_on_oops -- panic_on_unrecovered_nmi - panic_on_stackoverflow +- panic_on_unrecovered_nmi +- panic_on_warn - pid_max - powersave-nap [ PPC only ] - printk @@ -527,19 +528,6 @@ the recommended setting is 60. ============================================================== -panic_on_unrecovered_nmi: - -The default Linux behaviour on an NMI of either memory or unknown is -to continue operation. For many environments such as scientific -computing it is preferable that the box is taken out and the error -dealt with than an uncorrected parity/ECC error get propagated. - -A small number of systems do generate NMI's for bizarre random reasons -such as power management so the default is off. That sysctl works like -the existing panic controls already in that directory. - -============================================================== - panic_on_oops: Controls the kernel's behaviour when an oops or BUG is encountered. @@ -563,6 +551,30 @@ This file shows up if CONFIG_DEBUG_STACKOVERFLOW is enabled. ============================================================== +panic_on_unrecovered_nmi: + +The default Linux behaviour on an NMI of either memory or unknown is +to continue operation. For many environments such as scientific +computing it is preferable that the box is taken out and the error +dealt with than an uncorrected parity/ECC error get propagated. + +A small number of systems do generate NMI's for bizarre random reasons +such as power management so the default is off. 
That sysctl works like +the existing panic controls already in that directory. + +============================================================== + +panic_on_warn: + +Calls panic() in the WARN() path when set to 1. This is useful to avoid +a kernel rebuild when attempting to kdump at the location of a WARN(). + +0: only WARN(), default behaviour. + +1: call panic() after printing out WARN() location. + +============================================================== + perf_cpu_time_max_percent: Hints to the kernel how much CPU time it should be allowed to diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 446d76a87ba1..233ea8107038 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -427,6 +427,7 @@ extern int panic_timeout; extern int panic_on_oops; extern int panic_on_unrecovered_nmi; extern int panic_on_io_nmi; +extern int panic_on_warn; extern int sysctl_panic_on_stackoverflow; /* * Only to be used by arch init code. If the user over-wrote the default diff --git a/include/uapi/linux/sysctl.h b/include/uapi/linux/sysctl.h index 43aaba1cc037..0956373b56db 100644 --- a/include/uapi/linux/sysctl.h +++ b/include/uapi/linux/sysctl.h @@ -153,6 +153,7 @@ enum KERN_MAX_LOCK_DEPTH=74, /* int: rtmutex's maximum lock depth */ KERN_NMI_WATCHDOG=75, /* int: enable/disable nmi watchdog */ KERN_PANIC_ON_NMI=76, /* int: whether we will panic on an unrecovered */ + KERN_PANIC_ON_WARN=77, /* int: call panic() in WARN() functions */ }; diff --git a/kernel/panic.c b/kernel/panic.c index cf80672b7924..4d8d6f906dec 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -33,6 +33,7 @@ static int pause_on_oops; static int pause_on_oops_flag; static DEFINE_SPINLOCK(pause_on_oops_lock); static bool crash_kexec_post_notifiers; +int panic_on_warn __read_mostly; int panic_timeout = CONFIG_PANIC_TIMEOUT; EXPORT_SYMBOL_GPL(panic_timeout); @@ -428,6 +429,17 @@ static void warn_slowpath_common(const char *file, int line, void *caller, if (args) vprintk(args->fmt, args->args); + if (panic_on_warn) { + /* + * This thread may hit another WARN() in the panic path. + * Resetting this prevents additional WARN() from panicking the + * system on this thread. Other threads are blocked by the + * panic_mutex in panic(). 
+ */ + panic_on_warn = 0; + panic("panic_on_warn set ...\n"); + } + print_modules(); dump_stack(); print_oops_end_marker(); @@ -485,6 +497,7 @@ EXPORT_SYMBOL(__stack_chk_fail); core_param(panic, panic_timeout, int, 0644); core_param(pause_on_oops, pause_on_oops, int, 0644); +core_param(panic_on_warn, panic_on_warn, int, 0644); static int __init setup_crash_kexec_post_notifiers(char *s) { diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 15f2511a1b7c..7c54ff79afd7 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1104,6 +1104,15 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, #endif + { + .procname = "panic_on_warn", + .data = &panic_on_warn, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, { } }; diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 9a4f750a2963..7e7746a42a62 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -137,6 +137,7 @@ static const struct bin_table bin_kern_table[] = { { CTL_INT, KERN_COMPAT_LOG, "compat-log" }, { CTL_INT, KERN_MAX_LOCK_DEPTH, "max_lock_depth" }, { CTL_INT, KERN_PANIC_ON_NMI, "panic_on_unrecovered_nmi" }, + { CTL_INT, KERN_PANIC_ON_WARN, "panic_on_warn" }, {} }; -- cgit v1.2.3 From 1dc6244bd6d4f62239487fb0befc41c63e117290 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 10 Dec 2014 15:45:53 -0800 Subject: printk: remove used-once early_vprintk Eliminate the unlikely possibility of message interleaving for early_printk/early_vprintk use. early_vprintk can be done via the %pV extension so remove this unnecessary function and change early_printk to have the equivalent vprintk code. All uses of early_printk already end with a newline so also remove the unnecessary newline from the early_printk function. Signed-off-by: Joe Perches Acked-by: Chris Metcalf Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/tile/kernel/early_printk.c | 19 +++++++++++++------ include/linux/printk.h | 1 - kernel/printk/printk.c | 19 ++++++++----------- 3 files changed, 21 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/arch/tile/kernel/early_printk.c b/arch/tile/kernel/early_printk.c index b608e00e7f6d..aefb2c086726 100644 --- a/arch/tile/kernel/early_printk.c +++ b/arch/tile/kernel/early_printk.c @@ -43,13 +43,20 @@ static struct console early_hv_console = { void early_panic(const char *fmt, ...) { - va_list ap; + struct va_format vaf; + va_list args; + arch_local_irq_disable_all(); - va_start(ap, fmt); - early_printk("Kernel panic - not syncing: "); - early_vprintk(fmt, ap); - early_printk("\n"); - va_end(ap); + + va_start(args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; + + early_printk("Kernel panic - not syncing: %pV", &vaf); + + va_end(args); + dump_stack(); hv_halt(); } diff --git a/include/linux/printk.h b/include/linux/printk.h index d78125f73ac4..3dd489f2dedc 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -118,7 +118,6 @@ int no_printk(const char *fmt, ...) #ifdef CONFIG_EARLY_PRINTK extern asmlinkage __printf(1, 2) void early_printk(const char *fmt, ...); -void early_vprintk(const char *fmt, va_list ap); #else static inline __printf(1, 2) __cold void early_printk(const char *s, ...) 
{ } diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index ced2b84b1cb7..4815c98ae175 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1881,23 +1881,20 @@ static size_t cont_print_text(char *text, size_t size) { return 0; } #ifdef CONFIG_EARLY_PRINTK struct console *early_console; -void early_vprintk(const char *fmt, va_list ap) -{ - if (early_console) { - char buf[512]; - int n = vscnprintf(buf, sizeof(buf), fmt, ap); - - early_console->write(early_console, buf, n); - } -} - asmlinkage __visible void early_printk(const char *fmt, ...) { va_list ap; + char buf[512]; + int n; + + if (!early_console) + return; va_start(ap, fmt); - early_vprintk(fmt, ap); + n = vscnprintf(buf, sizeof(buf), fmt, ap); va_end(ap); + + early_console->write(early_console, buf, n); } #endif -- cgit v1.2.3 From a39d4a857d4bb0a62d6655c0d69f7387fe1ad160 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 10 Dec 2014 15:50:15 -0800 Subject: printk: add and use LOGLEVEL_ defines for KERN_ equivalents Use #defines instead of magic values. Signed-off-by: Joe Perches Acked-by: Greg Kroah-Hartman Cc: Jason Baron Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/usb/storage/debug.c | 2 +- include/linux/kern_levels.h | 13 +++++++++++++ kernel/printk/printk.c | 28 +++++++++++++--------------- lib/dynamic_debug.c | 4 ++-- 4 files changed, 29 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/drivers/usb/storage/debug.c b/drivers/usb/storage/debug.c index 66a684a29938..2d81e1d8ee30 100644 --- a/drivers/usb/storage/debug.c +++ b/drivers/usb/storage/debug.c @@ -188,7 +188,7 @@ int usb_stor_dbg(const struct us_data *us, const char *fmt, ...) va_start(args, fmt); - r = dev_vprintk_emit(7, &us->pusb_dev->dev, fmt, args); + r = dev_vprintk_emit(LOGLEVEL_DEBUG, &us->pusb_dev->dev, fmt, args); va_end(args); diff --git a/include/linux/kern_levels.h b/include/linux/kern_levels.h index 866caaa9e2bb..c2ce155d83cc 100644 --- a/include/linux/kern_levels.h +++ b/include/linux/kern_levels.h @@ -22,4 +22,17 @@ */ #define KERN_CONT "" +/* integer equivalents of KERN_ */ +#define LOGLEVEL_SCHED -2 /* Deferred messages from sched code + * are set to this special level */ +#define LOGLEVEL_DEFAULT -1 /* default (or last) loglevel */ +#define LOGLEVEL_EMERG 0 /* system is unusable */ +#define LOGLEVEL_ALERT 1 /* action must be taken immediately */ +#define LOGLEVEL_CRIT 2 /* critical conditions */ +#define LOGLEVEL_ERR 3 /* error conditions */ +#define LOGLEVEL_WARNING 4 /* warning conditions */ +#define LOGLEVEL_NOTICE 5 /* normal but significant condition */ +#define LOGLEVEL_INFO 6 /* informational */ +#define LOGLEVEL_DEBUG 7 /* debug-level messages */ + #endif diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 4815c98ae175..1b7092dbb590 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -62,9 +62,6 @@ int console_printk[4] = { CONSOLE_LOGLEVEL_DEFAULT, /* default_console_loglevel */ }; -/* Deferred messaged from sched code are marked by this special level */ -#define SCHED_MESSAGE_LOGLEVEL -2 - /* * Low level drivers may need that to know if they can schedule in * their unblank() callback or not. So let's export it. 
@@ -1259,7 +1256,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear) int do_syslog(int type, char __user *buf, int len, bool from_file) { bool clear = false; - static int saved_console_loglevel = -1; + static int saved_console_loglevel = LOGLEVEL_DEFAULT; int error; error = check_syslog_permissions(type, from_file); @@ -1316,15 +1313,15 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) break; /* Disable logging to console */ case SYSLOG_ACTION_CONSOLE_OFF: - if (saved_console_loglevel == -1) + if (saved_console_loglevel == LOGLEVEL_DEFAULT) saved_console_loglevel = console_loglevel; console_loglevel = minimum_console_loglevel; break; /* Enable logging to console */ case SYSLOG_ACTION_CONSOLE_ON: - if (saved_console_loglevel != -1) { + if (saved_console_loglevel != LOGLEVEL_DEFAULT) { console_loglevel = saved_console_loglevel; - saved_console_loglevel = -1; + saved_console_loglevel = LOGLEVEL_DEFAULT; } break; /* Set level of messages printed to console */ @@ -1336,7 +1333,7 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) len = minimum_console_loglevel; console_loglevel = len; /* Implicitly re-enable logging to console */ - saved_console_loglevel = -1; + saved_console_loglevel = LOGLEVEL_DEFAULT; error = 0; break; /* Number of chars in the log buffer */ @@ -1629,8 +1626,8 @@ asmlinkage int vprintk_emit(int facility, int level, /* cpu currently holding logbuf_lock in this function */ static volatile unsigned int logbuf_cpu = UINT_MAX; - if (level == SCHED_MESSAGE_LOGLEVEL) { - level = -1; + if (level == LOGLEVEL_SCHED) { + level = LOGLEVEL_DEFAULT; in_sched = true; } @@ -1695,8 +1692,9 @@ asmlinkage int vprintk_emit(int facility, int level, const char *end_of_header = printk_skip_level(text); switch (kern_level) { case '0' ... '7': - if (level == -1) + if (level == LOGLEVEL_DEFAULT) level = kern_level - '0'; + /* fallthrough */ case 'd': /* KERN_DEFAULT */ lflags |= LOG_PREFIX; } @@ -1710,7 +1708,7 @@ asmlinkage int vprintk_emit(int facility, int level, } } - if (level == -1) + if (level == LOGLEVEL_DEFAULT) level = default_message_loglevel; if (dict) @@ -1788,7 +1786,7 @@ EXPORT_SYMBOL(vprintk_emit); asmlinkage int vprintk(const char *fmt, va_list args) { - return vprintk_emit(0, -1, NULL, 0, fmt, args); + return vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, 0, fmt, args); } EXPORT_SYMBOL(vprintk); @@ -1842,7 +1840,7 @@ asmlinkage __visible int printk(const char *fmt, ...) } #endif va_start(args, fmt); - r = vprintk_emit(0, -1, NULL, 0, fmt, args); + r = vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, 0, fmt, args); va_end(args); return r; @@ -2631,7 +2629,7 @@ int printk_deferred(const char *fmt, ...) 
preempt_disable(); va_start(args, fmt); - r = vprintk_emit(0, SCHED_MESSAGE_LOGLEVEL, NULL, 0, fmt, args); + r = vprintk_emit(0, LOGLEVEL_SCHED, NULL, 0, fmt, args); va_end(args); __this_cpu_or(printk_pending, PRINTK_PENDING_OUTPUT); diff --git a/lib/dynamic_debug.c b/lib/dynamic_debug.c index dfba05521748..527799d44476 100644 --- a/lib/dynamic_debug.c +++ b/lib/dynamic_debug.c @@ -576,7 +576,7 @@ void __dynamic_dev_dbg(struct _ddebug *descriptor, } else { char buf[PREFIX_SIZE]; - dev_printk_emit(7, dev, "%s%s %s: %pV", + dev_printk_emit(LOGLEVEL_DEBUG, dev, "%s%s %s: %pV", dynamic_emit_prefix(descriptor, buf), dev_driver_string(dev), dev_name(dev), &vaf); @@ -605,7 +605,7 @@ void __dynamic_netdev_dbg(struct _ddebug *descriptor, if (dev && dev->dev.parent) { char buf[PREFIX_SIZE]; - dev_printk_emit(7, dev->dev.parent, + dev_printk_emit(LOGLEVEL_DEBUG, dev->dev.parent, "%s%s %s %s%s: %pV", dynamic_emit_prefix(descriptor, buf), dev_driver_string(dev->dev.parent), -- cgit v1.2.3
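As a quick illustration of the LOGLEVEL_ conversion above, here is a minimal sketch of how a driver-local debug helper might pass the new symbolic level instead of the bare 7 that the converted dev_printk_emit()/dev_vprintk_emit() callers used to hard-code. The helper name foo_dbg and its placement are hypothetical and not part of any patch in this series; only dev_printk_emit(), struct va_format, %pV and LOGLEVEL_DEBUG are existing kernel interfaces.

/*
 * Example only (not from the patches above): a made-up driver debug
 * helper using LOGLEVEL_DEBUG rather than the old magic value 7.
 */
#include <stdarg.h>
#include <linux/device.h>
#include <linux/kern_levels.h>
#include <linux/printk.h>

static __printf(2, 3)
void foo_dbg(struct device *dev, const char *fmt, ...)
{
	struct va_format vaf;
	va_list args;

	va_start(args, fmt);
	vaf.fmt = fmt;
	vaf.va = &args;
	/* dev_printk_emit() takes the integer loglevel directly */
	dev_printk_emit(LOGLEVEL_DEBUG, dev, "%s: %pV", dev_name(dev), &vaf);
	va_end(args);
}

Compared to the pre-conversion callers that passed a literal 7, the named constant makes the intended console filtering level visible at the call site, which is the whole point of the LOGLEVEL_ defines.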