From 3dc43e3e4d0b52197d3205214fe8f162f9e0c334 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Sun, 11 Dec 2011 21:47:05 +0000 Subject: per-netns ipv4 sysctl_tcp_mem This patch allows each namespace to independently set up its levels for tcp memory pressure thresholds. This patch alone does not buy much: we need to make this values per group of process somehow. This is achieved in the patches that follows in this patchset. Signed-off-by: Glauber Costa Reviewed-by: KAMEZAWA Hiroyuki CC: David S. Miller CC: Eric W. Biederman Signed-off-by: David S. Miller --- net/ipv4/sysctl_net_ipv4.c | 51 +++++++++++++++++++++++++++++++++++++++------- 1 file changed, 44 insertions(+), 7 deletions(-) (limited to 'net/ipv4/sysctl_net_ipv4.c') diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 69fd7201129a..bbd67abcb51d 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -174,6 +175,36 @@ static int proc_allowed_congestion_control(ctl_table *ctl, return ret; } +static int ipv4_tcp_mem(ctl_table *ctl, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret; + unsigned long vec[3]; + struct net *net = current->nsproxy->net_ns; + + ctl_table tmp = { + .data = &vec, + .maxlen = sizeof(vec), + .mode = ctl->mode, + }; + + if (!write) { + ctl->data = &net->ipv4.sysctl_tcp_mem; + return proc_doulongvec_minmax(ctl, write, buffer, lenp, ppos); + } + + ret = proc_doulongvec_minmax(&tmp, write, buffer, lenp, ppos); + if (ret) + return ret; + + net->ipv4.sysctl_tcp_mem[0] = vec[0]; + net->ipv4.sysctl_tcp_mem[1] = vec[1]; + net->ipv4.sysctl_tcp_mem[2] = vec[2]; + + return 0; +} + static struct ctl_table ipv4_table[] = { { .procname = "tcp_timestamps", @@ -432,13 +463,6 @@ static struct ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, - { - .procname = "tcp_mem", - .data = &sysctl_tcp_mem, - .maxlen = sizeof(sysctl_tcp_mem), - .mode = 0644, - .proc_handler = proc_doulongvec_minmax - }, { .procname = "tcp_wmem", .data = &sysctl_tcp_wmem, @@ -721,6 +745,12 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = ipv4_ping_group_range, }, + { + .procname = "tcp_mem", + .maxlen = sizeof(init_net.ipv4.sysctl_tcp_mem), + .mode = 0644, + .proc_handler = ipv4_tcp_mem, + }, { } }; @@ -734,6 +764,7 @@ EXPORT_SYMBOL_GPL(net_ipv4_ctl_path); static __net_init int ipv4_sysctl_init_net(struct net *net) { struct ctl_table *table; + unsigned long limit; table = ipv4_net_table; if (!net_eq(net, &init_net)) { @@ -769,6 +800,12 @@ static __net_init int ipv4_sysctl_init_net(struct net *net) net->ipv4.sysctl_rt_cache_rebuild_count = 4; + limit = nr_free_buffer_pages() / 8; + limit = max(limit, 128UL); + net->ipv4.sysctl_tcp_mem[0] = limit / 4 * 3; + net->ipv4.sysctl_tcp_mem[1] = limit; + net->ipv4.sysctl_tcp_mem[2] = net->ipv4.sysctl_tcp_mem[0] * 2; + net->ipv4.ipv4_hdr = register_net_sysctl_table(net, net_ipv4_ctl_path, table); if (net->ipv4.ipv4_hdr == NULL) -- cgit v1.2.3 From 3aaabe2342c36bf48567b88fa78b819eee14bb5e Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Sun, 11 Dec 2011 21:47:06 +0000 Subject: tcp buffer limitation: per-cgroup limit This patch uses the "tcp.limit_in_bytes" field of the kmem_cgroup to effectively control the amount of kernel memory pinned by a cgroup. This value is ignored in the root cgroup, and in all others, caps the value specified by the admin in the net namespaces' view of tcp_sysctl_mem. If namespaces are being used, the admin is allowed to set a value bigger than cgroup's maximum, the same way it is allowed to set pretty much unlimited values in a real box. Signed-off-by: Glauber Costa Reviewed-by: Hiroyouki Kamezawa CC: David S. Miller CC: Eric W. Biederman Signed-off-by: David S. Miller --- Documentation/cgroups/memory.txt | 1 + include/net/tcp_memcontrol.h | 2 + net/ipv4/sysctl_net_ipv4.c | 14 ++++ net/ipv4/tcp_memcontrol.c | 137 ++++++++++++++++++++++++++++++++++++++- 4 files changed, 152 insertions(+), 2 deletions(-) (limited to 'net/ipv4/sysctl_net_ipv4.c') diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt index 687dea5bf1fd..1c9779a74a25 100644 --- a/Documentation/cgroups/memory.txt +++ b/Documentation/cgroups/memory.txt @@ -78,6 +78,7 @@ Brief summary of control files. memory.independent_kmem_limit # select whether or not kernel memory limits are independent of user limits + memory.kmem.tcp.limit_in_bytes # set/show hard limit for tcp buf memory 1. History diff --git a/include/net/tcp_memcontrol.h b/include/net/tcp_memcontrol.h index 5f5e1582d764..3512082fa909 100644 --- a/include/net/tcp_memcontrol.h +++ b/include/net/tcp_memcontrol.h @@ -14,4 +14,6 @@ struct tcp_memcontrol { struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg); int tcp_init_cgroup(struct cgroup *cgrp, struct cgroup_subsys *ss); void tcp_destroy_cgroup(struct cgroup *cgrp, struct cgroup_subsys *ss); +unsigned long long tcp_max_memory(const struct mem_cgroup *memcg); +void tcp_prot_mem(struct mem_cgroup *memcg, long val, int idx); #endif /* _TCP_MEMCG_H */ diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index bbd67abcb51d..fe9bf915676c 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -24,6 +24,7 @@ #include #include #include +#include static int zero; static int tcp_retr1_max = 255; @@ -182,6 +183,9 @@ static int ipv4_tcp_mem(ctl_table *ctl, int write, int ret; unsigned long vec[3]; struct net *net = current->nsproxy->net_ns; +#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM + struct mem_cgroup *memcg; +#endif ctl_table tmp = { .data = &vec, @@ -198,6 +202,16 @@ static int ipv4_tcp_mem(ctl_table *ctl, int write, if (ret) return ret; +#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM + rcu_read_lock(); + memcg = mem_cgroup_from_task(current); + + tcp_prot_mem(memcg, vec[0], 0); + tcp_prot_mem(memcg, vec[1], 1); + tcp_prot_mem(memcg, vec[2], 2); + rcu_read_unlock(); +#endif + net->ipv4.sysctl_tcp_mem[0] = vec[0]; net->ipv4.sysctl_tcp_mem[1] = vec[1]; net->ipv4.sysctl_tcp_mem[2] = vec[2]; diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c index bfb0c2b8df46..e3533903409d 100644 --- a/net/ipv4/tcp_memcontrol.c +++ b/net/ipv4/tcp_memcontrol.c @@ -6,6 +6,19 @@ #include #include +static u64 tcp_cgroup_read(struct cgroup *cont, struct cftype *cft); +static int tcp_cgroup_write(struct cgroup *cont, struct cftype *cft, + const char *buffer); + +static struct cftype tcp_files[] = { + { + .name = "kmem.tcp.limit_in_bytes", + .write_string = tcp_cgroup_write, + .read_u64 = tcp_cgroup_read, + .private = RES_LIMIT, + }, +}; + static inline struct tcp_memcontrol *tcp_from_cgproto(struct cg_proto *cg_proto) { return container_of(cg_proto, struct tcp_memcontrol, cg_proto); @@ -34,7 +47,7 @@ int tcp_init_cgroup(struct cgroup *cgrp, struct cgroup_subsys *ss) cg_proto = tcp_prot.proto_cgroup(memcg); if (!cg_proto) - return 0; + goto create_files; tcp = tcp_from_cgproto(cg_proto); @@ -57,7 +70,9 @@ int tcp_init_cgroup(struct cgroup *cgrp, struct cgroup_subsys *ss) cg_proto->sockets_allocated = &tcp->tcp_sockets_allocated; cg_proto->memcg = memcg; - return 0; +create_files: + return cgroup_add_files(cgrp, ss, tcp_files, + ARRAY_SIZE(tcp_files)); } EXPORT_SYMBOL(tcp_init_cgroup); @@ -66,6 +81,7 @@ void tcp_destroy_cgroup(struct cgroup *cgrp, struct cgroup_subsys *ss) struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); struct cg_proto *cg_proto; struct tcp_memcontrol *tcp; + u64 val; cg_proto = tcp_prot.proto_cgroup(memcg); if (!cg_proto) @@ -73,5 +89,122 @@ void tcp_destroy_cgroup(struct cgroup *cgrp, struct cgroup_subsys *ss) tcp = tcp_from_cgproto(cg_proto); percpu_counter_destroy(&tcp->tcp_sockets_allocated); + + val = res_counter_read_u64(&tcp->tcp_memory_allocated, RES_USAGE); + + if (val != RESOURCE_MAX) + jump_label_dec(&memcg_socket_limit_enabled); } EXPORT_SYMBOL(tcp_destroy_cgroup); + +static int tcp_update_limit(struct mem_cgroup *memcg, u64 val) +{ + struct net *net = current->nsproxy->net_ns; + struct tcp_memcontrol *tcp; + struct cg_proto *cg_proto; + u64 old_lim; + int i; + int ret; + + cg_proto = tcp_prot.proto_cgroup(memcg); + if (!cg_proto) + return -EINVAL; + + if (val > RESOURCE_MAX) + val = RESOURCE_MAX; + + tcp = tcp_from_cgproto(cg_proto); + + old_lim = res_counter_read_u64(&tcp->tcp_memory_allocated, RES_LIMIT); + ret = res_counter_set_limit(&tcp->tcp_memory_allocated, val); + if (ret) + return ret; + + for (i = 0; i < 3; i++) + tcp->tcp_prot_mem[i] = min_t(long, val >> PAGE_SHIFT, + net->ipv4.sysctl_tcp_mem[i]); + + if (val == RESOURCE_MAX && old_lim != RESOURCE_MAX) + jump_label_dec(&memcg_socket_limit_enabled); + else if (old_lim == RESOURCE_MAX && val != RESOURCE_MAX) + jump_label_inc(&memcg_socket_limit_enabled); + + return 0; +} + +static int tcp_cgroup_write(struct cgroup *cont, struct cftype *cft, + const char *buffer) +{ + struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); + unsigned long long val; + int ret = 0; + + switch (cft->private) { + case RES_LIMIT: + /* see memcontrol.c */ + ret = res_counter_memparse_write_strategy(buffer, &val); + if (ret) + break; + ret = tcp_update_limit(memcg, val); + break; + default: + ret = -EINVAL; + break; + } + return ret; +} + +static u64 tcp_read_stat(struct mem_cgroup *memcg, int type, u64 default_val) +{ + struct tcp_memcontrol *tcp; + struct cg_proto *cg_proto; + + cg_proto = tcp_prot.proto_cgroup(memcg); + if (!cg_proto) + return default_val; + + tcp = tcp_from_cgproto(cg_proto); + return res_counter_read_u64(&tcp->tcp_memory_allocated, type); +} + +static u64 tcp_cgroup_read(struct cgroup *cont, struct cftype *cft) +{ + struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); + u64 val; + + switch (cft->private) { + case RES_LIMIT: + val = tcp_read_stat(memcg, RES_LIMIT, RESOURCE_MAX); + break; + default: + BUG(); + } + return val; +} + +unsigned long long tcp_max_memory(const struct mem_cgroup *memcg) +{ + struct tcp_memcontrol *tcp; + struct cg_proto *cg_proto; + + cg_proto = tcp_prot.proto_cgroup((struct mem_cgroup *)memcg); + if (!cg_proto) + return 0; + + tcp = tcp_from_cgproto(cg_proto); + return res_counter_read_u64(&tcp->tcp_memory_allocated, RES_LIMIT); +} + +void tcp_prot_mem(struct mem_cgroup *memcg, long val, int idx) +{ + struct tcp_memcontrol *tcp; + struct cg_proto *cg_proto; + + cg_proto = tcp_prot.proto_cgroup(memcg); + if (!cg_proto) + return; + + tcp = tcp_from_cgproto(cg_proto); + + tcp->tcp_prot_mem[idx] = val; +} -- cgit v1.2.3 From e6560d4dfe62ddf6010fdf4a417b1b3bdcbf4fd3 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 14 Dec 2011 03:51:28 +0000 Subject: net: ping: remove some sparse errors net/ipv4/sysctl_net_ipv4.c:78:6: warning: symbol 'inet_get_ping_group_range_table' was not declared. Should it be static? net/ipv4/sysctl_net_ipv4.c:119:31: warning: incorrect type in argument 2 (different signedness) net/ipv4/sysctl_net_ipv4.c:119:31: expected int *range net/ipv4/sysctl_net_ipv4.c:119:31: got unsigned int * Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/sysctl_net_ipv4.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/ipv4/sysctl_net_ipv4.c') diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index fe9bf915676c..4aa7e9dc0cbb 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -75,7 +75,7 @@ static int ipv4_local_port_range(ctl_table *table, int write, } -void inet_get_ping_group_range_table(struct ctl_table *table, gid_t *low, gid_t *high) +static void inet_get_ping_group_range_table(struct ctl_table *table, gid_t *low, gid_t *high) { gid_t *data = table->data; unsigned seq; @@ -88,7 +88,7 @@ void inet_get_ping_group_range_table(struct ctl_table *table, gid_t *low, gid_t } /* Update system visible IP port range */ -static void set_ping_group_range(struct ctl_table *table, int range[2]) +static void set_ping_group_range(struct ctl_table *table, gid_t range[2]) { gid_t *data = table->data; write_seqlock(&sysctl_local_ports.lock); -- cgit v1.2.3 From 4acb41903b2f99f3dffd4c3df9acc84ca5942cb2 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Mon, 30 Jan 2012 01:20:17 +0000 Subject: net/tcp: Fix tcp memory limits initialization when !CONFIG_SYSCTL sysctl_tcp_mem() initialization was moved to sysctl_tcp_ipv4.c in commit 3dc43e3e4d0b52197d3205214fe8f162f9e0c334, since it became a per-ns value. That code, however, will never run when CONFIG_SYSCTL is disabled, leading to bogus values on those fields - causing hung TCP sockets. This patch fixes it by keeping an initialization code in tcp_init(). It will be overwritten by the first net namespace init if CONFIG_SYSCTL is compiled in, and do the right thing if it is compiled out. It is also named properly as tcp_init_mem(), to properly signal its non-sysctl side effect on TCP limits. Reported-by: Ingo Molnar Signed-off-by: Glauber Costa Cc: David S. Miller Link: http://lkml.kernel.org/r/4F22D05A.8030604@parallels.com [ renamed the function, tidied up the changelog a bit ] Signed-off-by: Ingo Molnar Signed-off-by: David S. Miller --- include/net/tcp.h | 2 ++ net/ipv4/sysctl_net_ipv4.c | 1 + net/ipv4/tcp.c | 16 +++++++++++++--- 3 files changed, 16 insertions(+), 3 deletions(-) (limited to 'net/ipv4/sysctl_net_ipv4.c') diff --git a/include/net/tcp.h b/include/net/tcp.h index 0118ea999f67..d49db0113a06 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -311,6 +311,8 @@ extern struct proto tcp_prot; #define TCP_ADD_STATS_USER(net, field, val) SNMP_ADD_STATS_USER((net)->mib.tcp_statistics, field, val) #define TCP_ADD_STATS(net, field, val) SNMP_ADD_STATS((net)->mib.tcp_statistics, field, val) +extern void tcp_init_mem(struct net *net); + extern void tcp_v4_err(struct sk_buff *skb, u32); extern void tcp_shutdown (struct sock *sk, int how); diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 4aa7e9dc0cbb..4cb9cd2f2c39 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -814,6 +814,7 @@ static __net_init int ipv4_sysctl_init_net(struct net *net) net->ipv4.sysctl_rt_cache_rebuild_count = 4; + tcp_init_mem(net); limit = nr_free_buffer_pages() / 8; limit = max(limit, 128UL); net->ipv4.sysctl_tcp_mem[0] = limit / 4 * 3; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 9bcdec3ad772..06373b4a449a 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3216,6 +3216,16 @@ static int __init set_thash_entries(char *str) } __setup("thash_entries=", set_thash_entries); +void tcp_init_mem(struct net *net) +{ + /* Set per-socket limits to no more than 1/128 the pressure threshold */ + unsigned long limit = nr_free_buffer_pages() / 8; + limit = max(limit, 128UL); + net->ipv4.sysctl_tcp_mem[0] = limit / 4 * 3; + net->ipv4.sysctl_tcp_mem[1] = limit; + net->ipv4.sysctl_tcp_mem[2] = net->ipv4.sysctl_tcp_mem[0] * 2; +} + void __init tcp_init(void) { struct sk_buff *skb = NULL; @@ -3276,9 +3286,9 @@ void __init tcp_init(void) sysctl_tcp_max_orphans = cnt / 2; sysctl_max_syn_backlog = max(128, cnt / 256); - /* Set per-socket limits to no more than 1/128 the pressure threshold */ - limit = ((unsigned long)init_net.ipv4.sysctl_tcp_mem[1]) - << (PAGE_SHIFT - 7); + tcp_init_mem(&init_net); + limit = nr_free_buffer_pages() / 8; + limit = max(limit, 128UL); max_share = min(4UL*1024*1024, limit); sysctl_tcp_wmem[0] = SK_MEM_QUANTUM; -- cgit v1.2.3 From c43b874d5d714f271b80d4c3f49e05d0cbf51ed2 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Thu, 2 Feb 2012 00:07:00 +0000 Subject: tcp: properly initialize tcp memory limits Commit 4acb4190 tries to fix the using uninitialized value introduced by commit 3dc43e3, but it would make the per-socket memory limits too small. This patch fixes this and also remove the redundant codes introduced in 4acb4190. Signed-off-by: Jason Wang Acked-by: Glauber Costa Signed-off-by: David S. Miller --- net/ipv4/sysctl_net_ipv4.c | 6 ------ net/ipv4/tcp.c | 4 ++-- 2 files changed, 2 insertions(+), 8 deletions(-) (limited to 'net/ipv4/sysctl_net_ipv4.c') diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 4cb9cd2f2c39..7a7724da9bff 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -778,7 +778,6 @@ EXPORT_SYMBOL_GPL(net_ipv4_ctl_path); static __net_init int ipv4_sysctl_init_net(struct net *net) { struct ctl_table *table; - unsigned long limit; table = ipv4_net_table; if (!net_eq(net, &init_net)) { @@ -815,11 +814,6 @@ static __net_init int ipv4_sysctl_init_net(struct net *net) net->ipv4.sysctl_rt_cache_rebuild_count = 4; tcp_init_mem(net); - limit = nr_free_buffer_pages() / 8; - limit = max(limit, 128UL); - net->ipv4.sysctl_tcp_mem[0] = limit / 4 * 3; - net->ipv4.sysctl_tcp_mem[1] = limit; - net->ipv4.sysctl_tcp_mem[2] = net->ipv4.sysctl_tcp_mem[0] * 2; net->ipv4.ipv4_hdr = register_net_sysctl_table(net, net_ipv4_ctl_path, table); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index a34f5cfdd44c..37755ccc0e96 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3229,7 +3229,6 @@ __setup("thash_entries=", set_thash_entries); void tcp_init_mem(struct net *net) { - /* Set per-socket limits to no more than 1/128 the pressure threshold */ unsigned long limit = nr_free_buffer_pages() / 8; limit = max(limit, 128UL); net->ipv4.sysctl_tcp_mem[0] = limit / 4 * 3; @@ -3298,7 +3297,8 @@ void __init tcp_init(void) sysctl_max_syn_backlog = max(128, cnt / 256); tcp_init_mem(&init_net); - limit = nr_free_buffer_pages() / 8; + /* Set per-socket limits to no more than 1/128 the pressure threshold */ + limit = nr_free_buffer_pages() << (PAGE_SHIFT - 10); limit = max(limit, 128UL); max_share = min(4UL*1024*1024, limit); -- cgit v1.2.3