From 9b28ae243ef3b13d8a88b5451d025475c75ebdef Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Tue, 21 May 2019 08:52:38 +0100 Subject: bpf: fix out-of-bounds read in __bpf_skc_lookup __bpf_skc_lookup takes a socket tuple and the length of the tuple as an argument. Based on the length, it decides which address family to pass to the helper function sk_lookup. In case of AF_INET6, it fails to verify that the length of the tuple is long enough. sk_lookup may therefore access data past the end of the tuple. Fixes: 6acc9b432e67 ("bpf: Add helper to retrieve socket in BPF") Signed-off-by: Lorenz Bauer Signed-off-by: Daniel Borkmann --- net/core/filter.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index 55bfc941d17a..76f1d99640e2 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5304,7 +5304,13 @@ __bpf_skc_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, struct net *net; int sdif; - family = len == sizeof(tuple->ipv4) ? AF_INET : AF_INET6; + if (len == sizeof(tuple->ipv4)) + family = AF_INET; + else if (len == sizeof(tuple->ipv6)) + family = AF_INET6; + else + return NULL; + if (unlikely(family == AF_UNSPEC || flags || !((s32)netns_id < 0 || netns_id <= S32_MAX))) goto out; -- cgit v1.2.3 From f7355a6c049711ecfbeed573e43d48bee7acb83a Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Fri, 17 May 2019 14:21:17 -0700 Subject: bpf: Check sk_fullsock() before returning from bpf_sk_lookup() The BPF_FUNC_sk_lookup_xxx helpers return RET_PTR_TO_SOCKET_OR_NULL. Meaning a fullsock ptr and its fullsock's fields in bpf_sock can be accessed, e.g. type, protocol, mark and priority. Some new helper, like bpf_sk_storage_get(), also expects ARG_PTR_TO_SOCKET is a fullsock. bpf_sk_lookup() currently calls sk_to_full_sk() before returning. However, the ptr returned from sk_to_full_sk() is not guaranteed to be a fullsock. For example, it cannot get a fullsock if sk is in TCP_TIME_WAIT. This patch checks for sk_fullsock() before returning. If it is not a fullsock, sock_gen_put() is called if needed and then returns NULL. Fixes: 6acc9b432e67 ("bpf: Add helper to retrieve socket in BPF") Cc: Joe Stringer Signed-off-by: Martin KaFai Lau Acked-by: Joe Stringer Signed-off-by: Daniel Borkmann --- net/core/filter.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index 76f1d99640e2..fdcc504d2dec 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5343,8 +5343,14 @@ __bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, struct sock *sk = __bpf_skc_lookup(skb, tuple, len, caller_net, ifindex, proto, netns_id, flags); - if (sk) + if (sk) { sk = sk_to_full_sk(sk); + if (!sk_fullsock(sk)) { + if (!sock_flag(sk, SOCK_RCU_FREE)) + sock_gen_put(sk); + return NULL; + } + } return sk; } @@ -5375,8 +5381,14 @@ bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, struct sock *sk = bpf_skc_lookup(skb, tuple, len, proto, netns_id, flags); - if (sk) + if (sk) { sk = sk_to_full_sk(sk); + if (!sk_fullsock(sk)) { + if (!sock_flag(sk, SOCK_RCU_FREE)) + sock_gen_put(sk); + return NULL; + } + } return sk; } -- cgit v1.2.3 From bd95e678e0f6e18351ecdc147ca819145db9ed7b Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Fri, 24 May 2019 08:01:00 -0700 Subject: bpf: sockmap, fix use after free from sleep in psock backlog workqueue Backlog work for psock (sk_psock_backlog) might sleep while waiting for memory to free up when sending packets. However, while sleeping the socket may be closed and removed from the map by the user space side. This breaks an assumption in sk_stream_wait_memory, which expects the wait queue to be still there when it wakes up resulting in a use-after-free shown below. To fix his mark sendmsg as MSG_DONTWAIT to avoid the sleep altogether. We already set the flag for the sendpage case but we missed the case were sendmsg is used. Sockmap is currently the only user of skb_send_sock_locked() so only the sockmap paths should be impacted. ================================================================== BUG: KASAN: use-after-free in remove_wait_queue+0x31/0x70 Write of size 8 at addr ffff888069a0c4e8 by task kworker/0:2/110 CPU: 0 PID: 110 Comm: kworker/0:2 Not tainted 5.0.0-rc2-00335-g28f9d1a3d4fe-dirty #14 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-2.fc27 04/01/2014 Workqueue: events sk_psock_backlog Call Trace: print_address_description+0x6e/0x2b0 ? remove_wait_queue+0x31/0x70 kasan_report+0xfd/0x177 ? remove_wait_queue+0x31/0x70 ? remove_wait_queue+0x31/0x70 remove_wait_queue+0x31/0x70 sk_stream_wait_memory+0x4dd/0x5f0 ? sk_stream_wait_close+0x1b0/0x1b0 ? wait_woken+0xc0/0xc0 ? tcp_current_mss+0xc5/0x110 tcp_sendmsg_locked+0x634/0x15d0 ? tcp_set_state+0x2e0/0x2e0 ? __kasan_slab_free+0x1d1/0x230 ? kmem_cache_free+0x70/0x140 ? sk_psock_backlog+0x40c/0x4b0 ? process_one_work+0x40b/0x660 ? worker_thread+0x82/0x680 ? kthread+0x1b9/0x1e0 ? ret_from_fork+0x1f/0x30 ? check_preempt_curr+0xaf/0x130 ? iov_iter_kvec+0x5f/0x70 ? kernel_sendmsg_locked+0xa0/0xe0 skb_send_sock_locked+0x273/0x3c0 ? skb_splice_bits+0x180/0x180 ? start_thread+0xe0/0xe0 ? update_min_vruntime.constprop.27+0x88/0xc0 sk_psock_backlog+0xb3/0x4b0 ? strscpy+0xbf/0x1e0 process_one_work+0x40b/0x660 worker_thread+0x82/0x680 ? process_one_work+0x660/0x660 kthread+0x1b9/0x1e0 ? __kthread_create_on_node+0x250/0x250 ret_from_fork+0x1f/0x30 Fixes: 20bf50de3028c ("skbuff: Function to send an skbuf on a socket") Reported-by: Jakub Sitnicki Tested-by: Jakub Sitnicki Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann --- net/core/skbuff.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/core') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index e89be6282693..4a7c656b195b 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -2337,6 +2337,7 @@ do_frag_list: kv.iov_base = skb->data + offset; kv.iov_len = slen; memset(&msg, 0, sizeof(msg)); + msg.msg_flags = MSG_DONTWAIT; ret = kernel_sendmsg_locked(sk, &msg, &kv, 1, slen); if (ret <= 0) -- cgit v1.2.3 From 983695fa676568fc0fe5ddd995c7267aabc24632 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 7 Jun 2019 01:48:57 +0200 Subject: bpf: fix unconnected udp hooks Intention of cgroup bind/connect/sendmsg BPF hooks is to act transparently to applications as also stated in original motivation in 7828f20e3779 ("Merge branch 'bpf-cgroup-bind-connect'"). When recently integrating the latter two hooks into Cilium to enable host based load-balancing with Kubernetes, I ran into the issue that pods couldn't start up as DNS got broken. Kubernetes typically sets up DNS as a service and is thus subject to load-balancing. Upon further debugging, it turns out that the cgroupv2 sendmsg BPF hooks API is currently insufficient and thus not usable as-is for standard applications shipped with most distros. To break down the issue we ran into with a simple example: # cat /etc/resolv.conf nameserver 147.75.207.207 nameserver 147.75.207.208 For the purpose of a simple test, we set up above IPs as service IPs and transparently redirect traffic to a different DNS backend server for that node: # cilium service list ID Frontend Backend 1 147.75.207.207:53 1 => 8.8.8.8:53 2 147.75.207.208:53 1 => 8.8.8.8:53 The attached BPF program is basically selecting one of the backends if the service IP/port matches on the cgroup hook. DNS breaks here, because the hooks are not transparent enough to applications which have built-in msg_name address checks: # nslookup 1.1.1.1 ;; reply from unexpected source: 8.8.8.8#53, expected 147.75.207.207#53 ;; reply from unexpected source: 8.8.8.8#53, expected 147.75.207.208#53 ;; reply from unexpected source: 8.8.8.8#53, expected 147.75.207.207#53 [...] ;; connection timed out; no servers could be reached # dig 1.1.1.1 ;; reply from unexpected source: 8.8.8.8#53, expected 147.75.207.207#53 ;; reply from unexpected source: 8.8.8.8#53, expected 147.75.207.208#53 ;; reply from unexpected source: 8.8.8.8#53, expected 147.75.207.207#53 [...] ; <<>> DiG 9.11.3-1ubuntu1.7-Ubuntu <<>> 1.1.1.1 ;; global options: +cmd ;; connection timed out; no servers could be reached For comparison, if none of the service IPs is used, and we tell nslookup to use 8.8.8.8 directly it works just fine, of course: # nslookup 1.1.1.1 8.8.8.8 1.1.1.1.in-addr.arpa name = one.one.one.one. In order to fix this and thus act more transparent to the application, this needs reverse translation on recvmsg() side. A minimal fix for this API is to add similar recvmsg() hooks behind the BPF cgroups static key such that the program can track state and replace the current sockaddr_in{,6} with the original service IP. From BPF side, this basically tracks the service tuple plus socket cookie in an LRU map where the reverse NAT can then be retrieved via map value as one example. Side-note: the BPF cgroups static key should be converted to a per-hook static key in future. Same example after this fix: # cilium service list ID Frontend Backend 1 147.75.207.207:53 1 => 8.8.8.8:53 2 147.75.207.208:53 1 => 8.8.8.8:53 Lookups work fine now: # nslookup 1.1.1.1 1.1.1.1.in-addr.arpa name = one.one.one.one. Authoritative answers can be found from: # dig 1.1.1.1 ; <<>> DiG 9.11.3-1ubuntu1.7-Ubuntu <<>> 1.1.1.1 ;; global options: +cmd ;; Got answer: ;; ->>HEADER<<- opcode: QUERY, status: NXDOMAIN, id: 51550 ;; flags: qr rd ra ad; QUERY: 1, ANSWER: 0, AUTHORITY: 1, ADDITIONAL: 1 ;; OPT PSEUDOSECTION: ; EDNS: version: 0, flags:; udp: 512 ;; QUESTION SECTION: ;1.1.1.1. IN A ;; AUTHORITY SECTION: . 23426 IN SOA a.root-servers.net. nstld.verisign-grs.com. 2019052001 1800 900 604800 86400 ;; Query time: 17 msec ;; SERVER: 147.75.207.207#53(147.75.207.207) ;; WHEN: Tue May 21 12:59:38 UTC 2019 ;; MSG SIZE rcvd: 111 And from an actual packet level it shows that we're using the back end server when talking via 147.75.207.20{7,8} front end: # tcpdump -i any udp [...] 12:59:52.698732 IP foo.42011 > google-public-dns-a.google.com.domain: 18803+ PTR? 1.1.1.1.in-addr.arpa. (38) 12:59:52.698735 IP foo.42011 > google-public-dns-a.google.com.domain: 18803+ PTR? 1.1.1.1.in-addr.arpa. (38) 12:59:52.701208 IP google-public-dns-a.google.com.domain > foo.42011: 18803 1/0/0 PTR one.one.one.one. (67) 12:59:52.701208 IP google-public-dns-a.google.com.domain > foo.42011: 18803 1/0/0 PTR one.one.one.one. (67) [...] In order to be flexible and to have same semantics as in sendmsg BPF programs, we only allow return codes in [1,1] range. In the sendmsg case the program is called if msg->msg_name is present which can be the case in both, connected and unconnected UDP. The former only relies on the sockaddr_in{,6} passed via connect(2) if passed msg->msg_name was NULL. Therefore, on recvmsg side, we act in similar way to call into the BPF program whenever a non-NULL msg->msg_name was passed independent of sk->sk_state being TCP_ESTABLISHED or not. Note that for TCP case, the msg->msg_name is ignored in the regular recvmsg path and therefore not relevant. For the case of ip{,v6}_recv_error() paths, picked up via MSG_ERRQUEUE, the hook is not called. This is intentional as it aligns with the same semantics as in case of TCP cgroup BPF hooks right now. This might be better addressed in future through a different bpf_attach_type such that this case can be distinguished from the regular recvmsg paths, for example. Fixes: 1cedee13d25a ("bpf: Hooks for sys_sendmsg") Signed-off-by: Daniel Borkmann Acked-by: Andrey Ignatov Acked-by: Martin KaFai Lau Acked-by: Martynas Pumputis Signed-off-by: Alexei Starovoitov --- include/linux/bpf-cgroup.h | 8 ++++++++ include/uapi/linux/bpf.h | 2 ++ kernel/bpf/syscall.c | 8 ++++++++ kernel/bpf/verifier.c | 12 ++++++++---- net/core/filter.c | 2 ++ net/ipv4/udp.c | 4 ++++ net/ipv6/udp.c | 4 ++++ 7 files changed, 36 insertions(+), 4 deletions(-) (limited to 'net/core') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index cb3c6b3b89c8..a7f7a98ec39d 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -238,6 +238,12 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, #define BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, uaddr, t_ctx) \ BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_UDP6_SENDMSG, t_ctx) +#define BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk, uaddr) \ + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_UDP4_RECVMSG, NULL) + +#define BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk, uaddr) \ + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_UDP6_RECVMSG, NULL) + #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) \ ({ \ int __ret = 0; \ @@ -339,6 +345,8 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map, #define BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr) ({ 0; }) #define BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk, uaddr, t_ctx) ({ 0; }) #define BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, uaddr, t_ctx) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk, uaddr) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk, uaddr) ({ 0; }) #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) ({ 0; }) #define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type,major,minor,access) ({ 0; }) #define BPF_CGROUP_RUN_PROG_SYSCTL(head,table,write,buf,count,pos,nbuf) ({ 0; }) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 63e0cf66f01a..e4114a7e4451 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -192,6 +192,8 @@ enum bpf_attach_type { BPF_LIRC_MODE2, BPF_FLOW_DISSECTOR, BPF_CGROUP_SYSCTL, + BPF_CGROUP_UDP4_RECVMSG, + BPF_CGROUP_UDP6_RECVMSG, __MAX_BPF_ATTACH_TYPE }; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index cb5440b02e82..e8ba3a153691 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1581,6 +1581,8 @@ bpf_prog_load_check_attach_type(enum bpf_prog_type prog_type, case BPF_CGROUP_INET6_CONNECT: case BPF_CGROUP_UDP4_SENDMSG: case BPF_CGROUP_UDP6_SENDMSG: + case BPF_CGROUP_UDP4_RECVMSG: + case BPF_CGROUP_UDP6_RECVMSG: return 0; default: return -EINVAL; @@ -1875,6 +1877,8 @@ static int bpf_prog_attach(const union bpf_attr *attr) case BPF_CGROUP_INET6_CONNECT: case BPF_CGROUP_UDP4_SENDMSG: case BPF_CGROUP_UDP6_SENDMSG: + case BPF_CGROUP_UDP4_RECVMSG: + case BPF_CGROUP_UDP6_RECVMSG: ptype = BPF_PROG_TYPE_CGROUP_SOCK_ADDR; break; case BPF_CGROUP_SOCK_OPS: @@ -1960,6 +1964,8 @@ static int bpf_prog_detach(const union bpf_attr *attr) case BPF_CGROUP_INET6_CONNECT: case BPF_CGROUP_UDP4_SENDMSG: case BPF_CGROUP_UDP6_SENDMSG: + case BPF_CGROUP_UDP4_RECVMSG: + case BPF_CGROUP_UDP6_RECVMSG: ptype = BPF_PROG_TYPE_CGROUP_SOCK_ADDR; break; case BPF_CGROUP_SOCK_OPS: @@ -2011,6 +2017,8 @@ static int bpf_prog_query(const union bpf_attr *attr, case BPF_CGROUP_INET6_CONNECT: case BPF_CGROUP_UDP4_SENDMSG: case BPF_CGROUP_UDP6_SENDMSG: + case BPF_CGROUP_UDP4_RECVMSG: + case BPF_CGROUP_UDP6_RECVMSG: case BPF_CGROUP_SOCK_OPS: case BPF_CGROUP_DEVICE: case BPF_CGROUP_SYSCTL: diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 95f9354495ad..d2c8a6677ac4 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5361,9 +5361,12 @@ static int check_return_code(struct bpf_verifier_env *env) struct tnum range = tnum_range(0, 1); switch (env->prog->type) { + case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: + if (env->prog->expected_attach_type == BPF_CGROUP_UDP4_RECVMSG || + env->prog->expected_attach_type == BPF_CGROUP_UDP6_RECVMSG) + range = tnum_range(1, 1); case BPF_PROG_TYPE_CGROUP_SKB: case BPF_PROG_TYPE_CGROUP_SOCK: - case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: case BPF_PROG_TYPE_SOCK_OPS: case BPF_PROG_TYPE_CGROUP_DEVICE: case BPF_PROG_TYPE_CGROUP_SYSCTL: @@ -5380,16 +5383,17 @@ static int check_return_code(struct bpf_verifier_env *env) } if (!tnum_in(range, reg->var_off)) { + char tn_buf[48]; + verbose(env, "At program exit the register R0 "); if (!tnum_is_unknown(reg->var_off)) { - char tn_buf[48]; - tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); verbose(env, "has value %s", tn_buf); } else { verbose(env, "has unknown scalar value"); } - verbose(env, " should have been 0 or 1\n"); + tnum_strn(tn_buf, sizeof(tn_buf), range); + verbose(env, " should have been in %s\n", tn_buf); return -EINVAL; } return 0; diff --git a/net/core/filter.c b/net/core/filter.c index fdcc504d2dec..2814d785c110 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -6748,6 +6748,7 @@ static bool sock_addr_is_valid_access(int off, int size, case BPF_CGROUP_INET4_BIND: case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_UDP4_SENDMSG: + case BPF_CGROUP_UDP4_RECVMSG: break; default: return false; @@ -6758,6 +6759,7 @@ static bool sock_addr_is_valid_access(int off, int size, case BPF_CGROUP_INET6_BIND: case BPF_CGROUP_INET6_CONNECT: case BPF_CGROUP_UDP6_SENDMSG: + case BPF_CGROUP_UDP6_RECVMSG: break; default: return false; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 85db0e3d7f3f..2d862823cbb7 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1783,6 +1783,10 @@ try_again: sin->sin_addr.s_addr = ip_hdr(skb)->saddr; memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); *addr_len = sizeof(*sin); + + if (cgroup_bpf_enabled) + BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk, + (struct sockaddr *)sin); } if (udp_sk(sk)->gro_enabled) diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 4e52c37bb836..15570d7b9b61 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -369,6 +369,10 @@ try_again: inet6_iif(skb)); } *addr_len = sizeof(*sin6); + + if (cgroup_bpf_enabled) + BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk, + (struct sockaddr *)sin6); } if (udp_sk(sk)->gro_enabled) -- cgit v1.2.3 From f12dd75959b0138f94da8ddcf43f2f3cf277216d Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 11 Jun 2019 14:45:57 -0700 Subject: bpf: net: Set sk_bpf_storage back to NULL for cloned sk The cloned sk should not carry its parent-listener's sk_bpf_storage. This patch fixes it by setting it back to NULL. Fixes: 6ac99e8f23d4 ("bpf: Introduce bpf sk local storage") Signed-off-by: Martin KaFai Lau Acked-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann --- net/core/sock.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net/core') diff --git a/net/core/sock.c b/net/core/sock.c index 2b3701958486..d90fd04622e5 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1850,6 +1850,9 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) goto out; } RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL); +#ifdef CONFIG_BPF_SYSCALL + RCU_INIT_POINTER(newsk->sk_bpf_storage, NULL); +#endif newsk->sk_err = 0; newsk->sk_err_soft = 0; -- cgit v1.2.3 From f0d2ca1531377e7da888913e277eefac05a59b6f Mon Sep 17 00:00:00 2001 From: Maxime Chevallier Date: Wed, 12 Jun 2019 17:18:38 +0200 Subject: net: ethtool: Allow matching on vlan DEI bit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Using ethtool, users can specify a classification action matching on the full vlan tag, which includes the DEI bit (also previously called CFI). However, when converting the ethool_flow_spec to a flow_rule, we use dissector keys to represent the matching patterns. Since the vlan dissector key doesn't include the DEI bit, this information was silently discarded when translating the ethtool flow spec in to a flow_rule. This commit adds the DEI bit into the vlan dissector key, and allows propagating the information to the driver when parsing the ethtool flow spec. Fixes: eca4205f9ec3 ("ethtool: add ethtool_rx_flow_spec to flow_rule structure translator") Reported-by: Michał Mirosław Signed-off-by: Maxime Chevallier Signed-off-by: David S. Miller --- include/net/flow_dissector.h | 1 + net/core/ethtool.c | 5 +++++ 2 files changed, 6 insertions(+) (limited to 'net/core') diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h index 7c5a8d9a8d2a..dfabc0503446 100644 --- a/include/net/flow_dissector.h +++ b/include/net/flow_dissector.h @@ -46,6 +46,7 @@ struct flow_dissector_key_tags { struct flow_dissector_key_vlan { u16 vlan_id:12, + vlan_dei:1, vlan_priority:3; __be16 vlan_tpid; }; diff --git a/net/core/ethtool.c b/net/core/ethtool.c index d08b1e19ce9c..4d1011b2e24f 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -3020,6 +3020,11 @@ ethtool_rx_flow_rule_create(const struct ethtool_rx_flow_spec_input *input) match->mask.vlan.vlan_id = ntohs(ext_m_spec->vlan_tci) & 0x0fff; + match->key.vlan.vlan_dei = + !!(ext_h_spec->vlan_tci & htons(0x1000)); + match->mask.vlan.vlan_dei = + !!(ext_m_spec->vlan_tci & htons(0x1000)); + match->key.vlan.vlan_priority = (ntohs(ext_h_spec->vlan_tci) & 0xe000) >> 13; match->mask.vlan.vlan_priority = -- cgit v1.2.3 From ce27ec60648d8e066227cb2f58b1d3d4f7253d08 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 14 Jun 2019 16:22:21 -0700 Subject: net: add high_order_alloc_disable sysctl/static key >From linux-3.7, (commit 5640f7685831 "net: use a per task frag allocator") TCP sendmsg() has preferred using order-3 allocations. While it gives good results for most cases, we had reports that heavy uses of TCP over loopback were hitting a spinlock contention in page allocations/freeing. This commits adds a sysctl so that admins can opt-in for order-0 allocations. Hopefully mm layer might optimize order-3 allocations in the future since it could give us a nice boost (see 8 lines of following benchmark) The following benchmark shows a win when more than 8 TCP_STREAM threads are running (56 x86 cores server in my tests) for thr in {1..30} do sysctl -wq net.core.high_order_alloc_disable=0 T0=`./super_netperf $thr -H 127.0.0.1 -l 15` sysctl -wq net.core.high_order_alloc_disable=1 T1=`./super_netperf $thr -H 127.0.0.1 -l 15` echo $thr:$T0:$T1 done 1: 49979: 37267 2: 98745: 76286 3: 141088: 110051 4: 177414: 144772 5: 197587: 173563 6: 215377: 208448 7: 241061: 234087 8: 267155: 263373 9: 295069: 297402 10: 312393: 335213 11: 340462: 368778 12: 371366: 403954 13: 412344: 443713 14: 426617: 473580 15: 474418: 507861 16: 503261: 538539 17: 522331: 563096 18: 532409: 567084 19: 550824: 605240 20: 525493: 641988 21: 564574: 665843 22: 567349: 690868 23: 583846: 710917 24: 588715: 736306 25: 603212: 763494 26: 604083: 792654 27: 602241: 796450 28: 604291: 797993 29: 611610: 833249 30: 577356: 841062 Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/sock.h | 2 ++ net/core/sock.c | 4 +++- net/core/sysctl_net_core.c | 7 +++++++ 3 files changed, 12 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/include/net/sock.h b/include/net/sock.h index 7d7f4ce63bb2..6cbc16136357 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2534,6 +2534,8 @@ extern int sysctl_optmem_max; extern __u32 sysctl_wmem_default; extern __u32 sysctl_rmem_default; +DECLARE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key); + static inline int sk_get_wmem0(const struct sock *sk, const struct proto *proto) { /* Does this proto have per netns sysctl_wmem ? */ diff --git a/net/core/sock.c b/net/core/sock.c index 2b3701958486..7f49392579a5 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2320,6 +2320,7 @@ static void sk_leave_memory_pressure(struct sock *sk) /* On 32bit arches, an skb frag is limited to 2^15 */ #define SKB_FRAG_PAGE_ORDER get_order(32768) +DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key); /** * skb_page_frag_refill - check that a page_frag contains enough room @@ -2344,7 +2345,8 @@ bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp) } pfrag->offset = 0; - if (SKB_FRAG_PAGE_ORDER) { + if (SKB_FRAG_PAGE_ORDER && + !static_branch_unlikely(&net_high_order_alloc_disable_key)) { /* Avoid direct reclaim but allow kswapd to wake */ pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) | __GFP_COMP | __GFP_NOWARN | diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 1a2685694abd..f9204719aeee 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -562,6 +562,13 @@ static struct ctl_table net_core_table[] = { .extra1 = &zero, .extra2 = &two, }, + { + .procname = "high_order_alloc_disable", + .data = &net_high_order_alloc_disable_key.key, + .maxlen = sizeof(net_high_order_alloc_disable_key), + .mode = 0644, + .proc_handler = proc_do_static_key, + }, { } }; -- cgit v1.2.3 From 85749218e3a6a1ab4fb3c698394f40f07b80be5e Mon Sep 17 00:00:00 2001 From: Arthur Fabre Date: Sat, 15 Jun 2019 14:36:27 -0700 Subject: bpf: Fix out of bounds memory access in bpf_sk_storage bpf_sk_storage maps use multiple spin locks to reduce contention. The number of locks to use is determined by the number of possible CPUs. With only 1 possible CPU, bucket_log == 0, and 2^0 = 1 locks are used. When updating elements, the correct lock is determined with hash_ptr(). Calling hash_ptr() with 0 bits is undefined behavior, as it does: x >> (64 - bits) Using the value results in an out of bounds memory access. In my case, this manifested itself as a page fault when raw_spin_lock_bh() is called later, when running the self tests: ./tools/testing/selftests/bpf/test_verifier 773 775 [ 16.366342] BUG: unable to handle page fault for address: ffff8fe7a66f93f8 Force the minimum number of locks to two. Signed-off-by: Arthur Fabre Fixes: 6ac99e8f23d4 ("bpf: Introduce bpf sk local storage") Acked-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov --- net/core/bpf_sk_storage.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index cc9597a87770..d1c4e1f3be2c 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -633,7 +633,8 @@ static struct bpf_map *bpf_sk_storage_map_alloc(union bpf_attr *attr) return ERR_PTR(-ENOMEM); bpf_map_init_from_attr(&smap->map, attr); - smap->bucket_log = ilog2(roundup_pow_of_two(num_possible_cpus())); + /* Use at least 2 buckets, select_bucket() is undefined behavior with 1 bucket */ + smap->bucket_log = max_t(u32, 1, ilog2(roundup_pow_of_two(num_possible_cpus()))); nbuckets = 1U << smap->bucket_log; smap->buckets = kvcalloc(sizeof(*smap->buckets), nbuckets, GFP_USER | __GFP_NOWARN); -- cgit v1.2.3 From 36b2f61a42c29add695f3bd192ce44d5c113c1eb Mon Sep 17 00:00:00 2001 From: Govindarajulu Varadarajan Date: Fri, 14 Jun 2019 06:13:54 -0700 Subject: net: handle 802.1P vlan 0 packets properly When stack receives pkt: [802.1P vlan 0][802.1AD vlan 100][IPv4], vlan_do_receive() returns false if it does not find vlan_dev. Later __netif_receive_skb_core() fails to find packet type handler for skb->protocol 801.1AD and drops the packet. 801.1P header with vlan id 0 should be handled as untagged packets. This patch fixes it by checking if vlan_id is 0 and processes next vlan header. Signed-off-by: Govindarajulu Varadarajan Signed-off-by: David S. Miller --- net/core/dev.c | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index eb7fb6daa1ef..d6edd218babd 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4923,8 +4923,36 @@ skip_classify: } if (unlikely(skb_vlan_tag_present(skb))) { - if (skb_vlan_tag_get_id(skb)) +check_vlan_id: + if (skb_vlan_tag_get_id(skb)) { + /* Vlan id is non 0 and vlan_do_receive() above couldn't + * find vlan device. + */ skb->pkt_type = PACKET_OTHERHOST; + } else if (skb->protocol == cpu_to_be16(ETH_P_8021Q) || + skb->protocol == cpu_to_be16(ETH_P_8021AD)) { + /* Outer header is 802.1P with vlan 0, inner header is + * 802.1Q or 802.1AD and vlan_do_receive() above could + * not find vlan dev for vlan id 0. + */ + __vlan_hwaccel_clear_tag(skb); + skb = skb_vlan_untag(skb); + if (unlikely(!skb)) + goto out; + if (vlan_do_receive(&skb)) + /* After stripping off 802.1P header with vlan 0 + * vlan dev is found for inner header. + */ + goto another_round; + else if (unlikely(!skb)) + goto out; + else + /* We have stripped outer 802.1P vlan 0 header. + * But could not find vlan dev. + * check again for vlan id to set OTHERHOST. + */ + goto check_vlan_id; + } /* Note: we might in the future use prio bits * and set skb->priority like in vlan_do_receive() * For the time being, just ignore Priority Code Point -- cgit v1.2.3 From f3e92cb8e2eb8c27d109e6fd73d3a69a8c09e288 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 15 Jun 2019 16:28:48 -0700 Subject: neigh: fix use-after-free read in pneigh_get_next Nine years ago, I added RCU handling to neighbours, not pneighbours. (pneigh are not commonly used) Unfortunately I missed that /proc dump operations would use a common entry and exit point : neigh_seq_start() and neigh_seq_stop() We need to read_lock(tbl->lock) or risk use-after-free while iterating the pneigh structures. We might later convert pneigh to RCU and revert this patch. sysbot reported : BUG: KASAN: use-after-free in pneigh_get_next.isra.0+0x24b/0x280 net/core/neighbour.c:3158 Read of size 8 at addr ffff888097f2a700 by task syz-executor.0/9825 CPU: 1 PID: 9825 Comm: syz-executor.0 Not tainted 5.2.0-rc4+ #32 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x172/0x1f0 lib/dump_stack.c:113 print_address_description.cold+0x7c/0x20d mm/kasan/report.c:188 __kasan_report.cold+0x1b/0x40 mm/kasan/report.c:317 kasan_report+0x12/0x20 mm/kasan/common.c:614 __asan_report_load8_noabort+0x14/0x20 mm/kasan/generic_report.c:132 pneigh_get_next.isra.0+0x24b/0x280 net/core/neighbour.c:3158 neigh_seq_next+0xdb/0x210 net/core/neighbour.c:3240 seq_read+0x9cf/0x1110 fs/seq_file.c:258 proc_reg_read+0x1fc/0x2c0 fs/proc/inode.c:221 do_loop_readv_writev fs/read_write.c:714 [inline] do_loop_readv_writev fs/read_write.c:701 [inline] do_iter_read+0x4a4/0x660 fs/read_write.c:935 vfs_readv+0xf0/0x160 fs/read_write.c:997 kernel_readv fs/splice.c:359 [inline] default_file_splice_read+0x475/0x890 fs/splice.c:414 do_splice_to+0x127/0x180 fs/splice.c:877 splice_direct_to_actor+0x2d2/0x970 fs/splice.c:954 do_splice_direct+0x1da/0x2a0 fs/splice.c:1063 do_sendfile+0x597/0xd00 fs/read_write.c:1464 __do_sys_sendfile64 fs/read_write.c:1525 [inline] __se_sys_sendfile64 fs/read_write.c:1511 [inline] __x64_sys_sendfile64+0x1dd/0x220 fs/read_write.c:1511 do_syscall_64+0xfd/0x680 arch/x86/entry/common.c:301 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x4592c9 Code: fd b7 fb ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 0f 83 cb b7 fb ff c3 66 2e 0f 1f 84 00 00 00 00 RSP: 002b:00007f4aab51dc78 EFLAGS: 00000246 ORIG_RAX: 0000000000000028 RAX: ffffffffffffffda RBX: 0000000000000004 RCX: 00000000004592c9 RDX: 0000000000000000 RSI: 0000000000000004 RDI: 0000000000000005 RBP: 000000000075bf20 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000080000000 R11: 0000000000000246 R12: 00007f4aab51e6d4 R13: 00000000004c689d R14: 00000000004db828 R15: 00000000ffffffff Allocated by task 9827: save_stack+0x23/0x90 mm/kasan/common.c:71 set_track mm/kasan/common.c:79 [inline] __kasan_kmalloc mm/kasan/common.c:489 [inline] __kasan_kmalloc.constprop.0+0xcf/0xe0 mm/kasan/common.c:462 kasan_kmalloc+0x9/0x10 mm/kasan/common.c:503 __do_kmalloc mm/slab.c:3660 [inline] __kmalloc+0x15c/0x740 mm/slab.c:3669 kmalloc include/linux/slab.h:552 [inline] pneigh_lookup+0x19c/0x4a0 net/core/neighbour.c:731 arp_req_set_public net/ipv4/arp.c:1010 [inline] arp_req_set+0x613/0x720 net/ipv4/arp.c:1026 arp_ioctl+0x652/0x7f0 net/ipv4/arp.c:1226 inet_ioctl+0x2a0/0x340 net/ipv4/af_inet.c:926 sock_do_ioctl+0xd8/0x2f0 net/socket.c:1043 sock_ioctl+0x3ed/0x780 net/socket.c:1194 vfs_ioctl fs/ioctl.c:46 [inline] file_ioctl fs/ioctl.c:509 [inline] do_vfs_ioctl+0xd5f/0x1380 fs/ioctl.c:696 ksys_ioctl+0xab/0xd0 fs/ioctl.c:713 __do_sys_ioctl fs/ioctl.c:720 [inline] __se_sys_ioctl fs/ioctl.c:718 [inline] __x64_sys_ioctl+0x73/0xb0 fs/ioctl.c:718 do_syscall_64+0xfd/0x680 arch/x86/entry/common.c:301 entry_SYSCALL_64_after_hwframe+0x49/0xbe Freed by task 9824: save_stack+0x23/0x90 mm/kasan/common.c:71 set_track mm/kasan/common.c:79 [inline] __kasan_slab_free+0x102/0x150 mm/kasan/common.c:451 kasan_slab_free+0xe/0x10 mm/kasan/common.c:459 __cache_free mm/slab.c:3432 [inline] kfree+0xcf/0x220 mm/slab.c:3755 pneigh_ifdown_and_unlock net/core/neighbour.c:812 [inline] __neigh_ifdown+0x236/0x2f0 net/core/neighbour.c:356 neigh_ifdown+0x20/0x30 net/core/neighbour.c:372 arp_ifdown+0x1d/0x21 net/ipv4/arp.c:1274 inetdev_destroy net/ipv4/devinet.c:319 [inline] inetdev_event+0xa14/0x11f0 net/ipv4/devinet.c:1544 notifier_call_chain+0xc2/0x230 kernel/notifier.c:95 __raw_notifier_call_chain kernel/notifier.c:396 [inline] raw_notifier_call_chain+0x2e/0x40 kernel/notifier.c:403 call_netdevice_notifiers_info+0x3f/0x90 net/core/dev.c:1749 call_netdevice_notifiers_extack net/core/dev.c:1761 [inline] call_netdevice_notifiers net/core/dev.c:1775 [inline] rollback_registered_many+0x9b9/0xfc0 net/core/dev.c:8178 rollback_registered+0x109/0x1d0 net/core/dev.c:8220 unregister_netdevice_queue net/core/dev.c:9267 [inline] unregister_netdevice_queue+0x1ee/0x2c0 net/core/dev.c:9260 unregister_netdevice include/linux/netdevice.h:2631 [inline] __tun_detach+0xd8a/0x1040 drivers/net/tun.c:724 tun_detach drivers/net/tun.c:741 [inline] tun_chr_close+0xe0/0x180 drivers/net/tun.c:3451 __fput+0x2ff/0x890 fs/file_table.c:280 ____fput+0x16/0x20 fs/file_table.c:313 task_work_run+0x145/0x1c0 kernel/task_work.c:113 tracehook_notify_resume include/linux/tracehook.h:185 [inline] exit_to_usermode_loop+0x273/0x2c0 arch/x86/entry/common.c:168 prepare_exit_to_usermode arch/x86/entry/common.c:199 [inline] syscall_return_slowpath arch/x86/entry/common.c:279 [inline] do_syscall_64+0x58e/0x680 arch/x86/entry/common.c:304 entry_SYSCALL_64_after_hwframe+0x49/0xbe The buggy address belongs to the object at ffff888097f2a700 which belongs to the cache kmalloc-64 of size 64 The buggy address is located 0 bytes inside of 64-byte region [ffff888097f2a700, ffff888097f2a740) The buggy address belongs to the page: page:ffffea00025fca80 refcount:1 mapcount:0 mapping:ffff8880aa400340 index:0x0 flags: 0x1fffc0000000200(slab) raw: 01fffc0000000200 ffffea000250d548 ffffea00025726c8 ffff8880aa400340 raw: 0000000000000000 ffff888097f2a000 0000000100000020 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff888097f2a600: 00 00 00 00 00 00 00 00 fc fc fc fc fc fc fc fc ffff888097f2a680: fb fb fb fb fb fb fb fb fc fc fc fc fc fc fc fc >ffff888097f2a700: fb fb fb fb fb fb fb fb fc fc fc fc fc fc fc fc ^ ffff888097f2a780: fb fb fb fb fb fb fb fb fc fc fc fc fc fc fc fc ffff888097f2a800: fb fb fb fb fb fb fb fb fc fc fc fc fc fc fc fc Fixes: 767e97e1e0db ("neigh: RCU conversion of struct neighbour") Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: David S. Miller --- net/core/neighbour.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'net/core') diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 0e2c07355463..9e7fc929bc50 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -3203,6 +3203,7 @@ static void *neigh_get_idx_any(struct seq_file *seq, loff_t *pos) } void *neigh_seq_start(struct seq_file *seq, loff_t *pos, struct neigh_table *tbl, unsigned int neigh_seq_flags) + __acquires(tbl->lock) __acquires(rcu_bh) { struct neigh_seq_state *state = seq->private; @@ -3213,6 +3214,7 @@ void *neigh_seq_start(struct seq_file *seq, loff_t *pos, struct neigh_table *tbl rcu_read_lock_bh(); state->nht = rcu_dereference_bh(tbl->nht); + read_lock(&tbl->lock); return *pos ? neigh_get_idx_any(seq, pos) : SEQ_START_TOKEN; } @@ -3246,8 +3248,13 @@ out: EXPORT_SYMBOL(neigh_seq_next); void neigh_seq_stop(struct seq_file *seq, void *v) + __releases(tbl->lock) __releases(rcu_bh) { + struct neigh_seq_state *state = seq->private; + struct neigh_table *tbl = state->tbl; + + read_unlock(&tbl->lock); rcu_read_unlock_bh(); } EXPORT_SYMBOL(neigh_seq_stop); -- cgit v1.2.3