From 9b28ae243ef3b13d8a88b5451d025475c75ebdef Mon Sep 17 00:00:00 2001
From: Lorenz Bauer <lmb@cloudflare.com>
Date: Tue, 21 May 2019 08:52:38 +0100
Subject: bpf: fix out-of-bounds read in __bpf_skc_lookup

__bpf_skc_lookup takes a socket tuple and the length of the
tuple as an argument. Based on the length, it decides which
address family to pass to the helper function sk_lookup.

In case of AF_INET6, it fails to verify that the length
of the tuple is long enough. sk_lookup may therefore access
data past the end of the tuple.

Fixes: 6acc9b432e67 ("bpf: Add helper to retrieve socket in BPF")
Signed-off-by: Lorenz Bauer <lmb@cloudflare.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 net/core/filter.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'net/core')

diff --git a/net/core/filter.c b/net/core/filter.c
index 55bfc941d17a..76f1d99640e2 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -5304,7 +5304,13 @@ __bpf_skc_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
 	struct net *net;
 	int sdif;
 
-	family = len == sizeof(tuple->ipv4) ? AF_INET : AF_INET6;
+	if (len == sizeof(tuple->ipv4))
+		family = AF_INET;
+	else if (len == sizeof(tuple->ipv6))
+		family = AF_INET6;
+	else
+		return NULL;
+
 	if (unlikely(family == AF_UNSPEC || flags ||
 		     !((s32)netns_id < 0 || netns_id <= S32_MAX)))
 		goto out;
-- 
cgit v1.2.3


From f7355a6c049711ecfbeed573e43d48bee7acb83a Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Fri, 17 May 2019 14:21:17 -0700
Subject: bpf: Check sk_fullsock() before returning from bpf_sk_lookup()

The BPF_FUNC_sk_lookup_xxx helpers return RET_PTR_TO_SOCKET_OR_NULL.
Meaning a fullsock ptr and its fullsock's fields in bpf_sock can be
accessed, e.g. type, protocol, mark and priority.
Some new helper, like bpf_sk_storage_get(), also expects
ARG_PTR_TO_SOCKET is a fullsock.

bpf_sk_lookup() currently calls sk_to_full_sk() before returning.
However, the ptr returned from sk_to_full_sk() is not guaranteed
to be a fullsock.  For example, it cannot get a fullsock if sk
is in TCP_TIME_WAIT.

This patch checks for sk_fullsock() before returning. If it is not
a fullsock, sock_gen_put() is called if needed and then returns NULL.

Fixes: 6acc9b432e67 ("bpf: Add helper to retrieve socket in BPF")
Cc: Joe Stringer <joe@isovalent.com>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Joe Stringer <joe@isovalent.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 net/core/filter.c | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

(limited to 'net/core')

diff --git a/net/core/filter.c b/net/core/filter.c
index 76f1d99640e2..fdcc504d2dec 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -5343,8 +5343,14 @@ __bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
 	struct sock *sk = __bpf_skc_lookup(skb, tuple, len, caller_net,
 					   ifindex, proto, netns_id, flags);
 
-	if (sk)
+	if (sk) {
 		sk = sk_to_full_sk(sk);
+		if (!sk_fullsock(sk)) {
+			if (!sock_flag(sk, SOCK_RCU_FREE))
+				sock_gen_put(sk);
+			return NULL;
+		}
+	}
 
 	return sk;
 }
@@ -5375,8 +5381,14 @@ bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
 	struct sock *sk = bpf_skc_lookup(skb, tuple, len, proto, netns_id,
 					 flags);
 
-	if (sk)
+	if (sk) {
 		sk = sk_to_full_sk(sk);
+		if (!sk_fullsock(sk)) {
+			if (!sock_flag(sk, SOCK_RCU_FREE))
+				sock_gen_put(sk);
+			return NULL;
+		}
+	}
 
 	return sk;
 }
-- 
cgit v1.2.3


From bd95e678e0f6e18351ecdc147ca819145db9ed7b Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Fri, 24 May 2019 08:01:00 -0700
Subject: bpf: sockmap, fix use after free from sleep in psock backlog
 workqueue

Backlog work for psock (sk_psock_backlog) might sleep while waiting
for memory to free up when sending packets. However, while sleeping
the socket may be closed and removed from the map by the user space
side.

This breaks an assumption in sk_stream_wait_memory, which expects the
wait queue to be still there when it wakes up resulting in a
use-after-free shown below. To fix his mark sendmsg as MSG_DONTWAIT
to avoid the sleep altogether. We already set the flag for the
sendpage case but we missed the case were sendmsg is used.
Sockmap is currently the only user of skb_send_sock_locked() so only
the sockmap paths should be impacted.

==================================================================
BUG: KASAN: use-after-free in remove_wait_queue+0x31/0x70
Write of size 8 at addr ffff888069a0c4e8 by task kworker/0:2/110

CPU: 0 PID: 110 Comm: kworker/0:2 Not tainted 5.0.0-rc2-00335-g28f9d1a3d4fe-dirty #14
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-2.fc27 04/01/2014
Workqueue: events sk_psock_backlog
Call Trace:
 print_address_description+0x6e/0x2b0
 ? remove_wait_queue+0x31/0x70
 kasan_report+0xfd/0x177
 ? remove_wait_queue+0x31/0x70
 ? remove_wait_queue+0x31/0x70
 remove_wait_queue+0x31/0x70
 sk_stream_wait_memory+0x4dd/0x5f0
 ? sk_stream_wait_close+0x1b0/0x1b0
 ? wait_woken+0xc0/0xc0
 ? tcp_current_mss+0xc5/0x110
 tcp_sendmsg_locked+0x634/0x15d0
 ? tcp_set_state+0x2e0/0x2e0
 ? __kasan_slab_free+0x1d1/0x230
 ? kmem_cache_free+0x70/0x140
 ? sk_psock_backlog+0x40c/0x4b0
 ? process_one_work+0x40b/0x660
 ? worker_thread+0x82/0x680
 ? kthread+0x1b9/0x1e0
 ? ret_from_fork+0x1f/0x30
 ? check_preempt_curr+0xaf/0x130
 ? iov_iter_kvec+0x5f/0x70
 ? kernel_sendmsg_locked+0xa0/0xe0
 skb_send_sock_locked+0x273/0x3c0
 ? skb_splice_bits+0x180/0x180
 ? start_thread+0xe0/0xe0
 ? update_min_vruntime.constprop.27+0x88/0xc0
 sk_psock_backlog+0xb3/0x4b0
 ? strscpy+0xbf/0x1e0
 process_one_work+0x40b/0x660
 worker_thread+0x82/0x680
 ? process_one_work+0x660/0x660
 kthread+0x1b9/0x1e0
 ? __kthread_create_on_node+0x250/0x250
 ret_from_fork+0x1f/0x30

Fixes: 20bf50de3028c ("skbuff: Function to send an skbuf on a socket")
Reported-by: Jakub Sitnicki <jakub@cloudflare.com>
Tested-by: Jakub Sitnicki <jakub@cloudflare.com>
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 net/core/skbuff.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net/core')

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index e89be6282693..4a7c656b195b 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -2337,6 +2337,7 @@ do_frag_list:
 		kv.iov_base = skb->data + offset;
 		kv.iov_len = slen;
 		memset(&msg, 0, sizeof(msg));
+		msg.msg_flags = MSG_DONTWAIT;
 
 		ret = kernel_sendmsg_locked(sk, &msg, &kv, 1, slen);
 		if (ret <= 0)
-- 
cgit v1.2.3


From 983695fa676568fc0fe5ddd995c7267aabc24632 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 7 Jun 2019 01:48:57 +0200
Subject: bpf: fix unconnected udp hooks

Intention of cgroup bind/connect/sendmsg BPF hooks is to act transparently
to applications as also stated in original motivation in 7828f20e3779 ("Merge
branch 'bpf-cgroup-bind-connect'"). When recently integrating the latter
two hooks into Cilium to enable host based load-balancing with Kubernetes,
I ran into the issue that pods couldn't start up as DNS got broken. Kubernetes
typically sets up DNS as a service and is thus subject to load-balancing.

Upon further debugging, it turns out that the cgroupv2 sendmsg BPF hooks API
is currently insufficient and thus not usable as-is for standard applications
shipped with most distros. To break down the issue we ran into with a simple
example:

  # cat /etc/resolv.conf
  nameserver 147.75.207.207
  nameserver 147.75.207.208

For the purpose of a simple test, we set up above IPs as service IPs and
transparently redirect traffic to a different DNS backend server for that
node:

  # cilium service list
  ID   Frontend            Backend
  1    147.75.207.207:53   1 => 8.8.8.8:53
  2    147.75.207.208:53   1 => 8.8.8.8:53

The attached BPF program is basically selecting one of the backends if the
service IP/port matches on the cgroup hook. DNS breaks here, because the
hooks are not transparent enough to applications which have built-in msg_name
address checks:

  # nslookup 1.1.1.1
  ;; reply from unexpected source: 8.8.8.8#53, expected 147.75.207.207#53
  ;; reply from unexpected source: 8.8.8.8#53, expected 147.75.207.208#53
  ;; reply from unexpected source: 8.8.8.8#53, expected 147.75.207.207#53
  [...]
  ;; connection timed out; no servers could be reached

  # dig 1.1.1.1
  ;; reply from unexpected source: 8.8.8.8#53, expected 147.75.207.207#53
  ;; reply from unexpected source: 8.8.8.8#53, expected 147.75.207.208#53
  ;; reply from unexpected source: 8.8.8.8#53, expected 147.75.207.207#53
  [...]

  ; <<>> DiG 9.11.3-1ubuntu1.7-Ubuntu <<>> 1.1.1.1
  ;; global options: +cmd
  ;; connection timed out; no servers could be reached

For comparison, if none of the service IPs is used, and we tell nslookup
to use 8.8.8.8 directly it works just fine, of course:

  # nslookup 1.1.1.1 8.8.8.8
  1.1.1.1.in-addr.arpa	name = one.one.one.one.

In order to fix this and thus act more transparent to the application,
this needs reverse translation on recvmsg() side. A minimal fix for this
API is to add similar recvmsg() hooks behind the BPF cgroups static key
such that the program can track state and replace the current sockaddr_in{,6}
with the original service IP. From BPF side, this basically tracks the
service tuple plus socket cookie in an LRU map where the reverse NAT can
then be retrieved via map value as one example. Side-note: the BPF cgroups
static key should be converted to a per-hook static key in future.

Same example after this fix:

  # cilium service list
  ID   Frontend            Backend
  1    147.75.207.207:53   1 => 8.8.8.8:53
  2    147.75.207.208:53   1 => 8.8.8.8:53

Lookups work fine now:

  # nslookup 1.1.1.1
  1.1.1.1.in-addr.arpa    name = one.one.one.one.

  Authoritative answers can be found from:

  # dig 1.1.1.1

  ; <<>> DiG 9.11.3-1ubuntu1.7-Ubuntu <<>> 1.1.1.1
  ;; global options: +cmd
  ;; Got answer:
  ;; ->>HEADER<<- opcode: QUERY, status: NXDOMAIN, id: 51550
  ;; flags: qr rd ra ad; QUERY: 1, ANSWER: 0, AUTHORITY: 1, ADDITIONAL: 1

  ;; OPT PSEUDOSECTION:
  ; EDNS: version: 0, flags:; udp: 512
  ;; QUESTION SECTION:
  ;1.1.1.1.                       IN      A

  ;; AUTHORITY SECTION:
  .                       23426   IN      SOA     a.root-servers.net. nstld.verisign-grs.com. 2019052001 1800 900 604800 86400

  ;; Query time: 17 msec
  ;; SERVER: 147.75.207.207#53(147.75.207.207)
  ;; WHEN: Tue May 21 12:59:38 UTC 2019
  ;; MSG SIZE  rcvd: 111

And from an actual packet level it shows that we're using the back end
server when talking via 147.75.207.20{7,8} front end:

  # tcpdump -i any udp
  [...]
  12:59:52.698732 IP foo.42011 > google-public-dns-a.google.com.domain: 18803+ PTR? 1.1.1.1.in-addr.arpa. (38)
  12:59:52.698735 IP foo.42011 > google-public-dns-a.google.com.domain: 18803+ PTR? 1.1.1.1.in-addr.arpa. (38)
  12:59:52.701208 IP google-public-dns-a.google.com.domain > foo.42011: 18803 1/0/0 PTR one.one.one.one. (67)
  12:59:52.701208 IP google-public-dns-a.google.com.domain > foo.42011: 18803 1/0/0 PTR one.one.one.one. (67)
  [...]

In order to be flexible and to have same semantics as in sendmsg BPF
programs, we only allow return codes in [1,1] range. In the sendmsg case
the program is called if msg->msg_name is present which can be the case
in both, connected and unconnected UDP.

The former only relies on the sockaddr_in{,6} passed via connect(2) if
passed msg->msg_name was NULL. Therefore, on recvmsg side, we act in similar
way to call into the BPF program whenever a non-NULL msg->msg_name was
passed independent of sk->sk_state being TCP_ESTABLISHED or not. Note
that for TCP case, the msg->msg_name is ignored in the regular recvmsg
path and therefore not relevant.

For the case of ip{,v6}_recv_error() paths, picked up via MSG_ERRQUEUE,
the hook is not called. This is intentional as it aligns with the same
semantics as in case of TCP cgroup BPF hooks right now. This might be
better addressed in future through a different bpf_attach_type such
that this case can be distinguished from the regular recvmsg paths,
for example.

Fixes: 1cedee13d25a ("bpf: Hooks for sys_sendmsg")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Andrey Ignatov <rdna@fb.com>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Martynas Pumputis <m@lambda.lt>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf-cgroup.h |  8 ++++++++
 include/uapi/linux/bpf.h   |  2 ++
 kernel/bpf/syscall.c       |  8 ++++++++
 kernel/bpf/verifier.c      | 12 ++++++++----
 net/core/filter.c          |  2 ++
 net/ipv4/udp.c             |  4 ++++
 net/ipv6/udp.c             |  4 ++++
 7 files changed, 36 insertions(+), 4 deletions(-)

(limited to 'net/core')

diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index cb3c6b3b89c8..a7f7a98ec39d 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -238,6 +238,12 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key,
 #define BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, uaddr, t_ctx)		       \
 	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_UDP6_SENDMSG, t_ctx)
 
+#define BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk, uaddr)			\
+	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_UDP4_RECVMSG, NULL)
+
+#define BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk, uaddr)			\
+	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_UDP6_RECVMSG, NULL)
+
 #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops)				       \
 ({									       \
 	int __ret = 0;							       \
@@ -339,6 +345,8 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map,
 #define BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk, uaddr, t_ctx) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, uaddr, t_ctx) ({ 0; })
+#define BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk, uaddr) ({ 0; })
+#define BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk, uaddr) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type,major,minor,access) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_SYSCTL(head,table,write,buf,count,pos,nbuf) ({ 0; })
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 63e0cf66f01a..e4114a7e4451 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -192,6 +192,8 @@ enum bpf_attach_type {
 	BPF_LIRC_MODE2,
 	BPF_FLOW_DISSECTOR,
 	BPF_CGROUP_SYSCTL,
+	BPF_CGROUP_UDP4_RECVMSG,
+	BPF_CGROUP_UDP6_RECVMSG,
 	__MAX_BPF_ATTACH_TYPE
 };
 
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index cb5440b02e82..e8ba3a153691 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1581,6 +1581,8 @@ bpf_prog_load_check_attach_type(enum bpf_prog_type prog_type,
 		case BPF_CGROUP_INET6_CONNECT:
 		case BPF_CGROUP_UDP4_SENDMSG:
 		case BPF_CGROUP_UDP6_SENDMSG:
+		case BPF_CGROUP_UDP4_RECVMSG:
+		case BPF_CGROUP_UDP6_RECVMSG:
 			return 0;
 		default:
 			return -EINVAL;
@@ -1875,6 +1877,8 @@ static int bpf_prog_attach(const union bpf_attr *attr)
 	case BPF_CGROUP_INET6_CONNECT:
 	case BPF_CGROUP_UDP4_SENDMSG:
 	case BPF_CGROUP_UDP6_SENDMSG:
+	case BPF_CGROUP_UDP4_RECVMSG:
+	case BPF_CGROUP_UDP6_RECVMSG:
 		ptype = BPF_PROG_TYPE_CGROUP_SOCK_ADDR;
 		break;
 	case BPF_CGROUP_SOCK_OPS:
@@ -1960,6 +1964,8 @@ static int bpf_prog_detach(const union bpf_attr *attr)
 	case BPF_CGROUP_INET6_CONNECT:
 	case BPF_CGROUP_UDP4_SENDMSG:
 	case BPF_CGROUP_UDP6_SENDMSG:
+	case BPF_CGROUP_UDP4_RECVMSG:
+	case BPF_CGROUP_UDP6_RECVMSG:
 		ptype = BPF_PROG_TYPE_CGROUP_SOCK_ADDR;
 		break;
 	case BPF_CGROUP_SOCK_OPS:
@@ -2011,6 +2017,8 @@ static int bpf_prog_query(const union bpf_attr *attr,
 	case BPF_CGROUP_INET6_CONNECT:
 	case BPF_CGROUP_UDP4_SENDMSG:
 	case BPF_CGROUP_UDP6_SENDMSG:
+	case BPF_CGROUP_UDP4_RECVMSG:
+	case BPF_CGROUP_UDP6_RECVMSG:
 	case BPF_CGROUP_SOCK_OPS:
 	case BPF_CGROUP_DEVICE:
 	case BPF_CGROUP_SYSCTL:
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 95f9354495ad..d2c8a6677ac4 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5361,9 +5361,12 @@ static int check_return_code(struct bpf_verifier_env *env)
 	struct tnum range = tnum_range(0, 1);
 
 	switch (env->prog->type) {
+	case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
+		if (env->prog->expected_attach_type == BPF_CGROUP_UDP4_RECVMSG ||
+		    env->prog->expected_attach_type == BPF_CGROUP_UDP6_RECVMSG)
+			range = tnum_range(1, 1);
 	case BPF_PROG_TYPE_CGROUP_SKB:
 	case BPF_PROG_TYPE_CGROUP_SOCK:
-	case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
 	case BPF_PROG_TYPE_SOCK_OPS:
 	case BPF_PROG_TYPE_CGROUP_DEVICE:
 	case BPF_PROG_TYPE_CGROUP_SYSCTL:
@@ -5380,16 +5383,17 @@ static int check_return_code(struct bpf_verifier_env *env)
 	}
 
 	if (!tnum_in(range, reg->var_off)) {
+		char tn_buf[48];
+
 		verbose(env, "At program exit the register R0 ");
 		if (!tnum_is_unknown(reg->var_off)) {
-			char tn_buf[48];
-
 			tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
 			verbose(env, "has value %s", tn_buf);
 		} else {
 			verbose(env, "has unknown scalar value");
 		}
-		verbose(env, " should have been 0 or 1\n");
+		tnum_strn(tn_buf, sizeof(tn_buf), range);
+		verbose(env, " should have been in %s\n", tn_buf);
 		return -EINVAL;
 	}
 	return 0;
diff --git a/net/core/filter.c b/net/core/filter.c
index fdcc504d2dec..2814d785c110 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -6748,6 +6748,7 @@ static bool sock_addr_is_valid_access(int off, int size,
 		case BPF_CGROUP_INET4_BIND:
 		case BPF_CGROUP_INET4_CONNECT:
 		case BPF_CGROUP_UDP4_SENDMSG:
+		case BPF_CGROUP_UDP4_RECVMSG:
 			break;
 		default:
 			return false;
@@ -6758,6 +6759,7 @@ static bool sock_addr_is_valid_access(int off, int size,
 		case BPF_CGROUP_INET6_BIND:
 		case BPF_CGROUP_INET6_CONNECT:
 		case BPF_CGROUP_UDP6_SENDMSG:
+		case BPF_CGROUP_UDP6_RECVMSG:
 			break;
 		default:
 			return false;
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 85db0e3d7f3f..2d862823cbb7 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1783,6 +1783,10 @@ try_again:
 		sin->sin_addr.s_addr = ip_hdr(skb)->saddr;
 		memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
 		*addr_len = sizeof(*sin);
+
+		if (cgroup_bpf_enabled)
+			BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk,
+							(struct sockaddr *)sin);
 	}
 
 	if (udp_sk(sk)->gro_enabled)
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 4e52c37bb836..15570d7b9b61 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -369,6 +369,10 @@ try_again:
 						    inet6_iif(skb));
 		}
 		*addr_len = sizeof(*sin6);
+
+		if (cgroup_bpf_enabled)
+			BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk,
+						(struct sockaddr *)sin6);
 	}
 
 	if (udp_sk(sk)->gro_enabled)
-- 
cgit v1.2.3


From f12dd75959b0138f94da8ddcf43f2f3cf277216d Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Tue, 11 Jun 2019 14:45:57 -0700
Subject: bpf: net: Set sk_bpf_storage back to NULL for cloned sk

The cloned sk should not carry its parent-listener's sk_bpf_storage.
This patch fixes it by setting it back to NULL.

Fixes: 6ac99e8f23d4 ("bpf: Introduce bpf sk local storage")
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Andrii Nakryiko <andriin@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 net/core/sock.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'net/core')

diff --git a/net/core/sock.c b/net/core/sock.c
index 2b3701958486..d90fd04622e5 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1850,6 +1850,9 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
 			goto out;
 		}
 		RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
+#ifdef CONFIG_BPF_SYSCALL
+		RCU_INIT_POINTER(newsk->sk_bpf_storage, NULL);
+#endif
 
 		newsk->sk_err	   = 0;
 		newsk->sk_err_soft = 0;
-- 
cgit v1.2.3


From f0d2ca1531377e7da888913e277eefac05a59b6f Mon Sep 17 00:00:00 2001
From: Maxime Chevallier <maxime.chevallier@bootlin.com>
Date: Wed, 12 Jun 2019 17:18:38 +0200
Subject: net: ethtool: Allow matching on vlan DEI bit
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Using ethtool, users can specify a classification action matching on the
full vlan tag, which includes the DEI bit (also previously called CFI).

However, when converting the ethool_flow_spec to a flow_rule, we use
dissector keys to represent the matching patterns.

Since the vlan dissector key doesn't include the DEI bit, this
information was silently discarded when translating the ethtool
flow spec in to a flow_rule.

This commit adds the DEI bit into the vlan dissector key, and allows
propagating the information to the driver when parsing the ethtool flow
spec.

Fixes: eca4205f9ec3 ("ethtool: add ethtool_rx_flow_spec to flow_rule structure translator")
Reported-by: Michał Mirosław <mirq-linux@rere.qmqm.pl>
Signed-off-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/flow_dissector.h | 1 +
 net/core/ethtool.c           | 5 +++++
 2 files changed, 6 insertions(+)

(limited to 'net/core')

diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h
index 7c5a8d9a8d2a..dfabc0503446 100644
--- a/include/net/flow_dissector.h
+++ b/include/net/flow_dissector.h
@@ -46,6 +46,7 @@ struct flow_dissector_key_tags {
 
 struct flow_dissector_key_vlan {
 	u16	vlan_id:12,
+		vlan_dei:1,
 		vlan_priority:3;
 	__be16	vlan_tpid;
 };
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index d08b1e19ce9c..4d1011b2e24f 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -3020,6 +3020,11 @@ ethtool_rx_flow_rule_create(const struct ethtool_rx_flow_spec_input *input)
 			match->mask.vlan.vlan_id =
 				ntohs(ext_m_spec->vlan_tci) & 0x0fff;
 
+			match->key.vlan.vlan_dei =
+				!!(ext_h_spec->vlan_tci & htons(0x1000));
+			match->mask.vlan.vlan_dei =
+				!!(ext_m_spec->vlan_tci & htons(0x1000));
+
 			match->key.vlan.vlan_priority =
 				(ntohs(ext_h_spec->vlan_tci) & 0xe000) >> 13;
 			match->mask.vlan.vlan_priority =
-- 
cgit v1.2.3


From ce27ec60648d8e066227cb2f58b1d3d4f7253d08 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 14 Jun 2019 16:22:21 -0700
Subject: net: add high_order_alloc_disable sysctl/static key

>From linux-3.7, (commit 5640f7685831 "net: use a per task frag
allocator") TCP sendmsg() has preferred using order-3 allocations.

While it gives good results for most cases, we had reports
that heavy uses of TCP over loopback were hitting a spinlock
contention in page allocations/freeing.

This commits adds a sysctl so that admins can opt-in
for order-0 allocations. Hopefully mm layer might optimize
order-3 allocations in the future since it could give us
a nice boost  (see 8 lines of following benchmark)

The following benchmark shows a win when more than 8 TCP_STREAM
threads are running (56 x86 cores server in my tests)

for thr in {1..30}
do
 sysctl -wq net.core.high_order_alloc_disable=0
 T0=`./super_netperf $thr -H 127.0.0.1 -l 15`
 sysctl -wq net.core.high_order_alloc_disable=1
 T1=`./super_netperf $thr -H 127.0.0.1 -l 15`
 echo $thr:$T0:$T1
done

1: 49979: 37267
2: 98745: 76286
3: 141088: 110051
4: 177414: 144772
5: 197587: 173563
6: 215377: 208448
7: 241061: 234087
8: 267155: 263373
9: 295069: 297402
10: 312393: 335213
11: 340462: 368778
12: 371366: 403954
13: 412344: 443713
14: 426617: 473580
15: 474418: 507861
16: 503261: 538539
17: 522331: 563096
18: 532409: 567084
19: 550824: 605240
20: 525493: 641988
21: 564574: 665843
22: 567349: 690868
23: 583846: 710917
24: 588715: 736306
25: 603212: 763494
26: 604083: 792654
27: 602241: 796450
28: 604291: 797993
29: 611610: 833249
30: 577356: 841062

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h         | 2 ++
 net/core/sock.c            | 4 +++-
 net/core/sysctl_net_core.c | 7 +++++++
 3 files changed, 12 insertions(+), 1 deletion(-)

(limited to 'net/core')

diff --git a/include/net/sock.h b/include/net/sock.h
index 7d7f4ce63bb2..6cbc16136357 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -2534,6 +2534,8 @@ extern int sysctl_optmem_max;
 extern __u32 sysctl_wmem_default;
 extern __u32 sysctl_rmem_default;
 
+DECLARE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
+
 static inline int sk_get_wmem0(const struct sock *sk, const struct proto *proto)
 {
 	/* Does this proto have per netns sysctl_wmem ? */
diff --git a/net/core/sock.c b/net/core/sock.c
index 2b3701958486..7f49392579a5 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2320,6 +2320,7 @@ static void sk_leave_memory_pressure(struct sock *sk)
 
 /* On 32bit arches, an skb frag is limited to 2^15 */
 #define SKB_FRAG_PAGE_ORDER	get_order(32768)
+DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
 
 /**
  * skb_page_frag_refill - check that a page_frag contains enough room
@@ -2344,7 +2345,8 @@ bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
 	}
 
 	pfrag->offset = 0;
-	if (SKB_FRAG_PAGE_ORDER) {
+	if (SKB_FRAG_PAGE_ORDER &&
+	    !static_branch_unlikely(&net_high_order_alloc_disable_key)) {
 		/* Avoid direct reclaim but allow kswapd to wake */
 		pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
 					  __GFP_COMP | __GFP_NOWARN |
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 1a2685694abd..f9204719aeee 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -562,6 +562,13 @@ static struct ctl_table net_core_table[] = {
 		.extra1		= &zero,
 		.extra2		= &two,
 	},
+	{
+		.procname	= "high_order_alloc_disable",
+		.data		= &net_high_order_alloc_disable_key.key,
+		.maxlen         = sizeof(net_high_order_alloc_disable_key),
+		.mode		= 0644,
+		.proc_handler	= proc_do_static_key,
+	},
 	{ }
 };
 
-- 
cgit v1.2.3


From 85749218e3a6a1ab4fb3c698394f40f07b80be5e Mon Sep 17 00:00:00 2001
From: Arthur Fabre <afabre@cloudflare.com>
Date: Sat, 15 Jun 2019 14:36:27 -0700
Subject: bpf: Fix out of bounds memory access in bpf_sk_storage

bpf_sk_storage maps use multiple spin locks to reduce contention.
The number of locks to use is determined by the number of possible CPUs.
With only 1 possible CPU, bucket_log == 0, and 2^0 = 1 locks are used.

When updating elements, the correct lock is determined with hash_ptr().
Calling hash_ptr() with 0 bits is undefined behavior, as it does:

x >> (64 - bits)

Using the value results in an out of bounds memory access.
In my case, this manifested itself as a page fault when raw_spin_lock_bh()
is called later, when running the self tests:

./tools/testing/selftests/bpf/test_verifier 773 775
[   16.366342] BUG: unable to handle page fault for address: ffff8fe7a66f93f8

Force the minimum number of locks to two.

Signed-off-by: Arthur Fabre <afabre@cloudflare.com>
Fixes: 6ac99e8f23d4 ("bpf: Introduce bpf sk local storage")
Acked-by: Andrii Nakryiko <andriin@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 net/core/bpf_sk_storage.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net/core')

diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
index cc9597a87770..d1c4e1f3be2c 100644
--- a/net/core/bpf_sk_storage.c
+++ b/net/core/bpf_sk_storage.c
@@ -633,7 +633,8 @@ static struct bpf_map *bpf_sk_storage_map_alloc(union bpf_attr *attr)
 		return ERR_PTR(-ENOMEM);
 	bpf_map_init_from_attr(&smap->map, attr);
 
-	smap->bucket_log = ilog2(roundup_pow_of_two(num_possible_cpus()));
+	/* Use at least 2 buckets, select_bucket() is undefined behavior with 1 bucket */
+	smap->bucket_log = max_t(u32, 1, ilog2(roundup_pow_of_two(num_possible_cpus())));
 	nbuckets = 1U << smap->bucket_log;
 	smap->buckets = kvcalloc(sizeof(*smap->buckets), nbuckets,
 				 GFP_USER | __GFP_NOWARN);
-- 
cgit v1.2.3


From 36b2f61a42c29add695f3bd192ce44d5c113c1eb Mon Sep 17 00:00:00 2001
From: Govindarajulu Varadarajan <gvaradar@cisco.com>
Date: Fri, 14 Jun 2019 06:13:54 -0700
Subject: net: handle 802.1P vlan 0 packets properly

When stack receives pkt: [802.1P vlan 0][802.1AD vlan 100][IPv4],
vlan_do_receive() returns false if it does not find vlan_dev. Later
__netif_receive_skb_core() fails to find packet type handler for
skb->protocol 801.1AD and drops the packet.

801.1P header with vlan id 0 should be handled as untagged packets.
This patch fixes it by checking if vlan_id is 0 and processes next vlan
header.

Signed-off-by: Govindarajulu Varadarajan <gvaradar@cisco.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/dev.c | 30 +++++++++++++++++++++++++++++-
 1 file changed, 29 insertions(+), 1 deletion(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index eb7fb6daa1ef..d6edd218babd 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4923,8 +4923,36 @@ skip_classify:
 	}
 
 	if (unlikely(skb_vlan_tag_present(skb))) {
-		if (skb_vlan_tag_get_id(skb))
+check_vlan_id:
+		if (skb_vlan_tag_get_id(skb)) {
+			/* Vlan id is non 0 and vlan_do_receive() above couldn't
+			 * find vlan device.
+			 */
 			skb->pkt_type = PACKET_OTHERHOST;
+		} else if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
+			   skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
+			/* Outer header is 802.1P with vlan 0, inner header is
+			 * 802.1Q or 802.1AD and vlan_do_receive() above could
+			 * not find vlan dev for vlan id 0.
+			 */
+			__vlan_hwaccel_clear_tag(skb);
+			skb = skb_vlan_untag(skb);
+			if (unlikely(!skb))
+				goto out;
+			if (vlan_do_receive(&skb))
+				/* After stripping off 802.1P header with vlan 0
+				 * vlan dev is found for inner header.
+				 */
+				goto another_round;
+			else if (unlikely(!skb))
+				goto out;
+			else
+				/* We have stripped outer 802.1P vlan 0 header.
+				 * But could not find vlan dev.
+				 * check again for vlan id to set OTHERHOST.
+				 */
+				goto check_vlan_id;
+		}
 		/* Note: we might in the future use prio bits
 		 * and set skb->priority like in vlan_do_receive()
 		 * For the time being, just ignore Priority Code Point
-- 
cgit v1.2.3


From f3e92cb8e2eb8c27d109e6fd73d3a69a8c09e288 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sat, 15 Jun 2019 16:28:48 -0700
Subject: neigh: fix use-after-free read in pneigh_get_next

Nine years ago, I added RCU handling to neighbours, not pneighbours.
(pneigh are not commonly used)

Unfortunately I missed that /proc dump operations would use a
common entry and exit point : neigh_seq_start() and neigh_seq_stop()

We need to read_lock(tbl->lock) or risk use-after-free while
iterating the pneigh structures.

We might later convert pneigh to RCU and revert this patch.

sysbot reported :

BUG: KASAN: use-after-free in pneigh_get_next.isra.0+0x24b/0x280 net/core/neighbour.c:3158
Read of size 8 at addr ffff888097f2a700 by task syz-executor.0/9825

CPU: 1 PID: 9825 Comm: syz-executor.0 Not tainted 5.2.0-rc4+ #32
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Call Trace:
 __dump_stack lib/dump_stack.c:77 [inline]
 dump_stack+0x172/0x1f0 lib/dump_stack.c:113
 print_address_description.cold+0x7c/0x20d mm/kasan/report.c:188
 __kasan_report.cold+0x1b/0x40 mm/kasan/report.c:317
 kasan_report+0x12/0x20 mm/kasan/common.c:614
 __asan_report_load8_noabort+0x14/0x20 mm/kasan/generic_report.c:132
 pneigh_get_next.isra.0+0x24b/0x280 net/core/neighbour.c:3158
 neigh_seq_next+0xdb/0x210 net/core/neighbour.c:3240
 seq_read+0x9cf/0x1110 fs/seq_file.c:258
 proc_reg_read+0x1fc/0x2c0 fs/proc/inode.c:221
 do_loop_readv_writev fs/read_write.c:714 [inline]
 do_loop_readv_writev fs/read_write.c:701 [inline]
 do_iter_read+0x4a4/0x660 fs/read_write.c:935
 vfs_readv+0xf0/0x160 fs/read_write.c:997
 kernel_readv fs/splice.c:359 [inline]
 default_file_splice_read+0x475/0x890 fs/splice.c:414
 do_splice_to+0x127/0x180 fs/splice.c:877
 splice_direct_to_actor+0x2d2/0x970 fs/splice.c:954
 do_splice_direct+0x1da/0x2a0 fs/splice.c:1063
 do_sendfile+0x597/0xd00 fs/read_write.c:1464
 __do_sys_sendfile64 fs/read_write.c:1525 [inline]
 __se_sys_sendfile64 fs/read_write.c:1511 [inline]
 __x64_sys_sendfile64+0x1dd/0x220 fs/read_write.c:1511
 do_syscall_64+0xfd/0x680 arch/x86/entry/common.c:301
 entry_SYSCALL_64_after_hwframe+0x49/0xbe
RIP: 0033:0x4592c9
Code: fd b7 fb ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 0f 83 cb b7 fb ff c3 66 2e 0f 1f 84 00 00 00 00
RSP: 002b:00007f4aab51dc78 EFLAGS: 00000246 ORIG_RAX: 0000000000000028
RAX: ffffffffffffffda RBX: 0000000000000004 RCX: 00000000004592c9
RDX: 0000000000000000 RSI: 0000000000000004 RDI: 0000000000000005
RBP: 000000000075bf20 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000080000000 R11: 0000000000000246 R12: 00007f4aab51e6d4
R13: 00000000004c689d R14: 00000000004db828 R15: 00000000ffffffff

Allocated by task 9827:
 save_stack+0x23/0x90 mm/kasan/common.c:71
 set_track mm/kasan/common.c:79 [inline]
 __kasan_kmalloc mm/kasan/common.c:489 [inline]
 __kasan_kmalloc.constprop.0+0xcf/0xe0 mm/kasan/common.c:462
 kasan_kmalloc+0x9/0x10 mm/kasan/common.c:503
 __do_kmalloc mm/slab.c:3660 [inline]
 __kmalloc+0x15c/0x740 mm/slab.c:3669
 kmalloc include/linux/slab.h:552 [inline]
 pneigh_lookup+0x19c/0x4a0 net/core/neighbour.c:731
 arp_req_set_public net/ipv4/arp.c:1010 [inline]
 arp_req_set+0x613/0x720 net/ipv4/arp.c:1026
 arp_ioctl+0x652/0x7f0 net/ipv4/arp.c:1226
 inet_ioctl+0x2a0/0x340 net/ipv4/af_inet.c:926
 sock_do_ioctl+0xd8/0x2f0 net/socket.c:1043
 sock_ioctl+0x3ed/0x780 net/socket.c:1194
 vfs_ioctl fs/ioctl.c:46 [inline]
 file_ioctl fs/ioctl.c:509 [inline]
 do_vfs_ioctl+0xd5f/0x1380 fs/ioctl.c:696
 ksys_ioctl+0xab/0xd0 fs/ioctl.c:713
 __do_sys_ioctl fs/ioctl.c:720 [inline]
 __se_sys_ioctl fs/ioctl.c:718 [inline]
 __x64_sys_ioctl+0x73/0xb0 fs/ioctl.c:718
 do_syscall_64+0xfd/0x680 arch/x86/entry/common.c:301
 entry_SYSCALL_64_after_hwframe+0x49/0xbe

Freed by task 9824:
 save_stack+0x23/0x90 mm/kasan/common.c:71
 set_track mm/kasan/common.c:79 [inline]
 __kasan_slab_free+0x102/0x150 mm/kasan/common.c:451
 kasan_slab_free+0xe/0x10 mm/kasan/common.c:459
 __cache_free mm/slab.c:3432 [inline]
 kfree+0xcf/0x220 mm/slab.c:3755
 pneigh_ifdown_and_unlock net/core/neighbour.c:812 [inline]
 __neigh_ifdown+0x236/0x2f0 net/core/neighbour.c:356
 neigh_ifdown+0x20/0x30 net/core/neighbour.c:372
 arp_ifdown+0x1d/0x21 net/ipv4/arp.c:1274
 inetdev_destroy net/ipv4/devinet.c:319 [inline]
 inetdev_event+0xa14/0x11f0 net/ipv4/devinet.c:1544
 notifier_call_chain+0xc2/0x230 kernel/notifier.c:95
 __raw_notifier_call_chain kernel/notifier.c:396 [inline]
 raw_notifier_call_chain+0x2e/0x40 kernel/notifier.c:403
 call_netdevice_notifiers_info+0x3f/0x90 net/core/dev.c:1749
 call_netdevice_notifiers_extack net/core/dev.c:1761 [inline]
 call_netdevice_notifiers net/core/dev.c:1775 [inline]
 rollback_registered_many+0x9b9/0xfc0 net/core/dev.c:8178
 rollback_registered+0x109/0x1d0 net/core/dev.c:8220
 unregister_netdevice_queue net/core/dev.c:9267 [inline]
 unregister_netdevice_queue+0x1ee/0x2c0 net/core/dev.c:9260
 unregister_netdevice include/linux/netdevice.h:2631 [inline]
 __tun_detach+0xd8a/0x1040 drivers/net/tun.c:724
 tun_detach drivers/net/tun.c:741 [inline]
 tun_chr_close+0xe0/0x180 drivers/net/tun.c:3451
 __fput+0x2ff/0x890 fs/file_table.c:280
 ____fput+0x16/0x20 fs/file_table.c:313
 task_work_run+0x145/0x1c0 kernel/task_work.c:113
 tracehook_notify_resume include/linux/tracehook.h:185 [inline]
 exit_to_usermode_loop+0x273/0x2c0 arch/x86/entry/common.c:168
 prepare_exit_to_usermode arch/x86/entry/common.c:199 [inline]
 syscall_return_slowpath arch/x86/entry/common.c:279 [inline]
 do_syscall_64+0x58e/0x680 arch/x86/entry/common.c:304
 entry_SYSCALL_64_after_hwframe+0x49/0xbe

The buggy address belongs to the object at ffff888097f2a700
 which belongs to the cache kmalloc-64 of size 64
The buggy address is located 0 bytes inside of
 64-byte region [ffff888097f2a700, ffff888097f2a740)
The buggy address belongs to the page:
page:ffffea00025fca80 refcount:1 mapcount:0 mapping:ffff8880aa400340 index:0x0
flags: 0x1fffc0000000200(slab)
raw: 01fffc0000000200 ffffea000250d548 ffffea00025726c8 ffff8880aa400340
raw: 0000000000000000 ffff888097f2a000 0000000100000020 0000000000000000
page dumped because: kasan: bad access detected

Memory state around the buggy address:
 ffff888097f2a600: 00 00 00 00 00 00 00 00 fc fc fc fc fc fc fc fc
 ffff888097f2a680: fb fb fb fb fb fb fb fb fc fc fc fc fc fc fc fc
>ffff888097f2a700: fb fb fb fb fb fb fb fb fc fc fc fc fc fc fc fc
                   ^
 ffff888097f2a780: fb fb fb fb fb fb fb fb fc fc fc fc fc fc fc fc
 ffff888097f2a800: fb fb fb fb fb fb fb fb fc fc fc fc fc fc fc fc

Fixes: 767e97e1e0db ("neigh: RCU conversion of struct neighbour")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/neighbour.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'net/core')

diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 0e2c07355463..9e7fc929bc50 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -3203,6 +3203,7 @@ static void *neigh_get_idx_any(struct seq_file *seq, loff_t *pos)
 }
 
 void *neigh_seq_start(struct seq_file *seq, loff_t *pos, struct neigh_table *tbl, unsigned int neigh_seq_flags)
+	__acquires(tbl->lock)
 	__acquires(rcu_bh)
 {
 	struct neigh_seq_state *state = seq->private;
@@ -3213,6 +3214,7 @@ void *neigh_seq_start(struct seq_file *seq, loff_t *pos, struct neigh_table *tbl
 
 	rcu_read_lock_bh();
 	state->nht = rcu_dereference_bh(tbl->nht);
+	read_lock(&tbl->lock);
 
 	return *pos ? neigh_get_idx_any(seq, pos) : SEQ_START_TOKEN;
 }
@@ -3246,8 +3248,13 @@ out:
 EXPORT_SYMBOL(neigh_seq_next);
 
 void neigh_seq_stop(struct seq_file *seq, void *v)
+	__releases(tbl->lock)
 	__releases(rcu_bh)
 {
+	struct neigh_seq_state *state = seq->private;
+	struct neigh_table *tbl = state->tbl;
+
+	read_unlock(&tbl->lock);
 	rcu_read_unlock_bh();
 }
 EXPORT_SYMBOL(neigh_seq_stop);
-- 
cgit v1.2.3