From ba6b8de423f8d0dee48d6030288ed81c03ddf9f0 Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Mon, 23 Apr 2018 15:39:23 -0700
Subject: bpf: sockmap, map_release does not hold refcnt for pinned maps

Relying on map_release hook to decrement the reference counts when a
map is removed only works if the map is not being pinned. In the
pinned case the ref is decremented immediately and the BPF programs
released. After this BPF programs may not be in-use which is not
what the user would expect.

This patch moves the release logic into bpf_map_put_uref() and brings
sockmap in-line with how a similar case is handled in prog array maps.

Fixes: 3d9e952697de ("bpf: sockmap, fix leaking maps with attached but not detached progs")
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 kernel/bpf/arraymap.c | 3 ++-
 kernel/bpf/sockmap.c  | 4 ++--
 kernel/bpf/syscall.c  | 4 ++--
 3 files changed, 6 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 14750e7c5ee4..027107f4be53 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -476,7 +476,7 @@ static u32 prog_fd_array_sys_lookup_elem(void *ptr)
 }
 
 /* decrement refcnt of all bpf_progs that are stored in this map */
-void bpf_fd_array_map_clear(struct bpf_map *map)
+static void bpf_fd_array_map_clear(struct bpf_map *map)
 {
 	struct bpf_array *array = container_of(map, struct bpf_array, map);
 	int i;
@@ -495,6 +495,7 @@ const struct bpf_map_ops prog_array_map_ops = {
 	.map_fd_get_ptr = prog_fd_array_get_ptr,
 	.map_fd_put_ptr = prog_fd_array_put_ptr,
 	.map_fd_sys_lookup_elem = prog_fd_array_sys_lookup_elem,
+	.map_release_uref = bpf_fd_array_map_clear,
 };
 
 static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file,
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index a3b21385e947..a73d484b6e4c 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -1831,7 +1831,7 @@ static int sock_map_update_elem(struct bpf_map *map,
 	return err;
 }
 
-static void sock_map_release(struct bpf_map *map, struct file *map_file)
+static void sock_map_release(struct bpf_map *map)
 {
 	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
 	struct bpf_prog *orig;
@@ -1855,7 +1855,7 @@ const struct bpf_map_ops sock_map_ops = {
 	.map_get_next_key = sock_map_get_next_key,
 	.map_update_elem = sock_map_update_elem,
 	.map_delete_elem = sock_map_delete_elem,
-	.map_release = sock_map_release,
+	.map_release_uref = sock_map_release,
 };
 
 BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, bpf_sock,
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 4ca46df19c9a..ebfe9f29dae8 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -257,8 +257,8 @@ static void bpf_map_free_deferred(struct work_struct *work)
 static void bpf_map_put_uref(struct bpf_map *map)
 {
 	if (atomic_dec_and_test(&map->usercnt)) {
-		if (map->map_type == BPF_MAP_TYPE_PROG_ARRAY)
-			bpf_fd_array_map_clear(map);
+		if (map->ops->map_release_uref)
+			map->ops->map_release_uref(map);
 	}
 }
 
-- 
cgit v1.2.3


From e20f7334837ae47341d8ec4e3170d0b4336a3676 Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Mon, 23 Apr 2018 15:39:28 -0700
Subject: bpf: sockmap, sk_wait_event needed to handle blocking cases

In the recvmsg handler we need to add a wait event to support the
blocking use cases. Without this we return zero and may confuse
user applications. In the wait event any data received on the
sk either via sk_receive_queue or the psock ingress list will
wake up the sock.

Fixes: fa246693a111 ("bpf: sockmap, BPF_F_INGRESS flag for BPF_SK_SKB_STREAM_VERDICT")
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 kernel/bpf/sockmap.c | 44 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 44 insertions(+)

(limited to 'kernel')

diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index a73d484b6e4c..aaf50ec77c94 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -43,6 +43,7 @@
 #include <net/tcp.h>
 #include <linux/ptr_ring.h>
 #include <net/inet_common.h>
+#include <linux/sched/signal.h>
 
 #define SOCK_CREATE_FLAG_MASK \
 	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
@@ -732,6 +733,26 @@ out_err:
 	return err;
 }
 
+static int bpf_wait_data(struct sock *sk,
+			 struct smap_psock *psk, int flags,
+			 long timeo, int *err)
+{
+	int rc;
+
+	DEFINE_WAIT_FUNC(wait, woken_wake_function);
+
+	add_wait_queue(sk_sleep(sk), &wait);
+	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+	rc = sk_wait_event(sk, &timeo,
+			   !list_empty(&psk->ingress) ||
+			   !skb_queue_empty(&sk->sk_receive_queue),
+			   &wait);
+	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+	remove_wait_queue(sk_sleep(sk), &wait);
+
+	return rc;
+}
+
 static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 			   int nonblock, int flags, int *addr_len)
 {
@@ -755,6 +776,7 @@ static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 		return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
 
 	lock_sock(sk);
+bytes_ready:
 	while (copied != len) {
 		struct scatterlist *sg;
 		struct sk_msg_buff *md;
@@ -809,6 +831,28 @@ static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 		}
 	}
 
+	if (!copied) {
+		long timeo;
+		int data;
+		int err = 0;
+
+		timeo = sock_rcvtimeo(sk, nonblock);
+		data = bpf_wait_data(sk, psock, flags, timeo, &err);
+
+		if (data) {
+			if (!skb_queue_empty(&sk->sk_receive_queue)) {
+				release_sock(sk);
+				smap_release_sock(psock, sk);
+				copied = tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
+				return copied;
+			}
+			goto bytes_ready;
+		}
+
+		if (err)
+			copied = err;
+	}
+
 	release_sock(sk);
 	smap_release_sock(psock, sk);
 	return copied;
-- 
cgit v1.2.3


From 4fcfdfb83391c74e62683469289db42a143440ac Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Mon, 23 Apr 2018 15:39:33 -0700
Subject: bpf: sockmap, fix double page_put on ENOMEM error in redirect path

In the case where the socket memory boundary is hit the redirect
path returns an ENOMEM error. However, before checking for this
condition the redirect scatterlist buffer is setup with a valid
page and length. This is never unwound so when the buffers are
released latter in the error path we do a put_page() and clear
the scatterlist fields. But, because the initial error happens
before completing the scatterlist buffer we end up with both the
original buffer and the redirect buffer pointing to the same page
resulting in duplicate put_page() calls.

To fix this simply move the initial configuration of the redirect
scatterlist buffer below the sock memory check.

Found this while running TCP_STREAM test with netperf using Cilium.

Fixes: fa246693a111 ("bpf: sockmap, BPF_F_INGRESS flag for BPF_SK_SKB_STREAM_VERDICT")
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 kernel/bpf/sockmap.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index aaf50ec77c94..634415c7fbcd 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -524,8 +524,6 @@ static int bpf_tcp_ingress(struct sock *sk, int apply_bytes,
 	i = md->sg_start;
 
 	do {
-		r->sg_data[i] = md->sg_data[i];
-
 		size = (apply && apply_bytes < md->sg_data[i].length) ?
 			apply_bytes : md->sg_data[i].length;
 
@@ -536,6 +534,7 @@ static int bpf_tcp_ingress(struct sock *sk, int apply_bytes,
 		}
 
 		sk_mem_charge(sk, size);
+		r->sg_data[i] = md->sg_data[i];
 		r->sg_data[i].length = size;
 		md->sg_data[i].length -= size;
 		md->sg_data[i].offset += size;
-- 
cgit v1.2.3


From 3cc9a472d625f31f981063882b07e96229b9e71a Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Wed, 2 May 2018 13:50:19 -0700
Subject: bpf: sockmap, fix scatterlist update on error path in send with apply

When the call to do_tcp_sendpage() fails to send the complete block
requested we either retry if only a partial send was completed or
abort if we receive a error less than or equal to zero. Before
returning though we must update the scatterlist length/offset to
account for any partial send completed.

Before this patch we did this at the end of the retry loop, but
this was buggy when used while applying a verdict to fewer bytes
than in the scatterlist. When the scatterlist length was being set
we forgot to account for the apply logic reducing the size variable.
So the result was we chopped off some bytes in the scatterlist without
doing proper cleanup on them. This results in a WARNING when the
sock is tore down because the bytes have previously been charged to
the socket but are never uncharged.

The simple fix is to simply do the accounting inside the retry loop
subtracting from the absolute scatterlist values rather than trying
to accumulate the totals and subtract at the end.

Reported-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/sockmap.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index 634415c7fbcd..943929a05c92 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -326,6 +326,9 @@ retry:
 			if (ret > 0) {
 				if (apply)
 					apply_bytes -= ret;
+
+				sg->offset += ret;
+				sg->length -= ret;
 				size -= ret;
 				offset += ret;
 				if (uncharge)
@@ -333,8 +336,6 @@ retry:
 				goto retry;
 			}
 
-			sg->length = size;
-			sg->offset = offset;
 			return ret;
 		}
 
-- 
cgit v1.2.3


From fec51d40ea65dd8f51a3e27fc69b4e7dc4f17776 Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Wed, 2 May 2018 13:50:24 -0700
Subject: bpf: sockmap, zero sg_size on error when buffer is released

When an error occurs during a redirect we have two cases that need
to be handled (i) we have a cork'ed buffer (ii) we have a normal
sendmsg buffer.

In the cork'ed buffer case we don't currently support recovering from
errors in a redirect action. So the buffer is released and the error
should _not_ be pushed back to the caller of sendmsg/sendpage. The
rationale here is the user will get an error that relates to old
data that may have been sent by some arbitrary thread on that sock.
Instead we simple consume the data and tell the user that the data
has been consumed. We may add proper error recovery in the future.
However, this patch fixes a bug where the bytes outstanding counter
sg_size was not zeroed. This could result in a case where if the user
has both a cork'ed action and apply action in progress we may
incorrectly call into the BPF program when the user expected an
old verdict to be applied via the apply action. I don't have a use
case where using apply and cork at the same time is valid but we
never explicitly reject it because it should work fine. This patch
ensures the sg_size is zeroed so we don't have this case.

In the normal sendmsg buffer case (no cork data) we also do not
zero sg_size. Again this can confuse the apply logic when the logic
calls into the BPF program when the BPF programmer expected the old
verdict to remain. So ensure we set sg_size to zero here as well. And
additionally to keep the psock state in-sync with the sk_msg_buff
release all the memory as well. Previously we did this before
returning to the user but this left a gap where psock and sk_msg_buff
states were out of sync which seems fragile. No additional overhead
is taken here except for a call to check the length and realize its
already been freed. This is in the error path as well so in my
opinion lets have robust code over optimized error paths.

Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/sockmap.c | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index 943929a05c92..052c313b12db 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -701,15 +701,22 @@ more_data:
 		err = bpf_tcp_sendmsg_do_redirect(redir, send, m, flags);
 		lock_sock(sk);
 
+		if (unlikely(err < 0)) {
+			free_start_sg(sk, m);
+			psock->sg_size = 0;
+			if (!cork)
+				*copied -= send;
+		} else {
+			psock->sg_size -= send;
+		}
+
 		if (cork) {
 			free_start_sg(sk, m);
+			psock->sg_size = 0;
 			kfree(m);
 			m = NULL;
+			err = 0;
 		}
-		if (unlikely(err))
-			*copied -= err;
-		else
-			psock->sg_size -= send;
 		break;
 	case __SK_DROP:
 	default:
-- 
cgit v1.2.3


From abaeb096ca38cad02c8a68c49ddd7efc043c319a Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Wed, 2 May 2018 13:50:29 -0700
Subject: bpf: sockmap, fix error handling in redirect failures

When a redirect failure happens we release the buffers in-flight
without calling a sk_mem_uncharge(), the uncharge is called before
dropping the sock lock for the redirecte, however we missed updating
the ring start index. When no apply actions are in progress this
is OK because we uncharge the entire buffer before the redirect.
But, when we have apply logic running its possible that only a
portion of the buffer is being redirected. In this case we only
do memory accounting for the buffer slice being redirected and
expect to be able to loop over the BPF program again and/or if
a sock is closed uncharge the memory at sock destruct time.

With an invalid start index however the program logic looks at
the start pointer index, checks the length, and when seeing the
length is zero (from the initial release and failure to update
the pointer) aborts without uncharging/releasing the remaining
memory.

The fix for this is simply to update the start index. To avoid
fixing this error in two locations we do a small refactor and
remove one case where it is open-coded. Then fix it in the
single function.

Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/sockmap.c | 28 ++++++++++++----------------
 1 file changed, 12 insertions(+), 16 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index 052c313b12db..098eca568c2b 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -393,7 +393,8 @@ static void return_mem_sg(struct sock *sk, int bytes, struct sk_msg_buff *md)
 	} while (i != md->sg_end);
 }
 
-static void free_bytes_sg(struct sock *sk, int bytes, struct sk_msg_buff *md)
+static void free_bytes_sg(struct sock *sk, int bytes,
+			  struct sk_msg_buff *md, bool charge)
 {
 	struct scatterlist *sg = md->sg_data;
 	int i = md->sg_start, free;
@@ -403,11 +404,13 @@ static void free_bytes_sg(struct sock *sk, int bytes, struct sk_msg_buff *md)
 		if (bytes < free) {
 			sg[i].length -= bytes;
 			sg[i].offset += bytes;
-			sk_mem_uncharge(sk, bytes);
+			if (charge)
+				sk_mem_uncharge(sk, bytes);
 			break;
 		}
 
-		sk_mem_uncharge(sk, sg[i].length);
+		if (charge)
+			sk_mem_uncharge(sk, sg[i].length);
 		put_page(sg_page(&sg[i]));
 		bytes -= sg[i].length;
 		sg[i].length = 0;
@@ -418,6 +421,7 @@ static void free_bytes_sg(struct sock *sk, int bytes, struct sk_msg_buff *md)
 		if (i == MAX_SKB_FRAGS)
 			i = 0;
 	}
+	md->sg_start = i;
 }
 
 static int free_sg(struct sock *sk, int start, struct sk_msg_buff *md)
@@ -576,10 +580,10 @@ static int bpf_tcp_sendmsg_do_redirect(struct sock *sk, int send,
 				       struct sk_msg_buff *md,
 				       int flags)
 {
+	bool ingress = !!(md->flags & BPF_F_INGRESS);
 	struct smap_psock *psock;
 	struct scatterlist *sg;
-	int i, err, free = 0;
-	bool ingress = !!(md->flags & BPF_F_INGRESS);
+	int err = 0;
 
 	sg = md->sg_data;
 
@@ -607,16 +611,8 @@ static int bpf_tcp_sendmsg_do_redirect(struct sock *sk, int send,
 out_rcu:
 	rcu_read_unlock();
 out:
-	i = md->sg_start;
-	while (sg[i].length) {
-		free += sg[i].length;
-		put_page(sg_page(&sg[i]));
-		sg[i].length = 0;
-		i++;
-		if (i == MAX_SKB_FRAGS)
-			i = 0;
-	}
-	return free;
+	free_bytes_sg(NULL, send, md, false);
+	return err;
 }
 
 static inline void bpf_md_init(struct smap_psock *psock)
@@ -720,7 +716,7 @@ more_data:
 		break;
 	case __SK_DROP:
 	default:
-		free_bytes_sg(sk, send, m);
+		free_bytes_sg(sk, send, m, true);
 		apply_bytes_dec(psock, send);
 		*copied -= send;
 		psock->sg_size -= send;
-- 
cgit v1.2.3