From 3bdbd0228e7555ec745e08469b98e5a0966409d6 Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Sun, 16 Dec 2018 15:47:04 -0800
Subject: bpf: sockmap, metadata support for reporting size of msg

This adds metadata to sk_msg_md for BPF programs to read the sk_msg
size.

When the SK_MSG program is running under an application that is using
sendfile the data is not copied into sk_msg buffers by default. Rather
the BPF program uses sk_msg_pull_data to read the bytes in. This
avoids doing the costly memcopy instructions when they are not in
fact needed. However, if we don't know the size of the sk_msg we
have to guess if needed bytes are available by doing a pull request
which may fail. By including the size of the sk_msg BPF programs can
check the size before issuing sk_msg_pull_data requests.

Additionally, the same applies for sendmsg calls when the application
provides multiple iovs. Here the BPF program needs to pull in data
to update data pointers but its not clear where the data ends without
a size parameter. In many cases "guessing" is not easy to do
and results in multiple calls to pull and without bounded loops
everything gets fairly tricky.

Clean this up by including a u32 size field. Note, all writes into
sk_msg_md are rejected already from sk_msg_is_valid_access so nothing
additional is needed there.

Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 net/core/filter.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'net/core')

diff --git a/net/core/filter.c b/net/core/filter.c
index f9348806e843..3a3b21726fb5 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -7530,6 +7530,12 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type,
 		*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
 				      offsetof(struct sock_common, skc_num));
 		break;
+
+	case offsetof(struct sk_msg_md, size):
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg_sg, size),
+				      si->dst_reg, si->src_reg,
+				      offsetof(struct sk_msg_sg, size));
+		break;
 	}
 
 	return insn - insn_buf;
-- 
cgit v1.2.3


From bc1b4f013b5029b4c8b63fb9ba8d084119486d7b Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Thu, 20 Dec 2018 11:35:30 -0800
Subject: bpf: sk_msg, improve offset chk in _is_valid_access

The check for max offset in sk_msg_is_valid_access uses sizeof()
which is incorrect because it would allow accessing possibly
past the end of the struct in the padded case. Further, it doesn't
preclude accessing any padding that may be added in the middle of
a struct. All told this makes it fragile to rely on.

To fix this explicitly check offsets with fields using the
bpf_ctx_range() and bpf_ctx_range_till() macros.

For reference the current structure layout looks as follows (reported
by pahole)

struct sk_msg_md {
	union {
		void *             data;                 /*           8 */
	};                                               /*     0     8 */
	union {
		void *             data_end;             /*           8 */
	};                                               /*     8     8 */
	__u32                      family;               /*    16     4 */
	__u32                      remote_ip4;           /*    20     4 */
	__u32                      local_ip4;            /*    24     4 */
	__u32                      remote_ip6[4];        /*    28    16 */
	__u32                      local_ip6[4];         /*    44    16 */
	__u32                      remote_port;          /*    60     4 */
	/* --- cacheline 1 boundary (64 bytes) --- */
	__u32                      local_port;           /*    64     4 */
	__u32                      size;                 /*    68     4 */

	/* size: 72, cachelines: 2, members: 10 */
	/* last cacheline: 8 bytes */
};

So there should be no padding at the moment but fixing this now
prevents future errors.

Reported-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 net/core/filter.c | 21 ++++++++++++++-------
 1 file changed, 14 insertions(+), 7 deletions(-)

(limited to 'net/core')

diff --git a/net/core/filter.c b/net/core/filter.c
index 3a3b21726fb5..6bd9f08f6162 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -6313,6 +6313,9 @@ static bool sk_msg_is_valid_access(int off, int size,
 	if (type == BPF_WRITE)
 		return false;
 
+	if (off % size != 0)
+		return false;
+
 	switch (off) {
 	case offsetof(struct sk_msg_md, data):
 		info->reg_type = PTR_TO_PACKET;
@@ -6324,16 +6327,20 @@ static bool sk_msg_is_valid_access(int off, int size,
 		if (size != sizeof(__u64))
 			return false;
 		break;
-	default:
+	case bpf_ctx_range(struct sk_msg_md, family):
+	case bpf_ctx_range(struct sk_msg_md, remote_ip4):
+	case bpf_ctx_range(struct sk_msg_md, local_ip4):
+	case bpf_ctx_range_till(struct sk_msg_md, remote_ip6[0], remote_ip6[3]):
+	case bpf_ctx_range_till(struct sk_msg_md, local_ip6[0], local_ip6[3]):
+	case bpf_ctx_range(struct sk_msg_md, remote_port):
+	case bpf_ctx_range(struct sk_msg_md, local_port):
+	case bpf_ctx_range(struct sk_msg_md, size):
 		if (size != sizeof(__u32))
 			return false;
-	}
-
-	if (off < 0 || off >= sizeof(struct sk_msg_md))
-		return false;
-	if (off % size != 0)
+		break;
+	default:
 		return false;
-
+	}
 	return true;
 }
 
-- 
cgit v1.2.3


From 7a69c0f250568e6ab72f401b2c69aa0e666c94f2 Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Thu, 20 Dec 2018 11:35:31 -0800
Subject: bpf: skmsg, replace comments with BUILD bug

Enforce comment on structure layout dependency with a BUILD_BUG_ON
to ensure the condition is maintained.

Suggested-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/skmsg.h | 4 +---
 net/core/filter.c     | 3 +++
 2 files changed, 4 insertions(+), 3 deletions(-)

(limited to 'net/core')

diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
index eb8f6cb84c10..dd57e6f408b1 100644
--- a/include/linux/skmsg.h
+++ b/include/linux/skmsg.h
@@ -36,9 +36,7 @@ struct sk_msg_sg {
 	struct scatterlist		data[MAX_MSG_FRAGS + 1];
 };
 
-/* UAPI in filter.c depends on struct sk_msg_sg being first element. If
- * this is moved filter.c also must be updated.
- */
+/* UAPI in filter.c depends on struct sk_msg_sg being first element. */
 struct sk_msg {
 	struct sk_msg_sg		sg;
 	void				*data;
diff --git a/net/core/filter.c b/net/core/filter.c
index 6bd9f08f6162..447dd1bad31f 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -7425,6 +7425,9 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type,
 	int off;
 #endif
 
+	/* convert ctx uses the fact sg element is first in struct */
+	BUILD_BUG_ON(offsetof(struct sk_msg, sg) != 0);
+
 	switch (si->off) {
 	case offsetof(struct sk_msg_md, data):
 		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg, data),
-- 
cgit v1.2.3


From 51199405f967207de372d9b60989eb87d7ae8809 Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Thu, 20 Dec 2018 11:35:32 -0800
Subject: bpf: skb_verdict, support SK_PASS on RX BPF path

Add SK_PASS verdict support to SK_SKB_VERDICT programs. Now that
support for redirects exists we can implement SK_PASS as a redirect
to the same socket. This simplifies the BPF programs and avoids an
extra map lookup on RX path for simple visibility cases.

Further, reduces user (BPF programmer in this context) confusion
when their program drops skb due to lack of support.

Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 net/core/skmsg.c | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

(limited to 'net/core')

diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index 56a99d0c9aa0..8a91a460de8f 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -669,6 +669,22 @@ static void sk_psock_verdict_apply(struct sk_psock *psock,
 	bool ingress;
 
 	switch (verdict) {
+	case __SK_PASS:
+		sk_other = psock->sk;
+		if (sock_flag(sk_other, SOCK_DEAD) ||
+		    !sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) {
+			goto out_free;
+		}
+		if (atomic_read(&sk_other->sk_rmem_alloc) <=
+		    sk_other->sk_rcvbuf) {
+			struct tcp_skb_cb *tcp = TCP_SKB_CB(skb);
+
+			tcp->bpf.flags |= BPF_F_INGRESS;
+			skb_queue_tail(&psock->ingress_skb, skb);
+			schedule_work(&psock->work);
+			break;
+		}
+		goto out_free;
 	case __SK_REDIRECT:
 		sk_other = tcp_skb_bpf_redirect_fetch(skb);
 		if (unlikely(!sk_other))
-- 
cgit v1.2.3


From 552de91068828daef50a227a665068cf8dde835e Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Thu, 20 Dec 2018 11:35:33 -0800
Subject: bpf: sk_msg, fix socket data_ready events

When a skb verdict program is in-use and either another BPF program
redirects to that socket or the new SK_PASS support is used the
data_ready callback does not wake up application. Instead because
the stream parser/verdict is using the sk data_ready callback we wake
up the stream parser/verdict block.

Fix this by adding a helper to check if the stream parser block is
enabled on the sk and if so call the saved pointer which is the
upper layers wake up function.

This fixes application stalls observed when an application is waiting
for data in a blocking read().

Fixes: d829e9c4112b ("tls: convert to generic sk_msg interface")
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/skmsg.h | 8 ++++++++
 net/core/skmsg.c      | 6 +++---
 net/ipv4/tcp_bpf.c    | 2 +-
 3 files changed, 12 insertions(+), 4 deletions(-)

(limited to 'net/core')

diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
index dd57e6f408b1..178a3933a71b 100644
--- a/include/linux/skmsg.h
+++ b/include/linux/skmsg.h
@@ -417,6 +417,14 @@ static inline void sk_psock_put(struct sock *sk, struct sk_psock *psock)
 		sk_psock_drop(sk, psock);
 }
 
+static inline void sk_psock_data_ready(struct sock *sk, struct sk_psock *psock)
+{
+	if (psock->parser.enabled)
+		psock->parser.saved_data_ready(sk);
+	else
+		sk->sk_data_ready(sk);
+}
+
 static inline void psock_set_prog(struct bpf_prog **pprog,
 				  struct bpf_prog *prog)
 {
diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index 8a91a460de8f..3df7627db4bb 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -403,7 +403,7 @@ static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb)
 	msg->skb = skb;
 
 	sk_psock_queue_msg(psock, msg);
-	sk->sk_data_ready(sk);
+	sk_psock_data_ready(sk, psock);
 	return copied;
 }
 
@@ -751,7 +751,7 @@ static int sk_psock_strp_parse(struct strparser *strp, struct sk_buff *skb)
 }
 
 /* Called with socket lock held. */
-static void sk_psock_data_ready(struct sock *sk)
+static void sk_psock_strp_data_ready(struct sock *sk)
 {
 	struct sk_psock *psock;
 
@@ -799,7 +799,7 @@ void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock)
 		return;
 
 	parser->saved_data_ready = sk->sk_data_ready;
-	sk->sk_data_ready = sk_psock_data_ready;
+	sk->sk_data_ready = sk_psock_strp_data_ready;
 	sk->sk_write_space = sk_psock_write_space;
 	parser->enabled = true;
 }
diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c
index a47c1cdf90fc..87503343743d 100644
--- a/net/ipv4/tcp_bpf.c
+++ b/net/ipv4/tcp_bpf.c
@@ -198,7 +198,7 @@ static int bpf_tcp_ingress(struct sock *sk, struct sk_psock *psock,
 		msg->sg.start = i;
 		msg->sg.size -= apply_bytes;
 		sk_psock_queue_msg(psock, tmp);
-		sk->sk_data_ready(sk);
+		sk_psock_data_ready(sk, psock);
 	} else {
 		sk_msg_free(sk, tmp);
 		kfree(tmp);
-- 
cgit v1.2.3


From a136678c0bdbb650daff5df5eec1dab960e074a7 Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Thu, 20 Dec 2018 11:35:34 -0800
Subject: bpf: sk_msg, zap ingress queue on psock down

In addition to releasing any cork'ed data on a psock when the psock
is removed we should also release any skb's in the ingress work queue.
Otherwise the skb's eventually get free'd but late in the tear
down process so we see the WARNING due to non-zero sk_forward_alloc.

  void sk_stream_kill_queues(struct sock *sk)
  {
	...
	WARN_ON(sk->sk_forward_alloc);
	...
  }

Fixes: 604326b41a6f ("bpf, sockmap: convert to generic sk_msg interface")
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 net/core/skmsg.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net/core')

diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index 3df7627db4bb..86c9726fced8 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -572,6 +572,7 @@ void sk_psock_drop(struct sock *sk, struct sk_psock *psock)
 {
 	rcu_assign_sk_user_data(sk, NULL);
 	sk_psock_cork_free(psock);
+	sk_psock_zap_ingress(psock);
 	sk_psock_restore_proto(sk, psock);
 
 	write_lock_bh(&sk->sk_callback_lock);
-- 
cgit v1.2.3