From 13e56ec2cc9860aa22e01ffc7a3160f35a96b728 Mon Sep 17 00:00:00 2001
From: Stanislav Fomichev <sdf@google.com>
Date: Wed, 5 Dec 2018 20:40:47 -0800
Subject: selftests/bpf: use thoff instead of nhoff in BPF flow dissector

We are returning thoff from the flow dissector, not the nhoff. Pass
thoff along with nhoff to the bpf program (initially thoff == nhoff)
and expect flow dissector amend/return thoff, not nhoff.

This avoids confusion, when by the time bpf flow dissector exits,
nhoff == thoff, which doesn't make much sense.

Signed-off-by: Stanislav Fomichev <sdf@google.com>
Acked-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 net/core/flow_dissector.c              |  1 +
 tools/testing/selftests/bpf/bpf_flow.c | 36 ++++++++++++++++------------------
 2 files changed, 18 insertions(+), 19 deletions(-)

diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 588f475019d4..ff5556d80570 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -783,6 +783,7 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
 		/* Pass parameters to the BPF program */
 		cb->qdisc_cb.flow_keys = &flow_keys;
 		flow_keys.nhoff = nhoff;
+		flow_keys.thoff = nhoff;
 
 		bpf_compute_data_pointers((struct sk_buff *)skb);
 		result = BPF_PROG_RUN(attached, skb);
diff --git a/tools/testing/selftests/bpf/bpf_flow.c b/tools/testing/selftests/bpf/bpf_flow.c
index 107350a7821d..df9d32fd2055 100644
--- a/tools/testing/selftests/bpf/bpf_flow.c
+++ b/tools/testing/selftests/bpf/bpf_flow.c
@@ -70,18 +70,18 @@ static __always_inline void *bpf_flow_dissect_get_header(struct __sk_buff *skb,
 {
 	void *data_end = (void *)(long)skb->data_end;
 	void *data = (void *)(long)skb->data;
-	__u16 nhoff = skb->flow_keys->nhoff;
+	__u16 thoff = skb->flow_keys->thoff;
 	__u8 *hdr;
 
 	/* Verifies this variable offset does not overflow */
-	if (nhoff > (USHRT_MAX - hdr_size))
+	if (thoff > (USHRT_MAX - hdr_size))
 		return NULL;
 
-	hdr = data + nhoff;
+	hdr = data + thoff;
 	if (hdr + hdr_size <= data_end)
 		return hdr;
 
-	if (bpf_skb_load_bytes(skb, nhoff, buffer, hdr_size))
+	if (bpf_skb_load_bytes(skb, thoff, buffer, hdr_size))
 		return NULL;
 
 	return buffer;
@@ -158,13 +158,13 @@ static __always_inline int parse_ip_proto(struct __sk_buff *skb, __u8 proto)
 			/* Only inspect standard GRE packets with version 0 */
 			return BPF_OK;
 
-		keys->nhoff += sizeof(*gre); /* Step over GRE Flags and Proto */
+		keys->thoff += sizeof(*gre); /* Step over GRE Flags and Proto */
 		if (GRE_IS_CSUM(gre->flags))
-			keys->nhoff += 4; /* Step over chksum and Padding */
+			keys->thoff += 4; /* Step over chksum and Padding */
 		if (GRE_IS_KEY(gre->flags))
-			keys->nhoff += 4; /* Step over key */
+			keys->thoff += 4; /* Step over key */
 		if (GRE_IS_SEQ(gre->flags))
-			keys->nhoff += 4; /* Step over sequence number */
+			keys->thoff += 4; /* Step over sequence number */
 
 		keys->is_encap = true;
 
@@ -174,7 +174,7 @@ static __always_inline int parse_ip_proto(struct __sk_buff *skb, __u8 proto)
 			if (!eth)
 				return BPF_DROP;
 
-			keys->nhoff += sizeof(*eth);
+			keys->thoff += sizeof(*eth);
 
 			return parse_eth_proto(skb, eth->h_proto);
 		} else {
@@ -191,7 +191,6 @@ static __always_inline int parse_ip_proto(struct __sk_buff *skb, __u8 proto)
 		if ((__u8 *)tcp + (tcp->doff << 2) > data_end)
 			return BPF_DROP;
 
-		keys->thoff = keys->nhoff;
 		keys->sport = tcp->source;
 		keys->dport = tcp->dest;
 		return BPF_OK;
@@ -201,7 +200,6 @@ static __always_inline int parse_ip_proto(struct __sk_buff *skb, __u8 proto)
 		if (!udp)
 			return BPF_DROP;
 
-		keys->thoff = keys->nhoff;
 		keys->sport = udp->source;
 		keys->dport = udp->dest;
 		return BPF_OK;
@@ -252,8 +250,8 @@ PROG(IP)(struct __sk_buff *skb)
 	keys->ipv4_src = iph->saddr;
 	keys->ipv4_dst = iph->daddr;
 
-	keys->nhoff += iph->ihl << 2;
-	if (data + keys->nhoff > data_end)
+	keys->thoff += iph->ihl << 2;
+	if (data + keys->thoff > data_end)
 		return BPF_DROP;
 
 	if (iph->frag_off & bpf_htons(IP_MF | IP_OFFSET)) {
@@ -285,7 +283,7 @@ PROG(IPV6)(struct __sk_buff *skb)
 	keys->addr_proto = ETH_P_IPV6;
 	memcpy(&keys->ipv6_src, &ip6h->saddr, 2*sizeof(ip6h->saddr));
 
-	keys->nhoff += sizeof(struct ipv6hdr);
+	keys->thoff += sizeof(struct ipv6hdr);
 
 	return parse_ipv6_proto(skb, ip6h->nexthdr);
 }
@@ -301,7 +299,7 @@ PROG(IPV6OP)(struct __sk_buff *skb)
 	/* hlen is in 8-octets and does not include the first 8 bytes
 	 * of the header
 	 */
-	skb->flow_keys->nhoff += (1 + ip6h->hdrlen) << 3;
+	skb->flow_keys->thoff += (1 + ip6h->hdrlen) << 3;
 
 	return parse_ipv6_proto(skb, ip6h->nexthdr);
 }
@@ -315,7 +313,7 @@ PROG(IPV6FR)(struct __sk_buff *skb)
 	if (!fragh)
 		return BPF_DROP;
 
-	keys->nhoff += sizeof(*fragh);
+	keys->thoff += sizeof(*fragh);
 	keys->is_frag = true;
 	if (!(fragh->frag_off & bpf_htons(IP6_OFFSET)))
 		keys->is_first_frag = true;
@@ -341,7 +339,7 @@ PROG(VLAN)(struct __sk_buff *skb)
 	__be16 proto;
 
 	/* Peek back to see if single or double-tagging */
-	if (bpf_skb_load_bytes(skb, keys->nhoff - sizeof(proto), &proto,
+	if (bpf_skb_load_bytes(skb, keys->thoff - sizeof(proto), &proto,
 			       sizeof(proto)))
 		return BPF_DROP;
 
@@ -354,14 +352,14 @@ PROG(VLAN)(struct __sk_buff *skb)
 		if (vlan->h_vlan_encapsulated_proto != bpf_htons(ETH_P_8021Q))
 			return BPF_DROP;
 
-		keys->nhoff += sizeof(*vlan);
+		keys->thoff += sizeof(*vlan);
 	}
 
 	vlan = bpf_flow_dissect_get_header(skb, sizeof(*vlan), &_vlan);
 	if (!vlan)
 		return BPF_DROP;
 
-	keys->nhoff += sizeof(*vlan);
+	keys->thoff += sizeof(*vlan);
 	/* Only allow 8021AD + 8021Q double tagging and no triple tagging.*/
 	if (vlan->h_vlan_encapsulated_proto == bpf_htons(ETH_P_8021AD) ||
 	    vlan->h_vlan_encapsulated_proto == bpf_htons(ETH_P_8021Q))
-- 
cgit v1.2.3


From ec3d837aac5dca7cb8a69c9f101690c182da79c4 Mon Sep 17 00:00:00 2001
From: Stanislav Fomichev <sdf@google.com>
Date: Wed, 5 Dec 2018 20:40:48 -0800
Subject: net/flow_dissector: correctly cap nhoff and thoff in case of BPF

We want to make sure that the following condition holds:
0 <= nhoff <= thoff <= skb->len

BPF program can set out-of-bounds nhoff and thoff, which is dangerous, see
recent commit d0c081b49137 ("flow_dissector: properly cap thoff field")'.

Signed-off-by: Stanislav Fomichev <sdf@google.com>
Acked-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 net/core/flow_dissector.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index ff5556d80570..af68207ee56c 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -791,9 +791,12 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
 		/* Restore state */
 		memcpy(cb, &cb_saved, sizeof(cb_saved));
 
+		flow_keys.nhoff = clamp_t(u16, flow_keys.nhoff, 0, skb->len);
+		flow_keys.thoff = clamp_t(u16, flow_keys.thoff,
+					  flow_keys.nhoff, skb->len);
+
 		__skb_flow_bpf_to_target(&flow_keys, flow_dissector,
 					 target_container);
-		key_control->thoff = min_t(u16, key_control->thoff, skb->len);
 		rcu_read_unlock();
 		return result == BPF_OK;
 	}
-- 
cgit v1.2.3


From 51a11b14c29c6c7cc985c6ce7dd9e20f61cf7f0a Mon Sep 17 00:00:00 2001
From: Sandipan Das <sandipan@linux.ibm.com>
Date: Thu, 6 Dec 2018 14:57:01 +0530
Subject: bpf: powerpc: fix broken uapi for BPF_PROG_TYPE_PERF_EVENT

Now that there are different variants of pt_regs for userspace and
kernel, the uapi for the BPF_PROG_TYPE_PERF_EVENT program type must
be changed by exporting the user_pt_regs structure instead of the
pt_regs structure that is in-kernel only.

Fixes: 002af9391bfb ("powerpc: Split user/kernel definitions of struct pt_regs")
Signed-off-by: Sandipan Das <sandipan@linux.ibm.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 arch/powerpc/include/asm/perf_event.h          | 2 ++
 arch/powerpc/include/uapi/asm/Kbuild           | 1 -
 arch/powerpc/include/uapi/asm/bpf_perf_event.h | 9 +++++++++
 3 files changed, 11 insertions(+), 1 deletion(-)
 create mode 100644 arch/powerpc/include/uapi/asm/bpf_perf_event.h

diff --git a/arch/powerpc/include/asm/perf_event.h b/arch/powerpc/include/asm/perf_event.h
index 8bf1b6351716..16a49819da9a 100644
--- a/arch/powerpc/include/asm/perf_event.h
+++ b/arch/powerpc/include/asm/perf_event.h
@@ -26,6 +26,8 @@
 #include <asm/ptrace.h>
 #include <asm/reg.h>
 
+#define perf_arch_bpf_user_pt_regs(regs) &regs->user_regs
+
 /*
  * Overload regs->result to specify whether we should use the MSR (result
  * is zero) or the SIAR (result is non zero).
diff --git a/arch/powerpc/include/uapi/asm/Kbuild b/arch/powerpc/include/uapi/asm/Kbuild
index a658091a19f9..3712152206f3 100644
--- a/arch/powerpc/include/uapi/asm/Kbuild
+++ b/arch/powerpc/include/uapi/asm/Kbuild
@@ -1,7 +1,6 @@
 # UAPI Header export list
 include include/uapi/asm-generic/Kbuild.asm
 
-generic-y += bpf_perf_event.h
 generic-y += param.h
 generic-y += poll.h
 generic-y += resource.h
diff --git a/arch/powerpc/include/uapi/asm/bpf_perf_event.h b/arch/powerpc/include/uapi/asm/bpf_perf_event.h
new file mode 100644
index 000000000000..b551b741653d
--- /dev/null
+++ b/arch/powerpc/include/uapi/asm/bpf_perf_event.h
@@ -0,0 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _UAPI__ASM_BPF_PERF_EVENT_H__
+#define _UAPI__ASM_BPF_PERF_EVENT_H__
+
+#include <asm/ptrace.h>
+
+typedef struct user_pt_regs bpf_user_pt_regs_t;
+
+#endif /* _UAPI__ASM_BPF_PERF_EVENT_H__ */
-- 
cgit v1.2.3


From c2a20a2731df11f9b7b7030f7ac3fc222c9ce39d Mon Sep 17 00:00:00 2001
From: Stanislav Fomichev <sdf@google.com>
Date: Thu, 6 Dec 2018 20:14:11 -0800
Subject: selftests/bpf: add missing pointer dereference for map stacktrace
 fixup

I get a segfault without it, other fixups always do dereference, and
without dereference I don't understand how it can ever work.

Fixes: 7c85c448e7d74 ("selftests/bpf: test_verifier, check bpf_map_lookup_elem access in bpf prog")
Signed-off-by: Stanislav Fomichev <sdf@google.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/testing/selftests/bpf/test_verifier.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c
index df6f751cc1e8..d23929a1985d 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -14166,7 +14166,7 @@ static void do_test_fixup(struct bpf_test *test, enum bpf_map_type prog_type,
 		do {
 			prog[*fixup_map_stacktrace].imm = map_fds[12];
 			fixup_map_stacktrace++;
-		} while (fixup_map_stacktrace);
+		} while (*fixup_map_stacktrace);
 	}
 }
 
-- 
cgit v1.2.3


From aca1a80ebe3e4d49adaf6516c61a6786b1ee7dad Mon Sep 17 00:00:00 2001
From: Stanislav Fomichev <sdf@google.com>
Date: Mon, 10 Dec 2018 15:25:04 -0800
Subject: selftests/bpf: use proper type when passing prog_type

Use bpf_prog_type instead of bpf_map_type when passing prog_type.

-Wenum-conversion might be unhappy about it:
	error: implicit conversion from enumeration type
	'enum bpf_map_type' to different enumeration type
	'enum bpf_prog_type'

Signed-off-by: Stanislav Fomichev <sdf@google.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 tools/testing/selftests/bpf/test_verifier.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c
index d23929a1985d..37583267bdbc 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -13940,7 +13940,7 @@ static int create_map(uint32_t type, uint32_t size_key,
 	return fd;
 }
 
-static int create_prog_dummy1(enum bpf_map_type prog_type)
+static int create_prog_dummy1(enum bpf_prog_type prog_type)
 {
 	struct bpf_insn prog[] = {
 		BPF_MOV64_IMM(BPF_REG_0, 42),
@@ -13951,7 +13951,7 @@ static int create_prog_dummy1(enum bpf_map_type prog_type)
 				ARRAY_SIZE(prog), "GPL", 0, NULL, 0);
 }
 
-static int create_prog_dummy2(enum bpf_map_type prog_type, int mfd, int idx)
+static int create_prog_dummy2(enum bpf_prog_type prog_type, int mfd, int idx)
 {
 	struct bpf_insn prog[] = {
 		BPF_MOV64_IMM(BPF_REG_3, idx),
@@ -13966,7 +13966,7 @@ static int create_prog_dummy2(enum bpf_map_type prog_type, int mfd, int idx)
 				ARRAY_SIZE(prog), "GPL", 0, NULL, 0);
 }
 
-static int create_prog_array(enum bpf_map_type prog_type, uint32_t max_elem,
+static int create_prog_array(enum bpf_prog_type prog_type, uint32_t max_elem,
 			     int p1key)
 {
 	int p2key = 1;
@@ -14037,7 +14037,7 @@ static int create_cgroup_storage(bool percpu)
 
 static char bpf_vlog[UINT_MAX >> 8];
 
-static void do_test_fixup(struct bpf_test *test, enum bpf_map_type prog_type,
+static void do_test_fixup(struct bpf_test *test, enum bpf_prog_type prog_type,
 			  struct bpf_insn *prog, int *map_fds)
 {
 	int *fixup_map_hash_8b = test->fixup_map_hash_8b;
-- 
cgit v1.2.3


From fdadd04931c2d7cd294dc5b2b342863f94be53a3 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Tue, 11 Dec 2018 12:14:12 +0100
Subject: bpf: fix bpf_jit_limit knob for PAGE_SIZE >= 64K

Michael and Sandipan report:

  Commit ede95a63b5 introduced a bpf_jit_limit tuneable to limit BPF
  JIT allocations. At compile time it defaults to PAGE_SIZE * 40000,
  and is adjusted again at init time if MODULES_VADDR is defined.

  For ppc64 kernels, MODULES_VADDR isn't defined, so we're stuck with
  the compile-time default at boot-time, which is 0x9c400000 when
  using 64K page size. This overflows the signed 32-bit bpf_jit_limit
  value:

  root@ubuntu:/tmp# cat /proc/sys/net/core/bpf_jit_limit
  -1673527296

  and can cause various unexpected failures throughout the network
  stack. In one case `strace dhclient eth0` reported:

  setsockopt(5, SOL_SOCKET, SO_ATTACH_FILTER, {len=11, filter=0x105dd27f8},
             16) = -1 ENOTSUPP (Unknown error 524)

  and similar failures can be seen with tools like tcpdump. This doesn't
  always reproduce however, and I'm not sure why. The more consistent
  failure I've seen is an Ubuntu 18.04 KVM guest booted on a POWER9
  host would time out on systemd/netplan configuring a virtio-net NIC
  with no noticeable errors in the logs.

Given this and also given that in near future some architectures like
arm64 will have a custom area for BPF JIT image allocations we should
get rid of the BPF_JIT_LIMIT_DEFAULT fallback / default entirely. For
4.21, we have an overridable bpf_jit_alloc_exec(), bpf_jit_free_exec()
so therefore add another overridable bpf_jit_alloc_exec_limit() helper
function which returns the possible size of the memory area for deriving
the default heuristic in bpf_jit_charge_init().

Like bpf_jit_alloc_exec() and bpf_jit_free_exec(), the new
bpf_jit_alloc_exec_limit() assumes that module_alloc() is the default
JIT memory provider, and therefore in case archs implement their custom
module_alloc() we use MODULES_{END,_VADDR} for limits and otherwise for
vmalloc_exec() cases like on ppc64 we use VMALLOC_{END,_START}.

Additionally, for archs supporting large page sizes, we should change
the sysctl to be handled as long to not run into sysctl restrictions
in future.

Fixes: ede95a63b5e8 ("bpf: add bpf_jit_limit knob to restrict unpriv allocations")
Reported-by: Sandipan Das <sandipan@linux.ibm.com>
Reported-by: Michael Roth <mdroth@linux.vnet.ibm.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Tested-by: Michael Roth <mdroth@linux.vnet.ibm.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/filter.h     |  2 +-
 kernel/bpf/core.c          | 21 +++++++++++++++------
 net/core/sysctl_net_core.c | 20 +++++++++++++++++---
 3 files changed, 33 insertions(+), 10 deletions(-)

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 795ff0b869bb..a8b9d90a8042 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -861,7 +861,7 @@ bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk,
 extern int bpf_jit_enable;
 extern int bpf_jit_harden;
 extern int bpf_jit_kallsyms;
-extern int bpf_jit_limit;
+extern long bpf_jit_limit;
 
 typedef void (*bpf_jit_fill_hole_t)(void *area, unsigned int size);
 
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index b1a3545d0ec8..b2890c268cb3 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -365,13 +365,11 @@ void bpf_prog_kallsyms_del_all(struct bpf_prog *fp)
 }
 
 #ifdef CONFIG_BPF_JIT
-# define BPF_JIT_LIMIT_DEFAULT	(PAGE_SIZE * 40000)
-
 /* All BPF JIT sysctl knobs here. */
 int bpf_jit_enable   __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_ALWAYS_ON);
 int bpf_jit_harden   __read_mostly;
 int bpf_jit_kallsyms __read_mostly;
-int bpf_jit_limit    __read_mostly = BPF_JIT_LIMIT_DEFAULT;
+long bpf_jit_limit   __read_mostly;
 
 static __always_inline void
 bpf_get_prog_addr_region(const struct bpf_prog *prog,
@@ -580,16 +578,27 @@ int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
 
 static atomic_long_t bpf_jit_current;
 
+/* Can be overridden by an arch's JIT compiler if it has a custom,
+ * dedicated BPF backend memory area, or if neither of the two
+ * below apply.
+ */
+u64 __weak bpf_jit_alloc_exec_limit(void)
+{
 #if defined(MODULES_VADDR)
+	return MODULES_END - MODULES_VADDR;
+#else
+	return VMALLOC_END - VMALLOC_START;
+#endif
+}
+
 static int __init bpf_jit_charge_init(void)
 {
 	/* Only used as heuristic here to derive limit. */
-	bpf_jit_limit = min_t(u64, round_up((MODULES_END - MODULES_VADDR) >> 2,
-					    PAGE_SIZE), INT_MAX);
+	bpf_jit_limit = min_t(u64, round_up(bpf_jit_alloc_exec_limit() >> 2,
+					    PAGE_SIZE), LONG_MAX);
 	return 0;
 }
 pure_initcall(bpf_jit_charge_init);
-#endif
 
 static int bpf_jit_charge_modmem(u32 pages)
 {
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 37b4667128a3..d67ec17f2cc8 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -28,6 +28,8 @@ static int two __maybe_unused = 2;
 static int min_sndbuf = SOCK_MIN_SNDBUF;
 static int min_rcvbuf = SOCK_MIN_RCVBUF;
 static int max_skb_frags = MAX_SKB_FRAGS;
+static long long_one __maybe_unused = 1;
+static long long_max __maybe_unused = LONG_MAX;
 
 static int net_msg_warn;	/* Unused, but still a sysctl */
 
@@ -289,6 +291,17 @@ proc_dointvec_minmax_bpf_restricted(struct ctl_table *table, int write,
 
 	return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
 }
+
+static int
+proc_dolongvec_minmax_bpf_restricted(struct ctl_table *table, int write,
+				     void __user *buffer, size_t *lenp,
+				     loff_t *ppos)
+{
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
+}
 #endif
 
 static struct ctl_table net_core_table[] = {
@@ -398,10 +411,11 @@ static struct ctl_table net_core_table[] = {
 	{
 		.procname	= "bpf_jit_limit",
 		.data		= &bpf_jit_limit,
-		.maxlen		= sizeof(int),
+		.maxlen		= sizeof(long),
 		.mode		= 0600,
-		.proc_handler	= proc_dointvec_minmax_bpf_restricted,
-		.extra1		= &one,
+		.proc_handler	= proc_dolongvec_minmax_bpf_restricted,
+		.extra1		= &long_one,
+		.extra2		= &long_max,
 	},
 #endif
 	{
-- 
cgit v1.2.3


From 7640ead939247e91e84b7ec6ec001f30193cc7df Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Wed, 12 Dec 2018 16:29:07 -0800
Subject: bpf: verifier: make sure callees don't prune with caller differences

Currently for liveness and state pruning the register parentage
chains don't include states of the callee.  This makes some sense
as the callee can't access those registers.  However, this means
that READs done after the callee returns will not propagate into
the states of the callee.  Callee will then perform pruning
disregarding differences in caller state.

Example:

   0: (85) call bpf_user_rnd_u32
   1: (b7) r8 = 0
   2: (55) if r0 != 0x0 goto pc+1
   3: (b7) r8 = 1
   4: (bf) r1 = r8
   5: (85) call pc+4
   6: (15) if r8 == 0x1 goto pc+1
   7: (05) *(u64 *)(r9 - 8) = r3
   8: (b7) r0 = 0
   9: (95) exit

   10: (15) if r1 == 0x0 goto pc+0
   11: (95) exit

Here we acquire unknown state with call to get_random() [1].  Then
we store this random state in r8 (either 0 or 1) [1 - 3], and make
a call on line 5.  Callee does nothing but a trivial conditional
jump (to create a pruning point).  Upon return caller checks the
state of r8 and either performs an unsafe read or not.

Verifier will first explore the path with r8 == 1, creating a pruning
point at [11].  The parentage chain for r8 will include only callers
states so once verifier reaches [6] it will mark liveness only on states
in the caller, and not [11].  Now when verifier walks the paths with
r8 == 0 it will reach [11] and since REG_LIVE_READ on r8 was not
propagated there it will prune the walk entirely (stop walking
the entire program, not just the callee).  Since [6] was never walked
with r8 == 0, [7] will be considered dead and replaced with "goto -1"
causing hang at runtime.

This patch weaves the callee's explored states onto the callers
parentage chain.  Rough parentage for r8 would have looked like this
before:

[0] [1] [2] [3] [4] [5]   [10]      [11]      [6]      [7]
     |           |      ,---|----.    |        |        |
  sl0:         sl0:    / sl0:     \ sl0:      sl0:     sl0:
  fr0: r8 <-- fr0: r8<+--fr0: r8   `fr0: r8  ,fr0: r8<-fr0: r8
                       \ fr1: r8 <- fr1: r8 /
                        \__________________/

after:

[0] [1] [2] [3] [4] [5]   [10]      [11]      [6]      [7]
     |           |          |         |        |        |
   sl0:         sl0:      sl0:       sl0:      sl0:     sl0:
   fr0: r8 <-- fr0: r8 <- fr0: r8 <- fr0: r8 <-fr0: r8<-fr0: r8
                          fr1: r8 <- fr1: r8

Now the mark from instruction 6 will travel through callees states.

Note that we don't have to connect r0 because its overwritten by
callees state on return and r1 - r5 because those are not alive
any more once a call is made.

v2:
 - don't connect the callees registers twice (Alexei: suggestion & code)
 - add more details to the comment (Ed & Alexei)
v1: don't unnecessarily link caller saved regs (Jiong)

Fixes: f4d7e40a5b71 ("bpf: introduce function calls (verification)")
Reported-by: David Beckett <david.beckett@netronome.com>
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Jiong Wang <jiong.wang@netronome.com>
Reviewed-by: Edward Cree <ecree@solarflare.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 kernel/bpf/verifier.c                       | 13 ++++++++++---
 tools/testing/selftests/bpf/test_verifier.c | 28 ++++++++++++++++++++++++++++
 2 files changed, 38 insertions(+), 3 deletions(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index fc760d00a38c..51ba84d4d34a 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5102,9 +5102,16 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 	}
 	new_sl->next = env->explored_states[insn_idx];
 	env->explored_states[insn_idx] = new_sl;
-	/* connect new state to parentage chain */
-	for (i = 0; i < BPF_REG_FP; i++)
-		cur_regs(env)[i].parent = &new->frame[new->curframe]->regs[i];
+	/* connect new state to parentage chain. Current frame needs all
+	 * registers connected. Only r6 - r9 of the callers are alive (pushed
+	 * to the stack implicitly by JITs) so in callers' frames connect just
+	 * r6 - r9 as an optimization. Callers will have r1 - r5 connected to
+	 * the state of the call instruction (with WRITTEN set), and r0 comes
+	 * from callee with its full parentage chain, anyway.
+	 */
+	for (j = 0; j <= cur->curframe; j++)
+		for (i = j < cur->curframe ? BPF_REG_6 : 0; i < BPF_REG_FP; i++)
+			cur->frame[j]->regs[i].parent = &new->frame[j]->regs[i];
 	/* clear write marks in current state: the writes we did are not writes
 	 * our child did, so they don't screen off its reads from us.
 	 * (There are no read marks in current state, because reads always mark
diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c
index 37583267bdbc..f8eac4a544f4 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -13915,6 +13915,34 @@ static struct bpf_test tests[] = {
 		.result_unpriv = REJECT,
 		.result = ACCEPT,
 	},
+	{
+		"calls: cross frame pruning",
+		.insns = {
+			/* r8 = !!random();
+			 * call pruner()
+			 * if (r8)
+			 *     do something bad;
+			 */
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+				     BPF_FUNC_get_prandom_u32),
+			BPF_MOV64_IMM(BPF_REG_8, 0),
+			BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),
+			BPF_MOV64_IMM(BPF_REG_8, 1),
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_8),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 4),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_8, 1, 1),
+			BPF_LDX_MEM(BPF_B, BPF_REG_9, BPF_REG_1, 0),
+			BPF_MOV64_IMM(BPF_REG_0, 0),
+			BPF_EXIT_INSN(),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 0),
+			BPF_EXIT_INSN(),
+		},
+		.prog_type = BPF_PROG_TYPE_SOCKET_FILTER,
+		.errstr_unpriv = "function calls to other bpf functions are allowed for root only",
+		.result_unpriv = REJECT,
+		.errstr = "!read_ok",
+		.result = REJECT,
+	},
 };
 
 static int probe_filter_length(const struct bpf_insn *fp)
-- 
cgit v1.2.3